diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,119149 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.979883566776183, + "eval_steps": 1000, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.2900763358778623e-09, + "logits/chosen": -1.9750971794128418, + "logits/rejected": -2.133049249649048, + "logps/chosen": -316.58319091796875, + "logps/rejected": -299.61151123046875, + "loss": 1.0704, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3187603950500488, + "rewards/margins": -0.4163769483566284, + "rewards/rejected": -0.9023834466934204, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.580152671755725e-09, + "logits/chosen": -2.2404322624206543, + "logits/rejected": -2.3381447792053223, + "logps/chosen": -200.2151336669922, + "logps/rejected": -175.1322784423828, + "loss": 1.7158, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1643232107162476, + "rewards/margins": -1.1051160097122192, + "rewards/rejected": -0.0592072457075119, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 6.870229007633587e-09, + "logits/chosen": -2.4064407348632812, + "logits/rejected": -2.4977505207061768, + "logps/chosen": -361.6041259765625, + "logps/rejected": -263.5328369140625, + "loss": 0.866, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5388582944869995, + "rewards/margins": -0.23962189257144928, + "rewards/rejected": -0.29923638701438904, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 9.16030534351145e-09, + "logits/chosen": -2.1674671173095703, + "logits/rejected": -2.1854279041290283, + "logps/chosen": -490.7853088378906, + "logps/rejected": -556.08642578125, + "loss": 0.7756, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1562635600566864, + "rewards/margins": 0.05910694599151611, + "rewards/rejected": -0.2153705358505249, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 1.1450381679389314e-08, + "logits/chosen": -2.3361072540283203, + "logits/rejected": -2.1829276084899902, + "logps/chosen": -341.6460266113281, + "logps/rejected": -243.17156982421875, + "loss": 1.0248, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.497459352016449, + "rewards/margins": -0.41495656967163086, + "rewards/rejected": -0.08250277489423752, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 1.3740458015267175e-08, + "logits/chosen": -2.1206517219543457, + "logits/rejected": -2.1382222175598145, + "logps/chosen": -307.3616943359375, + "logps/rejected": -319.8988037109375, + "loss": 0.6274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37196001410484314, + "rewards/margins": 0.22191719710826874, + "rewards/rejected": -0.5938771963119507, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 1.6030534351145036e-08, + "logits/chosen": -2.1799986362457275, + "logits/rejected": -2.205508232116699, + "logps/chosen": -166.6189422607422, + "logps/rejected": -277.2920837402344, + "loss": 0.9102, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.42578548192977905, + "rewards/margins": -0.29736950993537903, + "rewards/rejected": -0.12841594219207764, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 1.83206106870229e-08, + "logits/chosen": -2.061519145965576, + "logits/rejected": -2.0094611644744873, + "logps/chosen": -212.05564880371094, + "logps/rejected": -286.8177795410156, + "loss": 0.531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5174831748008728, + "rewards/margins": 0.4306480586528778, + "rewards/rejected": -0.948131263256073, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 2.0610687022900764e-08, + "logits/chosen": -2.5138297080993652, + "logits/rejected": -2.5893959999084473, + "logps/chosen": -324.5540466308594, + "logps/rejected": -226.5984344482422, + "loss": 0.7292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28003400564193726, + "rewards/margins": 0.07271306961774826, + "rewards/rejected": -0.3527470827102661, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 2.2900763358778627e-08, + "logits/chosen": -1.8515403270721436, + "logits/rejected": -1.947291374206543, + "logps/chosen": -311.40765380859375, + "logps/rejected": -264.5517883300781, + "loss": 1.0134, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.7975293397903442, + "rewards/margins": -0.5363438129425049, + "rewards/rejected": -0.26118552684783936, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2.5190839694656487e-08, + "logits/chosen": -2.613947629928589, + "logits/rejected": -2.617710828781128, + "logps/chosen": -158.73431396484375, + "logps/rejected": -170.8790283203125, + "loss": 0.6065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3946467339992523, + "rewards/margins": 0.2493477761745453, + "rewards/rejected": -0.6439945697784424, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 2.748091603053435e-08, + "logits/chosen": -1.895628571510315, + "logits/rejected": -2.1169722080230713, + "logps/chosen": -463.9751892089844, + "logps/rejected": -274.55828857421875, + "loss": 0.9812, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7822589874267578, + "rewards/margins": -0.44616955518722534, + "rewards/rejected": -0.33608943223953247, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 2.9770992366412212e-08, + "logits/chosen": -2.2829413414001465, + "logits/rejected": -2.5391359329223633, + "logps/chosen": -211.5141143798828, + "logps/rejected": -204.0972442626953, + "loss": 0.8386, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8917858600616455, + "rewards/margins": 0.9014064073562622, + "rewards/rejected": -1.7931922674179077, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 3.206106870229007e-08, + "logits/chosen": -2.926093339920044, + "logits/rejected": -2.973752021789551, + "logps/chosen": -391.0594482421875, + "logps/rejected": -250.52919006347656, + "loss": 0.7115, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20436550676822662, + "rewards/margins": 0.06553763151168823, + "rewards/rejected": -0.2699030935764313, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 3.435114503816794e-08, + "logits/chosen": -2.756786346435547, + "logits/rejected": -2.73586106300354, + "logps/chosen": -213.4902801513672, + "logps/rejected": -194.86013793945312, + "loss": 0.9076, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.9958206415176392, + "rewards/margins": -0.3495056629180908, + "rewards/rejected": -0.6463149785995483, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 3.66412213740458e-08, + "logits/chosen": -2.3104796409606934, + "logits/rejected": -2.510688543319702, + "logps/chosen": -208.1526641845703, + "logps/rejected": -132.6660614013672, + "loss": 0.8433, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3791996240615845, + "rewards/margins": -0.0840064287185669, + "rewards/rejected": -1.2951933145523071, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 3.893129770992366e-08, + "logits/chosen": -2.6756374835968018, + "logits/rejected": -2.7548515796661377, + "logps/chosen": -264.1112365722656, + "logps/rejected": -234.58169555664062, + "loss": 0.9664, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8757508397102356, + "rewards/margins": -0.38465791940689087, + "rewards/rejected": -0.4910929501056671, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 4.122137404580153e-08, + "logits/chosen": -1.7714996337890625, + "logits/rejected": -2.3600833415985107, + "logps/chosen": -569.4312744140625, + "logps/rejected": -178.4188232421875, + "loss": 1.2667, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3701788187026978, + "rewards/margins": -0.7493028044700623, + "rewards/rejected": -0.6208760142326355, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 4.351145038167938e-08, + "logits/chosen": -2.2468879222869873, + "logits/rejected": -2.3052186965942383, + "logps/chosen": -249.13308715820312, + "logps/rejected": -283.84326171875, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2587800621986389, + "rewards/margins": 0.2753233313560486, + "rewards/rejected": -0.5341034531593323, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 4.5801526717557254e-08, + "logits/chosen": -2.6380553245544434, + "logits/rejected": -2.7128453254699707, + "logps/chosen": -303.8550109863281, + "logps/rejected": -248.16632080078125, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.404636025428772, + "rewards/margins": 1.158783197402954, + "rewards/rejected": -1.5634194612503052, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 4.809160305343511e-08, + "logits/chosen": -2.5582213401794434, + "logits/rejected": -2.7067317962646484, + "logps/chosen": -289.4676818847656, + "logps/rejected": -191.08331298828125, + "loss": 0.6715, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07961349189281464, + "rewards/margins": 0.05718378722667694, + "rewards/rejected": -0.13679727911949158, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 5.038167938931297e-08, + "logits/chosen": -3.0572586059570312, + "logits/rejected": -2.997605562210083, + "logps/chosen": -205.65980529785156, + "logps/rejected": -183.48678588867188, + "loss": 0.8547, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1849091798067093, + "rewards/margins": -0.21118591725826263, + "rewards/rejected": 0.026276730000972748, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 5.267175572519083e-08, + "logits/chosen": -2.3936007022857666, + "logits/rejected": -2.231642246246338, + "logps/chosen": -310.0625, + "logps/rejected": -344.1485595703125, + "loss": 0.9774, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8411752581596375, + "rewards/margins": 0.14402271807193756, + "rewards/rejected": -0.9851980209350586, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 5.49618320610687e-08, + "logits/chosen": -2.0176007747650146, + "logits/rejected": -2.031891107559204, + "logps/chosen": -385.6308288574219, + "logps/rejected": -388.72216796875, + "loss": 1.1028, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9154019355773926, + "rewards/margins": -0.5807561874389648, + "rewards/rejected": -0.3346458077430725, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 5.7251908396946565e-08, + "logits/chosen": -1.946035385131836, + "logits/rejected": -2.0387814044952393, + "logps/chosen": -662.7486572265625, + "logps/rejected": -401.9673156738281, + "loss": 0.6039, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20486459136009216, + "rewards/margins": 0.6153227090835571, + "rewards/rejected": -0.8201872706413269, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 5.9541984732824424e-08, + "logits/chosen": -2.617624044418335, + "logits/rejected": -2.5860390663146973, + "logps/chosen": -302.3328857421875, + "logps/rejected": -244.5016632080078, + "loss": 3.552, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.140359878540039, + "rewards/margins": -2.900796890258789, + "rewards/rejected": -1.23956298828125, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 6.183206106870229e-08, + "logits/chosen": -2.105443000793457, + "logits/rejected": -2.173057794570923, + "logps/chosen": -215.8130340576172, + "logps/rejected": -201.30670166015625, + "loss": 0.5505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2859535217285156, + "rewards/margins": 0.4325997829437256, + "rewards/rejected": -0.7185533046722412, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 6.412213740458014e-08, + "logits/chosen": -2.8369719982147217, + "logits/rejected": -2.5481364727020264, + "logps/chosen": -388.47271728515625, + "logps/rejected": -444.9051208496094, + "loss": 0.5759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06337395310401917, + "rewards/margins": 0.3184322714805603, + "rewards/rejected": -0.3818061947822571, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 6.641221374045801e-08, + "logits/chosen": -1.941672444343567, + "logits/rejected": -2.2397942543029785, + "logps/chosen": -377.6513671875, + "logps/rejected": -308.6730041503906, + "loss": 0.8989, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5043485164642334, + "rewards/margins": -0.16397836804389954, + "rewards/rejected": -0.3403700888156891, + "step": 29 + }, + { + "epoch": 0.0, + "learning_rate": 6.870229007633587e-08, + "logits/chosen": -2.2374773025512695, + "logits/rejected": -2.1506595611572266, + "logps/chosen": -183.03050231933594, + "logps/rejected": -204.79141235351562, + "loss": 0.4557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15584778785705566, + "rewards/margins": 0.7919732332229614, + "rewards/rejected": -0.9478210210800171, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 7.099236641221374e-08, + "logits/chosen": -2.4230284690856934, + "logits/rejected": -2.363673210144043, + "logps/chosen": -167.5436553955078, + "logps/rejected": -204.4584503173828, + "loss": 1.1423, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8426038026809692, + "rewards/margins": -0.6290072202682495, + "rewards/rejected": -0.21359659731388092, + "step": 31 + }, + { + "epoch": 0.0, + "learning_rate": 7.32824427480916e-08, + "logits/chosen": -2.582185983657837, + "logits/rejected": -2.49007511138916, + "logps/chosen": -273.457275390625, + "logps/rejected": -211.5026397705078, + "loss": 0.8905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.554813027381897, + "rewards/margins": -0.2179584801197052, + "rewards/rejected": -0.3368545472621918, + "step": 32 + }, + { + "epoch": 0.0, + "learning_rate": 7.557251908396946e-08, + "logits/chosen": -2.045586109161377, + "logits/rejected": -1.5540440082550049, + "logps/chosen": -441.2231140136719, + "logps/rejected": -401.49237060546875, + "loss": 0.5288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14635416865348816, + "rewards/margins": 0.3988250195980072, + "rewards/rejected": -0.5451791882514954, + "step": 33 + }, + { + "epoch": 0.0, + "learning_rate": 7.786259541984733e-08, + "logits/chosen": -1.6741193532943726, + "logits/rejected": -1.874808669090271, + "logps/chosen": -258.84381103515625, + "logps/rejected": -217.111328125, + "loss": 0.9389, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8949853181838989, + "rewards/margins": -0.3878706693649292, + "rewards/rejected": -0.5071146488189697, + "step": 34 + }, + { + "epoch": 0.0, + "learning_rate": 8.015267175572519e-08, + "logits/chosen": -1.985948085784912, + "logits/rejected": -2.0426454544067383, + "logps/chosen": -178.3227996826172, + "logps/rejected": -179.11599731445312, + "loss": 0.6584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0549643449485302, + "rewards/margins": 0.08349791169166565, + "rewards/rejected": -0.13846226036548615, + "step": 35 + }, + { + "epoch": 0.0, + "learning_rate": 8.244274809160306e-08, + "logits/chosen": -2.4873952865600586, + "logits/rejected": -2.498530626296997, + "logps/chosen": -335.0387878417969, + "logps/rejected": -400.9539794921875, + "loss": 1.022, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9617574214935303, + "rewards/margins": -0.45800772309303284, + "rewards/rejected": -0.503749668598175, + "step": 36 + }, + { + "epoch": 0.0, + "learning_rate": 8.473282442748092e-08, + "logits/chosen": -1.659022569656372, + "logits/rejected": -1.9016667604446411, + "logps/chosen": -235.52261352539062, + "logps/rejected": -167.04910278320312, + "loss": 0.9387, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0728442668914795, + "rewards/margins": -0.2747955024242401, + "rewards/rejected": -0.7980486750602722, + "step": 37 + }, + { + "epoch": 0.0, + "learning_rate": 8.702290076335876e-08, + "logits/chosen": -2.6436357498168945, + "logits/rejected": -2.3810408115386963, + "logps/chosen": -124.76078033447266, + "logps/rejected": -286.326904296875, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16016040742397308, + "rewards/margins": 0.19057410955429077, + "rewards/rejected": -0.030413687229156494, + "step": 38 + }, + { + "epoch": 0.0, + "learning_rate": 8.931297709923663e-08, + "logits/chosen": -2.5750656127929688, + "logits/rejected": -2.2292747497558594, + "logps/chosen": -154.95602416992188, + "logps/rejected": -302.56072998046875, + "loss": 0.6327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5827468633651733, + "rewards/margins": 0.3230225741863251, + "rewards/rejected": -0.905769407749176, + "step": 39 + }, + { + "epoch": 0.0, + "learning_rate": 9.160305343511451e-08, + "logits/chosen": -2.409855842590332, + "logits/rejected": -2.422008991241455, + "logps/chosen": -327.82916259765625, + "logps/rejected": -278.3985595703125, + "loss": 0.6321, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35604822635650635, + "rewards/margins": 0.20897354185581207, + "rewards/rejected": -0.5650217533111572, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 9.389312977099237e-08, + "logits/chosen": -2.192779779434204, + "logits/rejected": -1.9526921510696411, + "logps/chosen": -342.6790771484375, + "logps/rejected": -410.8804016113281, + "loss": 0.6475, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16011175513267517, + "rewards/margins": 0.1761222779750824, + "rewards/rejected": -0.33623403310775757, + "step": 41 + }, + { + "epoch": 0.0, + "learning_rate": 9.618320610687021e-08, + "logits/chosen": -2.737168788909912, + "logits/rejected": -2.723001718521118, + "logps/chosen": -189.64080810546875, + "logps/rejected": -219.5543670654297, + "loss": 0.8542, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20788323879241943, + "rewards/margins": -0.24455073475837708, + "rewards/rejected": 0.03666749596595764, + "step": 42 + }, + { + "epoch": 0.0, + "learning_rate": 9.847328244274808e-08, + "logits/chosen": -2.3815670013427734, + "logits/rejected": -2.4664697647094727, + "logps/chosen": -340.20904541015625, + "logps/rejected": -269.360107421875, + "loss": 0.8657, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.341864675283432, + "rewards/margins": -0.25245144963264465, + "rewards/rejected": -0.08941324055194855, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 1.0076335877862595e-07, + "logits/chosen": -1.6491082906723022, + "logits/rejected": -1.9059059619903564, + "logps/chosen": -566.0816650390625, + "logps/rejected": -323.6033935546875, + "loss": 0.7528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42247000336647034, + "rewards/margins": 0.0023079756647348404, + "rewards/rejected": -0.4247779846191406, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 1.0305343511450381e-07, + "logits/chosen": -2.6496200561523438, + "logits/rejected": -2.620922327041626, + "logps/chosen": -249.3474884033203, + "logps/rejected": -173.9456024169922, + "loss": 1.0469, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7389885783195496, + "rewards/margins": -0.43665528297424316, + "rewards/rejected": -0.3023332357406616, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 1.0534351145038167e-07, + "logits/chosen": -2.016296148300171, + "logits/rejected": -2.1679000854492188, + "logps/chosen": -329.47515869140625, + "logps/rejected": -260.34332275390625, + "loss": 0.7078, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21543511748313904, + "rewards/margins": -0.02174924686551094, + "rewards/rejected": -0.1936858743429184, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 1.0763358778625953e-07, + "logits/chosen": -2.6779704093933105, + "logits/rejected": -2.898040771484375, + "logps/chosen": -408.3302917480469, + "logps/rejected": -216.73324584960938, + "loss": 0.831, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.33390918374061584, + "rewards/margins": -0.21242284774780273, + "rewards/rejected": -0.12148632109165192, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 1.099236641221374e-07, + "logits/chosen": -2.5079593658447266, + "logits/rejected": -2.56718373298645, + "logps/chosen": -308.8442077636719, + "logps/rejected": -257.7294921875, + "loss": 0.635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29763063788414, + "rewards/margins": 0.4401213228702545, + "rewards/rejected": -0.7377519607543945, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 1.1221374045801526e-07, + "logits/chosen": -2.2288522720336914, + "logits/rejected": -2.4051127433776855, + "logps/chosen": -321.2168884277344, + "logps/rejected": -232.0726318359375, + "loss": 0.9295, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.449910968542099, + "rewards/margins": -0.33836498856544495, + "rewards/rejected": -0.11154599487781525, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 1.1450381679389313e-07, + "logits/chosen": -2.3915767669677734, + "logits/rejected": -2.5971803665161133, + "logps/chosen": -312.1945495605469, + "logps/rejected": -238.73965454101562, + "loss": 0.7961, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3704115152359009, + "rewards/margins": -0.1450347602367401, + "rewards/rejected": -0.22537675499916077, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 1.1679389312977098e-07, + "logits/chosen": -2.0661706924438477, + "logits/rejected": -2.253471851348877, + "logps/chosen": -329.1202392578125, + "logps/rejected": -203.4114532470703, + "loss": 0.9351, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5144922733306885, + "rewards/margins": -0.26344960927963257, + "rewards/rejected": -0.2510427236557007, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 1.1908396946564885e-07, + "logits/chosen": -1.9238640069961548, + "logits/rejected": -2.3222665786743164, + "logps/chosen": -502.28546142578125, + "logps/rejected": -392.35601806640625, + "loss": 0.8792, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6400651335716248, + "rewards/margins": -0.24453896284103394, + "rewards/rejected": -0.3955261707305908, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 1.2137404580152673e-07, + "logits/chosen": -2.311481475830078, + "logits/rejected": -2.627573013305664, + "logps/chosen": -276.9031982421875, + "logps/rejected": -160.40509033203125, + "loss": 1.0029, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1920796632766724, + "rewards/margins": -0.276699036359787, + "rewards/rejected": -0.915380597114563, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 1.2366412213740458e-07, + "logits/chosen": -2.7160816192626953, + "logits/rejected": -2.738851547241211, + "logps/chosen": -203.5990447998047, + "logps/rejected": -143.712890625, + "loss": 0.829, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.29252925515174866, + "rewards/margins": -0.23242494463920593, + "rewards/rejected": -0.06010432541370392, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 1.2595419847328243e-07, + "logits/chosen": -2.1126675605773926, + "logits/rejected": -2.1881027221679688, + "logps/chosen": -301.782958984375, + "logps/rejected": -351.26629638671875, + "loss": 0.6394, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16022339463233948, + "rewards/margins": 0.2575991749763489, + "rewards/rejected": -0.41782253980636597, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 1.2824427480916029e-07, + "logits/chosen": -2.570765972137451, + "logits/rejected": -2.315011978149414, + "logps/chosen": -177.96237182617188, + "logps/rejected": -254.08297729492188, + "loss": 0.5526, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07504735887050629, + "rewards/margins": 0.3572021722793579, + "rewards/rejected": -0.4322494864463806, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 1.3053435114503817e-07, + "logits/chosen": -2.153360366821289, + "logits/rejected": -2.0792431831359863, + "logps/chosen": -128.224853515625, + "logps/rejected": -167.3650665283203, + "loss": 0.6689, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4980829060077667, + "rewards/margins": 0.19277578592300415, + "rewards/rejected": -0.6908587217330933, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 1.3282442748091602e-07, + "logits/chosen": -2.4488320350646973, + "logits/rejected": -2.4893975257873535, + "logps/chosen": -187.1078338623047, + "logps/rejected": -154.1192626953125, + "loss": 0.7267, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.15653039515018463, + "rewards/margins": -0.04668125510215759, + "rewards/rejected": -0.10984914004802704, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 1.3511450381679387e-07, + "logits/chosen": -2.479457378387451, + "logits/rejected": -2.5191397666931152, + "logps/chosen": -95.935546875, + "logps/rejected": -116.56040954589844, + "loss": 1.2151, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.156261920928955, + "rewards/margins": -0.4258803129196167, + "rewards/rejected": -1.7303813695907593, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 1.3740458015267175e-07, + "logits/chosen": -2.5745725631713867, + "logits/rejected": -2.43011736869812, + "logps/chosen": -240.24002075195312, + "logps/rejected": -226.78219604492188, + "loss": 0.8975, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.38892048597335815, + "rewards/margins": -0.3072735369205475, + "rewards/rejected": -0.08164696395397186, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 1.396946564885496e-07, + "logits/chosen": -2.1350836753845215, + "logits/rejected": -2.589618682861328, + "logps/chosen": -195.99359130859375, + "logps/rejected": -97.71050262451172, + "loss": 0.9705, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5759148001670837, + "rewards/margins": -0.31885644793510437, + "rewards/rejected": -0.25705835223197937, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 1.4198473282442748e-07, + "logits/chosen": -2.187352180480957, + "logits/rejected": -2.3793580532073975, + "logps/chosen": -332.056640625, + "logps/rejected": -278.0309753417969, + "loss": 0.8427, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.47337979078292847, + "rewards/margins": -0.24772848188877106, + "rewards/rejected": -0.2256513237953186, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 1.4427480916030533e-07, + "logits/chosen": -2.7211153507232666, + "logits/rejected": -2.7480669021606445, + "logps/chosen": -153.7807159423828, + "logps/rejected": -188.0388641357422, + "loss": 1.0189, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.42654433846473694, + "rewards/margins": -0.40959441661834717, + "rewards/rejected": -0.016949936747550964, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 1.465648854961832e-07, + "logits/chosen": -2.3811964988708496, + "logits/rejected": -2.434969186782837, + "logps/chosen": -234.04910278320312, + "logps/rejected": -212.18319702148438, + "loss": 0.7733, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5168054699897766, + "rewards/margins": -0.12524648010730743, + "rewards/rejected": -0.39155900478363037, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 1.4885496183206107e-07, + "logits/chosen": -2.744743824005127, + "logits/rejected": -2.759885311126709, + "logps/chosen": -225.70249938964844, + "logps/rejected": -205.3494415283203, + "loss": 0.7037, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1410490721464157, + "rewards/margins": 0.027377553284168243, + "rewards/rejected": -0.16842660307884216, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 1.5114503816793892e-07, + "logits/chosen": -2.4948062896728516, + "logits/rejected": -2.325800657272339, + "logps/chosen": -127.26695251464844, + "logps/rejected": -258.8333740234375, + "loss": 0.6647, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4185584485530853, + "rewards/margins": 0.4995752274990082, + "rewards/rejected": -0.9181336164474487, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 1.5343511450381677e-07, + "logits/chosen": -2.1426188945770264, + "logits/rejected": -2.2589304447174072, + "logps/chosen": -433.1138610839844, + "logps/rejected": -489.08758544921875, + "loss": 0.6354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24940542876720428, + "rewards/margins": 0.16110187768936157, + "rewards/rejected": -0.41050732135772705, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 1.5572519083969465e-07, + "logits/chosen": -2.3982229232788086, + "logits/rejected": -2.653325080871582, + "logps/chosen": -486.97369384765625, + "logps/rejected": -341.5960693359375, + "loss": 0.8621, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.6063442826271057, + "rewards/margins": -0.2576339840888977, + "rewards/rejected": -0.34871023893356323, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 1.580152671755725e-07, + "logits/chosen": -2.156904935836792, + "logits/rejected": -2.197748899459839, + "logps/chosen": -164.81756591796875, + "logps/rejected": -178.4713134765625, + "loss": 0.7337, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2953484356403351, + "rewards/margins": -0.036574773490428925, + "rewards/rejected": -0.25877368450164795, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 1.6030534351145038e-07, + "logits/chosen": -2.2366533279418945, + "logits/rejected": -2.0623035430908203, + "logps/chosen": -191.4144744873047, + "logps/rejected": -239.08575439453125, + "loss": 0.6619, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8510591983795166, + "rewards/margins": 1.085920810699463, + "rewards/rejected": -1.9369800090789795, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 1.6259541984732824e-07, + "logits/chosen": -2.365171432495117, + "logits/rejected": -2.4281835556030273, + "logps/chosen": -181.4813232421875, + "logps/rejected": -181.92996215820312, + "loss": 0.5506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7596848011016846, + "rewards/margins": 0.48701560497283936, + "rewards/rejected": -1.246700406074524, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 1.6488549618320612e-07, + "logits/chosen": -2.002502202987671, + "logits/rejected": -2.327653169631958, + "logps/chosen": -333.9010009765625, + "logps/rejected": -114.25791931152344, + "loss": 0.773, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3535059988498688, + "rewards/margins": -0.12664301693439484, + "rewards/rejected": -0.22686301171779633, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 1.6717557251908397e-07, + "logits/chosen": -2.3541483879089355, + "logits/rejected": -2.298828125, + "logps/chosen": -255.6683349609375, + "logps/rejected": -276.131591796875, + "loss": 0.791, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3042502701282501, + "rewards/margins": -0.12855060398578644, + "rewards/rejected": -0.1756996512413025, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 1.6946564885496185e-07, + "logits/chosen": -2.324463367462158, + "logits/rejected": -2.5767199993133545, + "logps/chosen": -257.05419921875, + "logps/rejected": -258.8433837890625, + "loss": 0.7725, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1999940127134323, + "rewards/margins": -0.03499063849449158, + "rewards/rejected": -0.16500338912010193, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 1.7175572519083967e-07, + "logits/chosen": -2.066019058227539, + "logits/rejected": -2.5518083572387695, + "logps/chosen": -313.2525939941406, + "logps/rejected": -142.41119384765625, + "loss": 1.0653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5495986342430115, + "rewards/margins": -0.418325275182724, + "rewards/rejected": -0.13127344846725464, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 1.7404580152671753e-07, + "logits/chosen": -2.593916177749634, + "logits/rejected": -2.5557267665863037, + "logps/chosen": -231.92092895507812, + "logps/rejected": -302.9488525390625, + "loss": 0.8742, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.3385533392429352, + "rewards/margins": -0.299041211605072, + "rewards/rejected": -0.03951215371489525, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 1.763358778625954e-07, + "logits/chosen": -2.154815196990967, + "logits/rejected": -2.154663562774658, + "logps/chosen": -327.425537109375, + "logps/rejected": -325.5865783691406, + "loss": 0.6032, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3638656735420227, + "rewards/margins": 0.27159053087234497, + "rewards/rejected": -0.6354561448097229, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 1.7862595419847326e-07, + "logits/chosen": -1.9569071531295776, + "logits/rejected": -2.0589492321014404, + "logps/chosen": -472.8673400878906, + "logps/rejected": -303.68603515625, + "loss": 0.5483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17242909967899323, + "rewards/margins": 0.35100871324539185, + "rewards/rejected": -0.5234378576278687, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 1.8091603053435114e-07, + "logits/chosen": -2.4680259227752686, + "logits/rejected": -2.428426742553711, + "logps/chosen": -432.3888244628906, + "logps/rejected": -472.05572509765625, + "loss": 0.7613, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4350716769695282, + "rewards/margins": -0.08119373768568039, + "rewards/rejected": -0.3538779020309448, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 1.8320610687022902e-07, + "logits/chosen": -2.7242941856384277, + "logits/rejected": -2.7611584663391113, + "logps/chosen": -365.14398193359375, + "logps/rejected": -361.33770751953125, + "loss": 0.5273, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04933638870716095, + "rewards/margins": 0.5239064693450928, + "rewards/rejected": -0.5732429623603821, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 1.8549618320610687e-07, + "logits/chosen": -2.1398444175720215, + "logits/rejected": -2.398286819458008, + "logps/chosen": -325.33673095703125, + "logps/rejected": -289.38037109375, + "loss": 0.834, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5741604566574097, + "rewards/margins": -0.16153177618980408, + "rewards/rejected": -0.4126286506652832, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 1.8778625954198475e-07, + "logits/chosen": -2.3270933628082275, + "logits/rejected": -2.2321386337280273, + "logps/chosen": -284.534423828125, + "logps/rejected": -263.6170654296875, + "loss": 0.9006, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.46427351236343384, + "rewards/margins": -0.26414182782173157, + "rewards/rejected": -0.20013171434402466, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 1.9007633587786258e-07, + "logits/chosen": -2.4947450160980225, + "logits/rejected": -2.501957416534424, + "logps/chosen": -205.22103881835938, + "logps/rejected": -320.90582275390625, + "loss": 0.5123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3416628837585449, + "rewards/margins": 0.4375176429748535, + "rewards/rejected": -0.7791805267333984, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 1.9236641221374043e-07, + "logits/chosen": -2.7704131603240967, + "logits/rejected": -2.6021013259887695, + "logps/chosen": -288.86968994140625, + "logps/rejected": -212.87905883789062, + "loss": 0.568, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.045164451003074646, + "rewards/margins": 0.29489991068840027, + "rewards/rejected": -0.3400643467903137, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 1.946564885496183e-07, + "logits/chosen": -2.3054749965667725, + "logits/rejected": -2.4619479179382324, + "logps/chosen": -203.48092651367188, + "logps/rejected": -160.01266479492188, + "loss": 0.7066, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2686072587966919, + "rewards/margins": -0.007550451904535294, + "rewards/rejected": -0.2610568106174469, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 1.9694656488549616e-07, + "logits/chosen": -2.2914071083068848, + "logits/rejected": -2.189410448074341, + "logps/chosen": -212.2104949951172, + "logps/rejected": -309.00341796875, + "loss": 0.6814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1631581336259842, + "rewards/margins": 0.1299111694097519, + "rewards/rejected": -0.2930693030357361, + "step": 86 + }, + { + "epoch": 0.01, + "learning_rate": 1.9923664122137404e-07, + "logits/chosen": -2.1116058826446533, + "logits/rejected": -2.1005420684814453, + "logps/chosen": -280.9657897949219, + "logps/rejected": -277.23028564453125, + "loss": 1.1843, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8561821579933167, + "rewards/margins": -0.358293354511261, + "rewards/rejected": -0.49788886308670044, + "step": 87 + }, + { + "epoch": 0.01, + "learning_rate": 2.015267175572519e-07, + "logits/chosen": -2.3410706520080566, + "logits/rejected": -2.322293281555176, + "logps/chosen": -214.05084228515625, + "logps/rejected": -200.9133758544922, + "loss": 0.6079, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06821799278259277, + "rewards/margins": 0.36315852403640747, + "rewards/rejected": -0.43137648701667786, + "step": 88 + }, + { + "epoch": 0.01, + "learning_rate": 2.0381679389312977e-07, + "logits/chosen": -2.948211908340454, + "logits/rejected": -2.836721897125244, + "logps/chosen": -306.71575927734375, + "logps/rejected": -279.5546569824219, + "loss": 0.5892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14219284057617188, + "rewards/margins": 0.29721298813819885, + "rewards/rejected": -0.4394058287143707, + "step": 89 + }, + { + "epoch": 0.01, + "learning_rate": 2.0610687022900762e-07, + "logits/chosen": -2.7756173610687256, + "logits/rejected": -2.732107162475586, + "logps/chosen": -214.74972534179688, + "logps/rejected": -204.3527374267578, + "loss": 0.5037, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06205669045448303, + "rewards/margins": 0.5531842708587646, + "rewards/rejected": -0.6152409911155701, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 2.083969465648855e-07, + "logits/chosen": -2.372587203979492, + "logits/rejected": -2.3920514583587646, + "logps/chosen": -315.33111572265625, + "logps/rejected": -294.655029296875, + "loss": 0.6054, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08699588477611542, + "rewards/margins": 0.2688751816749573, + "rewards/rejected": -0.3558710813522339, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 2.1068702290076333e-07, + "logits/chosen": -2.8618431091308594, + "logits/rejected": -2.7882227897644043, + "logps/chosen": -193.18939208984375, + "logps/rejected": -237.15237426757812, + "loss": 0.6541, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024995815008878708, + "rewards/margins": 0.21004626154899597, + "rewards/rejected": -0.18505047261714935, + "step": 92 + }, + { + "epoch": 0.01, + "learning_rate": 2.129770992366412e-07, + "logits/chosen": -2.506242275238037, + "logits/rejected": -2.6615095138549805, + "logps/chosen": -354.70806884765625, + "logps/rejected": -254.6879119873047, + "loss": 0.6314, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11528228223323822, + "rewards/margins": 0.2625873386859894, + "rewards/rejected": -0.3778696060180664, + "step": 93 + }, + { + "epoch": 0.01, + "learning_rate": 2.1526717557251906e-07, + "logits/chosen": -1.602285623550415, + "logits/rejected": -1.859665870666504, + "logps/chosen": -196.45779418945312, + "logps/rejected": -147.09693908691406, + "loss": 0.8297, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4564325213432312, + "rewards/margins": -0.1847381591796875, + "rewards/rejected": -0.2716943919658661, + "step": 94 + }, + { + "epoch": 0.01, + "learning_rate": 2.1755725190839694e-07, + "logits/chosen": -2.744333505630493, + "logits/rejected": -2.8361194133758545, + "logps/chosen": -268.88232421875, + "logps/rejected": -145.11460876464844, + "loss": 0.6883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09802322089672089, + "rewards/margins": 0.09315426647663116, + "rewards/rejected": -0.19117748737335205, + "step": 95 + }, + { + "epoch": 0.01, + "learning_rate": 2.198473282442748e-07, + "logits/chosen": -2.767359495162964, + "logits/rejected": -2.9254443645477295, + "logps/chosen": -164.02508544921875, + "logps/rejected": -237.54248046875, + "loss": 0.5351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02290382608771324, + "rewards/margins": 0.38309746980667114, + "rewards/rejected": -0.4060012996196747, + "step": 96 + }, + { + "epoch": 0.01, + "learning_rate": 2.2213740458015267e-07, + "logits/chosen": -1.9021170139312744, + "logits/rejected": -1.805161476135254, + "logps/chosen": -180.55393981933594, + "logps/rejected": -286.18463134765625, + "loss": 0.5789, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3568944036960602, + "rewards/margins": 0.3100224733352661, + "rewards/rejected": -0.6669168472290039, + "step": 97 + }, + { + "epoch": 0.01, + "learning_rate": 2.2442748091603053e-07, + "logits/chosen": -2.1188132762908936, + "logits/rejected": -2.2542004585266113, + "logps/chosen": -492.9274597167969, + "logps/rejected": -340.6473693847656, + "loss": 0.5288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4381915330886841, + "rewards/margins": 0.41978058218955994, + "rewards/rejected": -0.8579720854759216, + "step": 98 + }, + { + "epoch": 0.01, + "learning_rate": 2.267175572519084e-07, + "logits/chosen": -2.280924081802368, + "logits/rejected": -2.2408645153045654, + "logps/chosen": -197.37806701660156, + "logps/rejected": -204.40785217285156, + "loss": 0.6471, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25138574838638306, + "rewards/margins": 0.19469879567623138, + "rewards/rejected": -0.44608455896377563, + "step": 99 + }, + { + "epoch": 0.01, + "learning_rate": 2.2900763358778626e-07, + "logits/chosen": -2.3035988807678223, + "logits/rejected": -2.0804824829101562, + "logps/chosen": -161.7123260498047, + "logps/rejected": -253.2257537841797, + "loss": 0.5706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17242667078971863, + "rewards/margins": 0.28590071201324463, + "rewards/rejected": -0.45832741260528564, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 2.3129770992366408e-07, + "logits/chosen": -2.5587899684906006, + "logits/rejected": -2.6784329414367676, + "logps/chosen": -249.72059631347656, + "logps/rejected": -134.2333221435547, + "loss": 0.5269, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01845679245889187, + "rewards/margins": 0.41758784651756287, + "rewards/rejected": -0.39913108944892883, + "step": 101 + }, + { + "epoch": 0.01, + "learning_rate": 2.3358778625954196e-07, + "logits/chosen": -2.1819863319396973, + "logits/rejected": -1.8228100538253784, + "logps/chosen": -182.82229614257812, + "logps/rejected": -270.3923034667969, + "loss": 0.6435, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24791431427001953, + "rewards/margins": 0.22106218338012695, + "rewards/rejected": -0.4689764976501465, + "step": 102 + }, + { + "epoch": 0.01, + "learning_rate": 2.3587786259541982e-07, + "logits/chosen": -2.6916818618774414, + "logits/rejected": -2.7056612968444824, + "logps/chosen": -246.18907165527344, + "logps/rejected": -268.25274658203125, + "loss": 0.6728, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1263716071844101, + "rewards/margins": 0.10814264416694641, + "rewards/rejected": -0.2345142513513565, + "step": 103 + }, + { + "epoch": 0.01, + "learning_rate": 2.381679389312977e-07, + "logits/chosen": -1.949310064315796, + "logits/rejected": -2.2964773178100586, + "logps/chosen": -375.954345703125, + "logps/rejected": -205.28775024414062, + "loss": 0.5834, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3803815245628357, + "rewards/margins": 0.33243268728256226, + "rewards/rejected": -0.7128142714500427, + "step": 104 + }, + { + "epoch": 0.01, + "learning_rate": 2.4045801526717555e-07, + "logits/chosen": -2.5010626316070557, + "logits/rejected": -2.7945780754089355, + "logps/chosen": -334.40924072265625, + "logps/rejected": -138.94381713867188, + "loss": 1.018, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8968489766120911, + "rewards/margins": -0.030568808317184448, + "rewards/rejected": -0.866280198097229, + "step": 105 + }, + { + "epoch": 0.01, + "learning_rate": 2.4274809160305345e-07, + "logits/chosen": -2.363703489303589, + "logits/rejected": -2.328928232192993, + "logps/chosen": -331.54742431640625, + "logps/rejected": -318.25543212890625, + "loss": 2.0823, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.1888794898986816, + "rewards/margins": -1.5146251916885376, + "rewards/rejected": -0.6742541790008545, + "step": 106 + }, + { + "epoch": 0.01, + "learning_rate": 2.450381679389313e-07, + "logits/chosen": -2.5637428760528564, + "logits/rejected": -2.646881103515625, + "logps/chosen": -178.76820373535156, + "logps/rejected": -242.5869140625, + "loss": 0.5785, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32375162839889526, + "rewards/margins": 0.33530884981155396, + "rewards/rejected": -0.6590604782104492, + "step": 107 + }, + { + "epoch": 0.01, + "learning_rate": 2.4732824427480916e-07, + "logits/chosen": -2.934439182281494, + "logits/rejected": -2.8071095943450928, + "logps/chosen": -220.26651000976562, + "logps/rejected": -286.7970886230469, + "loss": 0.55, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16010406613349915, + "rewards/margins": 0.3499263525009155, + "rewards/rejected": -0.18982228636741638, + "step": 108 + }, + { + "epoch": 0.01, + "learning_rate": 2.49618320610687e-07, + "logits/chosen": -2.852649211883545, + "logits/rejected": -2.8970160484313965, + "logps/chosen": -119.42755889892578, + "logps/rejected": -89.35293579101562, + "loss": 0.6321, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28762611746788025, + "rewards/margins": 0.20279823243618011, + "rewards/rejected": -0.49042433500289917, + "step": 109 + }, + { + "epoch": 0.01, + "learning_rate": 2.5190839694656487e-07, + "logits/chosen": -2.8163609504699707, + "logits/rejected": -2.8482823371887207, + "logps/chosen": -263.84033203125, + "logps/rejected": -238.72097778320312, + "loss": 0.7697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46918508410453796, + "rewards/margins": -0.023306522518396378, + "rewards/rejected": -0.4458785355091095, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 2.541984732824427e-07, + "logits/chosen": -2.2213683128356934, + "logits/rejected": -2.3973162174224854, + "logps/chosen": -314.09014892578125, + "logps/rejected": -155.99093627929688, + "loss": 0.7096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1589728444814682, + "rewards/margins": 0.06125228852033615, + "rewards/rejected": -0.22022512555122375, + "step": 111 + }, + { + "epoch": 0.01, + "learning_rate": 2.5648854961832057e-07, + "logits/chosen": -2.2919867038726807, + "logits/rejected": -2.574983835220337, + "logps/chosen": -466.041748046875, + "logps/rejected": -295.208251953125, + "loss": 0.7731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3285427987575531, + "rewards/margins": -0.0628364235162735, + "rewards/rejected": -0.2657063603401184, + "step": 112 + }, + { + "epoch": 0.01, + "learning_rate": 2.587786259541985e-07, + "logits/chosen": -2.403656244277954, + "logits/rejected": -2.4967668056488037, + "logps/chosen": -360.6982421875, + "logps/rejected": -344.580810546875, + "loss": 0.7106, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18780887126922607, + "rewards/margins": 0.16123639047145844, + "rewards/rejected": -0.3490452766418457, + "step": 113 + }, + { + "epoch": 0.01, + "learning_rate": 2.6106870229007633e-07, + "logits/chosen": -2.7740559577941895, + "logits/rejected": -2.7963755130767822, + "logps/chosen": -221.16213989257812, + "logps/rejected": -149.54148864746094, + "loss": 0.8459, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7785383462905884, + "rewards/margins": -0.21432536840438843, + "rewards/rejected": -0.5642129778862, + "step": 114 + }, + { + "epoch": 0.01, + "learning_rate": 2.633587786259542e-07, + "logits/chosen": -2.2895030975341797, + "logits/rejected": -2.077768564224243, + "logps/chosen": -296.2397155761719, + "logps/rejected": -218.25128173828125, + "loss": 0.7553, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2457389533519745, + "rewards/margins": -0.09706494957208633, + "rewards/rejected": -0.14867401123046875, + "step": 115 + }, + { + "epoch": 0.01, + "learning_rate": 2.6564885496183204e-07, + "logits/chosen": -1.863695740699768, + "logits/rejected": -1.85211181640625, + "logps/chosen": -502.9945983886719, + "logps/rejected": -411.41363525390625, + "loss": 0.6172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03159421682357788, + "rewards/margins": 0.2721397280693054, + "rewards/rejected": -0.3037339448928833, + "step": 116 + }, + { + "epoch": 0.01, + "learning_rate": 2.6793893129770994e-07, + "logits/chosen": -2.275357723236084, + "logits/rejected": -2.6825430393218994, + "logps/chosen": -216.376708984375, + "logps/rejected": -162.83584594726562, + "loss": 1.0123, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5102656483650208, + "rewards/margins": -0.17860198020935059, + "rewards/rejected": -0.33166369795799255, + "step": 117 + }, + { + "epoch": 0.01, + "learning_rate": 2.7022900763358774e-07, + "logits/chosen": -2.1855790615081787, + "logits/rejected": -1.9890434741973877, + "logps/chosen": -373.5499572753906, + "logps/rejected": -495.36297607421875, + "loss": 0.8367, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5336794853210449, + "rewards/margins": -0.18511012196540833, + "rewards/rejected": -0.3485693037509918, + "step": 118 + }, + { + "epoch": 0.01, + "learning_rate": 2.7251908396946565e-07, + "logits/chosen": -2.451413154602051, + "logits/rejected": -2.4292681217193604, + "logps/chosen": -385.5609436035156, + "logps/rejected": -313.583251953125, + "loss": 0.7742, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28066447377204895, + "rewards/margins": 0.07001002132892609, + "rewards/rejected": -0.35067445039749146, + "step": 119 + }, + { + "epoch": 0.01, + "learning_rate": 2.748091603053435e-07, + "logits/chosen": -2.4460012912750244, + "logits/rejected": -2.3946683406829834, + "logps/chosen": -256.7662658691406, + "logps/rejected": -355.2879638671875, + "loss": 0.4939, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18510684370994568, + "rewards/margins": 0.6432151794433594, + "rewards/rejected": -0.8283219337463379, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 2.7709923664122135e-07, + "logits/chosen": -2.6626040935516357, + "logits/rejected": -2.805788040161133, + "logps/chosen": -197.54364013671875, + "logps/rejected": -132.472412109375, + "loss": 0.8203, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.41000014543533325, + "rewards/margins": -0.18239977955818176, + "rewards/rejected": -0.2276003658771515, + "step": 121 + }, + { + "epoch": 0.01, + "learning_rate": 2.793893129770992e-07, + "logits/chosen": -2.0719516277313232, + "logits/rejected": -1.9942193031311035, + "logps/chosen": -384.7672119140625, + "logps/rejected": -344.4891052246094, + "loss": 0.7741, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3266003131866455, + "rewards/margins": 0.08221480995416641, + "rewards/rejected": -0.4088151156902313, + "step": 122 + }, + { + "epoch": 0.01, + "learning_rate": 2.816793893129771e-07, + "logits/chosen": -1.9536230564117432, + "logits/rejected": -2.1344528198242188, + "logps/chosen": -158.52890014648438, + "logps/rejected": -166.7032928466797, + "loss": 0.9169, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44162872433662415, + "rewards/margins": -0.25264281034469604, + "rewards/rejected": -0.1889859139919281, + "step": 123 + }, + { + "epoch": 0.01, + "learning_rate": 2.8396946564885496e-07, + "logits/chosen": -2.217358112335205, + "logits/rejected": -2.3112316131591797, + "logps/chosen": -432.5357360839844, + "logps/rejected": -301.468017578125, + "loss": 0.6205, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11137276887893677, + "rewards/margins": 0.27412667870521545, + "rewards/rejected": -0.3854994475841522, + "step": 124 + }, + { + "epoch": 0.01, + "learning_rate": 2.862595419847328e-07, + "logits/chosen": -2.4105687141418457, + "logits/rejected": -2.24484920501709, + "logps/chosen": -125.23345184326172, + "logps/rejected": -287.2042541503906, + "loss": 0.6384, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31639569997787476, + "rewards/margins": 0.4018799662590027, + "rewards/rejected": -0.7182756662368774, + "step": 125 + }, + { + "epoch": 0.01, + "learning_rate": 2.8854961832061067e-07, + "logits/chosen": -1.8933026790618896, + "logits/rejected": -1.8353958129882812, + "logps/chosen": -267.400634765625, + "logps/rejected": -197.3748016357422, + "loss": 1.1702, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.0557467937469482, + "rewards/margins": -0.4830389618873596, + "rewards/rejected": -1.5727078914642334, + "step": 126 + }, + { + "epoch": 0.01, + "learning_rate": 2.908396946564885e-07, + "logits/chosen": -2.325608968734741, + "logits/rejected": -2.2605435848236084, + "logps/chosen": -202.77801513671875, + "logps/rejected": -233.16787719726562, + "loss": 0.5145, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025129303336143494, + "rewards/margins": 0.49178844690322876, + "rewards/rejected": -0.5169177055358887, + "step": 127 + }, + { + "epoch": 0.01, + "learning_rate": 2.931297709923664e-07, + "logits/chosen": -2.4621360301971436, + "logits/rejected": -2.503307342529297, + "logps/chosen": -266.8580627441406, + "logps/rejected": -274.5615539550781, + "loss": 0.4191, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9194144606590271, + "rewards/margins": 0.8027552366256714, + "rewards/rejected": -1.7221696376800537, + "step": 128 + }, + { + "epoch": 0.01, + "learning_rate": 2.9541984732824423e-07, + "logits/chosen": -1.9078782796859741, + "logits/rejected": -2.1347129344940186, + "logps/chosen": -383.4454345703125, + "logps/rejected": -305.10662841796875, + "loss": 0.794, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4981151819229126, + "rewards/margins": 0.09022119641304016, + "rewards/rejected": -0.5883364081382751, + "step": 129 + }, + { + "epoch": 0.01, + "learning_rate": 2.9770992366412213e-07, + "logits/chosen": -2.4092178344726562, + "logits/rejected": -2.209125518798828, + "logps/chosen": -276.4317626953125, + "logps/rejected": -352.6285095214844, + "loss": 0.7639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5463846325874329, + "rewards/margins": 0.43442240357398987, + "rewards/rejected": -0.9808070063591003, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 3e-07, + "logits/chosen": -2.60760498046875, + "logits/rejected": -2.579063653945923, + "logps/chosen": -221.06675720214844, + "logps/rejected": -299.2462158203125, + "loss": 0.5558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.410960853099823, + "rewards/margins": 0.6981890201568604, + "rewards/rejected": -1.1091499328613281, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 2.9996488353037574e-07, + "logits/chosen": -2.4203310012817383, + "logits/rejected": -2.1687204837799072, + "logps/chosen": -269.9006042480469, + "logps/rejected": -222.34347534179688, + "loss": 0.5092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11583442986011505, + "rewards/margins": 0.5534472465515137, + "rewards/rejected": -0.6692817211151123, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 2.999297670607515e-07, + "logits/chosen": -2.3116743564605713, + "logits/rejected": -2.6592659950256348, + "logps/chosen": -405.3973693847656, + "logps/rejected": -354.34088134765625, + "loss": 0.6678, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21169975399971008, + "rewards/margins": 0.464550256729126, + "rewards/rejected": -0.6762501001358032, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 2.9989465059112725e-07, + "logits/chosen": -2.193732976913452, + "logits/rejected": -2.358628749847412, + "logps/chosen": -401.5488586425781, + "logps/rejected": -229.86474609375, + "loss": 1.068, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6789991855621338, + "rewards/margins": -0.4859035015106201, + "rewards/rejected": -0.19309568405151367, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 2.9985953412150295e-07, + "logits/chosen": -2.4940896034240723, + "logits/rejected": -2.642671585083008, + "logps/chosen": -366.9105224609375, + "logps/rejected": -308.8394775390625, + "loss": 0.6062, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5908286571502686, + "rewards/margins": 0.9077165126800537, + "rewards/rejected": -1.4985451698303223, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 2.998244176518787e-07, + "logits/chosen": -2.4374163150787354, + "logits/rejected": -2.7271475791931152, + "logps/chosen": -256.110595703125, + "logps/rejected": -322.71044921875, + "loss": 0.6711, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1832103729248047, + "rewards/margins": 0.3439072370529175, + "rewards/rejected": -0.5271176099777222, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 2.9978930118225446e-07, + "logits/chosen": -2.5402472019195557, + "logits/rejected": -2.6675965785980225, + "logps/chosen": -253.11483764648438, + "logps/rejected": -245.81021118164062, + "loss": 0.6396, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5280009508132935, + "rewards/margins": 0.23283420503139496, + "rewards/rejected": -0.7608351707458496, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 2.997541847126302e-07, + "logits/chosen": -2.4184722900390625, + "logits/rejected": -2.7530837059020996, + "logps/chosen": -278.5036315917969, + "logps/rejected": -193.02859497070312, + "loss": 0.667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24999132752418518, + "rewards/margins": 0.1436251401901245, + "rewards/rejected": -0.3936164677143097, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 2.9971906824300596e-07, + "logits/chosen": -1.8785145282745361, + "logits/rejected": -2.0254995822906494, + "logps/chosen": -276.607421875, + "logps/rejected": -190.68914794921875, + "loss": 0.5252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12761712074279785, + "rewards/margins": 0.5265072584152222, + "rewards/rejected": -0.65412437915802, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 2.996839517733817e-07, + "logits/chosen": -2.9154322147369385, + "logits/rejected": -2.9130477905273438, + "logps/chosen": -226.05641174316406, + "logps/rejected": -166.63555908203125, + "loss": 0.7224, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15829262137413025, + "rewards/margins": -0.017390433698892593, + "rewards/rejected": -0.14090219140052795, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 2.996488353037574e-07, + "logits/chosen": -2.125659942626953, + "logits/rejected": -2.32085919380188, + "logps/chosen": -334.5301513671875, + "logps/rejected": -200.1259002685547, + "loss": 0.9156, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0168920755386353, + "rewards/margins": -0.22885847091674805, + "rewards/rejected": -0.7880336046218872, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 2.996137188341332e-07, + "logits/chosen": -2.479545831680298, + "logits/rejected": -2.322953701019287, + "logps/chosen": -190.66580200195312, + "logps/rejected": -193.99624633789062, + "loss": 0.594, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11371585726737976, + "rewards/margins": 0.32169991731643677, + "rewards/rejected": -0.43541577458381653, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 2.9957860236450893e-07, + "logits/chosen": -2.1649937629699707, + "logits/rejected": -2.376577854156494, + "logps/chosen": -327.2272033691406, + "logps/rejected": -317.0970153808594, + "loss": 0.5592, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20642448961734772, + "rewards/margins": 0.5193942785263062, + "rewards/rejected": -0.7258187532424927, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 2.995434858948847e-07, + "logits/chosen": -2.420531988143921, + "logits/rejected": -2.6374692916870117, + "logps/chosen": -137.49600219726562, + "logps/rejected": -158.76681518554688, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20404651761054993, + "rewards/margins": 0.4299430549144745, + "rewards/rejected": -0.6339895725250244, + "step": 144 + }, + { + "epoch": 0.02, + "learning_rate": 2.9950836942526043e-07, + "logits/chosen": -2.1224117279052734, + "logits/rejected": -2.3761682510375977, + "logps/chosen": -661.471435546875, + "logps/rejected": -348.93670654296875, + "loss": 0.8748, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7318909168243408, + "rewards/margins": -0.21858134865760803, + "rewards/rejected": -0.5133095979690552, + "step": 145 + }, + { + "epoch": 0.02, + "learning_rate": 2.994732529556362e-07, + "logits/chosen": -2.282810926437378, + "logits/rejected": -2.583918809890747, + "logps/chosen": -574.3483276367188, + "logps/rejected": -229.71710205078125, + "loss": 0.6249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24886634945869446, + "rewards/margins": 0.2715320289134979, + "rewards/rejected": -0.5203983783721924, + "step": 146 + }, + { + "epoch": 0.02, + "learning_rate": 2.9943813648601194e-07, + "logits/chosen": -1.9734063148498535, + "logits/rejected": -1.997890830039978, + "logps/chosen": -367.6311340332031, + "logps/rejected": -305.9056396484375, + "loss": 0.448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4675038456916809, + "rewards/margins": 1.0222911834716797, + "rewards/rejected": -1.4897949695587158, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 2.994030200163877e-07, + "logits/chosen": -2.550640106201172, + "logits/rejected": -2.469125747680664, + "logps/chosen": -204.88519287109375, + "logps/rejected": -189.78164672851562, + "loss": 1.0577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2433724403381348, + "rewards/margins": 0.4517967104911804, + "rewards/rejected": -1.6951693296432495, + "step": 148 + }, + { + "epoch": 0.02, + "learning_rate": 2.993679035467634e-07, + "logits/chosen": -2.5764083862304688, + "logits/rejected": -2.651090145111084, + "logps/chosen": -567.953125, + "logps/rejected": -287.7806091308594, + "loss": 0.4522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2164136916399002, + "rewards/margins": 0.797977864742279, + "rewards/rejected": -1.014391541481018, + "step": 149 + }, + { + "epoch": 0.02, + "learning_rate": 2.9933278707713915e-07, + "logits/chosen": -2.451231002807617, + "logits/rejected": -2.4753756523132324, + "logps/chosen": -316.8193359375, + "logps/rejected": -266.40679931640625, + "loss": 0.7526, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6040374636650085, + "rewards/margins": 0.20756873488426208, + "rewards/rejected": -0.811606228351593, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 2.992976706075149e-07, + "logits/chosen": -2.453494071960449, + "logits/rejected": -2.4390504360198975, + "logps/chosen": -110.56063842773438, + "logps/rejected": -123.95121765136719, + "loss": 0.4397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.021291373297572136, + "rewards/margins": 0.7372894287109375, + "rewards/rejected": -0.7585808038711548, + "step": 151 + }, + { + "epoch": 0.02, + "learning_rate": 2.9926255413789066e-07, + "logits/chosen": -2.7214901447296143, + "logits/rejected": -2.811288356781006, + "logps/chosen": -323.4248046875, + "logps/rejected": -274.50274658203125, + "loss": 0.5258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006838075816631317, + "rewards/margins": 0.5436097383499146, + "rewards/rejected": -0.5504477620124817, + "step": 152 + }, + { + "epoch": 0.02, + "learning_rate": 2.992274376682664e-07, + "logits/chosen": -2.568958044052124, + "logits/rejected": -2.8773653507232666, + "logps/chosen": -283.8459167480469, + "logps/rejected": -205.28782653808594, + "loss": 0.4078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2552541494369507, + "rewards/margins": 1.000766396522522, + "rewards/rejected": -1.2560205459594727, + "step": 153 + }, + { + "epoch": 0.02, + "learning_rate": 2.991923211986421e-07, + "logits/chosen": -2.3627471923828125, + "logits/rejected": -2.4680986404418945, + "logps/chosen": -337.25396728515625, + "logps/rejected": -309.17169189453125, + "loss": 0.8633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5894104242324829, + "rewards/margins": -0.07700803130865097, + "rewards/rejected": -0.512402355670929, + "step": 154 + }, + { + "epoch": 0.02, + "learning_rate": 2.991572047290179e-07, + "logits/chosen": -2.447335720062256, + "logits/rejected": -2.459022045135498, + "logps/chosen": -169.5035400390625, + "logps/rejected": -175.65020751953125, + "loss": 0.6091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09984609484672546, + "rewards/margins": 0.3782385289669037, + "rewards/rejected": -0.47808465361595154, + "step": 155 + }, + { + "epoch": 0.02, + "learning_rate": 2.991220882593937e-07, + "logits/chosen": -2.4944729804992676, + "logits/rejected": -2.4825661182403564, + "logps/chosen": -269.87432861328125, + "logps/rejected": -214.99488830566406, + "loss": 0.4909, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3480831384658813, + "rewards/margins": 0.7642865180969238, + "rewards/rejected": -2.1123695373535156, + "step": 156 + }, + { + "epoch": 0.02, + "learning_rate": 2.990869717897694e-07, + "logits/chosen": -2.0878241062164307, + "logits/rejected": -2.2856860160827637, + "logps/chosen": -328.4610900878906, + "logps/rejected": -234.4894561767578, + "loss": 0.8896, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4663807153701782, + "rewards/margins": -0.2542582154273987, + "rewards/rejected": -0.21212251484394073, + "step": 157 + }, + { + "epoch": 0.02, + "learning_rate": 2.9905185532014513e-07, + "logits/chosen": -2.4207615852355957, + "logits/rejected": -2.5171632766723633, + "logps/chosen": -204.0616912841797, + "logps/rejected": -147.55612182617188, + "loss": 0.628, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38114243745803833, + "rewards/margins": 0.3401077389717102, + "rewards/rejected": -0.7212501764297485, + "step": 158 + }, + { + "epoch": 0.02, + "learning_rate": 2.990167388505209e-07, + "logits/chosen": -2.562579870223999, + "logits/rejected": -2.391113519668579, + "logps/chosen": -287.1980285644531, + "logps/rejected": -340.826904296875, + "loss": 0.6138, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4272283911705017, + "rewards/margins": 0.2808589041233063, + "rewards/rejected": -0.7080873250961304, + "step": 159 + }, + { + "epoch": 0.02, + "learning_rate": 2.9898162238089664e-07, + "logits/chosen": -2.5808067321777344, + "logits/rejected": -2.551661491394043, + "logps/chosen": -478.4233703613281, + "logps/rejected": -378.5386962890625, + "loss": 0.611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.489298552274704, + "rewards/margins": 0.5485382080078125, + "rewards/rejected": -1.0378366708755493, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 2.989465059112724e-07, + "logits/chosen": -2.4492440223693848, + "logits/rejected": -2.267815351486206, + "logps/chosen": -283.12200927734375, + "logps/rejected": -283.27886962890625, + "loss": 0.4275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013501288369297981, + "rewards/margins": 0.69312983751297, + "rewards/rejected": -0.6796284914016724, + "step": 161 + }, + { + "epoch": 0.02, + "learning_rate": 2.989113894416481e-07, + "logits/chosen": -2.5342867374420166, + "logits/rejected": -2.6356759071350098, + "logps/chosen": -264.92633056640625, + "logps/rejected": -209.31661987304688, + "loss": 0.7509, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4302371144294739, + "rewards/margins": 0.05381745845079422, + "rewards/rejected": -0.4840545654296875, + "step": 162 + }, + { + "epoch": 0.02, + "learning_rate": 2.9887627297202385e-07, + "logits/chosen": -2.3213369846343994, + "logits/rejected": -2.334618091583252, + "logps/chosen": -222.22434997558594, + "logps/rejected": -287.5243835449219, + "loss": 0.5187, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5436602234840393, + "rewards/margins": 0.7189856171607971, + "rewards/rejected": -1.2626458406448364, + "step": 163 + }, + { + "epoch": 0.02, + "learning_rate": 2.9884115650239965e-07, + "logits/chosen": -2.4851691722869873, + "logits/rejected": -2.4204745292663574, + "logps/chosen": -279.9588623046875, + "logps/rejected": -183.01373291015625, + "loss": 0.5969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.683583676815033, + "rewards/margins": 0.2972472310066223, + "rewards/rejected": -0.9808308482170105, + "step": 164 + }, + { + "epoch": 0.02, + "learning_rate": 2.9880604003277535e-07, + "logits/chosen": -2.0253002643585205, + "logits/rejected": -2.1494088172912598, + "logps/chosen": -382.58551025390625, + "logps/rejected": -185.21755981445312, + "loss": 0.5985, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09843750298023224, + "rewards/margins": 0.2576065957546234, + "rewards/rejected": -0.35604411363601685, + "step": 165 + }, + { + "epoch": 0.02, + "learning_rate": 2.987709235631511e-07, + "logits/chosen": -1.9921833276748657, + "logits/rejected": -2.0987050533294678, + "logps/chosen": -298.2403564453125, + "logps/rejected": -347.76458740234375, + "loss": 0.5201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16889536380767822, + "rewards/margins": 0.44139382243156433, + "rewards/rejected": -0.6102892160415649, + "step": 166 + }, + { + "epoch": 0.02, + "learning_rate": 2.9873580709352686e-07, + "logits/chosen": -2.3673787117004395, + "logits/rejected": -2.204219102859497, + "logps/chosen": -218.9974822998047, + "logps/rejected": -301.60009765625, + "loss": 0.3302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3807203769683838, + "rewards/margins": 1.3361811637878418, + "rewards/rejected": -1.7169015407562256, + "step": 167 + }, + { + "epoch": 0.02, + "learning_rate": 2.987006906239026e-07, + "logits/chosen": -2.501965045928955, + "logits/rejected": -2.7015767097473145, + "logps/chosen": -153.13014221191406, + "logps/rejected": -163.51382446289062, + "loss": 1.8398, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1570229530334473, + "rewards/margins": 0.15106704831123352, + "rewards/rejected": -2.3080902099609375, + "step": 168 + }, + { + "epoch": 0.02, + "learning_rate": 2.9866557415427837e-07, + "logits/chosen": -2.2952842712402344, + "logits/rejected": -2.3853015899658203, + "logps/chosen": -664.5115356445312, + "logps/rejected": -375.56671142578125, + "loss": 0.5851, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5340291261672974, + "rewards/margins": 0.4372430145740509, + "rewards/rejected": -0.9712720513343811, + "step": 169 + }, + { + "epoch": 0.02, + "learning_rate": 2.9863045768465407e-07, + "logits/chosen": -2.553738832473755, + "logits/rejected": -2.5753140449523926, + "logps/chosen": -280.5697326660156, + "logps/rejected": -126.45502471923828, + "loss": 0.6789, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6713256239891052, + "rewards/margins": 0.3215923011302948, + "rewards/rejected": -0.9929178953170776, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 2.985953412150298e-07, + "logits/chosen": -2.58988094329834, + "logits/rejected": -2.768160820007324, + "logps/chosen": -181.14356994628906, + "logps/rejected": -150.33070373535156, + "loss": 0.6462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30725371837615967, + "rewards/margins": 0.3507440686225891, + "rewards/rejected": -0.6579978466033936, + "step": 171 + }, + { + "epoch": 0.02, + "learning_rate": 2.985602247454056e-07, + "logits/chosen": -2.6464884281158447, + "logits/rejected": -2.5524260997772217, + "logps/chosen": -293.4698791503906, + "logps/rejected": -262.4988098144531, + "loss": 0.5291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19304871559143066, + "rewards/margins": 0.43519994616508484, + "rewards/rejected": -0.6282486319541931, + "step": 172 + }, + { + "epoch": 0.02, + "learning_rate": 2.9852510827578133e-07, + "logits/chosen": -2.3128650188446045, + "logits/rejected": -2.354677438735962, + "logps/chosen": -308.21868896484375, + "logps/rejected": -203.00347900390625, + "loss": 0.6532, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1558557152748108, + "rewards/margins": 0.16528761386871338, + "rewards/rejected": -0.32114332914352417, + "step": 173 + }, + { + "epoch": 0.02, + "learning_rate": 2.984899918061571e-07, + "logits/chosen": -2.4594385623931885, + "logits/rejected": -2.3858118057250977, + "logps/chosen": -293.0694274902344, + "logps/rejected": -195.05235290527344, + "loss": 0.6248, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09883396327495575, + "rewards/margins": 0.23861609399318695, + "rewards/rejected": -0.3374500572681427, + "step": 174 + }, + { + "epoch": 0.02, + "learning_rate": 2.984548753365328e-07, + "logits/chosen": -2.917207717895508, + "logits/rejected": -2.845625877380371, + "logps/chosen": -418.09283447265625, + "logps/rejected": -347.39996337890625, + "loss": 0.4287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1148669421672821, + "rewards/margins": 1.0015473365783691, + "rewards/rejected": -1.116414189338684, + "step": 175 + }, + { + "epoch": 0.02, + "learning_rate": 2.984197588669086e-07, + "logits/chosen": -2.5149524211883545, + "logits/rejected": -2.7574353218078613, + "logps/chosen": -235.96426391601562, + "logps/rejected": -330.82177734375, + "loss": 0.8296, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8275305032730103, + "rewards/margins": 0.24107684195041656, + "rewards/rejected": -1.0686073303222656, + "step": 176 + }, + { + "epoch": 0.02, + "learning_rate": 2.9838464239728435e-07, + "logits/chosen": -2.703266143798828, + "logits/rejected": -2.652670383453369, + "logps/chosen": -176.852294921875, + "logps/rejected": -169.80120849609375, + "loss": 0.9057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43634289503097534, + "rewards/margins": -0.2578265964984894, + "rewards/rejected": -0.17851629853248596, + "step": 177 + }, + { + "epoch": 0.02, + "learning_rate": 2.9834952592766005e-07, + "logits/chosen": -1.8278846740722656, + "logits/rejected": -1.7919279336929321, + "logps/chosen": -324.2242126464844, + "logps/rejected": -386.99749755859375, + "loss": 0.4311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11838356405496597, + "rewards/margins": 0.833471417427063, + "rewards/rejected": -0.9518550038337708, + "step": 178 + }, + { + "epoch": 0.02, + "learning_rate": 2.983144094580358e-07, + "logits/chosen": -2.321341037750244, + "logits/rejected": -2.0718722343444824, + "logps/chosen": -216.7677001953125, + "logps/rejected": -256.309814453125, + "loss": 0.584, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7653875350952148, + "rewards/margins": 0.4541943669319153, + "rewards/rejected": -1.219581961631775, + "step": 179 + }, + { + "epoch": 0.02, + "learning_rate": 2.9827929298841155e-07, + "logits/chosen": -2.3463950157165527, + "logits/rejected": -2.0819432735443115, + "logps/chosen": -170.39468383789062, + "logps/rejected": -236.28085327148438, + "loss": 0.4242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17266258597373962, + "rewards/margins": 0.7345141768455505, + "rewards/rejected": -0.5618515610694885, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 2.982441765187873e-07, + "logits/chosen": -2.2501583099365234, + "logits/rejected": -2.113676071166992, + "logps/chosen": -281.715087890625, + "logps/rejected": -260.8525085449219, + "loss": 0.7523, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5743521451950073, + "rewards/margins": 0.15371260046958923, + "rewards/rejected": -0.7280647158622742, + "step": 181 + }, + { + "epoch": 0.02, + "learning_rate": 2.9820906004916306e-07, + "logits/chosen": -2.5903918743133545, + "logits/rejected": -2.75848388671875, + "logps/chosen": -253.92965698242188, + "logps/rejected": -198.9359130859375, + "loss": 0.5359, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45662981271743774, + "rewards/margins": 0.949205756187439, + "rewards/rejected": -1.4058356285095215, + "step": 182 + }, + { + "epoch": 0.02, + "learning_rate": 2.9817394357953876e-07, + "logits/chosen": -2.371689558029175, + "logits/rejected": -2.455850124359131, + "logps/chosen": -309.1271057128906, + "logps/rejected": -246.56402587890625, + "loss": 0.5112, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07257841527462006, + "rewards/margins": 0.6836895942687988, + "rewards/rejected": -0.7562679648399353, + "step": 183 + }, + { + "epoch": 0.02, + "learning_rate": 2.981388271099145e-07, + "logits/chosen": -2.358729124069214, + "logits/rejected": -2.6329524517059326, + "logps/chosen": -181.1085968017578, + "logps/rejected": -157.72854614257812, + "loss": 0.5738, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26775574684143066, + "rewards/margins": 0.3198274075984955, + "rewards/rejected": -0.5875831842422485, + "step": 184 + }, + { + "epoch": 0.02, + "learning_rate": 2.9810371064029027e-07, + "logits/chosen": -2.0044968128204346, + "logits/rejected": -2.4105594158172607, + "logps/chosen": -292.1424255371094, + "logps/rejected": -189.28848266601562, + "loss": 0.8159, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7932908535003662, + "rewards/margins": 0.15816213190555573, + "rewards/rejected": -0.9514529705047607, + "step": 185 + }, + { + "epoch": 0.02, + "learning_rate": 2.98068594170666e-07, + "logits/chosen": -2.687706708908081, + "logits/rejected": -2.612380027770996, + "logps/chosen": -184.5609130859375, + "logps/rejected": -248.17178344726562, + "loss": 0.7858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2293112426996231, + "rewards/margins": -0.1112208366394043, + "rewards/rejected": -0.11809039860963821, + "step": 186 + }, + { + "epoch": 0.02, + "learning_rate": 2.980334777010418e-07, + "logits/chosen": -2.9577488899230957, + "logits/rejected": -2.9934511184692383, + "logps/chosen": -199.502685546875, + "logps/rejected": -192.6226806640625, + "loss": 0.2862, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30555254220962524, + "rewards/margins": 1.5228744745254517, + "rewards/rejected": -1.2173219919204712, + "step": 187 + }, + { + "epoch": 0.02, + "learning_rate": 2.979983612314175e-07, + "logits/chosen": -2.4899063110351562, + "logits/rejected": -2.7087550163269043, + "logps/chosen": -257.3338623046875, + "logps/rejected": -297.03533935546875, + "loss": 0.468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16984617710113525, + "rewards/margins": 0.9826184511184692, + "rewards/rejected": -1.1524646282196045, + "step": 188 + }, + { + "epoch": 0.02, + "learning_rate": 2.979632447617933e-07, + "logits/chosen": -2.5001046657562256, + "logits/rejected": -2.674595832824707, + "logps/chosen": -305.4355163574219, + "logps/rejected": -293.24969482421875, + "loss": 0.4386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28180065751075745, + "rewards/margins": 0.8035825490951538, + "rewards/rejected": -1.0853831768035889, + "step": 189 + }, + { + "epoch": 0.02, + "learning_rate": 2.9792812829216904e-07, + "logits/chosen": -2.2009010314941406, + "logits/rejected": -2.4007530212402344, + "logps/chosen": -294.11114501953125, + "logps/rejected": -177.77401733398438, + "loss": 0.6096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23023727536201477, + "rewards/margins": 0.37017127871513367, + "rewards/rejected": -0.6004085540771484, + "step": 190 + }, + { + "epoch": 0.02, + "learning_rate": 2.9789301182254474e-07, + "logits/chosen": -2.654773235321045, + "logits/rejected": -2.6478846073150635, + "logps/chosen": -352.4059753417969, + "logps/rejected": -240.98416137695312, + "loss": 0.5776, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6169270277023315, + "rewards/margins": 0.6146091222763062, + "rewards/rejected": -1.2315361499786377, + "step": 191 + }, + { + "epoch": 0.02, + "learning_rate": 2.978578953529205e-07, + "logits/chosen": -2.689924478530884, + "logits/rejected": -2.6719186305999756, + "logps/chosen": -233.2705078125, + "logps/rejected": -206.71286010742188, + "loss": 0.3828, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021855361759662628, + "rewards/margins": 1.0711675882339478, + "rewards/rejected": -1.0493122339248657, + "step": 192 + }, + { + "epoch": 0.02, + "learning_rate": 2.9782277888329625e-07, + "logits/chosen": -2.896111011505127, + "logits/rejected": -2.7924633026123047, + "logps/chosen": -239.26052856445312, + "logps/rejected": -278.91827392578125, + "loss": 0.3421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48901718854904175, + "rewards/margins": 0.9388021230697632, + "rewards/rejected": -1.4278192520141602, + "step": 193 + }, + { + "epoch": 0.02, + "learning_rate": 2.97787662413672e-07, + "logits/chosen": -2.577284097671509, + "logits/rejected": -2.799912929534912, + "logps/chosen": -191.0710906982422, + "logps/rejected": -207.7033233642578, + "loss": 0.6032, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2397519052028656, + "rewards/margins": 0.5939090251922607, + "rewards/rejected": -0.833660900592804, + "step": 194 + }, + { + "epoch": 0.02, + "learning_rate": 2.9775254594404776e-07, + "logits/chosen": -2.1877381801605225, + "logits/rejected": -2.1371567249298096, + "logps/chosen": -257.25299072265625, + "logps/rejected": -249.8438262939453, + "loss": 1.2487, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9435697793960571, + "rewards/margins": 0.20922976732254028, + "rewards/rejected": -2.152799606323242, + "step": 195 + }, + { + "epoch": 0.02, + "learning_rate": 2.9771742947442346e-07, + "logits/chosen": -2.0867090225219727, + "logits/rejected": -2.108480215072632, + "logps/chosen": -282.0589294433594, + "logps/rejected": -203.7332305908203, + "loss": 0.5611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5132110714912415, + "rewards/margins": 0.4507933259010315, + "rewards/rejected": -0.9640043377876282, + "step": 196 + }, + { + "epoch": 0.02, + "learning_rate": 2.976823130047992e-07, + "logits/chosen": -2.011423110961914, + "logits/rejected": -2.225752592086792, + "logps/chosen": -368.1114807128906, + "logps/rejected": -265.9504089355469, + "loss": 0.7081, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2890989780426025, + "rewards/margins": 0.4088100492954254, + "rewards/rejected": -1.6979089975357056, + "step": 197 + }, + { + "epoch": 0.02, + "learning_rate": 2.97647196535175e-07, + "logits/chosen": -2.449647903442383, + "logits/rejected": -2.1404616832733154, + "logps/chosen": -153.46519470214844, + "logps/rejected": -238.02142333984375, + "loss": 0.6173, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8127787709236145, + "rewards/margins": 0.41039881110191345, + "rewards/rejected": -1.2231775522232056, + "step": 198 + }, + { + "epoch": 0.02, + "learning_rate": 2.976120800655507e-07, + "logits/chosen": -2.1738834381103516, + "logits/rejected": -2.1011743545532227, + "logps/chosen": -346.3476867675781, + "logps/rejected": -354.17706298828125, + "loss": 0.3942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6179185509681702, + "rewards/margins": 0.9631261825561523, + "rewards/rejected": -1.5810446739196777, + "step": 199 + }, + { + "epoch": 0.02, + "learning_rate": 2.9757696359592647e-07, + "logits/chosen": -2.3892173767089844, + "logits/rejected": -2.5488290786743164, + "logps/chosen": -349.4868469238281, + "logps/rejected": -311.16827392578125, + "loss": 0.5354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35129520297050476, + "rewards/margins": 0.9436919689178467, + "rewards/rejected": -1.2949872016906738, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 2.9754184712630223e-07, + "logits/chosen": -2.5370492935180664, + "logits/rejected": -2.393224000930786, + "logps/chosen": -90.33948516845703, + "logps/rejected": -183.60081481933594, + "loss": 0.6206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35243886709213257, + "rewards/margins": 0.24766160547733307, + "rewards/rejected": -0.6001004576683044, + "step": 201 + }, + { + "epoch": 0.02, + "learning_rate": 2.97506730656678e-07, + "logits/chosen": -1.981811761856079, + "logits/rejected": -2.1993680000305176, + "logps/chosen": -330.2177734375, + "logps/rejected": -208.48333740234375, + "loss": 0.6912, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21610409021377563, + "rewards/margins": 0.11671817302703857, + "rewards/rejected": -0.3328222632408142, + "step": 202 + }, + { + "epoch": 0.02, + "learning_rate": 2.9747161418705373e-07, + "logits/chosen": -2.5797362327575684, + "logits/rejected": -2.4459714889526367, + "logps/chosen": -122.42729949951172, + "logps/rejected": -234.9996337890625, + "loss": 0.415, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6043163537979126, + "rewards/margins": 1.1129792928695679, + "rewards/rejected": -1.717295527458191, + "step": 203 + }, + { + "epoch": 0.02, + "learning_rate": 2.9743649771742944e-07, + "logits/chosen": -2.428945779800415, + "logits/rejected": -2.575861930847168, + "logps/chosen": -136.42916870117188, + "logps/rejected": -161.5343017578125, + "loss": 0.6558, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5582849383354187, + "rewards/margins": 0.3106359839439392, + "rewards/rejected": -0.8689209222793579, + "step": 204 + }, + { + "epoch": 0.02, + "learning_rate": 2.974013812478052e-07, + "logits/chosen": -2.393028497695923, + "logits/rejected": -2.521897792816162, + "logps/chosen": -353.99951171875, + "logps/rejected": -287.53125, + "loss": 0.6277, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5958483815193176, + "rewards/margins": 0.42664608359336853, + "rewards/rejected": -1.0224944353103638, + "step": 205 + }, + { + "epoch": 0.02, + "learning_rate": 2.9736626477818094e-07, + "logits/chosen": -2.657825231552124, + "logits/rejected": -2.335735321044922, + "logps/chosen": -224.4635772705078, + "logps/rejected": -367.517333984375, + "loss": 0.251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6756798624992371, + "rewards/margins": 2.170508861541748, + "rewards/rejected": -2.846188545227051, + "step": 206 + }, + { + "epoch": 0.02, + "learning_rate": 2.973311483085567e-07, + "logits/chosen": -1.961535930633545, + "logits/rejected": -1.806793212890625, + "logps/chosen": -317.8827209472656, + "logps/rejected": -327.2768249511719, + "loss": 0.6632, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6322667002677917, + "rewards/margins": 0.252092570066452, + "rewards/rejected": -0.8843593001365662, + "step": 207 + }, + { + "epoch": 0.02, + "learning_rate": 2.9729603183893245e-07, + "logits/chosen": -2.4104413986206055, + "logits/rejected": -2.356332302093506, + "logps/chosen": -198.67794799804688, + "logps/rejected": -245.61386108398438, + "loss": 0.5426, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2383928745985031, + "rewards/margins": 0.6853629350662231, + "rewards/rejected": -0.9237558841705322, + "step": 208 + }, + { + "epoch": 0.02, + "learning_rate": 2.972609153693082e-07, + "logits/chosen": -2.477627992630005, + "logits/rejected": -2.1903798580169678, + "logps/chosen": -326.60430908203125, + "logps/rejected": -416.16357421875, + "loss": 0.532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058834269642829895, + "rewards/margins": 0.4942936897277832, + "rewards/rejected": -0.5531280040740967, + "step": 209 + }, + { + "epoch": 0.02, + "learning_rate": 2.9722579889968396e-07, + "logits/chosen": -2.9259519577026367, + "logits/rejected": -2.9460701942443848, + "logps/chosen": -404.5721435546875, + "logps/rejected": -240.0440673828125, + "loss": 0.5186, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.585553765296936, + "rewards/margins": 0.7215762734413147, + "rewards/rejected": -1.307129979133606, + "step": 210 + }, + { + "epoch": 0.02, + "learning_rate": 2.971906824300597e-07, + "logits/chosen": -2.4004435539245605, + "logits/rejected": -2.4053797721862793, + "logps/chosen": -554.048583984375, + "logps/rejected": -258.4874572753906, + "loss": 0.9721, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9072767496109009, + "rewards/margins": 0.19633381068706512, + "rewards/rejected": -1.1036107540130615, + "step": 211 + }, + { + "epoch": 0.02, + "learning_rate": 2.971555659604354e-07, + "logits/chosen": -2.567338466644287, + "logits/rejected": -2.598971128463745, + "logps/chosen": -191.63941955566406, + "logps/rejected": -300.2498779296875, + "loss": 0.5242, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2565786838531494, + "rewards/margins": 0.6956162452697754, + "rewards/rejected": -0.9521949291229248, + "step": 212 + }, + { + "epoch": 0.02, + "learning_rate": 2.9712044949081117e-07, + "logits/chosen": -2.494687080383301, + "logits/rejected": -2.3273844718933105, + "logps/chosen": -180.7270965576172, + "logps/rejected": -226.04132080078125, + "loss": 0.9473, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7107498049736023, + "rewards/margins": -0.34346991777420044, + "rewards/rejected": -0.36727988719940186, + "step": 213 + }, + { + "epoch": 0.02, + "learning_rate": 2.970853330211869e-07, + "logits/chosen": -2.085570812225342, + "logits/rejected": -2.214003086090088, + "logps/chosen": -278.76141357421875, + "logps/rejected": -243.56478881835938, + "loss": 0.6197, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3860725462436676, + "rewards/margins": 0.33070430159568787, + "rewards/rejected": -0.7167768478393555, + "step": 214 + }, + { + "epoch": 0.02, + "learning_rate": 2.970502165515627e-07, + "logits/chosen": -2.360626697540283, + "logits/rejected": -2.4378490447998047, + "logps/chosen": -298.41717529296875, + "logps/rejected": -229.90615844726562, + "loss": 0.7088, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45959728956222534, + "rewards/margins": 0.13284368813037872, + "rewards/rejected": -0.5924409627914429, + "step": 215 + }, + { + "epoch": 0.02, + "learning_rate": 2.9701510008193843e-07, + "logits/chosen": -2.2999253273010254, + "logits/rejected": -2.280139923095703, + "logps/chosen": -220.4091796875, + "logps/rejected": -288.97235107421875, + "loss": 0.5161, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8143740892410278, + "rewards/margins": 0.544533908367157, + "rewards/rejected": -1.3589081764221191, + "step": 216 + }, + { + "epoch": 0.03, + "learning_rate": 2.969799836123142e-07, + "logits/chosen": -2.644681215286255, + "logits/rejected": -2.5228333473205566, + "logps/chosen": -149.0433349609375, + "logps/rejected": -343.3270568847656, + "loss": 0.5563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11698313802480698, + "rewards/margins": 0.4626231789588928, + "rewards/rejected": -0.579606294631958, + "step": 217 + }, + { + "epoch": 0.03, + "learning_rate": 2.969448671426899e-07, + "logits/chosen": -2.265197992324829, + "logits/rejected": -2.479604959487915, + "logps/chosen": -463.5638122558594, + "logps/rejected": -258.2206726074219, + "loss": 0.6672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17464253306388855, + "rewards/margins": 0.3956458866596222, + "rewards/rejected": -0.5702884197235107, + "step": 218 + }, + { + "epoch": 0.03, + "learning_rate": 2.9690975067306564e-07, + "logits/chosen": -2.3617591857910156, + "logits/rejected": -2.5457520484924316, + "logps/chosen": -350.4695739746094, + "logps/rejected": -284.0982666015625, + "loss": 0.764, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5572122931480408, + "rewards/margins": 0.40497925877571106, + "rewards/rejected": -0.9621915817260742, + "step": 219 + }, + { + "epoch": 0.03, + "learning_rate": 2.968746342034414e-07, + "logits/chosen": -1.7899656295776367, + "logits/rejected": -2.072436809539795, + "logps/chosen": -459.11865234375, + "logps/rejected": -383.92510986328125, + "loss": 0.628, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5790839195251465, + "rewards/margins": 0.5573561191558838, + "rewards/rejected": -1.1364401578903198, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 2.9683951773381715e-07, + "logits/chosen": -2.167630195617676, + "logits/rejected": -2.1590781211853027, + "logps/chosen": -288.0857849121094, + "logps/rejected": -220.23236083984375, + "loss": 0.7462, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0223655700683594, + "rewards/margins": 0.8810625076293945, + "rewards/rejected": -1.903428077697754, + "step": 221 + }, + { + "epoch": 0.03, + "learning_rate": 2.968044012641929e-07, + "logits/chosen": -2.6525497436523438, + "logits/rejected": -2.731782913208008, + "logps/chosen": -99.93539428710938, + "logps/rejected": -122.9131851196289, + "loss": 0.5887, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4499821960926056, + "rewards/margins": 0.4876473546028137, + "rewards/rejected": -0.9376294612884521, + "step": 222 + }, + { + "epoch": 0.03, + "learning_rate": 2.9676928479456865e-07, + "logits/chosen": -2.5498337745666504, + "logits/rejected": -2.6173222064971924, + "logps/chosen": -409.74884033203125, + "logps/rejected": -265.684814453125, + "loss": 0.4644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6260004043579102, + "rewards/margins": 1.6585582494735718, + "rewards/rejected": -2.2845585346221924, + "step": 223 + }, + { + "epoch": 0.03, + "learning_rate": 2.967341683249444e-07, + "logits/chosen": -2.496267318725586, + "logits/rejected": -2.700869083404541, + "logps/chosen": -382.93975830078125, + "logps/rejected": -154.918212890625, + "loss": 1.0333, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9471506476402283, + "rewards/margins": -0.04603327810764313, + "rewards/rejected": -0.9011173248291016, + "step": 224 + }, + { + "epoch": 0.03, + "learning_rate": 2.9669905185532016e-07, + "logits/chosen": -2.5902137756347656, + "logits/rejected": -2.61418080329895, + "logps/chosen": -197.6024932861328, + "logps/rejected": -224.89117431640625, + "loss": 0.911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6843521595001221, + "rewards/margins": -0.12291333079338074, + "rewards/rejected": -0.5614389181137085, + "step": 225 + }, + { + "epoch": 0.03, + "learning_rate": 2.9666393538569586e-07, + "logits/chosen": -2.0796737670898438, + "logits/rejected": -2.275479555130005, + "logps/chosen": -292.42059326171875, + "logps/rejected": -236.5636749267578, + "loss": 0.7529, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27176105976104736, + "rewards/margins": 0.43768012523651123, + "rewards/rejected": -0.7094412446022034, + "step": 226 + }, + { + "epoch": 0.03, + "learning_rate": 2.966288189160716e-07, + "logits/chosen": -2.0230627059936523, + "logits/rejected": -2.1392245292663574, + "logps/chosen": -266.52825927734375, + "logps/rejected": -285.5145568847656, + "loss": 0.4745, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14761950075626373, + "rewards/margins": 0.9653831720352173, + "rewards/rejected": -1.1130026578903198, + "step": 227 + }, + { + "epoch": 0.03, + "learning_rate": 2.9659370244644737e-07, + "logits/chosen": -2.330899953842163, + "logits/rejected": -2.290102481842041, + "logps/chosen": -147.01768493652344, + "logps/rejected": -273.56939697265625, + "loss": 0.6143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43965208530426025, + "rewards/margins": 0.34881824254989624, + "rewards/rejected": -0.7884702682495117, + "step": 228 + }, + { + "epoch": 0.03, + "learning_rate": 2.965585859768231e-07, + "logits/chosen": -2.4471707344055176, + "logits/rejected": -2.167140007019043, + "logps/chosen": -223.2282257080078, + "logps/rejected": -269.4696350097656, + "loss": 0.4675, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49625879526138306, + "rewards/margins": 0.7810115218162537, + "rewards/rejected": -1.2772703170776367, + "step": 229 + }, + { + "epoch": 0.03, + "learning_rate": 2.965234695071989e-07, + "logits/chosen": -1.8468825817108154, + "logits/rejected": -1.913553237915039, + "logps/chosen": -316.6802062988281, + "logps/rejected": -239.47091674804688, + "loss": 0.8433, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.890744686126709, + "rewards/margins": -0.05893130972981453, + "rewards/rejected": -0.8318133354187012, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 2.964883530375746e-07, + "logits/chosen": -2.4196152687072754, + "logits/rejected": -2.670041799545288, + "logps/chosen": -461.759521484375, + "logps/rejected": -422.4710998535156, + "loss": 0.6091, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5823503732681274, + "rewards/margins": 0.9311032295227051, + "rewards/rejected": -1.5134536027908325, + "step": 231 + }, + { + "epoch": 0.03, + "learning_rate": 2.964532365679504e-07, + "logits/chosen": -2.646573066711426, + "logits/rejected": -2.6274423599243164, + "logps/chosen": -304.86248779296875, + "logps/rejected": -174.26608276367188, + "loss": 0.6458, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.38454434275627136, + "rewards/margins": 0.4667331278324127, + "rewards/rejected": -0.8512774705886841, + "step": 232 + }, + { + "epoch": 0.03, + "learning_rate": 2.964181200983261e-07, + "logits/chosen": -2.11808180809021, + "logits/rejected": -2.420116424560547, + "logps/chosen": -305.56689453125, + "logps/rejected": -217.71578979492188, + "loss": 1.1551, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3124622106552124, + "rewards/margins": -0.24168062210083008, + "rewards/rejected": -1.0707814693450928, + "step": 233 + }, + { + "epoch": 0.03, + "learning_rate": 2.9638300362870184e-07, + "logits/chosen": -2.182572841644287, + "logits/rejected": -1.8632193803787231, + "logps/chosen": -352.98504638671875, + "logps/rejected": -331.3326416015625, + "loss": 0.563, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1655232310295105, + "rewards/margins": 0.478364497423172, + "rewards/rejected": -0.6438877582550049, + "step": 234 + }, + { + "epoch": 0.03, + "learning_rate": 2.963478871590776e-07, + "logits/chosen": -2.2569808959960938, + "logits/rejected": -2.099720001220703, + "logps/chosen": -306.02215576171875, + "logps/rejected": -300.57122802734375, + "loss": 0.6032, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5238438844680786, + "rewards/margins": 0.8452980518341064, + "rewards/rejected": -1.369141936302185, + "step": 235 + }, + { + "epoch": 0.03, + "learning_rate": 2.9631277068945335e-07, + "logits/chosen": -1.9281139373779297, + "logits/rejected": -1.8957213163375854, + "logps/chosen": -368.9064636230469, + "logps/rejected": -336.4112548828125, + "loss": 0.6008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.480535089969635, + "rewards/margins": 0.4374768137931824, + "rewards/rejected": -0.9180118441581726, + "step": 236 + }, + { + "epoch": 0.03, + "learning_rate": 2.962776542198291e-07, + "logits/chosen": -2.270738124847412, + "logits/rejected": -2.3990345001220703, + "logps/chosen": -313.4369812011719, + "logps/rejected": -287.0973205566406, + "loss": 0.7967, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4161204695701599, + "rewards/margins": -0.0070289671421051025, + "rewards/rejected": -0.4090915024280548, + "step": 237 + }, + { + "epoch": 0.03, + "learning_rate": 2.9624253775020485e-07, + "logits/chosen": -2.0554771423339844, + "logits/rejected": -2.152637481689453, + "logps/chosen": -259.2019348144531, + "logps/rejected": -241.4027862548828, + "loss": 0.5264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25716498494148254, + "rewards/margins": 0.6583040952682495, + "rewards/rejected": -0.9154691696166992, + "step": 238 + }, + { + "epoch": 0.03, + "learning_rate": 2.9620742128058056e-07, + "logits/chosen": -2.6229238510131836, + "logits/rejected": -2.80519962310791, + "logps/chosen": -352.34051513671875, + "logps/rejected": -275.5546569824219, + "loss": 0.414, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37737661600112915, + "rewards/margins": 1.0245975255966187, + "rewards/rejected": -1.4019742012023926, + "step": 239 + }, + { + "epoch": 0.03, + "learning_rate": 2.961723048109563e-07, + "logits/chosen": -2.0790154933929443, + "logits/rejected": -2.264246940612793, + "logps/chosen": -427.1692810058594, + "logps/rejected": -310.5115661621094, + "loss": 0.4423, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5074750185012817, + "rewards/margins": 0.9478169679641724, + "rewards/rejected": -1.455291986465454, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 2.9613718834133206e-07, + "logits/chosen": -2.0987935066223145, + "logits/rejected": -2.363996982574463, + "logps/chosen": -455.7206726074219, + "logps/rejected": -240.28175354003906, + "loss": 0.5168, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3893233835697174, + "rewards/margins": 0.8785820603370667, + "rewards/rejected": -1.267905354499817, + "step": 241 + }, + { + "epoch": 0.03, + "learning_rate": 2.961020718717078e-07, + "logits/chosen": -2.254518747329712, + "logits/rejected": -2.0137460231781006, + "logps/chosen": -167.371826171875, + "logps/rejected": -178.0348358154297, + "loss": 0.4585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.557105302810669, + "rewards/margins": 1.2101905345916748, + "rewards/rejected": -1.7672958374023438, + "step": 242 + }, + { + "epoch": 0.03, + "learning_rate": 2.9606695540208357e-07, + "logits/chosen": -1.991156816482544, + "logits/rejected": -2.1559855937957764, + "logps/chosen": -325.771240234375, + "logps/rejected": -327.82476806640625, + "loss": 0.5086, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09179538488388062, + "rewards/margins": 0.6460449695587158, + "rewards/rejected": -0.7378404140472412, + "step": 243 + }, + { + "epoch": 0.03, + "learning_rate": 2.960318389324593e-07, + "logits/chosen": -2.609808921813965, + "logits/rejected": -2.632416009902954, + "logps/chosen": -257.9018249511719, + "logps/rejected": -222.05984497070312, + "loss": 0.3989, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.038080792874097824, + "rewards/margins": 0.9120903015136719, + "rewards/rejected": -0.9501710534095764, + "step": 244 + }, + { + "epoch": 0.03, + "learning_rate": 2.959967224628351e-07, + "logits/chosen": -2.385178565979004, + "logits/rejected": -2.370959758758545, + "logps/chosen": -167.45718383789062, + "logps/rejected": -159.38613891601562, + "loss": 0.5876, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.029159925878047943, + "rewards/margins": 0.3074513077735901, + "rewards/rejected": -0.27829140424728394, + "step": 245 + }, + { + "epoch": 0.03, + "learning_rate": 2.9596160599321083e-07, + "logits/chosen": -2.7521626949310303, + "logits/rejected": -2.7997536659240723, + "logps/chosen": -96.73912811279297, + "logps/rejected": -173.739501953125, + "loss": 0.4748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33914893865585327, + "rewards/margins": 1.0791616439819336, + "rewards/rejected": -1.4183106422424316, + "step": 246 + }, + { + "epoch": 0.03, + "learning_rate": 2.9592648952358653e-07, + "logits/chosen": -2.6631202697753906, + "logits/rejected": -2.550213098526001, + "logps/chosen": -181.46681213378906, + "logps/rejected": -240.0803680419922, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2750804424285889, + "rewards/margins": 1.3934874534606934, + "rewards/rejected": -2.668567657470703, + "step": 247 + }, + { + "epoch": 0.03, + "learning_rate": 2.958913730539623e-07, + "logits/chosen": -2.8205742835998535, + "logits/rejected": -2.665963649749756, + "logps/chosen": -320.53936767578125, + "logps/rejected": -276.3487548828125, + "loss": 0.3362, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3705751299858093, + "rewards/margins": 1.381079912185669, + "rewards/rejected": -1.0105047225952148, + "step": 248 + }, + { + "epoch": 0.03, + "learning_rate": 2.9585625658433804e-07, + "logits/chosen": -2.377568483352661, + "logits/rejected": -2.506458044052124, + "logps/chosen": -268.6233825683594, + "logps/rejected": -153.5908660888672, + "loss": 0.7913, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9384375810623169, + "rewards/margins": 0.0027603209018707275, + "rewards/rejected": -0.9411978721618652, + "step": 249 + }, + { + "epoch": 0.03, + "learning_rate": 2.958211401147138e-07, + "logits/chosen": -2.4259755611419678, + "logits/rejected": -2.456841230392456, + "logps/chosen": -401.0574645996094, + "logps/rejected": -266.0364074707031, + "loss": 0.6094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18601155281066895, + "rewards/margins": 0.3794706463813782, + "rewards/rejected": -0.5654821991920471, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 2.9578602364508955e-07, + "logits/chosen": -1.5761626958847046, + "logits/rejected": -1.6773066520690918, + "logps/chosen": -425.946533203125, + "logps/rejected": -318.69036865234375, + "loss": 0.522, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30952805280685425, + "rewards/margins": 0.4639835059642792, + "rewards/rejected": -0.7735116481781006, + "step": 251 + }, + { + "epoch": 0.03, + "learning_rate": 2.9575090717546525e-07, + "logits/chosen": -2.394601345062256, + "logits/rejected": -2.403655529022217, + "logps/chosen": -411.64935302734375, + "logps/rejected": -294.298828125, + "loss": 0.3433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035547688603401184, + "rewards/margins": 1.005682110786438, + "rewards/rejected": -1.0412298440933228, + "step": 252 + }, + { + "epoch": 0.03, + "learning_rate": 2.95715790705841e-07, + "logits/chosen": -2.508943557739258, + "logits/rejected": -2.383450508117676, + "logps/chosen": -402.05340576171875, + "logps/rejected": -369.0451354980469, + "loss": 0.3757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.007100097835063934, + "rewards/margins": 1.0251924991607666, + "rewards/rejected": -1.0322926044464111, + "step": 253 + }, + { + "epoch": 0.03, + "learning_rate": 2.956806742362168e-07, + "logits/chosen": -1.994310975074768, + "logits/rejected": -2.1413402557373047, + "logps/chosen": -339.99981689453125, + "logps/rejected": -298.3623046875, + "loss": 0.4682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6991456747055054, + "rewards/margins": 0.8628280758857727, + "rewards/rejected": -1.5619738101959229, + "step": 254 + }, + { + "epoch": 0.03, + "learning_rate": 2.956455577665925e-07, + "logits/chosen": -2.384492874145508, + "logits/rejected": -2.442148208618164, + "logps/chosen": -428.8415832519531, + "logps/rejected": -346.28289794921875, + "loss": 0.6036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3061066269874573, + "rewards/margins": 0.5919345617294312, + "rewards/rejected": -0.8980411887168884, + "step": 255 + }, + { + "epoch": 0.03, + "learning_rate": 2.9561044129696827e-07, + "logits/chosen": -2.1962108612060547, + "logits/rejected": -2.4310173988342285, + "logps/chosen": -343.4423828125, + "logps/rejected": -331.4911804199219, + "loss": 0.2417, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10040051490068436, + "rewards/margins": 1.8938237428665161, + "rewards/rejected": -1.7934232950210571, + "step": 256 + }, + { + "epoch": 0.03, + "learning_rate": 2.95575324827344e-07, + "logits/chosen": -2.19378662109375, + "logits/rejected": -2.345290184020996, + "logps/chosen": -261.68682861328125, + "logps/rejected": -263.3494873046875, + "loss": 0.4808, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31829166412353516, + "rewards/margins": 0.9455233812332153, + "rewards/rejected": -1.2638150453567505, + "step": 257 + }, + { + "epoch": 0.03, + "learning_rate": 2.9554020835771977e-07, + "logits/chosen": -2.722865104675293, + "logits/rejected": -2.7046239376068115, + "logps/chosen": -435.0589904785156, + "logps/rejected": -330.1401672363281, + "loss": 0.4385, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3628096580505371, + "rewards/margins": 0.9128121137619019, + "rewards/rejected": -1.275621771812439, + "step": 258 + }, + { + "epoch": 0.03, + "learning_rate": 2.9550509188809553e-07, + "logits/chosen": -2.687992572784424, + "logits/rejected": -2.4103446006774902, + "logps/chosen": -365.6318359375, + "logps/rejected": -253.79873657226562, + "loss": 0.6006, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5321721434593201, + "rewards/margins": 0.6064667105674744, + "rewards/rejected": -1.1386388540267944, + "step": 259 + }, + { + "epoch": 0.03, + "learning_rate": 2.9546997541847123e-07, + "logits/chosen": -2.582211971282959, + "logits/rejected": -2.7115254402160645, + "logps/chosen": -259.422607421875, + "logps/rejected": -230.20574951171875, + "loss": 0.5987, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5349728465080261, + "rewards/margins": 0.44639042019844055, + "rewards/rejected": -0.9813632369041443, + "step": 260 + }, + { + "epoch": 0.03, + "learning_rate": 2.95434858948847e-07, + "logits/chosen": -2.3440754413604736, + "logits/rejected": -2.2051634788513184, + "logps/chosen": -300.3331604003906, + "logps/rejected": -217.04315185546875, + "loss": 0.7289, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8377606868743896, + "rewards/margins": 0.40144699811935425, + "rewards/rejected": -1.2392076253890991, + "step": 261 + }, + { + "epoch": 0.03, + "learning_rate": 2.9539974247922274e-07, + "logits/chosen": -3.0103206634521484, + "logits/rejected": -2.9713211059570312, + "logps/chosen": -276.83984375, + "logps/rejected": -211.50927734375, + "loss": 0.5159, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11297862231731415, + "rewards/margins": 0.644722044467926, + "rewards/rejected": -0.7577006220817566, + "step": 262 + }, + { + "epoch": 0.03, + "learning_rate": 2.953646260095985e-07, + "logits/chosen": -2.062161922454834, + "logits/rejected": -2.2040352821350098, + "logps/chosen": -406.58251953125, + "logps/rejected": -276.4377136230469, + "loss": 0.5001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14264480769634247, + "rewards/margins": 0.5765120983123779, + "rewards/rejected": -0.7191568613052368, + "step": 263 + }, + { + "epoch": 0.03, + "learning_rate": 2.9532950953997424e-07, + "logits/chosen": -2.0667099952697754, + "logits/rejected": -1.9883157014846802, + "logps/chosen": -227.57440185546875, + "logps/rejected": -235.73825073242188, + "loss": 1.1505, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.234545946121216, + "rewards/margins": 0.4185921549797058, + "rewards/rejected": -2.6531381607055664, + "step": 264 + }, + { + "epoch": 0.03, + "learning_rate": 2.9529439307034994e-07, + "logits/chosen": -2.3971495628356934, + "logits/rejected": -2.672471523284912, + "logps/chosen": -237.63194274902344, + "logps/rejected": -250.3622589111328, + "loss": 0.4759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31001055240631104, + "rewards/margins": 1.0348756313323975, + "rewards/rejected": -1.344886064529419, + "step": 265 + }, + { + "epoch": 0.03, + "learning_rate": 2.9525927660072575e-07, + "logits/chosen": -1.9253414869308472, + "logits/rejected": -2.121354341506958, + "logps/chosen": -452.89227294921875, + "logps/rejected": -274.84197998046875, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1288459300994873, + "rewards/margins": 1.4546468257904053, + "rewards/rejected": -1.5834927558898926, + "step": 266 + }, + { + "epoch": 0.03, + "learning_rate": 2.952241601311015e-07, + "logits/chosen": -2.534254312515259, + "logits/rejected": -2.8407325744628906, + "logps/chosen": -241.15347290039062, + "logps/rejected": -186.58734130859375, + "loss": 0.751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6216213703155518, + "rewards/margins": 0.4163460433483124, + "rewards/rejected": -1.0379674434661865, + "step": 267 + }, + { + "epoch": 0.03, + "learning_rate": 2.951890436614772e-07, + "logits/chosen": -2.502967119216919, + "logits/rejected": -2.6964800357818604, + "logps/chosen": -281.22552490234375, + "logps/rejected": -156.8287811279297, + "loss": 1.0239, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6609859466552734, + "rewards/margins": -0.08337759971618652, + "rewards/rejected": -0.5776082873344421, + "step": 268 + }, + { + "epoch": 0.03, + "learning_rate": 2.9515392719185296e-07, + "logits/chosen": -2.1238110065460205, + "logits/rejected": -2.0988240242004395, + "logps/chosen": -109.81375885009766, + "logps/rejected": -145.91249084472656, + "loss": 0.6036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6162945032119751, + "rewards/margins": 0.5173001885414124, + "rewards/rejected": -1.1335947513580322, + "step": 269 + }, + { + "epoch": 0.03, + "learning_rate": 2.951188107222287e-07, + "logits/chosen": -2.281275749206543, + "logits/rejected": -2.71203351020813, + "logps/chosen": -214.976318359375, + "logps/rejected": -231.88339233398438, + "loss": 0.3696, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0695578083395958, + "rewards/margins": 1.3117856979370117, + "rewards/rejected": -1.3813436031341553, + "step": 270 + }, + { + "epoch": 0.03, + "learning_rate": 2.9508369425260447e-07, + "logits/chosen": -2.435311794281006, + "logits/rejected": -2.2807459831237793, + "logps/chosen": -205.1721954345703, + "logps/rejected": -184.4014129638672, + "loss": 0.5695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21257978677749634, + "rewards/margins": 0.39207518100738525, + "rewards/rejected": -0.6046549081802368, + "step": 271 + }, + { + "epoch": 0.03, + "learning_rate": 2.950485777829802e-07, + "logits/chosen": -2.2403531074523926, + "logits/rejected": -2.433664321899414, + "logps/chosen": -364.83575439453125, + "logps/rejected": -322.3677062988281, + "loss": 0.5652, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5442026257514954, + "rewards/margins": 0.816289484500885, + "rewards/rejected": -1.3604919910430908, + "step": 272 + }, + { + "epoch": 0.03, + "learning_rate": 2.950134613133559e-07, + "logits/chosen": -2.48274564743042, + "logits/rejected": -2.600632905960083, + "logps/chosen": -254.28736877441406, + "logps/rejected": -177.2164306640625, + "loss": 0.5016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23177993297576904, + "rewards/margins": 0.6285390257835388, + "rewards/rejected": -0.8603190183639526, + "step": 273 + }, + { + "epoch": 0.03, + "learning_rate": 2.949783448437317e-07, + "logits/chosen": -2.090643882751465, + "logits/rejected": -2.19642972946167, + "logps/chosen": -247.14442443847656, + "logps/rejected": -232.71356201171875, + "loss": 0.5519, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5942323803901672, + "rewards/margins": 0.8580787777900696, + "rewards/rejected": -1.4523111581802368, + "step": 274 + }, + { + "epoch": 0.03, + "learning_rate": 2.9494322837410743e-07, + "logits/chosen": -2.1570487022399902, + "logits/rejected": -2.227642059326172, + "logps/chosen": -331.8638610839844, + "logps/rejected": -194.77944946289062, + "loss": 0.8201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8403474688529968, + "rewards/margins": 0.025163963437080383, + "rewards/rejected": -0.865511417388916, + "step": 275 + }, + { + "epoch": 0.03, + "learning_rate": 2.949081119044832e-07, + "logits/chosen": -2.5300683975219727, + "logits/rejected": -2.4333624839782715, + "logps/chosen": -259.010009765625, + "logps/rejected": -235.95590209960938, + "loss": 0.5318, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5601819157600403, + "rewards/margins": 0.626712441444397, + "rewards/rejected": -1.186894416809082, + "step": 276 + }, + { + "epoch": 0.03, + "learning_rate": 2.9487299543485894e-07, + "logits/chosen": -2.5485079288482666, + "logits/rejected": -2.631439685821533, + "logps/chosen": -282.4809875488281, + "logps/rejected": -144.06698608398438, + "loss": 0.3305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.138786181807518, + "rewards/margins": 1.226760983467102, + "rewards/rejected": -1.0879747867584229, + "step": 277 + }, + { + "epoch": 0.03, + "learning_rate": 2.948378789652347e-07, + "logits/chosen": -2.2638251781463623, + "logits/rejected": -2.240534543991089, + "logps/chosen": -372.77972412109375, + "logps/rejected": -348.0865783691406, + "loss": 0.7576, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9804244041442871, + "rewards/margins": 0.4889661371707916, + "rewards/rejected": -1.469390630722046, + "step": 278 + }, + { + "epoch": 0.03, + "learning_rate": 2.9480276249561045e-07, + "logits/chosen": -2.199970245361328, + "logits/rejected": -2.4654924869537354, + "logps/chosen": -425.300537109375, + "logps/rejected": -159.1180877685547, + "loss": 0.6262, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12179931253194809, + "rewards/margins": 0.6379648447036743, + "rewards/rejected": -0.7597641944885254, + "step": 279 + }, + { + "epoch": 0.03, + "learning_rate": 2.947676460259862e-07, + "logits/chosen": -2.555758237838745, + "logits/rejected": -2.3482275009155273, + "logps/chosen": -375.8440246582031, + "logps/rejected": -320.0694580078125, + "loss": 0.5443, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27846044301986694, + "rewards/margins": 0.909398078918457, + "rewards/rejected": -1.1878585815429688, + "step": 280 + }, + { + "epoch": 0.03, + "learning_rate": 2.947325295563619e-07, + "logits/chosen": -2.6971516609191895, + "logits/rejected": -2.7209959030151367, + "logps/chosen": -260.6485595703125, + "logps/rejected": -272.27734375, + "loss": 0.3306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27597489953041077, + "rewards/margins": 1.2830018997192383, + "rewards/rejected": -1.5589767694473267, + "step": 281 + }, + { + "epoch": 0.03, + "learning_rate": 2.9469741308673765e-07, + "logits/chosen": -2.565629720687866, + "logits/rejected": -2.663017749786377, + "logps/chosen": -190.19911193847656, + "logps/rejected": -191.72955322265625, + "loss": 0.7098, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3977823853492737, + "rewards/margins": 0.29230815172195435, + "rewards/rejected": -0.690090537071228, + "step": 282 + }, + { + "epoch": 0.03, + "learning_rate": 2.946622966171134e-07, + "logits/chosen": -2.3987889289855957, + "logits/rejected": -2.570647716522217, + "logps/chosen": -274.57989501953125, + "logps/rejected": -155.21533203125, + "loss": 0.5328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.681294858455658, + "rewards/margins": 0.4336378574371338, + "rewards/rejected": -1.1149327754974365, + "step": 283 + }, + { + "epoch": 0.03, + "learning_rate": 2.9462718014748916e-07, + "logits/chosen": -1.9799509048461914, + "logits/rejected": -2.0913195610046387, + "logps/chosen": -460.15106201171875, + "logps/rejected": -382.25848388671875, + "loss": 0.935, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6940089464187622, + "rewards/margins": 0.16911420226097107, + "rewards/rejected": -0.8631231188774109, + "step": 284 + }, + { + "epoch": 0.03, + "learning_rate": 2.945920636778649e-07, + "logits/chosen": -2.282864570617676, + "logits/rejected": -2.3367934226989746, + "logps/chosen": -122.86644744873047, + "logps/rejected": -151.83712768554688, + "loss": 0.4842, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3347893953323364, + "rewards/margins": 0.7834877967834473, + "rewards/rejected": -1.1182773113250732, + "step": 285 + }, + { + "epoch": 0.03, + "learning_rate": 2.945569472082406e-07, + "logits/chosen": -2.117490291595459, + "logits/rejected": -2.329458475112915, + "logps/chosen": -294.0777282714844, + "logps/rejected": -229.04652404785156, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07926157116889954, + "rewards/margins": 1.2816565036773682, + "rewards/rejected": -1.3609181642532349, + "step": 286 + }, + { + "epoch": 0.03, + "learning_rate": 2.9452183073861637e-07, + "logits/chosen": -2.5297348499298096, + "logits/rejected": -2.6685540676116943, + "logps/chosen": -267.8245544433594, + "logps/rejected": -167.74212646484375, + "loss": 0.5392, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42969074845314026, + "rewards/margins": 0.5574628710746765, + "rewards/rejected": -0.9871535897254944, + "step": 287 + }, + { + "epoch": 0.03, + "learning_rate": 2.944867142689922e-07, + "logits/chosen": -2.4031693935394287, + "logits/rejected": -2.490243911743164, + "logps/chosen": -229.80490112304688, + "logps/rejected": -203.2218475341797, + "loss": 0.8946, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6736038327217102, + "rewards/margins": -0.09855803102254868, + "rewards/rejected": -0.5750458836555481, + "step": 288 + }, + { + "epoch": 0.03, + "learning_rate": 2.944515977993679e-07, + "logits/chosen": -2.3816566467285156, + "logits/rejected": -2.5154058933258057, + "logps/chosen": -271.6398010253906, + "logps/rejected": -172.23048400878906, + "loss": 0.6597, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8466386198997498, + "rewards/margins": 0.3224322199821472, + "rewards/rejected": -1.1690709590911865, + "step": 289 + }, + { + "epoch": 0.03, + "learning_rate": 2.9441648132974363e-07, + "logits/chosen": -2.1021440029144287, + "logits/rejected": -2.558478355407715, + "logps/chosen": -351.9532470703125, + "logps/rejected": -249.16995239257812, + "loss": 0.7095, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6725323796272278, + "rewards/margins": 0.5412154793739319, + "rewards/rejected": -1.2137478590011597, + "step": 290 + }, + { + "epoch": 0.03, + "learning_rate": 2.943813648601194e-07, + "logits/chosen": -1.7940722703933716, + "logits/rejected": -2.1903252601623535, + "logps/chosen": -484.86285400390625, + "logps/rejected": -384.2651062011719, + "loss": 0.7542, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.157289743423462, + "rewards/margins": 0.5190855264663696, + "rewards/rejected": -1.6763752698898315, + "step": 291 + }, + { + "epoch": 0.03, + "learning_rate": 2.9434624839049514e-07, + "logits/chosen": -2.013561248779297, + "logits/rejected": -2.265477418899536, + "logps/chosen": -389.9479675292969, + "logps/rejected": -235.366943359375, + "loss": 0.798, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5889512896537781, + "rewards/margins": 0.24121691286563873, + "rewards/rejected": -0.8301681876182556, + "step": 292 + }, + { + "epoch": 0.03, + "learning_rate": 2.943111319208709e-07, + "logits/chosen": -2.7173972129821777, + "logits/rejected": -2.6816351413726807, + "logps/chosen": -144.42135620117188, + "logps/rejected": -186.45896911621094, + "loss": 0.2574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15038637816905975, + "rewards/margins": 1.5699679851531982, + "rewards/rejected": -1.7203543186187744, + "step": 293 + }, + { + "epoch": 0.03, + "learning_rate": 2.942760154512466e-07, + "logits/chosen": -2.362577438354492, + "logits/rejected": -2.5445141792297363, + "logps/chosen": -355.3098449707031, + "logps/rejected": -291.9595642089844, + "loss": 0.3196, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40410059690475464, + "rewards/margins": 1.5102170705795288, + "rewards/rejected": -1.9143176078796387, + "step": 294 + }, + { + "epoch": 0.03, + "learning_rate": 2.9424089898162235e-07, + "logits/chosen": -2.14021635055542, + "logits/rejected": -1.9815465211868286, + "logps/chosen": -415.383544921875, + "logps/rejected": -300.37200927734375, + "loss": 0.9256, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7506393194198608, + "rewards/margins": -0.22799885272979736, + "rewards/rejected": -0.5226405262947083, + "step": 295 + }, + { + "epoch": 0.03, + "learning_rate": 2.942057825119981e-07, + "logits/chosen": -2.9224026203155518, + "logits/rejected": -2.8051273822784424, + "logps/chosen": -200.6275634765625, + "logps/rejected": -171.9537811279297, + "loss": 0.4878, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8506995439529419, + "rewards/margins": 1.016174554824829, + "rewards/rejected": -1.8668742179870605, + "step": 296 + }, + { + "epoch": 0.03, + "learning_rate": 2.9417066604237386e-07, + "logits/chosen": -1.8748866319656372, + "logits/rejected": -2.370699167251587, + "logps/chosen": -408.7513732910156, + "logps/rejected": -273.0702819824219, + "loss": 0.9028, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2204723358154297, + "rewards/margins": 0.7696370482444763, + "rewards/rejected": -1.9901094436645508, + "step": 297 + }, + { + "epoch": 0.03, + "learning_rate": 2.941355495727496e-07, + "logits/chosen": -2.461463689804077, + "logits/rejected": -2.1740200519561768, + "logps/chosen": -364.877197265625, + "logps/rejected": -305.4757080078125, + "loss": 0.574, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04204273223876953, + "rewards/margins": 1.1109271049499512, + "rewards/rejected": -1.0688843727111816, + "step": 298 + }, + { + "epoch": 0.03, + "learning_rate": 2.9410043310312536e-07, + "logits/chosen": -2.5175399780273438, + "logits/rejected": -2.492159366607666, + "logps/chosen": -139.8627471923828, + "logps/rejected": -158.35716247558594, + "loss": 0.9591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8358961343765259, + "rewards/margins": -0.020810842514038086, + "rewards/rejected": -0.8150852918624878, + "step": 299 + }, + { + "epoch": 0.03, + "learning_rate": 2.940653166335011e-07, + "logits/chosen": -2.526010751724243, + "logits/rejected": -2.416977882385254, + "logps/chosen": -264.51300048828125, + "logps/rejected": -359.6108703613281, + "loss": 0.2999, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47794821858406067, + "rewards/margins": 1.5804721117019653, + "rewards/rejected": -2.058420419692993, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 2.9403020016387687e-07, + "logits/chosen": -2.1319918632507324, + "logits/rejected": -2.4230334758758545, + "logps/chosen": -272.0066223144531, + "logps/rejected": -194.845947265625, + "loss": 1.0323, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.414311170578003, + "rewards/margins": -0.27127644419670105, + "rewards/rejected": -1.1430346965789795, + "step": 301 + }, + { + "epoch": 0.03, + "learning_rate": 2.9399508369425257e-07, + "logits/chosen": -2.577136993408203, + "logits/rejected": -2.3766889572143555, + "logps/chosen": -243.67385864257812, + "logps/rejected": -217.57290649414062, + "loss": 0.5588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7634921669960022, + "rewards/margins": 0.6291521787643433, + "rewards/rejected": -1.3926442861557007, + "step": 302 + }, + { + "epoch": 0.03, + "learning_rate": 2.939599672246283e-07, + "logits/chosen": -2.6137471199035645, + "logits/rejected": -2.2707159519195557, + "logps/chosen": -174.1868438720703, + "logps/rejected": -236.3575897216797, + "loss": 0.3797, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4554882049560547, + "rewards/margins": 1.316563367843628, + "rewards/rejected": -1.7720515727996826, + "step": 303 + }, + { + "epoch": 0.04, + "learning_rate": 2.939248507550041e-07, + "logits/chosen": -2.131531238555908, + "logits/rejected": -2.0698797702789307, + "logps/chosen": -272.2153015136719, + "logps/rejected": -315.72906494140625, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42857807874679565, + "rewards/margins": 1.4032621383666992, + "rewards/rejected": -1.8318402767181396, + "step": 304 + }, + { + "epoch": 0.04, + "learning_rate": 2.9388973428537983e-07, + "logits/chosen": -2.192800760269165, + "logits/rejected": -2.289705753326416, + "logps/chosen": -399.411865234375, + "logps/rejected": -335.3656921386719, + "loss": 0.4295, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10309697687625885, + "rewards/margins": 1.3041479587554932, + "rewards/rejected": -1.2010509967803955, + "step": 305 + }, + { + "epoch": 0.04, + "learning_rate": 2.938546178157556e-07, + "logits/chosen": -2.574787139892578, + "logits/rejected": -2.588287353515625, + "logps/chosen": -166.89559936523438, + "logps/rejected": -361.4928894042969, + "loss": 0.3456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39184850454330444, + "rewards/margins": 1.3199741840362549, + "rewards/rejected": -1.7118228673934937, + "step": 306 + }, + { + "epoch": 0.04, + "learning_rate": 2.9381950134613134e-07, + "logits/chosen": -2.964386463165283, + "logits/rejected": -2.9055564403533936, + "logps/chosen": -150.81398010253906, + "logps/rejected": -162.65118408203125, + "loss": 0.8059, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5871843099594116, + "rewards/margins": 0.22496835887432098, + "rewards/rejected": -0.8121526837348938, + "step": 307 + }, + { + "epoch": 0.04, + "learning_rate": 2.9378438487650704e-07, + "logits/chosen": -2.1143198013305664, + "logits/rejected": -2.3336005210876465, + "logps/chosen": -225.89047241210938, + "logps/rejected": -244.87786865234375, + "loss": 0.5198, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5538389682769775, + "rewards/margins": 0.8889042735099792, + "rewards/rejected": -1.442743182182312, + "step": 308 + }, + { + "epoch": 0.04, + "learning_rate": 2.937492684068828e-07, + "logits/chosen": -2.586839199066162, + "logits/rejected": -2.740248203277588, + "logps/chosen": -234.30252075195312, + "logps/rejected": -258.9303283691406, + "loss": 0.2663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3531617224216461, + "rewards/margins": 1.659736156463623, + "rewards/rejected": -2.0128979682922363, + "step": 309 + }, + { + "epoch": 0.04, + "learning_rate": 2.9371415193725855e-07, + "logits/chosen": -2.14638352394104, + "logits/rejected": -2.129441976547241, + "logps/chosen": -228.14111328125, + "logps/rejected": -230.8649139404297, + "loss": 0.5198, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7805700898170471, + "rewards/margins": 1.1138331890106201, + "rewards/rejected": -1.8944032192230225, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 2.936790354676343e-07, + "logits/chosen": -2.1915736198425293, + "logits/rejected": -2.245243549346924, + "logps/chosen": -329.0081481933594, + "logps/rejected": -308.6106872558594, + "loss": 0.4491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3285095691680908, + "rewards/margins": 0.6169960498809814, + "rewards/rejected": -0.9455056190490723, + "step": 311 + }, + { + "epoch": 0.04, + "learning_rate": 2.9364391899801006e-07, + "logits/chosen": -2.233534097671509, + "logits/rejected": -2.3738598823547363, + "logps/chosen": -310.25262451171875, + "logps/rejected": -156.22097778320312, + "loss": 0.6632, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26533278822898865, + "rewards/margins": 0.23298591375350952, + "rewards/rejected": -0.49831870198249817, + "step": 312 + }, + { + "epoch": 0.04, + "learning_rate": 2.936088025283858e-07, + "logits/chosen": -2.247321844100952, + "logits/rejected": -2.4254310131073, + "logps/chosen": -302.04364013671875, + "logps/rejected": -248.67855834960938, + "loss": 0.9912, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6334093809127808, + "rewards/margins": 0.03866736590862274, + "rewards/rejected": -0.6720767617225647, + "step": 313 + }, + { + "epoch": 0.04, + "learning_rate": 2.9357368605876157e-07, + "logits/chosen": -2.3149588108062744, + "logits/rejected": -2.386685609817505, + "logps/chosen": -327.7912292480469, + "logps/rejected": -283.8422546386719, + "loss": 1.3721, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1702240705490112, + "rewards/margins": -0.6499944925308228, + "rewards/rejected": -0.5202295184135437, + "step": 314 + }, + { + "epoch": 0.04, + "learning_rate": 2.935385695891373e-07, + "logits/chosen": -2.0543737411499023, + "logits/rejected": -2.1373019218444824, + "logps/chosen": -486.49127197265625, + "logps/rejected": -473.36273193359375, + "loss": 0.4392, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4708430767059326, + "rewards/margins": 2.993130922317505, + "rewards/rejected": -4.4639739990234375, + "step": 315 + }, + { + "epoch": 0.04, + "learning_rate": 2.93503453119513e-07, + "logits/chosen": -2.382384777069092, + "logits/rejected": -2.3139007091522217, + "logps/chosen": -244.15234375, + "logps/rejected": -298.92791748046875, + "loss": 0.3518, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16546761989593506, + "rewards/margins": 1.8811949491500854, + "rewards/rejected": -2.0466625690460205, + "step": 316 + }, + { + "epoch": 0.04, + "learning_rate": 2.934683366498888e-07, + "logits/chosen": -2.194863796234131, + "logits/rejected": -2.1163110733032227, + "logps/chosen": -368.8987121582031, + "logps/rejected": -330.4774475097656, + "loss": 0.8136, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.7519022226333618, + "rewards/margins": -0.15950801968574524, + "rewards/rejected": -1.592394232749939, + "step": 317 + }, + { + "epoch": 0.04, + "learning_rate": 2.9343322018026453e-07, + "logits/chosen": -2.5248467922210693, + "logits/rejected": -2.285937547683716, + "logps/chosen": -119.87092590332031, + "logps/rejected": -209.5535888671875, + "loss": 0.4436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1545644998550415, + "rewards/margins": 0.9422765970230103, + "rewards/rejected": -1.0968410968780518, + "step": 318 + }, + { + "epoch": 0.04, + "learning_rate": 2.933981037106403e-07, + "logits/chosen": -2.7542662620544434, + "logits/rejected": -2.833341121673584, + "logps/chosen": -148.0059814453125, + "logps/rejected": -219.8675537109375, + "loss": 0.4867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44089066982269287, + "rewards/margins": 1.1384673118591309, + "rewards/rejected": -1.5793578624725342, + "step": 319 + }, + { + "epoch": 0.04, + "learning_rate": 2.9336298724101604e-07, + "logits/chosen": -2.694143772125244, + "logits/rejected": -2.553379535675049, + "logps/chosen": -296.6516418457031, + "logps/rejected": -332.866455078125, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3042624592781067, + "rewards/margins": 1.2235870361328125, + "rewards/rejected": -1.5278496742248535, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 2.9332787077139174e-07, + "logits/chosen": -2.024030923843384, + "logits/rejected": -2.1723318099975586, + "logps/chosen": -365.7708740234375, + "logps/rejected": -352.2488708496094, + "loss": 0.4648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7730928659439087, + "rewards/margins": 0.745556652545929, + "rewards/rejected": -1.5186495780944824, + "step": 321 + }, + { + "epoch": 0.04, + "learning_rate": 2.9329275430176754e-07, + "logits/chosen": -1.9563077688217163, + "logits/rejected": -2.1159908771514893, + "logps/chosen": -349.0641174316406, + "logps/rejected": -257.67596435546875, + "loss": 0.7272, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2006678581237793, + "rewards/margins": 0.014318808913230896, + "rewards/rejected": -0.2149866670370102, + "step": 322 + }, + { + "epoch": 0.04, + "learning_rate": 2.9325763783214324e-07, + "logits/chosen": -2.79909348487854, + "logits/rejected": -2.708404779434204, + "logps/chosen": -249.9886474609375, + "logps/rejected": -175.96224975585938, + "loss": 0.7872, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7840108871459961, + "rewards/margins": 0.01597757637500763, + "rewards/rejected": -0.7999884486198425, + "step": 323 + }, + { + "epoch": 0.04, + "learning_rate": 2.93222521362519e-07, + "logits/chosen": -2.108778953552246, + "logits/rejected": -2.2780587673187256, + "logps/chosen": -421.2821044921875, + "logps/rejected": -232.55642700195312, + "loss": 0.8384, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7731979489326477, + "rewards/margins": 0.18959054350852966, + "rewards/rejected": -0.9627885818481445, + "step": 324 + }, + { + "epoch": 0.04, + "learning_rate": 2.9318740489289475e-07, + "logits/chosen": -2.3660497665405273, + "logits/rejected": -2.673567771911621, + "logps/chosen": -212.46749877929688, + "logps/rejected": -205.17225646972656, + "loss": 0.8715, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6689513921737671, + "rewards/margins": -0.19271454215049744, + "rewards/rejected": -0.47623687982559204, + "step": 325 + }, + { + "epoch": 0.04, + "learning_rate": 2.931522884232705e-07, + "logits/chosen": -2.760322093963623, + "logits/rejected": -2.594971179962158, + "logps/chosen": -130.59165954589844, + "logps/rejected": -205.90054321289062, + "loss": 0.5231, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15813301503658295, + "rewards/margins": 1.0807348489761353, + "rewards/rejected": -1.2388678789138794, + "step": 326 + }, + { + "epoch": 0.04, + "learning_rate": 2.9311717195364626e-07, + "logits/chosen": -2.1170926094055176, + "logits/rejected": -2.173877716064453, + "logps/chosen": -233.58583068847656, + "logps/rejected": -286.125, + "loss": 0.6138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5149972438812256, + "rewards/margins": 0.25206321477890015, + "rewards/rejected": -0.7670604586601257, + "step": 327 + }, + { + "epoch": 0.04, + "learning_rate": 2.93082055484022e-07, + "logits/chosen": -3.026440382003784, + "logits/rejected": -3.0399186611175537, + "logps/chosen": -201.6120147705078, + "logps/rejected": -216.7086181640625, + "loss": 0.2695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4788179397583008, + "rewards/margins": 1.6626230478286743, + "rewards/rejected": -2.1414408683776855, + "step": 328 + }, + { + "epoch": 0.04, + "learning_rate": 2.930469390143977e-07, + "logits/chosen": -2.179494619369507, + "logits/rejected": -2.2027623653411865, + "logps/chosen": -314.65338134765625, + "logps/rejected": -314.1252746582031, + "loss": 0.4503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2535134255886078, + "rewards/margins": 1.219962239265442, + "rewards/rejected": -1.473475694656372, + "step": 329 + }, + { + "epoch": 0.04, + "learning_rate": 2.9301182254477347e-07, + "logits/chosen": -2.536802053451538, + "logits/rejected": -2.7230517864227295, + "logps/chosen": -266.58331298828125, + "logps/rejected": -224.19720458984375, + "loss": 0.8593, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8295977711677551, + "rewards/margins": 0.5366829037666321, + "rewards/rejected": -1.3662805557250977, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.929767060751492e-07, + "logits/chosen": -2.7782866954803467, + "logits/rejected": -2.8315181732177734, + "logps/chosen": -262.2361755371094, + "logps/rejected": -345.9444580078125, + "loss": 0.5223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2741527855396271, + "rewards/margins": 0.42722952365875244, + "rewards/rejected": -0.7013822793960571, + "step": 331 + }, + { + "epoch": 0.04, + "learning_rate": 2.92941589605525e-07, + "logits/chosen": -2.0324857234954834, + "logits/rejected": -2.150836944580078, + "logps/chosen": -998.1727294921875, + "logps/rejected": -283.5758361816406, + "loss": 1.9417, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.3074049949645996, + "rewards/margins": -1.2913403511047363, + "rewards/rejected": -1.0160646438598633, + "step": 332 + }, + { + "epoch": 0.04, + "learning_rate": 2.9290647313590073e-07, + "logits/chosen": -2.954866409301758, + "logits/rejected": -2.8165180683135986, + "logps/chosen": -252.54147338867188, + "logps/rejected": -270.80706787109375, + "loss": 0.3611, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31102657318115234, + "rewards/margins": 1.4083919525146484, + "rewards/rejected": -1.7194185256958008, + "step": 333 + }, + { + "epoch": 0.04, + "learning_rate": 2.928713566662765e-07, + "logits/chosen": -2.1641108989715576, + "logits/rejected": -2.211155891418457, + "logps/chosen": -206.4960174560547, + "logps/rejected": -215.82147216796875, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9880786538124084, + "rewards/margins": 0.814436674118042, + "rewards/rejected": -1.8025152683258057, + "step": 334 + }, + { + "epoch": 0.04, + "learning_rate": 2.9283624019665224e-07, + "logits/chosen": -2.1372780799865723, + "logits/rejected": -2.2275197505950928, + "logps/chosen": -255.3368682861328, + "logps/rejected": -215.7132110595703, + "loss": 0.8966, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.233649730682373, + "rewards/margins": -0.18994951248168945, + "rewards/rejected": -1.0437002182006836, + "step": 335 + }, + { + "epoch": 0.04, + "learning_rate": 2.92801123727028e-07, + "logits/chosen": -1.7364871501922607, + "logits/rejected": -1.9822748899459839, + "logps/chosen": -381.71441650390625, + "logps/rejected": -316.5683898925781, + "loss": 0.4319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43522724509239197, + "rewards/margins": 0.7411462068557739, + "rewards/rejected": -1.1763734817504883, + "step": 336 + }, + { + "epoch": 0.04, + "learning_rate": 2.927660072574037e-07, + "logits/chosen": -1.9639652967453003, + "logits/rejected": -1.9747326374053955, + "logps/chosen": -293.55181884765625, + "logps/rejected": -243.66012573242188, + "loss": 0.7418, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.48222115635871887, + "rewards/margins": 0.23482392728328705, + "rewards/rejected": -0.7170450687408447, + "step": 337 + }, + { + "epoch": 0.04, + "learning_rate": 2.9273089078777945e-07, + "logits/chosen": -2.0725936889648438, + "logits/rejected": -2.072930335998535, + "logps/chosen": -251.53028869628906, + "logps/rejected": -229.16220092773438, + "loss": 0.3977, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1716436892747879, + "rewards/margins": 0.8058362007141113, + "rewards/rejected": -0.6341924667358398, + "step": 338 + }, + { + "epoch": 0.04, + "learning_rate": 2.926957743181552e-07, + "logits/chosen": -2.180464029312134, + "logits/rejected": -1.9915170669555664, + "logps/chosen": -358.897705078125, + "logps/rejected": -372.27447509765625, + "loss": 0.662, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5425485968589783, + "rewards/margins": 0.7949169874191284, + "rewards/rejected": -1.337465524673462, + "step": 339 + }, + { + "epoch": 0.04, + "learning_rate": 2.9266065784853095e-07, + "logits/chosen": -1.7858573198318481, + "logits/rejected": -1.7678648233413696, + "logps/chosen": -383.10272216796875, + "logps/rejected": -206.48765563964844, + "loss": 0.4265, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14428487420082092, + "rewards/margins": 1.103689432144165, + "rewards/rejected": -1.2479742765426636, + "step": 340 + }, + { + "epoch": 0.04, + "learning_rate": 2.926255413789067e-07, + "logits/chosen": -1.9283461570739746, + "logits/rejected": -2.1182165145874023, + "logps/chosen": -289.877685546875, + "logps/rejected": -184.9083709716797, + "loss": 0.4952, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2174888402223587, + "rewards/margins": 0.6608957052230835, + "rewards/rejected": -0.8783845901489258, + "step": 341 + }, + { + "epoch": 0.04, + "learning_rate": 2.925904249092824e-07, + "logits/chosen": -2.5400097370147705, + "logits/rejected": -2.814152717590332, + "logps/chosen": -347.95037841796875, + "logps/rejected": -223.09178161621094, + "loss": 0.4644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29499274492263794, + "rewards/margins": 0.978171706199646, + "rewards/rejected": -1.2731645107269287, + "step": 342 + }, + { + "epoch": 0.04, + "learning_rate": 2.9255530843965816e-07, + "logits/chosen": -2.1393542289733887, + "logits/rejected": -2.4428181648254395, + "logps/chosen": -277.7389221191406, + "logps/rejected": -157.90931701660156, + "loss": 0.7603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8610785603523254, + "rewards/margins": 0.10699553787708282, + "rewards/rejected": -0.9680740833282471, + "step": 343 + }, + { + "epoch": 0.04, + "learning_rate": 2.9252019197003397e-07, + "logits/chosen": -2.2113163471221924, + "logits/rejected": -2.2069339752197266, + "logps/chosen": -326.8422546386719, + "logps/rejected": -286.1974182128906, + "loss": 0.4747, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5757253766059875, + "rewards/margins": 0.9536944627761841, + "rewards/rejected": -1.5294198989868164, + "step": 344 + }, + { + "epoch": 0.04, + "learning_rate": 2.9248507550040967e-07, + "logits/chosen": -2.3260269165039062, + "logits/rejected": -2.5012564659118652, + "logps/chosen": -240.7285919189453, + "logps/rejected": -198.6743927001953, + "loss": 0.7965, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5423475503921509, + "rewards/margins": 0.6434760689735413, + "rewards/rejected": -1.1858235597610474, + "step": 345 + }, + { + "epoch": 0.04, + "learning_rate": 2.924499590307854e-07, + "logits/chosen": -2.8701748847961426, + "logits/rejected": -2.8296737670898438, + "logps/chosen": -285.1092224121094, + "logps/rejected": -201.68743896484375, + "loss": 0.9916, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9459737539291382, + "rewards/margins": 0.3743845224380493, + "rewards/rejected": -1.3203582763671875, + "step": 346 + }, + { + "epoch": 0.04, + "learning_rate": 2.924148425611612e-07, + "logits/chosen": -2.5335755348205566, + "logits/rejected": -2.8178906440734863, + "logps/chosen": -248.2239532470703, + "logps/rejected": -157.94873046875, + "loss": 0.6617, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6978097558021545, + "rewards/margins": 1.120258092880249, + "rewards/rejected": -1.8180677890777588, + "step": 347 + }, + { + "epoch": 0.04, + "learning_rate": 2.9237972609153693e-07, + "logits/chosen": -2.5619499683380127, + "logits/rejected": -2.5144455432891846, + "logps/chosen": -265.0290222167969, + "logps/rejected": -216.6509246826172, + "loss": 0.6712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9554342031478882, + "rewards/margins": 1.3588649034500122, + "rewards/rejected": -2.3142991065979004, + "step": 348 + }, + { + "epoch": 0.04, + "learning_rate": 2.923446096219127e-07, + "logits/chosen": -2.670039415359497, + "logits/rejected": -2.632021188735962, + "logps/chosen": -174.1725616455078, + "logps/rejected": -244.06626892089844, + "loss": 0.5621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5214476585388184, + "rewards/margins": 0.8755272626876831, + "rewards/rejected": -1.396975040435791, + "step": 349 + }, + { + "epoch": 0.04, + "learning_rate": 2.923094931522884e-07, + "logits/chosen": -2.0975537300109863, + "logits/rejected": -2.086503505706787, + "logps/chosen": -382.91607666015625, + "logps/rejected": -284.3114013671875, + "loss": 0.8662, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.777912437915802, + "rewards/margins": -0.04748162627220154, + "rewards/rejected": -0.7304307222366333, + "step": 350 + }, + { + "epoch": 0.04, + "learning_rate": 2.9227437668266414e-07, + "logits/chosen": -1.965627670288086, + "logits/rejected": -2.3399698734283447, + "logps/chosen": -288.694091796875, + "logps/rejected": -220.2798309326172, + "loss": 0.7161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4856274724006653, + "rewards/margins": 0.35969895124435425, + "rewards/rejected": -0.8453263640403748, + "step": 351 + }, + { + "epoch": 0.04, + "learning_rate": 2.922392602130399e-07, + "logits/chosen": -1.8349500894546509, + "logits/rejected": -2.0622904300689697, + "logps/chosen": -301.8479309082031, + "logps/rejected": -279.8482666015625, + "loss": 0.8048, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43677785992622375, + "rewards/margins": 0.16251347959041595, + "rewards/rejected": -0.5992913246154785, + "step": 352 + }, + { + "epoch": 0.04, + "learning_rate": 2.9220414374341565e-07, + "logits/chosen": -1.9820833206176758, + "logits/rejected": -1.9992607831954956, + "logps/chosen": -286.7822265625, + "logps/rejected": -294.9604187011719, + "loss": 1.1337, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.033996343612671, + "rewards/margins": 0.6318866014480591, + "rewards/rejected": -2.6658830642700195, + "step": 353 + }, + { + "epoch": 0.04, + "learning_rate": 2.921690272737914e-07, + "logits/chosen": -2.895094156265259, + "logits/rejected": -2.7266335487365723, + "logps/chosen": -275.4627685546875, + "logps/rejected": -291.4943542480469, + "loss": 0.5689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09963082522153854, + "rewards/margins": 0.45239460468292236, + "rewards/rejected": -0.5520253777503967, + "step": 354 + }, + { + "epoch": 0.04, + "learning_rate": 2.921339108041671e-07, + "logits/chosen": -2.351182699203491, + "logits/rejected": -2.231276750564575, + "logps/chosen": -254.49407958984375, + "logps/rejected": -257.73638916015625, + "loss": 0.666, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24487818777561188, + "rewards/margins": 0.22477301955223083, + "rewards/rejected": -0.46965116262435913, + "step": 355 + }, + { + "epoch": 0.04, + "learning_rate": 2.920987943345429e-07, + "logits/chosen": -2.4634814262390137, + "logits/rejected": -2.240978717803955, + "logps/chosen": -130.24842834472656, + "logps/rejected": -131.60873413085938, + "loss": 0.4487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25739583373069763, + "rewards/margins": 0.6462665796279907, + "rewards/rejected": -0.9036624431610107, + "step": 356 + }, + { + "epoch": 0.04, + "learning_rate": 2.9206367786491866e-07, + "logits/chosen": -1.7970781326293945, + "logits/rejected": -2.164903163909912, + "logps/chosen": -358.35693359375, + "logps/rejected": -281.2889099121094, + "loss": 0.8527, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2108721733093262, + "rewards/margins": 0.14392682909965515, + "rewards/rejected": -1.3547990322113037, + "step": 357 + }, + { + "epoch": 0.04, + "learning_rate": 2.9202856139529436e-07, + "logits/chosen": -2.7267661094665527, + "logits/rejected": -2.6551706790924072, + "logps/chosen": -219.84765625, + "logps/rejected": -227.89236450195312, + "loss": 0.4108, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6640541553497314, + "rewards/margins": 1.2804590463638306, + "rewards/rejected": -1.9445130825042725, + "step": 358 + }, + { + "epoch": 0.04, + "learning_rate": 2.919934449256701e-07, + "logits/chosen": -2.087078332901001, + "logits/rejected": -2.279843807220459, + "logps/chosen": -467.06231689453125, + "logps/rejected": -179.79776000976562, + "loss": 1.2264, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1490752696990967, + "rewards/margins": -0.17736327648162842, + "rewards/rejected": -1.9717121124267578, + "step": 359 + }, + { + "epoch": 0.04, + "learning_rate": 2.9195832845604587e-07, + "logits/chosen": -1.8708100318908691, + "logits/rejected": -2.154357433319092, + "logps/chosen": -241.65689086914062, + "logps/rejected": -179.92271423339844, + "loss": 0.6073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2146419733762741, + "rewards/margins": 0.31570303440093994, + "rewards/rejected": -0.5303450226783752, + "step": 360 + }, + { + "epoch": 0.04, + "learning_rate": 2.919232119864216e-07, + "logits/chosen": -2.42993426322937, + "logits/rejected": -2.5357110500335693, + "logps/chosen": -246.03916931152344, + "logps/rejected": -335.40594482421875, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6740235686302185, + "rewards/margins": 1.0087673664093018, + "rewards/rejected": -1.682790994644165, + "step": 361 + }, + { + "epoch": 0.04, + "learning_rate": 2.918880955167974e-07, + "logits/chosen": -2.4271085262298584, + "logits/rejected": -2.5628576278686523, + "logps/chosen": -253.90069580078125, + "logps/rejected": -234.41616821289062, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4735167920589447, + "rewards/margins": 1.3727918863296509, + "rewards/rejected": -1.846308708190918, + "step": 362 + }, + { + "epoch": 0.04, + "learning_rate": 2.918529790471731e-07, + "logits/chosen": -2.5186169147491455, + "logits/rejected": -2.5501577854156494, + "logps/chosen": -294.24993896484375, + "logps/rejected": -366.64239501953125, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5334408283233643, + "rewards/margins": 1.0676581859588623, + "rewards/rejected": -1.6010990142822266, + "step": 363 + }, + { + "epoch": 0.04, + "learning_rate": 2.9181786257754883e-07, + "logits/chosen": -2.830936908721924, + "logits/rejected": -2.8562521934509277, + "logps/chosen": -226.19114685058594, + "logps/rejected": -178.90814208984375, + "loss": 0.408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.894342839717865, + "rewards/margins": 1.013985276222229, + "rewards/rejected": -1.9083282947540283, + "step": 364 + }, + { + "epoch": 0.04, + "learning_rate": 2.9178274610792464e-07, + "logits/chosen": -2.2748374938964844, + "logits/rejected": -2.189448356628418, + "logps/chosen": -353.1754150390625, + "logps/rejected": -356.9119567871094, + "loss": 0.295, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09330364316701889, + "rewards/margins": 1.7295153141021729, + "rewards/rejected": -1.636211633682251, + "step": 365 + }, + { + "epoch": 0.04, + "learning_rate": 2.9174762963830034e-07, + "logits/chosen": -2.3839144706726074, + "logits/rejected": -2.4032530784606934, + "logps/chosen": -231.05160522460938, + "logps/rejected": -284.3983154296875, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38171684741973877, + "rewards/margins": 0.7287415266036987, + "rewards/rejected": -1.110458254814148, + "step": 366 + }, + { + "epoch": 0.04, + "learning_rate": 2.917125131686761e-07, + "logits/chosen": -2.877009868621826, + "logits/rejected": -2.6521239280700684, + "logps/chosen": -189.2588348388672, + "logps/rejected": -241.2522430419922, + "loss": 0.6483, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9739365577697754, + "rewards/margins": 0.9730392694473267, + "rewards/rejected": -1.9469759464263916, + "step": 367 + }, + { + "epoch": 0.04, + "learning_rate": 2.9167739669905185e-07, + "logits/chosen": -2.828920602798462, + "logits/rejected": -2.8490872383117676, + "logps/chosen": -243.96926879882812, + "logps/rejected": -198.3373565673828, + "loss": 0.3653, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.035270288586616516, + "rewards/margins": 1.5122712850570679, + "rewards/rejected": -1.4770010709762573, + "step": 368 + }, + { + "epoch": 0.04, + "learning_rate": 2.916422802294276e-07, + "logits/chosen": -2.037053346633911, + "logits/rejected": -1.984724760055542, + "logps/chosen": -234.22047424316406, + "logps/rejected": -251.41908264160156, + "loss": 0.3708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3176129162311554, + "rewards/margins": 1.160236120223999, + "rewards/rejected": -1.477849006652832, + "step": 369 + }, + { + "epoch": 0.04, + "learning_rate": 2.9160716375980336e-07, + "logits/chosen": -2.5238232612609863, + "logits/rejected": -2.6262450218200684, + "logps/chosen": -268.46685791015625, + "logps/rejected": -273.4368591308594, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1040600836277008, + "rewards/margins": 1.7246915102005005, + "rewards/rejected": -1.828751564025879, + "step": 370 + }, + { + "epoch": 0.04, + "learning_rate": 2.9157204729017906e-07, + "logits/chosen": -2.639484405517578, + "logits/rejected": -2.505387544631958, + "logps/chosen": -329.2934875488281, + "logps/rejected": -208.34017944335938, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9042493104934692, + "rewards/margins": 0.7870454788208008, + "rewards/rejected": -1.6912946701049805, + "step": 371 + }, + { + "epoch": 0.04, + "learning_rate": 2.915369308205548e-07, + "logits/chosen": -2.0003151893615723, + "logits/rejected": -2.232238292694092, + "logps/chosen": -431.53009033203125, + "logps/rejected": -300.4361877441406, + "loss": 0.5134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4052274227142334, + "rewards/margins": 0.7702434062957764, + "rewards/rejected": -1.1754709482192993, + "step": 372 + }, + { + "epoch": 0.04, + "learning_rate": 2.9150181435093057e-07, + "logits/chosen": -2.474363088607788, + "logits/rejected": -2.5259947776794434, + "logps/chosen": -406.4010314941406, + "logps/rejected": -360.4688720703125, + "loss": 0.3753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12155690789222717, + "rewards/margins": 1.6833841800689697, + "rewards/rejected": -1.804941177368164, + "step": 373 + }, + { + "epoch": 0.04, + "learning_rate": 2.914666978813063e-07, + "logits/chosen": -2.428666591644287, + "logits/rejected": -2.480311393737793, + "logps/chosen": -276.1946716308594, + "logps/rejected": -233.4930419921875, + "loss": 0.5073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5683512687683105, + "rewards/margins": 0.7456830739974976, + "rewards/rejected": -1.314034342765808, + "step": 374 + }, + { + "epoch": 0.04, + "learning_rate": 2.914315814116821e-07, + "logits/chosen": -2.7227675914764404, + "logits/rejected": -2.4635589122772217, + "logps/chosen": -184.11558532714844, + "logps/rejected": -279.4179382324219, + "loss": 0.4542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.909077525138855, + "rewards/margins": 1.2706514596939087, + "rewards/rejected": -2.1797289848327637, + "step": 375 + }, + { + "epoch": 0.04, + "learning_rate": 2.913964649420578e-07, + "logits/chosen": -2.2076258659362793, + "logits/rejected": -2.2167956829071045, + "logps/chosen": -318.3890380859375, + "logps/rejected": -328.8642578125, + "loss": 0.5685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4043596684932709, + "rewards/margins": 0.5084593296051025, + "rewards/rejected": -0.9128190279006958, + "step": 376 + }, + { + "epoch": 0.04, + "learning_rate": 2.9136134847243353e-07, + "logits/chosen": -1.8433964252471924, + "logits/rejected": -1.878049612045288, + "logps/chosen": -305.0245666503906, + "logps/rejected": -352.8460388183594, + "loss": 0.4085, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4373244047164917, + "rewards/margins": 0.8888680338859558, + "rewards/rejected": -1.3261923789978027, + "step": 377 + }, + { + "epoch": 0.04, + "learning_rate": 2.9132623200280934e-07, + "logits/chosen": -2.3100380897521973, + "logits/rejected": -2.4540319442749023, + "logps/chosen": -309.055908203125, + "logps/rejected": -267.3284606933594, + "loss": 0.4689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15616929531097412, + "rewards/margins": 1.2584820985794067, + "rewards/rejected": -1.4146513938903809, + "step": 378 + }, + { + "epoch": 0.04, + "learning_rate": 2.9129111553318504e-07, + "logits/chosen": -2.22947096824646, + "logits/rejected": -2.4576897621154785, + "logps/chosen": -248.74462890625, + "logps/rejected": -233.0343475341797, + "loss": 0.8183, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5269343256950378, + "rewards/margins": 0.6090469360351562, + "rewards/rejected": -1.1359812021255493, + "step": 379 + }, + { + "epoch": 0.04, + "learning_rate": 2.912559990635608e-07, + "logits/chosen": -2.1361465454101562, + "logits/rejected": -2.1500396728515625, + "logps/chosen": -280.6698303222656, + "logps/rejected": -259.13677978515625, + "loss": 0.5722, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8938173055648804, + "rewards/margins": 0.8375884294509888, + "rewards/rejected": -1.7314058542251587, + "step": 380 + }, + { + "epoch": 0.04, + "learning_rate": 2.9122088259393654e-07, + "logits/chosen": -2.233600378036499, + "logits/rejected": -2.163321018218994, + "logps/chosen": -196.08724975585938, + "logps/rejected": -196.56845092773438, + "loss": 0.3285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31431150436401367, + "rewards/margins": 1.1862356662750244, + "rewards/rejected": -1.500547170639038, + "step": 381 + }, + { + "epoch": 0.04, + "learning_rate": 2.911857661243123e-07, + "logits/chosen": -2.9481942653656006, + "logits/rejected": -2.982309341430664, + "logps/chosen": -262.39093017578125, + "logps/rejected": -236.77911376953125, + "loss": 0.4643, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41171228885650635, + "rewards/margins": 1.2981891632080078, + "rewards/rejected": -1.7099014520645142, + "step": 382 + }, + { + "epoch": 0.04, + "learning_rate": 2.9115064965468805e-07, + "logits/chosen": -2.2372519969940186, + "logits/rejected": -2.2700040340423584, + "logps/chosen": -372.286865234375, + "logps/rejected": -365.88104248046875, + "loss": 0.7197, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6732273697853088, + "rewards/margins": 0.7077983617782593, + "rewards/rejected": -1.381025791168213, + "step": 383 + }, + { + "epoch": 0.04, + "learning_rate": 2.9111553318506375e-07, + "logits/chosen": -2.7132439613342285, + "logits/rejected": -2.8456735610961914, + "logps/chosen": -322.32086181640625, + "logps/rejected": -201.84048461914062, + "loss": 0.487, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2249995470046997, + "rewards/margins": 1.2482603788375854, + "rewards/rejected": -1.4732599258422852, + "step": 384 + }, + { + "epoch": 0.04, + "learning_rate": 2.910804167154395e-07, + "logits/chosen": -2.3256945610046387, + "logits/rejected": -2.0586915016174316, + "logps/chosen": -130.87356567382812, + "logps/rejected": -286.7454833984375, + "loss": 0.424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24268831312656403, + "rewards/margins": 0.9217972159385681, + "rewards/rejected": -1.1644855737686157, + "step": 385 + }, + { + "epoch": 0.04, + "learning_rate": 2.9104530024581526e-07, + "logits/chosen": -1.8151988983154297, + "logits/rejected": -1.979834794998169, + "logps/chosen": -466.7149658203125, + "logps/rejected": -320.51373291015625, + "loss": 0.4455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15691949427127838, + "rewards/margins": 0.9595628976821899, + "rewards/rejected": -1.1164823770523071, + "step": 386 + }, + { + "epoch": 0.04, + "learning_rate": 2.91010183776191e-07, + "logits/chosen": -2.5161662101745605, + "logits/rejected": -2.525355339050293, + "logps/chosen": -390.4044189453125, + "logps/rejected": -237.01165771484375, + "loss": 0.4636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5039903521537781, + "rewards/margins": 0.8422777652740479, + "rewards/rejected": -1.3462681770324707, + "step": 387 + }, + { + "epoch": 0.04, + "learning_rate": 2.9097506730656677e-07, + "logits/chosen": -2.375640392303467, + "logits/rejected": -2.49769926071167, + "logps/chosen": -327.1853942871094, + "logps/rejected": -202.9474639892578, + "loss": 0.7307, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5296344757080078, + "rewards/margins": 0.1416120082139969, + "rewards/rejected": -0.6712464094161987, + "step": 388 + }, + { + "epoch": 0.04, + "learning_rate": 2.909399508369425e-07, + "logits/chosen": -2.5545451641082764, + "logits/rejected": -2.3876705169677734, + "logps/chosen": -193.9999237060547, + "logps/rejected": -166.71563720703125, + "loss": 0.4839, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47466251254081726, + "rewards/margins": 0.9540408849716187, + "rewards/rejected": -1.4287033081054688, + "step": 389 + }, + { + "epoch": 0.04, + "learning_rate": 2.909048343673183e-07, + "logits/chosen": -2.2759928703308105, + "logits/rejected": -2.4587106704711914, + "logps/chosen": -134.43426513671875, + "logps/rejected": -161.02243041992188, + "loss": 0.7007, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47460639476776123, + "rewards/margins": 0.7943627834320068, + "rewards/rejected": -1.268969178199768, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.9086971789769403e-07, + "logits/chosen": -2.52060866355896, + "logits/rejected": -2.4852681159973145, + "logps/chosen": -257.6805114746094, + "logps/rejected": -252.75814819335938, + "loss": 0.7122, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0286797285079956, + "rewards/margins": 0.3580697774887085, + "rewards/rejected": -1.386749505996704, + "step": 391 + }, + { + "epoch": 0.05, + "learning_rate": 2.9083460142806973e-07, + "logits/chosen": -2.56117582321167, + "logits/rejected": -2.1422765254974365, + "logps/chosen": -235.985107421875, + "logps/rejected": -303.7123718261719, + "loss": 0.7791, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2552099227905273, + "rewards/margins": 0.4943283498287201, + "rewards/rejected": -1.7495381832122803, + "step": 392 + }, + { + "epoch": 0.05, + "learning_rate": 2.907994849584455e-07, + "logits/chosen": -2.3607940673828125, + "logits/rejected": -2.45479679107666, + "logps/chosen": -254.0944061279297, + "logps/rejected": -218.19578552246094, + "loss": 0.4969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6978374123573303, + "rewards/margins": 0.7463139295578003, + "rewards/rejected": -1.4441514015197754, + "step": 393 + }, + { + "epoch": 0.05, + "learning_rate": 2.9076436848882124e-07, + "logits/chosen": -2.301032543182373, + "logits/rejected": -2.1102499961853027, + "logps/chosen": -120.57257080078125, + "logps/rejected": -331.7785949707031, + "loss": 0.4748, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24335798621177673, + "rewards/margins": 0.8366833925247192, + "rewards/rejected": -1.0800414085388184, + "step": 394 + }, + { + "epoch": 0.05, + "learning_rate": 2.90729252019197e-07, + "logits/chosen": -2.527088165283203, + "logits/rejected": -2.6731882095336914, + "logps/chosen": -308.7522277832031, + "logps/rejected": -204.04254150390625, + "loss": 0.2961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06870412826538086, + "rewards/margins": 1.5690155029296875, + "rewards/rejected": -1.6377196311950684, + "step": 395 + }, + { + "epoch": 0.05, + "learning_rate": 2.9069413554957275e-07, + "logits/chosen": -1.8452880382537842, + "logits/rejected": -2.150559902191162, + "logps/chosen": -324.23004150390625, + "logps/rejected": -177.35736083984375, + "loss": 0.7045, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3545685410499573, + "rewards/margins": 0.37231162190437317, + "rewards/rejected": -0.7268801331520081, + "step": 396 + }, + { + "epoch": 0.05, + "learning_rate": 2.906590190799485e-07, + "logits/chosen": -2.682062864303589, + "logits/rejected": -2.422858715057373, + "logps/chosen": -103.96527099609375, + "logps/rejected": -219.4734344482422, + "loss": 0.5149, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.293123722076416, + "rewards/margins": 0.6293418407440186, + "rewards/rejected": -0.9224655032157898, + "step": 397 + }, + { + "epoch": 0.05, + "learning_rate": 2.906239026103242e-07, + "logits/chosen": -2.528632164001465, + "logits/rejected": -2.6729369163513184, + "logps/chosen": -293.2714538574219, + "logps/rejected": -237.54335021972656, + "loss": 0.9944, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3139704465866089, + "rewards/margins": -0.28848400712013245, + "rewards/rejected": -1.0254863500595093, + "step": 398 + }, + { + "epoch": 0.05, + "learning_rate": 2.905887861407e-07, + "logits/chosen": -2.4771034717559814, + "logits/rejected": -2.3801727294921875, + "logps/chosen": -289.82403564453125, + "logps/rejected": -354.7469177246094, + "loss": 0.3406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29782307147979736, + "rewards/margins": 1.4355717897415161, + "rewards/rejected": -1.7333948612213135, + "step": 399 + }, + { + "epoch": 0.05, + "learning_rate": 2.905536696710757e-07, + "logits/chosen": -2.629490375518799, + "logits/rejected": -2.408463954925537, + "logps/chosen": -264.53656005859375, + "logps/rejected": -306.2720947265625, + "loss": 0.363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6164662837982178, + "rewards/margins": 1.476449728012085, + "rewards/rejected": -2.0929160118103027, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.9051855320145146e-07, + "logits/chosen": -2.3808882236480713, + "logits/rejected": -2.7584078311920166, + "logps/chosen": -253.58831787109375, + "logps/rejected": -187.96896362304688, + "loss": 0.4405, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0579850934445858, + "rewards/margins": 1.0036213397979736, + "rewards/rejected": -0.9456362128257751, + "step": 401 + }, + { + "epoch": 0.05, + "learning_rate": 2.904834367318272e-07, + "logits/chosen": -2.118563413619995, + "logits/rejected": -2.201154947280884, + "logps/chosen": -243.26856994628906, + "logps/rejected": -161.19635009765625, + "loss": 0.7315, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3828990161418915, + "rewards/margins": 0.4871894121170044, + "rewards/rejected": -0.8700883984565735, + "step": 402 + }, + { + "epoch": 0.05, + "learning_rate": 2.9044832026220297e-07, + "logits/chosen": -2.240737199783325, + "logits/rejected": -2.1890201568603516, + "logps/chosen": -266.506103515625, + "logps/rejected": -346.6192932128906, + "loss": 0.4163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45085659623146057, + "rewards/margins": 2.1966700553894043, + "rewards/rejected": -2.647526741027832, + "step": 403 + }, + { + "epoch": 0.05, + "learning_rate": 2.904132037925787e-07, + "logits/chosen": -2.2816288471221924, + "logits/rejected": -2.250783920288086, + "logps/chosen": -273.0588684082031, + "logps/rejected": -264.3376770019531, + "loss": 0.2485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.805108904838562, + "rewards/margins": 1.6039650440216064, + "rewards/rejected": -2.409073829650879, + "step": 404 + }, + { + "epoch": 0.05, + "learning_rate": 2.903780873229545e-07, + "logits/chosen": -2.3899171352386475, + "logits/rejected": -2.448240280151367, + "logps/chosen": -342.86285400390625, + "logps/rejected": -200.74942016601562, + "loss": 0.9483, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9548447728157043, + "rewards/margins": -0.183023601770401, + "rewards/rejected": -0.771821141242981, + "step": 405 + }, + { + "epoch": 0.05, + "learning_rate": 2.903429708533302e-07, + "logits/chosen": -1.783102035522461, + "logits/rejected": -1.898418664932251, + "logps/chosen": -339.82916259765625, + "logps/rejected": -258.9866943359375, + "loss": 0.4261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4542697072029114, + "rewards/margins": 1.1586456298828125, + "rewards/rejected": -1.612915277481079, + "step": 406 + }, + { + "epoch": 0.05, + "learning_rate": 2.9030785438370593e-07, + "logits/chosen": -2.355727434158325, + "logits/rejected": -2.5316340923309326, + "logps/chosen": -337.219970703125, + "logps/rejected": -170.0535888671875, + "loss": 0.6678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7918184995651245, + "rewards/margins": 0.2954825758934021, + "rewards/rejected": -1.0873011350631714, + "step": 407 + }, + { + "epoch": 0.05, + "learning_rate": 2.902727379140817e-07, + "logits/chosen": -2.15877103805542, + "logits/rejected": -2.196415901184082, + "logps/chosen": -352.3223571777344, + "logps/rejected": -338.6763000488281, + "loss": 0.6887, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6354883909225464, + "rewards/margins": 1.0406873226165771, + "rewards/rejected": -2.676175832748413, + "step": 408 + }, + { + "epoch": 0.05, + "learning_rate": 2.9023762144445744e-07, + "logits/chosen": -2.3139023780822754, + "logits/rejected": -2.321138381958008, + "logps/chosen": -189.615478515625, + "logps/rejected": -248.27471923828125, + "loss": 0.4573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43327760696411133, + "rewards/margins": 1.1666333675384521, + "rewards/rejected": -1.5999109745025635, + "step": 409 + }, + { + "epoch": 0.05, + "learning_rate": 2.902025049748332e-07, + "logits/chosen": -2.222543239593506, + "logits/rejected": -2.3987622261047363, + "logps/chosen": -333.17144775390625, + "logps/rejected": -225.73243713378906, + "loss": 0.3854, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7952386140823364, + "rewards/margins": 1.3609604835510254, + "rewards/rejected": -2.1561989784240723, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.901673885052089e-07, + "logits/chosen": -2.7008047103881836, + "logits/rejected": -2.8357295989990234, + "logps/chosen": -113.16555786132812, + "logps/rejected": -150.2096405029297, + "loss": 0.7057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37011486291885376, + "rewards/margins": 0.8326514959335327, + "rewards/rejected": -1.2027662992477417, + "step": 411 + }, + { + "epoch": 0.05, + "learning_rate": 2.901322720355847e-07, + "logits/chosen": -2.285874843597412, + "logits/rejected": -2.181692600250244, + "logps/chosen": -386.724853515625, + "logps/rejected": -523.6873168945312, + "loss": 0.4567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42406952381134033, + "rewards/margins": 1.1853246688842773, + "rewards/rejected": -1.6093944311141968, + "step": 412 + }, + { + "epoch": 0.05, + "learning_rate": 2.9009715556596046e-07, + "logits/chosen": -2.305635929107666, + "logits/rejected": -1.857632040977478, + "logps/chosen": -190.6840057373047, + "logps/rejected": -247.9942626953125, + "loss": 0.2564, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05717356130480766, + "rewards/margins": 1.6159186363220215, + "rewards/rejected": -1.5587451457977295, + "step": 413 + }, + { + "epoch": 0.05, + "learning_rate": 2.9006203909633616e-07, + "logits/chosen": -2.125108003616333, + "logits/rejected": -2.30303955078125, + "logps/chosen": -355.1351318359375, + "logps/rejected": -228.14703369140625, + "loss": 0.5459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44054266810417175, + "rewards/margins": 0.9314007759094238, + "rewards/rejected": -1.371943473815918, + "step": 414 + }, + { + "epoch": 0.05, + "learning_rate": 2.900269226267119e-07, + "logits/chosen": -2.2755231857299805, + "logits/rejected": -2.464488983154297, + "logps/chosen": -140.63880920410156, + "logps/rejected": -233.46133422851562, + "loss": 0.4714, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.020653773099184036, + "rewards/margins": 1.2405142784118652, + "rewards/rejected": -1.2611682415008545, + "step": 415 + }, + { + "epoch": 0.05, + "learning_rate": 2.8999180615708766e-07, + "logits/chosen": -1.8749544620513916, + "logits/rejected": -1.9874260425567627, + "logps/chosen": -365.9632263183594, + "logps/rejected": -238.62303161621094, + "loss": 0.3325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28513503074645996, + "rewards/margins": 1.903374195098877, + "rewards/rejected": -2.188508987426758, + "step": 416 + }, + { + "epoch": 0.05, + "learning_rate": 2.899566896874634e-07, + "logits/chosen": -2.572951316833496, + "logits/rejected": -2.416513681411743, + "logps/chosen": -230.96884155273438, + "logps/rejected": -250.75534057617188, + "loss": 1.1844, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6892746686935425, + "rewards/margins": -0.1744169443845749, + "rewards/rejected": -1.5148577690124512, + "step": 417 + }, + { + "epoch": 0.05, + "learning_rate": 2.8992157321783917e-07, + "logits/chosen": -2.0516891479492188, + "logits/rejected": -2.4129509925842285, + "logps/chosen": -413.4346618652344, + "logps/rejected": -327.802734375, + "loss": 0.9449, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.002694845199585, + "rewards/margins": -0.020894289016723633, + "rewards/rejected": -0.9818006157875061, + "step": 418 + }, + { + "epoch": 0.05, + "learning_rate": 2.8988645674821487e-07, + "logits/chosen": -2.9365317821502686, + "logits/rejected": -2.790660858154297, + "logps/chosen": -181.66934204101562, + "logps/rejected": -221.6717071533203, + "loss": 0.4549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43192338943481445, + "rewards/margins": 1.3197505474090576, + "rewards/rejected": -1.751673936843872, + "step": 419 + }, + { + "epoch": 0.05, + "learning_rate": 2.8985134027859063e-07, + "logits/chosen": -1.9277281761169434, + "logits/rejected": -1.8603521585464478, + "logps/chosen": -191.6887664794922, + "logps/rejected": -227.04916381835938, + "loss": 0.6365, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4145964980125427, + "rewards/margins": 0.25950008630752563, + "rewards/rejected": -0.6740965843200684, + "step": 420 + }, + { + "epoch": 0.05, + "learning_rate": 2.898162238089664e-07, + "logits/chosen": -2.1123576164245605, + "logits/rejected": -2.3615825176239014, + "logps/chosen": -368.1420593261719, + "logps/rejected": -239.6573944091797, + "loss": 0.6403, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.776563286781311, + "rewards/margins": 0.37927529215812683, + "rewards/rejected": -2.15583872795105, + "step": 421 + }, + { + "epoch": 0.05, + "learning_rate": 2.8978110733934213e-07, + "logits/chosen": -2.5550894737243652, + "logits/rejected": -2.4907073974609375, + "logps/chosen": -231.90731811523438, + "logps/rejected": -215.66470336914062, + "loss": 0.7139, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2556419372558594, + "rewards/margins": 0.39590582251548767, + "rewards/rejected": -1.6515477895736694, + "step": 422 + }, + { + "epoch": 0.05, + "learning_rate": 2.897459908697179e-07, + "logits/chosen": -2.9239327907562256, + "logits/rejected": -2.8341870307922363, + "logps/chosen": -206.65020751953125, + "logps/rejected": -213.07186889648438, + "loss": 0.4548, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9219661951065063, + "rewards/margins": 0.7071079611778259, + "rewards/rejected": -1.6290740966796875, + "step": 423 + }, + { + "epoch": 0.05, + "learning_rate": 2.8971087440009364e-07, + "logits/chosen": -2.9000797271728516, + "logits/rejected": -2.868884563446045, + "logps/chosen": -183.89547729492188, + "logps/rejected": -332.95892333984375, + "loss": 0.4759, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7969914078712463, + "rewards/margins": 1.1864778995513916, + "rewards/rejected": -1.9834693670272827, + "step": 424 + }, + { + "epoch": 0.05, + "learning_rate": 2.896757579304694e-07, + "logits/chosen": -1.6917738914489746, + "logits/rejected": -1.9581644535064697, + "logps/chosen": -343.1259765625, + "logps/rejected": -325.0622253417969, + "loss": 0.6417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6132412552833557, + "rewards/margins": 0.8575013875961304, + "rewards/rejected": -1.4707427024841309, + "step": 425 + }, + { + "epoch": 0.05, + "learning_rate": 2.8964064146084515e-07, + "logits/chosen": -2.0918123722076416, + "logits/rejected": -2.2852721214294434, + "logps/chosen": -226.48843383789062, + "logps/rejected": -192.74937438964844, + "loss": 0.297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20671652257442474, + "rewards/margins": 1.5406757593154907, + "rewards/rejected": -1.747392177581787, + "step": 426 + }, + { + "epoch": 0.05, + "learning_rate": 2.8960552499122085e-07, + "logits/chosen": -2.0898277759552, + "logits/rejected": -1.8871090412139893, + "logps/chosen": -285.9032897949219, + "logps/rejected": -208.7993927001953, + "loss": 0.6992, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6262397170066833, + "rewards/margins": 0.12272818386554718, + "rewards/rejected": -0.7489678859710693, + "step": 427 + }, + { + "epoch": 0.05, + "learning_rate": 2.895704085215966e-07, + "logits/chosen": -2.8670899868011475, + "logits/rejected": -2.5570120811462402, + "logps/chosen": -279.3356018066406, + "logps/rejected": -312.0433044433594, + "loss": 0.5918, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6099926829338074, + "rewards/margins": 1.0474438667297363, + "rewards/rejected": -1.6574366092681885, + "step": 428 + }, + { + "epoch": 0.05, + "learning_rate": 2.8953529205197236e-07, + "logits/chosen": -1.9342706203460693, + "logits/rejected": -2.095008373260498, + "logps/chosen": -634.0540161132812, + "logps/rejected": -376.4906311035156, + "loss": 1.0282, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8742374777793884, + "rewards/margins": -0.033423617482185364, + "rewards/rejected": -0.8408138751983643, + "step": 429 + }, + { + "epoch": 0.05, + "learning_rate": 2.895001755823481e-07, + "logits/chosen": -2.0682079792022705, + "logits/rejected": -1.9955337047576904, + "logps/chosen": -219.9072723388672, + "logps/rejected": -242.13168334960938, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8621062636375427, + "rewards/margins": 0.46161508560180664, + "rewards/rejected": -1.3237212896347046, + "step": 430 + }, + { + "epoch": 0.05, + "learning_rate": 2.8946505911272387e-07, + "logits/chosen": -2.4209694862365723, + "logits/rejected": -2.5577762126922607, + "logps/chosen": -491.298583984375, + "logps/rejected": -436.0865783691406, + "loss": 0.222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6950672268867493, + "rewards/margins": 2.0131146907806396, + "rewards/rejected": -2.708181858062744, + "step": 431 + }, + { + "epoch": 0.05, + "learning_rate": 2.8942994264309957e-07, + "logits/chosen": -2.955657958984375, + "logits/rejected": -2.923943281173706, + "logps/chosen": -174.99566650390625, + "logps/rejected": -241.78469848632812, + "loss": 0.3299, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1844731867313385, + "rewards/margins": 2.0665364265441895, + "rewards/rejected": -2.251009941101074, + "step": 432 + }, + { + "epoch": 0.05, + "learning_rate": 2.893948261734754e-07, + "logits/chosen": -2.2856738567352295, + "logits/rejected": -2.224027156829834, + "logps/chosen": -628.26171875, + "logps/rejected": -647.68994140625, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3986232876777649, + "rewards/margins": 1.1419258117675781, + "rewards/rejected": -1.5405490398406982, + "step": 433 + }, + { + "epoch": 0.05, + "learning_rate": 2.8935970970385113e-07, + "logits/chosen": -2.6743392944335938, + "logits/rejected": -2.7891430854797363, + "logps/chosen": -360.0565490722656, + "logps/rejected": -176.5977325439453, + "loss": 0.5266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7710871696472168, + "rewards/margins": 0.818378210067749, + "rewards/rejected": -1.5894653797149658, + "step": 434 + }, + { + "epoch": 0.05, + "learning_rate": 2.8932459323422683e-07, + "logits/chosen": -1.6452635526657104, + "logits/rejected": -2.0099403858184814, + "logps/chosen": -321.22772216796875, + "logps/rejected": -204.7057647705078, + "loss": 0.9363, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3963351547718048, + "rewards/margins": -0.11807290464639664, + "rewards/rejected": -0.27826225757598877, + "step": 435 + }, + { + "epoch": 0.05, + "learning_rate": 2.892894767646026e-07, + "logits/chosen": -2.0700409412384033, + "logits/rejected": -2.3309831619262695, + "logps/chosen": -328.427734375, + "logps/rejected": -222.39723205566406, + "loss": 0.7864, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39597710967063904, + "rewards/margins": 0.03586182743310928, + "rewards/rejected": -0.4318389594554901, + "step": 436 + }, + { + "epoch": 0.05, + "learning_rate": 2.8925436029497834e-07, + "logits/chosen": -2.2002313137054443, + "logits/rejected": -2.478156089782715, + "logps/chosen": -317.3978271484375, + "logps/rejected": -228.6250762939453, + "loss": 0.3389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08136959373950958, + "rewards/margins": 1.018842101097107, + "rewards/rejected": -1.1002117395401, + "step": 437 + }, + { + "epoch": 0.05, + "learning_rate": 2.892192438253541e-07, + "logits/chosen": -2.1456055641174316, + "logits/rejected": -2.5620529651641846, + "logps/chosen": -342.3087158203125, + "logps/rejected": -338.292724609375, + "loss": 0.5488, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7393233180046082, + "rewards/margins": 1.0836955308914185, + "rewards/rejected": -1.8230189085006714, + "step": 438 + }, + { + "epoch": 0.05, + "learning_rate": 2.8918412735572984e-07, + "logits/chosen": -2.8347835540771484, + "logits/rejected": -2.597368001937866, + "logps/chosen": -278.8978576660156, + "logps/rejected": -154.0840301513672, + "loss": 0.3275, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22441405057907104, + "rewards/margins": 1.2584842443466187, + "rewards/rejected": -1.4828983545303345, + "step": 439 + }, + { + "epoch": 0.05, + "learning_rate": 2.8914901088610555e-07, + "logits/chosen": -1.9192113876342773, + "logits/rejected": -1.7875454425811768, + "logps/chosen": -286.71551513671875, + "logps/rejected": -387.8851318359375, + "loss": 0.7784, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44214102625846863, + "rewards/margins": 0.4669550359249115, + "rewards/rejected": -0.9090961217880249, + "step": 440 + }, + { + "epoch": 0.05, + "learning_rate": 2.891138944164813e-07, + "logits/chosen": -1.980334758758545, + "logits/rejected": -2.076547145843506, + "logps/chosen": -425.4896240234375, + "logps/rejected": -406.1822509765625, + "loss": 0.3678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1496002972126007, + "rewards/margins": 1.5501468181610107, + "rewards/rejected": -1.699747085571289, + "step": 441 + }, + { + "epoch": 0.05, + "learning_rate": 2.8907877794685705e-07, + "logits/chosen": -2.075300455093384, + "logits/rejected": -2.214099884033203, + "logps/chosen": -269.55364990234375, + "logps/rejected": -193.42410278320312, + "loss": 0.6324, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5896792411804199, + "rewards/margins": 0.756241500377655, + "rewards/rejected": -1.3459206819534302, + "step": 442 + }, + { + "epoch": 0.05, + "learning_rate": 2.890436614772328e-07, + "logits/chosen": -2.232599973678589, + "logits/rejected": -2.341123580932617, + "logps/chosen": -199.04327392578125, + "logps/rejected": -195.17478942871094, + "loss": 0.4637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6015334129333496, + "rewards/margins": 0.9234079718589783, + "rewards/rejected": -1.5249414443969727, + "step": 443 + }, + { + "epoch": 0.05, + "learning_rate": 2.8900854500760856e-07, + "logits/chosen": -1.7770965099334717, + "logits/rejected": -1.977562427520752, + "logps/chosen": -354.7071228027344, + "logps/rejected": -267.03521728515625, + "loss": 0.5989, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3870108127593994, + "rewards/margins": 0.4828950762748718, + "rewards/rejected": -1.869905948638916, + "step": 444 + }, + { + "epoch": 0.05, + "learning_rate": 2.8897342853798426e-07, + "logits/chosen": -1.9936851263046265, + "logits/rejected": -2.14700984954834, + "logps/chosen": -274.86920166015625, + "logps/rejected": -255.270263671875, + "loss": 0.3992, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48464804887771606, + "rewards/margins": 1.135652780532837, + "rewards/rejected": -1.6203010082244873, + "step": 445 + }, + { + "epoch": 0.05, + "learning_rate": 2.8893831206836007e-07, + "logits/chosen": -1.9592480659484863, + "logits/rejected": -1.5964264869689941, + "logps/chosen": -214.90811157226562, + "logps/rejected": -269.9524841308594, + "loss": 0.6143, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39064252376556396, + "rewards/margins": 0.31371065974235535, + "rewards/rejected": -0.7043532133102417, + "step": 446 + }, + { + "epoch": 0.05, + "learning_rate": 2.889031955987358e-07, + "logits/chosen": -2.598388195037842, + "logits/rejected": -2.4737510681152344, + "logps/chosen": -204.11109924316406, + "logps/rejected": -180.9921875, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2594675123691559, + "rewards/margins": 1.2581350803375244, + "rewards/rejected": -1.5176026821136475, + "step": 447 + }, + { + "epoch": 0.05, + "learning_rate": 2.888680791291115e-07, + "logits/chosen": -2.7457656860351562, + "logits/rejected": -2.6269965171813965, + "logps/chosen": -241.15277099609375, + "logps/rejected": -180.87200927734375, + "loss": 0.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6603788733482361, + "rewards/margins": 0.763934850692749, + "rewards/rejected": -1.4243137836456299, + "step": 448 + }, + { + "epoch": 0.05, + "learning_rate": 2.888329626594873e-07, + "logits/chosen": -1.884168028831482, + "logits/rejected": -2.029437780380249, + "logps/chosen": -433.01513671875, + "logps/rejected": -329.3001403808594, + "loss": 0.6769, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0112016201019287, + "rewards/margins": 0.27361372113227844, + "rewards/rejected": -1.2848154306411743, + "step": 449 + }, + { + "epoch": 0.05, + "learning_rate": 2.8879784618986303e-07, + "logits/chosen": -2.462709426879883, + "logits/rejected": -2.597428560256958, + "logps/chosen": -126.3593521118164, + "logps/rejected": -120.21089172363281, + "loss": 0.6641, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3866897821426392, + "rewards/margins": 1.1510908603668213, + "rewards/rejected": -2.53778076171875, + "step": 450 + }, + { + "epoch": 0.05, + "learning_rate": 2.887627297202388e-07, + "logits/chosen": -2.4395875930786133, + "logits/rejected": -2.4573137760162354, + "logps/chosen": -259.8878173828125, + "logps/rejected": -220.12220764160156, + "loss": 1.4158, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0816454887390137, + "rewards/margins": -0.38571155071258545, + "rewards/rejected": -1.6959339380264282, + "step": 451 + }, + { + "epoch": 0.05, + "learning_rate": 2.8872761325061454e-07, + "logits/chosen": -2.5090928077697754, + "logits/rejected": -2.5043561458587646, + "logps/chosen": -202.8241729736328, + "logps/rejected": -343.06317138671875, + "loss": 0.1696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24117296934127808, + "rewards/margins": 2.388867139816284, + "rewards/rejected": -2.630039930343628, + "step": 452 + }, + { + "epoch": 0.05, + "learning_rate": 2.8869249678099024e-07, + "logits/chosen": -2.333909749984741, + "logits/rejected": -2.171187400817871, + "logps/chosen": -290.83905029296875, + "logps/rejected": -377.9497985839844, + "loss": 0.6404, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8433292508125305, + "rewards/margins": 0.9381197690963745, + "rewards/rejected": -1.7814488410949707, + "step": 453 + }, + { + "epoch": 0.05, + "learning_rate": 2.88657380311366e-07, + "logits/chosen": -2.3312485218048096, + "logits/rejected": -2.6153342723846436, + "logps/chosen": -299.4193115234375, + "logps/rejected": -321.64178466796875, + "loss": 0.4379, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026061728596687317, + "rewards/margins": 1.2388978004455566, + "rewards/rejected": -1.2128361463546753, + "step": 454 + }, + { + "epoch": 0.05, + "learning_rate": 2.886222638417418e-07, + "logits/chosen": -2.7895545959472656, + "logits/rejected": -2.901029586791992, + "logps/chosen": -316.0106506347656, + "logps/rejected": -232.97802734375, + "loss": 0.7472, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.865118145942688, + "rewards/margins": 0.2172519713640213, + "rewards/rejected": -1.0823700428009033, + "step": 455 + }, + { + "epoch": 0.05, + "learning_rate": 2.885871473721175e-07, + "logits/chosen": -1.8949134349822998, + "logits/rejected": -2.166701555252075, + "logps/chosen": -332.00762939453125, + "logps/rejected": -210.58294677734375, + "loss": 0.6184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5222382545471191, + "rewards/margins": 0.8983215093612671, + "rewards/rejected": -1.4205597639083862, + "step": 456 + }, + { + "epoch": 0.05, + "learning_rate": 2.8855203090249325e-07, + "logits/chosen": -2.6626710891723633, + "logits/rejected": -2.630781412124634, + "logps/chosen": -240.9728546142578, + "logps/rejected": -174.75482177734375, + "loss": 1.0181, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9888597130775452, + "rewards/margins": 0.017247207462787628, + "rewards/rejected": -1.0061068534851074, + "step": 457 + }, + { + "epoch": 0.05, + "learning_rate": 2.88516914432869e-07, + "logits/chosen": -2.3116161823272705, + "logits/rejected": -2.006218194961548, + "logps/chosen": -174.68646240234375, + "logps/rejected": -231.97708129882812, + "loss": 0.4119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8277329802513123, + "rewards/margins": 0.9262583255767822, + "rewards/rejected": -1.7539912462234497, + "step": 458 + }, + { + "epoch": 0.05, + "learning_rate": 2.8848179796324476e-07, + "logits/chosen": -2.021129608154297, + "logits/rejected": -2.016531467437744, + "logps/chosen": -302.85626220703125, + "logps/rejected": -328.9703063964844, + "loss": 0.679, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6189159750938416, + "rewards/margins": 0.36308372020721436, + "rewards/rejected": -0.9819996953010559, + "step": 459 + }, + { + "epoch": 0.05, + "learning_rate": 2.884466814936205e-07, + "logits/chosen": -2.373223304748535, + "logits/rejected": -2.4502949714660645, + "logps/chosen": -310.6684265136719, + "logps/rejected": -274.0010070800781, + "loss": 1.7202, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.318345785140991, + "rewards/margins": -0.7566402554512024, + "rewards/rejected": -1.561705231666565, + "step": 460 + }, + { + "epoch": 0.05, + "learning_rate": 2.884115650239962e-07, + "logits/chosen": -2.2397499084472656, + "logits/rejected": -2.1499722003936768, + "logps/chosen": -230.77609252929688, + "logps/rejected": -331.5451354980469, + "loss": 0.3434, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11431356519460678, + "rewards/margins": 1.3693591356277466, + "rewards/rejected": -1.4836727380752563, + "step": 461 + }, + { + "epoch": 0.05, + "learning_rate": 2.8837644855437197e-07, + "logits/chosen": -2.1171231269836426, + "logits/rejected": -2.023129940032959, + "logps/chosen": -224.6741943359375, + "logps/rejected": -220.53021240234375, + "loss": 0.2734, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05649733543395996, + "rewards/margins": 1.4621689319610596, + "rewards/rejected": -1.4056715965270996, + "step": 462 + }, + { + "epoch": 0.05, + "learning_rate": 2.883413320847477e-07, + "logits/chosen": -2.816389560699463, + "logits/rejected": -2.6703922748565674, + "logps/chosen": -249.25286865234375, + "logps/rejected": -298.4347839355469, + "loss": 0.3448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0013942662626504898, + "rewards/margins": 1.3578014373779297, + "rewards/rejected": -1.359195590019226, + "step": 463 + }, + { + "epoch": 0.05, + "learning_rate": 2.883062156151235e-07, + "logits/chosen": -2.6652283668518066, + "logits/rejected": -2.8439559936523438, + "logps/chosen": -325.799560546875, + "logps/rejected": -214.22471618652344, + "loss": 0.398, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29761531949043274, + "rewards/margins": 1.691826581954956, + "rewards/rejected": -1.989441990852356, + "step": 464 + }, + { + "epoch": 0.05, + "learning_rate": 2.8827109914549923e-07, + "logits/chosen": -1.9270765781402588, + "logits/rejected": -2.0985045433044434, + "logps/chosen": -246.80665588378906, + "logps/rejected": -308.87811279296875, + "loss": 0.381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5663408041000366, + "rewards/margins": 1.2942873239517212, + "rewards/rejected": -1.8606280088424683, + "step": 465 + }, + { + "epoch": 0.05, + "learning_rate": 2.8823598267587493e-07, + "logits/chosen": -2.8225066661834717, + "logits/rejected": -2.948239326477051, + "logps/chosen": -321.5458679199219, + "logps/rejected": -239.54803466796875, + "loss": 0.6219, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3875611126422882, + "rewards/margins": 0.4388568699359894, + "rewards/rejected": -0.8264179825782776, + "step": 466 + }, + { + "epoch": 0.05, + "learning_rate": 2.8820086620625074e-07, + "logits/chosen": -2.2527945041656494, + "logits/rejected": -2.33430814743042, + "logps/chosen": -407.0142822265625, + "logps/rejected": -263.49029541015625, + "loss": 0.3703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7004635334014893, + "rewards/margins": 1.0589730739593506, + "rewards/rejected": -1.7594366073608398, + "step": 467 + }, + { + "epoch": 0.05, + "learning_rate": 2.881657497366265e-07, + "logits/chosen": -2.063936233520508, + "logits/rejected": -2.3350436687469482, + "logps/chosen": -223.6260223388672, + "logps/rejected": -128.9600372314453, + "loss": 0.8438, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8109927177429199, + "rewards/margins": -0.15000019967556, + "rewards/rejected": -0.6609926223754883, + "step": 468 + }, + { + "epoch": 0.05, + "learning_rate": 2.881306332670022e-07, + "logits/chosen": -2.465768575668335, + "logits/rejected": -2.5165598392486572, + "logps/chosen": -254.53631591796875, + "logps/rejected": -263.052978515625, + "loss": 0.7728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9848107099533081, + "rewards/margins": 0.3145175874233246, + "rewards/rejected": -1.299328327178955, + "step": 469 + }, + { + "epoch": 0.05, + "learning_rate": 2.8809551679737795e-07, + "logits/chosen": -2.8248953819274902, + "logits/rejected": -2.6104865074157715, + "logps/chosen": -161.4761199951172, + "logps/rejected": -122.3411865234375, + "loss": 0.6195, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8118212819099426, + "rewards/margins": 0.6716932058334351, + "rewards/rejected": -1.4835145473480225, + "step": 470 + }, + { + "epoch": 0.05, + "learning_rate": 2.880604003277537e-07, + "logits/chosen": -2.677438259124756, + "logits/rejected": -2.485942840576172, + "logps/chosen": -244.46188354492188, + "logps/rejected": -145.74069213867188, + "loss": 0.7452, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9668164253234863, + "rewards/margins": 0.4592428505420685, + "rewards/rejected": -1.426059365272522, + "step": 471 + }, + { + "epoch": 0.05, + "learning_rate": 2.8802528385812946e-07, + "logits/chosen": -2.166897773742676, + "logits/rejected": -2.581681251525879, + "logps/chosen": -458.53924560546875, + "logps/rejected": -344.70672607421875, + "loss": 0.5757, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5973078608512878, + "rewards/margins": 0.5970226526260376, + "rewards/rejected": -1.1943304538726807, + "step": 472 + }, + { + "epoch": 0.05, + "learning_rate": 2.879901673885052e-07, + "logits/chosen": -2.1751232147216797, + "logits/rejected": -2.0866453647613525, + "logps/chosen": -263.396240234375, + "logps/rejected": -354.48565673828125, + "loss": 0.5073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9444391131401062, + "rewards/margins": 1.2221899032592773, + "rewards/rejected": -2.166628837585449, + "step": 473 + }, + { + "epoch": 0.05, + "learning_rate": 2.879550509188809e-07, + "logits/chosen": -1.8724539279937744, + "logits/rejected": -1.7525582313537598, + "logps/chosen": -177.92723083496094, + "logps/rejected": -237.5724334716797, + "loss": 0.5779, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.32807657122612, + "rewards/margins": 0.5439035892486572, + "rewards/rejected": -0.8719801306724548, + "step": 474 + }, + { + "epoch": 0.05, + "learning_rate": 2.8791993444925667e-07, + "logits/chosen": -2.2764272689819336, + "logits/rejected": -2.1955795288085938, + "logps/chosen": -287.4642333984375, + "logps/rejected": -263.2330322265625, + "loss": 0.7934, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5761173963546753, + "rewards/margins": 0.5174239873886108, + "rewards/rejected": -2.0935416221618652, + "step": 475 + }, + { + "epoch": 0.05, + "learning_rate": 2.878848179796324e-07, + "logits/chosen": -1.7362464666366577, + "logits/rejected": -2.136260509490967, + "logps/chosen": -288.6219787597656, + "logps/rejected": -213.75225830078125, + "loss": 0.6929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4926273822784424, + "rewards/margins": 0.3424549698829651, + "rewards/rejected": -0.8350824117660522, + "step": 476 + }, + { + "epoch": 0.05, + "learning_rate": 2.8784970151000817e-07, + "logits/chosen": -2.648416757583618, + "logits/rejected": -2.5122532844543457, + "logps/chosen": -202.4828338623047, + "logps/rejected": -248.96607971191406, + "loss": 0.5995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9207600355148315, + "rewards/margins": 0.8447424173355103, + "rewards/rejected": -1.7655025720596313, + "step": 477 + }, + { + "epoch": 0.06, + "learning_rate": 2.8781458504038393e-07, + "logits/chosen": -2.4319777488708496, + "logits/rejected": -2.5603342056274414, + "logps/chosen": -287.07000732421875, + "logps/rejected": -198.55580139160156, + "loss": 0.4735, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4855503737926483, + "rewards/margins": 1.4225584268569946, + "rewards/rejected": -1.9081087112426758, + "step": 478 + }, + { + "epoch": 0.06, + "learning_rate": 2.877794685707597e-07, + "logits/chosen": -2.448014974594116, + "logits/rejected": -2.2008793354034424, + "logps/chosen": -150.7436981201172, + "logps/rejected": -270.059326171875, + "loss": 0.3182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06578931957483292, + "rewards/margins": 1.7694683074951172, + "rewards/rejected": -1.835257649421692, + "step": 479 + }, + { + "epoch": 0.06, + "learning_rate": 2.8774435210113543e-07, + "logits/chosen": -2.4526920318603516, + "logits/rejected": -2.483173370361328, + "logps/chosen": -150.56134033203125, + "logps/rejected": -226.56539916992188, + "loss": 0.4192, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9857232570648193, + "rewards/margins": 1.08680260181427, + "rewards/rejected": -2.0725257396698, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 2.877092356315112e-07, + "logits/chosen": -2.369472026824951, + "logits/rejected": -2.2916154861450195, + "logps/chosen": -146.00363159179688, + "logps/rejected": -157.33538818359375, + "loss": 1.1377, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5152995586395264, + "rewards/margins": -0.2691340446472168, + "rewards/rejected": -1.2461655139923096, + "step": 481 + }, + { + "epoch": 0.06, + "learning_rate": 2.876741191618869e-07, + "logits/chosen": -1.5621750354766846, + "logits/rejected": -2.0374605655670166, + "logps/chosen": -308.89324951171875, + "logps/rejected": -294.60247802734375, + "loss": 1.0958, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1331970691680908, + "rewards/margins": 0.0841522365808487, + "rewards/rejected": -1.2173492908477783, + "step": 482 + }, + { + "epoch": 0.06, + "learning_rate": 2.8763900269226264e-07, + "logits/chosen": -2.783170223236084, + "logits/rejected": -2.8685195446014404, + "logps/chosen": -302.78936767578125, + "logps/rejected": -323.6280822753906, + "loss": 0.7508, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7224998474121094, + "rewards/margins": 0.569266676902771, + "rewards/rejected": -1.2917664051055908, + "step": 483 + }, + { + "epoch": 0.06, + "learning_rate": 2.876038862226384e-07, + "logits/chosen": -2.581831455230713, + "logits/rejected": -2.7536449432373047, + "logps/chosen": -422.5903015136719, + "logps/rejected": -253.06614685058594, + "loss": 0.3724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30029651522636414, + "rewards/margins": 1.577310562133789, + "rewards/rejected": -1.8776071071624756, + "step": 484 + }, + { + "epoch": 0.06, + "learning_rate": 2.8756876975301415e-07, + "logits/chosen": -2.2920758724212646, + "logits/rejected": -2.6273365020751953, + "logps/chosen": -307.40472412109375, + "logps/rejected": -248.80511474609375, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3173650801181793, + "rewards/margins": 1.0282539129257202, + "rewards/rejected": -1.3456189632415771, + "step": 485 + }, + { + "epoch": 0.06, + "learning_rate": 2.875336532833899e-07, + "logits/chosen": -2.3949155807495117, + "logits/rejected": -2.40276837348938, + "logps/chosen": -254.95411682128906, + "logps/rejected": -281.72039794921875, + "loss": 0.4614, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4673457145690918, + "rewards/margins": 0.7053340077400208, + "rewards/rejected": -1.1726797819137573, + "step": 486 + }, + { + "epoch": 0.06, + "learning_rate": 2.8749853681376566e-07, + "logits/chosen": -2.0277607440948486, + "logits/rejected": -2.1013009548187256, + "logps/chosen": -262.6678466796875, + "logps/rejected": -236.6695098876953, + "loss": 0.6451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8843861818313599, + "rewards/margins": 0.767354428768158, + "rewards/rejected": -1.651740550994873, + "step": 487 + }, + { + "epoch": 0.06, + "learning_rate": 2.8746342034414136e-07, + "logits/chosen": -2.5579307079315186, + "logits/rejected": -2.766585111618042, + "logps/chosen": -267.28509521484375, + "logps/rejected": -449.15411376953125, + "loss": 0.3453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5423908829689026, + "rewards/margins": 2.0920729637145996, + "rewards/rejected": -2.6344637870788574, + "step": 488 + }, + { + "epoch": 0.06, + "learning_rate": 2.8742830387451717e-07, + "logits/chosen": -2.1274638175964355, + "logits/rejected": -2.6358425617218018, + "logps/chosen": -263.6199035644531, + "logps/rejected": -214.1932373046875, + "loss": 0.6615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8314990997314453, + "rewards/margins": 0.6638544797897339, + "rewards/rejected": -1.4953536987304688, + "step": 489 + }, + { + "epoch": 0.06, + "learning_rate": 2.8739318740489287e-07, + "logits/chosen": -2.649379253387451, + "logits/rejected": -2.6136116981506348, + "logps/chosen": -170.674072265625, + "logps/rejected": -214.75099182128906, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6439747214317322, + "rewards/margins": 0.7678163051605225, + "rewards/rejected": -1.4117910861968994, + "step": 490 + }, + { + "epoch": 0.06, + "learning_rate": 2.873580709352686e-07, + "logits/chosen": -2.4170830249786377, + "logits/rejected": -2.405801773071289, + "logps/chosen": -261.6668701171875, + "logps/rejected": -174.30252075195312, + "loss": 0.5092, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16032925248146057, + "rewards/margins": 0.6775404214859009, + "rewards/rejected": -0.5172111988067627, + "step": 491 + }, + { + "epoch": 0.06, + "learning_rate": 2.873229544656444e-07, + "logits/chosen": -2.6369404792785645, + "logits/rejected": -2.651421546936035, + "logps/chosen": -148.99974060058594, + "logps/rejected": -225.1609344482422, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5193325281143188, + "rewards/margins": 0.7755799293518066, + "rewards/rejected": -1.2949124574661255, + "step": 492 + }, + { + "epoch": 0.06, + "learning_rate": 2.8728783799602013e-07, + "logits/chosen": -2.9427971839904785, + "logits/rejected": -2.907871723175049, + "logps/chosen": -221.27252197265625, + "logps/rejected": -272.5975341796875, + "loss": 0.6207, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2698073089122772, + "rewards/margins": 0.34591224789619446, + "rewards/rejected": -0.6157194972038269, + "step": 493 + }, + { + "epoch": 0.06, + "learning_rate": 2.872527215263959e-07, + "logits/chosen": -2.0766761302948, + "logits/rejected": -2.494436740875244, + "logps/chosen": -357.1811828613281, + "logps/rejected": -197.42604064941406, + "loss": 0.7649, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5890502333641052, + "rewards/margins": 0.2436237931251526, + "rewards/rejected": -0.8326740264892578, + "step": 494 + }, + { + "epoch": 0.06, + "learning_rate": 2.8721760505677164e-07, + "logits/chosen": -2.163092851638794, + "logits/rejected": -2.2364890575408936, + "logps/chosen": -182.08612060546875, + "logps/rejected": -238.56887817382812, + "loss": 0.462, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4174780249595642, + "rewards/margins": 1.0175573825836182, + "rewards/rejected": -1.4350353479385376, + "step": 495 + }, + { + "epoch": 0.06, + "learning_rate": 2.8718248858714734e-07, + "logits/chosen": -1.6942522525787354, + "logits/rejected": -2.1496667861938477, + "logps/chosen": -437.9544372558594, + "logps/rejected": -236.397705078125, + "loss": 0.4298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12230338156223297, + "rewards/margins": 1.1169443130493164, + "rewards/rejected": -1.2392476797103882, + "step": 496 + }, + { + "epoch": 0.06, + "learning_rate": 2.871473721175231e-07, + "logits/chosen": -2.705634117126465, + "logits/rejected": -2.6221463680267334, + "logps/chosen": -230.30044555664062, + "logps/rejected": -162.52542114257812, + "loss": 0.3641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14102941751480103, + "rewards/margins": 1.0244121551513672, + "rewards/rejected": -1.165441632270813, + "step": 497 + }, + { + "epoch": 0.06, + "learning_rate": 2.8711225564789885e-07, + "logits/chosen": -2.347660779953003, + "logits/rejected": -2.2216720581054688, + "logps/chosen": -285.89605712890625, + "logps/rejected": -302.37213134765625, + "loss": 0.6211, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5712305903434753, + "rewards/margins": 0.8850297927856445, + "rewards/rejected": -1.4562602043151855, + "step": 498 + }, + { + "epoch": 0.06, + "learning_rate": 2.870771391782746e-07, + "logits/chosen": -2.7530012130737305, + "logits/rejected": -2.467622756958008, + "logps/chosen": -184.01141357421875, + "logps/rejected": -213.3414306640625, + "loss": 0.4027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5763751268386841, + "rewards/margins": 1.1180371046066284, + "rewards/rejected": -1.6944122314453125, + "step": 499 + }, + { + "epoch": 0.06, + "learning_rate": 2.8704202270865035e-07, + "logits/chosen": -2.3003745079040527, + "logits/rejected": -2.513579845428467, + "logps/chosen": -434.686279296875, + "logps/rejected": -296.787109375, + "loss": 0.4434, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16831588745117188, + "rewards/margins": 1.1480042934417725, + "rewards/rejected": -1.3163201808929443, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 2.870069062390261e-07, + "logits/chosen": -2.284109592437744, + "logits/rejected": -2.3900132179260254, + "logps/chosen": -436.60430908203125, + "logps/rejected": -209.9559783935547, + "loss": 0.5821, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25574031472206116, + "rewards/margins": 0.44959646463394165, + "rewards/rejected": -0.7053368091583252, + "step": 501 + }, + { + "epoch": 0.06, + "learning_rate": 2.8697178976940186e-07, + "logits/chosen": -2.0731489658355713, + "logits/rejected": -2.079669237136841, + "logps/chosen": -313.22344970703125, + "logps/rejected": -546.76123046875, + "loss": 0.6076, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39511266350746155, + "rewards/margins": 0.5448375940322876, + "rewards/rejected": -0.9399503469467163, + "step": 502 + }, + { + "epoch": 0.06, + "learning_rate": 2.869366732997776e-07, + "logits/chosen": -2.0423460006713867, + "logits/rejected": -2.1361429691314697, + "logps/chosen": -537.3745727539062, + "logps/rejected": -417.0511474609375, + "loss": 0.3571, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05032350495457649, + "rewards/margins": 1.4592599868774414, + "rewards/rejected": -1.4089365005493164, + "step": 503 + }, + { + "epoch": 0.06, + "learning_rate": 2.869015568301533e-07, + "logits/chosen": -2.6322035789489746, + "logits/rejected": -2.737255096435547, + "logps/chosen": -229.77645874023438, + "logps/rejected": -203.5450897216797, + "loss": 0.6131, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7738860845565796, + "rewards/margins": 0.9405642747879028, + "rewards/rejected": -1.714450478553772, + "step": 504 + }, + { + "epoch": 0.06, + "learning_rate": 2.8686644036052907e-07, + "logits/chosen": -2.521639347076416, + "logits/rejected": -2.3514223098754883, + "logps/chosen": -170.552490234375, + "logps/rejected": -189.92916870117188, + "loss": 0.4389, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07575119286775589, + "rewards/margins": 1.3847997188568115, + "rewards/rejected": -1.4605509042739868, + "step": 505 + }, + { + "epoch": 0.06, + "learning_rate": 2.868313238909048e-07, + "logits/chosen": -1.9397516250610352, + "logits/rejected": -2.0363779067993164, + "logps/chosen": -399.8056335449219, + "logps/rejected": -323.70672607421875, + "loss": 0.3741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2875491976737976, + "rewards/margins": 1.0757614374160767, + "rewards/rejected": -1.363310694694519, + "step": 506 + }, + { + "epoch": 0.06, + "learning_rate": 2.867962074212806e-07, + "logits/chosen": -2.5614371299743652, + "logits/rejected": -2.529707670211792, + "logps/chosen": -92.24776458740234, + "logps/rejected": -110.11508178710938, + "loss": 0.4891, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3042468726634979, + "rewards/margins": 0.7585375905036926, + "rewards/rejected": -1.0627844333648682, + "step": 507 + }, + { + "epoch": 0.06, + "learning_rate": 2.8676109095165633e-07, + "logits/chosen": -2.517011880874634, + "logits/rejected": -2.228072166442871, + "logps/chosen": -186.47988891601562, + "logps/rejected": -169.34869384765625, + "loss": 0.8848, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4946534037590027, + "rewards/margins": 0.23119178414344788, + "rewards/rejected": -0.725845217704773, + "step": 508 + }, + { + "epoch": 0.06, + "learning_rate": 2.8672597448203203e-07, + "logits/chosen": -2.345167875289917, + "logits/rejected": -2.558375120162964, + "logps/chosen": -254.9164276123047, + "logps/rejected": -244.44924926757812, + "loss": 0.3451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5686162710189819, + "rewards/margins": 1.0440539121627808, + "rewards/rejected": -1.6126701831817627, + "step": 509 + }, + { + "epoch": 0.06, + "learning_rate": 2.866908580124078e-07, + "logits/chosen": -2.4540598392486572, + "logits/rejected": -2.6678848266601562, + "logps/chosen": -383.1554870605469, + "logps/rejected": -325.0205383300781, + "loss": 0.3709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.535091757774353, + "rewards/margins": 1.155277132987976, + "rewards/rejected": -1.6903687715530396, + "step": 510 + }, + { + "epoch": 0.06, + "learning_rate": 2.866557415427836e-07, + "logits/chosen": -2.205841541290283, + "logits/rejected": -2.265838146209717, + "logps/chosen": -354.46246337890625, + "logps/rejected": -297.1405334472656, + "loss": 0.7219, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5000820159912109, + "rewards/margins": 0.21353541314601898, + "rewards/rejected": -0.7136174440383911, + "step": 511 + }, + { + "epoch": 0.06, + "learning_rate": 2.866206250731593e-07, + "logits/chosen": -2.028069019317627, + "logits/rejected": -2.4222300052642822, + "logps/chosen": -325.65924072265625, + "logps/rejected": -291.92987060546875, + "loss": 1.3152, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.7481026649475098, + "rewards/margins": -0.5096114873886108, + "rewards/rejected": -1.238491177558899, + "step": 512 + }, + { + "epoch": 0.06, + "learning_rate": 2.8658550860353505e-07, + "logits/chosen": -2.5223326683044434, + "logits/rejected": -2.7250735759735107, + "logps/chosen": -261.90228271484375, + "logps/rejected": -193.21585083007812, + "loss": 0.2737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14694663882255554, + "rewards/margins": 1.3100440502166748, + "rewards/rejected": -1.4569905996322632, + "step": 513 + }, + { + "epoch": 0.06, + "learning_rate": 2.865503921339108e-07, + "logits/chosen": -2.861100435256958, + "logits/rejected": -2.647108793258667, + "logps/chosen": -233.51927185058594, + "logps/rejected": -248.77285766601562, + "loss": 0.5952, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8981175422668457, + "rewards/margins": 0.43512627482414246, + "rewards/rejected": -1.3332438468933105, + "step": 514 + }, + { + "epoch": 0.06, + "learning_rate": 2.8651527566428656e-07, + "logits/chosen": -1.7699674367904663, + "logits/rejected": -1.6591846942901611, + "logps/chosen": -337.2425537109375, + "logps/rejected": -385.3441162109375, + "loss": 0.3433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2594868838787079, + "rewards/margins": 2.0850014686584473, + "rewards/rejected": -2.3444883823394775, + "step": 515 + }, + { + "epoch": 0.06, + "learning_rate": 2.864801591946623e-07, + "logits/chosen": -2.5089988708496094, + "logits/rejected": -2.6583220958709717, + "logps/chosen": -416.75701904296875, + "logps/rejected": -254.9019317626953, + "loss": 0.427, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23384761810302734, + "rewards/margins": 0.9408758878707886, + "rewards/rejected": -1.174723505973816, + "step": 516 + }, + { + "epoch": 0.06, + "learning_rate": 2.86445042725038e-07, + "logits/chosen": -2.3576440811157227, + "logits/rejected": -2.2428457736968994, + "logps/chosen": -328.0191650390625, + "logps/rejected": -271.78546142578125, + "loss": 0.6314, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0598515272140503, + "rewards/margins": 0.8882557153701782, + "rewards/rejected": -1.9481072425842285, + "step": 517 + }, + { + "epoch": 0.06, + "learning_rate": 2.8640992625541376e-07, + "logits/chosen": -1.9472200870513916, + "logits/rejected": -2.115159034729004, + "logps/chosen": -594.66748046875, + "logps/rejected": -453.6813049316406, + "loss": 0.7581, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6571948528289795, + "rewards/margins": 0.4378705620765686, + "rewards/rejected": -2.0950655937194824, + "step": 518 + }, + { + "epoch": 0.06, + "learning_rate": 2.863748097857895e-07, + "logits/chosen": -2.299154281616211, + "logits/rejected": -2.3799805641174316, + "logps/chosen": -155.60800170898438, + "logps/rejected": -152.24407958984375, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5776726007461548, + "rewards/margins": 0.7220948338508606, + "rewards/rejected": -1.2997674942016602, + "step": 519 + }, + { + "epoch": 0.06, + "learning_rate": 2.8633969331616527e-07, + "logits/chosen": -2.016576051712036, + "logits/rejected": -2.257237434387207, + "logps/chosen": -302.2783203125, + "logps/rejected": -322.5005798339844, + "loss": 0.4002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14465925097465515, + "rewards/margins": 1.080497145652771, + "rewards/rejected": -1.2251564264297485, + "step": 520 + }, + { + "epoch": 0.06, + "learning_rate": 2.86304576846541e-07, + "logits/chosen": -3.0388078689575195, + "logits/rejected": -3.004591464996338, + "logps/chosen": -180.15931701660156, + "logps/rejected": -223.7476043701172, + "loss": 0.2766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3095465302467346, + "rewards/margins": 2.2447900772094727, + "rewards/rejected": -2.5543367862701416, + "step": 521 + }, + { + "epoch": 0.06, + "learning_rate": 2.862694603769167e-07, + "logits/chosen": -2.4702813625335693, + "logits/rejected": -2.5404038429260254, + "logps/chosen": -277.579345703125, + "logps/rejected": -231.14004516601562, + "loss": 1.2975, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2282202243804932, + "rewards/margins": -0.45997530221939087, + "rewards/rejected": -0.7682449817657471, + "step": 522 + }, + { + "epoch": 0.06, + "learning_rate": 2.8623434390729253e-07, + "logits/chosen": -2.206568956375122, + "logits/rejected": -2.125403642654419, + "logps/chosen": -229.415283203125, + "logps/rejected": -228.7527618408203, + "loss": 0.4504, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2940758764743805, + "rewards/margins": 1.1107146739959717, + "rewards/rejected": -1.4047904014587402, + "step": 523 + }, + { + "epoch": 0.06, + "learning_rate": 2.861992274376683e-07, + "logits/chosen": -2.1400585174560547, + "logits/rejected": -2.1048126220703125, + "logps/chosen": -499.6211242675781, + "logps/rejected": -425.48876953125, + "loss": 0.4943, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21377448737621307, + "rewards/margins": 1.141815185546875, + "rewards/rejected": -1.3555896282196045, + "step": 524 + }, + { + "epoch": 0.06, + "learning_rate": 2.86164110968044e-07, + "logits/chosen": -2.3273301124572754, + "logits/rejected": -2.365999460220337, + "logps/chosen": -132.45166015625, + "logps/rejected": -182.44036865234375, + "loss": 0.5684, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6656724214553833, + "rewards/margins": 0.4913822412490845, + "rewards/rejected": -2.1570546627044678, + "step": 525 + }, + { + "epoch": 0.06, + "learning_rate": 2.8612899449841974e-07, + "logits/chosen": -2.228276491165161, + "logits/rejected": -2.393404245376587, + "logps/chosen": -375.04595947265625, + "logps/rejected": -307.7579345703125, + "loss": 0.7011, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5203055739402771, + "rewards/margins": 0.9620857238769531, + "rewards/rejected": -1.4823912382125854, + "step": 526 + }, + { + "epoch": 0.06, + "learning_rate": 2.860938780287955e-07, + "logits/chosen": -2.7251243591308594, + "logits/rejected": -2.874398946762085, + "logps/chosen": -337.774169921875, + "logps/rejected": -386.8190002441406, + "loss": 0.323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5733043551445007, + "rewards/margins": 1.6797397136688232, + "rewards/rejected": -2.2530441284179688, + "step": 527 + }, + { + "epoch": 0.06, + "learning_rate": 2.8605876155917125e-07, + "logits/chosen": -2.2090795040130615, + "logits/rejected": -2.1655170917510986, + "logps/chosen": -346.36016845703125, + "logps/rejected": -355.1721496582031, + "loss": 0.7464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8035624027252197, + "rewards/margins": 0.45895469188690186, + "rewards/rejected": -1.2625172138214111, + "step": 528 + }, + { + "epoch": 0.06, + "learning_rate": 2.86023645089547e-07, + "logits/chosen": -2.614464521408081, + "logits/rejected": -2.8400464057922363, + "logps/chosen": -347.4024658203125, + "logps/rejected": -340.5770263671875, + "loss": 0.9452, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.518674612045288, + "rewards/margins": 0.9886889457702637, + "rewards/rejected": -2.507363796234131, + "step": 529 + }, + { + "epoch": 0.06, + "learning_rate": 2.859885286199227e-07, + "logits/chosen": -1.949316143989563, + "logits/rejected": -1.6618582010269165, + "logps/chosen": -274.5640563964844, + "logps/rejected": -216.50929260253906, + "loss": 0.6079, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6712145805358887, + "rewards/margins": 0.28072062134742737, + "rewards/rejected": -0.9519351720809937, + "step": 530 + }, + { + "epoch": 0.06, + "learning_rate": 2.8595341215029846e-07, + "logits/chosen": -2.807375907897949, + "logits/rejected": -2.684962272644043, + "logps/chosen": -192.6080322265625, + "logps/rejected": -296.95379638671875, + "loss": 0.3804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6459552049636841, + "rewards/margins": 1.1762930154800415, + "rewards/rejected": -1.8222482204437256, + "step": 531 + }, + { + "epoch": 0.06, + "learning_rate": 2.859182956806742e-07, + "logits/chosen": -2.2165303230285645, + "logits/rejected": -2.288768768310547, + "logps/chosen": -500.32861328125, + "logps/rejected": -330.06890869140625, + "loss": 0.5669, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9865425825119019, + "rewards/margins": 0.7929470539093018, + "rewards/rejected": -1.779489517211914, + "step": 532 + }, + { + "epoch": 0.06, + "learning_rate": 2.8588317921104997e-07, + "logits/chosen": -2.6247525215148926, + "logits/rejected": -2.5495784282684326, + "logps/chosen": -294.74951171875, + "logps/rejected": -168.83653259277344, + "loss": 0.6235, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5210382342338562, + "rewards/margins": 0.5782433152198792, + "rewards/rejected": -1.0992814302444458, + "step": 533 + }, + { + "epoch": 0.06, + "learning_rate": 2.858480627414257e-07, + "logits/chosen": -2.3525619506835938, + "logits/rejected": -2.351245641708374, + "logps/chosen": -212.06629943847656, + "logps/rejected": -160.16348266601562, + "loss": 1.8775, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.510328769683838, + "rewards/margins": -0.8973580002784729, + "rewards/rejected": -2.6129708290100098, + "step": 534 + }, + { + "epoch": 0.06, + "learning_rate": 2.8581294627180147e-07, + "logits/chosen": -1.96241295337677, + "logits/rejected": -2.248779058456421, + "logps/chosen": -356.6650085449219, + "logps/rejected": -242.39190673828125, + "loss": 0.8121, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8255573511123657, + "rewards/margins": 0.9896891117095947, + "rewards/rejected": -2.81524658203125, + "step": 535 + }, + { + "epoch": 0.06, + "learning_rate": 2.8577782980217723e-07, + "logits/chosen": -2.156428813934326, + "logits/rejected": -2.3613810539245605, + "logps/chosen": -301.8243713378906, + "logps/rejected": -297.7764892578125, + "loss": 0.4731, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1104983240365982, + "rewards/margins": 1.234626293182373, + "rewards/rejected": -1.34512460231781, + "step": 536 + }, + { + "epoch": 0.06, + "learning_rate": 2.85742713332553e-07, + "logits/chosen": -1.995229721069336, + "logits/rejected": -2.030209541320801, + "logps/chosen": -360.53009033203125, + "logps/rejected": -279.0227966308594, + "loss": 0.556, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4612087607383728, + "rewards/margins": 0.9338219165802002, + "rewards/rejected": -1.3950307369232178, + "step": 537 + }, + { + "epoch": 0.06, + "learning_rate": 2.857075968629287e-07, + "logits/chosen": -2.150573253631592, + "logits/rejected": -2.352241277694702, + "logps/chosen": -325.84869384765625, + "logps/rejected": -292.11834716796875, + "loss": 0.7125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8290085792541504, + "rewards/margins": 0.7548601031303406, + "rewards/rejected": -1.5838687419891357, + "step": 538 + }, + { + "epoch": 0.06, + "learning_rate": 2.8567248039330444e-07, + "logits/chosen": -2.187112808227539, + "logits/rejected": -2.3190693855285645, + "logps/chosen": -218.35446166992188, + "logps/rejected": -244.31045532226562, + "loss": 0.6954, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7664476633071899, + "rewards/margins": 0.8227839469909668, + "rewards/rejected": -1.5892316102981567, + "step": 539 + }, + { + "epoch": 0.06, + "learning_rate": 2.856373639236802e-07, + "logits/chosen": -2.2623817920684814, + "logits/rejected": -2.362368106842041, + "logps/chosen": -235.12709045410156, + "logps/rejected": -162.06362915039062, + "loss": 0.8523, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0410796403884888, + "rewards/margins": -0.06698489934206009, + "rewards/rejected": -0.974094808101654, + "step": 540 + }, + { + "epoch": 0.06, + "learning_rate": 2.8560224745405594e-07, + "logits/chosen": -2.14699125289917, + "logits/rejected": -2.7009549140930176, + "logps/chosen": -460.7552490234375, + "logps/rejected": -186.8417205810547, + "loss": 0.3865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7028893232345581, + "rewards/margins": 1.1888058185577393, + "rewards/rejected": -1.891695261001587, + "step": 541 + }, + { + "epoch": 0.06, + "learning_rate": 2.855671309844317e-07, + "logits/chosen": -2.4218339920043945, + "logits/rejected": -2.188239574432373, + "logps/chosen": -163.0276641845703, + "logps/rejected": -240.83299255371094, + "loss": 0.6341, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8355984687805176, + "rewards/margins": 1.1080650091171265, + "rewards/rejected": -1.943663477897644, + "step": 542 + }, + { + "epoch": 0.06, + "learning_rate": 2.855320145148074e-07, + "logits/chosen": -2.223639965057373, + "logits/rejected": -2.508796215057373, + "logps/chosen": -422.8446044921875, + "logps/rejected": -327.0591735839844, + "loss": 0.3307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24083463847637177, + "rewards/margins": 2.274681568145752, + "rewards/rejected": -2.5155160427093506, + "step": 543 + }, + { + "epoch": 0.06, + "learning_rate": 2.8549689804518315e-07, + "logits/chosen": -2.4025659561157227, + "logits/rejected": -2.507927417755127, + "logps/chosen": -261.26177978515625, + "logps/rejected": -211.11166381835938, + "loss": 0.4774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39768362045288086, + "rewards/margins": 1.3510444164276123, + "rewards/rejected": -1.7487279176712036, + "step": 544 + }, + { + "epoch": 0.06, + "learning_rate": 2.8546178157555896e-07, + "logits/chosen": -2.4989876747131348, + "logits/rejected": -2.1118927001953125, + "logps/chosen": -363.32232666015625, + "logps/rejected": -321.09136962890625, + "loss": 0.9871, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1684110164642334, + "rewards/margins": 0.3547738790512085, + "rewards/rejected": -1.5231850147247314, + "step": 545 + }, + { + "epoch": 0.06, + "learning_rate": 2.8542666510593466e-07, + "logits/chosen": -2.673856258392334, + "logits/rejected": -2.838371992111206, + "logps/chosen": -189.55638122558594, + "logps/rejected": -226.52960205078125, + "loss": 0.7204, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.871747612953186, + "rewards/margins": 0.5071892142295837, + "rewards/rejected": -1.378936767578125, + "step": 546 + }, + { + "epoch": 0.06, + "learning_rate": 2.853915486363104e-07, + "logits/chosen": -1.6426301002502441, + "logits/rejected": -1.8445273637771606, + "logps/chosen": -395.74200439453125, + "logps/rejected": -293.41082763671875, + "loss": 0.4779, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28373050689697266, + "rewards/margins": 0.6479069590568542, + "rewards/rejected": -0.9316374659538269, + "step": 547 + }, + { + "epoch": 0.06, + "learning_rate": 2.8535643216668617e-07, + "logits/chosen": -2.408344030380249, + "logits/rejected": -2.328474760055542, + "logps/chosen": -81.5421142578125, + "logps/rejected": -173.15911865234375, + "loss": 0.4207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7823613286018372, + "rewards/margins": 0.9406634569168091, + "rewards/rejected": -1.723024845123291, + "step": 548 + }, + { + "epoch": 0.06, + "learning_rate": 2.853213156970619e-07, + "logits/chosen": -2.6633124351501465, + "logits/rejected": -2.5509743690490723, + "logps/chosen": -274.98980712890625, + "logps/rejected": -340.84423828125, + "loss": 0.4362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2036823332309723, + "rewards/margins": 1.4513790607452393, + "rewards/rejected": -1.6550614833831787, + "step": 549 + }, + { + "epoch": 0.06, + "learning_rate": 2.852861992274377e-07, + "logits/chosen": -2.382020950317383, + "logits/rejected": -2.5281803607940674, + "logps/chosen": -414.7915954589844, + "logps/rejected": -256.79107666015625, + "loss": 0.5357, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8271595239639282, + "rewards/margins": 0.7030889987945557, + "rewards/rejected": -1.5302484035491943, + "step": 550 + }, + { + "epoch": 0.06, + "learning_rate": 2.852510827578134e-07, + "logits/chosen": -1.9694145917892456, + "logits/rejected": -1.9629161357879639, + "logps/chosen": -324.4478759765625, + "logps/rejected": -292.4576110839844, + "loss": 0.8479, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.890981137752533, + "rewards/margins": 0.5575976371765137, + "rewards/rejected": -1.4485788345336914, + "step": 551 + }, + { + "epoch": 0.06, + "learning_rate": 2.8521596628818913e-07, + "logits/chosen": -2.5865893363952637, + "logits/rejected": -2.481769323348999, + "logps/chosen": -212.58815002441406, + "logps/rejected": -248.81393432617188, + "loss": 0.4388, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20236466825008392, + "rewards/margins": 1.7607543468475342, + "rewards/rejected": -1.9631189107894897, + "step": 552 + }, + { + "epoch": 0.06, + "learning_rate": 2.851808498185649e-07, + "logits/chosen": -2.4482369422912598, + "logits/rejected": -2.4133033752441406, + "logps/chosen": -373.81915283203125, + "logps/rejected": -380.3388671875, + "loss": 0.5735, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24945272505283356, + "rewards/margins": 1.6262503862380981, + "rewards/rejected": -1.8757030963897705, + "step": 553 + }, + { + "epoch": 0.06, + "learning_rate": 2.8514573334894064e-07, + "logits/chosen": -2.6767756938934326, + "logits/rejected": -2.563443660736084, + "logps/chosen": -116.90715026855469, + "logps/rejected": -185.7330322265625, + "loss": 0.4795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9115958213806152, + "rewards/margins": 0.7659070491790771, + "rewards/rejected": -1.6775028705596924, + "step": 554 + }, + { + "epoch": 0.06, + "learning_rate": 2.851106168793164e-07, + "logits/chosen": -2.5113091468811035, + "logits/rejected": -2.502875804901123, + "logps/chosen": -168.72195434570312, + "logps/rejected": -323.6351013183594, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4422847628593445, + "rewards/margins": 1.7059640884399414, + "rewards/rejected": -2.1482489109039307, + "step": 555 + }, + { + "epoch": 0.06, + "learning_rate": 2.8507550040969215e-07, + "logits/chosen": -1.9006842374801636, + "logits/rejected": -1.8214384317398071, + "logps/chosen": -296.6321105957031, + "logps/rejected": -255.39059448242188, + "loss": 0.4728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23390904068946838, + "rewards/margins": 0.6860014200210571, + "rewards/rejected": -0.9199104905128479, + "step": 556 + }, + { + "epoch": 0.06, + "learning_rate": 2.850403839400679e-07, + "logits/chosen": -2.3318777084350586, + "logits/rejected": -2.470921516418457, + "logps/chosen": -171.65814208984375, + "logps/rejected": -172.55032348632812, + "loss": 0.5322, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7802038192749023, + "rewards/margins": 1.3024945259094238, + "rewards/rejected": -2.082698345184326, + "step": 557 + }, + { + "epoch": 0.06, + "learning_rate": 2.8500526747044365e-07, + "logits/chosen": -2.3028976917266846, + "logits/rejected": -2.216830015182495, + "logps/chosen": -317.55517578125, + "logps/rejected": -336.05999755859375, + "loss": 0.7695, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.502807855606079, + "rewards/margins": 0.11166301369667053, + "rewards/rejected": -1.6144709587097168, + "step": 558 + }, + { + "epoch": 0.06, + "learning_rate": 2.8497015100081935e-07, + "logits/chosen": -2.7770984172821045, + "logits/rejected": -2.7277982234954834, + "logps/chosen": -229.43540954589844, + "logps/rejected": -207.36294555664062, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07545699179172516, + "rewards/margins": 1.589446783065796, + "rewards/rejected": -1.6649037599563599, + "step": 559 + }, + { + "epoch": 0.06, + "learning_rate": 2.849350345311951e-07, + "logits/chosen": -1.927074909210205, + "logits/rejected": -2.030076026916504, + "logps/chosen": -407.06903076171875, + "logps/rejected": -244.00379943847656, + "loss": 0.6549, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5719247460365295, + "rewards/margins": 0.32482287287712097, + "rewards/rejected": -0.8967477083206177, + "step": 560 + }, + { + "epoch": 0.06, + "learning_rate": 2.8489991806157086e-07, + "logits/chosen": -2.298581600189209, + "logits/rejected": -2.314584493637085, + "logps/chosen": -299.3289794921875, + "logps/rejected": -361.09808349609375, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1278810501098633, + "rewards/margins": 1.4803597927093506, + "rewards/rejected": -2.608240842819214, + "step": 561 + }, + { + "epoch": 0.06, + "learning_rate": 2.848648015919466e-07, + "logits/chosen": -1.9909589290618896, + "logits/rejected": -1.8737173080444336, + "logps/chosen": -273.5435485839844, + "logps/rejected": -279.9205017089844, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8464503288269043, + "rewards/margins": 0.6620442271232605, + "rewards/rejected": -1.50849449634552, + "step": 562 + }, + { + "epoch": 0.06, + "learning_rate": 2.8482968512232237e-07, + "logits/chosen": -1.7255967855453491, + "logits/rejected": -1.872541904449463, + "logps/chosen": -394.7929382324219, + "logps/rejected": -326.75372314453125, + "loss": 0.4613, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27955198287963867, + "rewards/margins": 0.7802060842514038, + "rewards/rejected": -0.5006541013717651, + "step": 563 + }, + { + "epoch": 0.07, + "learning_rate": 2.8479456865269807e-07, + "logits/chosen": -1.9681568145751953, + "logits/rejected": -2.234327554702759, + "logps/chosen": -242.15618896484375, + "logps/rejected": -163.4337158203125, + "loss": 0.6662, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1928058862686157, + "rewards/margins": 0.27603790163993835, + "rewards/rejected": -1.468843698501587, + "step": 564 + }, + { + "epoch": 0.07, + "learning_rate": 2.847594521830738e-07, + "logits/chosen": -2.1409876346588135, + "logits/rejected": -2.0046987533569336, + "logps/chosen": -267.84027099609375, + "logps/rejected": -254.59388732910156, + "loss": 0.8739, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5937002897262573, + "rewards/margins": 0.8301939964294434, + "rewards/rejected": -2.4238944053649902, + "step": 565 + }, + { + "epoch": 0.07, + "learning_rate": 2.847243357134496e-07, + "logits/chosen": -2.4054386615753174, + "logits/rejected": -2.382655143737793, + "logps/chosen": -423.8988037109375, + "logps/rejected": -343.419189453125, + "loss": 0.5515, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6034489870071411, + "rewards/margins": 1.0904322862625122, + "rewards/rejected": -1.6938815116882324, + "step": 566 + }, + { + "epoch": 0.07, + "learning_rate": 2.8468921924382533e-07, + "logits/chosen": -1.7665634155273438, + "logits/rejected": -1.7932555675506592, + "logps/chosen": -246.75308227539062, + "logps/rejected": -297.90740966796875, + "loss": 0.3332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37182220816612244, + "rewards/margins": 1.3173054456710815, + "rewards/rejected": -1.6891276836395264, + "step": 567 + }, + { + "epoch": 0.07, + "learning_rate": 2.846541027742011e-07, + "logits/chosen": -2.0480284690856934, + "logits/rejected": -2.3054890632629395, + "logps/chosen": -288.6097717285156, + "logps/rejected": -268.9248352050781, + "loss": 0.4497, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4850616455078125, + "rewards/margins": 1.0057929754257202, + "rewards/rejected": -1.4908546209335327, + "step": 568 + }, + { + "epoch": 0.07, + "learning_rate": 2.8461898630457684e-07, + "logits/chosen": -2.0792458057403564, + "logits/rejected": -2.0737805366516113, + "logps/chosen": -374.8741760253906, + "logps/rejected": -258.86517333984375, + "loss": 0.8323, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5435534119606018, + "rewards/margins": -0.14180897176265717, + "rewards/rejected": -0.40174445509910583, + "step": 569 + }, + { + "epoch": 0.07, + "learning_rate": 2.845838698349526e-07, + "logits/chosen": -2.025177478790283, + "logits/rejected": -2.194270610809326, + "logps/chosen": -390.6951599121094, + "logps/rejected": -254.44410705566406, + "loss": 0.4234, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11258922517299652, + "rewards/margins": 1.501561164855957, + "rewards/rejected": -1.6141502857208252, + "step": 570 + }, + { + "epoch": 0.07, + "learning_rate": 2.8454875336532835e-07, + "logits/chosen": -2.1004385948181152, + "logits/rejected": -2.4027154445648193, + "logps/chosen": -307.43426513671875, + "logps/rejected": -160.63084411621094, + "loss": 0.6165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33317312598228455, + "rewards/margins": 0.41699057817459106, + "rewards/rejected": -0.750163733959198, + "step": 571 + }, + { + "epoch": 0.07, + "learning_rate": 2.8451363689570405e-07, + "logits/chosen": -2.978102684020996, + "logits/rejected": -2.9726085662841797, + "logps/chosen": -212.57473754882812, + "logps/rejected": -249.69680786132812, + "loss": 0.9697, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7916678786277771, + "rewards/margins": -0.44307664036750793, + "rewards/rejected": -0.34859126806259155, + "step": 572 + }, + { + "epoch": 0.07, + "learning_rate": 2.844785204260798e-07, + "logits/chosen": -2.4457032680511475, + "logits/rejected": -2.3080337047576904, + "logps/chosen": -374.36090087890625, + "logps/rejected": -276.4313659667969, + "loss": 0.509, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8241673111915588, + "rewards/margins": 0.8696883916854858, + "rewards/rejected": -1.6938557624816895, + "step": 573 + }, + { + "epoch": 0.07, + "learning_rate": 2.8444340395645556e-07, + "logits/chosen": -2.513352632522583, + "logits/rejected": -2.4461588859558105, + "logps/chosen": -206.23480224609375, + "logps/rejected": -204.37313842773438, + "loss": 0.9322, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9788445234298706, + "rewards/margins": 0.12979593873023987, + "rewards/rejected": -1.108640432357788, + "step": 574 + }, + { + "epoch": 0.07, + "learning_rate": 2.844082874868313e-07, + "logits/chosen": -2.187711000442505, + "logits/rejected": -2.467636823654175, + "logps/chosen": -408.4302978515625, + "logps/rejected": -252.083251953125, + "loss": 1.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1259889602661133, + "rewards/margins": -0.41418105363845825, + "rewards/rejected": -0.7118078470230103, + "step": 575 + }, + { + "epoch": 0.07, + "learning_rate": 2.8437317101720706e-07, + "logits/chosen": -2.7810866832733154, + "logits/rejected": -2.807138204574585, + "logps/chosen": -341.135009765625, + "logps/rejected": -299.8137512207031, + "loss": 0.3676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7735382318496704, + "rewards/margins": 1.174001932144165, + "rewards/rejected": -1.947540044784546, + "step": 576 + }, + { + "epoch": 0.07, + "learning_rate": 2.843380545475828e-07, + "logits/chosen": -2.5215067863464355, + "logits/rejected": -2.6615304946899414, + "logps/chosen": -163.5018310546875, + "logps/rejected": -225.276123046875, + "loss": 0.4069, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5220063924789429, + "rewards/margins": 0.965449869632721, + "rewards/rejected": -1.487456202507019, + "step": 577 + }, + { + "epoch": 0.07, + "learning_rate": 2.843029380779585e-07, + "logits/chosen": -2.0054008960723877, + "logits/rejected": -2.2778141498565674, + "logps/chosen": -372.28009033203125, + "logps/rejected": -199.91506958007812, + "loss": 1.1101, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9682658314704895, + "rewards/margins": -0.5345726609230042, + "rewards/rejected": -0.43369317054748535, + "step": 578 + }, + { + "epoch": 0.07, + "learning_rate": 2.842678216083343e-07, + "logits/chosen": -2.3892321586608887, + "logits/rejected": -2.6326804161071777, + "logps/chosen": -424.85382080078125, + "logps/rejected": -292.8460693359375, + "loss": 0.5141, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.553108811378479, + "rewards/margins": 0.5721700191497803, + "rewards/rejected": -1.1252787113189697, + "step": 579 + }, + { + "epoch": 0.07, + "learning_rate": 2.8423270513871e-07, + "logits/chosen": -2.5878162384033203, + "logits/rejected": -2.2715327739715576, + "logps/chosen": -191.1282958984375, + "logps/rejected": -240.7293701171875, + "loss": 0.6184, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6216900944709778, + "rewards/margins": 0.3214843273162842, + "rewards/rejected": -0.9431743621826172, + "step": 580 + }, + { + "epoch": 0.07, + "learning_rate": 2.841975886690858e-07, + "logits/chosen": -2.3481249809265137, + "logits/rejected": -2.6420738697052, + "logps/chosen": -308.14324951171875, + "logps/rejected": -266.9666748046875, + "loss": 0.5904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15720166265964508, + "rewards/margins": 0.7232730388641357, + "rewards/rejected": -0.8804746866226196, + "step": 581 + }, + { + "epoch": 0.07, + "learning_rate": 2.8416247219946153e-07, + "logits/chosen": -2.2702744007110596, + "logits/rejected": -2.443244457244873, + "logps/chosen": -162.43789672851562, + "logps/rejected": -151.09307861328125, + "loss": 0.7195, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6832289695739746, + "rewards/margins": 0.15245859324932098, + "rewards/rejected": -0.8356875777244568, + "step": 582 + }, + { + "epoch": 0.07, + "learning_rate": 2.841273557298373e-07, + "logits/chosen": -2.5107297897338867, + "logits/rejected": -2.4837851524353027, + "logps/chosen": -346.1005554199219, + "logps/rejected": -181.65591430664062, + "loss": 1.4104, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7565017938613892, + "rewards/margins": 0.2285371720790863, + "rewards/rejected": -1.9850389957427979, + "step": 583 + }, + { + "epoch": 0.07, + "learning_rate": 2.8409223926021304e-07, + "logits/chosen": -2.256894588470459, + "logits/rejected": -2.3115296363830566, + "logps/chosen": -221.0960693359375, + "logps/rejected": -183.41763305664062, + "loss": 0.6419, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.799615204334259, + "rewards/margins": 0.3813934922218323, + "rewards/rejected": -1.1810086965560913, + "step": 584 + }, + { + "epoch": 0.07, + "learning_rate": 2.840571227905888e-07, + "logits/chosen": -2.5885348320007324, + "logits/rejected": -2.682497024536133, + "logps/chosen": -210.64840698242188, + "logps/rejected": -190.55465698242188, + "loss": 0.5546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08748228847980499, + "rewards/margins": 0.9237456321716309, + "rewards/rejected": -1.0112278461456299, + "step": 585 + }, + { + "epoch": 0.07, + "learning_rate": 2.840220063209645e-07, + "logits/chosen": -3.0113980770111084, + "logits/rejected": -2.9064345359802246, + "logps/chosen": -391.74322509765625, + "logps/rejected": -292.6578369140625, + "loss": 0.3095, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19839192926883698, + "rewards/margins": 1.135427474975586, + "rewards/rejected": -0.9370355606079102, + "step": 586 + }, + { + "epoch": 0.07, + "learning_rate": 2.8398688985134025e-07, + "logits/chosen": -2.8763883113861084, + "logits/rejected": -2.966191053390503, + "logps/chosen": -223.33209228515625, + "logps/rejected": -163.33116149902344, + "loss": 0.3756, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17561723291873932, + "rewards/margins": 1.1888328790664673, + "rewards/rejected": -1.364450216293335, + "step": 587 + }, + { + "epoch": 0.07, + "learning_rate": 2.83951773381716e-07, + "logits/chosen": -1.9348125457763672, + "logits/rejected": -1.5540663003921509, + "logps/chosen": -415.6250305175781, + "logps/rejected": -450.1877746582031, + "loss": 0.7838, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3086490631103516, + "rewards/margins": 0.20159491896629333, + "rewards/rejected": -1.5102438926696777, + "step": 588 + }, + { + "epoch": 0.07, + "learning_rate": 2.8391665691209176e-07, + "logits/chosen": -2.485164165496826, + "logits/rejected": -2.188173294067383, + "logps/chosen": -419.8287048339844, + "logps/rejected": -497.723388671875, + "loss": 0.3581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3077627718448639, + "rewards/margins": 1.4692655801773071, + "rewards/rejected": -1.7770283222198486, + "step": 589 + }, + { + "epoch": 0.07, + "learning_rate": 2.838815404424675e-07, + "logits/chosen": -2.855917453765869, + "logits/rejected": -2.5991122722625732, + "logps/chosen": -378.3651428222656, + "logps/rejected": -332.6607971191406, + "loss": 0.4903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30137282609939575, + "rewards/margins": 1.336424708366394, + "rewards/rejected": -1.6377975940704346, + "step": 590 + }, + { + "epoch": 0.07, + "learning_rate": 2.8384642397284327e-07, + "logits/chosen": -2.3293638229370117, + "logits/rejected": -2.2511515617370605, + "logps/chosen": -179.00851440429688, + "logps/rejected": -206.40618896484375, + "loss": 0.5308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4920860528945923, + "rewards/margins": 0.8915743231773376, + "rewards/rejected": -1.3836603164672852, + "step": 591 + }, + { + "epoch": 0.07, + "learning_rate": 2.83811307503219e-07, + "logits/chosen": -2.481503486633301, + "logits/rejected": -2.568618059158325, + "logps/chosen": -266.6557922363281, + "logps/rejected": -196.52406311035156, + "loss": 0.5249, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4607193171977997, + "rewards/margins": 0.9590588212013245, + "rewards/rejected": -1.4197781085968018, + "step": 592 + }, + { + "epoch": 0.07, + "learning_rate": 2.8377619103359477e-07, + "logits/chosen": -2.0719122886657715, + "logits/rejected": -1.915618658065796, + "logps/chosen": -305.3727111816406, + "logps/rejected": -326.4906005859375, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6567394137382507, + "rewards/margins": 1.2478952407836914, + "rewards/rejected": -1.904634714126587, + "step": 593 + }, + { + "epoch": 0.07, + "learning_rate": 2.837410745639705e-07, + "logits/chosen": -2.410491466522217, + "logits/rejected": -2.138399600982666, + "logps/chosen": -317.99420166015625, + "logps/rejected": -432.5653381347656, + "loss": 0.3657, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8634762167930603, + "rewards/margins": 1.7855429649353027, + "rewards/rejected": -2.6490190029144287, + "step": 594 + }, + { + "epoch": 0.07, + "learning_rate": 2.8370595809434623e-07, + "logits/chosen": -2.387751817703247, + "logits/rejected": -2.341991662979126, + "logps/chosen": -265.4415283203125, + "logps/rejected": -226.33404541015625, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5028514862060547, + "rewards/margins": 1.0609010457992554, + "rewards/rejected": -1.56375253200531, + "step": 595 + }, + { + "epoch": 0.07, + "learning_rate": 2.83670841624722e-07, + "logits/chosen": -2.209841251373291, + "logits/rejected": -2.167846202850342, + "logps/chosen": -271.9286804199219, + "logps/rejected": -247.56863403320312, + "loss": 0.4305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3818988800048828, + "rewards/margins": 1.1262422800064087, + "rewards/rejected": -1.508141040802002, + "step": 596 + }, + { + "epoch": 0.07, + "learning_rate": 2.8363572515509774e-07, + "logits/chosen": -2.4940805435180664, + "logits/rejected": -2.5851833820343018, + "logps/chosen": -159.05477905273438, + "logps/rejected": -104.62156677246094, + "loss": 0.8611, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8037114143371582, + "rewards/margins": 0.03150567412376404, + "rewards/rejected": -0.8352171182632446, + "step": 597 + }, + { + "epoch": 0.07, + "learning_rate": 2.836006086854735e-07, + "logits/chosen": -2.096557855606079, + "logits/rejected": -2.335763454437256, + "logps/chosen": -350.3111572265625, + "logps/rejected": -121.9441146850586, + "loss": 0.8618, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.636158287525177, + "rewards/margins": 0.20762032270431519, + "rewards/rejected": -0.8437786102294922, + "step": 598 + }, + { + "epoch": 0.07, + "learning_rate": 2.835654922158492e-07, + "logits/chosen": -2.838256359100342, + "logits/rejected": -2.777284622192383, + "logps/chosen": -261.7278747558594, + "logps/rejected": -286.91412353515625, + "loss": 0.4896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3305845856666565, + "rewards/margins": 1.23065185546875, + "rewards/rejected": -1.5612365007400513, + "step": 599 + }, + { + "epoch": 0.07, + "learning_rate": 2.8353037574622494e-07, + "logits/chosen": -2.2389888763427734, + "logits/rejected": -2.3383562564849854, + "logps/chosen": -260.9029235839844, + "logps/rejected": -278.4179992675781, + "loss": 0.2117, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.013519763946533, + "rewards/margins": 1.8358051776885986, + "rewards/rejected": -3.849324941635132, + "step": 600 + }, + { + "epoch": 0.07, + "learning_rate": 2.8349525927660075e-07, + "logits/chosen": -2.01963472366333, + "logits/rejected": -2.2305614948272705, + "logps/chosen": -221.33319091796875, + "logps/rejected": -162.1153564453125, + "loss": 0.7658, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.103001594543457, + "rewards/margins": 0.026265501976013184, + "rewards/rejected": -1.1292670965194702, + "step": 601 + }, + { + "epoch": 0.07, + "learning_rate": 2.8346014280697645e-07, + "logits/chosen": -1.8874871730804443, + "logits/rejected": -2.0722951889038086, + "logps/chosen": -315.831298828125, + "logps/rejected": -333.26214599609375, + "loss": 0.2478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11636020243167877, + "rewards/margins": 2.1188104152679443, + "rewards/rejected": -2.235170602798462, + "step": 602 + }, + { + "epoch": 0.07, + "learning_rate": 2.834250263373522e-07, + "logits/chosen": -2.492694139480591, + "logits/rejected": -2.7566301822662354, + "logps/chosen": -332.27435302734375, + "logps/rejected": -257.67327880859375, + "loss": 0.4865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7292554974555969, + "rewards/margins": 1.8310401439666748, + "rewards/rejected": -2.560295820236206, + "step": 603 + }, + { + "epoch": 0.07, + "learning_rate": 2.8338990986772796e-07, + "logits/chosen": -2.0543711185455322, + "logits/rejected": -2.1177635192871094, + "logps/chosen": -174.86257934570312, + "logps/rejected": -240.09576416015625, + "loss": 1.1278, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9838579893112183, + "rewards/margins": 1.0295653343200684, + "rewards/rejected": -3.013423442840576, + "step": 604 + }, + { + "epoch": 0.07, + "learning_rate": 2.833547933981037e-07, + "logits/chosen": -1.9853198528289795, + "logits/rejected": -2.3912148475646973, + "logps/chosen": -383.35888671875, + "logps/rejected": -143.63754272460938, + "loss": 0.5674, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14619752764701843, + "rewards/margins": 0.40495479106903076, + "rewards/rejected": -0.2587572932243347, + "step": 605 + }, + { + "epoch": 0.07, + "learning_rate": 2.8331967692847947e-07, + "logits/chosen": -2.2137343883514404, + "logits/rejected": -2.492056369781494, + "logps/chosen": -346.4584045410156, + "logps/rejected": -187.76974487304688, + "loss": 0.6233, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41237983107566833, + "rewards/margins": 0.44383037090301514, + "rewards/rejected": -0.8562101125717163, + "step": 606 + }, + { + "epoch": 0.07, + "learning_rate": 2.8328456045885517e-07, + "logits/chosen": -2.269853353500366, + "logits/rejected": -2.5557377338409424, + "logps/chosen": -268.62005615234375, + "logps/rejected": -111.5399169921875, + "loss": 0.919, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0587838888168335, + "rewards/margins": 0.03683331608772278, + "rewards/rejected": -1.0956172943115234, + "step": 607 + }, + { + "epoch": 0.07, + "learning_rate": 2.832494439892309e-07, + "logits/chosen": -2.5230770111083984, + "logits/rejected": -2.4917781352996826, + "logps/chosen": -193.80938720703125, + "logps/rejected": -126.1186294555664, + "loss": 0.5424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21096931397914886, + "rewards/margins": 0.5498733520507812, + "rewards/rejected": -0.3389040231704712, + "step": 608 + }, + { + "epoch": 0.07, + "learning_rate": 2.832143275196067e-07, + "logits/chosen": -2.490896701812744, + "logits/rejected": -2.672422409057617, + "logps/chosen": -425.5343017578125, + "logps/rejected": -247.02392578125, + "loss": 0.5364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5454726815223694, + "rewards/margins": 0.6740742325782776, + "rewards/rejected": -1.219546914100647, + "step": 609 + }, + { + "epoch": 0.07, + "learning_rate": 2.8317921104998243e-07, + "logits/chosen": -2.0126869678497314, + "logits/rejected": -2.3111400604248047, + "logps/chosen": -454.74066162109375, + "logps/rejected": -281.052001953125, + "loss": 0.496, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2025426626205444, + "rewards/margins": 1.1307690143585205, + "rewards/rejected": -2.3333117961883545, + "step": 610 + }, + { + "epoch": 0.07, + "learning_rate": 2.831440945803582e-07, + "logits/chosen": -2.4970033168792725, + "logits/rejected": -2.3746609687805176, + "logps/chosen": -312.29888916015625, + "logps/rejected": -343.3963623046875, + "loss": 0.4619, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6497325897216797, + "rewards/margins": 1.461267352104187, + "rewards/rejected": -2.110999822616577, + "step": 611 + }, + { + "epoch": 0.07, + "learning_rate": 2.831089781107339e-07, + "logits/chosen": -2.192348003387451, + "logits/rejected": -2.180023193359375, + "logps/chosen": -182.81939697265625, + "logps/rejected": -210.64974975585938, + "loss": 0.9492, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7394486665725708, + "rewards/margins": 0.00923699140548706, + "rewards/rejected": -0.7486856579780579, + "step": 612 + }, + { + "epoch": 0.07, + "learning_rate": 2.830738616411097e-07, + "logits/chosen": -1.9650774002075195, + "logits/rejected": -1.9767951965332031, + "logps/chosen": -268.37139892578125, + "logps/rejected": -222.44737243652344, + "loss": 0.4301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1854250282049179, + "rewards/margins": 1.2275829315185547, + "rewards/rejected": -1.4130079746246338, + "step": 613 + }, + { + "epoch": 0.07, + "learning_rate": 2.8303874517148545e-07, + "logits/chosen": -1.5763193368911743, + "logits/rejected": -1.8386644124984741, + "logps/chosen": -401.9755859375, + "logps/rejected": -314.63555908203125, + "loss": 1.2858, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.218818187713623, + "rewards/margins": -0.15003274381160736, + "rewards/rejected": -1.068785309791565, + "step": 614 + }, + { + "epoch": 0.07, + "learning_rate": 2.8300362870186115e-07, + "logits/chosen": -2.2595667839050293, + "logits/rejected": -2.253070831298828, + "logps/chosen": -284.88372802734375, + "logps/rejected": -334.0962219238281, + "loss": 0.2957, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16701345145702362, + "rewards/margins": 1.5150341987609863, + "rewards/rejected": -1.6820476055145264, + "step": 615 + }, + { + "epoch": 0.07, + "learning_rate": 2.829685122322369e-07, + "logits/chosen": -2.4772329330444336, + "logits/rejected": -2.276102304458618, + "logps/chosen": -203.02130126953125, + "logps/rejected": -303.4725646972656, + "loss": 0.3171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21712753176689148, + "rewards/margins": 1.15693998336792, + "rewards/rejected": -1.3740675449371338, + "step": 616 + }, + { + "epoch": 0.07, + "learning_rate": 2.8293339576261265e-07, + "logits/chosen": -2.476404905319214, + "logits/rejected": -2.5124106407165527, + "logps/chosen": -246.77987670898438, + "logps/rejected": -331.6686096191406, + "loss": 0.6776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9454853534698486, + "rewards/margins": 1.037245750427246, + "rewards/rejected": -1.9827309846878052, + "step": 617 + }, + { + "epoch": 0.07, + "learning_rate": 2.828982792929884e-07, + "logits/chosen": -2.3647682666778564, + "logits/rejected": -2.1550421714782715, + "logps/chosen": -409.0311279296875, + "logps/rejected": -436.6133117675781, + "loss": 0.4926, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23959536850452423, + "rewards/margins": 1.4159497022628784, + "rewards/rejected": -1.6555449962615967, + "step": 618 + }, + { + "epoch": 0.07, + "learning_rate": 2.8286316282336416e-07, + "logits/chosen": -3.0056519508361816, + "logits/rejected": -2.8792595863342285, + "logps/chosen": -201.8819122314453, + "logps/rejected": -157.08401489257812, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37284061312675476, + "rewards/margins": 0.9126477241516113, + "rewards/rejected": -1.2854883670806885, + "step": 619 + }, + { + "epoch": 0.07, + "learning_rate": 2.8282804635373986e-07, + "logits/chosen": -2.578747272491455, + "logits/rejected": -2.6838903427124023, + "logps/chosen": -298.3250732421875, + "logps/rejected": -274.04498291015625, + "loss": 0.3619, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22988323867321014, + "rewards/margins": 1.6534756422042847, + "rewards/rejected": -1.4235923290252686, + "step": 620 + }, + { + "epoch": 0.07, + "learning_rate": 2.827929298841156e-07, + "logits/chosen": -2.171499490737915, + "logits/rejected": -2.199965000152588, + "logps/chosen": -249.81494140625, + "logps/rejected": -262.7466735839844, + "loss": 0.895, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.317652940750122, + "rewards/margins": 0.04661652445793152, + "rewards/rejected": -1.364269495010376, + "step": 621 + }, + { + "epoch": 0.07, + "learning_rate": 2.827578134144914e-07, + "logits/chosen": -2.7277536392211914, + "logits/rejected": -2.6778573989868164, + "logps/chosen": -122.53204345703125, + "logps/rejected": -223.12152099609375, + "loss": 0.393, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7706204652786255, + "rewards/margins": 1.2374976873397827, + "rewards/rejected": -2.008118152618408, + "step": 622 + }, + { + "epoch": 0.07, + "learning_rate": 2.827226969448671e-07, + "logits/chosen": -2.7635350227355957, + "logits/rejected": -2.690786123275757, + "logps/chosen": -225.25994873046875, + "logps/rejected": -334.81329345703125, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7724786996841431, + "rewards/margins": 1.2916539907455444, + "rewards/rejected": -2.0641326904296875, + "step": 623 + }, + { + "epoch": 0.07, + "learning_rate": 2.826875804752429e-07, + "logits/chosen": -2.406170129776001, + "logits/rejected": -2.533757448196411, + "logps/chosen": -750.5345458984375, + "logps/rejected": -295.5158996582031, + "loss": 0.6307, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.115107536315918, + "rewards/margins": 0.7633681297302246, + "rewards/rejected": -1.8784756660461426, + "step": 624 + }, + { + "epoch": 0.07, + "learning_rate": 2.8265246400561863e-07, + "logits/chosen": -2.3439629077911377, + "logits/rejected": -2.534212827682495, + "logps/chosen": -482.3173828125, + "logps/rejected": -364.580810546875, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18997615575790405, + "rewards/margins": 2.050342321395874, + "rewards/rejected": -2.2403182983398438, + "step": 625 + }, + { + "epoch": 0.07, + "learning_rate": 2.826173475359944e-07, + "logits/chosen": -1.9570854902267456, + "logits/rejected": -1.9366888999938965, + "logps/chosen": -218.934326171875, + "logps/rejected": -276.7323913574219, + "loss": 0.4644, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4769183397293091, + "rewards/margins": 1.3866766691207886, + "rewards/rejected": -1.8635951280593872, + "step": 626 + }, + { + "epoch": 0.07, + "learning_rate": 2.8258223106637014e-07, + "logits/chosen": -1.9664170742034912, + "logits/rejected": -2.1062068939208984, + "logps/chosen": -432.6546325683594, + "logps/rejected": -264.1221923828125, + "loss": 0.2724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07906076312065125, + "rewards/margins": 1.4525604248046875, + "rewards/rejected": -1.3734996318817139, + "step": 627 + }, + { + "epoch": 0.07, + "learning_rate": 2.8254711459674584e-07, + "logits/chosen": -2.440387725830078, + "logits/rejected": -2.0619187355041504, + "logps/chosen": -236.20074462890625, + "logps/rejected": -444.21539306640625, + "loss": 0.2474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6125345826148987, + "rewards/margins": 2.1352896690368652, + "rewards/rejected": -2.7478244304656982, + "step": 628 + }, + { + "epoch": 0.07, + "learning_rate": 2.825119981271216e-07, + "logits/chosen": -2.6708524227142334, + "logits/rejected": -2.74470591545105, + "logps/chosen": -215.59808349609375, + "logps/rejected": -236.48464965820312, + "loss": 0.5208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35500961542129517, + "rewards/margins": 1.4399056434631348, + "rewards/rejected": -1.7949153184890747, + "step": 629 + }, + { + "epoch": 0.07, + "learning_rate": 2.8247688165749735e-07, + "logits/chosen": -2.0033397674560547, + "logits/rejected": -2.1881468296051025, + "logps/chosen": -401.00848388671875, + "logps/rejected": -242.2111358642578, + "loss": 1.1838, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.558022975921631, + "rewards/margins": 0.2904486656188965, + "rewards/rejected": -2.8484716415405273, + "step": 630 + }, + { + "epoch": 0.07, + "learning_rate": 2.824417651878731e-07, + "logits/chosen": -1.6877994537353516, + "logits/rejected": -1.7217804193496704, + "logps/chosen": -406.7537536621094, + "logps/rejected": -456.74334716796875, + "loss": 0.5008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.052463360130786896, + "rewards/margins": 0.7956773042678833, + "rewards/rejected": -0.8481407165527344, + "step": 631 + }, + { + "epoch": 0.07, + "learning_rate": 2.8240664871824886e-07, + "logits/chosen": -2.539409637451172, + "logits/rejected": -2.6233086585998535, + "logps/chosen": -204.0065155029297, + "logps/rejected": -226.86817932128906, + "loss": 0.4791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23734669387340546, + "rewards/margins": 0.8846979141235352, + "rewards/rejected": -1.122044563293457, + "step": 632 + }, + { + "epoch": 0.07, + "learning_rate": 2.8237153224862456e-07, + "logits/chosen": -2.6619770526885986, + "logits/rejected": -2.6246113777160645, + "logps/chosen": -269.648681640625, + "logps/rejected": -200.05438232421875, + "loss": 0.3724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42062121629714966, + "rewards/margins": 1.8276509046554565, + "rewards/rejected": -2.24827241897583, + "step": 633 + }, + { + "epoch": 0.07, + "learning_rate": 2.823364157790003e-07, + "logits/chosen": -2.0954527854919434, + "logits/rejected": -2.219635009765625, + "logps/chosen": -218.19268798828125, + "logps/rejected": -183.45257568359375, + "loss": 0.4125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02984705939888954, + "rewards/margins": 0.9590607285499573, + "rewards/rejected": -0.9889078140258789, + "step": 634 + }, + { + "epoch": 0.07, + "learning_rate": 2.823012993093761e-07, + "logits/chosen": -2.6564955711364746, + "logits/rejected": -2.5149855613708496, + "logps/chosen": -273.099853515625, + "logps/rejected": -225.57284545898438, + "loss": 0.3569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33076006174087524, + "rewards/margins": 1.3690595626831055, + "rewards/rejected": -1.699819803237915, + "step": 635 + }, + { + "epoch": 0.07, + "learning_rate": 2.822661828397518e-07, + "logits/chosen": -1.8517732620239258, + "logits/rejected": -2.325212240219116, + "logps/chosen": -439.01788330078125, + "logps/rejected": -387.389404296875, + "loss": 0.8883, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8667185306549072, + "rewards/margins": 0.2184535562992096, + "rewards/rejected": -1.0851720571517944, + "step": 636 + }, + { + "epoch": 0.07, + "learning_rate": 2.8223106637012757e-07, + "logits/chosen": -2.426697254180908, + "logits/rejected": -2.0543205738067627, + "logps/chosen": -133.8959503173828, + "logps/rejected": -353.39208984375, + "loss": 0.1922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34824231266975403, + "rewards/margins": 2.0190012454986572, + "rewards/rejected": -2.3672432899475098, + "step": 637 + }, + { + "epoch": 0.07, + "learning_rate": 2.821959499005033e-07, + "logits/chosen": -2.3917136192321777, + "logits/rejected": -2.1776623725891113, + "logps/chosen": -146.8810272216797, + "logps/rejected": -264.3834228515625, + "loss": 0.6546, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.074717402458191, + "rewards/margins": 0.5941792726516724, + "rewards/rejected": -1.6688966751098633, + "step": 638 + }, + { + "epoch": 0.07, + "learning_rate": 2.821608334308791e-07, + "logits/chosen": -2.2456889152526855, + "logits/rejected": -2.43654727935791, + "logps/chosen": -363.94830322265625, + "logps/rejected": -299.738525390625, + "loss": 0.5956, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28985321521759033, + "rewards/margins": 0.7587301135063171, + "rewards/rejected": -1.0485832691192627, + "step": 639 + }, + { + "epoch": 0.07, + "learning_rate": 2.8212571696125483e-07, + "logits/chosen": -2.381514072418213, + "logits/rejected": -2.2565572261810303, + "logps/chosen": -207.4835662841797, + "logps/rejected": -366.3134765625, + "loss": 0.2691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5103274583816528, + "rewards/margins": 2.6497609615325928, + "rewards/rejected": -3.160088539123535, + "step": 640 + }, + { + "epoch": 0.07, + "learning_rate": 2.8209060049163053e-07, + "logits/chosen": -2.390377998352051, + "logits/rejected": -2.636819839477539, + "logps/chosen": -321.63580322265625, + "logps/rejected": -285.4607238769531, + "loss": 0.2538, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03827035427093506, + "rewards/margins": 1.2898340225219727, + "rewards/rejected": -1.3281043767929077, + "step": 641 + }, + { + "epoch": 0.07, + "learning_rate": 2.820554840220063e-07, + "logits/chosen": -2.387016773223877, + "logits/rejected": -2.3572449684143066, + "logps/chosen": -231.04025268554688, + "logps/rejected": -242.41448974609375, + "loss": 0.3565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3494061529636383, + "rewards/margins": 1.0369763374328613, + "rewards/rejected": -1.3863825798034668, + "step": 642 + }, + { + "epoch": 0.07, + "learning_rate": 2.8202036755238204e-07, + "logits/chosen": -2.128512382507324, + "logits/rejected": -2.1113529205322266, + "logps/chosen": -442.78570556640625, + "logps/rejected": -314.4978332519531, + "loss": 0.459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.931515097618103, + "rewards/margins": 1.1765776872634888, + "rewards/rejected": -2.108092784881592, + "step": 643 + }, + { + "epoch": 0.07, + "learning_rate": 2.819852510827578e-07, + "logits/chosen": -2.5780107975006104, + "logits/rejected": -2.654625415802002, + "logps/chosen": -366.4516906738281, + "logps/rejected": -275.19097900390625, + "loss": 0.1804, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1635947227478027, + "rewards/margins": 2.4193501472473145, + "rewards/rejected": -3.582944869995117, + "step": 644 + }, + { + "epoch": 0.07, + "learning_rate": 2.8195013461313355e-07, + "logits/chosen": -2.299079418182373, + "logits/rejected": -2.4038236141204834, + "logps/chosen": -320.19659423828125, + "logps/rejected": -194.92889404296875, + "loss": 0.2089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07813926786184311, + "rewards/margins": 1.953416347503662, + "rewards/rejected": -2.031555652618408, + "step": 645 + }, + { + "epoch": 0.07, + "learning_rate": 2.819150181435093e-07, + "logits/chosen": -2.070345878601074, + "logits/rejected": -2.235501289367676, + "logps/chosen": -345.6457824707031, + "logps/rejected": -351.4801330566406, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8219394087791443, + "rewards/margins": 2.4268746376037598, + "rewards/rejected": -3.248814105987549, + "step": 646 + }, + { + "epoch": 0.07, + "learning_rate": 2.8187990167388506e-07, + "logits/chosen": -2.487227439880371, + "logits/rejected": -2.6960034370422363, + "logps/chosen": -181.2325439453125, + "logps/rejected": -231.54013061523438, + "loss": 0.3867, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42247340083122253, + "rewards/margins": 1.5322457551956177, + "rewards/rejected": -1.9547193050384521, + "step": 647 + }, + { + "epoch": 0.07, + "learning_rate": 2.818447852042608e-07, + "logits/chosen": -2.597653865814209, + "logits/rejected": -2.525728702545166, + "logps/chosen": -264.2349853515625, + "logps/rejected": -182.57635498046875, + "loss": 0.7965, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1252950429916382, + "rewards/margins": 0.22656959295272827, + "rewards/rejected": -1.3518646955490112, + "step": 648 + }, + { + "epoch": 0.07, + "learning_rate": 2.818096687346365e-07, + "logits/chosen": -2.9054975509643555, + "logits/rejected": -2.831308603286743, + "logps/chosen": -281.11541748046875, + "logps/rejected": -196.47691345214844, + "loss": 0.5432, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7383607625961304, + "rewards/margins": 0.7871567606925964, + "rewards/rejected": -1.525517463684082, + "step": 649 + }, + { + "epoch": 0.07, + "learning_rate": 2.8177455226501227e-07, + "logits/chosen": -2.5604476928710938, + "logits/rejected": -2.4529614448547363, + "logps/chosen": -188.1148681640625, + "logps/rejected": -246.44515991210938, + "loss": 0.6152, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6489608287811279, + "rewards/margins": 0.8503326177597046, + "rewards/rejected": -1.4992934465408325, + "step": 650 + }, + { + "epoch": 0.08, + "learning_rate": 2.81739435795388e-07, + "logits/chosen": -3.037039279937744, + "logits/rejected": -2.9462547302246094, + "logps/chosen": -287.00872802734375, + "logps/rejected": -198.03219604492188, + "loss": 0.3793, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3345130383968353, + "rewards/margins": 1.8743391036987305, + "rewards/rejected": -2.2088522911071777, + "step": 651 + }, + { + "epoch": 0.08, + "learning_rate": 2.817043193257638e-07, + "logits/chosen": -2.5542171001434326, + "logits/rejected": -2.532960891723633, + "logps/chosen": -332.9786071777344, + "logps/rejected": -322.94476318359375, + "loss": 0.4675, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6539329886436462, + "rewards/margins": 1.0393493175506592, + "rewards/rejected": -1.693282127380371, + "step": 652 + }, + { + "epoch": 0.08, + "learning_rate": 2.8166920285613953e-07, + "logits/chosen": -1.8641458749771118, + "logits/rejected": -1.9792832136154175, + "logps/chosen": -304.7632751464844, + "logps/rejected": -288.716552734375, + "loss": 0.2648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7057795524597168, + "rewards/margins": 1.7230284214019775, + "rewards/rejected": -2.4288077354431152, + "step": 653 + }, + { + "epoch": 0.08, + "learning_rate": 2.816340863865153e-07, + "logits/chosen": -1.814190149307251, + "logits/rejected": -2.126420259475708, + "logps/chosen": -423.5740661621094, + "logps/rejected": -326.1535339355469, + "loss": 0.7514, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0513386726379395, + "rewards/margins": 0.1767864227294922, + "rewards/rejected": -1.2281250953674316, + "step": 654 + }, + { + "epoch": 0.08, + "learning_rate": 2.81598969916891e-07, + "logits/chosen": -2.446232795715332, + "logits/rejected": -2.411175012588501, + "logps/chosen": -285.80584716796875, + "logps/rejected": -209.7812042236328, + "loss": 0.4497, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.192529320716858, + "rewards/margins": 0.8039804697036743, + "rewards/rejected": -1.9965097904205322, + "step": 655 + }, + { + "epoch": 0.08, + "learning_rate": 2.815638534472668e-07, + "logits/chosen": -2.613006114959717, + "logits/rejected": -2.617305278778076, + "logps/chosen": -184.25762939453125, + "logps/rejected": -280.9164123535156, + "loss": 0.2019, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023012347519397736, + "rewards/margins": 2.9113833904266357, + "rewards/rejected": -2.888371467590332, + "step": 656 + }, + { + "epoch": 0.08, + "learning_rate": 2.815287369776425e-07, + "logits/chosen": -2.07599139213562, + "logits/rejected": -2.237638473510742, + "logps/chosen": -207.68003845214844, + "logps/rejected": -213.77630615234375, + "loss": 0.4693, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7654252648353577, + "rewards/margins": 1.422724962234497, + "rewards/rejected": -2.18815016746521, + "step": 657 + }, + { + "epoch": 0.08, + "learning_rate": 2.8149362050801824e-07, + "logits/chosen": -2.4694700241088867, + "logits/rejected": -2.2042503356933594, + "logps/chosen": -308.64569091796875, + "logps/rejected": -355.19696044921875, + "loss": 0.2503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4395456910133362, + "rewards/margins": 2.22979736328125, + "rewards/rejected": -2.6693427562713623, + "step": 658 + }, + { + "epoch": 0.08, + "learning_rate": 2.81458504038394e-07, + "logits/chosen": -2.015540599822998, + "logits/rejected": -2.020420551300049, + "logps/chosen": -183.70059204101562, + "logps/rejected": -159.38302612304688, + "loss": 1.0713, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4110078811645508, + "rewards/margins": 0.3136792480945587, + "rewards/rejected": -1.724687099456787, + "step": 659 + }, + { + "epoch": 0.08, + "learning_rate": 2.8142338756876975e-07, + "logits/chosen": -2.5635151863098145, + "logits/rejected": -2.4932403564453125, + "logps/chosen": -141.72267150878906, + "logps/rejected": -308.15606689453125, + "loss": 0.7423, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8795514106750488, + "rewards/margins": 0.2741795480251312, + "rewards/rejected": -1.153730869293213, + "step": 660 + }, + { + "epoch": 0.08, + "learning_rate": 2.813882710991455e-07, + "logits/chosen": -1.7277942895889282, + "logits/rejected": -1.925363540649414, + "logps/chosen": -436.93865966796875, + "logps/rejected": -361.3018798828125, + "loss": 0.5013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07024441659450531, + "rewards/margins": 0.8134360313415527, + "rewards/rejected": -0.8836804628372192, + "step": 661 + }, + { + "epoch": 0.08, + "learning_rate": 2.813531546295212e-07, + "logits/chosen": -2.0592093467712402, + "logits/rejected": -2.376985788345337, + "logps/chosen": -423.47796630859375, + "logps/rejected": -221.44515991210938, + "loss": 0.4138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8840816617012024, + "rewards/margins": 0.9488204717636108, + "rewards/rejected": -1.8329020738601685, + "step": 662 + }, + { + "epoch": 0.08, + "learning_rate": 2.8131803815989696e-07, + "logits/chosen": -2.3550825119018555, + "logits/rejected": -2.6616322994232178, + "logps/chosen": -329.29046630859375, + "logps/rejected": -220.52093505859375, + "loss": 0.3851, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20492278039455414, + "rewards/margins": 1.2830890417099, + "rewards/rejected": -1.4880117177963257, + "step": 663 + }, + { + "epoch": 0.08, + "learning_rate": 2.812829216902727e-07, + "logits/chosen": -1.8473544120788574, + "logits/rejected": -1.8562769889831543, + "logps/chosen": -719.3099975585938, + "logps/rejected": -517.6641235351562, + "loss": 0.6945, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5823758244514465, + "rewards/margins": 1.3923704624176025, + "rewards/rejected": -1.9747464656829834, + "step": 664 + }, + { + "epoch": 0.08, + "learning_rate": 2.8124780522064847e-07, + "logits/chosen": -2.078197479248047, + "logits/rejected": -2.190197229385376, + "logps/chosen": -230.0837860107422, + "logps/rejected": -245.03616333007812, + "loss": 0.7319, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0529155731201172, + "rewards/margins": 0.5329278707504272, + "rewards/rejected": -1.5858434438705444, + "step": 665 + }, + { + "epoch": 0.08, + "learning_rate": 2.812126887510242e-07, + "logits/chosen": -1.9105074405670166, + "logits/rejected": -1.8378106355667114, + "logps/chosen": -158.76065063476562, + "logps/rejected": -206.23477172851562, + "loss": 0.441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1585404872894287, + "rewards/margins": 1.0974591970443726, + "rewards/rejected": -1.2559998035430908, + "step": 666 + }, + { + "epoch": 0.08, + "learning_rate": 2.811775722814e-07, + "logits/chosen": -2.6381020545959473, + "logits/rejected": -2.439478874206543, + "logps/chosen": -284.014892578125, + "logps/rejected": -372.5776062011719, + "loss": 0.9023, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.215943455696106, + "rewards/margins": 0.49847903847694397, + "rewards/rejected": -1.7144224643707275, + "step": 667 + }, + { + "epoch": 0.08, + "learning_rate": 2.811424558117757e-07, + "logits/chosen": -2.1988229751586914, + "logits/rejected": -2.1402299404144287, + "logps/chosen": -451.5899353027344, + "logps/rejected": -426.776123046875, + "loss": 0.729, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3938140869140625, + "rewards/margins": 0.5835897922515869, + "rewards/rejected": -0.9774038791656494, + "step": 668 + }, + { + "epoch": 0.08, + "learning_rate": 2.811073393421515e-07, + "logits/chosen": -2.0592234134674072, + "logits/rejected": -2.3253207206726074, + "logps/chosen": -352.448486328125, + "logps/rejected": -215.867431640625, + "loss": 0.7487, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6072904467582703, + "rewards/margins": 0.307243287563324, + "rewards/rejected": -0.9145337343215942, + "step": 669 + }, + { + "epoch": 0.08, + "learning_rate": 2.810722228725272e-07, + "logits/chosen": -2.1171090602874756, + "logits/rejected": -2.3445823192596436, + "logps/chosen": -439.3739929199219, + "logps/rejected": -452.7837829589844, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.785253643989563, + "rewards/margins": 1.658212661743164, + "rewards/rejected": -2.4434661865234375, + "step": 670 + }, + { + "epoch": 0.08, + "learning_rate": 2.8103710640290294e-07, + "logits/chosen": -2.3758716583251953, + "logits/rejected": -2.2815237045288086, + "logps/chosen": -316.3427429199219, + "logps/rejected": -265.7826232910156, + "loss": 0.2658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5815210938453674, + "rewards/margins": 1.381862998008728, + "rewards/rejected": -1.9633840322494507, + "step": 671 + }, + { + "epoch": 0.08, + "learning_rate": 2.810019899332787e-07, + "logits/chosen": -2.7975997924804688, + "logits/rejected": -2.761309862136841, + "logps/chosen": -256.3267822265625, + "logps/rejected": -191.085693359375, + "loss": 0.6144, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8847416043281555, + "rewards/margins": 0.41729187965393066, + "rewards/rejected": -1.3020334243774414, + "step": 672 + }, + { + "epoch": 0.08, + "learning_rate": 2.8096687346365445e-07, + "logits/chosen": -2.570150852203369, + "logits/rejected": -2.5853333473205566, + "logps/chosen": -400.1475830078125, + "logps/rejected": -340.15960693359375, + "loss": 0.4165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19714151322841644, + "rewards/margins": 1.5226893424987793, + "rewards/rejected": -1.719830870628357, + "step": 673 + }, + { + "epoch": 0.08, + "learning_rate": 2.809317569940302e-07, + "logits/chosen": -2.234131336212158, + "logits/rejected": -2.421767234802246, + "logps/chosen": -405.0108337402344, + "logps/rejected": -264.6357727050781, + "loss": 0.3474, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0907490253448486, + "rewards/margins": 1.404133915901184, + "rewards/rejected": -2.4948830604553223, + "step": 674 + }, + { + "epoch": 0.08, + "learning_rate": 2.8089664052440595e-07, + "logits/chosen": -2.267219305038452, + "logits/rejected": -2.348280906677246, + "logps/chosen": -222.4208984375, + "logps/rejected": -275.7489013671875, + "loss": 0.4499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5908468961715698, + "rewards/margins": 0.9151750802993774, + "rewards/rejected": -1.5060219764709473, + "step": 675 + }, + { + "epoch": 0.08, + "learning_rate": 2.8086152405478166e-07, + "logits/chosen": -2.663835048675537, + "logits/rejected": -2.2205018997192383, + "logps/chosen": -330.6203308105469, + "logps/rejected": -387.1350402832031, + "loss": 0.3534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3766900897026062, + "rewards/margins": 2.2806873321533203, + "rewards/rejected": -2.6573777198791504, + "step": 676 + }, + { + "epoch": 0.08, + "learning_rate": 2.808264075851574e-07, + "logits/chosen": -2.6273460388183594, + "logits/rejected": -2.567544460296631, + "logps/chosen": -298.59539794921875, + "logps/rejected": -132.38710021972656, + "loss": 0.529, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4363615810871124, + "rewards/margins": 0.785159707069397, + "rewards/rejected": -1.221521258354187, + "step": 677 + }, + { + "epoch": 0.08, + "learning_rate": 2.8079129111553316e-07, + "logits/chosen": -1.9139106273651123, + "logits/rejected": -2.134880542755127, + "logps/chosen": -403.826904296875, + "logps/rejected": -269.96380615234375, + "loss": 0.5884, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6947274208068848, + "rewards/margins": 0.7175753712654114, + "rewards/rejected": -1.412302851676941, + "step": 678 + }, + { + "epoch": 0.08, + "learning_rate": 2.807561746459089e-07, + "logits/chosen": -2.521139621734619, + "logits/rejected": -2.4943532943725586, + "logps/chosen": -398.18084716796875, + "logps/rejected": -367.4175720214844, + "loss": 0.2932, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6317983865737915, + "rewards/margins": 1.6558722257614136, + "rewards/rejected": -2.287670612335205, + "step": 679 + }, + { + "epoch": 0.08, + "learning_rate": 2.8072105817628467e-07, + "logits/chosen": -2.245542526245117, + "logits/rejected": -2.329190492630005, + "logps/chosen": -254.12753295898438, + "logps/rejected": -266.9394836425781, + "loss": 0.6241, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1679702997207642, + "rewards/margins": 1.099932074546814, + "rewards/rejected": -2.267902374267578, + "step": 680 + }, + { + "epoch": 0.08, + "learning_rate": 2.806859417066604e-07, + "logits/chosen": -2.6794023513793945, + "logits/rejected": -2.395974636077881, + "logps/chosen": -302.4727783203125, + "logps/rejected": -229.53048706054688, + "loss": 0.6483, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0870378017425537, + "rewards/margins": 0.8876568675041199, + "rewards/rejected": -1.9746947288513184, + "step": 681 + }, + { + "epoch": 0.08, + "learning_rate": 2.806508252370362e-07, + "logits/chosen": -1.8205779790878296, + "logits/rejected": -1.8277878761291504, + "logps/chosen": -428.9998779296875, + "logps/rejected": -500.19024658203125, + "loss": 0.8324, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4409232139587402, + "rewards/margins": 0.22413292527198792, + "rewards/rejected": -1.6650559902191162, + "step": 682 + }, + { + "epoch": 0.08, + "learning_rate": 2.8061570876741193e-07, + "logits/chosen": -2.32907772064209, + "logits/rejected": -2.290468692779541, + "logps/chosen": -239.64076232910156, + "logps/rejected": -258.4351806640625, + "loss": 0.6413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6903963088989258, + "rewards/margins": 0.6951419115066528, + "rewards/rejected": -1.3855382204055786, + "step": 683 + }, + { + "epoch": 0.08, + "learning_rate": 2.8058059229778763e-07, + "logits/chosen": -2.131488800048828, + "logits/rejected": -2.299503803253174, + "logps/chosen": -320.7708740234375, + "logps/rejected": -240.27822875976562, + "loss": 0.5854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6859288215637207, + "rewards/margins": 0.5089601874351501, + "rewards/rejected": -1.1948890686035156, + "step": 684 + }, + { + "epoch": 0.08, + "learning_rate": 2.805454758281634e-07, + "logits/chosen": -2.0032408237457275, + "logits/rejected": -2.208956480026245, + "logps/chosen": -314.1114501953125, + "logps/rejected": -304.3875427246094, + "loss": 0.5756, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0647172927856445, + "rewards/margins": 0.8593162894248962, + "rewards/rejected": -1.9240336418151855, + "step": 685 + }, + { + "epoch": 0.08, + "learning_rate": 2.8051035935853914e-07, + "logits/chosen": -2.2362003326416016, + "logits/rejected": -2.5087814331054688, + "logps/chosen": -199.20481872558594, + "logps/rejected": -178.01394653320312, + "loss": 0.6303, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0022857189178467, + "rewards/margins": 0.7246631979942322, + "rewards/rejected": -1.726948857307434, + "step": 686 + }, + { + "epoch": 0.08, + "learning_rate": 2.804752428889149e-07, + "logits/chosen": -3.0552687644958496, + "logits/rejected": -2.900855541229248, + "logps/chosen": -344.8083801269531, + "logps/rejected": -154.27078247070312, + "loss": 0.5562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21084101498126984, + "rewards/margins": 1.6063833236694336, + "rewards/rejected": -1.817224383354187, + "step": 687 + }, + { + "epoch": 0.08, + "learning_rate": 2.8044012641929065e-07, + "logits/chosen": -1.7707819938659668, + "logits/rejected": -1.92073655128479, + "logps/chosen": -507.3825988769531, + "logps/rejected": -381.27911376953125, + "loss": 0.7936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0641158819198608, + "rewards/margins": 0.47017186880111694, + "rewards/rejected": -1.5342878103256226, + "step": 688 + }, + { + "epoch": 0.08, + "learning_rate": 2.8040500994966635e-07, + "logits/chosen": -2.02783465385437, + "logits/rejected": -2.349900960922241, + "logps/chosen": -429.06976318359375, + "logps/rejected": -327.5303955078125, + "loss": 0.4573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5310387015342712, + "rewards/margins": 1.403000831604004, + "rewards/rejected": -1.93403959274292, + "step": 689 + }, + { + "epoch": 0.08, + "learning_rate": 2.8036989348004216e-07, + "logits/chosen": -2.6429057121276855, + "logits/rejected": -2.3235366344451904, + "logps/chosen": -238.00787353515625, + "logps/rejected": -326.2555236816406, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37105637788772583, + "rewards/margins": 2.093505859375, + "rewards/rejected": -2.464562177658081, + "step": 690 + }, + { + "epoch": 0.08, + "learning_rate": 2.803347770104179e-07, + "logits/chosen": -2.029834032058716, + "logits/rejected": -2.1742172241210938, + "logps/chosen": -304.9882507324219, + "logps/rejected": -253.61981201171875, + "loss": 1.0707, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9432415962219238, + "rewards/margins": -0.5239319801330566, + "rewards/rejected": -0.4193095564842224, + "step": 691 + }, + { + "epoch": 0.08, + "learning_rate": 2.802996605407936e-07, + "logits/chosen": -2.6132450103759766, + "logits/rejected": -2.6220972537994385, + "logps/chosen": -304.1394958496094, + "logps/rejected": -239.97413635253906, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2799456715583801, + "rewards/margins": 1.3886350393295288, + "rewards/rejected": -1.6685807704925537, + "step": 692 + }, + { + "epoch": 0.08, + "learning_rate": 2.8026454407116936e-07, + "logits/chosen": -2.1360645294189453, + "logits/rejected": -2.084162712097168, + "logps/chosen": -204.8944549560547, + "logps/rejected": -230.28707885742188, + "loss": 0.3972, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5383663177490234, + "rewards/margins": 1.3008499145507812, + "rewards/rejected": -1.8392161130905151, + "step": 693 + }, + { + "epoch": 0.08, + "learning_rate": 2.802294276015451e-07, + "logits/chosen": -2.4438400268554688, + "logits/rejected": -2.4935483932495117, + "logps/chosen": -142.31088256835938, + "logps/rejected": -205.71018981933594, + "loss": 0.4965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20381927490234375, + "rewards/margins": 0.7934083342552185, + "rewards/rejected": -0.9972276091575623, + "step": 694 + }, + { + "epoch": 0.08, + "learning_rate": 2.8019431113192087e-07, + "logits/chosen": -2.5919721126556396, + "logits/rejected": -2.6185994148254395, + "logps/chosen": -122.76292419433594, + "logps/rejected": -151.73583984375, + "loss": 0.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6389726996421814, + "rewards/margins": 0.6780520677566528, + "rewards/rejected": -1.317024827003479, + "step": 695 + }, + { + "epoch": 0.08, + "learning_rate": 2.801591946622966e-07, + "logits/chosen": -2.7582859992980957, + "logits/rejected": -2.7515788078308105, + "logps/chosen": -314.8951110839844, + "logps/rejected": -222.64935302734375, + "loss": 0.5774, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2371395826339722, + "rewards/margins": 0.9459854364395142, + "rewards/rejected": -2.1831250190734863, + "step": 696 + }, + { + "epoch": 0.08, + "learning_rate": 2.8012407819267233e-07, + "logits/chosen": -2.73009991645813, + "logits/rejected": -2.6731438636779785, + "logps/chosen": -327.2347412109375, + "logps/rejected": -330.67388916015625, + "loss": 0.6935, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7888697385787964, + "rewards/margins": 0.5356478691101074, + "rewards/rejected": -1.3245177268981934, + "step": 697 + }, + { + "epoch": 0.08, + "learning_rate": 2.800889617230481e-07, + "logits/chosen": -2.2615129947662354, + "logits/rejected": -2.0630664825439453, + "logps/chosen": -648.8883056640625, + "logps/rejected": -498.21063232421875, + "loss": 0.5019, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38887959718704224, + "rewards/margins": 1.6598315238952637, + "rewards/rejected": -2.048710823059082, + "step": 698 + }, + { + "epoch": 0.08, + "learning_rate": 2.8005384525342383e-07, + "logits/chosen": -2.316779851913452, + "logits/rejected": -2.3059120178222656, + "logps/chosen": -235.22337341308594, + "logps/rejected": -253.91859436035156, + "loss": 0.5021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9778397083282471, + "rewards/margins": 0.6402310729026794, + "rewards/rejected": -1.6180708408355713, + "step": 699 + }, + { + "epoch": 0.08, + "learning_rate": 2.800187287837996e-07, + "logits/chosen": -2.1962263584136963, + "logits/rejected": -2.1431996822357178, + "logps/chosen": -216.54037475585938, + "logps/rejected": -274.9383544921875, + "loss": 0.3712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17748500406742096, + "rewards/margins": 1.5366101264953613, + "rewards/rejected": -1.714095115661621, + "step": 700 + }, + { + "epoch": 0.08, + "learning_rate": 2.7998361231417534e-07, + "logits/chosen": -2.937756061553955, + "logits/rejected": -2.882340431213379, + "logps/chosen": -349.8114929199219, + "logps/rejected": -309.05645751953125, + "loss": 0.4199, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8099647164344788, + "rewards/margins": 1.6379032135009766, + "rewards/rejected": -2.4478681087493896, + "step": 701 + }, + { + "epoch": 0.08, + "learning_rate": 2.7994849584455104e-07, + "logits/chosen": -2.79178524017334, + "logits/rejected": -2.6943252086639404, + "logps/chosen": -166.79367065429688, + "logps/rejected": -189.70457458496094, + "loss": 0.6739, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.285910964012146, + "rewards/margins": 0.8443875312805176, + "rewards/rejected": -2.130298376083374, + "step": 702 + }, + { + "epoch": 0.08, + "learning_rate": 2.7991337937492685e-07, + "logits/chosen": -2.5196568965911865, + "logits/rejected": -2.6160190105438232, + "logps/chosen": -155.53172302246094, + "logps/rejected": -110.91557312011719, + "loss": 0.5247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42316287755966187, + "rewards/margins": 1.0138916969299316, + "rewards/rejected": -1.4370546340942383, + "step": 703 + }, + { + "epoch": 0.08, + "learning_rate": 2.798782629053026e-07, + "logits/chosen": -2.086639404296875, + "logits/rejected": -1.9545331001281738, + "logps/chosen": -299.3009033203125, + "logps/rejected": -359.3966369628906, + "loss": 0.2084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3349303603172302, + "rewards/margins": 2.018390655517578, + "rewards/rejected": -2.353321075439453, + "step": 704 + }, + { + "epoch": 0.08, + "learning_rate": 2.798431464356783e-07, + "logits/chosen": -2.4301352500915527, + "logits/rejected": -2.6982836723327637, + "logps/chosen": -326.6411437988281, + "logps/rejected": -202.58250427246094, + "loss": 1.7672, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.2091939449310303, + "rewards/margins": -1.0574036836624146, + "rewards/rejected": -1.1517902612686157, + "step": 705 + }, + { + "epoch": 0.08, + "learning_rate": 2.7980802996605406e-07, + "logits/chosen": -1.9978244304656982, + "logits/rejected": -2.031765937805176, + "logps/chosen": -626.57861328125, + "logps/rejected": -517.2281494140625, + "loss": 0.8596, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2596077919006348, + "rewards/margins": -0.040851205587387085, + "rewards/rejected": -1.2187566757202148, + "step": 706 + }, + { + "epoch": 0.08, + "learning_rate": 2.797729134964298e-07, + "logits/chosen": -2.2657134532928467, + "logits/rejected": -1.8404864072799683, + "logps/chosen": -213.7327880859375, + "logps/rejected": -280.5069580078125, + "loss": 0.4814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5165994167327881, + "rewards/margins": 1.6004754304885864, + "rewards/rejected": -2.117074966430664, + "step": 707 + }, + { + "epoch": 0.08, + "learning_rate": 2.7973779702680557e-07, + "logits/chosen": -2.5539488792419434, + "logits/rejected": -2.6839704513549805, + "logps/chosen": -431.525146484375, + "logps/rejected": -320.3194885253906, + "loss": 0.3508, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010624215006828308, + "rewards/margins": 1.9260873794555664, + "rewards/rejected": -1.9154632091522217, + "step": 708 + }, + { + "epoch": 0.08, + "learning_rate": 2.797026805571813e-07, + "logits/chosen": -2.479395627975464, + "logits/rejected": -2.5525150299072266, + "logps/chosen": -225.06614685058594, + "logps/rejected": -194.84739685058594, + "loss": 0.6278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6600273251533508, + "rewards/margins": 1.4810587167739868, + "rewards/rejected": -2.1410861015319824, + "step": 709 + }, + { + "epoch": 0.08, + "learning_rate": 2.79667564087557e-07, + "logits/chosen": -2.2936363220214844, + "logits/rejected": -2.355228900909424, + "logps/chosen": -396.47802734375, + "logps/rejected": -378.2413024902344, + "loss": 0.4136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6209701299667358, + "rewards/margins": 1.2491085529327393, + "rewards/rejected": -1.8700788021087646, + "step": 710 + }, + { + "epoch": 0.08, + "learning_rate": 2.796324476179328e-07, + "logits/chosen": -2.680684804916382, + "logits/rejected": -2.72678804397583, + "logps/chosen": -278.40045166015625, + "logps/rejected": -201.2493133544922, + "loss": 0.2311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09749492257833481, + "rewards/margins": 2.143585681915283, + "rewards/rejected": -2.2410805225372314, + "step": 711 + }, + { + "epoch": 0.08, + "learning_rate": 2.795973311483086e-07, + "logits/chosen": -2.416015148162842, + "logits/rejected": -2.6446142196655273, + "logps/chosen": -318.6568603515625, + "logps/rejected": -371.7214660644531, + "loss": 0.3889, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49673452973365784, + "rewards/margins": 1.4491279125213623, + "rewards/rejected": -1.9458624124526978, + "step": 712 + }, + { + "epoch": 0.08, + "learning_rate": 2.795622146786843e-07, + "logits/chosen": -2.478419780731201, + "logits/rejected": -2.3824496269226074, + "logps/chosen": -188.89947509765625, + "logps/rejected": -212.24905395507812, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9767972230911255, + "rewards/margins": 0.6111893653869629, + "rewards/rejected": -2.587986469268799, + "step": 713 + }, + { + "epoch": 0.08, + "learning_rate": 2.7952709820906004e-07, + "logits/chosen": -2.3635904788970947, + "logits/rejected": -2.8019320964813232, + "logps/chosen": -404.677001953125, + "logps/rejected": -173.48504638671875, + "loss": 0.5348, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07509668171405792, + "rewards/margins": 0.47066807746887207, + "rewards/rejected": -0.5457647442817688, + "step": 714 + }, + { + "epoch": 0.08, + "learning_rate": 2.794919817394358e-07, + "logits/chosen": -2.405529499053955, + "logits/rejected": -2.4304258823394775, + "logps/chosen": -294.8157958984375, + "logps/rejected": -273.005859375, + "loss": 0.4242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3974977433681488, + "rewards/margins": 0.9809131622314453, + "rewards/rejected": -1.3784109354019165, + "step": 715 + }, + { + "epoch": 0.08, + "learning_rate": 2.7945686526981154e-07, + "logits/chosen": -2.5095744132995605, + "logits/rejected": -2.3894219398498535, + "logps/chosen": -53.419830322265625, + "logps/rejected": -190.56390380859375, + "loss": 0.3072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30214959383010864, + "rewards/margins": 1.2248347997665405, + "rewards/rejected": -1.526984453201294, + "step": 716 + }, + { + "epoch": 0.08, + "learning_rate": 2.794217488001873e-07, + "logits/chosen": -2.0006296634674072, + "logits/rejected": -1.9395371675491333, + "logps/chosen": -348.99505615234375, + "logps/rejected": -307.7448425292969, + "loss": 0.2834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18990512192249298, + "rewards/margins": 1.2684824466705322, + "rewards/rejected": -1.4583876132965088, + "step": 717 + }, + { + "epoch": 0.08, + "learning_rate": 2.79386632330563e-07, + "logits/chosen": -2.539247989654541, + "logits/rejected": -2.58859920501709, + "logps/chosen": -197.9044952392578, + "logps/rejected": -221.37893676757812, + "loss": 0.4654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5534723997116089, + "rewards/margins": 1.717210054397583, + "rewards/rejected": -2.2706825733184814, + "step": 718 + }, + { + "epoch": 0.08, + "learning_rate": 2.7935151586093875e-07, + "logits/chosen": -1.9093672037124634, + "logits/rejected": -2.4705100059509277, + "logps/chosen": -434.5914611816406, + "logps/rejected": -296.814453125, + "loss": 0.3162, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.616600513458252, + "rewards/margins": 2.1367344856262207, + "rewards/rejected": -2.7533347606658936, + "step": 719 + }, + { + "epoch": 0.08, + "learning_rate": 2.793163993913145e-07, + "logits/chosen": -1.9828343391418457, + "logits/rejected": -2.2419931888580322, + "logps/chosen": -461.78021240234375, + "logps/rejected": -250.46038818359375, + "loss": 0.6498, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9554646015167236, + "rewards/margins": 0.3962123990058899, + "rewards/rejected": -1.3516771793365479, + "step": 720 + }, + { + "epoch": 0.08, + "learning_rate": 2.7928128292169026e-07, + "logits/chosen": -2.449625015258789, + "logits/rejected": -2.633025884628296, + "logps/chosen": -322.92681884765625, + "logps/rejected": -253.07508850097656, + "loss": 1.1237, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2268157005310059, + "rewards/margins": 0.5682933330535889, + "rewards/rejected": -1.7951092720031738, + "step": 721 + }, + { + "epoch": 0.08, + "learning_rate": 2.79246166452066e-07, + "logits/chosen": -2.4509997367858887, + "logits/rejected": -2.3347790241241455, + "logps/chosen": -455.2037048339844, + "logps/rejected": -398.1317443847656, + "loss": 0.3626, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5489825010299683, + "rewards/margins": 1.6501595973968506, + "rewards/rejected": -2.1991419792175293, + "step": 722 + }, + { + "epoch": 0.08, + "learning_rate": 2.792110499824417e-07, + "logits/chosen": -2.865849018096924, + "logits/rejected": -2.897186517715454, + "logps/chosen": -185.42413330078125, + "logps/rejected": -212.64036560058594, + "loss": 0.4476, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35055071115493774, + "rewards/margins": 1.4514844417572021, + "rewards/rejected": -1.8020353317260742, + "step": 723 + }, + { + "epoch": 0.08, + "learning_rate": 2.791759335128175e-07, + "logits/chosen": -2.0101821422576904, + "logits/rejected": -1.9046530723571777, + "logps/chosen": -326.971923828125, + "logps/rejected": -221.34593200683594, + "loss": 0.5356, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2178099155426025, + "rewards/margins": 1.0890583992004395, + "rewards/rejected": -2.306868553161621, + "step": 724 + }, + { + "epoch": 0.08, + "learning_rate": 2.791408170431933e-07, + "logits/chosen": -2.0405972003936768, + "logits/rejected": -2.632497787475586, + "logps/chosen": -384.88720703125, + "logps/rejected": -201.63937377929688, + "loss": 0.3033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03389453887939453, + "rewards/margins": 1.4113130569458008, + "rewards/rejected": -1.4452075958251953, + "step": 725 + }, + { + "epoch": 0.08, + "learning_rate": 2.79105700573569e-07, + "logits/chosen": -2.103893280029297, + "logits/rejected": -1.7989366054534912, + "logps/chosen": -393.6510314941406, + "logps/rejected": -406.809326171875, + "loss": 0.6394, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7649280428886414, + "rewards/margins": 0.3755410611629486, + "rewards/rejected": -1.1404691934585571, + "step": 726 + }, + { + "epoch": 0.08, + "learning_rate": 2.7907058410394473e-07, + "logits/chosen": -2.1564762592315674, + "logits/rejected": -2.2705469131469727, + "logps/chosen": -423.9773254394531, + "logps/rejected": -173.9842071533203, + "loss": 0.6089, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25945907831192017, + "rewards/margins": 0.5086768269538879, + "rewards/rejected": -0.7681358456611633, + "step": 727 + }, + { + "epoch": 0.08, + "learning_rate": 2.790354676343205e-07, + "logits/chosen": -2.904965877532959, + "logits/rejected": -2.9615092277526855, + "logps/chosen": -98.97711944580078, + "logps/rejected": -169.60946655273438, + "loss": 0.3051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27126768231391907, + "rewards/margins": 1.8665122985839844, + "rewards/rejected": -2.137779951095581, + "step": 728 + }, + { + "epoch": 0.08, + "learning_rate": 2.7900035116469624e-07, + "logits/chosen": -2.4797611236572266, + "logits/rejected": -2.2871885299682617, + "logps/chosen": -169.9053955078125, + "logps/rejected": -264.39288330078125, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1380294561386108, + "rewards/margins": 2.1550018787384033, + "rewards/rejected": -3.2930312156677246, + "step": 729 + }, + { + "epoch": 0.08, + "learning_rate": 2.78965234695072e-07, + "logits/chosen": -2.00386381149292, + "logits/rejected": -2.5667591094970703, + "logps/chosen": -514.843017578125, + "logps/rejected": -270.07464599609375, + "loss": 0.5209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34354862570762634, + "rewards/margins": 1.8427236080169678, + "rewards/rejected": -2.186272144317627, + "step": 730 + }, + { + "epoch": 0.08, + "learning_rate": 2.789301182254477e-07, + "logits/chosen": -1.9726481437683105, + "logits/rejected": -1.8368017673492432, + "logps/chosen": -381.16925048828125, + "logps/rejected": -315.464111328125, + "loss": 0.2257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12431156635284424, + "rewards/margins": 2.3253610134124756, + "rewards/rejected": -2.4496724605560303, + "step": 731 + }, + { + "epoch": 0.08, + "learning_rate": 2.7889500175582345e-07, + "logits/chosen": -2.8554983139038086, + "logits/rejected": -2.8587186336517334, + "logps/chosen": -293.8827819824219, + "logps/rejected": -249.01268005371094, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7902874946594238, + "rewards/margins": 2.3163700103759766, + "rewards/rejected": -3.1066575050354004, + "step": 732 + }, + { + "epoch": 0.08, + "learning_rate": 2.788598852861992e-07, + "logits/chosen": -2.3022568225860596, + "logits/rejected": -2.390946388244629, + "logps/chosen": -322.617919921875, + "logps/rejected": -188.49871826171875, + "loss": 0.346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7357900142669678, + "rewards/margins": 1.1942998170852661, + "rewards/rejected": -1.9300897121429443, + "step": 733 + }, + { + "epoch": 0.08, + "learning_rate": 2.7882476881657496e-07, + "logits/chosen": -2.3073623180389404, + "logits/rejected": -2.478757858276367, + "logps/chosen": -145.08685302734375, + "logps/rejected": -149.0207977294922, + "loss": 0.8601, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8915119767189026, + "rewards/margins": 0.3156804144382477, + "rewards/rejected": -1.2071921825408936, + "step": 734 + }, + { + "epoch": 0.08, + "learning_rate": 2.787896523469507e-07, + "logits/chosen": -2.6265363693237305, + "logits/rejected": -2.2809383869171143, + "logps/chosen": -206.03904724121094, + "logps/rejected": -314.41082763671875, + "loss": 0.3057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9376669526100159, + "rewards/margins": 1.5097805261611938, + "rewards/rejected": -2.4474472999572754, + "step": 735 + }, + { + "epoch": 0.08, + "learning_rate": 2.7875453587732646e-07, + "logits/chosen": -2.133500814437866, + "logits/rejected": -2.4068775177001953, + "logps/chosen": -368.5870056152344, + "logps/rejected": -299.3402404785156, + "loss": 0.5493, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9990691542625427, + "rewards/margins": 1.0668139457702637, + "rewards/rejected": -2.065883159637451, + "step": 736 + }, + { + "epoch": 0.08, + "learning_rate": 2.787194194077022e-07, + "logits/chosen": -2.701331615447998, + "logits/rejected": -2.667832374572754, + "logps/chosen": -290.7927551269531, + "logps/rejected": -206.81031799316406, + "loss": 0.5882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47529274225234985, + "rewards/margins": 1.0503007173538208, + "rewards/rejected": -1.5255934000015259, + "step": 737 + }, + { + "epoch": 0.09, + "learning_rate": 2.7868430293807797e-07, + "logits/chosen": -2.747249126434326, + "logits/rejected": -2.7000484466552734, + "logps/chosen": -139.12286376953125, + "logps/rejected": -162.1671142578125, + "loss": 0.4171, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.228592187166214, + "rewards/margins": 1.4609832763671875, + "rewards/rejected": -1.6895753145217896, + "step": 738 + }, + { + "epoch": 0.09, + "learning_rate": 2.7864918646845367e-07, + "logits/chosen": -2.4657888412475586, + "logits/rejected": -2.3490302562713623, + "logps/chosen": -163.99191284179688, + "logps/rejected": -245.33551025390625, + "loss": 0.266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4627310335636139, + "rewards/margins": 2.484276294708252, + "rewards/rejected": -2.947007417678833, + "step": 739 + }, + { + "epoch": 0.09, + "learning_rate": 2.786140699988294e-07, + "logits/chosen": -2.638301372528076, + "logits/rejected": -2.7010467052459717, + "logps/chosen": -288.8241882324219, + "logps/rejected": -233.3267059326172, + "loss": 0.3536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6250572204589844, + "rewards/margins": 1.7648452520370483, + "rewards/rejected": -2.3899025917053223, + "step": 740 + }, + { + "epoch": 0.09, + "learning_rate": 2.785789535292052e-07, + "logits/chosen": -2.856125831604004, + "logits/rejected": -2.620739459991455, + "logps/chosen": -356.588134765625, + "logps/rejected": -271.87347412109375, + "loss": 0.1792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23812440037727356, + "rewards/margins": 2.3147246837615967, + "rewards/rejected": -2.552849054336548, + "step": 741 + }, + { + "epoch": 0.09, + "learning_rate": 2.7854383705958093e-07, + "logits/chosen": -2.5585265159606934, + "logits/rejected": -2.4543983936309814, + "logps/chosen": -248.7747802734375, + "logps/rejected": -251.97425842285156, + "loss": 0.312, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31925728917121887, + "rewards/margins": 1.6202192306518555, + "rewards/rejected": -1.939476490020752, + "step": 742 + }, + { + "epoch": 0.09, + "learning_rate": 2.785087205899567e-07, + "logits/chosen": -2.247288703918457, + "logits/rejected": -2.2474498748779297, + "logps/chosen": -253.24298095703125, + "logps/rejected": -221.65853881835938, + "loss": 0.4024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10257603228092194, + "rewards/margins": 1.8945385217666626, + "rewards/rejected": -1.997114658355713, + "step": 743 + }, + { + "epoch": 0.09, + "learning_rate": 2.7847360412033244e-07, + "logits/chosen": -1.8821656703948975, + "logits/rejected": -2.251154661178589, + "logps/chosen": -403.494140625, + "logps/rejected": -319.9723205566406, + "loss": 0.5001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16629216074943542, + "rewards/margins": 1.2086544036865234, + "rewards/rejected": -1.3749465942382812, + "step": 744 + }, + { + "epoch": 0.09, + "learning_rate": 2.7843848765070814e-07, + "logits/chosen": -2.2722883224487305, + "logits/rejected": -2.4901885986328125, + "logps/chosen": -230.59585571289062, + "logps/rejected": -219.68524169921875, + "loss": 0.6433, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7060167789459229, + "rewards/margins": 0.9292857646942139, + "rewards/rejected": -1.6353026628494263, + "step": 745 + }, + { + "epoch": 0.09, + "learning_rate": 2.7840337118108395e-07, + "logits/chosen": -2.292673349380493, + "logits/rejected": -2.436624765396118, + "logps/chosen": -85.56108093261719, + "logps/rejected": -125.19706726074219, + "loss": 0.5999, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5408560633659363, + "rewards/margins": 0.6594398021697998, + "rewards/rejected": -1.2002959251403809, + "step": 746 + }, + { + "epoch": 0.09, + "learning_rate": 2.7836825471145965e-07, + "logits/chosen": -2.428077459335327, + "logits/rejected": -2.225961923599243, + "logps/chosen": -284.57940673828125, + "logps/rejected": -418.7127685546875, + "loss": 0.3075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20279741287231445, + "rewards/margins": 2.1873555183410645, + "rewards/rejected": -2.390153169631958, + "step": 747 + }, + { + "epoch": 0.09, + "learning_rate": 2.783331382418354e-07, + "logits/chosen": -2.597282648086548, + "logits/rejected": -2.702962875366211, + "logps/chosen": -206.4195556640625, + "logps/rejected": -365.14056396484375, + "loss": 0.527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8358706831932068, + "rewards/margins": 0.9170832633972168, + "rewards/rejected": -1.7529540061950684, + "step": 748 + }, + { + "epoch": 0.09, + "learning_rate": 2.7829802177221116e-07, + "logits/chosen": -1.9807459115982056, + "logits/rejected": -2.003068447113037, + "logps/chosen": -255.17144775390625, + "logps/rejected": -264.5368957519531, + "loss": 0.4063, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.032508306205272675, + "rewards/margins": 1.1680909395217896, + "rewards/rejected": -1.200599193572998, + "step": 749 + }, + { + "epoch": 0.09, + "learning_rate": 2.782629053025869e-07, + "logits/chosen": -2.45750093460083, + "logits/rejected": -2.52091121673584, + "logps/chosen": -383.9385986328125, + "logps/rejected": -213.15960693359375, + "loss": 0.3063, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0036301128566265106, + "rewards/margins": 1.401827096939087, + "rewards/rejected": -1.398197054862976, + "step": 750 + }, + { + "epoch": 0.09, + "learning_rate": 2.7822778883296266e-07, + "logits/chosen": -2.3049614429473877, + "logits/rejected": -2.0991125106811523, + "logps/chosen": -213.64395141601562, + "logps/rejected": -276.2122497558594, + "loss": 0.4695, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.421783447265625, + "rewards/margins": 1.61344575881958, + "rewards/rejected": -3.035229206085205, + "step": 751 + }, + { + "epoch": 0.09, + "learning_rate": 2.7819267236333837e-07, + "logits/chosen": -2.357948064804077, + "logits/rejected": -2.619699478149414, + "logps/chosen": -182.0171356201172, + "logps/rejected": -211.67364501953125, + "loss": 0.547, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13372477889060974, + "rewards/margins": 0.6995238065719604, + "rewards/rejected": -0.8332486748695374, + "step": 752 + }, + { + "epoch": 0.09, + "learning_rate": 2.781575558937141e-07, + "logits/chosen": -2.7189061641693115, + "logits/rejected": -2.686493158340454, + "logps/chosen": -310.7188720703125, + "logps/rejected": -220.33853149414062, + "loss": 0.2728, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2922452688217163, + "rewards/margins": 2.1335482597351074, + "rewards/rejected": -2.4257936477661133, + "step": 753 + }, + { + "epoch": 0.09, + "learning_rate": 2.7812243942408987e-07, + "logits/chosen": -2.558136224746704, + "logits/rejected": -2.310641288757324, + "logps/chosen": -468.5171203613281, + "logps/rejected": -374.7884521484375, + "loss": 0.442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49272096157073975, + "rewards/margins": 2.001638889312744, + "rewards/rejected": -2.4943599700927734, + "step": 754 + }, + { + "epoch": 0.09, + "learning_rate": 2.7808732295446563e-07, + "logits/chosen": -2.8514609336853027, + "logits/rejected": -2.716670513153076, + "logps/chosen": -384.1571044921875, + "logps/rejected": -180.96400451660156, + "loss": 0.3437, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43284541368484497, + "rewards/margins": 1.706430435180664, + "rewards/rejected": -2.1392757892608643, + "step": 755 + }, + { + "epoch": 0.09, + "learning_rate": 2.780522064848414e-07, + "logits/chosen": -2.8035736083984375, + "logits/rejected": -2.7277421951293945, + "logps/chosen": -146.4972686767578, + "logps/rejected": -379.0042724609375, + "loss": 0.2885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38977399468421936, + "rewards/margins": 3.091031789779663, + "rewards/rejected": -3.4808061122894287, + "step": 756 + }, + { + "epoch": 0.09, + "learning_rate": 2.7801709001521714e-07, + "logits/chosen": -2.142362117767334, + "logits/rejected": -2.4062600135803223, + "logps/chosen": -446.80615234375, + "logps/rejected": -249.9421844482422, + "loss": 0.9306, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.310259461402893, + "rewards/margins": -0.13450844585895538, + "rewards/rejected": -1.175750970840454, + "step": 757 + }, + { + "epoch": 0.09, + "learning_rate": 2.779819735455929e-07, + "logits/chosen": -2.1674728393554688, + "logits/rejected": -2.1524808406829834, + "logps/chosen": -196.52386474609375, + "logps/rejected": -292.43218994140625, + "loss": 0.1807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3466280698776245, + "rewards/margins": 2.4361257553100586, + "rewards/rejected": -2.7827539443969727, + "step": 758 + }, + { + "epoch": 0.09, + "learning_rate": 2.7794685707596864e-07, + "logits/chosen": -1.9704647064208984, + "logits/rejected": -1.731384515762329, + "logps/chosen": -250.8128204345703, + "logps/rejected": -370.5721130371094, + "loss": 0.3614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6528977155685425, + "rewards/margins": 1.128127098083496, + "rewards/rejected": -1.7810248136520386, + "step": 759 + }, + { + "epoch": 0.09, + "learning_rate": 2.7791174060634434e-07, + "logits/chosen": -2.289867877960205, + "logits/rejected": -2.2141268253326416, + "logps/chosen": -206.8010711669922, + "logps/rejected": -249.180908203125, + "loss": 0.4391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2911333441734314, + "rewards/margins": 1.5061633586883545, + "rewards/rejected": -1.7972966432571411, + "step": 760 + }, + { + "epoch": 0.09, + "learning_rate": 2.778766241367201e-07, + "logits/chosen": -2.644782543182373, + "logits/rejected": -2.7462925910949707, + "logps/chosen": -302.6006164550781, + "logps/rejected": -258.2138671875, + "loss": 0.2969, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1155666932463646, + "rewards/margins": 1.7323131561279297, + "rewards/rejected": -1.6167463064193726, + "step": 761 + }, + { + "epoch": 0.09, + "learning_rate": 2.7784150766709585e-07, + "logits/chosen": -2.731560230255127, + "logits/rejected": -2.782796621322632, + "logps/chosen": -225.71603393554688, + "logps/rejected": -228.3634033203125, + "loss": 0.817, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4918091297149658, + "rewards/margins": 0.6476296186447144, + "rewards/rejected": -2.1394386291503906, + "step": 762 + }, + { + "epoch": 0.09, + "learning_rate": 2.778063911974716e-07, + "logits/chosen": -2.8393476009368896, + "logits/rejected": -2.593891143798828, + "logps/chosen": -343.1087646484375, + "logps/rejected": -390.2741394042969, + "loss": 0.5207, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31450292468070984, + "rewards/margins": 0.8706681132316589, + "rewards/rejected": -1.1851708889007568, + "step": 763 + }, + { + "epoch": 0.09, + "learning_rate": 2.7777127472784736e-07, + "logits/chosen": -1.792628526687622, + "logits/rejected": -1.9294941425323486, + "logps/chosen": -302.90631103515625, + "logps/rejected": -227.48141479492188, + "loss": 0.7669, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9863220453262329, + "rewards/margins": 0.43321341276168823, + "rewards/rejected": -1.4195353984832764, + "step": 764 + }, + { + "epoch": 0.09, + "learning_rate": 2.777361582582231e-07, + "logits/chosen": -1.918544054031372, + "logits/rejected": -1.5655250549316406, + "logps/chosen": -350.36456298828125, + "logps/rejected": -472.0162353515625, + "loss": 0.432, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0057237148284912, + "rewards/margins": 1.4663970470428467, + "rewards/rejected": -2.472120761871338, + "step": 765 + }, + { + "epoch": 0.09, + "learning_rate": 2.777010417885988e-07, + "logits/chosen": -2.4687440395355225, + "logits/rejected": -2.268791437149048, + "logps/chosen": -269.36346435546875, + "logps/rejected": -262.65789794921875, + "loss": 0.3101, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33172348141670227, + "rewards/margins": 2.2509512901306152, + "rewards/rejected": -2.582674980163574, + "step": 766 + }, + { + "epoch": 0.09, + "learning_rate": 2.7766592531897457e-07, + "logits/chosen": -1.9792580604553223, + "logits/rejected": -2.2040436267852783, + "logps/chosen": -316.4862365722656, + "logps/rejected": -240.36778259277344, + "loss": 0.7805, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0745248794555664, + "rewards/margins": 0.8298572301864624, + "rewards/rejected": -1.9043819904327393, + "step": 767 + }, + { + "epoch": 0.09, + "learning_rate": 2.776308088493503e-07, + "logits/chosen": -2.53139328956604, + "logits/rejected": -2.373903274536133, + "logps/chosen": -403.6836853027344, + "logps/rejected": -296.85968017578125, + "loss": 0.1106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.713801622390747, + "rewards/margins": 3.4527337551116943, + "rewards/rejected": -5.166535377502441, + "step": 768 + }, + { + "epoch": 0.09, + "learning_rate": 2.775956923797261e-07, + "logits/chosen": -2.0682640075683594, + "logits/rejected": -1.8539254665374756, + "logps/chosen": -354.50701904296875, + "logps/rejected": -399.3261413574219, + "loss": 0.2172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36444664001464844, + "rewards/margins": 2.1670889854431152, + "rewards/rejected": -2.5315358638763428, + "step": 769 + }, + { + "epoch": 0.09, + "learning_rate": 2.7756057591010183e-07, + "logits/chosen": -2.42990779876709, + "logits/rejected": -2.22268009185791, + "logps/chosen": -131.23590087890625, + "logps/rejected": -218.83102416992188, + "loss": 0.3119, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5515226721763611, + "rewards/margins": 1.617897629737854, + "rewards/rejected": -2.1694202423095703, + "step": 770 + }, + { + "epoch": 0.09, + "learning_rate": 2.775254594404776e-07, + "logits/chosen": -2.3149542808532715, + "logits/rejected": -2.1969120502471924, + "logps/chosen": -268.20843505859375, + "logps/rejected": -236.57177734375, + "loss": 0.2457, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34965085983276367, + "rewards/margins": 1.7243350744247437, + "rewards/rejected": -2.073986053466797, + "step": 771 + }, + { + "epoch": 0.09, + "learning_rate": 2.7749034297085334e-07, + "logits/chosen": -1.9135124683380127, + "logits/rejected": -2.371267557144165, + "logps/chosen": -482.47222900390625, + "logps/rejected": -291.6490783691406, + "loss": 0.6157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3984224200248718, + "rewards/margins": 0.9145254492759705, + "rewards/rejected": -1.3129479885101318, + "step": 772 + }, + { + "epoch": 0.09, + "learning_rate": 2.774552265012291e-07, + "logits/chosen": -2.2902517318725586, + "logits/rejected": -2.1618540287017822, + "logps/chosen": -324.3194580078125, + "logps/rejected": -293.4084777832031, + "loss": 0.1783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07815402746200562, + "rewards/margins": 2.0101842880249023, + "rewards/rejected": -2.0883381366729736, + "step": 773 + }, + { + "epoch": 0.09, + "learning_rate": 2.774201100316048e-07, + "logits/chosen": -2.3209140300750732, + "logits/rejected": -2.0561859607696533, + "logps/chosen": -265.65740966796875, + "logps/rejected": -312.0859069824219, + "loss": 0.4827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4054551124572754, + "rewards/margins": 0.9727126955986023, + "rewards/rejected": -1.3781678676605225, + "step": 774 + }, + { + "epoch": 0.09, + "learning_rate": 2.7738499356198055e-07, + "logits/chosen": -2.3458290100097656, + "logits/rejected": -2.3867058753967285, + "logps/chosen": -308.12408447265625, + "logps/rejected": -241.7742919921875, + "loss": 1.6632, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.856671094894409, + "rewards/margins": 0.08338057994842529, + "rewards/rejected": -3.940051555633545, + "step": 775 + }, + { + "epoch": 0.09, + "learning_rate": 2.773498770923563e-07, + "logits/chosen": -2.1869499683380127, + "logits/rejected": -2.3535428047180176, + "logps/chosen": -360.4064025878906, + "logps/rejected": -367.1676330566406, + "loss": 0.586, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7132984399795532, + "rewards/margins": 0.6265285015106201, + "rewards/rejected": -1.3398269414901733, + "step": 776 + }, + { + "epoch": 0.09, + "learning_rate": 2.7731476062273205e-07, + "logits/chosen": -2.5422024726867676, + "logits/rejected": -2.404184579849243, + "logps/chosen": -215.18165588378906, + "logps/rejected": -196.843994140625, + "loss": 0.6173, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8687412142753601, + "rewards/margins": 0.4853454828262329, + "rewards/rejected": -1.3540866374969482, + "step": 777 + }, + { + "epoch": 0.09, + "learning_rate": 2.772796441531078e-07, + "logits/chosen": -2.633431911468506, + "logits/rejected": -2.5928125381469727, + "logps/chosen": -251.94012451171875, + "logps/rejected": -278.18560791015625, + "loss": 0.3056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3359241485595703, + "rewards/margins": 1.2381036281585693, + "rewards/rejected": -1.5740277767181396, + "step": 778 + }, + { + "epoch": 0.09, + "learning_rate": 2.772445276834835e-07, + "logits/chosen": -2.131000518798828, + "logits/rejected": -1.8269717693328857, + "logps/chosen": -270.811279296875, + "logps/rejected": -282.6021423339844, + "loss": 0.2865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09141065925359726, + "rewards/margins": 2.154982805252075, + "rewards/rejected": -2.2463932037353516, + "step": 779 + }, + { + "epoch": 0.09, + "learning_rate": 2.772094112138593e-07, + "logits/chosen": -2.4289984703063965, + "logits/rejected": -2.779658794403076, + "logps/chosen": -476.529541015625, + "logps/rejected": -237.01864624023438, + "loss": 0.235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1594228893518448, + "rewards/margins": 1.4775404930114746, + "rewards/rejected": -1.6369632482528687, + "step": 780 + }, + { + "epoch": 0.09, + "learning_rate": 2.7717429474423507e-07, + "logits/chosen": -3.0218939781188965, + "logits/rejected": -2.9407966136932373, + "logps/chosen": -237.39239501953125, + "logps/rejected": -183.576416015625, + "loss": 0.6678, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8122608065605164, + "rewards/margins": 0.5975229740142822, + "rewards/rejected": -1.4097838401794434, + "step": 781 + }, + { + "epoch": 0.09, + "learning_rate": 2.7713917827461077e-07, + "logits/chosen": -2.660285472869873, + "logits/rejected": -2.7723500728607178, + "logps/chosen": -292.4368591308594, + "logps/rejected": -304.1983337402344, + "loss": 0.5058, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1753994226455688, + "rewards/margins": 1.6772267818450928, + "rewards/rejected": -2.852625846862793, + "step": 782 + }, + { + "epoch": 0.09, + "learning_rate": 2.771040618049865e-07, + "logits/chosen": -2.66011381149292, + "logits/rejected": -2.7857065200805664, + "logps/chosen": -811.8271484375, + "logps/rejected": -287.11883544921875, + "loss": 0.4613, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3670165538787842, + "rewards/margins": 1.9055882692337036, + "rewards/rejected": -3.2726049423217773, + "step": 783 + }, + { + "epoch": 0.09, + "learning_rate": 2.770689453353623e-07, + "logits/chosen": -2.480001449584961, + "logits/rejected": -2.253047227859497, + "logps/chosen": -185.6260223388672, + "logps/rejected": -360.7357482910156, + "loss": 0.3598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8001412153244019, + "rewards/margins": 1.9279874563217163, + "rewards/rejected": -2.728128671646118, + "step": 784 + }, + { + "epoch": 0.09, + "learning_rate": 2.7703382886573803e-07, + "logits/chosen": -2.730983018875122, + "logits/rejected": -2.5935497283935547, + "logps/chosen": -140.86058044433594, + "logps/rejected": -263.728759765625, + "loss": 0.8538, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.674119234085083, + "rewards/margins": 0.06308918446302414, + "rewards/rejected": -0.7372084856033325, + "step": 785 + }, + { + "epoch": 0.09, + "learning_rate": 2.769987123961138e-07, + "logits/chosen": -2.459770441055298, + "logits/rejected": -2.322490692138672, + "logps/chosen": -109.2947998046875, + "logps/rejected": -181.970458984375, + "loss": 0.5448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4405767023563385, + "rewards/margins": 0.5352679491043091, + "rewards/rejected": -0.9758446216583252, + "step": 786 + }, + { + "epoch": 0.09, + "learning_rate": 2.769635959264895e-07, + "logits/chosen": -2.3202192783355713, + "logits/rejected": -2.466341733932495, + "logps/chosen": -283.0531921386719, + "logps/rejected": -385.3746337890625, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36807844042778015, + "rewards/margins": 2.632978677749634, + "rewards/rejected": -3.0010571479797363, + "step": 787 + }, + { + "epoch": 0.09, + "learning_rate": 2.7692847945686524e-07, + "logits/chosen": -2.379056453704834, + "logits/rejected": -2.5452170372009277, + "logps/chosen": -221.553955078125, + "logps/rejected": -324.5672607421875, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11922778189182281, + "rewards/margins": 2.3913726806640625, + "rewards/rejected": -2.5106003284454346, + "step": 788 + }, + { + "epoch": 0.09, + "learning_rate": 2.76893362987241e-07, + "logits/chosen": -1.8078914880752563, + "logits/rejected": -1.7400660514831543, + "logps/chosen": -382.33465576171875, + "logps/rejected": -389.85369873046875, + "loss": 0.8081, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4120731353759766, + "rewards/margins": 0.16242142021656036, + "rewards/rejected": -1.5744946002960205, + "step": 789 + }, + { + "epoch": 0.09, + "learning_rate": 2.7685824651761675e-07, + "logits/chosen": -2.3011720180511475, + "logits/rejected": -2.411299467086792, + "logps/chosen": -258.9205322265625, + "logps/rejected": -262.3006591796875, + "loss": 0.2451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38116616010665894, + "rewards/margins": 1.79221773147583, + "rewards/rejected": -2.1733837127685547, + "step": 790 + }, + { + "epoch": 0.09, + "learning_rate": 2.768231300479925e-07, + "logits/chosen": -1.9385287761688232, + "logits/rejected": -1.8209927082061768, + "logps/chosen": -361.3316650390625, + "logps/rejected": -333.3839416503906, + "loss": 0.6282, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6450623273849487, + "rewards/margins": 0.2793193459510803, + "rewards/rejected": -0.9243816137313843, + "step": 791 + }, + { + "epoch": 0.09, + "learning_rate": 2.7678801357836826e-07, + "logits/chosen": -2.529025077819824, + "logits/rejected": -2.3515818119049072, + "logps/chosen": -149.78668212890625, + "logps/rejected": -156.13894653320312, + "loss": 0.4233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.679265022277832, + "rewards/margins": 1.6867103576660156, + "rewards/rejected": -2.3659753799438477, + "step": 792 + }, + { + "epoch": 0.09, + "learning_rate": 2.76752897108744e-07, + "logits/chosen": -2.3489503860473633, + "logits/rejected": -2.4997496604919434, + "logps/chosen": -368.7309875488281, + "logps/rejected": -243.73675537109375, + "loss": 0.656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8626235723495483, + "rewards/margins": 0.6128016710281372, + "rewards/rejected": -1.4754252433776855, + "step": 793 + }, + { + "epoch": 0.09, + "learning_rate": 2.7671778063911976e-07, + "logits/chosen": -2.4969687461853027, + "logits/rejected": -2.5030126571655273, + "logps/chosen": -238.98165893554688, + "logps/rejected": -175.31439208984375, + "loss": 1.5089, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9254487752914429, + "rewards/margins": -0.38255470991134644, + "rewards/rejected": -1.5428940057754517, + "step": 794 + }, + { + "epoch": 0.09, + "learning_rate": 2.7668266416949546e-07, + "logits/chosen": -2.4602303504943848, + "logits/rejected": -2.3668172359466553, + "logps/chosen": -292.673583984375, + "logps/rejected": -230.04701232910156, + "loss": 0.7572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8530017733573914, + "rewards/margins": 0.7057427763938904, + "rewards/rejected": -1.5587445497512817, + "step": 795 + }, + { + "epoch": 0.09, + "learning_rate": 2.766475476998712e-07, + "logits/chosen": -2.3963472843170166, + "logits/rejected": -2.362884044647217, + "logps/chosen": -247.59979248046875, + "logps/rejected": -218.88392639160156, + "loss": 0.5071, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8617167472839355, + "rewards/margins": 1.4561071395874023, + "rewards/rejected": -3.317823886871338, + "step": 796 + }, + { + "epoch": 0.09, + "learning_rate": 2.7661243123024697e-07, + "logits/chosen": -1.6827162504196167, + "logits/rejected": -2.108505964279175, + "logps/chosen": -206.34042358398438, + "logps/rejected": -184.33303833007812, + "loss": 0.7263, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.186894178390503, + "rewards/margins": 1.0954480171203613, + "rewards/rejected": -2.2823421955108643, + "step": 797 + }, + { + "epoch": 0.09, + "learning_rate": 2.765773147606227e-07, + "logits/chosen": -2.3807973861694336, + "logits/rejected": -2.4547011852264404, + "logps/chosen": -307.69866943359375, + "logps/rejected": -375.5882568359375, + "loss": 0.2681, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07693809270858765, + "rewards/margins": 2.692800283432007, + "rewards/rejected": -2.6158621311187744, + "step": 798 + }, + { + "epoch": 0.09, + "learning_rate": 2.765421982909985e-07, + "logits/chosen": -2.5376977920532227, + "logits/rejected": -2.642284393310547, + "logps/chosen": -258.1460876464844, + "logps/rejected": -166.19479370117188, + "loss": 0.6283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.891119122505188, + "rewards/margins": 1.082230567932129, + "rewards/rejected": -1.9733495712280273, + "step": 799 + }, + { + "epoch": 0.09, + "learning_rate": 2.765070818213742e-07, + "logits/chosen": -2.2322821617126465, + "logits/rejected": -2.202754259109497, + "logps/chosen": -277.509521484375, + "logps/rejected": -246.30523681640625, + "loss": 0.4835, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015120640397071838, + "rewards/margins": 1.1079548597335815, + "rewards/rejected": -1.092834234237671, + "step": 800 + }, + { + "epoch": 0.09, + "learning_rate": 2.7647196535174993e-07, + "logits/chosen": -1.890861988067627, + "logits/rejected": -1.8267241716384888, + "logps/chosen": -228.05889892578125, + "logps/rejected": -318.183349609375, + "loss": 0.5819, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8715710043907166, + "rewards/margins": 0.5666001439094543, + "rewards/rejected": -1.438171148300171, + "step": 801 + }, + { + "epoch": 0.09, + "learning_rate": 2.7643684888212574e-07, + "logits/chosen": -2.448352098464966, + "logits/rejected": -2.234152317047119, + "logps/chosen": -126.36993408203125, + "logps/rejected": -274.0322570800781, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3248036801815033, + "rewards/margins": 1.279839277267456, + "rewards/rejected": -1.6046429872512817, + "step": 802 + }, + { + "epoch": 0.09, + "learning_rate": 2.7640173241250144e-07, + "logits/chosen": -2.4320785999298096, + "logits/rejected": -2.1780378818511963, + "logps/chosen": -89.57301330566406, + "logps/rejected": -324.46063232421875, + "loss": 0.3254, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3537755608558655, + "rewards/margins": 1.626603603363037, + "rewards/rejected": -1.9803792238235474, + "step": 803 + }, + { + "epoch": 0.09, + "learning_rate": 2.763666159428772e-07, + "logits/chosen": -2.037156820297241, + "logits/rejected": -2.2097091674804688, + "logps/chosen": -211.17877197265625, + "logps/rejected": -200.93203735351562, + "loss": 0.746, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.927579641342163, + "rewards/margins": 1.2979315519332886, + "rewards/rejected": -4.225510597229004, + "step": 804 + }, + { + "epoch": 0.09, + "learning_rate": 2.7633149947325295e-07, + "logits/chosen": -2.3153269290924072, + "logits/rejected": -2.0538394451141357, + "logps/chosen": -161.91587829589844, + "logps/rejected": -243.3095245361328, + "loss": 0.5666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5971871614456177, + "rewards/margins": 0.658798098564148, + "rewards/rejected": -1.2559852600097656, + "step": 805 + }, + { + "epoch": 0.09, + "learning_rate": 2.762963830036287e-07, + "logits/chosen": -2.7697856426239014, + "logits/rejected": -2.866651773452759, + "logps/chosen": -300.8506164550781, + "logps/rejected": -264.66619873046875, + "loss": 0.5743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7335798740386963, + "rewards/margins": 0.8006190061569214, + "rewards/rejected": -1.5341989994049072, + "step": 806 + }, + { + "epoch": 0.09, + "learning_rate": 2.7626126653400446e-07, + "logits/chosen": -2.239635944366455, + "logits/rejected": -2.286318778991699, + "logps/chosen": -330.79437255859375, + "logps/rejected": -265.9227294921875, + "loss": 0.354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3670031726360321, + "rewards/margins": 1.3751349449157715, + "rewards/rejected": -1.742138147354126, + "step": 807 + }, + { + "epoch": 0.09, + "learning_rate": 2.7622615006438016e-07, + "logits/chosen": -2.202141523361206, + "logits/rejected": -2.299286365509033, + "logps/chosen": -330.3397216796875, + "logps/rejected": -331.6002197265625, + "loss": 0.5484, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5464947819709778, + "rewards/margins": 1.3692883253097534, + "rewards/rejected": -1.915783166885376, + "step": 808 + }, + { + "epoch": 0.09, + "learning_rate": 2.761910335947559e-07, + "logits/chosen": -2.0013108253479004, + "logits/rejected": -1.9238883256912231, + "logps/chosen": -431.55108642578125, + "logps/rejected": -390.1378173828125, + "loss": 0.5982, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6543668508529663, + "rewards/margins": 0.6434647440910339, + "rewards/rejected": -1.2978315353393555, + "step": 809 + }, + { + "epoch": 0.09, + "learning_rate": 2.7615591712513167e-07, + "logits/chosen": -2.387172222137451, + "logits/rejected": -2.634730577468872, + "logps/chosen": -299.27734375, + "logps/rejected": -148.91357421875, + "loss": 0.4667, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2991892397403717, + "rewards/margins": 1.3039966821670532, + "rewards/rejected": -1.6031858921051025, + "step": 810 + }, + { + "epoch": 0.09, + "learning_rate": 2.761208006555074e-07, + "logits/chosen": -2.4726614952087402, + "logits/rejected": -2.4773852825164795, + "logps/chosen": -230.71060180664062, + "logps/rejected": -296.1785888671875, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.854118287563324, + "rewards/margins": 2.985917806625366, + "rewards/rejected": -3.840036153793335, + "step": 811 + }, + { + "epoch": 0.09, + "learning_rate": 2.7608568418588317e-07, + "logits/chosen": -2.7905197143554688, + "logits/rejected": -2.8106892108917236, + "logps/chosen": -106.81831359863281, + "logps/rejected": -209.6323699951172, + "loss": 0.3129, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4995810389518738, + "rewards/margins": 2.076575756072998, + "rewards/rejected": -2.5761566162109375, + "step": 812 + }, + { + "epoch": 0.09, + "learning_rate": 2.760505677162589e-07, + "logits/chosen": -2.1269025802612305, + "logits/rejected": -2.4080748558044434, + "logps/chosen": -569.4434814453125, + "logps/rejected": -343.39666748046875, + "loss": 0.5829, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27389082312583923, + "rewards/margins": 0.285868376493454, + "rewards/rejected": -0.5597591996192932, + "step": 813 + }, + { + "epoch": 0.09, + "learning_rate": 2.760154512466347e-07, + "logits/chosen": -1.8805909156799316, + "logits/rejected": -1.7956852912902832, + "logps/chosen": -301.13250732421875, + "logps/rejected": -375.5550231933594, + "loss": 0.3467, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42576271295547485, + "rewards/margins": 1.4805558919906616, + "rewards/rejected": -1.9063186645507812, + "step": 814 + }, + { + "epoch": 0.09, + "learning_rate": 2.7598033477701044e-07, + "logits/chosen": -2.4875407218933105, + "logits/rejected": -2.4192581176757812, + "logps/chosen": -205.32054138183594, + "logps/rejected": -220.00331115722656, + "loss": 0.6743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5757603049278259, + "rewards/margins": 0.6346995830535889, + "rewards/rejected": -1.21045982837677, + "step": 815 + }, + { + "epoch": 0.09, + "learning_rate": 2.7594521830738614e-07, + "logits/chosen": -2.2095847129821777, + "logits/rejected": -2.5800657272338867, + "logps/chosen": -410.66912841796875, + "logps/rejected": -199.54510498046875, + "loss": 0.5907, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35392671823501587, + "rewards/margins": 1.4132859706878662, + "rewards/rejected": -1.7672126293182373, + "step": 816 + }, + { + "epoch": 0.09, + "learning_rate": 2.759101018377619e-07, + "logits/chosen": -2.4905989170074463, + "logits/rejected": -2.2399847507476807, + "logps/chosen": -115.71815490722656, + "logps/rejected": -244.98416137695312, + "loss": 0.4342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4339706301689148, + "rewards/margins": 1.0889227390289307, + "rewards/rejected": -1.5228933095932007, + "step": 817 + }, + { + "epoch": 0.09, + "learning_rate": 2.7587498536813764e-07, + "logits/chosen": -2.1160736083984375, + "logits/rejected": -2.2713463306427, + "logps/chosen": -212.38023376464844, + "logps/rejected": -206.2078857421875, + "loss": 1.141, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1697194576263428, + "rewards/margins": -0.2668731212615967, + "rewards/rejected": -0.9028463363647461, + "step": 818 + }, + { + "epoch": 0.09, + "learning_rate": 2.758398688985134e-07, + "logits/chosen": -2.5584683418273926, + "logits/rejected": -2.595224142074585, + "logps/chosen": -143.10720825195312, + "logps/rejected": -163.1663055419922, + "loss": 0.4011, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32224857807159424, + "rewards/margins": 1.427194595336914, + "rewards/rejected": -1.7494432926177979, + "step": 819 + }, + { + "epoch": 0.09, + "learning_rate": 2.7580475242888915e-07, + "logits/chosen": -2.3183650970458984, + "logits/rejected": -2.403165817260742, + "logps/chosen": -316.69769287109375, + "logps/rejected": -275.93084716796875, + "loss": 0.4419, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4276745319366455, + "rewards/margins": 1.7524456977844238, + "rewards/rejected": -2.1801204681396484, + "step": 820 + }, + { + "epoch": 0.09, + "learning_rate": 2.7576963595926485e-07, + "logits/chosen": -2.5012881755828857, + "logits/rejected": -2.5587267875671387, + "logps/chosen": -425.2368469238281, + "logps/rejected": -399.05078125, + "loss": 1.0292, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7898065447807312, + "rewards/margins": 0.42737722396850586, + "rewards/rejected": -1.2171838283538818, + "step": 821 + }, + { + "epoch": 0.09, + "learning_rate": 2.757345194896406e-07, + "logits/chosen": -1.7849780321121216, + "logits/rejected": -1.6907812356948853, + "logps/chosen": -303.44482421875, + "logps/rejected": -364.76361083984375, + "loss": 0.3254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6364241242408752, + "rewards/margins": 1.576251745223999, + "rewards/rejected": -2.2126760482788086, + "step": 822 + }, + { + "epoch": 0.09, + "learning_rate": 2.7569940302001636e-07, + "logits/chosen": -2.7258334159851074, + "logits/rejected": -2.8446929454803467, + "logps/chosen": -224.97271728515625, + "logps/rejected": -216.55743408203125, + "loss": 0.7265, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8505507111549377, + "rewards/margins": 0.7897355556488037, + "rewards/rejected": -1.6402863264083862, + "step": 823 + }, + { + "epoch": 0.09, + "learning_rate": 2.756642865503921e-07, + "logits/chosen": -1.744121789932251, + "logits/rejected": -1.7105926275253296, + "logps/chosen": -279.3056640625, + "logps/rejected": -210.86709594726562, + "loss": 0.5741, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34027841687202454, + "rewards/margins": 0.9451712369918823, + "rewards/rejected": -1.2854496240615845, + "step": 824 + }, + { + "epoch": 0.1, + "learning_rate": 2.7562917008076787e-07, + "logits/chosen": -2.5586776733398438, + "logits/rejected": -2.58437180519104, + "logps/chosen": -282.70806884765625, + "logps/rejected": -191.7822723388672, + "loss": 0.6564, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8704094886779785, + "rewards/margins": 0.899133563041687, + "rewards/rejected": -1.769542932510376, + "step": 825 + }, + { + "epoch": 0.1, + "learning_rate": 2.755940536111436e-07, + "logits/chosen": -2.8182952404022217, + "logits/rejected": -2.7282845973968506, + "logps/chosen": -161.39942932128906, + "logps/rejected": -238.90745544433594, + "loss": 0.7449, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.803224503993988, + "rewards/margins": 1.7648098468780518, + "rewards/rejected": -2.5680344104766846, + "step": 826 + }, + { + "epoch": 0.1, + "learning_rate": 2.755589371415194e-07, + "logits/chosen": -2.2921454906463623, + "logits/rejected": -2.337754249572754, + "logps/chosen": -377.9394226074219, + "logps/rejected": -325.0662841796875, + "loss": 0.5698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9037119746208191, + "rewards/margins": 0.4863564074039459, + "rewards/rejected": -1.3900682926177979, + "step": 827 + }, + { + "epoch": 0.1, + "learning_rate": 2.7552382067189513e-07, + "logits/chosen": -2.212531805038452, + "logits/rejected": -2.314936399459839, + "logps/chosen": -208.48497009277344, + "logps/rejected": -264.6201171875, + "loss": 0.3402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4914950132369995, + "rewards/margins": 2.359457015991211, + "rewards/rejected": -2.8509521484375, + "step": 828 + }, + { + "epoch": 0.1, + "learning_rate": 2.7548870420227083e-07, + "logits/chosen": -2.473080635070801, + "logits/rejected": -2.4388420581817627, + "logps/chosen": -119.92377471923828, + "logps/rejected": -261.00201416015625, + "loss": 0.252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18867899477481842, + "rewards/margins": 1.8322033882141113, + "rewards/rejected": -2.0208823680877686, + "step": 829 + }, + { + "epoch": 0.1, + "learning_rate": 2.754535877326466e-07, + "logits/chosen": -2.4411096572875977, + "logits/rejected": -2.70658540725708, + "logps/chosen": -400.7325439453125, + "logps/rejected": -391.9275817871094, + "loss": 0.4076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3700386881828308, + "rewards/margins": 1.7321863174438477, + "rewards/rejected": -2.102224826812744, + "step": 830 + }, + { + "epoch": 0.1, + "learning_rate": 2.7541847126302234e-07, + "logits/chosen": -2.8889126777648926, + "logits/rejected": -2.6691818237304688, + "logps/chosen": -264.3702697753906, + "logps/rejected": -223.3623046875, + "loss": 0.258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5418584942817688, + "rewards/margins": 2.36431622505188, + "rewards/rejected": -2.906174659729004, + "step": 831 + }, + { + "epoch": 0.1, + "learning_rate": 2.753833547933981e-07, + "logits/chosen": -2.2791666984558105, + "logits/rejected": -2.579025983810425, + "logps/chosen": -270.8998107910156, + "logps/rejected": -164.4615020751953, + "loss": 0.3986, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2629741430282593, + "rewards/margins": 1.2688350677490234, + "rewards/rejected": -1.5318092107772827, + "step": 832 + }, + { + "epoch": 0.1, + "learning_rate": 2.7534823832377385e-07, + "logits/chosen": -2.5243184566497803, + "logits/rejected": -2.5544021129608154, + "logps/chosen": -494.301025390625, + "logps/rejected": -233.87103271484375, + "loss": 0.6722, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6064411401748657, + "rewards/margins": 1.0629085302352905, + "rewards/rejected": -1.6693496704101562, + "step": 833 + }, + { + "epoch": 0.1, + "learning_rate": 2.753131218541496e-07, + "logits/chosen": -2.047807455062866, + "logits/rejected": -2.254352569580078, + "logps/chosen": -304.7427978515625, + "logps/rejected": -250.58560180664062, + "loss": 0.4988, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6589237451553345, + "rewards/margins": 0.992977499961853, + "rewards/rejected": -1.6519012451171875, + "step": 834 + }, + { + "epoch": 0.1, + "learning_rate": 2.752780053845253e-07, + "logits/chosen": -2.2769858837127686, + "logits/rejected": -2.441960573196411, + "logps/chosen": -146.03787231445312, + "logps/rejected": -351.67059326171875, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42524731159210205, + "rewards/margins": 0.5899673700332642, + "rewards/rejected": -1.0152146816253662, + "step": 835 + }, + { + "epoch": 0.1, + "learning_rate": 2.752428889149011e-07, + "logits/chosen": -2.376035451889038, + "logits/rejected": -2.5929765701293945, + "logps/chosen": -379.23681640625, + "logps/rejected": -304.533935546875, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6694204807281494, + "rewards/margins": 1.1649707555770874, + "rewards/rejected": -1.8343911170959473, + "step": 836 + }, + { + "epoch": 0.1, + "learning_rate": 2.752077724452768e-07, + "logits/chosen": -2.6002602577209473, + "logits/rejected": -2.507941484451294, + "logps/chosen": -243.19131469726562, + "logps/rejected": -203.5678253173828, + "loss": 0.7361, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7152911424636841, + "rewards/margins": 0.7182618975639343, + "rewards/rejected": -1.4335530996322632, + "step": 837 + }, + { + "epoch": 0.1, + "learning_rate": 2.7517265597565256e-07, + "logits/chosen": -2.9591288566589355, + "logits/rejected": -2.9634807109832764, + "logps/chosen": -142.01194763183594, + "logps/rejected": -190.72084045410156, + "loss": 0.8908, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2307944297790527, + "rewards/margins": 0.3192315995693207, + "rewards/rejected": -1.5500259399414062, + "step": 838 + }, + { + "epoch": 0.1, + "learning_rate": 2.751375395060283e-07, + "logits/chosen": -2.3548974990844727, + "logits/rejected": -2.2194857597351074, + "logps/chosen": -95.66849517822266, + "logps/rejected": -239.25558471679688, + "loss": 0.3916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13998840749263763, + "rewards/margins": 1.6643548011779785, + "rewards/rejected": -1.8043432235717773, + "step": 839 + }, + { + "epoch": 0.1, + "learning_rate": 2.7510242303640407e-07, + "logits/chosen": -2.106090784072876, + "logits/rejected": -1.8516467809677124, + "logps/chosen": -278.689453125, + "logps/rejected": -352.3238525390625, + "loss": 0.2742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8581905364990234, + "rewards/margins": 1.6817278861999512, + "rewards/rejected": -3.5399184226989746, + "step": 840 + }, + { + "epoch": 0.1, + "learning_rate": 2.750673065667798e-07, + "logits/chosen": -2.4165797233581543, + "logits/rejected": -2.7675082683563232, + "logps/chosen": -560.032470703125, + "logps/rejected": -261.9199523925781, + "loss": 0.5332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4780290722846985, + "rewards/margins": 1.7207958698272705, + "rewards/rejected": -2.1988251209259033, + "step": 841 + }, + { + "epoch": 0.1, + "learning_rate": 2.750321900971556e-07, + "logits/chosen": -2.6284687519073486, + "logits/rejected": -2.636730909347534, + "logps/chosen": -125.65663146972656, + "logps/rejected": -169.54039001464844, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1487689018249512, + "rewards/margins": 0.6182979345321655, + "rewards/rejected": -1.7670668363571167, + "step": 842 + }, + { + "epoch": 0.1, + "learning_rate": 2.749970736275313e-07, + "logits/chosen": -2.049978733062744, + "logits/rejected": -2.197664260864258, + "logps/chosen": -175.21444702148438, + "logps/rejected": -197.3688507080078, + "loss": 0.4727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5821592807769775, + "rewards/margins": 1.0119600296020508, + "rewards/rejected": -1.5941193103790283, + "step": 843 + }, + { + "epoch": 0.1, + "learning_rate": 2.7496195715790703e-07, + "logits/chosen": -2.223905086517334, + "logits/rejected": -2.3344764709472656, + "logps/chosen": -258.4309387207031, + "logps/rejected": -262.2900085449219, + "loss": 0.6247, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0710797309875488, + "rewards/margins": 1.6142706871032715, + "rewards/rejected": -2.6853504180908203, + "step": 844 + }, + { + "epoch": 0.1, + "learning_rate": 2.749268406882828e-07, + "logits/chosen": -2.9065511226654053, + "logits/rejected": -2.833789587020874, + "logps/chosen": -186.0256805419922, + "logps/rejected": -108.05865478515625, + "loss": 0.5612, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46830418705940247, + "rewards/margins": 0.8217259049415588, + "rewards/rejected": -1.2900301218032837, + "step": 845 + }, + { + "epoch": 0.1, + "learning_rate": 2.7489172421865854e-07, + "logits/chosen": -2.447984218597412, + "logits/rejected": -2.348557233810425, + "logps/chosen": -108.98347473144531, + "logps/rejected": -260.2140808105469, + "loss": 0.3335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13755767047405243, + "rewards/margins": 1.3483399152755737, + "rewards/rejected": -1.4858977794647217, + "step": 846 + }, + { + "epoch": 0.1, + "learning_rate": 2.748566077490343e-07, + "logits/chosen": -2.0561609268188477, + "logits/rejected": -2.3827569484710693, + "logps/chosen": -293.29046630859375, + "logps/rejected": -303.82501220703125, + "loss": 1.6112, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.525341749191284, + "rewards/margins": 0.26127374172210693, + "rewards/rejected": -2.7866153717041016, + "step": 847 + }, + { + "epoch": 0.1, + "learning_rate": 2.7482149127941005e-07, + "logits/chosen": -2.0406079292297363, + "logits/rejected": -2.196289300918579, + "logps/chosen": -407.5369873046875, + "logps/rejected": -295.21820068359375, + "loss": 0.5133, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003853633999824524, + "rewards/margins": 0.9928991794586182, + "rewards/rejected": -0.9967528581619263, + "step": 848 + }, + { + "epoch": 0.1, + "learning_rate": 2.747863748097858e-07, + "logits/chosen": -2.192006826400757, + "logits/rejected": -2.193291187286377, + "logps/chosen": -205.3574676513672, + "logps/rejected": -253.58053588867188, + "loss": 0.3731, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6278808116912842, + "rewards/margins": 0.9057722687721252, + "rewards/rejected": -1.5336530208587646, + "step": 849 + }, + { + "epoch": 0.1, + "learning_rate": 2.747512583401615e-07, + "logits/chosen": -2.3858213424682617, + "logits/rejected": -2.4065263271331787, + "logps/chosen": -281.6358947753906, + "logps/rejected": -234.0752410888672, + "loss": 0.2879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3935432434082031, + "rewards/margins": 1.7462352514266968, + "rewards/rejected": -2.1397783756256104, + "step": 850 + }, + { + "epoch": 0.1, + "learning_rate": 2.7471614187053726e-07, + "logits/chosen": -2.760241985321045, + "logits/rejected": -2.7973506450653076, + "logps/chosen": -216.3712615966797, + "logps/rejected": -222.0390167236328, + "loss": 0.5075, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.591031789779663, + "rewards/margins": 1.069801926612854, + "rewards/rejected": -2.6608338356018066, + "step": 851 + }, + { + "epoch": 0.1, + "learning_rate": 2.74681025400913e-07, + "logits/chosen": -1.9072537422180176, + "logits/rejected": -2.3966009616851807, + "logps/chosen": -279.09814453125, + "logps/rejected": -200.498291015625, + "loss": 0.6545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6460846066474915, + "rewards/margins": 1.111264705657959, + "rewards/rejected": -1.7573493719100952, + "step": 852 + }, + { + "epoch": 0.1, + "learning_rate": 2.7464590893128876e-07, + "logits/chosen": -2.0115225315093994, + "logits/rejected": -2.155808210372925, + "logps/chosen": -240.0186309814453, + "logps/rejected": -249.575439453125, + "loss": 0.5463, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9515892863273621, + "rewards/margins": 1.2742524147033691, + "rewards/rejected": -2.225841760635376, + "step": 853 + }, + { + "epoch": 0.1, + "learning_rate": 2.746107924616645e-07, + "logits/chosen": -2.3084144592285156, + "logits/rejected": -2.498538017272949, + "logps/chosen": -247.79840087890625, + "logps/rejected": -189.2851104736328, + "loss": 0.3704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6037533283233643, + "rewards/margins": 1.5570805072784424, + "rewards/rejected": -2.1608338356018066, + "step": 854 + }, + { + "epoch": 0.1, + "learning_rate": 2.7457567599204027e-07, + "logits/chosen": -2.385681629180908, + "logits/rejected": -2.621626138687134, + "logps/chosen": -405.31903076171875, + "logps/rejected": -242.27801513671875, + "loss": 0.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09067822992801666, + "rewards/margins": 1.663391351699829, + "rewards/rejected": -1.5727131366729736, + "step": 855 + }, + { + "epoch": 0.1, + "learning_rate": 2.7454055952241597e-07, + "logits/chosen": -2.049204111099243, + "logits/rejected": -2.000635862350464, + "logps/chosen": -312.63531494140625, + "logps/rejected": -407.718994140625, + "loss": 0.1566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40755602717399597, + "rewards/margins": 2.6576168537139893, + "rewards/rejected": -3.0651729106903076, + "step": 856 + }, + { + "epoch": 0.1, + "learning_rate": 2.745054430527917e-07, + "logits/chosen": -2.4714720249176025, + "logits/rejected": -2.5937280654907227, + "logps/chosen": -319.82177734375, + "logps/rejected": -250.0519561767578, + "loss": 0.7167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5579330921173096, + "rewards/margins": 0.7020557522773743, + "rewards/rejected": -1.259988784790039, + "step": 857 + }, + { + "epoch": 0.1, + "learning_rate": 2.744703265831675e-07, + "logits/chosen": -2.2892305850982666, + "logits/rejected": -2.6300041675567627, + "logps/chosen": -236.0145263671875, + "logps/rejected": -196.89122009277344, + "loss": 0.6294, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8984473943710327, + "rewards/margins": 0.6839612722396851, + "rewards/rejected": -1.5824086666107178, + "step": 858 + }, + { + "epoch": 0.1, + "learning_rate": 2.7443521011354323e-07, + "logits/chosen": -2.416809558868408, + "logits/rejected": -2.3566606044769287, + "logps/chosen": -450.44659423828125, + "logps/rejected": -428.8044128417969, + "loss": 0.4298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48858991265296936, + "rewards/margins": 1.4331920146942139, + "rewards/rejected": -1.9217817783355713, + "step": 859 + }, + { + "epoch": 0.1, + "learning_rate": 2.74400093643919e-07, + "logits/chosen": -2.452394962310791, + "logits/rejected": -2.4533944129943848, + "logps/chosen": -224.89553833007812, + "logps/rejected": -180.6106719970703, + "loss": 0.493, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2268909215927124, + "rewards/margins": 0.8081058263778687, + "rewards/rejected": -2.034996747970581, + "step": 860 + }, + { + "epoch": 0.1, + "learning_rate": 2.7436497717429474e-07, + "logits/chosen": -2.3893582820892334, + "logits/rejected": -2.5670642852783203, + "logps/chosen": -183.88412475585938, + "logps/rejected": -191.48509216308594, + "loss": 0.5916, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7178572416305542, + "rewards/margins": 1.0565249919891357, + "rewards/rejected": -1.7743823528289795, + "step": 861 + }, + { + "epoch": 0.1, + "learning_rate": 2.743298607046705e-07, + "logits/chosen": -2.268998146057129, + "logits/rejected": -2.096442699432373, + "logps/chosen": -396.4952392578125, + "logps/rejected": -342.6715393066406, + "loss": 0.7102, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.15142822265625, + "rewards/margins": 0.8021333813667297, + "rewards/rejected": -1.953561544418335, + "step": 862 + }, + { + "epoch": 0.1, + "learning_rate": 2.7429474423504625e-07, + "logits/chosen": -2.76934552192688, + "logits/rejected": -2.6023356914520264, + "logps/chosen": -418.47186279296875, + "logps/rejected": -218.6945037841797, + "loss": 0.2228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08372455835342407, + "rewards/margins": 1.9349303245544434, + "rewards/rejected": -1.8512059450149536, + "step": 863 + }, + { + "epoch": 0.1, + "learning_rate": 2.7425962776542195e-07, + "logits/chosen": -2.6738338470458984, + "logits/rejected": -2.6124989986419678, + "logps/chosen": -291.07183837890625, + "logps/rejected": -195.96524047851562, + "loss": 1.8998, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.386781692504883, + "rewards/margins": -1.2577769756317139, + "rewards/rejected": -1.1290045976638794, + "step": 864 + }, + { + "epoch": 0.1, + "learning_rate": 2.742245112957977e-07, + "logits/chosen": -2.842454433441162, + "logits/rejected": -2.9303057193756104, + "logps/chosen": -132.208984375, + "logps/rejected": -154.64987182617188, + "loss": 0.31, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04956609755754471, + "rewards/margins": 1.2155320644378662, + "rewards/rejected": -1.2650983333587646, + "step": 865 + }, + { + "epoch": 0.1, + "learning_rate": 2.7418939482617346e-07, + "logits/chosen": -1.5846633911132812, + "logits/rejected": -1.7510769367218018, + "logps/chosen": -359.94476318359375, + "logps/rejected": -376.2521667480469, + "loss": 0.2446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4198935329914093, + "rewards/margins": 1.37806236743927, + "rewards/rejected": -1.797955870628357, + "step": 866 + }, + { + "epoch": 0.1, + "learning_rate": 2.741542783565492e-07, + "logits/chosen": -2.2946205139160156, + "logits/rejected": -2.5097503662109375, + "logps/chosen": -179.63595581054688, + "logps/rejected": -246.64617919921875, + "loss": 0.4547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6329889297485352, + "rewards/margins": 1.1715987920761108, + "rewards/rejected": -1.8045876026153564, + "step": 867 + }, + { + "epoch": 0.1, + "learning_rate": 2.7411916188692497e-07, + "logits/chosen": -2.161911964416504, + "logits/rejected": -1.980193018913269, + "logps/chosen": -90.79429626464844, + "logps/rejected": -216.4267578125, + "loss": 0.3348, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1536362171173096, + "rewards/margins": 1.612606167793274, + "rewards/rejected": -2.766242742538452, + "step": 868 + }, + { + "epoch": 0.1, + "learning_rate": 2.7408404541730067e-07, + "logits/chosen": -2.6160354614257812, + "logits/rejected": -2.7752623558044434, + "logps/chosen": -238.42227172851562, + "logps/rejected": -188.59486389160156, + "loss": 0.5305, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8613348007202148, + "rewards/margins": 1.01069974899292, + "rewards/rejected": -1.8720346689224243, + "step": 869 + }, + { + "epoch": 0.1, + "learning_rate": 2.740489289476765e-07, + "logits/chosen": -2.3218894004821777, + "logits/rejected": -2.5129358768463135, + "logps/chosen": -326.834228515625, + "logps/rejected": -262.6706848144531, + "loss": 0.4566, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4429580569267273, + "rewards/margins": 0.9399796724319458, + "rewards/rejected": -1.3829376697540283, + "step": 870 + }, + { + "epoch": 0.1, + "learning_rate": 2.7401381247805223e-07, + "logits/chosen": -2.5625972747802734, + "logits/rejected": -2.655690908432007, + "logps/chosen": -263.1837158203125, + "logps/rejected": -188.22853088378906, + "loss": 0.5976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3155396580696106, + "rewards/margins": 1.1391363143920898, + "rewards/rejected": -1.4546760320663452, + "step": 871 + }, + { + "epoch": 0.1, + "learning_rate": 2.7397869600842793e-07, + "logits/chosen": -1.8586876392364502, + "logits/rejected": -1.847339391708374, + "logps/chosen": -227.23606872558594, + "logps/rejected": -292.5487365722656, + "loss": 1.8094, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.643810749053955, + "rewards/margins": -0.24091732501983643, + "rewards/rejected": -3.40289306640625, + "step": 872 + }, + { + "epoch": 0.1, + "learning_rate": 2.739435795388037e-07, + "logits/chosen": -2.62186861038208, + "logits/rejected": -2.8167951107025146, + "logps/chosen": -227.745361328125, + "logps/rejected": -235.97825622558594, + "loss": 0.2304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3649876117706299, + "rewards/margins": 2.355353593826294, + "rewards/rejected": -2.720341205596924, + "step": 873 + }, + { + "epoch": 0.1, + "learning_rate": 2.7390846306917944e-07, + "logits/chosen": -2.839649200439453, + "logits/rejected": -2.781049966812134, + "logps/chosen": -302.4234619140625, + "logps/rejected": -374.5107727050781, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44819375872612, + "rewards/margins": 2.306586742401123, + "rewards/rejected": -2.7547802925109863, + "step": 874 + }, + { + "epoch": 0.1, + "learning_rate": 2.738733465995552e-07, + "logits/chosen": -2.487473964691162, + "logits/rejected": -2.5366427898406982, + "logps/chosen": -272.7265319824219, + "logps/rejected": -243.0287628173828, + "loss": 0.3496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6214907765388489, + "rewards/margins": 1.146028995513916, + "rewards/rejected": -1.7675198316574097, + "step": 875 + }, + { + "epoch": 0.1, + "learning_rate": 2.7383823012993094e-07, + "logits/chosen": -2.9904234409332275, + "logits/rejected": -2.9225430488586426, + "logps/chosen": -302.1584777832031, + "logps/rejected": -245.98492431640625, + "loss": 0.2034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17861181497573853, + "rewards/margins": 3.2799367904663086, + "rewards/rejected": -3.4585485458374023, + "step": 876 + }, + { + "epoch": 0.1, + "learning_rate": 2.7380311366030664e-07, + "logits/chosen": -2.696608543395996, + "logits/rejected": -2.711982250213623, + "logps/chosen": -310.5008544921875, + "logps/rejected": -357.9404296875, + "loss": 0.4441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3918878436088562, + "rewards/margins": 1.286521077156067, + "rewards/rejected": -1.6784089803695679, + "step": 877 + }, + { + "epoch": 0.1, + "learning_rate": 2.737679971906824e-07, + "logits/chosen": -2.236459970474243, + "logits/rejected": -2.4766292572021484, + "logps/chosen": -433.4685974121094, + "logps/rejected": -223.2201690673828, + "loss": 0.5781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7428615093231201, + "rewards/margins": 0.5256321430206299, + "rewards/rejected": -1.26849365234375, + "step": 878 + }, + { + "epoch": 0.1, + "learning_rate": 2.737328807210582e-07, + "logits/chosen": -2.2882437705993652, + "logits/rejected": -2.269320487976074, + "logps/chosen": -215.55458068847656, + "logps/rejected": -294.8399658203125, + "loss": 0.5297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40013667941093445, + "rewards/margins": 1.805371642112732, + "rewards/rejected": -2.205508232116699, + "step": 879 + }, + { + "epoch": 0.1, + "learning_rate": 2.736977642514339e-07, + "logits/chosen": -2.0284576416015625, + "logits/rejected": -2.1417651176452637, + "logps/chosen": -167.98931884765625, + "logps/rejected": -172.12062072753906, + "loss": 1.3149, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7394373416900635, + "rewards/margins": -0.1815958023071289, + "rewards/rejected": -1.5578415393829346, + "step": 880 + }, + { + "epoch": 0.1, + "learning_rate": 2.7366264778180966e-07, + "logits/chosen": -1.7408301830291748, + "logits/rejected": -2.127826452255249, + "logps/chosen": -367.0542907714844, + "logps/rejected": -169.72154235839844, + "loss": 0.5433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5862451791763306, + "rewards/margins": 0.6623046398162842, + "rewards/rejected": -1.2485496997833252, + "step": 881 + }, + { + "epoch": 0.1, + "learning_rate": 2.736275313121854e-07, + "logits/chosen": -2.061760187149048, + "logits/rejected": -2.502890110015869, + "logps/chosen": -321.2643737792969, + "logps/rejected": -217.29336547851562, + "loss": 0.2684, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4814087450504303, + "rewards/margins": 1.5957975387573242, + "rewards/rejected": -2.0772061347961426, + "step": 882 + }, + { + "epoch": 0.1, + "learning_rate": 2.7359241484256117e-07, + "logits/chosen": -2.239964008331299, + "logits/rejected": -2.3058643341064453, + "logps/chosen": -320.699462890625, + "logps/rejected": -311.1190185546875, + "loss": 0.3654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3407551050186157, + "rewards/margins": 1.5086952447891235, + "rewards/rejected": -1.8494503498077393, + "step": 883 + }, + { + "epoch": 0.1, + "learning_rate": 2.735572983729369e-07, + "logits/chosen": -2.35732364654541, + "logits/rejected": -2.4461846351623535, + "logps/chosen": -101.029052734375, + "logps/rejected": -220.3807373046875, + "loss": 0.2832, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030410315841436386, + "rewards/margins": 2.2272849082946777, + "rewards/rejected": -2.257695436477661, + "step": 884 + }, + { + "epoch": 0.1, + "learning_rate": 2.735221819033126e-07, + "logits/chosen": -1.6367770433425903, + "logits/rejected": -1.9972889423370361, + "logps/chosen": -385.45086669921875, + "logps/rejected": -285.4366760253906, + "loss": 0.5566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41470444202423096, + "rewards/margins": 0.7031936645507812, + "rewards/rejected": -1.1178981065750122, + "step": 885 + }, + { + "epoch": 0.1, + "learning_rate": 2.734870654336884e-07, + "logits/chosen": -2.6313066482543945, + "logits/rejected": -2.260723352432251, + "logps/chosen": -109.0986328125, + "logps/rejected": -202.43942260742188, + "loss": 0.4602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7822294235229492, + "rewards/margins": 1.8931256532669067, + "rewards/rejected": -2.6753549575805664, + "step": 886 + }, + { + "epoch": 0.1, + "learning_rate": 2.7345194896406413e-07, + "logits/chosen": -2.0518956184387207, + "logits/rejected": -2.430412769317627, + "logps/chosen": -417.9922180175781, + "logps/rejected": -262.50665283203125, + "loss": 0.59, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6057643890380859, + "rewards/margins": 0.6457421779632568, + "rewards/rejected": -1.2515065670013428, + "step": 887 + }, + { + "epoch": 0.1, + "learning_rate": 2.734168324944399e-07, + "logits/chosen": -2.315666437149048, + "logits/rejected": -2.5556840896606445, + "logps/chosen": -189.8325958251953, + "logps/rejected": -202.45098876953125, + "loss": 0.2779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8676235675811768, + "rewards/margins": 2.744389057159424, + "rewards/rejected": -3.6120128631591797, + "step": 888 + }, + { + "epoch": 0.1, + "learning_rate": 2.7338171602481564e-07, + "logits/chosen": -2.178572177886963, + "logits/rejected": -2.309739112854004, + "logps/chosen": -298.0235290527344, + "logps/rejected": -239.94570922851562, + "loss": 0.6348, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7324139475822449, + "rewards/margins": 1.186073660850525, + "rewards/rejected": -1.918487548828125, + "step": 889 + }, + { + "epoch": 0.1, + "learning_rate": 2.7334659955519134e-07, + "logits/chosen": -1.9351143836975098, + "logits/rejected": -1.6779600381851196, + "logps/chosen": -318.2749938964844, + "logps/rejected": -408.49298095703125, + "loss": 0.2726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7676053643226624, + "rewards/margins": 1.600421667098999, + "rewards/rejected": -2.3680272102355957, + "step": 890 + }, + { + "epoch": 0.1, + "learning_rate": 2.733114830855671e-07, + "logits/chosen": -2.2161264419555664, + "logits/rejected": -2.2616453170776367, + "logps/chosen": -196.74314880371094, + "logps/rejected": -265.27288818359375, + "loss": 0.5028, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5660105347633362, + "rewards/margins": 1.4353567361831665, + "rewards/rejected": -2.0013673305511475, + "step": 891 + }, + { + "epoch": 0.1, + "learning_rate": 2.732763666159429e-07, + "logits/chosen": -1.7486870288848877, + "logits/rejected": -1.7805310487747192, + "logps/chosen": -392.2126770019531, + "logps/rejected": -419.1914978027344, + "loss": 0.2273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6669938564300537, + "rewards/margins": 3.249483346939087, + "rewards/rejected": -3.9164772033691406, + "step": 892 + }, + { + "epoch": 0.1, + "learning_rate": 2.732412501463186e-07, + "logits/chosen": -2.305424928665161, + "logits/rejected": -2.2936127185821533, + "logps/chosen": -406.2883605957031, + "logps/rejected": -399.64288330078125, + "loss": 0.2347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2664914131164551, + "rewards/margins": 2.32843017578125, + "rewards/rejected": -2.594921588897705, + "step": 893 + }, + { + "epoch": 0.1, + "learning_rate": 2.7320613367669435e-07, + "logits/chosen": -2.1703996658325195, + "logits/rejected": -1.982246994972229, + "logps/chosen": -332.732666015625, + "logps/rejected": -466.397216796875, + "loss": 0.694, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4209740161895752, + "rewards/margins": 0.7064259648323059, + "rewards/rejected": -1.1274000406265259, + "step": 894 + }, + { + "epoch": 0.1, + "learning_rate": 2.731710172070701e-07, + "logits/chosen": -2.1892919540405273, + "logits/rejected": -2.212963104248047, + "logps/chosen": -355.6367492675781, + "logps/rejected": -379.8904724121094, + "loss": 0.3111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18291088938713074, + "rewards/margins": 1.6307588815689087, + "rewards/rejected": -1.8136695623397827, + "step": 895 + }, + { + "epoch": 0.1, + "learning_rate": 2.7313590073744586e-07, + "logits/chosen": -2.3171658515930176, + "logits/rejected": -2.35357928276062, + "logps/chosen": -587.069580078125, + "logps/rejected": -425.15533447265625, + "loss": 0.3979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015103891491889954, + "rewards/margins": 1.411621332168579, + "rewards/rejected": -1.426725149154663, + "step": 896 + }, + { + "epoch": 0.1, + "learning_rate": 2.731007842678216e-07, + "logits/chosen": -1.7953616380691528, + "logits/rejected": -2.165194511413574, + "logps/chosen": -319.9486083984375, + "logps/rejected": -225.694091796875, + "loss": 0.6043, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1881155967712402, + "rewards/margins": 1.155165672302246, + "rewards/rejected": -2.3432812690734863, + "step": 897 + }, + { + "epoch": 0.1, + "learning_rate": 2.730656677981973e-07, + "logits/chosen": -2.2874317169189453, + "logits/rejected": -2.3359129428863525, + "logps/chosen": -287.706298828125, + "logps/rejected": -237.74317932128906, + "loss": 0.5499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.872846782207489, + "rewards/margins": 0.8492374420166016, + "rewards/rejected": -1.7220841646194458, + "step": 898 + }, + { + "epoch": 0.1, + "learning_rate": 2.7303055132857307e-07, + "logits/chosen": -2.3546836376190186, + "logits/rejected": -2.602132558822632, + "logps/chosen": -324.95989990234375, + "logps/rejected": -164.28134155273438, + "loss": 0.6043, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49545741081237793, + "rewards/margins": 0.44144928455352783, + "rewards/rejected": -0.9369067549705505, + "step": 899 + }, + { + "epoch": 0.1, + "learning_rate": 2.729954348589488e-07, + "logits/chosen": -2.6096599102020264, + "logits/rejected": -2.882143497467041, + "logps/chosen": -354.230712890625, + "logps/rejected": -155.22064208984375, + "loss": 0.4874, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43517130613327026, + "rewards/margins": 1.076342225074768, + "rewards/rejected": -1.511513590812683, + "step": 900 + }, + { + "epoch": 0.1, + "learning_rate": 2.729603183893246e-07, + "logits/chosen": -2.6765828132629395, + "logits/rejected": -2.5943284034729004, + "logps/chosen": -152.5860595703125, + "logps/rejected": -253.0933074951172, + "loss": 0.5087, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9579920768737793, + "rewards/margins": 1.6806248426437378, + "rewards/rejected": -2.6386168003082275, + "step": 901 + }, + { + "epoch": 0.1, + "learning_rate": 2.7292520191970033e-07, + "logits/chosen": -2.174633264541626, + "logits/rejected": -1.9265437126159668, + "logps/chosen": -328.0870361328125, + "logps/rejected": -386.87408447265625, + "loss": 0.475, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2026169300079346, + "rewards/margins": 1.92780601978302, + "rewards/rejected": -3.130423069000244, + "step": 902 + }, + { + "epoch": 0.1, + "learning_rate": 2.7289008545007603e-07, + "logits/chosen": -2.664370059967041, + "logits/rejected": -2.734895706176758, + "logps/chosen": -338.24176025390625, + "logps/rejected": -256.3707275390625, + "loss": 0.5873, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9760487675666809, + "rewards/margins": 0.6431657671928406, + "rewards/rejected": -1.6192145347595215, + "step": 903 + }, + { + "epoch": 0.1, + "learning_rate": 2.7285496898045184e-07, + "logits/chosen": -2.418470859527588, + "logits/rejected": -2.3857643604278564, + "logps/chosen": -328.44073486328125, + "logps/rejected": -182.70664978027344, + "loss": 0.3611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4334130585193634, + "rewards/margins": 2.372704267501831, + "rewards/rejected": -2.806117534637451, + "step": 904 + }, + { + "epoch": 0.1, + "learning_rate": 2.728198525108276e-07, + "logits/chosen": -2.7596099376678467, + "logits/rejected": -2.5041770935058594, + "logps/chosen": -270.47613525390625, + "logps/rejected": -294.5979309082031, + "loss": 0.2661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.52459716796875, + "rewards/margins": 1.6848387718200684, + "rewards/rejected": -2.2094359397888184, + "step": 905 + }, + { + "epoch": 0.1, + "learning_rate": 2.727847360412033e-07, + "logits/chosen": -2.945510149002075, + "logits/rejected": -2.9139673709869385, + "logps/chosen": -279.41864013671875, + "logps/rejected": -227.04005432128906, + "loss": 0.8769, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9047845602035522, + "rewards/margins": 0.029307126998901367, + "rewards/rejected": -0.9340916872024536, + "step": 906 + }, + { + "epoch": 0.1, + "learning_rate": 2.7274961957157905e-07, + "logits/chosen": -2.5785350799560547, + "logits/rejected": -2.494663953781128, + "logps/chosen": -193.6589813232422, + "logps/rejected": -190.41563415527344, + "loss": 0.4742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7568491101264954, + "rewards/margins": 0.5829892158508301, + "rewards/rejected": -1.3398383855819702, + "step": 907 + }, + { + "epoch": 0.1, + "learning_rate": 2.727145031019548e-07, + "logits/chosen": -1.7279045581817627, + "logits/rejected": -1.4416168928146362, + "logps/chosen": -391.1516418457031, + "logps/rejected": -375.1801452636719, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8019564747810364, + "rewards/margins": 1.5964951515197754, + "rewards/rejected": -2.398451805114746, + "step": 908 + }, + { + "epoch": 0.1, + "learning_rate": 2.7267938663233056e-07, + "logits/chosen": -2.198636770248413, + "logits/rejected": -2.171288251876831, + "logps/chosen": -245.0402374267578, + "logps/rejected": -335.2845764160156, + "loss": 0.474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9793490767478943, + "rewards/margins": 1.490941047668457, + "rewards/rejected": -2.470290184020996, + "step": 909 + }, + { + "epoch": 0.1, + "learning_rate": 2.726442701627063e-07, + "logits/chosen": -2.1542720794677734, + "logits/rejected": -2.1895663738250732, + "logps/chosen": -212.97398376464844, + "logps/rejected": -249.56834411621094, + "loss": 0.3527, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1207932233810425, + "rewards/margins": 2.5179107189178467, + "rewards/rejected": -3.6387040615081787, + "step": 910 + }, + { + "epoch": 0.11, + "learning_rate": 2.72609153693082e-07, + "logits/chosen": -2.6544461250305176, + "logits/rejected": -2.7239012718200684, + "logps/chosen": -304.85467529296875, + "logps/rejected": -317.8792724609375, + "loss": 0.2294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5325814485549927, + "rewards/margins": 2.7240521907806396, + "rewards/rejected": -3.256633758544922, + "step": 911 + }, + { + "epoch": 0.11, + "learning_rate": 2.7257403722345776e-07, + "logits/chosen": -2.941321849822998, + "logits/rejected": -2.782184362411499, + "logps/chosen": -168.43814086914062, + "logps/rejected": -191.325439453125, + "loss": 0.4083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3670191168785095, + "rewards/margins": 1.3913077116012573, + "rewards/rejected": -1.7583266496658325, + "step": 912 + }, + { + "epoch": 0.11, + "learning_rate": 2.7253892075383357e-07, + "logits/chosen": -2.126332998275757, + "logits/rejected": -1.7857770919799805, + "logps/chosen": -121.19863891601562, + "logps/rejected": -194.9742431640625, + "loss": 0.6055, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6880748867988586, + "rewards/margins": 0.678730309009552, + "rewards/rejected": -1.3668051958084106, + "step": 913 + }, + { + "epoch": 0.11, + "learning_rate": 2.7250380428420927e-07, + "logits/chosen": -1.8102284669876099, + "logits/rejected": -1.9606794118881226, + "logps/chosen": -409.20721435546875, + "logps/rejected": -376.234130859375, + "loss": 0.27, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.017630569636821747, + "rewards/margins": 1.6213726997375488, + "rewards/rejected": -1.6390032768249512, + "step": 914 + }, + { + "epoch": 0.11, + "learning_rate": 2.7246868781458503e-07, + "logits/chosen": -2.847503423690796, + "logits/rejected": -2.7250118255615234, + "logps/chosen": -283.68927001953125, + "logps/rejected": -135.90797424316406, + "loss": 0.3554, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2845492660999298, + "rewards/margins": 1.5355963706970215, + "rewards/rejected": -1.820145606994629, + "step": 915 + }, + { + "epoch": 0.11, + "learning_rate": 2.724335713449608e-07, + "logits/chosen": -2.7249951362609863, + "logits/rejected": -2.648851156234741, + "logps/chosen": -561.4984130859375, + "logps/rejected": -227.2781524658203, + "loss": 0.8052, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0013887882232666, + "rewards/margins": 0.5076630711555481, + "rewards/rejected": -1.50905179977417, + "step": 916 + }, + { + "epoch": 0.11, + "learning_rate": 2.7239845487533653e-07, + "logits/chosen": -2.199678421020508, + "logits/rejected": -2.4873173236846924, + "logps/chosen": -487.6723327636719, + "logps/rejected": -247.61563110351562, + "loss": 0.3305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6441419720649719, + "rewards/margins": 1.7620280981063843, + "rewards/rejected": -2.406169891357422, + "step": 917 + }, + { + "epoch": 0.11, + "learning_rate": 2.723633384057123e-07, + "logits/chosen": -2.0813002586364746, + "logits/rejected": -2.252007484436035, + "logps/chosen": -305.5802917480469, + "logps/rejected": -210.3858642578125, + "loss": 0.4427, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39607512950897217, + "rewards/margins": 1.069615364074707, + "rewards/rejected": -1.4656906127929688, + "step": 918 + }, + { + "epoch": 0.11, + "learning_rate": 2.72328221936088e-07, + "logits/chosen": -2.6110198497772217, + "logits/rejected": -2.6111044883728027, + "logps/chosen": -251.50186157226562, + "logps/rejected": -215.8858642578125, + "loss": 0.3481, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.571472704410553, + "rewards/margins": 2.270409107208252, + "rewards/rejected": -2.84188175201416, + "step": 919 + }, + { + "epoch": 0.11, + "learning_rate": 2.7229310546646374e-07, + "logits/chosen": -2.126147985458374, + "logits/rejected": -2.257380723953247, + "logps/chosen": -506.49188232421875, + "logps/rejected": -379.3800048828125, + "loss": 0.4938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05539850890636444, + "rewards/margins": 1.1930655241012573, + "rewards/rejected": -1.2484641075134277, + "step": 920 + }, + { + "epoch": 0.11, + "learning_rate": 2.722579889968395e-07, + "logits/chosen": -2.5572211742401123, + "logits/rejected": -2.606908082962036, + "logps/chosen": -206.45486450195312, + "logps/rejected": -280.3768310546875, + "loss": 0.3101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5094765424728394, + "rewards/margins": 1.379276156425476, + "rewards/rejected": -1.888752818107605, + "step": 921 + }, + { + "epoch": 0.11, + "learning_rate": 2.7222287252721525e-07, + "logits/chosen": -2.468951940536499, + "logits/rejected": -2.682056188583374, + "logps/chosen": -173.07986450195312, + "logps/rejected": -204.8521270751953, + "loss": 0.4241, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5862125158309937, + "rewards/margins": 1.4682295322418213, + "rewards/rejected": -2.0544419288635254, + "step": 922 + }, + { + "epoch": 0.11, + "learning_rate": 2.72187756057591e-07, + "logits/chosen": -2.1948676109313965, + "logits/rejected": -2.2450318336486816, + "logps/chosen": -373.1836853027344, + "logps/rejected": -230.53964233398438, + "loss": 0.5687, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3812295198440552, + "rewards/margins": 0.6769253015518188, + "rewards/rejected": -2.058154821395874, + "step": 923 + }, + { + "epoch": 0.11, + "learning_rate": 2.7215263958796676e-07, + "logits/chosen": -2.478496551513672, + "logits/rejected": -2.1888155937194824, + "logps/chosen": -216.64474487304688, + "logps/rejected": -251.8866729736328, + "loss": 0.53, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.009973406791687, + "rewards/margins": 1.0626217126846313, + "rewards/rejected": -2.0725951194763184, + "step": 924 + }, + { + "epoch": 0.11, + "learning_rate": 2.7211752311834246e-07, + "logits/chosen": -2.225806474685669, + "logits/rejected": -2.502622365951538, + "logps/chosen": -173.7615966796875, + "logps/rejected": -210.58456420898438, + "loss": 0.4407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.180751085281372, + "rewards/margins": 1.0715283155441284, + "rewards/rejected": -2.25227952003479, + "step": 925 + }, + { + "epoch": 0.11, + "learning_rate": 2.7208240664871827e-07, + "logits/chosen": -2.00040864944458, + "logits/rejected": -2.0519180297851562, + "logps/chosen": -325.2917785644531, + "logps/rejected": -312.35833740234375, + "loss": 0.699, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4832170307636261, + "rewards/margins": 1.1033408641815186, + "rewards/rejected": -1.5865578651428223, + "step": 926 + }, + { + "epoch": 0.11, + "learning_rate": 2.7204729017909397e-07, + "logits/chosen": -2.2175116539001465, + "logits/rejected": -2.4330599308013916, + "logps/chosen": -348.10418701171875, + "logps/rejected": -163.45779418945312, + "loss": 0.6426, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3186194896697998, + "rewards/margins": 0.6753276586532593, + "rewards/rejected": -0.9939471483230591, + "step": 927 + }, + { + "epoch": 0.11, + "learning_rate": 2.720121737094697e-07, + "logits/chosen": -2.230579376220703, + "logits/rejected": -2.4541306495666504, + "logps/chosen": -361.39501953125, + "logps/rejected": -170.25912475585938, + "loss": 0.8791, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.807267963886261, + "rewards/margins": 0.2671632170677185, + "rewards/rejected": -1.07443106174469, + "step": 928 + }, + { + "epoch": 0.11, + "learning_rate": 2.719770572398455e-07, + "logits/chosen": -2.3988020420074463, + "logits/rejected": -2.4642562866210938, + "logps/chosen": -274.8864440917969, + "logps/rejected": -275.232421875, + "loss": 0.2945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7664299011230469, + "rewards/margins": 2.097182512283325, + "rewards/rejected": -2.863612413406372, + "step": 929 + }, + { + "epoch": 0.11, + "learning_rate": 2.7194194077022123e-07, + "logits/chosen": -2.5836448669433594, + "logits/rejected": -2.4034271240234375, + "logps/chosen": -367.62860107421875, + "logps/rejected": -202.362060546875, + "loss": 0.4799, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1583101451396942, + "rewards/margins": 1.0544875860214233, + "rewards/rejected": -0.8961775302886963, + "step": 930 + }, + { + "epoch": 0.11, + "learning_rate": 2.71906824300597e-07, + "logits/chosen": -2.1454691886901855, + "logits/rejected": -2.0927884578704834, + "logps/chosen": -348.0225830078125, + "logps/rejected": -308.8337097167969, + "loss": 0.4722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5471814870834351, + "rewards/margins": 1.0332090854644775, + "rewards/rejected": -1.5803905725479126, + "step": 931 + }, + { + "epoch": 0.11, + "learning_rate": 2.7187170783097274e-07, + "logits/chosen": -2.071117877960205, + "logits/rejected": -2.369755744934082, + "logps/chosen": -371.4228515625, + "logps/rejected": -253.86639404296875, + "loss": 0.4515, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6557203531265259, + "rewards/margins": 1.2691633701324463, + "rewards/rejected": -1.9248838424682617, + "step": 932 + }, + { + "epoch": 0.11, + "learning_rate": 2.7183659136134844e-07, + "logits/chosen": -2.6217257976531982, + "logits/rejected": -2.7141366004943848, + "logps/chosen": -210.74627685546875, + "logps/rejected": -170.72317504882812, + "loss": 0.3674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7493433952331543, + "rewards/margins": 1.4246512651443481, + "rewards/rejected": -2.173994779586792, + "step": 933 + }, + { + "epoch": 0.11, + "learning_rate": 2.718014748917242e-07, + "logits/chosen": -2.0822627544403076, + "logits/rejected": -2.223673105239868, + "logps/chosen": -379.9629211425781, + "logps/rejected": -295.5174560546875, + "loss": 0.6787, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1290429830551147, + "rewards/margins": 0.4445805549621582, + "rewards/rejected": -1.573623538017273, + "step": 934 + }, + { + "epoch": 0.11, + "learning_rate": 2.7176635842209994e-07, + "logits/chosen": -2.8600969314575195, + "logits/rejected": -2.8231778144836426, + "logps/chosen": -185.5566864013672, + "logps/rejected": -157.08026123046875, + "loss": 0.5296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6809967756271362, + "rewards/margins": 1.4146416187286377, + "rewards/rejected": -2.0956385135650635, + "step": 935 + }, + { + "epoch": 0.11, + "learning_rate": 2.717312419524757e-07, + "logits/chosen": -2.6273365020751953, + "logits/rejected": -2.5254313945770264, + "logps/chosen": -274.6302490234375, + "logps/rejected": -238.43116760253906, + "loss": 0.5351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5794265270233154, + "rewards/margins": 0.9293777942657471, + "rewards/rejected": -1.508804202079773, + "step": 936 + }, + { + "epoch": 0.11, + "learning_rate": 2.7169612548285145e-07, + "logits/chosen": -1.9871513843536377, + "logits/rejected": -2.0378079414367676, + "logps/chosen": -419.7740783691406, + "logps/rejected": -400.65069580078125, + "loss": 0.4314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.826262354850769, + "rewards/margins": 1.6988157033920288, + "rewards/rejected": -2.525078296661377, + "step": 937 + }, + { + "epoch": 0.11, + "learning_rate": 2.716610090132272e-07, + "logits/chosen": -2.136427879333496, + "logits/rejected": -2.328068494796753, + "logps/chosen": -282.0058288574219, + "logps/rejected": -188.35641479492188, + "loss": 1.0473, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4036636352539062, + "rewards/margins": 0.3556753396987915, + "rewards/rejected": -1.7593390941619873, + "step": 938 + }, + { + "epoch": 0.11, + "learning_rate": 2.7162589254360296e-07, + "logits/chosen": -2.6282224655151367, + "logits/rejected": -2.6513731479644775, + "logps/chosen": -304.37322998046875, + "logps/rejected": -323.38818359375, + "loss": 0.6981, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6181282997131348, + "rewards/margins": 0.47098174691200256, + "rewards/rejected": -2.0891098976135254, + "step": 939 + }, + { + "epoch": 0.11, + "learning_rate": 2.715907760739787e-07, + "logits/chosen": -2.5294463634490967, + "logits/rejected": -2.409975528717041, + "logps/chosen": -295.607666015625, + "logps/rejected": -281.0257568359375, + "loss": 0.5346, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9592226147651672, + "rewards/margins": 0.8684044480323792, + "rewards/rejected": -1.8276269435882568, + "step": 940 + }, + { + "epoch": 0.11, + "learning_rate": 2.715556596043544e-07, + "logits/chosen": -2.5428879261016846, + "logits/rejected": -2.581329584121704, + "logps/chosen": -329.79547119140625, + "logps/rejected": -266.5238342285156, + "loss": 0.2022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5944668650627136, + "rewards/margins": 1.7819006443023682, + "rewards/rejected": -2.3763675689697266, + "step": 941 + }, + { + "epoch": 0.11, + "learning_rate": 2.7152054313473017e-07, + "logits/chosen": -1.6428226232528687, + "logits/rejected": -1.787173867225647, + "logps/chosen": -247.6787567138672, + "logps/rejected": -319.8815612792969, + "loss": 1.0565, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2835228443145752, + "rewards/margins": -0.33473485708236694, + "rewards/rejected": -0.9487881064414978, + "step": 942 + }, + { + "epoch": 0.11, + "learning_rate": 2.714854266651059e-07, + "logits/chosen": -2.2968833446502686, + "logits/rejected": -2.527090072631836, + "logps/chosen": -269.6842041015625, + "logps/rejected": -370.9697265625, + "loss": 0.4804, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1120758056640625, + "rewards/margins": 1.1229451894760132, + "rewards/rejected": -2.2350211143493652, + "step": 943 + }, + { + "epoch": 0.11, + "learning_rate": 2.714503101954817e-07, + "logits/chosen": -2.555269241333008, + "logits/rejected": -2.5393314361572266, + "logps/chosen": -234.35797119140625, + "logps/rejected": -268.0994567871094, + "loss": 0.5077, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1491222381591797, + "rewards/margins": 1.3216155767440796, + "rewards/rejected": -2.470737934112549, + "step": 944 + }, + { + "epoch": 0.11, + "learning_rate": 2.7141519372585743e-07, + "logits/chosen": -2.2538650035858154, + "logits/rejected": -2.3561336994171143, + "logps/chosen": -322.373291015625, + "logps/rejected": -211.6267852783203, + "loss": 0.6313, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42692890763282776, + "rewards/margins": 1.0836691856384277, + "rewards/rejected": -1.510598063468933, + "step": 945 + }, + { + "epoch": 0.11, + "learning_rate": 2.7138007725623313e-07, + "logits/chosen": -2.5338010787963867, + "logits/rejected": -2.619286298751831, + "logps/chosen": -274.85638427734375, + "logps/rejected": -245.77793884277344, + "loss": 0.4709, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36393654346466064, + "rewards/margins": 0.7486838102340698, + "rewards/rejected": -1.1126203536987305, + "step": 946 + }, + { + "epoch": 0.11, + "learning_rate": 2.7134496078660894e-07, + "logits/chosen": -2.5363874435424805, + "logits/rejected": -2.599087715148926, + "logps/chosen": -77.00553131103516, + "logps/rejected": -107.08541870117188, + "loss": 0.518, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5944126844406128, + "rewards/margins": 1.2427858114242554, + "rewards/rejected": -1.8371986150741577, + "step": 947 + }, + { + "epoch": 0.11, + "learning_rate": 2.7130984431698464e-07, + "logits/chosen": -2.176398992538452, + "logits/rejected": -1.8976421356201172, + "logps/chosen": -197.55776977539062, + "logps/rejected": -355.3824462890625, + "loss": 0.2714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19455352425575256, + "rewards/margins": 1.707262635231018, + "rewards/rejected": -1.9018160104751587, + "step": 948 + }, + { + "epoch": 0.11, + "learning_rate": 2.712747278473604e-07, + "logits/chosen": -1.8871300220489502, + "logits/rejected": -2.0636422634124756, + "logps/chosen": -350.1375427246094, + "logps/rejected": -281.299560546875, + "loss": 0.5422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9589203596115112, + "rewards/margins": 0.6957119703292847, + "rewards/rejected": -1.6546324491500854, + "step": 949 + }, + { + "epoch": 0.11, + "learning_rate": 2.7123961137773615e-07, + "logits/chosen": -2.4599788188934326, + "logits/rejected": -2.1677660942077637, + "logps/chosen": -168.51527404785156, + "logps/rejected": -382.31597900390625, + "loss": 0.3789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8122637271881104, + "rewards/margins": 2.1092824935913086, + "rewards/rejected": -2.921546220779419, + "step": 950 + }, + { + "epoch": 0.11, + "learning_rate": 2.712044949081119e-07, + "logits/chosen": -2.160365104675293, + "logits/rejected": -2.540207862854004, + "logps/chosen": -346.0041198730469, + "logps/rejected": -229.1148223876953, + "loss": 0.4278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7193974256515503, + "rewards/margins": 0.9572546482086182, + "rewards/rejected": -1.6766520738601685, + "step": 951 + }, + { + "epoch": 0.11, + "learning_rate": 2.7116937843848765e-07, + "logits/chosen": -2.370619297027588, + "logits/rejected": -2.1698975563049316, + "logps/chosen": -171.29270935058594, + "logps/rejected": -243.1653594970703, + "loss": 0.3484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4228106439113617, + "rewards/margins": 1.2561781406402588, + "rewards/rejected": -1.6789888143539429, + "step": 952 + }, + { + "epoch": 0.11, + "learning_rate": 2.711342619688634e-07, + "logits/chosen": -2.721609115600586, + "logits/rejected": -2.620137929916382, + "logps/chosen": -314.390625, + "logps/rejected": -398.5726623535156, + "loss": 0.4856, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1061995029449463, + "rewards/margins": 1.9914754629135132, + "rewards/rejected": -3.09767484664917, + "step": 953 + }, + { + "epoch": 0.11, + "learning_rate": 2.710991454992391e-07, + "logits/chosen": -2.3468003273010254, + "logits/rejected": -2.559495449066162, + "logps/chosen": -421.74273681640625, + "logps/rejected": -374.1211242675781, + "loss": 0.5609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6081152558326721, + "rewards/margins": 1.2828059196472168, + "rewards/rejected": -1.8909212350845337, + "step": 954 + }, + { + "epoch": 0.11, + "learning_rate": 2.7106402902961486e-07, + "logits/chosen": -2.505345344543457, + "logits/rejected": -2.5174062252044678, + "logps/chosen": -225.994384765625, + "logps/rejected": -237.42904663085938, + "loss": 0.3217, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28201767802238464, + "rewards/margins": 1.7001006603240967, + "rewards/rejected": -1.9821182489395142, + "step": 955 + }, + { + "epoch": 0.11, + "learning_rate": 2.710289125599906e-07, + "logits/chosen": -2.3664560317993164, + "logits/rejected": -2.3963632583618164, + "logps/chosen": -419.48681640625, + "logps/rejected": -278.7709655761719, + "loss": 0.4898, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6370401978492737, + "rewards/margins": 1.2027740478515625, + "rewards/rejected": -1.8398141860961914, + "step": 956 + }, + { + "epoch": 0.11, + "learning_rate": 2.7099379609036637e-07, + "logits/chosen": -2.4689505100250244, + "logits/rejected": -2.5404486656188965, + "logps/chosen": -215.55833435058594, + "logps/rejected": -332.5155944824219, + "loss": 1.0749, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.702026605606079, + "rewards/margins": 0.27647706866264343, + "rewards/rejected": -1.9785034656524658, + "step": 957 + }, + { + "epoch": 0.11, + "learning_rate": 2.709586796207421e-07, + "logits/chosen": -2.967115879058838, + "logits/rejected": -2.9637362957000732, + "logps/chosen": -434.0400695800781, + "logps/rejected": -331.07080078125, + "loss": 0.2564, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.747287392616272, + "rewards/margins": 2.2556374073028564, + "rewards/rejected": -3.002924680709839, + "step": 958 + }, + { + "epoch": 0.11, + "learning_rate": 2.709235631511178e-07, + "logits/chosen": -2.433946132659912, + "logits/rejected": -2.561072587966919, + "logps/chosen": -399.8831481933594, + "logps/rejected": -306.04290771484375, + "loss": 0.9076, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6465319395065308, + "rewards/margins": 0.5698177814483643, + "rewards/rejected": -2.2163496017456055, + "step": 959 + }, + { + "epoch": 0.11, + "learning_rate": 2.7088844668149363e-07, + "logits/chosen": -2.434425115585327, + "logits/rejected": -2.4489660263061523, + "logps/chosen": -269.5184326171875, + "logps/rejected": -237.9357452392578, + "loss": 0.6479, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2825847864151, + "rewards/margins": 0.8791796565055847, + "rewards/rejected": -2.16176438331604, + "step": 960 + }, + { + "epoch": 0.11, + "learning_rate": 2.708533302118694e-07, + "logits/chosen": -2.5488085746765137, + "logits/rejected": -2.7873733043670654, + "logps/chosen": -274.0318908691406, + "logps/rejected": -209.13372802734375, + "loss": 0.5158, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.769730269908905, + "rewards/margins": 1.3191614151000977, + "rewards/rejected": -2.0888919830322266, + "step": 961 + }, + { + "epoch": 0.11, + "learning_rate": 2.708182137422451e-07, + "logits/chosen": -2.0698444843292236, + "logits/rejected": -2.065962314605713, + "logps/chosen": -182.09768676757812, + "logps/rejected": -252.61068725585938, + "loss": 0.5005, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4298326373100281, + "rewards/margins": 1.054572343826294, + "rewards/rejected": -1.4844050407409668, + "step": 962 + }, + { + "epoch": 0.11, + "learning_rate": 2.7078309727262084e-07, + "logits/chosen": -2.7531023025512695, + "logits/rejected": -2.5286736488342285, + "logps/chosen": -299.5811462402344, + "logps/rejected": -299.72967529296875, + "loss": 0.4208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4270227253437042, + "rewards/margins": 1.5349041223526, + "rewards/rejected": -1.9619269371032715, + "step": 963 + }, + { + "epoch": 0.11, + "learning_rate": 2.707479808029966e-07, + "logits/chosen": -2.7323315143585205, + "logits/rejected": -2.701444625854492, + "logps/chosen": -249.11666870117188, + "logps/rejected": -253.195068359375, + "loss": 0.7116, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.856501579284668, + "rewards/margins": 0.1604997217655182, + "rewards/rejected": -1.0170012712478638, + "step": 964 + }, + { + "epoch": 0.11, + "learning_rate": 2.7071286433337235e-07, + "logits/chosen": -2.6479439735412598, + "logits/rejected": -2.796797037124634, + "logps/chosen": -297.4308166503906, + "logps/rejected": -272.0747985839844, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.500999927520752, + "rewards/margins": 1.5551762580871582, + "rewards/rejected": -2.05617618560791, + "step": 965 + }, + { + "epoch": 0.11, + "learning_rate": 2.706777478637481e-07, + "logits/chosen": -2.563933849334717, + "logits/rejected": -2.7866501808166504, + "logps/chosen": -185.534912109375, + "logps/rejected": -189.98281860351562, + "loss": 0.7172, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2219371795654297, + "rewards/margins": 0.7938193678855896, + "rewards/rejected": -2.015756368637085, + "step": 966 + }, + { + "epoch": 0.11, + "learning_rate": 2.706426313941238e-07, + "logits/chosen": -2.657012462615967, + "logits/rejected": -2.760840654373169, + "logps/chosen": -373.43902587890625, + "logps/rejected": -283.9833984375, + "loss": 0.5397, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6999067664146423, + "rewards/margins": 0.7358461618423462, + "rewards/rejected": -1.4357528686523438, + "step": 967 + }, + { + "epoch": 0.11, + "learning_rate": 2.7060751492449956e-07, + "logits/chosen": -2.327357769012451, + "logits/rejected": -2.3075995445251465, + "logps/chosen": -245.2423095703125, + "logps/rejected": -241.73719787597656, + "loss": 0.3313, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0908496230840683, + "rewards/margins": 1.3139057159423828, + "rewards/rejected": -1.2230560779571533, + "step": 968 + }, + { + "epoch": 0.11, + "learning_rate": 2.7057239845487536e-07, + "logits/chosen": -2.5529866218566895, + "logits/rejected": -2.5059776306152344, + "logps/chosen": -470.5398864746094, + "logps/rejected": -416.46978759765625, + "loss": 0.2787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6749900579452515, + "rewards/margins": 2.09826922416687, + "rewards/rejected": -2.773259162902832, + "step": 969 + }, + { + "epoch": 0.11, + "learning_rate": 2.7053728198525106e-07, + "logits/chosen": -1.9696028232574463, + "logits/rejected": -2.304751396179199, + "logps/chosen": -360.45306396484375, + "logps/rejected": -238.67868041992188, + "loss": 0.6297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0107792615890503, + "rewards/margins": 1.0564570426940918, + "rewards/rejected": -2.0672364234924316, + "step": 970 + }, + { + "epoch": 0.11, + "learning_rate": 2.705021655156268e-07, + "logits/chosen": -2.1763908863067627, + "logits/rejected": -1.6474803686141968, + "logps/chosen": -360.3812255859375, + "logps/rejected": -435.92950439453125, + "loss": 0.337, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36763447523117065, + "rewards/margins": 2.534559965133667, + "rewards/rejected": -2.9021944999694824, + "step": 971 + }, + { + "epoch": 0.11, + "learning_rate": 2.7046704904600257e-07, + "logits/chosen": -2.8358325958251953, + "logits/rejected": -2.7325122356414795, + "logps/chosen": -138.66368103027344, + "logps/rejected": -151.8352508544922, + "loss": 0.8875, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1844799518585205, + "rewards/margins": -0.1047249287366867, + "rewards/rejected": -1.0797550678253174, + "step": 972 + }, + { + "epoch": 0.11, + "learning_rate": 2.7043193257637833e-07, + "logits/chosen": -2.686386823654175, + "logits/rejected": -2.7146177291870117, + "logps/chosen": -385.75531005859375, + "logps/rejected": -190.22117614746094, + "loss": 0.5462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8976299166679382, + "rewards/margins": 1.3266552686691284, + "rewards/rejected": -2.224285125732422, + "step": 973 + }, + { + "epoch": 0.11, + "learning_rate": 2.703968161067541e-07, + "logits/chosen": -2.4015121459960938, + "logits/rejected": -2.3522114753723145, + "logps/chosen": -164.2830352783203, + "logps/rejected": -259.4168395996094, + "loss": 0.3602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5948524475097656, + "rewards/margins": 0.9889205098152161, + "rewards/rejected": -1.5837730169296265, + "step": 974 + }, + { + "epoch": 0.11, + "learning_rate": 2.703616996371298e-07, + "logits/chosen": -2.9929914474487305, + "logits/rejected": -3.0092570781707764, + "logps/chosen": -119.12648010253906, + "logps/rejected": -194.90118408203125, + "loss": 0.2075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11725424975156784, + "rewards/margins": 2.2706706523895264, + "rewards/rejected": -2.3879246711730957, + "step": 975 + }, + { + "epoch": 0.11, + "learning_rate": 2.7032658316750554e-07, + "logits/chosen": -2.1527597904205322, + "logits/rejected": -1.8320688009262085, + "logps/chosen": -148.2198486328125, + "logps/rejected": -223.47349548339844, + "loss": 0.3671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9650161266326904, + "rewards/margins": 1.104918360710144, + "rewards/rejected": -2.069934368133545, + "step": 976 + }, + { + "epoch": 0.11, + "learning_rate": 2.702914666978813e-07, + "logits/chosen": -2.3601245880126953, + "logits/rejected": -2.265385866165161, + "logps/chosen": -221.90261840820312, + "logps/rejected": -193.5629119873047, + "loss": 0.2444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017338473349809647, + "rewards/margins": 1.8554553985595703, + "rewards/rejected": -1.8727937936782837, + "step": 977 + }, + { + "epoch": 0.11, + "learning_rate": 2.7025635022825704e-07, + "logits/chosen": -2.4110865592956543, + "logits/rejected": -2.488083600997925, + "logps/chosen": -206.53152465820312, + "logps/rejected": -223.50320434570312, + "loss": 0.5401, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6350420713424683, + "rewards/margins": 1.5202207565307617, + "rewards/rejected": -2.1552629470825195, + "step": 978 + }, + { + "epoch": 0.11, + "learning_rate": 2.702212337586328e-07, + "logits/chosen": -2.404365301132202, + "logits/rejected": -2.4279592037200928, + "logps/chosen": -224.13333129882812, + "logps/rejected": -311.0357971191406, + "loss": 0.4826, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7509675025939941, + "rewards/margins": 1.3927792310714722, + "rewards/rejected": -2.143746852874756, + "step": 979 + }, + { + "epoch": 0.11, + "learning_rate": 2.701861172890085e-07, + "logits/chosen": -2.404512405395508, + "logits/rejected": -2.5201616287231445, + "logps/chosen": -394.2721862792969, + "logps/rejected": -224.41748046875, + "loss": 0.4303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21923017501831055, + "rewards/margins": 1.2496510744094849, + "rewards/rejected": -1.4688812494277954, + "step": 980 + }, + { + "epoch": 0.11, + "learning_rate": 2.701510008193843e-07, + "logits/chosen": -2.354667901992798, + "logits/rejected": -2.203829288482666, + "logps/chosen": -344.77813720703125, + "logps/rejected": -522.9458618164062, + "loss": 0.8333, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9103546142578125, + "rewards/margins": 1.6609307527542114, + "rewards/rejected": -2.5712852478027344, + "step": 981 + }, + { + "epoch": 0.11, + "learning_rate": 2.7011588434976006e-07, + "logits/chosen": -2.297982931137085, + "logits/rejected": -2.045866012573242, + "logps/chosen": -221.89959716796875, + "logps/rejected": -381.69146728515625, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16302134096622467, + "rewards/margins": 1.5559425354003906, + "rewards/rejected": -1.718963861465454, + "step": 982 + }, + { + "epoch": 0.11, + "learning_rate": 2.7008076788013576e-07, + "logits/chosen": -2.641288995742798, + "logits/rejected": -2.7093191146850586, + "logps/chosen": -242.36294555664062, + "logps/rejected": -277.1359558105469, + "loss": 0.6175, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9580966234207153, + "rewards/margins": 2.033155918121338, + "rewards/rejected": -2.9912524223327637, + "step": 983 + }, + { + "epoch": 0.11, + "learning_rate": 2.700456514105115e-07, + "logits/chosen": -2.358659267425537, + "logits/rejected": -2.554124593734741, + "logps/chosen": -327.4410095214844, + "logps/rejected": -301.54632568359375, + "loss": 1.5916, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.1169614791870117, + "rewards/margins": -0.41911691427230835, + "rewards/rejected": -1.6978445053100586, + "step": 984 + }, + { + "epoch": 0.11, + "learning_rate": 2.7001053494088727e-07, + "logits/chosen": -2.765486001968384, + "logits/rejected": -2.591902494430542, + "logps/chosen": -244.45492553710938, + "logps/rejected": -414.72613525390625, + "loss": 0.4136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3749640882015228, + "rewards/margins": 1.8054795265197754, + "rewards/rejected": -2.18044376373291, + "step": 985 + }, + { + "epoch": 0.11, + "learning_rate": 2.69975418471263e-07, + "logits/chosen": -2.535788059234619, + "logits/rejected": -2.5502243041992188, + "logps/chosen": -139.384033203125, + "logps/rejected": -123.85874938964844, + "loss": 0.7701, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5365564823150635, + "rewards/margins": 0.5065892338752747, + "rewards/rejected": -2.0431456565856934, + "step": 986 + }, + { + "epoch": 0.11, + "learning_rate": 2.699403020016388e-07, + "logits/chosen": -2.7618892192840576, + "logits/rejected": -2.416962146759033, + "logps/chosen": -117.97174072265625, + "logps/rejected": -279.87738037109375, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8874086141586304, + "rewards/margins": 1.5914158821105957, + "rewards/rejected": -2.4788246154785156, + "step": 987 + }, + { + "epoch": 0.11, + "learning_rate": 2.699051855320145e-07, + "logits/chosen": -2.9402472972869873, + "logits/rejected": -2.951664447784424, + "logps/chosen": -181.27223205566406, + "logps/rejected": -395.41033935546875, + "loss": 0.2722, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05722580477595329, + "rewards/margins": 2.3287999629974365, + "rewards/rejected": -2.386025905609131, + "step": 988 + }, + { + "epoch": 0.11, + "learning_rate": 2.6987006906239023e-07, + "logits/chosen": -2.1631314754486084, + "logits/rejected": -1.9699821472167969, + "logps/chosen": -261.3062744140625, + "logps/rejected": -299.01458740234375, + "loss": 0.405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7265499234199524, + "rewards/margins": 1.43723726272583, + "rewards/rejected": -2.1637871265411377, + "step": 989 + }, + { + "epoch": 0.11, + "learning_rate": 2.69834952592766e-07, + "logits/chosen": -2.1631550788879395, + "logits/rejected": -2.131283760070801, + "logps/chosen": -222.89031982421875, + "logps/rejected": -272.7859191894531, + "loss": 0.437, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1714615672826767, + "rewards/margins": 1.333417534828186, + "rewards/rejected": -1.504879117012024, + "step": 990 + }, + { + "epoch": 0.11, + "learning_rate": 2.6979983612314174e-07, + "logits/chosen": -1.7269456386566162, + "logits/rejected": -1.770137071609497, + "logps/chosen": -360.5963134765625, + "logps/rejected": -386.2558288574219, + "loss": 0.5092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7052583694458008, + "rewards/margins": 1.2435380220413208, + "rewards/rejected": -1.948796272277832, + "step": 991 + }, + { + "epoch": 0.11, + "learning_rate": 2.697647196535175e-07, + "logits/chosen": -2.922337055206299, + "logits/rejected": -2.6615588665008545, + "logps/chosen": -184.51382446289062, + "logps/rejected": -197.66519165039062, + "loss": 0.6169, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7746655941009521, + "rewards/margins": 0.8469738960266113, + "rewards/rejected": -1.621639609336853, + "step": 992 + }, + { + "epoch": 0.11, + "learning_rate": 2.697296031838932e-07, + "logits/chosen": -2.6192235946655273, + "logits/rejected": -2.55705189704895, + "logps/chosen": -172.16390991210938, + "logps/rejected": -237.51202392578125, + "loss": 0.3739, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5009778141975403, + "rewards/margins": 1.3841487169265747, + "rewards/rejected": -1.8851264715194702, + "step": 993 + }, + { + "epoch": 0.11, + "learning_rate": 2.69694486714269e-07, + "logits/chosen": -1.7882287502288818, + "logits/rejected": -1.792926549911499, + "logps/chosen": -248.8741455078125, + "logps/rejected": -292.991943359375, + "loss": 0.5596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.571047306060791, + "rewards/margins": 0.9265041351318359, + "rewards/rejected": -1.497551441192627, + "step": 994 + }, + { + "epoch": 0.11, + "learning_rate": 2.6965937024464475e-07, + "logits/chosen": -1.6532566547393799, + "logits/rejected": -1.4072068929672241, + "logps/chosen": -277.1990966796875, + "logps/rejected": -342.3555908203125, + "loss": 0.5327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39219874143600464, + "rewards/margins": 0.9276809692382812, + "rewards/rejected": -1.3198797702789307, + "step": 995 + }, + { + "epoch": 0.11, + "learning_rate": 2.6962425377502045e-07, + "logits/chosen": -1.744736909866333, + "logits/rejected": -2.172680139541626, + "logps/chosen": -387.4107666015625, + "logps/rejected": -267.56011962890625, + "loss": 0.8005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7300555109977722, + "rewards/margins": 1.8519232273101807, + "rewards/rejected": -2.5819787979125977, + "step": 996 + }, + { + "epoch": 0.11, + "learning_rate": 2.695891373053962e-07, + "logits/chosen": -2.57588529586792, + "logits/rejected": -2.5543265342712402, + "logps/chosen": -217.57058715820312, + "logps/rejected": -186.479736328125, + "loss": 0.4014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7625077962875366, + "rewards/margins": 1.0481702089309692, + "rewards/rejected": -1.8106780052185059, + "step": 997 + }, + { + "epoch": 0.12, + "learning_rate": 2.6955402083577196e-07, + "logits/chosen": -2.34381365776062, + "logits/rejected": -2.4039292335510254, + "logps/chosen": -357.3063049316406, + "logps/rejected": -356.645263671875, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7693930864334106, + "rewards/margins": 1.4673388004302979, + "rewards/rejected": -2.236731767654419, + "step": 998 + }, + { + "epoch": 0.12, + "learning_rate": 2.695189043661477e-07, + "logits/chosen": -2.943291664123535, + "logits/rejected": -2.943648099899292, + "logps/chosen": -249.21005249023438, + "logps/rejected": -228.87081909179688, + "loss": 0.4221, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9135041832923889, + "rewards/margins": 1.9776610136032104, + "rewards/rejected": -2.891165256500244, + "step": 999 + }, + { + "epoch": 0.12, + "learning_rate": 2.6948378789652347e-07, + "logits/chosen": -2.407505989074707, + "logits/rejected": -2.277204990386963, + "logps/chosen": -283.3604736328125, + "logps/rejected": -298.1053466796875, + "loss": 0.7412, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5865579843521118, + "rewards/margins": 0.4454551339149475, + "rewards/rejected": -1.032013177871704, + "step": 1000 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -1.706773281097412, + "eval_logits/rejected": -1.586951732635498, + "eval_logps/chosen": -297.0463562011719, + "eval_logps/rejected": -263.6312255859375, + "eval_loss": 0.39179232716560364, + "eval_rewards/accuracies": 0.8285714387893677, + "eval_rewards/chosen": -0.43661564588546753, + "eval_rewards/margins": 1.40776789188385, + "eval_rewards/rejected": -1.844383716583252, + "eval_runtime": 24.2763, + "eval_samples_per_second": 2.883, + "eval_steps_per_second": 1.442, + "step": 1000 + }, + { + "epoch": 0.12, + "learning_rate": 2.6944867142689917e-07, + "logits/chosen": -2.6871256828308105, + "logits/rejected": -2.3754889965057373, + "logps/chosen": -267.60858154296875, + "logps/rejected": -311.12420654296875, + "loss": 0.1524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44397664070129395, + "rewards/margins": 2.714846611022949, + "rewards/rejected": -3.158823013305664, + "step": 1001 + }, + { + "epoch": 0.12, + "learning_rate": 2.694135549572749e-07, + "logits/chosen": -2.23483943939209, + "logits/rejected": -2.363715887069702, + "logps/chosen": -240.57737731933594, + "logps/rejected": -205.1536407470703, + "loss": 0.5205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6841340065002441, + "rewards/margins": 0.5696833729743958, + "rewards/rejected": -1.2538173198699951, + "step": 1002 + }, + { + "epoch": 0.12, + "learning_rate": 2.6937843848765073e-07, + "logits/chosen": -2.3711965084075928, + "logits/rejected": -2.254995822906494, + "logps/chosen": -198.3154296875, + "logps/rejected": -182.12042236328125, + "loss": 0.5264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.740746259689331, + "rewards/margins": 0.6946970224380493, + "rewards/rejected": -1.4354431629180908, + "step": 1003 + }, + { + "epoch": 0.12, + "learning_rate": 2.6934332201802643e-07, + "logits/chosen": -2.5747108459472656, + "logits/rejected": -2.6077847480773926, + "logps/chosen": -453.2989501953125, + "logps/rejected": -334.4549255371094, + "loss": 0.7226, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.352397620677948, + "rewards/margins": 1.1097991466522217, + "rewards/rejected": -1.4621968269348145, + "step": 1004 + }, + { + "epoch": 0.12, + "learning_rate": 2.693082055484022e-07, + "logits/chosen": -2.425201892852783, + "logits/rejected": -2.322390556335449, + "logps/chosen": -275.50592041015625, + "logps/rejected": -353.54296875, + "loss": 0.3391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5550990700721741, + "rewards/margins": 1.8208730220794678, + "rewards/rejected": -2.375972032546997, + "step": 1005 + }, + { + "epoch": 0.12, + "learning_rate": 2.6927308907877794e-07, + "logits/chosen": -1.8915590047836304, + "logits/rejected": -2.051867961883545, + "logps/chosen": -327.2327880859375, + "logps/rejected": -371.226806640625, + "loss": 0.4685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3261854946613312, + "rewards/margins": 1.137204647064209, + "rewards/rejected": -1.4633901119232178, + "step": 1006 + }, + { + "epoch": 0.12, + "learning_rate": 2.692379726091537e-07, + "logits/chosen": -2.3066153526306152, + "logits/rejected": -2.1800341606140137, + "logps/chosen": -227.6781463623047, + "logps/rejected": -357.30413818359375, + "loss": 0.5407, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.028306007385254, + "rewards/margins": 0.8943095207214355, + "rewards/rejected": -1.9226155281066895, + "step": 1007 + }, + { + "epoch": 0.12, + "learning_rate": 2.6920285613952945e-07, + "logits/chosen": -2.3130042552948, + "logits/rejected": -2.3157193660736084, + "logps/chosen": -254.05789184570312, + "logps/rejected": -363.0810546875, + "loss": 0.7039, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8648236989974976, + "rewards/margins": 0.7897902131080627, + "rewards/rejected": -1.6546138525009155, + "step": 1008 + }, + { + "epoch": 0.12, + "learning_rate": 2.6916773966990515e-07, + "logits/chosen": -2.6028892993927, + "logits/rejected": -2.656743049621582, + "logps/chosen": -210.17141723632812, + "logps/rejected": -156.30728149414062, + "loss": 0.7389, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.061901330947876, + "rewards/margins": 0.9346155524253845, + "rewards/rejected": -1.9965167045593262, + "step": 1009 + }, + { + "epoch": 0.12, + "learning_rate": 2.691326232002809e-07, + "logits/chosen": -2.2770297527313232, + "logits/rejected": -2.6121740341186523, + "logps/chosen": -351.4130554199219, + "logps/rejected": -220.75790405273438, + "loss": 0.2835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3354475498199463, + "rewards/margins": 1.4621068239212036, + "rewards/rejected": -1.7975544929504395, + "step": 1010 + }, + { + "epoch": 0.12, + "learning_rate": 2.6909750673065666e-07, + "logits/chosen": -2.528183698654175, + "logits/rejected": -2.2840824127197266, + "logps/chosen": -162.0445098876953, + "logps/rejected": -262.6940002441406, + "loss": 0.6337, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9388628005981445, + "rewards/margins": 0.7780689001083374, + "rewards/rejected": -1.7169318199157715, + "step": 1011 + }, + { + "epoch": 0.12, + "learning_rate": 2.690623902610324e-07, + "logits/chosen": -2.2107248306274414, + "logits/rejected": -2.4987611770629883, + "logps/chosen": -372.55267333984375, + "logps/rejected": -307.487548828125, + "loss": 0.3314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7720730304718018, + "rewards/margins": 1.7115358114242554, + "rewards/rejected": -2.4836087226867676, + "step": 1012 + }, + { + "epoch": 0.12, + "learning_rate": 2.6902727379140816e-07, + "logits/chosen": -1.777363657951355, + "logits/rejected": -2.1002204418182373, + "logps/chosen": -423.5155944824219, + "logps/rejected": -414.0928955078125, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5853309631347656, + "rewards/margins": 2.6972789764404297, + "rewards/rejected": -3.2826099395751953, + "step": 1013 + }, + { + "epoch": 0.12, + "learning_rate": 2.689921573217839e-07, + "logits/chosen": -2.7763285636901855, + "logits/rejected": -2.7480902671813965, + "logps/chosen": -375.3049621582031, + "logps/rejected": -306.4765319824219, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3356649875640869, + "rewards/margins": 1.7054390907287598, + "rewards/rejected": -2.0411038398742676, + "step": 1014 + }, + { + "epoch": 0.12, + "learning_rate": 2.6895704085215967e-07, + "logits/chosen": -1.9156256914138794, + "logits/rejected": -2.1536409854888916, + "logps/chosen": -493.5736083984375, + "logps/rejected": -413.0483703613281, + "loss": 0.1904, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0819365531206131, + "rewards/margins": 2.669994831085205, + "rewards/rejected": -2.5880584716796875, + "step": 1015 + }, + { + "epoch": 0.12, + "learning_rate": 2.689219243825354e-07, + "logits/chosen": -2.9395086765289307, + "logits/rejected": -2.99713397026062, + "logps/chosen": -199.24435424804688, + "logps/rejected": -306.3936767578125, + "loss": 0.2908, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09225466847419739, + "rewards/margins": 1.8180997371673584, + "rewards/rejected": -1.9103543758392334, + "step": 1016 + }, + { + "epoch": 0.12, + "learning_rate": 2.688868079129111e-07, + "logits/chosen": -2.87416410446167, + "logits/rejected": -2.6064138412475586, + "logps/chosen": -144.43670654296875, + "logps/rejected": -235.53005981445312, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26715704798698425, + "rewards/margins": 4.5090131759643555, + "rewards/rejected": -4.77617073059082, + "step": 1017 + }, + { + "epoch": 0.12, + "learning_rate": 2.688516914432869e-07, + "logits/chosen": -2.6204092502593994, + "logits/rejected": -2.464315414428711, + "logps/chosen": -286.61614990234375, + "logps/rejected": -331.82427978515625, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8060864210128784, + "rewards/margins": 2.293489933013916, + "rewards/rejected": -3.099576473236084, + "step": 1018 + }, + { + "epoch": 0.12, + "learning_rate": 2.6881657497366263e-07, + "logits/chosen": -2.430706024169922, + "logits/rejected": -2.5046141147613525, + "logps/chosen": -217.3005828857422, + "logps/rejected": -182.5252227783203, + "loss": 0.5622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2062290608882904, + "rewards/margins": 1.3949934244155884, + "rewards/rejected": -1.6012225151062012, + "step": 1019 + }, + { + "epoch": 0.12, + "learning_rate": 2.687814585040384e-07, + "logits/chosen": -2.6023058891296387, + "logits/rejected": -2.872363567352295, + "logps/chosen": -279.0948486328125, + "logps/rejected": -302.6192626953125, + "loss": 0.6554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.743877649307251, + "rewards/margins": 1.3823484182357788, + "rewards/rejected": -2.1262259483337402, + "step": 1020 + }, + { + "epoch": 0.12, + "learning_rate": 2.6874634203441414e-07, + "logits/chosen": -2.025599718093872, + "logits/rejected": -2.0374763011932373, + "logps/chosen": -337.53668212890625, + "logps/rejected": -336.00177001953125, + "loss": 0.687, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9182639718055725, + "rewards/margins": 0.3300376236438751, + "rewards/rejected": -1.2483017444610596, + "step": 1021 + }, + { + "epoch": 0.12, + "learning_rate": 2.687112255647899e-07, + "logits/chosen": -2.1275901794433594, + "logits/rejected": -2.321676015853882, + "logps/chosen": -327.65460205078125, + "logps/rejected": -229.8554229736328, + "loss": 0.4914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38587239384651184, + "rewards/margins": 0.7625582218170166, + "rewards/rejected": -1.148430585861206, + "step": 1022 + }, + { + "epoch": 0.12, + "learning_rate": 2.686761090951656e-07, + "logits/chosen": -2.2439115047454834, + "logits/rejected": -2.337334156036377, + "logps/chosen": -525.90576171875, + "logps/rejected": -241.68295288085938, + "loss": 0.219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.041586846113204956, + "rewards/margins": 1.8241713047027588, + "rewards/rejected": -1.8657581806182861, + "step": 1023 + }, + { + "epoch": 0.12, + "learning_rate": 2.6864099262554135e-07, + "logits/chosen": -2.2134132385253906, + "logits/rejected": -2.6395180225372314, + "logps/chosen": -375.7790832519531, + "logps/rejected": -205.48130798339844, + "loss": 0.4883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20716039836406708, + "rewards/margins": 1.2818496227264404, + "rewards/rejected": -1.4890100955963135, + "step": 1024 + }, + { + "epoch": 0.12, + "learning_rate": 2.686058761559171e-07, + "logits/chosen": -2.1547775268554688, + "logits/rejected": -2.4516549110412598, + "logps/chosen": -381.07421875, + "logps/rejected": -364.1540222167969, + "loss": 0.4456, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2695935368537903, + "rewards/margins": 1.1800010204315186, + "rewards/rejected": -1.4495946168899536, + "step": 1025 + }, + { + "epoch": 0.12, + "learning_rate": 2.6857075968629286e-07, + "logits/chosen": -2.6901979446411133, + "logits/rejected": -2.6117262840270996, + "logps/chosen": -183.45664978027344, + "logps/rejected": -333.5751647949219, + "loss": 0.2547, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13763375580310822, + "rewards/margins": 2.8439114093780518, + "rewards/rejected": -2.70627760887146, + "step": 1026 + }, + { + "epoch": 0.12, + "learning_rate": 2.685356432166686e-07, + "logits/chosen": -2.251824378967285, + "logits/rejected": -2.698047399520874, + "logps/chosen": -243.45042419433594, + "logps/rejected": -159.98001098632812, + "loss": 0.3602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23628148436546326, + "rewards/margins": 1.701924204826355, + "rewards/rejected": -1.9382058382034302, + "step": 1027 + }, + { + "epoch": 0.12, + "learning_rate": 2.6850052674704437e-07, + "logits/chosen": -2.617802143096924, + "logits/rejected": -2.582911968231201, + "logps/chosen": -204.16416931152344, + "logps/rejected": -234.08023071289062, + "loss": 0.4719, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6898380517959595, + "rewards/margins": 1.3466852903366089, + "rewards/rejected": -2.0365233421325684, + "step": 1028 + }, + { + "epoch": 0.12, + "learning_rate": 2.684654102774201e-07, + "logits/chosen": -2.1491446495056152, + "logits/rejected": -2.352755069732666, + "logps/chosen": -259.12786865234375, + "logps/rejected": -147.84205627441406, + "loss": 0.7136, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5086299180984497, + "rewards/margins": 1.095106840133667, + "rewards/rejected": -1.6037367582321167, + "step": 1029 + }, + { + "epoch": 0.12, + "learning_rate": 2.6843029380779587e-07, + "logits/chosen": -2.448638916015625, + "logits/rejected": -2.5159413814544678, + "logps/chosen": -234.08935546875, + "logps/rejected": -203.89617919921875, + "loss": 0.5173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7251067161560059, + "rewards/margins": 0.8838956952095032, + "rewards/rejected": -1.6090024709701538, + "step": 1030 + }, + { + "epoch": 0.12, + "learning_rate": 2.683951773381716e-07, + "logits/chosen": -2.8132452964782715, + "logits/rejected": -2.5948848724365234, + "logps/chosen": -279.93646240234375, + "logps/rejected": -352.2978515625, + "loss": 0.4976, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3049826622009277, + "rewards/margins": 1.0105829238891602, + "rewards/rejected": -2.315565586090088, + "step": 1031 + }, + { + "epoch": 0.12, + "learning_rate": 2.6836006086854733e-07, + "logits/chosen": -2.1750237941741943, + "logits/rejected": -2.4542179107666016, + "logps/chosen": -169.78219604492188, + "logps/rejected": -196.59292602539062, + "loss": 1.0653, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6888020038604736, + "rewards/margins": 0.09021222591400146, + "rewards/rejected": -1.7790143489837646, + "step": 1032 + }, + { + "epoch": 0.12, + "learning_rate": 2.683249443989231e-07, + "logits/chosen": -1.9540486335754395, + "logits/rejected": -2.4818644523620605, + "logps/chosen": -407.60791015625, + "logps/rejected": -325.4987487792969, + "loss": 0.2123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6291710734367371, + "rewards/margins": 2.1598012447357178, + "rewards/rejected": -2.7889723777770996, + "step": 1033 + }, + { + "epoch": 0.12, + "learning_rate": 2.6828982792929884e-07, + "logits/chosen": -2.245558738708496, + "logits/rejected": -2.161445379257202, + "logps/chosen": -172.4700927734375, + "logps/rejected": -253.35821533203125, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7610654234886169, + "rewards/margins": 1.3159769773483276, + "rewards/rejected": -2.0770423412323, + "step": 1034 + }, + { + "epoch": 0.12, + "learning_rate": 2.682547114596746e-07, + "logits/chosen": -2.327101469039917, + "logits/rejected": -2.482271432876587, + "logps/chosen": -323.6680603027344, + "logps/rejected": -309.645263671875, + "loss": 0.4142, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2997632622718811, + "rewards/margins": 1.8667712211608887, + "rewards/rejected": -2.166534662246704, + "step": 1035 + }, + { + "epoch": 0.12, + "learning_rate": 2.682195949900503e-07, + "logits/chosen": -2.6922309398651123, + "logits/rejected": -2.638598680496216, + "logps/chosen": -257.26751708984375, + "logps/rejected": -281.7200622558594, + "loss": 0.2744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5774619579315186, + "rewards/margins": 2.6256351470947266, + "rewards/rejected": -3.203097105026245, + "step": 1036 + }, + { + "epoch": 0.12, + "learning_rate": 2.681844785204261e-07, + "logits/chosen": -2.0326144695281982, + "logits/rejected": -2.09716796875, + "logps/chosen": -354.40301513671875, + "logps/rejected": -261.86883544921875, + "loss": 0.9884, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4143896102905273, + "rewards/margins": 0.36585286259651184, + "rewards/rejected": -1.7802424430847168, + "step": 1037 + }, + { + "epoch": 0.12, + "learning_rate": 2.681493620508018e-07, + "logits/chosen": -3.0190978050231934, + "logits/rejected": -2.9918882846832275, + "logps/chosen": -213.11346435546875, + "logps/rejected": -212.6721649169922, + "loss": 0.3158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8972195386886597, + "rewards/margins": 2.005283832550049, + "rewards/rejected": -2.90250301361084, + "step": 1038 + }, + { + "epoch": 0.12, + "learning_rate": 2.6811424558117755e-07, + "logits/chosen": -2.818791627883911, + "logits/rejected": -2.5881409645080566, + "logps/chosen": -323.57403564453125, + "logps/rejected": -224.920654296875, + "loss": 0.6362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8922632932662964, + "rewards/margins": 0.6930540800094604, + "rewards/rejected": -1.5853173732757568, + "step": 1039 + }, + { + "epoch": 0.12, + "learning_rate": 2.680791291115533e-07, + "logits/chosen": -2.5384116172790527, + "logits/rejected": -2.5360124111175537, + "logps/chosen": -311.36163330078125, + "logps/rejected": -216.65199279785156, + "loss": 0.4265, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3240993320941925, + "rewards/margins": 1.1043888330459595, + "rewards/rejected": -1.4284882545471191, + "step": 1040 + }, + { + "epoch": 0.12, + "learning_rate": 2.6804401264192906e-07, + "logits/chosen": -2.2458388805389404, + "logits/rejected": -2.221508502960205, + "logps/chosen": -238.8441162109375, + "logps/rejected": -237.4669952392578, + "loss": 0.3816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6334779262542725, + "rewards/margins": 1.002936601638794, + "rewards/rejected": -1.6364145278930664, + "step": 1041 + }, + { + "epoch": 0.12, + "learning_rate": 2.680088961723048e-07, + "logits/chosen": -2.2666611671447754, + "logits/rejected": -2.374659299850464, + "logps/chosen": -335.671142578125, + "logps/rejected": -251.1038360595703, + "loss": 0.4852, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0121570825576782, + "rewards/margins": 2.095823287963867, + "rewards/rejected": -3.107980251312256, + "step": 1042 + }, + { + "epoch": 0.12, + "learning_rate": 2.6797377970268057e-07, + "logits/chosen": -2.149244546890259, + "logits/rejected": -2.2924773693084717, + "logps/chosen": -414.4647521972656, + "logps/rejected": -320.44342041015625, + "loss": 0.2994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3896363377571106, + "rewards/margins": 2.2185006141662598, + "rewards/rejected": -2.6081368923187256, + "step": 1043 + }, + { + "epoch": 0.12, + "learning_rate": 2.6793866323305627e-07, + "logits/chosen": -2.548565149307251, + "logits/rejected": -2.5032644271850586, + "logps/chosen": -221.1837921142578, + "logps/rejected": -189.4575653076172, + "loss": 0.3084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8039664030075073, + "rewards/margins": 1.6532090902328491, + "rewards/rejected": -2.4571757316589355, + "step": 1044 + }, + { + "epoch": 0.12, + "learning_rate": 2.67903546763432e-07, + "logits/chosen": -2.524580478668213, + "logits/rejected": -2.7037341594696045, + "logps/chosen": -224.03028869628906, + "logps/rejected": -257.44696044921875, + "loss": 0.4088, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5552821159362793, + "rewards/margins": 1.4910271167755127, + "rewards/rejected": -2.046308994293213, + "step": 1045 + }, + { + "epoch": 0.12, + "learning_rate": 2.678684302938078e-07, + "logits/chosen": -2.827920436859131, + "logits/rejected": -2.6043694019317627, + "logps/chosen": -177.07066345214844, + "logps/rejected": -208.15541076660156, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5637058615684509, + "rewards/margins": 0.8643231391906738, + "rewards/rejected": -1.4280290603637695, + "step": 1046 + }, + { + "epoch": 0.12, + "learning_rate": 2.6783331382418353e-07, + "logits/chosen": -2.2604598999023438, + "logits/rejected": -2.3733716011047363, + "logps/chosen": -708.4826049804688, + "logps/rejected": -195.27491760253906, + "loss": 0.6183, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5538811683654785, + "rewards/margins": 0.6915801167488098, + "rewards/rejected": -1.245461344718933, + "step": 1047 + }, + { + "epoch": 0.12, + "learning_rate": 2.677981973545593e-07, + "logits/chosen": -2.50519061088562, + "logits/rejected": -2.495049476623535, + "logps/chosen": -148.19235229492188, + "logps/rejected": -188.00660705566406, + "loss": 0.3824, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06363216042518616, + "rewards/margins": 1.3778142929077148, + "rewards/rejected": -1.4414465427398682, + "step": 1048 + }, + { + "epoch": 0.12, + "learning_rate": 2.6776308088493504e-07, + "logits/chosen": -2.1794588565826416, + "logits/rejected": -2.4002575874328613, + "logps/chosen": -135.9974365234375, + "logps/rejected": -182.86285400390625, + "loss": 0.3526, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3848356306552887, + "rewards/margins": 1.7364051342010498, + "rewards/rejected": -2.1212406158447266, + "step": 1049 + }, + { + "epoch": 0.12, + "learning_rate": 2.677279644153108e-07, + "logits/chosen": -2.381279468536377, + "logits/rejected": -2.40212082862854, + "logps/chosen": -209.23619079589844, + "logps/rejected": -300.5537414550781, + "loss": 0.4175, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8548520803451538, + "rewards/margins": 2.0599751472473145, + "rewards/rejected": -2.9148271083831787, + "step": 1050 + }, + { + "epoch": 0.12, + "learning_rate": 2.6769284794568654e-07, + "logits/chosen": -2.7052440643310547, + "logits/rejected": -2.715540885925293, + "logps/chosen": -176.79794311523438, + "logps/rejected": -189.69540405273438, + "loss": 0.2714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5539785623550415, + "rewards/margins": 2.5260391235351562, + "rewards/rejected": -3.080017566680908, + "step": 1051 + }, + { + "epoch": 0.12, + "learning_rate": 2.6765773147606225e-07, + "logits/chosen": -2.3064684867858887, + "logits/rejected": -2.3951544761657715, + "logps/chosen": -247.20350646972656, + "logps/rejected": -211.92686462402344, + "loss": 0.3631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47101351618766785, + "rewards/margins": 1.6138019561767578, + "rewards/rejected": -2.084815502166748, + "step": 1052 + }, + { + "epoch": 0.12, + "learning_rate": 2.67622615006438e-07, + "logits/chosen": -2.3474135398864746, + "logits/rejected": -2.523688316345215, + "logps/chosen": -257.3155517578125, + "logps/rejected": -166.50344848632812, + "loss": 0.2636, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15836167335510254, + "rewards/margins": 2.130063056945801, + "rewards/rejected": -2.288424491882324, + "step": 1053 + }, + { + "epoch": 0.12, + "learning_rate": 2.6758749853681375e-07, + "logits/chosen": -2.470451831817627, + "logits/rejected": -2.402299165725708, + "logps/chosen": -261.16253662109375, + "logps/rejected": -298.47332763671875, + "loss": 0.5162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48303118348121643, + "rewards/margins": 1.0757715702056885, + "rewards/rejected": -1.5588027238845825, + "step": 1054 + }, + { + "epoch": 0.12, + "learning_rate": 2.675523820671895e-07, + "logits/chosen": -2.1494736671447754, + "logits/rejected": -2.2955689430236816, + "logps/chosen": -366.29437255859375, + "logps/rejected": -386.2571105957031, + "loss": 0.5386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6891046762466431, + "rewards/margins": 0.6135704517364502, + "rewards/rejected": -1.3026750087738037, + "step": 1055 + }, + { + "epoch": 0.12, + "learning_rate": 2.6751726559756526e-07, + "logits/chosen": -2.475816011428833, + "logits/rejected": -2.470611572265625, + "logps/chosen": -291.9108581542969, + "logps/rejected": -309.23565673828125, + "loss": 0.9027, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2201720476150513, + "rewards/margins": 0.0847887471318245, + "rewards/rejected": -1.3049607276916504, + "step": 1056 + }, + { + "epoch": 0.12, + "learning_rate": 2.6748214912794096e-07, + "logits/chosen": -1.8692749738693237, + "logits/rejected": -2.2519121170043945, + "logps/chosen": -459.06158447265625, + "logps/rejected": -378.6885986328125, + "loss": 0.3715, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7761073112487793, + "rewards/margins": 1.737992286682129, + "rewards/rejected": -2.514099597930908, + "step": 1057 + }, + { + "epoch": 0.12, + "learning_rate": 2.674470326583167e-07, + "logits/chosen": -2.891225576400757, + "logits/rejected": -2.7261929512023926, + "logps/chosen": -147.51901245117188, + "logps/rejected": -213.11993408203125, + "loss": 0.1722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35538822412490845, + "rewards/margins": 2.605703830718994, + "rewards/rejected": -2.961091995239258, + "step": 1058 + }, + { + "epoch": 0.12, + "learning_rate": 2.674119161886925e-07, + "logits/chosen": -1.9064005613327026, + "logits/rejected": -2.127612352371216, + "logps/chosen": -510.53106689453125, + "logps/rejected": -379.9239196777344, + "loss": 0.3573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42226266860961914, + "rewards/margins": 1.336898922920227, + "rewards/rejected": -1.7591617107391357, + "step": 1059 + }, + { + "epoch": 0.12, + "learning_rate": 2.673767997190682e-07, + "logits/chosen": -1.7921055555343628, + "logits/rejected": -2.154395818710327, + "logps/chosen": -534.4351806640625, + "logps/rejected": -274.1014404296875, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5214669108390808, + "rewards/margins": 1.0917925834655762, + "rewards/rejected": -1.6132595539093018, + "step": 1060 + }, + { + "epoch": 0.12, + "learning_rate": 2.67341683249444e-07, + "logits/chosen": -2.2455928325653076, + "logits/rejected": -1.9363627433776855, + "logps/chosen": -228.3758544921875, + "logps/rejected": -430.46527099609375, + "loss": 0.4141, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18883110582828522, + "rewards/margins": 1.4101128578186035, + "rewards/rejected": -1.5989439487457275, + "step": 1061 + }, + { + "epoch": 0.12, + "learning_rate": 2.6730656677981973e-07, + "logits/chosen": -2.4870660305023193, + "logits/rejected": -2.7627339363098145, + "logps/chosen": -239.97283935546875, + "logps/rejected": -254.03875732421875, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.732452929019928, + "rewards/margins": 2.209810972213745, + "rewards/rejected": -2.9422638416290283, + "step": 1062 + }, + { + "epoch": 0.12, + "learning_rate": 2.672714503101955e-07, + "logits/chosen": -2.4411673545837402, + "logits/rejected": -2.398613214492798, + "logps/chosen": -245.81637573242188, + "logps/rejected": -255.14349365234375, + "loss": 0.4799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5172300338745117, + "rewards/margins": 1.4681321382522583, + "rewards/rejected": -1.9853620529174805, + "step": 1063 + }, + { + "epoch": 0.12, + "learning_rate": 2.6723633384057124e-07, + "logits/chosen": -2.521651029586792, + "logits/rejected": -2.7774133682250977, + "logps/chosen": -393.33050537109375, + "logps/rejected": -255.50189208984375, + "loss": 0.4637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9208019971847534, + "rewards/margins": 1.2985007762908936, + "rewards/rejected": -2.2193026542663574, + "step": 1064 + }, + { + "epoch": 0.12, + "learning_rate": 2.6720121737094694e-07, + "logits/chosen": -1.6206508874893188, + "logits/rejected": -1.7928580045700073, + "logps/chosen": -376.1607666015625, + "logps/rejected": -374.3707275390625, + "loss": 1.0005, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3140032291412354, + "rewards/margins": 0.8665757179260254, + "rewards/rejected": -2.18057918548584, + "step": 1065 + }, + { + "epoch": 0.12, + "learning_rate": 2.671661009013227e-07, + "logits/chosen": -2.6656453609466553, + "logits/rejected": -2.7409636974334717, + "logps/chosen": -219.195556640625, + "logps/rejected": -187.58738708496094, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5177865028381348, + "rewards/margins": 1.2912431955337524, + "rewards/rejected": -1.8090298175811768, + "step": 1066 + }, + { + "epoch": 0.12, + "learning_rate": 2.6713098443169845e-07, + "logits/chosen": -2.423322916030884, + "logits/rejected": -2.348818063735962, + "logps/chosen": -176.48062133789062, + "logps/rejected": -183.61822509765625, + "loss": 0.4482, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21063822507858276, + "rewards/margins": 1.1893707513809204, + "rewards/rejected": -1.4000089168548584, + "step": 1067 + }, + { + "epoch": 0.12, + "learning_rate": 2.670958679620742e-07, + "logits/chosen": -2.918290138244629, + "logits/rejected": -2.9874017238616943, + "logps/chosen": -273.52655029296875, + "logps/rejected": -338.39825439453125, + "loss": 0.4557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6536903977394104, + "rewards/margins": 1.2050628662109375, + "rewards/rejected": -1.8587533235549927, + "step": 1068 + }, + { + "epoch": 0.12, + "learning_rate": 2.6706075149244996e-07, + "logits/chosen": -2.1370601654052734, + "logits/rejected": -2.276836633682251, + "logps/chosen": -168.76239013671875, + "logps/rejected": -210.37451171875, + "loss": 0.7706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9602185487747192, + "rewards/margins": 0.7259442806243896, + "rewards/rejected": -1.6861629486083984, + "step": 1069 + }, + { + "epoch": 0.12, + "learning_rate": 2.6702563502282566e-07, + "logits/chosen": -2.480320453643799, + "logits/rejected": -2.397895336151123, + "logps/chosen": -153.1981201171875, + "logps/rejected": -194.33372497558594, + "loss": 0.5995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4509719908237457, + "rewards/margins": 1.042445421218872, + "rewards/rejected": -1.493417501449585, + "step": 1070 + }, + { + "epoch": 0.12, + "learning_rate": 2.6699051855320146e-07, + "logits/chosen": -2.4723031520843506, + "logits/rejected": -2.3100357055664062, + "logps/chosen": -94.97573852539062, + "logps/rejected": -305.1226501464844, + "loss": 0.7163, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1885324716567993, + "rewards/margins": 0.7761526107788086, + "rewards/rejected": -1.964685082435608, + "step": 1071 + }, + { + "epoch": 0.12, + "learning_rate": 2.669554020835772e-07, + "logits/chosen": -2.390186309814453, + "logits/rejected": -2.2103030681610107, + "logps/chosen": -207.7316131591797, + "logps/rejected": -258.3200378417969, + "loss": 0.6578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5336843729019165, + "rewards/margins": 0.40451881289482117, + "rewards/rejected": -0.9382031559944153, + "step": 1072 + }, + { + "epoch": 0.12, + "learning_rate": 2.669202856139529e-07, + "logits/chosen": -1.9044498205184937, + "logits/rejected": -2.1755945682525635, + "logps/chosen": -252.202392578125, + "logps/rejected": -213.48834228515625, + "loss": 0.5004, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2461862564086914, + "rewards/margins": 1.1743108034133911, + "rewards/rejected": -2.420497179031372, + "step": 1073 + }, + { + "epoch": 0.12, + "learning_rate": 2.6688516914432867e-07, + "logits/chosen": -1.9046101570129395, + "logits/rejected": -2.1822755336761475, + "logps/chosen": -392.9749755859375, + "logps/rejected": -275.94390869140625, + "loss": 0.4837, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2988743782043457, + "rewards/margins": 0.6753022074699402, + "rewards/rejected": -1.9741766452789307, + "step": 1074 + }, + { + "epoch": 0.12, + "learning_rate": 2.668500526747044e-07, + "logits/chosen": -2.3906707763671875, + "logits/rejected": -2.494981527328491, + "logps/chosen": -315.10467529296875, + "logps/rejected": -250.74588012695312, + "loss": 0.2403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.385532021522522, + "rewards/margins": 1.7412728071212769, + "rewards/rejected": -2.126804828643799, + "step": 1075 + }, + { + "epoch": 0.12, + "learning_rate": 2.668149362050802e-07, + "logits/chosen": -2.5619349479675293, + "logits/rejected": -2.46972393989563, + "logps/chosen": -345.13372802734375, + "logps/rejected": -322.9685974121094, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3194964528083801, + "rewards/margins": 1.6168911457061768, + "rewards/rejected": -1.936387538909912, + "step": 1076 + }, + { + "epoch": 0.12, + "learning_rate": 2.6677981973545593e-07, + "logits/chosen": -2.4223978519439697, + "logits/rejected": -2.3516805171966553, + "logps/chosen": -380.0841064453125, + "logps/rejected": -303.3489685058594, + "loss": 0.6357, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7033347487449646, + "rewards/margins": 1.1438854932785034, + "rewards/rejected": -1.8472201824188232, + "step": 1077 + }, + { + "epoch": 0.12, + "learning_rate": 2.6674470326583163e-07, + "logits/chosen": -2.719510078430176, + "logits/rejected": -2.5211026668548584, + "logps/chosen": -306.1302490234375, + "logps/rejected": -292.45294189453125, + "loss": 0.3334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10928462445735931, + "rewards/margins": 1.8905162811279297, + "rewards/rejected": -1.9998009204864502, + "step": 1078 + }, + { + "epoch": 0.12, + "learning_rate": 2.667095867962074e-07, + "logits/chosen": -1.8650157451629639, + "logits/rejected": -2.1452252864837646, + "logps/chosen": -441.90625, + "logps/rejected": -178.2133331298828, + "loss": 1.252, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0565388202667236, + "rewards/margins": -0.09565702080726624, + "rewards/rejected": -1.9608818292617798, + "step": 1079 + }, + { + "epoch": 0.12, + "learning_rate": 2.6667447032658314e-07, + "logits/chosen": -2.171823501586914, + "logits/rejected": -2.4182605743408203, + "logps/chosen": -286.4870910644531, + "logps/rejected": -237.45278930664062, + "loss": 0.4816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12951549887657166, + "rewards/margins": 1.7340223789215088, + "rewards/rejected": -1.8635377883911133, + "step": 1080 + }, + { + "epoch": 0.12, + "learning_rate": 2.666393538569589e-07, + "logits/chosen": -2.5746936798095703, + "logits/rejected": -2.641458749771118, + "logps/chosen": -431.71661376953125, + "logps/rejected": -272.6900329589844, + "loss": 0.1567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4283021092414856, + "rewards/margins": 2.3309545516967773, + "rewards/rejected": -2.759256601333618, + "step": 1081 + }, + { + "epoch": 0.12, + "learning_rate": 2.6660423738733465e-07, + "logits/chosen": -2.6859586238861084, + "logits/rejected": -2.714351177215576, + "logps/chosen": -249.84963989257812, + "logps/rejected": -202.57894897460938, + "loss": 0.5393, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5310726165771484, + "rewards/margins": 1.4784119129180908, + "rewards/rejected": -2.0094845294952393, + "step": 1082 + }, + { + "epoch": 0.12, + "learning_rate": 2.665691209177104e-07, + "logits/chosen": -2.246601104736328, + "logits/rejected": -2.2797346115112305, + "logps/chosen": -332.4461975097656, + "logps/rejected": -199.379638671875, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5044522285461426, + "rewards/margins": 0.6243115067481995, + "rewards/rejected": -1.1287636756896973, + "step": 1083 + }, + { + "epoch": 0.12, + "learning_rate": 2.6653400444808616e-07, + "logits/chosen": -1.987513780593872, + "logits/rejected": -1.85202956199646, + "logps/chosen": -360.0188903808594, + "logps/rejected": -350.7794189453125, + "loss": 0.4342, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6658518314361572, + "rewards/margins": 1.593554973602295, + "rewards/rejected": -2.259406805038452, + "step": 1084 + }, + { + "epoch": 0.13, + "learning_rate": 2.664988879784619e-07, + "logits/chosen": -2.5312883853912354, + "logits/rejected": -2.3823747634887695, + "logps/chosen": -182.31155395507812, + "logps/rejected": -368.6305236816406, + "loss": 0.4663, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1866824626922607, + "rewards/margins": 2.4229094982147217, + "rewards/rejected": -3.6095919609069824, + "step": 1085 + }, + { + "epoch": 0.13, + "learning_rate": 2.664637715088376e-07, + "logits/chosen": -2.280313491821289, + "logits/rejected": -2.3933496475219727, + "logps/chosen": -230.82411193847656, + "logps/rejected": -222.58151245117188, + "loss": 0.3766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6065807938575745, + "rewards/margins": 1.7476201057434082, + "rewards/rejected": -2.354200839996338, + "step": 1086 + }, + { + "epoch": 0.13, + "learning_rate": 2.6642865503921337e-07, + "logits/chosen": -2.155306816101074, + "logits/rejected": -2.2205288410186768, + "logps/chosen": -266.7474365234375, + "logps/rejected": -191.0703582763672, + "loss": 0.577, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.72831392288208, + "rewards/margins": 0.8046284914016724, + "rewards/rejected": -2.532942295074463, + "step": 1087 + }, + { + "epoch": 0.13, + "learning_rate": 2.663935385695891e-07, + "logits/chosen": -2.481679916381836, + "logits/rejected": -2.689592123031616, + "logps/chosen": -289.3603820800781, + "logps/rejected": -292.8167724609375, + "loss": 0.7563, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36230960488319397, + "rewards/margins": 0.7926493883132935, + "rewards/rejected": -1.1549590826034546, + "step": 1088 + }, + { + "epoch": 0.13, + "learning_rate": 2.663584220999649e-07, + "logits/chosen": -2.6227540969848633, + "logits/rejected": -2.582620143890381, + "logps/chosen": -379.77496337890625, + "logps/rejected": -354.9800109863281, + "loss": 0.658, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2691524028778076, + "rewards/margins": 0.7236659526824951, + "rewards/rejected": -1.9928183555603027, + "step": 1089 + }, + { + "epoch": 0.13, + "learning_rate": 2.6632330563034063e-07, + "logits/chosen": -2.4290106296539307, + "logits/rejected": -2.622051239013672, + "logps/chosen": -385.2897644042969, + "logps/rejected": -226.70587158203125, + "loss": 0.4958, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4384644031524658, + "rewards/margins": 1.1330833435058594, + "rewards/rejected": -2.571547746658325, + "step": 1090 + }, + { + "epoch": 0.13, + "learning_rate": 2.6628818916071633e-07, + "logits/chosen": -2.5721359252929688, + "logits/rejected": -2.5110411643981934, + "logps/chosen": -200.35784912109375, + "logps/rejected": -270.064697265625, + "loss": 0.2735, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18634068965911865, + "rewards/margins": 1.402700662612915, + "rewards/rejected": -1.5890412330627441, + "step": 1091 + }, + { + "epoch": 0.13, + "learning_rate": 2.662530726910921e-07, + "logits/chosen": -2.0755186080932617, + "logits/rejected": -2.456505298614502, + "logps/chosen": -316.54443359375, + "logps/rejected": -163.2535400390625, + "loss": 0.6787, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35800379514694214, + "rewards/margins": 0.1616140604019165, + "rewards/rejected": -0.5196178555488586, + "step": 1092 + }, + { + "epoch": 0.13, + "learning_rate": 2.662179562214679e-07, + "logits/chosen": -2.2184767723083496, + "logits/rejected": -2.1074471473693848, + "logps/chosen": -216.74844360351562, + "logps/rejected": -293.3028869628906, + "loss": 0.5134, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8850287199020386, + "rewards/margins": 1.172463059425354, + "rewards/rejected": -2.0574915409088135, + "step": 1093 + }, + { + "epoch": 0.13, + "learning_rate": 2.661828397518436e-07, + "logits/chosen": -1.984240174293518, + "logits/rejected": -1.9937281608581543, + "logps/chosen": -276.34429931640625, + "logps/rejected": -263.3941650390625, + "loss": 0.3959, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21505218744277954, + "rewards/margins": 2.139070749282837, + "rewards/rejected": -2.3541228771209717, + "step": 1094 + }, + { + "epoch": 0.13, + "learning_rate": 2.6614772328221934e-07, + "logits/chosen": -1.692488431930542, + "logits/rejected": -2.143005847930908, + "logps/chosen": -362.0687255859375, + "logps/rejected": -230.48489379882812, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28216639161109924, + "rewards/margins": 1.0845561027526855, + "rewards/rejected": -1.3667223453521729, + "step": 1095 + }, + { + "epoch": 0.13, + "learning_rate": 2.661126068125951e-07, + "logits/chosen": -2.3524885177612305, + "logits/rejected": -2.3536221981048584, + "logps/chosen": -262.642333984375, + "logps/rejected": -240.25390625, + "loss": 0.2501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2808707058429718, + "rewards/margins": 1.7196261882781982, + "rewards/rejected": -2.0004968643188477, + "step": 1096 + }, + { + "epoch": 0.13, + "learning_rate": 2.6607749034297085e-07, + "logits/chosen": -2.1674931049346924, + "logits/rejected": -2.207885265350342, + "logps/chosen": -280.2270202636719, + "logps/rejected": -317.42095947265625, + "loss": 0.3756, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5939505696296692, + "rewards/margins": 1.9216067790985107, + "rewards/rejected": -2.515557289123535, + "step": 1097 + }, + { + "epoch": 0.13, + "learning_rate": 2.660423738733466e-07, + "logits/chosen": -2.3933918476104736, + "logits/rejected": -2.5869739055633545, + "logps/chosen": -156.4141082763672, + "logps/rejected": -209.18890380859375, + "loss": 0.2188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37969353795051575, + "rewards/margins": 2.021559953689575, + "rewards/rejected": -2.4012532234191895, + "step": 1098 + }, + { + "epoch": 0.13, + "learning_rate": 2.660072574037223e-07, + "logits/chosen": -2.2573370933532715, + "logits/rejected": -2.354602098464966, + "logps/chosen": -243.23992919921875, + "logps/rejected": -201.2763671875, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1441670656204224, + "rewards/margins": 0.7924379706382751, + "rewards/rejected": -1.9366050958633423, + "step": 1099 + }, + { + "epoch": 0.13, + "learning_rate": 2.6597214093409806e-07, + "logits/chosen": -2.674461841583252, + "logits/rejected": -2.6893184185028076, + "logps/chosen": -174.8634796142578, + "logps/rejected": -255.9910430908203, + "loss": 0.3196, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37594038248062134, + "rewards/margins": 2.383873224258423, + "rewards/rejected": -2.7598133087158203, + "step": 1100 + }, + { + "epoch": 0.13, + "learning_rate": 2.659370244644738e-07, + "logits/chosen": -1.4718313217163086, + "logits/rejected": -2.014868974685669, + "logps/chosen": -613.858154296875, + "logps/rejected": -403.5902404785156, + "loss": 0.2551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6186689734458923, + "rewards/margins": 1.8130252361297607, + "rewards/rejected": -2.4316940307617188, + "step": 1101 + }, + { + "epoch": 0.13, + "learning_rate": 2.6590190799484957e-07, + "logits/chosen": -2.3608040809631348, + "logits/rejected": -2.451111078262329, + "logps/chosen": -287.78033447265625, + "logps/rejected": -213.25210571289062, + "loss": 0.5922, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.087461233139038, + "rewards/margins": 1.262712001800537, + "rewards/rejected": -2.350173234939575, + "step": 1102 + }, + { + "epoch": 0.13, + "learning_rate": 2.658667915252253e-07, + "logits/chosen": -1.411136269569397, + "logits/rejected": -1.8422489166259766, + "logps/chosen": -416.09088134765625, + "logps/rejected": -347.2420654296875, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22844040393829346, + "rewards/margins": 1.7955336570739746, + "rewards/rejected": -2.0239739418029785, + "step": 1103 + }, + { + "epoch": 0.13, + "learning_rate": 2.658316750556011e-07, + "logits/chosen": -2.143792152404785, + "logits/rejected": -2.269282579421997, + "logps/chosen": -454.22430419921875, + "logps/rejected": -354.683349609375, + "loss": 0.2244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.190933495759964, + "rewards/margins": 2.139915704727173, + "rewards/rejected": -2.3308494091033936, + "step": 1104 + }, + { + "epoch": 0.13, + "learning_rate": 2.6579655858597683e-07, + "logits/chosen": -1.8835370540618896, + "logits/rejected": -2.147965431213379, + "logps/chosen": -235.85919189453125, + "logps/rejected": -244.64138793945312, + "loss": 0.267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1333482563495636, + "rewards/margins": 1.3890514373779297, + "rewards/rejected": -1.522399663925171, + "step": 1105 + }, + { + "epoch": 0.13, + "learning_rate": 2.657614421163526e-07, + "logits/chosen": -2.447667121887207, + "logits/rejected": -2.3490991592407227, + "logps/chosen": -310.4669189453125, + "logps/rejected": -255.39474487304688, + "loss": 0.3694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7335676550865173, + "rewards/margins": 2.0295631885528564, + "rewards/rejected": -2.7631309032440186, + "step": 1106 + }, + { + "epoch": 0.13, + "learning_rate": 2.657263256467283e-07, + "logits/chosen": -2.2892165184020996, + "logits/rejected": -2.084897994995117, + "logps/chosen": -134.66867065429688, + "logps/rejected": -214.27784729003906, + "loss": 0.4753, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.023787498474121, + "rewards/margins": 1.3364969491958618, + "rewards/rejected": -2.3602845668792725, + "step": 1107 + }, + { + "epoch": 0.13, + "learning_rate": 2.6569120917710404e-07, + "logits/chosen": -2.5659966468811035, + "logits/rejected": -2.233072280883789, + "logps/chosen": -228.63641357421875, + "logps/rejected": -287.5979309082031, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22910158336162567, + "rewards/margins": 1.5684702396392822, + "rewards/rejected": -1.7975716590881348, + "step": 1108 + }, + { + "epoch": 0.13, + "learning_rate": 2.656560927074798e-07, + "logits/chosen": -2.5366642475128174, + "logits/rejected": -2.5202414989471436, + "logps/chosen": -190.72488403320312, + "logps/rejected": -147.41574096679688, + "loss": 0.374, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27254632115364075, + "rewards/margins": 1.0227664709091187, + "rewards/rejected": -1.295312762260437, + "step": 1109 + }, + { + "epoch": 0.13, + "learning_rate": 2.6562097623785555e-07, + "logits/chosen": -2.105177402496338, + "logits/rejected": -1.9371999502182007, + "logps/chosen": -342.0680236816406, + "logps/rejected": -383.90106201171875, + "loss": 0.4579, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9063900113105774, + "rewards/margins": 0.9867615699768066, + "rewards/rejected": -1.8931515216827393, + "step": 1110 + }, + { + "epoch": 0.13, + "learning_rate": 2.655858597682313e-07, + "logits/chosen": -2.4145631790161133, + "logits/rejected": -2.3346338272094727, + "logps/chosen": -335.1517639160156, + "logps/rejected": -334.397216796875, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1718440055847168, + "rewards/margins": 2.1388773918151855, + "rewards/rejected": -1.9670335054397583, + "step": 1111 + }, + { + "epoch": 0.13, + "learning_rate": 2.6555074329860705e-07, + "logits/chosen": -2.5130062103271484, + "logits/rejected": -2.417605400085449, + "logps/chosen": -263.94366455078125, + "logps/rejected": -310.4676513671875, + "loss": 0.2869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6532050967216492, + "rewards/margins": 1.6159051656723022, + "rewards/rejected": -2.2691102027893066, + "step": 1112 + }, + { + "epoch": 0.13, + "learning_rate": 2.6551562682898275e-07, + "logits/chosen": -2.3562936782836914, + "logits/rejected": -2.3030917644500732, + "logps/chosen": -308.01336669921875, + "logps/rejected": -374.3434753417969, + "loss": 0.4789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7761095762252808, + "rewards/margins": 1.2639610767364502, + "rewards/rejected": -2.0400705337524414, + "step": 1113 + }, + { + "epoch": 0.13, + "learning_rate": 2.654805103593585e-07, + "logits/chosen": -2.1247639656066895, + "logits/rejected": -1.7754625082015991, + "logps/chosen": -193.85643005371094, + "logps/rejected": -271.7696533203125, + "loss": 0.3902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3310196101665497, + "rewards/margins": 1.8946268558502197, + "rewards/rejected": -2.225646495819092, + "step": 1114 + }, + { + "epoch": 0.13, + "learning_rate": 2.6544539388973426e-07, + "logits/chosen": -2.713876247406006, + "logits/rejected": -3.023899793624878, + "logps/chosen": -216.10470581054688, + "logps/rejected": -246.9363250732422, + "loss": 0.3746, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0079684257507324, + "rewards/margins": 2.0506718158721924, + "rewards/rejected": -3.058640241622925, + "step": 1115 + }, + { + "epoch": 0.13, + "learning_rate": 2.6541027742011e-07, + "logits/chosen": -2.013225793838501, + "logits/rejected": -2.0305628776550293, + "logps/chosen": -257.6408996582031, + "logps/rejected": -263.26165771484375, + "loss": 1.2793, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7217341661453247, + "rewards/margins": 0.5633209943771362, + "rewards/rejected": -2.28505539894104, + "step": 1116 + }, + { + "epoch": 0.13, + "learning_rate": 2.6537516095048577e-07, + "logits/chosen": -2.79964542388916, + "logits/rejected": -2.60124135017395, + "logps/chosen": -345.28106689453125, + "logps/rejected": -191.68368530273438, + "loss": 0.4451, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0872669219970703, + "rewards/margins": 1.1146752834320068, + "rewards/rejected": -2.201942205429077, + "step": 1117 + }, + { + "epoch": 0.13, + "learning_rate": 2.653400444808615e-07, + "logits/chosen": -2.0625598430633545, + "logits/rejected": -2.2077999114990234, + "logps/chosen": -312.35833740234375, + "logps/rejected": -275.7291564941406, + "loss": 0.174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2859232425689697, + "rewards/margins": 2.218991279602051, + "rewards/rejected": -2.5049145221710205, + "step": 1118 + }, + { + "epoch": 0.13, + "learning_rate": 2.653049280112373e-07, + "logits/chosen": -2.107189893722534, + "logits/rejected": -1.8679208755493164, + "logps/chosen": -113.12000274658203, + "logps/rejected": -233.1366424560547, + "loss": 0.8199, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9195275902748108, + "rewards/margins": 0.8865669369697571, + "rewards/rejected": -1.8060946464538574, + "step": 1119 + }, + { + "epoch": 0.13, + "learning_rate": 2.6526981154161303e-07, + "logits/chosen": -2.9682059288024902, + "logits/rejected": -2.894364356994629, + "logps/chosen": -181.93597412109375, + "logps/rejected": -184.84193420410156, + "loss": 0.5403, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.785114586353302, + "rewards/margins": 1.7442646026611328, + "rewards/rejected": -2.52937912940979, + "step": 1120 + }, + { + "epoch": 0.13, + "learning_rate": 2.6523469507198873e-07, + "logits/chosen": -1.7842713594436646, + "logits/rejected": -2.1082346439361572, + "logps/chosen": -514.504638671875, + "logps/rejected": -382.5542297363281, + "loss": 0.6134, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6476815938949585, + "rewards/margins": 0.7575786113739014, + "rewards/rejected": -1.4052602052688599, + "step": 1121 + }, + { + "epoch": 0.13, + "learning_rate": 2.651995786023645e-07, + "logits/chosen": -1.8438313007354736, + "logits/rejected": -2.283789873123169, + "logps/chosen": -449.04986572265625, + "logps/rejected": -281.6604919433594, + "loss": 0.6783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8568148612976074, + "rewards/margins": 0.7005466818809509, + "rewards/rejected": -1.5573614835739136, + "step": 1122 + }, + { + "epoch": 0.13, + "learning_rate": 2.6516446213274024e-07, + "logits/chosen": -2.2625861167907715, + "logits/rejected": -2.6173624992370605, + "logps/chosen": -325.6612243652344, + "logps/rejected": -141.10830688476562, + "loss": 0.5007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4914320707321167, + "rewards/margins": 0.7228903770446777, + "rewards/rejected": -1.214322566986084, + "step": 1123 + }, + { + "epoch": 0.13, + "learning_rate": 2.65129345663116e-07, + "logits/chosen": -2.2801318168640137, + "logits/rejected": -2.466940402984619, + "logps/chosen": -228.62657165527344, + "logps/rejected": -250.5317840576172, + "loss": 0.3157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29392409324645996, + "rewards/margins": 1.669655680656433, + "rewards/rejected": -1.963579773902893, + "step": 1124 + }, + { + "epoch": 0.13, + "learning_rate": 2.6509422919349175e-07, + "logits/chosen": -2.809922695159912, + "logits/rejected": -2.777984142303467, + "logps/chosen": -186.31698608398438, + "logps/rejected": -210.71148681640625, + "loss": 0.4247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2869373559951782, + "rewards/margins": 1.0348318815231323, + "rewards/rejected": -1.3217692375183105, + "step": 1125 + }, + { + "epoch": 0.13, + "learning_rate": 2.6505911272386745e-07, + "logits/chosen": -1.7288763523101807, + "logits/rejected": -1.6363084316253662, + "logps/chosen": -382.8839111328125, + "logps/rejected": -405.9707946777344, + "loss": 0.2831, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49602243304252625, + "rewards/margins": 1.7216607332229614, + "rewards/rejected": -2.2176830768585205, + "step": 1126 + }, + { + "epoch": 0.13, + "learning_rate": 2.6502399625424326e-07, + "logits/chosen": -2.495018243789673, + "logits/rejected": -2.197275400161743, + "logps/chosen": -230.37628173828125, + "logps/rejected": -206.36724853515625, + "loss": 0.403, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8157714605331421, + "rewards/margins": 1.2980040311813354, + "rewards/rejected": -2.1137754917144775, + "step": 1127 + }, + { + "epoch": 0.13, + "learning_rate": 2.64988879784619e-07, + "logits/chosen": -2.7480967044830322, + "logits/rejected": -2.845959186553955, + "logps/chosen": -326.00006103515625, + "logps/rejected": -277.9638671875, + "loss": 0.2539, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16152028739452362, + "rewards/margins": 3.204760789871216, + "rewards/rejected": -3.366281032562256, + "step": 1128 + }, + { + "epoch": 0.13, + "learning_rate": 2.649537633149947e-07, + "logits/chosen": -2.2797718048095703, + "logits/rejected": -2.2628121376037598, + "logps/chosen": -210.32012939453125, + "logps/rejected": -232.1619873046875, + "loss": 0.5796, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1449427604675293, + "rewards/margins": 1.733718752861023, + "rewards/rejected": -2.878661632537842, + "step": 1129 + }, + { + "epoch": 0.13, + "learning_rate": 2.6491864684537046e-07, + "logits/chosen": -2.0994980335235596, + "logits/rejected": -2.1626148223876953, + "logps/chosen": -326.68212890625, + "logps/rejected": -332.23052978515625, + "loss": 0.2822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15703903138637543, + "rewards/margins": 2.924349784851074, + "rewards/rejected": -3.0813887119293213, + "step": 1130 + }, + { + "epoch": 0.13, + "learning_rate": 2.648835303757462e-07, + "logits/chosen": -2.200550079345703, + "logits/rejected": -2.3525631427764893, + "logps/chosen": -182.7928924560547, + "logps/rejected": -175.07534790039062, + "loss": 0.3374, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7339214086532593, + "rewards/margins": 1.3561310768127441, + "rewards/rejected": -2.090052366256714, + "step": 1131 + }, + { + "epoch": 0.13, + "learning_rate": 2.6484841390612197e-07, + "logits/chosen": -2.1322405338287354, + "logits/rejected": -2.5599019527435303, + "logps/chosen": -563.040771484375, + "logps/rejected": -219.9464111328125, + "loss": 0.5273, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47562548518180847, + "rewards/margins": 0.8299604654312134, + "rewards/rejected": -1.3055858612060547, + "step": 1132 + }, + { + "epoch": 0.13, + "learning_rate": 2.648132974364977e-07, + "logits/chosen": -2.317383289337158, + "logits/rejected": -2.3984713554382324, + "logps/chosen": -315.70098876953125, + "logps/rejected": -275.713623046875, + "loss": 0.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.613602876663208, + "rewards/margins": 2.379317283630371, + "rewards/rejected": -2.992920398712158, + "step": 1133 + }, + { + "epoch": 0.13, + "learning_rate": 2.6477818096687343e-07, + "logits/chosen": -2.2144224643707275, + "logits/rejected": -2.4418015480041504, + "logps/chosen": -452.8609619140625, + "logps/rejected": -398.8423767089844, + "loss": 0.518, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1885614395141602, + "rewards/margins": 1.4968409538269043, + "rewards/rejected": -2.6854023933410645, + "step": 1134 + }, + { + "epoch": 0.13, + "learning_rate": 2.647430644972492e-07, + "logits/chosen": -2.0631699562072754, + "logits/rejected": -1.969306230545044, + "logps/chosen": -465.4821472167969, + "logps/rejected": -378.48822021484375, + "loss": 0.3585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.513957142829895, + "rewards/margins": 1.2363797426223755, + "rewards/rejected": -1.7503368854522705, + "step": 1135 + }, + { + "epoch": 0.13, + "learning_rate": 2.6470794802762493e-07, + "logits/chosen": -1.3566601276397705, + "logits/rejected": -1.5810012817382812, + "logps/chosen": -194.3639678955078, + "logps/rejected": -178.45376586914062, + "loss": 0.5884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.757110059261322, + "rewards/margins": 0.57989102602005, + "rewards/rejected": -1.337001085281372, + "step": 1136 + }, + { + "epoch": 0.13, + "learning_rate": 2.646728315580007e-07, + "logits/chosen": -2.847135305404663, + "logits/rejected": -2.830392599105835, + "logps/chosen": -678.1988525390625, + "logps/rejected": -258.7132263183594, + "loss": 0.3542, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7815035581588745, + "rewards/margins": 2.0905206203460693, + "rewards/rejected": -2.8720240592956543, + "step": 1137 + }, + { + "epoch": 0.13, + "learning_rate": 2.6463771508837644e-07, + "logits/chosen": -2.264774799346924, + "logits/rejected": -2.7626450061798096, + "logps/chosen": -447.107177734375, + "logps/rejected": -156.83778381347656, + "loss": 1.407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1533327102661133, + "rewards/margins": -0.43458080291748047, + "rewards/rejected": -0.7187519073486328, + "step": 1138 + }, + { + "epoch": 0.13, + "learning_rate": 2.646025986187522e-07, + "logits/chosen": -2.1462302207946777, + "logits/rejected": -2.298706531524658, + "logps/chosen": -235.26266479492188, + "logps/rejected": -176.2874755859375, + "loss": 1.098, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.679473876953125, + "rewards/margins": 0.2067304402589798, + "rewards/rejected": -2.886204242706299, + "step": 1139 + }, + { + "epoch": 0.13, + "learning_rate": 2.6456748214912795e-07, + "logits/chosen": -2.459181070327759, + "logits/rejected": -2.6748578548431396, + "logps/chosen": -367.52142333984375, + "logps/rejected": -331.725830078125, + "loss": 0.55, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6388230323791504, + "rewards/margins": 1.0316869020462036, + "rewards/rejected": -1.6705100536346436, + "step": 1140 + }, + { + "epoch": 0.13, + "learning_rate": 2.645323656795037e-07, + "logits/chosen": -2.3814330101013184, + "logits/rejected": -2.149296998977661, + "logps/chosen": -263.1747131347656, + "logps/rejected": -272.2397766113281, + "loss": 0.6245, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.010901927947998, + "rewards/margins": 0.8823429346084595, + "rewards/rejected": -1.8932448625564575, + "step": 1141 + }, + { + "epoch": 0.13, + "learning_rate": 2.644972492098794e-07, + "logits/chosen": -2.6352193355560303, + "logits/rejected": -2.609928607940674, + "logps/chosen": -241.75079345703125, + "logps/rejected": -277.6235046386719, + "loss": 0.8004, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.306174397468567, + "rewards/margins": 0.35556113719940186, + "rewards/rejected": -1.6617356538772583, + "step": 1142 + }, + { + "epoch": 0.13, + "learning_rate": 2.6446213274025516e-07, + "logits/chosen": -2.747234344482422, + "logits/rejected": -2.8647515773773193, + "logps/chosen": -249.64279174804688, + "logps/rejected": -184.81900024414062, + "loss": 0.7667, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6624103784561157, + "rewards/margins": 0.491845041513443, + "rewards/rejected": -1.1542553901672363, + "step": 1143 + }, + { + "epoch": 0.13, + "learning_rate": 2.644270162706309e-07, + "logits/chosen": -2.5456106662750244, + "logits/rejected": -2.4980549812316895, + "logps/chosen": -404.86767578125, + "logps/rejected": -403.05023193359375, + "loss": 0.438, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8301337361335754, + "rewards/margins": 0.9179759621620178, + "rewards/rejected": -1.7481096982955933, + "step": 1144 + }, + { + "epoch": 0.13, + "learning_rate": 2.6439189980100667e-07, + "logits/chosen": -2.457296133041382, + "logits/rejected": -1.9857008457183838, + "logps/chosen": -319.5179443359375, + "logps/rejected": -334.2062072753906, + "loss": 0.5891, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4768421947956085, + "rewards/margins": 1.502182960510254, + "rewards/rejected": -1.9790250062942505, + "step": 1145 + }, + { + "epoch": 0.13, + "learning_rate": 2.643567833313824e-07, + "logits/chosen": -2.4206483364105225, + "logits/rejected": -2.224607229232788, + "logps/chosen": -303.3848876953125, + "logps/rejected": -279.5226745605469, + "loss": 0.2339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21466389298439026, + "rewards/margins": 2.314816474914551, + "rewards/rejected": -2.529480457305908, + "step": 1146 + }, + { + "epoch": 0.13, + "learning_rate": 2.643216668617581e-07, + "logits/chosen": -2.756302833557129, + "logits/rejected": -2.4549612998962402, + "logps/chosen": -268.9770202636719, + "logps/rejected": -308.1408386230469, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.051728505641222, + "rewards/margins": 2.07413387298584, + "rewards/rejected": -2.0224051475524902, + "step": 1147 + }, + { + "epoch": 0.13, + "learning_rate": 2.642865503921339e-07, + "logits/chosen": -2.6566600799560547, + "logits/rejected": -2.753352403640747, + "logps/chosen": -184.65187072753906, + "logps/rejected": -316.9046630859375, + "loss": 0.7654, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.32448148727417, + "rewards/margins": 1.0053431987762451, + "rewards/rejected": -2.329824686050415, + "step": 1148 + }, + { + "epoch": 0.13, + "learning_rate": 2.642514339225097e-07, + "logits/chosen": -2.633833169937134, + "logits/rejected": -2.6715073585510254, + "logps/chosen": -89.8976058959961, + "logps/rejected": -137.670654296875, + "loss": 0.4599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29564669728279114, + "rewards/margins": 1.247879147529602, + "rewards/rejected": -1.5435259342193604, + "step": 1149 + }, + { + "epoch": 0.13, + "learning_rate": 2.642163174528854e-07, + "logits/chosen": -2.770144462585449, + "logits/rejected": -2.5009520053863525, + "logps/chosen": -162.85208129882812, + "logps/rejected": -253.79867553710938, + "loss": 0.2064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5463243722915649, + "rewards/margins": 2.3018887042999268, + "rewards/rejected": -2.8482131958007812, + "step": 1150 + }, + { + "epoch": 0.13, + "learning_rate": 2.6418120098326114e-07, + "logits/chosen": -2.7195820808410645, + "logits/rejected": -2.7859528064727783, + "logps/chosen": -273.6429138183594, + "logps/rejected": -295.4629821777344, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05485104024410248, + "rewards/margins": 2.262880325317383, + "rewards/rejected": -2.3177313804626465, + "step": 1151 + }, + { + "epoch": 0.13, + "learning_rate": 2.641460845136369e-07, + "logits/chosen": -2.778646230697632, + "logits/rejected": -2.850022792816162, + "logps/chosen": -170.46188354492188, + "logps/rejected": -148.19967651367188, + "loss": 0.3572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8437370657920837, + "rewards/margins": 1.3065545558929443, + "rewards/rejected": -2.150291681289673, + "step": 1152 + }, + { + "epoch": 0.13, + "learning_rate": 2.6411096804401264e-07, + "logits/chosen": -2.556429624557495, + "logits/rejected": -2.788400650024414, + "logps/chosen": -142.1055908203125, + "logps/rejected": -182.4561004638672, + "loss": 0.5367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9808160066604614, + "rewards/margins": 1.7876075506210327, + "rewards/rejected": -2.768423557281494, + "step": 1153 + }, + { + "epoch": 0.13, + "learning_rate": 2.640758515743884e-07, + "logits/chosen": -2.297435998916626, + "logits/rejected": -1.9991726875305176, + "logps/chosen": -128.8997344970703, + "logps/rejected": -205.1623077392578, + "loss": 0.4653, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5508334636688232, + "rewards/margins": 1.1781738996505737, + "rewards/rejected": -1.729007363319397, + "step": 1154 + }, + { + "epoch": 0.13, + "learning_rate": 2.640407351047641e-07, + "logits/chosen": -2.2358710765838623, + "logits/rejected": -2.2295081615448, + "logps/chosen": -256.17547607421875, + "logps/rejected": -198.6510772705078, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9438493847846985, + "rewards/margins": 0.7943581342697144, + "rewards/rejected": -1.7382075786590576, + "step": 1155 + }, + { + "epoch": 0.13, + "learning_rate": 2.6400561863513985e-07, + "logits/chosen": -2.1864824295043945, + "logits/rejected": -2.256110191345215, + "logps/chosen": -225.95281982421875, + "logps/rejected": -251.81332397460938, + "loss": 0.3914, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.281169593334198, + "rewards/margins": 1.314547061920166, + "rewards/rejected": -1.5957164764404297, + "step": 1156 + }, + { + "epoch": 0.13, + "learning_rate": 2.639705021655156e-07, + "logits/chosen": -2.347433567047119, + "logits/rejected": -2.220947027206421, + "logps/chosen": -242.9132080078125, + "logps/rejected": -236.07131958007812, + "loss": 0.3599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6116658449172974, + "rewards/margins": 1.5528123378753662, + "rewards/rejected": -2.164478302001953, + "step": 1157 + }, + { + "epoch": 0.13, + "learning_rate": 2.6393538569589136e-07, + "logits/chosen": -2.0894699096679688, + "logits/rejected": -2.1511662006378174, + "logps/chosen": -311.7491760253906, + "logps/rejected": -460.5232849121094, + "loss": 0.8152, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.350529432296753, + "rewards/margins": 0.7016940712928772, + "rewards/rejected": -2.0522236824035645, + "step": 1158 + }, + { + "epoch": 0.13, + "learning_rate": 2.639002692262671e-07, + "logits/chosen": -2.0921196937561035, + "logits/rejected": -1.6351609230041504, + "logps/chosen": -198.94973754882812, + "logps/rejected": -413.92633056640625, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2299716472625732, + "rewards/margins": 1.8189349174499512, + "rewards/rejected": -3.0489063262939453, + "step": 1159 + }, + { + "epoch": 0.13, + "learning_rate": 2.638651527566428e-07, + "logits/chosen": -2.174870014190674, + "logits/rejected": -2.4917006492614746, + "logps/chosen": -218.26284790039062, + "logps/rejected": -206.3188934326172, + "loss": 1.0588, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.17485773563385, + "rewards/margins": 0.39474672079086304, + "rewards/rejected": -1.5696043968200684, + "step": 1160 + }, + { + "epoch": 0.13, + "learning_rate": 2.638300362870186e-07, + "logits/chosen": -2.3976528644561768, + "logits/rejected": -2.225080966949463, + "logps/chosen": -207.25906372070312, + "logps/rejected": -191.12489318847656, + "loss": 0.4015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5926086902618408, + "rewards/margins": 0.9004392027854919, + "rewards/rejected": -1.4930479526519775, + "step": 1161 + }, + { + "epoch": 0.13, + "learning_rate": 2.637949198173944e-07, + "logits/chosen": -2.375066041946411, + "logits/rejected": -2.3655948638916016, + "logps/chosen": -235.02639770507812, + "logps/rejected": -194.9849853515625, + "loss": 1.2953, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.725496768951416, + "rewards/margins": -0.17173054814338684, + "rewards/rejected": -2.5537660121917725, + "step": 1162 + }, + { + "epoch": 0.13, + "learning_rate": 2.637598033477701e-07, + "logits/chosen": -2.4681334495544434, + "logits/rejected": -2.502537727355957, + "logps/chosen": -216.67510986328125, + "logps/rejected": -272.1827392578125, + "loss": 0.1779, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23560257256031036, + "rewards/margins": 3.027677536010742, + "rewards/rejected": -2.7920751571655273, + "step": 1163 + }, + { + "epoch": 0.13, + "learning_rate": 2.6372468687814583e-07, + "logits/chosen": -2.3605244159698486, + "logits/rejected": -2.2732067108154297, + "logps/chosen": -285.9834899902344, + "logps/rejected": -249.64556884765625, + "loss": 0.6145, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.186070442199707, + "rewards/margins": 0.3366195857524872, + "rewards/rejected": -1.522689938545227, + "step": 1164 + }, + { + "epoch": 0.13, + "learning_rate": 2.636895704085216e-07, + "logits/chosen": -2.315774440765381, + "logits/rejected": -2.3569414615631104, + "logps/chosen": -179.0879364013672, + "logps/rejected": -304.43988037109375, + "loss": 0.2966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34383779764175415, + "rewards/margins": 2.1013965606689453, + "rewards/rejected": -2.4452342987060547, + "step": 1165 + }, + { + "epoch": 0.13, + "learning_rate": 2.6365445393889734e-07, + "logits/chosen": -2.3886361122131348, + "logits/rejected": -2.583533763885498, + "logps/chosen": -587.6896362304688, + "logps/rejected": -286.4595947265625, + "loss": 0.4648, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4493083357810974, + "rewards/margins": 1.4516029357910156, + "rewards/rejected": -1.9009112119674683, + "step": 1166 + }, + { + "epoch": 0.13, + "learning_rate": 2.636193374692731e-07, + "logits/chosen": -1.8224546909332275, + "logits/rejected": -1.5858136415481567, + "logps/chosen": -371.58795166015625, + "logps/rejected": -360.80645751953125, + "loss": 0.8642, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6062813997268677, + "rewards/margins": -0.08807447552680969, + "rewards/rejected": -0.5182068943977356, + "step": 1167 + }, + { + "epoch": 0.13, + "learning_rate": 2.635842209996488e-07, + "logits/chosen": -2.273179531097412, + "logits/rejected": -1.9923887252807617, + "logps/chosen": -162.1830596923828, + "logps/rejected": -249.775390625, + "loss": 0.4261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5939012765884399, + "rewards/margins": 1.3758902549743652, + "rewards/rejected": -1.9697915315628052, + "step": 1168 + }, + { + "epoch": 0.13, + "learning_rate": 2.6354910453002455e-07, + "logits/chosen": -2.231584072113037, + "logits/rejected": -2.346567392349243, + "logps/chosen": -225.83285522460938, + "logps/rejected": -207.6768798828125, + "loss": 0.4447, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3737974166870117, + "rewards/margins": 1.589982509613037, + "rewards/rejected": -1.9637800455093384, + "step": 1169 + }, + { + "epoch": 0.13, + "learning_rate": 2.6351398806040035e-07, + "logits/chosen": -2.3222603797912598, + "logits/rejected": -1.9711601734161377, + "logps/chosen": -347.7451477050781, + "logps/rejected": -360.99639892578125, + "loss": 0.7906, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5152109861373901, + "rewards/margins": 0.5384347438812256, + "rewards/rejected": -2.053645610809326, + "step": 1170 + }, + { + "epoch": 0.13, + "learning_rate": 2.6347887159077605e-07, + "logits/chosen": -2.39858078956604, + "logits/rejected": -2.4723408222198486, + "logps/chosen": -213.8218994140625, + "logps/rejected": -215.48028564453125, + "loss": 0.2909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24426868557929993, + "rewards/margins": 1.8297443389892578, + "rewards/rejected": -1.5854756832122803, + "step": 1171 + }, + { + "epoch": 0.14, + "learning_rate": 2.634437551211518e-07, + "logits/chosen": -2.2204060554504395, + "logits/rejected": -2.563915967941284, + "logps/chosen": -362.48382568359375, + "logps/rejected": -201.93634033203125, + "loss": 0.1517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.407565712928772, + "rewards/margins": 1.9692587852478027, + "rewards/rejected": -2.376824378967285, + "step": 1172 + }, + { + "epoch": 0.14, + "learning_rate": 2.6340863865152756e-07, + "logits/chosen": -1.95209538936615, + "logits/rejected": -1.8903065919876099, + "logps/chosen": -162.58514404296875, + "logps/rejected": -209.6966552734375, + "loss": 0.5422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6784054040908813, + "rewards/margins": 0.8046982884407043, + "rewards/rejected": -1.4831037521362305, + "step": 1173 + }, + { + "epoch": 0.14, + "learning_rate": 2.633735221819033e-07, + "logits/chosen": -2.6541285514831543, + "logits/rejected": -2.5078558921813965, + "logps/chosen": -186.3480682373047, + "logps/rejected": -297.9615783691406, + "loss": 0.2696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3699038624763489, + "rewards/margins": 2.539731979370117, + "rewards/rejected": -2.9096360206604004, + "step": 1174 + }, + { + "epoch": 0.14, + "learning_rate": 2.6333840571227907e-07, + "logits/chosen": -2.771940231323242, + "logits/rejected": -2.6366961002349854, + "logps/chosen": -229.66671752929688, + "logps/rejected": -275.444580078125, + "loss": 0.733, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7213906049728394, + "rewards/margins": 1.311477541923523, + "rewards/rejected": -2.0328681468963623, + "step": 1175 + }, + { + "epoch": 0.14, + "learning_rate": 2.6330328924265477e-07, + "logits/chosen": -2.109370470046997, + "logits/rejected": -2.148027181625366, + "logps/chosen": -264.2872314453125, + "logps/rejected": -253.9931640625, + "loss": 0.4274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6699625253677368, + "rewards/margins": 1.6300594806671143, + "rewards/rejected": -2.3000221252441406, + "step": 1176 + }, + { + "epoch": 0.14, + "learning_rate": 2.632681727730305e-07, + "logits/chosen": -2.1526310443878174, + "logits/rejected": -2.0274696350097656, + "logps/chosen": -343.77178955078125, + "logps/rejected": -436.1939697265625, + "loss": 0.4606, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6942146420478821, + "rewards/margins": 0.9045483469963074, + "rewards/rejected": -1.5987629890441895, + "step": 1177 + }, + { + "epoch": 0.14, + "learning_rate": 2.632330563034063e-07, + "logits/chosen": -1.8822377920150757, + "logits/rejected": -1.3899985551834106, + "logps/chosen": -156.68780517578125, + "logps/rejected": -393.74603271484375, + "loss": 0.3239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7160618305206299, + "rewards/margins": 2.0396032333374023, + "rewards/rejected": -2.755664825439453, + "step": 1178 + }, + { + "epoch": 0.14, + "learning_rate": 2.6319793983378203e-07, + "logits/chosen": -2.163900136947632, + "logits/rejected": -2.1668546199798584, + "logps/chosen": -597.447509765625, + "logps/rejected": -522.2208251953125, + "loss": 0.3556, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6146150827407837, + "rewards/margins": 1.4564273357391357, + "rewards/rejected": -2.07104229927063, + "step": 1179 + }, + { + "epoch": 0.14, + "learning_rate": 2.631628233641578e-07, + "logits/chosen": -2.179441213607788, + "logits/rejected": -2.167039632797241, + "logps/chosen": -545.7587280273438, + "logps/rejected": -442.6719055175781, + "loss": 2.7161, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.595858097076416, + "rewards/margins": -1.4175463914871216, + "rewards/rejected": -2.178311586380005, + "step": 1180 + }, + { + "epoch": 0.14, + "learning_rate": 2.631277068945335e-07, + "logits/chosen": -1.8776854276657104, + "logits/rejected": -1.7572450637817383, + "logps/chosen": -310.4964599609375, + "logps/rejected": -388.1448974609375, + "loss": 0.3952, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3112090528011322, + "rewards/margins": 1.1409590244293213, + "rewards/rejected": -1.4521681070327759, + "step": 1181 + }, + { + "epoch": 0.14, + "learning_rate": 2.6309259042490924e-07, + "logits/chosen": -2.130707025527954, + "logits/rejected": -1.9776620864868164, + "logps/chosen": -122.27255249023438, + "logps/rejected": -335.1138000488281, + "loss": 0.1578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30185505747795105, + "rewards/margins": 2.745950222015381, + "rewards/rejected": -3.0478055477142334, + "step": 1182 + }, + { + "epoch": 0.14, + "learning_rate": 2.6305747395528505e-07, + "logits/chosen": -2.4134578704833984, + "logits/rejected": -2.43038010597229, + "logps/chosen": -313.3255310058594, + "logps/rejected": -195.69161987304688, + "loss": 0.3596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9334021210670471, + "rewards/margins": 1.2486358880996704, + "rewards/rejected": -2.1820380687713623, + "step": 1183 + }, + { + "epoch": 0.14, + "learning_rate": 2.6302235748566075e-07, + "logits/chosen": -2.6604294776916504, + "logits/rejected": -2.725644826889038, + "logps/chosen": -247.21742248535156, + "logps/rejected": -286.41259765625, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2172408401966095, + "rewards/margins": 2.253211498260498, + "rewards/rejected": -2.470452308654785, + "step": 1184 + }, + { + "epoch": 0.14, + "learning_rate": 2.629872410160365e-07, + "logits/chosen": -2.4065260887145996, + "logits/rejected": -2.467524766921997, + "logps/chosen": -107.28704833984375, + "logps/rejected": -248.11212158203125, + "loss": 0.2146, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14091461896896362, + "rewards/margins": 2.886866331100464, + "rewards/rejected": -2.7459514141082764, + "step": 1185 + }, + { + "epoch": 0.14, + "learning_rate": 2.6295212454641226e-07, + "logits/chosen": -2.606304168701172, + "logits/rejected": -2.5515851974487305, + "logps/chosen": -146.93496704101562, + "logps/rejected": -150.548583984375, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4547150433063507, + "rewards/margins": 2.1254725456237793, + "rewards/rejected": -2.5801875591278076, + "step": 1186 + }, + { + "epoch": 0.14, + "learning_rate": 2.62917008076788e-07, + "logits/chosen": -2.1124329566955566, + "logits/rejected": -1.8442542552947998, + "logps/chosen": -193.85670471191406, + "logps/rejected": -264.699951171875, + "loss": 0.542, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6066811680793762, + "rewards/margins": 0.7071362137794495, + "rewards/rejected": -1.3138173818588257, + "step": 1187 + }, + { + "epoch": 0.14, + "learning_rate": 2.6288189160716376e-07, + "logits/chosen": -2.1195919513702393, + "logits/rejected": -2.4238975048065186, + "logps/chosen": -284.0792236328125, + "logps/rejected": -186.47213745117188, + "loss": 0.4599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7662531733512878, + "rewards/margins": 1.1667035818099976, + "rewards/rejected": -1.9329566955566406, + "step": 1188 + }, + { + "epoch": 0.14, + "learning_rate": 2.6284677513753947e-07, + "logits/chosen": -2.320382595062256, + "logits/rejected": -2.075927734375, + "logps/chosen": -268.8757629394531, + "logps/rejected": -313.6043395996094, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6470416188240051, + "rewards/margins": 2.369433641433716, + "rewards/rejected": -3.016475200653076, + "step": 1189 + }, + { + "epoch": 0.14, + "learning_rate": 2.628116586679152e-07, + "logits/chosen": -2.495981216430664, + "logits/rejected": -2.4443447589874268, + "logps/chosen": -121.54672241210938, + "logps/rejected": -242.5754852294922, + "loss": 0.4099, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7040584087371826, + "rewards/margins": 1.1963746547698975, + "rewards/rejected": -1.90043306350708, + "step": 1190 + }, + { + "epoch": 0.14, + "learning_rate": 2.6277654219829097e-07, + "logits/chosen": -1.9347459077835083, + "logits/rejected": -2.3134541511535645, + "logps/chosen": -589.193359375, + "logps/rejected": -357.64923095703125, + "loss": 0.6518, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1733235120773315, + "rewards/margins": 0.7712587714195251, + "rewards/rejected": -1.9445823431015015, + "step": 1191 + }, + { + "epoch": 0.14, + "learning_rate": 2.6274142572866673e-07, + "logits/chosen": -2.2872962951660156, + "logits/rejected": -2.2971701622009277, + "logps/chosen": -275.7373352050781, + "logps/rejected": -166.01123046875, + "loss": 0.2776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43689683079719543, + "rewards/margins": 1.6419355869293213, + "rewards/rejected": -2.0788323879241943, + "step": 1192 + }, + { + "epoch": 0.14, + "learning_rate": 2.627063092590425e-07, + "logits/chosen": -2.2520885467529297, + "logits/rejected": -2.1221189498901367, + "logps/chosen": -228.8141632080078, + "logps/rejected": -226.7569580078125, + "loss": 0.4563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6480573415756226, + "rewards/margins": 0.7855912446975708, + "rewards/rejected": -1.4336485862731934, + "step": 1193 + }, + { + "epoch": 0.14, + "learning_rate": 2.6267119278941823e-07, + "logits/chosen": -2.3449878692626953, + "logits/rejected": -2.368161916732788, + "logps/chosen": -365.6710205078125, + "logps/rejected": -221.6266632080078, + "loss": 0.2434, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09418828040361404, + "rewards/margins": 1.7169408798217773, + "rewards/rejected": -1.6227525472640991, + "step": 1194 + }, + { + "epoch": 0.14, + "learning_rate": 2.62636076319794e-07, + "logits/chosen": -2.298729419708252, + "logits/rejected": -2.443502902984619, + "logps/chosen": -294.3287658691406, + "logps/rejected": -242.67384338378906, + "loss": 0.5171, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4942660927772522, + "rewards/margins": 1.0370550155639648, + "rewards/rejected": -1.5313210487365723, + "step": 1195 + }, + { + "epoch": 0.14, + "learning_rate": 2.6260095985016974e-07, + "logits/chosen": -2.3685710430145264, + "logits/rejected": -2.3321774005889893, + "logps/chosen": -118.12751770019531, + "logps/rejected": -99.90470123291016, + "loss": 0.5112, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5160724520683289, + "rewards/margins": 1.009129285812378, + "rewards/rejected": -1.5252017974853516, + "step": 1196 + }, + { + "epoch": 0.14, + "learning_rate": 2.6256584338054544e-07, + "logits/chosen": -1.8907160758972168, + "logits/rejected": -1.975860595703125, + "logps/chosen": -427.292724609375, + "logps/rejected": -377.80859375, + "loss": 0.4468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6146835088729858, + "rewards/margins": 1.388251781463623, + "rewards/rejected": -2.0029351711273193, + "step": 1197 + }, + { + "epoch": 0.14, + "learning_rate": 2.625307269109212e-07, + "logits/chosen": -2.0047736167907715, + "logits/rejected": -1.770202875137329, + "logps/chosen": -271.2005920410156, + "logps/rejected": -346.3463134765625, + "loss": 0.6199, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.532253623008728, + "rewards/margins": 0.7870959043502808, + "rewards/rejected": -2.319349527359009, + "step": 1198 + }, + { + "epoch": 0.14, + "learning_rate": 2.6249561044129695e-07, + "logits/chosen": -2.201559543609619, + "logits/rejected": -2.016878366470337, + "logps/chosen": -211.8940887451172, + "logps/rejected": -294.4821472167969, + "loss": 0.3156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03347517177462578, + "rewards/margins": 1.8919622898101807, + "rewards/rejected": -1.9254374504089355, + "step": 1199 + }, + { + "epoch": 0.14, + "learning_rate": 2.624604939716727e-07, + "logits/chosen": -2.314502000808716, + "logits/rejected": -2.559194564819336, + "logps/chosen": -340.1903991699219, + "logps/rejected": -213.24591064453125, + "loss": 2.4743, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.265897274017334, + "rewards/margins": -1.5096607208251953, + "rewards/rejected": -1.7562364339828491, + "step": 1200 + }, + { + "epoch": 0.14, + "learning_rate": 2.6242537750204846e-07, + "logits/chosen": -2.572819948196411, + "logits/rejected": -2.544923782348633, + "logps/chosen": -236.7320098876953, + "logps/rejected": -319.0283508300781, + "loss": 0.2722, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44470375776290894, + "rewards/margins": 2.3281760215759277, + "rewards/rejected": -2.7728798389434814, + "step": 1201 + }, + { + "epoch": 0.14, + "learning_rate": 2.623902610324242e-07, + "logits/chosen": -2.840245485305786, + "logits/rejected": -2.725034713745117, + "logps/chosen": -222.37254333496094, + "logps/rejected": -314.6162414550781, + "loss": 0.1031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.062690258026123, + "rewards/margins": 3.905931234359741, + "rewards/rejected": -4.968621253967285, + "step": 1202 + }, + { + "epoch": 0.14, + "learning_rate": 2.623551445627999e-07, + "logits/chosen": -2.696626663208008, + "logits/rejected": -2.695582389831543, + "logps/chosen": -259.7284851074219, + "logps/rejected": -346.36700439453125, + "loss": 0.1436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21545198559761047, + "rewards/margins": 3.049877166748047, + "rewards/rejected": -2.8344249725341797, + "step": 1203 + }, + { + "epoch": 0.14, + "learning_rate": 2.623200280931757e-07, + "logits/chosen": -2.3469934463500977, + "logits/rejected": -2.474458932876587, + "logps/chosen": -249.98927307128906, + "logps/rejected": -210.7780303955078, + "loss": 0.7592, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7614404559135437, + "rewards/margins": 0.3405945599079132, + "rewards/rejected": -1.1020350456237793, + "step": 1204 + }, + { + "epoch": 0.14, + "learning_rate": 2.622849116235514e-07, + "logits/chosen": -1.9221974611282349, + "logits/rejected": -1.7608872652053833, + "logps/chosen": -291.67425537109375, + "logps/rejected": -330.08514404296875, + "loss": 0.5668, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1900159120559692, + "rewards/margins": 1.4254212379455566, + "rewards/rejected": -2.6154372692108154, + "step": 1205 + }, + { + "epoch": 0.14, + "learning_rate": 2.622497951539272e-07, + "logits/chosen": -2.380122423171997, + "logits/rejected": -2.688448905944824, + "logps/chosen": -219.58139038085938, + "logps/rejected": -133.3443145751953, + "loss": 0.8034, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9942198395729065, + "rewards/margins": 0.11466355621814728, + "rewards/rejected": -1.108883261680603, + "step": 1206 + }, + { + "epoch": 0.14, + "learning_rate": 2.6221467868430293e-07, + "logits/chosen": -2.3649332523345947, + "logits/rejected": -2.6266958713531494, + "logps/chosen": -428.2552490234375, + "logps/rejected": -406.5830993652344, + "loss": 0.4404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7329409122467041, + "rewards/margins": 2.2104270458221436, + "rewards/rejected": -2.9433679580688477, + "step": 1207 + }, + { + "epoch": 0.14, + "learning_rate": 2.621795622146787e-07, + "logits/chosen": -2.1851704120635986, + "logits/rejected": -2.340515613555908, + "logps/chosen": -403.56939697265625, + "logps/rejected": -311.2399597167969, + "loss": 0.4888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5273686647415161, + "rewards/margins": 1.1312546730041504, + "rewards/rejected": -1.658623456954956, + "step": 1208 + }, + { + "epoch": 0.14, + "learning_rate": 2.6214444574505444e-07, + "logits/chosen": -2.319629669189453, + "logits/rejected": -2.32088041305542, + "logps/chosen": -395.6915283203125, + "logps/rejected": -427.455810546875, + "loss": 0.4201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.822769045829773, + "rewards/margins": 1.5130956172943115, + "rewards/rejected": -2.335864305496216, + "step": 1209 + }, + { + "epoch": 0.14, + "learning_rate": 2.621093292754302e-07, + "logits/chosen": -2.16302752494812, + "logits/rejected": -2.152747392654419, + "logps/chosen": -224.1492919921875, + "logps/rejected": -267.7296142578125, + "loss": 0.4231, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1350178718566895, + "rewards/margins": 1.3673169612884521, + "rewards/rejected": -2.5023348331451416, + "step": 1210 + }, + { + "epoch": 0.14, + "learning_rate": 2.620742128058059e-07, + "logits/chosen": -2.562072515487671, + "logits/rejected": -2.4052371978759766, + "logps/chosen": -147.81927490234375, + "logps/rejected": -258.5527038574219, + "loss": 0.5893, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7385566830635071, + "rewards/margins": 0.8087479472160339, + "rewards/rejected": -1.5473047494888306, + "step": 1211 + }, + { + "epoch": 0.14, + "learning_rate": 2.6203909633618164e-07, + "logits/chosen": -2.3918616771698, + "logits/rejected": -2.173098564147949, + "logps/chosen": -299.8861389160156, + "logps/rejected": -427.250732421875, + "loss": 0.8395, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9787390828132629, + "rewards/margins": 2.2599740028381348, + "rewards/rejected": -3.238713264465332, + "step": 1212 + }, + { + "epoch": 0.14, + "learning_rate": 2.620039798665574e-07, + "logits/chosen": -2.367248296737671, + "logits/rejected": -2.2681241035461426, + "logps/chosen": -217.1181640625, + "logps/rejected": -190.81919860839844, + "loss": 0.39, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23174108564853668, + "rewards/margins": 1.4032435417175293, + "rewards/rejected": -1.6349847316741943, + "step": 1213 + }, + { + "epoch": 0.14, + "learning_rate": 2.6196886339693315e-07, + "logits/chosen": -2.491971015930176, + "logits/rejected": -2.636580228805542, + "logps/chosen": -186.38455200195312, + "logps/rejected": -190.436767578125, + "loss": 0.465, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47079527378082275, + "rewards/margins": 2.5382659435272217, + "rewards/rejected": -3.009061336517334, + "step": 1214 + }, + { + "epoch": 0.14, + "learning_rate": 2.619337469273089e-07, + "logits/chosen": -2.4897472858428955, + "logits/rejected": -2.5283544063568115, + "logps/chosen": -316.90057373046875, + "logps/rejected": -233.03067016601562, + "loss": 0.6917, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.051842212677002, + "rewards/margins": 0.659302830696106, + "rewards/rejected": -1.7111449241638184, + "step": 1215 + }, + { + "epoch": 0.14, + "learning_rate": 2.618986304576846e-07, + "logits/chosen": -2.693286418914795, + "logits/rejected": -2.573270320892334, + "logps/chosen": -320.287841796875, + "logps/rejected": -222.59127807617188, + "loss": 0.6104, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6777320504188538, + "rewards/margins": 1.2029129266738892, + "rewards/rejected": -1.8806450366973877, + "step": 1216 + }, + { + "epoch": 0.14, + "learning_rate": 2.618635139880604e-07, + "logits/chosen": -2.40132737159729, + "logits/rejected": -2.0916824340820312, + "logps/chosen": -384.0439147949219, + "logps/rejected": -401.3180236816406, + "loss": 0.2375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1902656853199005, + "rewards/margins": 2.2003626823425293, + "rewards/rejected": -2.010096788406372, + "step": 1217 + }, + { + "epoch": 0.14, + "learning_rate": 2.6182839751843617e-07, + "logits/chosen": -2.1826999187469482, + "logits/rejected": -2.0099363327026367, + "logps/chosen": -228.3021240234375, + "logps/rejected": -264.562744140625, + "loss": 0.4071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4945262372493744, + "rewards/margins": 1.6322124004364014, + "rewards/rejected": -2.1267385482788086, + "step": 1218 + }, + { + "epoch": 0.14, + "learning_rate": 2.6179328104881187e-07, + "logits/chosen": -1.9786255359649658, + "logits/rejected": -2.077709674835205, + "logps/chosen": -374.93292236328125, + "logps/rejected": -287.2472839355469, + "loss": 0.4622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6592519283294678, + "rewards/margins": 0.821523904800415, + "rewards/rejected": -1.4807758331298828, + "step": 1219 + }, + { + "epoch": 0.14, + "learning_rate": 2.617581645791876e-07, + "logits/chosen": -2.191058397293091, + "logits/rejected": -1.9813461303710938, + "logps/chosen": -351.0128479003906, + "logps/rejected": -434.2020263671875, + "loss": 0.2362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7220970392227173, + "rewards/margins": 2.2828147411346436, + "rewards/rejected": -3.0049118995666504, + "step": 1220 + }, + { + "epoch": 0.14, + "learning_rate": 2.617230481095634e-07, + "logits/chosen": -1.9122111797332764, + "logits/rejected": -2.2979345321655273, + "logps/chosen": -366.3553771972656, + "logps/rejected": -175.98922729492188, + "loss": 0.3403, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24583472311496735, + "rewards/margins": 2.008526086807251, + "rewards/rejected": -2.2543606758117676, + "step": 1221 + }, + { + "epoch": 0.14, + "learning_rate": 2.6168793163993913e-07, + "logits/chosen": -2.2026350498199463, + "logits/rejected": -2.29133939743042, + "logps/chosen": -217.5910186767578, + "logps/rejected": -230.13973999023438, + "loss": 0.279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1783258318901062, + "rewards/margins": 1.4976460933685303, + "rewards/rejected": -1.6759717464447021, + "step": 1222 + }, + { + "epoch": 0.14, + "learning_rate": 2.616528151703149e-07, + "logits/chosen": -2.215341567993164, + "logits/rejected": -2.389342784881592, + "logps/chosen": -291.0817565917969, + "logps/rejected": -228.50643920898438, + "loss": 0.6474, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1257119178771973, + "rewards/margins": 0.4159754514694214, + "rewards/rejected": -1.541687250137329, + "step": 1223 + }, + { + "epoch": 0.14, + "learning_rate": 2.616176987006906e-07, + "logits/chosen": -2.212592363357544, + "logits/rejected": -2.333336114883423, + "logps/chosen": -369.40087890625, + "logps/rejected": -351.35882568359375, + "loss": 0.7413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.634931743144989, + "rewards/margins": 1.0743565559387207, + "rewards/rejected": -1.709288239479065, + "step": 1224 + }, + { + "epoch": 0.14, + "learning_rate": 2.6158258223106634e-07, + "logits/chosen": -2.4651551246643066, + "logits/rejected": -2.332817554473877, + "logps/chosen": -192.83706665039062, + "logps/rejected": -255.3411407470703, + "loss": 0.7443, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4994051456451416, + "rewards/margins": 0.28595566749572754, + "rewards/rejected": -1.7853609323501587, + "step": 1225 + }, + { + "epoch": 0.14, + "learning_rate": 2.6154746576144215e-07, + "logits/chosen": -2.2488555908203125, + "logits/rejected": -2.660276412963867, + "logps/chosen": -557.8569946289062, + "logps/rejected": -248.9820556640625, + "loss": 0.1861, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06029766798019409, + "rewards/margins": 1.8520750999450684, + "rewards/rejected": -1.7917776107788086, + "step": 1226 + }, + { + "epoch": 0.14, + "learning_rate": 2.6151234929181785e-07, + "logits/chosen": -1.8140621185302734, + "logits/rejected": -2.3547449111938477, + "logps/chosen": -296.48455810546875, + "logps/rejected": -208.09671020507812, + "loss": 0.4326, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.326702356338501, + "rewards/margins": 1.1341652870178223, + "rewards/rejected": -2.4608676433563232, + "step": 1227 + }, + { + "epoch": 0.14, + "learning_rate": 2.614772328221936e-07, + "logits/chosen": -2.7471282482147217, + "logits/rejected": -2.685302734375, + "logps/chosen": -307.746337890625, + "logps/rejected": -334.9953918457031, + "loss": 0.2134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02764667570590973, + "rewards/margins": 3.0332560539245605, + "rewards/rejected": -3.0609028339385986, + "step": 1228 + }, + { + "epoch": 0.14, + "learning_rate": 2.6144211635256935e-07, + "logits/chosen": -2.7238481044769287, + "logits/rejected": -2.7595155239105225, + "logps/chosen": -242.04241943359375, + "logps/rejected": -214.8228759765625, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8853107690811157, + "rewards/margins": 1.4906158447265625, + "rewards/rejected": -2.3759267330169678, + "step": 1229 + }, + { + "epoch": 0.14, + "learning_rate": 2.614069998829451e-07, + "logits/chosen": -2.8816490173339844, + "logits/rejected": -2.6047122478485107, + "logps/chosen": -254.13006591796875, + "logps/rejected": -250.32968139648438, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8495271801948547, + "rewards/margins": 2.3216238021850586, + "rewards/rejected": -3.1711509227752686, + "step": 1230 + }, + { + "epoch": 0.14, + "learning_rate": 2.6137188341332086e-07, + "logits/chosen": -2.1959872245788574, + "logits/rejected": -2.312016010284424, + "logps/chosen": -266.3670349121094, + "logps/rejected": -309.56365966796875, + "loss": 0.5206, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22267881035804749, + "rewards/margins": 1.252880334854126, + "rewards/rejected": -1.475559115409851, + "step": 1231 + }, + { + "epoch": 0.14, + "learning_rate": 2.6133676694369656e-07, + "logits/chosen": -2.3735008239746094, + "logits/rejected": -2.362766742706299, + "logps/chosen": -208.64971923828125, + "logps/rejected": -167.8701171875, + "loss": 0.3154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4814925193786621, + "rewards/margins": 1.062526822090149, + "rewards/rejected": -1.5440192222595215, + "step": 1232 + }, + { + "epoch": 0.14, + "learning_rate": 2.613016504740723e-07, + "logits/chosen": -2.5004119873046875, + "logits/rejected": -2.2246131896972656, + "logps/chosen": -235.0240478515625, + "logps/rejected": -333.85430908203125, + "loss": 1.0282, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3835958242416382, + "rewards/margins": 0.4790073037147522, + "rewards/rejected": -1.8626030683517456, + "step": 1233 + }, + { + "epoch": 0.14, + "learning_rate": 2.6126653400444807e-07, + "logits/chosen": -2.733006477355957, + "logits/rejected": -2.6361865997314453, + "logps/chosen": -489.84942626953125, + "logps/rejected": -335.875244140625, + "loss": 0.068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1221366822719574, + "rewards/margins": 2.8611679077148438, + "rewards/rejected": -2.7390313148498535, + "step": 1234 + }, + { + "epoch": 0.14, + "learning_rate": 2.612314175348238e-07, + "logits/chosen": -2.6086812019348145, + "logits/rejected": -2.700587749481201, + "logps/chosen": -315.0569763183594, + "logps/rejected": -364.7425231933594, + "loss": 0.2341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36202019453048706, + "rewards/margins": 3.061565399169922, + "rewards/rejected": -3.4235854148864746, + "step": 1235 + }, + { + "epoch": 0.14, + "learning_rate": 2.611963010651996e-07, + "logits/chosen": -1.9043461084365845, + "logits/rejected": -1.9734102487564087, + "logps/chosen": -337.082275390625, + "logps/rejected": -311.52545166015625, + "loss": 0.343, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1681126356124878, + "rewards/margins": 1.5270308256149292, + "rewards/rejected": -2.695143222808838, + "step": 1236 + }, + { + "epoch": 0.14, + "learning_rate": 2.611611845955753e-07, + "logits/chosen": -2.3135061264038086, + "logits/rejected": -2.3962998390197754, + "logps/chosen": -208.94921875, + "logps/rejected": -151.89614868164062, + "loss": 0.3747, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33812081813812256, + "rewards/margins": 0.9795241355895996, + "rewards/rejected": -1.3176449537277222, + "step": 1237 + }, + { + "epoch": 0.14, + "learning_rate": 2.611260681259511e-07, + "logits/chosen": -1.8027485609054565, + "logits/rejected": -1.9699039459228516, + "logps/chosen": -368.8949279785156, + "logps/rejected": -178.34628295898438, + "loss": 0.5777, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6019672751426697, + "rewards/margins": 1.1809881925582886, + "rewards/rejected": -1.7829554080963135, + "step": 1238 + }, + { + "epoch": 0.14, + "learning_rate": 2.6109095165632684e-07, + "logits/chosen": -2.08000111579895, + "logits/rejected": -2.209047794342041, + "logps/chosen": -358.88629150390625, + "logps/rejected": -304.9125671386719, + "loss": 0.3055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25006285309791565, + "rewards/margins": 2.3322055339813232, + "rewards/rejected": -2.582268238067627, + "step": 1239 + }, + { + "epoch": 0.14, + "learning_rate": 2.6105583518670254e-07, + "logits/chosen": -2.4406704902648926, + "logits/rejected": -2.1019973754882812, + "logps/chosen": -277.1925048828125, + "logps/rejected": -342.7044982910156, + "loss": 0.8147, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0094940662384033, + "rewards/margins": 0.4721108675003052, + "rewards/rejected": -1.4816049337387085, + "step": 1240 + }, + { + "epoch": 0.14, + "learning_rate": 2.610207187170783e-07, + "logits/chosen": -2.4417812824249268, + "logits/rejected": -2.482100009918213, + "logps/chosen": -159.58827209472656, + "logps/rejected": -302.1109924316406, + "loss": 0.4087, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6362797021865845, + "rewards/margins": 1.632408618927002, + "rewards/rejected": -2.268688201904297, + "step": 1241 + }, + { + "epoch": 0.14, + "learning_rate": 2.6098560224745405e-07, + "logits/chosen": -2.7858710289001465, + "logits/rejected": -2.6903841495513916, + "logps/chosen": -252.63821411132812, + "logps/rejected": -284.96331787109375, + "loss": 0.2745, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13229218125343323, + "rewards/margins": 2.7428526878356934, + "rewards/rejected": -2.610560417175293, + "step": 1242 + }, + { + "epoch": 0.14, + "learning_rate": 2.609504857778298e-07, + "logits/chosen": -2.133867025375366, + "logits/rejected": -2.4753713607788086, + "logps/chosen": -301.37884521484375, + "logps/rejected": -337.9920654296875, + "loss": 0.5215, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5639783143997192, + "rewards/margins": 1.8206266164779663, + "rewards/rejected": -2.3846049308776855, + "step": 1243 + }, + { + "epoch": 0.14, + "learning_rate": 2.6091536930820556e-07, + "logits/chosen": -1.6375278234481812, + "logits/rejected": -1.9766199588775635, + "logps/chosen": -489.6222839355469, + "logps/rejected": -251.22128295898438, + "loss": 0.4503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21854913234710693, + "rewards/margins": 1.2474963665008545, + "rewards/rejected": -1.4660457372665405, + "step": 1244 + }, + { + "epoch": 0.14, + "learning_rate": 2.6088025283858126e-07, + "logits/chosen": -2.3321878910064697, + "logits/rejected": -2.4833123683929443, + "logps/chosen": -303.08929443359375, + "logps/rejected": -249.7528076171875, + "loss": 0.4781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6681906580924988, + "rewards/margins": 0.6888229846954346, + "rewards/rejected": -1.3570135831832886, + "step": 1245 + }, + { + "epoch": 0.14, + "learning_rate": 2.60845136368957e-07, + "logits/chosen": -2.3020780086517334, + "logits/rejected": -2.066758155822754, + "logps/chosen": -185.47499084472656, + "logps/rejected": -234.49427795410156, + "loss": 0.5423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6685687303543091, + "rewards/margins": 1.0368448495864868, + "rewards/rejected": -1.705413579940796, + "step": 1246 + }, + { + "epoch": 0.14, + "learning_rate": 2.6081001989933277e-07, + "logits/chosen": -2.674208641052246, + "logits/rejected": -2.491147756576538, + "logps/chosen": -214.20286560058594, + "logps/rejected": -270.48779296875, + "loss": 0.3789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.753250002861023, + "rewards/margins": 3.809901714324951, + "rewards/rejected": -4.563152313232422, + "step": 1247 + }, + { + "epoch": 0.14, + "learning_rate": 2.607749034297085e-07, + "logits/chosen": -2.3840999603271484, + "logits/rejected": -2.376502752304077, + "logps/chosen": -293.76690673828125, + "logps/rejected": -177.0989227294922, + "loss": 0.2496, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10226689279079437, + "rewards/margins": 2.100275754928589, + "rewards/rejected": -1.9980087280273438, + "step": 1248 + }, + { + "epoch": 0.14, + "learning_rate": 2.6073978696008427e-07, + "logits/chosen": -2.9502997398376465, + "logits/rejected": -2.957667827606201, + "logps/chosen": -112.1104965209961, + "logps/rejected": -166.54759216308594, + "loss": 0.5364, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0345087051391602, + "rewards/margins": 2.6466851234436035, + "rewards/rejected": -3.6811935901641846, + "step": 1249 + }, + { + "epoch": 0.14, + "learning_rate": 2.6070467049046e-07, + "logits/chosen": -2.396228313446045, + "logits/rejected": -2.5470399856567383, + "logps/chosen": -340.0152282714844, + "logps/rejected": -222.62545776367188, + "loss": 0.3695, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9128315448760986, + "rewards/margins": 2.0067641735076904, + "rewards/rejected": -2.919595718383789, + "step": 1250 + }, + { + "epoch": 0.14, + "learning_rate": 2.606695540208358e-07, + "logits/chosen": -2.1252212524414062, + "logits/rejected": -2.4098565578460693, + "logps/chosen": -465.5081481933594, + "logps/rejected": -227.84217834472656, + "loss": 0.2379, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1414436101913452, + "rewards/margins": 1.9032078981399536, + "rewards/rejected": -3.0446512699127197, + "step": 1251 + }, + { + "epoch": 0.14, + "learning_rate": 2.6063443755121153e-07, + "logits/chosen": -2.3294615745544434, + "logits/rejected": -2.2880473136901855, + "logps/chosen": -143.82479858398438, + "logps/rejected": -240.41529846191406, + "loss": 0.3963, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6840824484825134, + "rewards/margins": 2.231374740600586, + "rewards/rejected": -2.915457248687744, + "step": 1252 + }, + { + "epoch": 0.14, + "learning_rate": 2.6059932108158724e-07, + "logits/chosen": -2.7053589820861816, + "logits/rejected": -2.6990225315093994, + "logps/chosen": -276.1961364746094, + "logps/rejected": -209.21592712402344, + "loss": 0.3513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5093609690666199, + "rewards/margins": 1.655852198600769, + "rewards/rejected": -2.165213108062744, + "step": 1253 + }, + { + "epoch": 0.14, + "learning_rate": 2.60564204611963e-07, + "logits/chosen": -1.9365088939666748, + "logits/rejected": -2.109623670578003, + "logps/chosen": -318.39508056640625, + "logps/rejected": -244.5581817626953, + "loss": 1.8779, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6030352115631104, + "rewards/margins": -0.19801437854766846, + "rewards/rejected": -3.4050207138061523, + "step": 1254 + }, + { + "epoch": 0.14, + "learning_rate": 2.6052908814233874e-07, + "logits/chosen": -2.0111594200134277, + "logits/rejected": -2.021592855453491, + "logps/chosen": -305.261474609375, + "logps/rejected": -364.0711669921875, + "loss": 0.4841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4913962185382843, + "rewards/margins": 1.1318678855895996, + "rewards/rejected": -1.623264193534851, + "step": 1255 + }, + { + "epoch": 0.14, + "learning_rate": 2.604939716727145e-07, + "logits/chosen": -1.8705155849456787, + "logits/rejected": -1.971639633178711, + "logps/chosen": -329.6698913574219, + "logps/rejected": -365.186767578125, + "loss": 0.6691, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0959595441818237, + "rewards/margins": 0.8573148250579834, + "rewards/rejected": -1.9532742500305176, + "step": 1256 + }, + { + "epoch": 0.14, + "learning_rate": 2.6045885520309025e-07, + "logits/chosen": -2.6284637451171875, + "logits/rejected": -2.4800539016723633, + "logps/chosen": -378.4010925292969, + "logps/rejected": -325.838134765625, + "loss": 0.337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21044570207595825, + "rewards/margins": 1.6278626918792725, + "rewards/rejected": -1.838308334350586, + "step": 1257 + }, + { + "epoch": 0.15, + "learning_rate": 2.6042373873346595e-07, + "logits/chosen": -2.098663091659546, + "logits/rejected": -2.368830919265747, + "logps/chosen": -171.84017944335938, + "logps/rejected": -225.16091918945312, + "loss": 0.4912, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3569262027740479, + "rewards/margins": 1.4811785221099854, + "rewards/rejected": -2.838104724884033, + "step": 1258 + }, + { + "epoch": 0.15, + "learning_rate": 2.603886222638417e-07, + "logits/chosen": -2.040858030319214, + "logits/rejected": -2.044887065887451, + "logps/chosen": -188.71966552734375, + "logps/rejected": -336.5268859863281, + "loss": 0.6324, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7475928068161011, + "rewards/margins": 1.057030200958252, + "rewards/rejected": -1.8046231269836426, + "step": 1259 + }, + { + "epoch": 0.15, + "learning_rate": 2.603535057942175e-07, + "logits/chosen": -2.5217549800872803, + "logits/rejected": -2.7652475833892822, + "logps/chosen": -266.3756408691406, + "logps/rejected": -420.982666015625, + "loss": 0.3352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7157343626022339, + "rewards/margins": 1.064690351486206, + "rewards/rejected": -1.78042471408844, + "step": 1260 + }, + { + "epoch": 0.15, + "learning_rate": 2.603183893245932e-07, + "logits/chosen": -2.468291997909546, + "logits/rejected": -2.4359326362609863, + "logps/chosen": -309.45123291015625, + "logps/rejected": -380.88995361328125, + "loss": 1.5642, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.969605803489685, + "rewards/margins": 1.3589930534362793, + "rewards/rejected": -3.328598976135254, + "step": 1261 + }, + { + "epoch": 0.15, + "learning_rate": 2.6028327285496897e-07, + "logits/chosen": -2.421710252761841, + "logits/rejected": -2.4593875408172607, + "logps/chosen": -289.607421875, + "logps/rejected": -335.7573547363281, + "loss": 0.3237, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6797513961791992, + "rewards/margins": 2.276355743408203, + "rewards/rejected": -2.9561071395874023, + "step": 1262 + }, + { + "epoch": 0.15, + "learning_rate": 2.602481563853447e-07, + "logits/chosen": -2.535024642944336, + "logits/rejected": -2.503337860107422, + "logps/chosen": -235.5904541015625, + "logps/rejected": -218.057861328125, + "loss": 0.9197, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1402575969696045, + "rewards/margins": 0.24920731782913208, + "rewards/rejected": -1.3894649744033813, + "step": 1263 + }, + { + "epoch": 0.15, + "learning_rate": 2.602130399157205e-07, + "logits/chosen": -2.5362064838409424, + "logits/rejected": -2.548828601837158, + "logps/chosen": -190.4687042236328, + "logps/rejected": -230.01336669921875, + "loss": 0.6772, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9166618585586548, + "rewards/margins": 0.7930564880371094, + "rewards/rejected": -1.7097182273864746, + "step": 1264 + }, + { + "epoch": 0.15, + "learning_rate": 2.6017792344609623e-07, + "logits/chosen": -2.0905661582946777, + "logits/rejected": -2.1359341144561768, + "logps/chosen": -262.92840576171875, + "logps/rejected": -344.87591552734375, + "loss": 0.4705, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.954411506652832, + "rewards/margins": 1.5178147554397583, + "rewards/rejected": -2.472226142883301, + "step": 1265 + }, + { + "epoch": 0.15, + "learning_rate": 2.6014280697647193e-07, + "logits/chosen": -2.930112361907959, + "logits/rejected": -3.0854690074920654, + "logps/chosen": -113.27953338623047, + "logps/rejected": -175.53663635253906, + "loss": 0.3977, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49380171298980713, + "rewards/margins": 2.321780204772949, + "rewards/rejected": -2.815581798553467, + "step": 1266 + }, + { + "epoch": 0.15, + "learning_rate": 2.601076905068477e-07, + "logits/chosen": -2.454097270965576, + "logits/rejected": -2.703137159347534, + "logps/chosen": -312.2829284667969, + "logps/rejected": -180.25567626953125, + "loss": 0.3236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10919611901044846, + "rewards/margins": 1.6835612058639526, + "rewards/rejected": -1.792757272720337, + "step": 1267 + }, + { + "epoch": 0.15, + "learning_rate": 2.6007257403722344e-07, + "logits/chosen": -2.7545418739318848, + "logits/rejected": -2.5482006072998047, + "logps/chosen": -321.7467346191406, + "logps/rejected": -235.8668212890625, + "loss": 0.2091, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5058218240737915, + "rewards/margins": 2.551293134689331, + "rewards/rejected": -3.057115077972412, + "step": 1268 + }, + { + "epoch": 0.15, + "learning_rate": 2.600374575675992e-07, + "logits/chosen": -2.6672210693359375, + "logits/rejected": -2.4639198780059814, + "logps/chosen": -171.9901885986328, + "logps/rejected": -253.72784423828125, + "loss": 0.2921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1084393784403801, + "rewards/margins": 1.6274049282073975, + "rewards/rejected": -1.735844373703003, + "step": 1269 + }, + { + "epoch": 0.15, + "learning_rate": 2.6000234109797495e-07, + "logits/chosen": -2.512528657913208, + "logits/rejected": -2.719599723815918, + "logps/chosen": -131.0909423828125, + "logps/rejected": -124.51785278320312, + "loss": 0.4729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4371042847633362, + "rewards/margins": 1.2747604846954346, + "rewards/rejected": -1.7118645906448364, + "step": 1270 + }, + { + "epoch": 0.15, + "learning_rate": 2.599672246283507e-07, + "logits/chosen": -2.3080031871795654, + "logits/rejected": -2.329314947128296, + "logps/chosen": -682.6929931640625, + "logps/rejected": -518.1687622070312, + "loss": 0.2621, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11740930378437042, + "rewards/margins": 2.616335868835449, + "rewards/rejected": -2.7337450981140137, + "step": 1271 + }, + { + "epoch": 0.15, + "learning_rate": 2.5993210815872645e-07, + "logits/chosen": -2.295029640197754, + "logits/rejected": -2.3023862838745117, + "logps/chosen": -357.6253356933594, + "logps/rejected": -265.8558044433594, + "loss": 0.6606, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8810425996780396, + "rewards/margins": 1.5352895259857178, + "rewards/rejected": -2.416332244873047, + "step": 1272 + }, + { + "epoch": 0.15, + "learning_rate": 2.598969916891022e-07, + "logits/chosen": -2.351187229156494, + "logits/rejected": -2.5803675651550293, + "logps/chosen": -223.71780395507812, + "logps/rejected": -168.96116638183594, + "loss": 0.348, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15146462619304657, + "rewards/margins": 1.4747024774551392, + "rewards/rejected": -1.6261670589447021, + "step": 1273 + }, + { + "epoch": 0.15, + "learning_rate": 2.598618752194779e-07, + "logits/chosen": -2.0099925994873047, + "logits/rejected": -2.021272897720337, + "logps/chosen": -259.3493347167969, + "logps/rejected": -259.771484375, + "loss": 0.2635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4844396114349365, + "rewards/margins": 2.1884617805480957, + "rewards/rejected": -2.672901153564453, + "step": 1274 + }, + { + "epoch": 0.15, + "learning_rate": 2.5982675874985366e-07, + "logits/chosen": -2.413701057434082, + "logits/rejected": -2.442497730255127, + "logps/chosen": -214.9688262939453, + "logps/rejected": -219.4199676513672, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15146052837371826, + "rewards/margins": 1.3376312255859375, + "rewards/rejected": -1.4890917539596558, + "step": 1275 + }, + { + "epoch": 0.15, + "learning_rate": 2.597916422802294e-07, + "logits/chosen": -2.177332878112793, + "logits/rejected": -2.3232545852661133, + "logps/chosen": -337.7678527832031, + "logps/rejected": -219.5247802734375, + "loss": 0.8026, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7953501343727112, + "rewards/margins": 1.066942572593689, + "rewards/rejected": -1.862292766571045, + "step": 1276 + }, + { + "epoch": 0.15, + "learning_rate": 2.5975652581060517e-07, + "logits/chosen": -2.541180372238159, + "logits/rejected": -2.506412982940674, + "logps/chosen": -247.7923126220703, + "logps/rejected": -226.3607177734375, + "loss": 0.3541, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17759428918361664, + "rewards/margins": 2.0098876953125, + "rewards/rejected": -2.1874818801879883, + "step": 1277 + }, + { + "epoch": 0.15, + "learning_rate": 2.597214093409809e-07, + "logits/chosen": -2.365760087966919, + "logits/rejected": -2.1403117179870605, + "logps/chosen": -158.4734649658203, + "logps/rejected": -239.99363708496094, + "loss": 0.5583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8573588132858276, + "rewards/margins": 1.176505208015442, + "rewards/rejected": -2.0338642597198486, + "step": 1278 + }, + { + "epoch": 0.15, + "learning_rate": 2.596862928713566e-07, + "logits/chosen": -2.225846290588379, + "logits/rejected": -1.9689433574676514, + "logps/chosen": -203.6602783203125, + "logps/rejected": -252.87486267089844, + "loss": 0.607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6268620491027832, + "rewards/margins": 1.2016592025756836, + "rewards/rejected": -1.8285211324691772, + "step": 1279 + }, + { + "epoch": 0.15, + "learning_rate": 2.596511764017324e-07, + "logits/chosen": -2.1215643882751465, + "logits/rejected": -2.147946357727051, + "logps/chosen": -249.7301483154297, + "logps/rejected": -270.53326416015625, + "loss": 0.4752, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.730949878692627, + "rewards/margins": 1.2268407344818115, + "rewards/rejected": -1.957790732383728, + "step": 1280 + }, + { + "epoch": 0.15, + "learning_rate": 2.5961605993210813e-07, + "logits/chosen": -2.090332508087158, + "logits/rejected": -2.2015674114227295, + "logps/chosen": -314.36212158203125, + "logps/rejected": -286.95721435546875, + "loss": 0.3275, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3136612176895142, + "rewards/margins": 1.8161003589630127, + "rewards/rejected": -3.1297614574432373, + "step": 1281 + }, + { + "epoch": 0.15, + "learning_rate": 2.595809434624839e-07, + "logits/chosen": -2.237532615661621, + "logits/rejected": -2.614426374435425, + "logps/chosen": -626.42919921875, + "logps/rejected": -216.63790893554688, + "loss": 0.4185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5841814279556274, + "rewards/margins": 1.651771903038025, + "rewards/rejected": -2.2359533309936523, + "step": 1282 + }, + { + "epoch": 0.15, + "learning_rate": 2.5954582699285964e-07, + "logits/chosen": -2.284329891204834, + "logits/rejected": -2.472717761993408, + "logps/chosen": -811.4769897460938, + "logps/rejected": -519.6988525390625, + "loss": 0.2956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1669808030128479, + "rewards/margins": 1.1742042303085327, + "rewards/rejected": -1.3411848545074463, + "step": 1283 + }, + { + "epoch": 0.15, + "learning_rate": 2.595107105232354e-07, + "logits/chosen": -2.3302650451660156, + "logits/rejected": -2.357332229614258, + "logps/chosen": -201.724365234375, + "logps/rejected": -251.0634765625, + "loss": 0.6351, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24587330222129822, + "rewards/margins": 0.5244501829147339, + "rewards/rejected": -0.7703234553337097, + "step": 1284 + }, + { + "epoch": 0.15, + "learning_rate": 2.5947559405361115e-07, + "logits/chosen": -1.9985816478729248, + "logits/rejected": -2.344910144805908, + "logps/chosen": -350.34320068359375, + "logps/rejected": -238.83636474609375, + "loss": 0.4179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06380248069763184, + "rewards/margins": 1.0335484743118286, + "rewards/rejected": -1.0973509550094604, + "step": 1285 + }, + { + "epoch": 0.15, + "learning_rate": 2.594404775839869e-07, + "logits/chosen": -2.4189982414245605, + "logits/rejected": -2.4023520946502686, + "logps/chosen": -213.75706481933594, + "logps/rejected": -257.52716064453125, + "loss": 0.3231, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1301121711730957, + "rewards/margins": 2.6825389862060547, + "rewards/rejected": -3.8126516342163086, + "step": 1286 + }, + { + "epoch": 0.15, + "learning_rate": 2.594053611143626e-07, + "logits/chosen": -2.3804264068603516, + "logits/rejected": -2.6315295696258545, + "logps/chosen": -193.7206268310547, + "logps/rejected": -178.8940887451172, + "loss": 0.3066, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6029137969017029, + "rewards/margins": 1.9893629550933838, + "rewards/rejected": -2.5922770500183105, + "step": 1287 + }, + { + "epoch": 0.15, + "learning_rate": 2.5937024464473836e-07, + "logits/chosen": -2.8448870182037354, + "logits/rejected": -2.880305767059326, + "logps/chosen": -199.33450317382812, + "logps/rejected": -249.47911071777344, + "loss": 0.4986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5894572734832764, + "rewards/margins": 1.2824232578277588, + "rewards/rejected": -1.8718804121017456, + "step": 1288 + }, + { + "epoch": 0.15, + "learning_rate": 2.593351281751141e-07, + "logits/chosen": -2.430771589279175, + "logits/rejected": -2.3381595611572266, + "logps/chosen": -294.74359130859375, + "logps/rejected": -349.30908203125, + "loss": 0.2479, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6434617042541504, + "rewards/margins": 2.2894341945648193, + "rewards/rejected": -2.9328956604003906, + "step": 1289 + }, + { + "epoch": 0.15, + "learning_rate": 2.5930001170548986e-07, + "logits/chosen": -2.14392352104187, + "logits/rejected": -2.0273618698120117, + "logps/chosen": -371.15838623046875, + "logps/rejected": -438.2203369140625, + "loss": 0.4873, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0916320085525513, + "rewards/margins": 2.169872283935547, + "rewards/rejected": -3.2615041732788086, + "step": 1290 + }, + { + "epoch": 0.15, + "learning_rate": 2.592648952358656e-07, + "logits/chosen": -2.3402915000915527, + "logits/rejected": -2.247882604598999, + "logps/chosen": -274.9654846191406, + "logps/rejected": -373.93353271484375, + "loss": 0.6631, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.263188898563385, + "rewards/margins": 0.6105824112892151, + "rewards/rejected": -0.8737713098526001, + "step": 1291 + }, + { + "epoch": 0.15, + "learning_rate": 2.5922977876624137e-07, + "logits/chosen": -2.896120548248291, + "logits/rejected": -2.6883628368377686, + "logps/chosen": -64.93666076660156, + "logps/rejected": -124.34422302246094, + "loss": 0.6026, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9887758493423462, + "rewards/margins": 0.6546602249145508, + "rewards/rejected": -1.643436074256897, + "step": 1292 + }, + { + "epoch": 0.15, + "learning_rate": 2.5919466229661707e-07, + "logits/chosen": -2.3188858032226562, + "logits/rejected": -2.4194233417510986, + "logps/chosen": -282.9375915527344, + "logps/rejected": -181.86395263671875, + "loss": 0.3803, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3622114658355713, + "rewards/margins": 2.451237916946411, + "rewards/rejected": -2.0890262126922607, + "step": 1293 + }, + { + "epoch": 0.15, + "learning_rate": 2.591595458269929e-07, + "logits/chosen": -2.4627485275268555, + "logits/rejected": -2.4552316665649414, + "logps/chosen": -319.94732666015625, + "logps/rejected": -388.7334289550781, + "loss": 0.6209, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3484979569911957, + "rewards/margins": 2.14605712890625, + "rewards/rejected": -2.4945549964904785, + "step": 1294 + }, + { + "epoch": 0.15, + "learning_rate": 2.591244293573686e-07, + "logits/chosen": -2.6320266723632812, + "logits/rejected": -2.6478638648986816, + "logps/chosen": -335.1326904296875, + "logps/rejected": -345.1764221191406, + "loss": 0.3356, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7357165813446045, + "rewards/margins": 1.320208191871643, + "rewards/rejected": -2.055924654006958, + "step": 1295 + }, + { + "epoch": 0.15, + "learning_rate": 2.5908931288774433e-07, + "logits/chosen": -2.171818971633911, + "logits/rejected": -2.1164464950561523, + "logps/chosen": -215.00074768066406, + "logps/rejected": -290.93951416015625, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4097139835357666, + "rewards/margins": 2.539120674133301, + "rewards/rejected": -2.9488346576690674, + "step": 1296 + }, + { + "epoch": 0.15, + "learning_rate": 2.590541964181201e-07, + "logits/chosen": -2.498106002807617, + "logits/rejected": -2.6085081100463867, + "logps/chosen": -295.7584533691406, + "logps/rejected": -415.49908447265625, + "loss": 0.4915, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0035237073898315, + "rewards/margins": 2.017482280731201, + "rewards/rejected": -3.021005630493164, + "step": 1297 + }, + { + "epoch": 0.15, + "learning_rate": 2.5901907994849584e-07, + "logits/chosen": -2.107144594192505, + "logits/rejected": -2.3224215507507324, + "logps/chosen": -360.1085205078125, + "logps/rejected": -236.0214080810547, + "loss": 0.4606, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6995355486869812, + "rewards/margins": 1.1716111898422241, + "rewards/rejected": -1.8711466789245605, + "step": 1298 + }, + { + "epoch": 0.15, + "learning_rate": 2.589839634788716e-07, + "logits/chosen": -2.4224166870117188, + "logits/rejected": -2.5042800903320312, + "logps/chosen": -426.34307861328125, + "logps/rejected": -376.9884033203125, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7103161811828613, + "rewards/margins": 3.5888209342956543, + "rewards/rejected": -4.299137115478516, + "step": 1299 + }, + { + "epoch": 0.15, + "learning_rate": 2.5894884700924735e-07, + "logits/chosen": -2.3631789684295654, + "logits/rejected": -2.199075222015381, + "logps/chosen": -281.9513854980469, + "logps/rejected": -347.13690185546875, + "loss": 0.2124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.586790144443512, + "rewards/margins": 2.307196855545044, + "rewards/rejected": -2.893986701965332, + "step": 1300 + }, + { + "epoch": 0.15, + "learning_rate": 2.5891373053962305e-07, + "logits/chosen": -1.3406792879104614, + "logits/rejected": -1.8718336820602417, + "logps/chosen": -606.0313720703125, + "logps/rejected": -372.3585510253906, + "loss": 0.5921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30214935541152954, + "rewards/margins": 0.5555156469345093, + "rewards/rejected": -0.857664942741394, + "step": 1301 + }, + { + "epoch": 0.15, + "learning_rate": 2.588786140699988e-07, + "logits/chosen": -2.9315428733825684, + "logits/rejected": -3.03650164604187, + "logps/chosen": -223.91656494140625, + "logps/rejected": -274.4059143066406, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6243481040000916, + "rewards/margins": 2.294861316680908, + "rewards/rejected": -2.9192094802856445, + "step": 1302 + }, + { + "epoch": 0.15, + "learning_rate": 2.5884349760037456e-07, + "logits/chosen": -2.1771600246429443, + "logits/rejected": -2.060610771179199, + "logps/chosen": -338.8372802734375, + "logps/rejected": -378.67327880859375, + "loss": 0.4408, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1523447036743164, + "rewards/margins": 0.8205806016921997, + "rewards/rejected": -1.9729253053665161, + "step": 1303 + }, + { + "epoch": 0.15, + "learning_rate": 2.588083811307503e-07, + "logits/chosen": -2.6878840923309326, + "logits/rejected": -2.5280470848083496, + "logps/chosen": -251.63137817382812, + "logps/rejected": -301.02996826171875, + "loss": 0.3823, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5709581971168518, + "rewards/margins": 2.0752601623535156, + "rewards/rejected": -2.646218776702881, + "step": 1304 + }, + { + "epoch": 0.15, + "learning_rate": 2.5877326466112607e-07, + "logits/chosen": -2.6314220428466797, + "logits/rejected": -2.8024237155914307, + "logps/chosen": -218.33792114257812, + "logps/rejected": -168.13693237304688, + "loss": 0.3519, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16894719004631042, + "rewards/margins": 2.120260000228882, + "rewards/rejected": -2.2892072200775146, + "step": 1305 + }, + { + "epoch": 0.15, + "learning_rate": 2.587381481915018e-07, + "logits/chosen": -2.4231817722320557, + "logits/rejected": -2.1727163791656494, + "logps/chosen": -286.1910400390625, + "logps/rejected": -432.743896484375, + "loss": 0.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43909311294555664, + "rewards/margins": 2.0189030170440674, + "rewards/rejected": -2.457995891571045, + "step": 1306 + }, + { + "epoch": 0.15, + "learning_rate": 2.5870303172187757e-07, + "logits/chosen": -1.9324753284454346, + "logits/rejected": -1.9993400573730469, + "logps/chosen": -533.2730712890625, + "logps/rejected": -422.41278076171875, + "loss": 0.3338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10956725478172302, + "rewards/margins": 1.7955819368362427, + "rewards/rejected": -1.9051491022109985, + "step": 1307 + }, + { + "epoch": 0.15, + "learning_rate": 2.5866791525225333e-07, + "logits/chosen": -2.2439825534820557, + "logits/rejected": -2.4160983562469482, + "logps/chosen": -244.83322143554688, + "logps/rejected": -254.02194213867188, + "loss": 0.347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8819608688354492, + "rewards/margins": 1.1639026403427124, + "rewards/rejected": -2.045863628387451, + "step": 1308 + }, + { + "epoch": 0.15, + "learning_rate": 2.5863279878262903e-07, + "logits/chosen": -2.3801615238189697, + "logits/rejected": -2.267625570297241, + "logps/chosen": -203.16769409179688, + "logps/rejected": -220.74737548828125, + "loss": 0.2149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7313941717147827, + "rewards/margins": 2.688594102859497, + "rewards/rejected": -3.4199883937835693, + "step": 1309 + }, + { + "epoch": 0.15, + "learning_rate": 2.585976823130048e-07, + "logits/chosen": -2.31428599357605, + "logits/rejected": -2.2254257202148438, + "logps/chosen": -260.39898681640625, + "logps/rejected": -218.45668029785156, + "loss": 0.308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12579753994941711, + "rewards/margins": 2.1008918285369873, + "rewards/rejected": -2.226689338684082, + "step": 1310 + }, + { + "epoch": 0.15, + "learning_rate": 2.5856256584338054e-07, + "logits/chosen": -2.607478141784668, + "logits/rejected": -2.766728401184082, + "logps/chosen": -151.02987670898438, + "logps/rejected": -285.40399169921875, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41412103176116943, + "rewards/margins": 1.7458875179290771, + "rewards/rejected": -2.160008430480957, + "step": 1311 + }, + { + "epoch": 0.15, + "learning_rate": 2.585274493737563e-07, + "logits/chosen": -2.507120132446289, + "logits/rejected": -2.4751787185668945, + "logps/chosen": -215.1824188232422, + "logps/rejected": -266.7431640625, + "loss": 0.5057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8264909386634827, + "rewards/margins": 0.9956766963005066, + "rewards/rejected": -1.8221675157546997, + "step": 1312 + }, + { + "epoch": 0.15, + "learning_rate": 2.5849233290413204e-07, + "logits/chosen": -2.1436500549316406, + "logits/rejected": -2.472377300262451, + "logps/chosen": -219.21124267578125, + "logps/rejected": -146.93704223632812, + "loss": 0.6724, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.358729362487793, + "rewards/margins": 0.8406165838241577, + "rewards/rejected": -2.199345827102661, + "step": 1313 + }, + { + "epoch": 0.15, + "learning_rate": 2.5845721643450774e-07, + "logits/chosen": -1.7944695949554443, + "logits/rejected": -1.9682095050811768, + "logps/chosen": -154.08480834960938, + "logps/rejected": -145.0218963623047, + "loss": 0.3432, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47057703137397766, + "rewards/margins": 1.4248101711273193, + "rewards/rejected": -1.8953871726989746, + "step": 1314 + }, + { + "epoch": 0.15, + "learning_rate": 2.584220999648835e-07, + "logits/chosen": -2.631751537322998, + "logits/rejected": -2.899205446243286, + "logps/chosen": -530.379150390625, + "logps/rejected": -347.79425048828125, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.762345552444458, + "rewards/margins": 2.19183349609375, + "rewards/rejected": -2.954178810119629, + "step": 1315 + }, + { + "epoch": 0.15, + "learning_rate": 2.583869834952593e-07, + "logits/chosen": -1.965468168258667, + "logits/rejected": -2.202430486679077, + "logps/chosen": -275.43719482421875, + "logps/rejected": -159.0050506591797, + "loss": 0.46, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2944314777851105, + "rewards/margins": 1.2589973211288452, + "rewards/rejected": -1.5534287691116333, + "step": 1316 + }, + { + "epoch": 0.15, + "learning_rate": 2.58351867025635e-07, + "logits/chosen": -2.1193509101867676, + "logits/rejected": -2.323918104171753, + "logps/chosen": -303.5712585449219, + "logps/rejected": -279.7637634277344, + "loss": 0.1705, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02235083281993866, + "rewards/margins": 2.705455780029297, + "rewards/rejected": -2.6831047534942627, + "step": 1317 + }, + { + "epoch": 0.15, + "learning_rate": 2.5831675055601076e-07, + "logits/chosen": -2.5939972400665283, + "logits/rejected": -2.5165796279907227, + "logps/chosen": -197.79725646972656, + "logps/rejected": -208.49777221679688, + "loss": 0.2023, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6576564311981201, + "rewards/margins": 2.1649818420410156, + "rewards/rejected": -2.822638511657715, + "step": 1318 + }, + { + "epoch": 0.15, + "learning_rate": 2.582816340863865e-07, + "logits/chosen": -2.4719536304473877, + "logits/rejected": -2.258282423019409, + "logps/chosen": -249.6863555908203, + "logps/rejected": -346.1636962890625, + "loss": 0.5345, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6135188937187195, + "rewards/margins": 0.8699164390563965, + "rewards/rejected": -1.4834353923797607, + "step": 1319 + }, + { + "epoch": 0.15, + "learning_rate": 2.5824651761676227e-07, + "logits/chosen": -2.9063096046447754, + "logits/rejected": -2.781418800354004, + "logps/chosen": -202.74571228027344, + "logps/rejected": -237.56210327148438, + "loss": 0.4325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4894087612628937, + "rewards/margins": 1.9542567729949951, + "rewards/rejected": -2.4436655044555664, + "step": 1320 + }, + { + "epoch": 0.15, + "learning_rate": 2.58211401147138e-07, + "logits/chosen": -1.871518850326538, + "logits/rejected": -1.964362382888794, + "logps/chosen": -317.9482727050781, + "logps/rejected": -370.96795654296875, + "loss": 0.2686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06035449728369713, + "rewards/margins": 3.061295986175537, + "rewards/rejected": -3.121649980545044, + "step": 1321 + }, + { + "epoch": 0.15, + "learning_rate": 2.581762846775137e-07, + "logits/chosen": -2.3818306922912598, + "logits/rejected": -2.5131285190582275, + "logps/chosen": -266.54364013671875, + "logps/rejected": -257.24407958984375, + "loss": 0.282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7424167394638062, + "rewards/margins": 1.6875847578048706, + "rewards/rejected": -2.4300014972686768, + "step": 1322 + }, + { + "epoch": 0.15, + "learning_rate": 2.581411682078895e-07, + "logits/chosen": -2.31264591217041, + "logits/rejected": -2.442269802093506, + "logps/chosen": -292.2336120605469, + "logps/rejected": -285.8445129394531, + "loss": 0.1589, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6206883192062378, + "rewards/margins": 2.1826438903808594, + "rewards/rejected": -2.8033323287963867, + "step": 1323 + }, + { + "epoch": 0.15, + "learning_rate": 2.5810605173826523e-07, + "logits/chosen": -2.4010567665100098, + "logits/rejected": -2.1085317134857178, + "logps/chosen": -218.77886962890625, + "logps/rejected": -248.01597595214844, + "loss": 0.3838, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6737009286880493, + "rewards/margins": 1.5435842275619507, + "rewards/rejected": -3.217285394668579, + "step": 1324 + }, + { + "epoch": 0.15, + "learning_rate": 2.58070935268641e-07, + "logits/chosen": -2.3618969917297363, + "logits/rejected": -2.6124658584594727, + "logps/chosen": -326.2710876464844, + "logps/rejected": -299.6769104003906, + "loss": 0.4836, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7536834478378296, + "rewards/margins": 1.3570921421051025, + "rewards/rejected": -2.1107754707336426, + "step": 1325 + }, + { + "epoch": 0.15, + "learning_rate": 2.5803581879901674e-07, + "logits/chosen": -1.9131783246994019, + "logits/rejected": -1.9690234661102295, + "logps/chosen": -279.9361877441406, + "logps/rejected": -211.48452758789062, + "loss": 0.5799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.583713710308075, + "rewards/margins": 0.899489164352417, + "rewards/rejected": -1.4832026958465576, + "step": 1326 + }, + { + "epoch": 0.15, + "learning_rate": 2.5800070232939244e-07, + "logits/chosen": -2.7506818771362305, + "logits/rejected": -2.8069238662719727, + "logps/chosen": -179.81619262695312, + "logps/rejected": -258.9920959472656, + "loss": 0.7235, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.799409031867981, + "rewards/margins": 1.4409950971603394, + "rewards/rejected": -3.2404041290283203, + "step": 1327 + }, + { + "epoch": 0.15, + "learning_rate": 2.5796558585976825e-07, + "logits/chosen": -2.9836838245391846, + "logits/rejected": -2.8571293354034424, + "logps/chosen": -245.98397827148438, + "logps/rejected": -162.70411682128906, + "loss": 0.4532, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9196165204048157, + "rewards/margins": 1.2843073606491089, + "rewards/rejected": -2.2039239406585693, + "step": 1328 + }, + { + "epoch": 0.15, + "learning_rate": 2.57930469390144e-07, + "logits/chosen": -2.628671884536743, + "logits/rejected": -2.2963695526123047, + "logps/chosen": -299.989501953125, + "logps/rejected": -305.6285095214844, + "loss": 0.3658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36478692293167114, + "rewards/margins": 3.1832852363586426, + "rewards/rejected": -3.548072099685669, + "step": 1329 + }, + { + "epoch": 0.15, + "learning_rate": 2.578953529205197e-07, + "logits/chosen": -2.327939033508301, + "logits/rejected": -2.0784854888916016, + "logps/chosen": -146.18344116210938, + "logps/rejected": -150.02804565429688, + "loss": 0.6435, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.408252477645874, + "rewards/margins": 0.674196183681488, + "rewards/rejected": -2.082448720932007, + "step": 1330 + }, + { + "epoch": 0.15, + "learning_rate": 2.5786023645089545e-07, + "logits/chosen": -2.2917115688323975, + "logits/rejected": -2.181776762008667, + "logps/chosen": -212.83255004882812, + "logps/rejected": -188.35958862304688, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5141304135322571, + "rewards/margins": 2.4662587642669678, + "rewards/rejected": -2.98038911819458, + "step": 1331 + }, + { + "epoch": 0.15, + "learning_rate": 2.578251199812712e-07, + "logits/chosen": -2.940183162689209, + "logits/rejected": -2.9327826499938965, + "logps/chosen": -290.4913635253906, + "logps/rejected": -259.6552734375, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2709839344024658, + "rewards/margins": 2.0753397941589355, + "rewards/rejected": -2.3463234901428223, + "step": 1332 + }, + { + "epoch": 0.15, + "learning_rate": 2.5779000351164696e-07, + "logits/chosen": -2.7686946392059326, + "logits/rejected": -2.9333114624023438, + "logps/chosen": -187.0283203125, + "logps/rejected": -167.2555694580078, + "loss": 0.436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2433328777551651, + "rewards/margins": 1.301999807357788, + "rewards/rejected": -1.5453325510025024, + "step": 1333 + }, + { + "epoch": 0.15, + "learning_rate": 2.577548870420227e-07, + "logits/chosen": -1.8124066591262817, + "logits/rejected": -2.080782175064087, + "logps/chosen": -525.9891357421875, + "logps/rejected": -449.6488952636719, + "loss": 0.4296, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0052751898765563965, + "rewards/margins": 1.4209083318710327, + "rewards/rejected": -1.4156330823898315, + "step": 1334 + }, + { + "epoch": 0.15, + "learning_rate": 2.577197705723984e-07, + "logits/chosen": -2.31445574760437, + "logits/rejected": -2.2767128944396973, + "logps/chosen": -534.22998046875, + "logps/rejected": -363.7313232421875, + "loss": 0.3551, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9163813591003418, + "rewards/margins": 1.8029168844223022, + "rewards/rejected": -2.7192983627319336, + "step": 1335 + }, + { + "epoch": 0.15, + "learning_rate": 2.5768465410277417e-07, + "logits/chosen": -2.209177255630493, + "logits/rejected": -2.2651124000549316, + "logps/chosen": -251.96104431152344, + "logps/rejected": -238.0844268798828, + "loss": 0.513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9478068351745605, + "rewards/margins": 0.4899388551712036, + "rewards/rejected": -1.4377458095550537, + "step": 1336 + }, + { + "epoch": 0.15, + "learning_rate": 2.576495376331499e-07, + "logits/chosen": -2.384174346923828, + "logits/rejected": -2.580084800720215, + "logps/chosen": -342.215576171875, + "logps/rejected": -234.8739776611328, + "loss": 0.4373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8318611979484558, + "rewards/margins": 0.9022517204284668, + "rewards/rejected": -1.7341128587722778, + "step": 1337 + }, + { + "epoch": 0.15, + "learning_rate": 2.576144211635257e-07, + "logits/chosen": -2.194056272506714, + "logits/rejected": -2.192368984222412, + "logps/chosen": -234.94357299804688, + "logps/rejected": -388.6813659667969, + "loss": 0.5456, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2669217586517334, + "rewards/margins": 2.214165687561035, + "rewards/rejected": -3.4810874462127686, + "step": 1338 + }, + { + "epoch": 0.15, + "learning_rate": 2.5757930469390143e-07, + "logits/chosen": -2.410252332687378, + "logits/rejected": -2.692206382751465, + "logps/chosen": -184.4392852783203, + "logps/rejected": -181.8663330078125, + "loss": 2.3913, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6716718673706055, + "rewards/margins": 0.5398175716400146, + "rewards/rejected": -3.21148943901062, + "step": 1339 + }, + { + "epoch": 0.15, + "learning_rate": 2.575441882242772e-07, + "logits/chosen": -1.5746774673461914, + "logits/rejected": -2.0424911975860596, + "logps/chosen": -385.0528564453125, + "logps/rejected": -201.52301025390625, + "loss": 0.7612, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.40619781613349915, + "rewards/margins": 0.2235339730978012, + "rewards/rejected": -0.6297317743301392, + "step": 1340 + }, + { + "epoch": 0.15, + "learning_rate": 2.5750907175465294e-07, + "logits/chosen": -2.7744035720825195, + "logits/rejected": -2.6128792762756348, + "logps/chosen": -331.70086669921875, + "logps/rejected": -263.88751220703125, + "loss": 0.402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6680219769477844, + "rewards/margins": 1.0864923000335693, + "rewards/rejected": -1.754514217376709, + "step": 1341 + }, + { + "epoch": 0.15, + "learning_rate": 2.574739552850287e-07, + "logits/chosen": -2.669329881668091, + "logits/rejected": -2.5992953777313232, + "logps/chosen": -137.10513305664062, + "logps/rejected": -182.4577178955078, + "loss": 0.3519, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6441555023193359, + "rewards/margins": 2.075806140899658, + "rewards/rejected": -2.719961643218994, + "step": 1342 + }, + { + "epoch": 0.15, + "learning_rate": 2.574388388154044e-07, + "logits/chosen": -2.905240774154663, + "logits/rejected": -2.9100375175476074, + "logps/chosen": -225.3803253173828, + "logps/rejected": -170.0433349609375, + "loss": 0.4302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2778488099575043, + "rewards/margins": 0.9202213287353516, + "rewards/rejected": -1.1980701684951782, + "step": 1343 + }, + { + "epoch": 0.15, + "learning_rate": 2.5740372234578015e-07, + "logits/chosen": -2.613112211227417, + "logits/rejected": -2.393629550933838, + "logps/chosen": -40.59107208251953, + "logps/rejected": -188.95050048828125, + "loss": 0.1175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21409204602241516, + "rewards/margins": 3.3718111515045166, + "rewards/rejected": -3.5859029293060303, + "step": 1344 + }, + { + "epoch": 0.16, + "learning_rate": 2.573686058761559e-07, + "logits/chosen": -2.5841245651245117, + "logits/rejected": -2.406404495239258, + "logps/chosen": -201.48480224609375, + "logps/rejected": -369.82672119140625, + "loss": 0.5261, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2464513778686523, + "rewards/margins": 1.065639853477478, + "rewards/rejected": -2.312091112136841, + "step": 1345 + }, + { + "epoch": 0.16, + "learning_rate": 2.5733348940653166e-07, + "logits/chosen": -2.778330087661743, + "logits/rejected": -2.4965059757232666, + "logps/chosen": -200.84310913085938, + "logps/rejected": -162.19342041015625, + "loss": 0.9464, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9193124175071716, + "rewards/margins": 0.3575630784034729, + "rewards/rejected": -1.2768754959106445, + "step": 1346 + }, + { + "epoch": 0.16, + "learning_rate": 2.572983729369074e-07, + "logits/chosen": -1.9439160823822021, + "logits/rejected": -1.9623292684555054, + "logps/chosen": -252.29986572265625, + "logps/rejected": -299.835205078125, + "loss": 0.3291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9086562395095825, + "rewards/margins": 1.564255714416504, + "rewards/rejected": -2.472911834716797, + "step": 1347 + }, + { + "epoch": 0.16, + "learning_rate": 2.572632564672831e-07, + "logits/chosen": -2.429534673690796, + "logits/rejected": -2.657524585723877, + "logps/chosen": -328.41064453125, + "logps/rejected": -188.67564392089844, + "loss": 0.1887, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25398245453834534, + "rewards/margins": 2.9257898330688477, + "rewards/rejected": -3.17977237701416, + "step": 1348 + }, + { + "epoch": 0.16, + "learning_rate": 2.5722813999765886e-07, + "logits/chosen": -2.500437021255493, + "logits/rejected": -2.3511929512023926, + "logps/chosen": -231.8284149169922, + "logps/rejected": -254.90049743652344, + "loss": 0.422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6546056270599365, + "rewards/margins": 1.246127963066101, + "rewards/rejected": -1.9007337093353271, + "step": 1349 + }, + { + "epoch": 0.16, + "learning_rate": 2.5719302352803467e-07, + "logits/chosen": -2.910543203353882, + "logits/rejected": -2.9188621044158936, + "logps/chosen": -128.19012451171875, + "logps/rejected": -249.9622039794922, + "loss": 0.2419, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.408179521560669, + "rewards/margins": 2.999802589416504, + "rewards/rejected": -4.407981872558594, + "step": 1350 + }, + { + "epoch": 0.16, + "learning_rate": 2.5715790705841037e-07, + "logits/chosen": -2.196782112121582, + "logits/rejected": -2.121699333190918, + "logps/chosen": -316.29620361328125, + "logps/rejected": -245.54513549804688, + "loss": 1.0542, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8432096242904663, + "rewards/margins": 0.38113337755203247, + "rewards/rejected": -2.2243428230285645, + "step": 1351 + }, + { + "epoch": 0.16, + "learning_rate": 2.571227905887861e-07, + "logits/chosen": -2.0120017528533936, + "logits/rejected": -2.1095283031463623, + "logps/chosen": -243.48683166503906, + "logps/rejected": -179.4239044189453, + "loss": 0.9091, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0960702896118164, + "rewards/margins": 0.058596670627593994, + "rewards/rejected": -2.1546669006347656, + "step": 1352 + }, + { + "epoch": 0.16, + "learning_rate": 2.570876741191619e-07, + "logits/chosen": -2.993964910507202, + "logits/rejected": -3.0327043533325195, + "logps/chosen": -269.6951904296875, + "logps/rejected": -320.18505859375, + "loss": 0.5147, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6150935888290405, + "rewards/margins": 0.8790371417999268, + "rewards/rejected": -1.4941308498382568, + "step": 1353 + }, + { + "epoch": 0.16, + "learning_rate": 2.5705255764953763e-07, + "logits/chosen": -1.6297799348831177, + "logits/rejected": -1.8938080072402954, + "logps/chosen": -370.5832214355469, + "logps/rejected": -294.00732421875, + "loss": 0.2845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6388224363327026, + "rewards/margins": 3.243354558944702, + "rewards/rejected": -3.8821768760681152, + "step": 1354 + }, + { + "epoch": 0.16, + "learning_rate": 2.570174411799134e-07, + "logits/chosen": -2.847116470336914, + "logits/rejected": -2.6884827613830566, + "logps/chosen": -272.16558837890625, + "logps/rejected": -477.5104064941406, + "loss": 0.4904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.042993426322937, + "rewards/margins": 1.7524523735046387, + "rewards/rejected": -2.795445680618286, + "step": 1355 + }, + { + "epoch": 0.16, + "learning_rate": 2.569823247102891e-07, + "logits/chosen": -2.7494893074035645, + "logits/rejected": -2.724637508392334, + "logps/chosen": -196.70982360839844, + "logps/rejected": -237.25247192382812, + "loss": 0.3879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04159057140350342, + "rewards/margins": 1.4111782312393188, + "rewards/rejected": -1.4527688026428223, + "step": 1356 + }, + { + "epoch": 0.16, + "learning_rate": 2.5694720824066484e-07, + "logits/chosen": -3.037583351135254, + "logits/rejected": -2.905165672302246, + "logps/chosen": -392.075439453125, + "logps/rejected": -231.39077758789062, + "loss": 0.4071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1899953931570053, + "rewards/margins": 1.017331600189209, + "rewards/rejected": -1.207327127456665, + "step": 1357 + }, + { + "epoch": 0.16, + "learning_rate": 2.569120917710406e-07, + "logits/chosen": -2.5717787742614746, + "logits/rejected": -2.5558080673217773, + "logps/chosen": -336.192626953125, + "logps/rejected": -249.7107391357422, + "loss": 0.2212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5121989846229553, + "rewards/margins": 2.454195976257324, + "rewards/rejected": -2.9663949012756348, + "step": 1358 + }, + { + "epoch": 0.16, + "learning_rate": 2.5687697530141635e-07, + "logits/chosen": -2.3790183067321777, + "logits/rejected": -2.377103567123413, + "logps/chosen": -313.9597473144531, + "logps/rejected": -360.13751220703125, + "loss": 0.4592, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5735514163970947, + "rewards/margins": 0.9279046654701233, + "rewards/rejected": -1.5014561414718628, + "step": 1359 + }, + { + "epoch": 0.16, + "learning_rate": 2.568418588317921e-07, + "logits/chosen": -1.9835476875305176, + "logits/rejected": -2.0198943614959717, + "logps/chosen": -269.7320251464844, + "logps/rejected": -239.25253295898438, + "loss": 1.0914, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.5476160049438477, + "rewards/margins": 0.13816730678081512, + "rewards/rejected": -2.6857833862304688, + "step": 1360 + }, + { + "epoch": 0.16, + "learning_rate": 2.5680674236216786e-07, + "logits/chosen": -2.421241283416748, + "logits/rejected": -2.0230884552001953, + "logps/chosen": -236.65914916992188, + "logps/rejected": -276.94696044921875, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5897905826568604, + "rewards/margins": 2.366546154022217, + "rewards/rejected": -2.956336736679077, + "step": 1361 + }, + { + "epoch": 0.16, + "learning_rate": 2.567716258925436e-07, + "logits/chosen": -2.0327529907226562, + "logits/rejected": -2.0280213356018066, + "logps/chosen": -279.8519287109375, + "logps/rejected": -377.21502685546875, + "loss": 0.191, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6810139417648315, + "rewards/margins": 2.5356192588806152, + "rewards/rejected": -3.2166333198547363, + "step": 1362 + }, + { + "epoch": 0.16, + "learning_rate": 2.5673650942291937e-07, + "logits/chosen": -2.313729763031006, + "logits/rejected": -2.245346784591675, + "logps/chosen": -452.19659423828125, + "logps/rejected": -468.64410400390625, + "loss": 0.3384, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2005424499511719, + "rewards/margins": 2.331834316253662, + "rewards/rejected": -3.532376766204834, + "step": 1363 + }, + { + "epoch": 0.16, + "learning_rate": 2.5670139295329507e-07, + "logits/chosen": -2.1066172122955322, + "logits/rejected": -2.4190526008605957, + "logps/chosen": -385.91290283203125, + "logps/rejected": -236.59912109375, + "loss": 0.3421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3658981919288635, + "rewards/margins": 1.8517900705337524, + "rewards/rejected": -2.2176880836486816, + "step": 1364 + }, + { + "epoch": 0.16, + "learning_rate": 2.566662764836708e-07, + "logits/chosen": -2.2153711318969727, + "logits/rejected": -2.0292937755584717, + "logps/chosen": -229.64527893066406, + "logps/rejected": -377.177734375, + "loss": 0.2502, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47176608443260193, + "rewards/margins": 1.738332986831665, + "rewards/rejected": -1.2665669918060303, + "step": 1365 + }, + { + "epoch": 0.16, + "learning_rate": 2.566311600140466e-07, + "logits/chosen": -2.235607624053955, + "logits/rejected": -2.3953397274017334, + "logps/chosen": -234.0986328125, + "logps/rejected": -249.75657653808594, + "loss": 0.9401, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7270361185073853, + "rewards/margins": -0.012717276811599731, + "rewards/rejected": -1.714318871498108, + "step": 1366 + }, + { + "epoch": 0.16, + "learning_rate": 2.5659604354442233e-07, + "logits/chosen": -2.7087347507476807, + "logits/rejected": -2.6458449363708496, + "logps/chosen": -388.4052734375, + "logps/rejected": -239.9068603515625, + "loss": 0.7115, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.184762954711914, + "rewards/margins": 1.0117141008377075, + "rewards/rejected": -2.196476936340332, + "step": 1367 + }, + { + "epoch": 0.16, + "learning_rate": 2.565609270747981e-07, + "logits/chosen": -1.902994990348816, + "logits/rejected": -2.0216832160949707, + "logps/chosen": -484.9305725097656, + "logps/rejected": -404.7203369140625, + "loss": 0.3398, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0820114612579346, + "rewards/margins": 1.6294660568237305, + "rewards/rejected": -2.711477518081665, + "step": 1368 + }, + { + "epoch": 0.16, + "learning_rate": 2.5652581060517384e-07, + "logits/chosen": -2.5632500648498535, + "logits/rejected": -2.5436222553253174, + "logps/chosen": -295.94097900390625, + "logps/rejected": -289.2312316894531, + "loss": 0.356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9693864583969116, + "rewards/margins": 1.3191108703613281, + "rewards/rejected": -2.2884974479675293, + "step": 1369 + }, + { + "epoch": 0.16, + "learning_rate": 2.5649069413554954e-07, + "logits/chosen": -2.452225685119629, + "logits/rejected": -2.3772993087768555, + "logps/chosen": -461.6959533691406, + "logps/rejected": -307.401611328125, + "loss": 0.3641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.397034227848053, + "rewards/margins": 2.2428293228149414, + "rewards/rejected": -2.6398634910583496, + "step": 1370 + }, + { + "epoch": 0.16, + "learning_rate": 2.564555776659253e-07, + "logits/chosen": -2.542052745819092, + "logits/rejected": -2.5616562366485596, + "logps/chosen": -273.10296630859375, + "logps/rejected": -334.3306884765625, + "loss": 0.4185, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8656750321388245, + "rewards/margins": 1.8778042793273926, + "rewards/rejected": -2.7434794902801514, + "step": 1371 + }, + { + "epoch": 0.16, + "learning_rate": 2.5642046119630104e-07, + "logits/chosen": -1.8275141716003418, + "logits/rejected": -1.854561448097229, + "logps/chosen": -345.0947570800781, + "logps/rejected": -332.35009765625, + "loss": 0.5354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9209791421890259, + "rewards/margins": 0.8586434721946716, + "rewards/rejected": -1.7796225547790527, + "step": 1372 + }, + { + "epoch": 0.16, + "learning_rate": 2.563853447266768e-07, + "logits/chosen": -2.7993240356445312, + "logits/rejected": -2.7498443126678467, + "logps/chosen": -282.8843688964844, + "logps/rejected": -220.24765014648438, + "loss": 0.2779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2951209545135498, + "rewards/margins": 2.1358823776245117, + "rewards/rejected": -2.4310030937194824, + "step": 1373 + }, + { + "epoch": 0.16, + "learning_rate": 2.5635022825705255e-07, + "logits/chosen": -2.428966522216797, + "logits/rejected": -2.3561110496520996, + "logps/chosen": -135.4336395263672, + "logps/rejected": -163.041259765625, + "loss": 0.4365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9159126877784729, + "rewards/margins": 1.0420299768447876, + "rewards/rejected": -1.9579427242279053, + "step": 1374 + }, + { + "epoch": 0.16, + "learning_rate": 2.563151117874283e-07, + "logits/chosen": -2.4071171283721924, + "logits/rejected": -2.5613536834716797, + "logps/chosen": -253.55372619628906, + "logps/rejected": -231.68014526367188, + "loss": 0.2635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7266337871551514, + "rewards/margins": 2.5017483234405518, + "rewards/rejected": -3.228382110595703, + "step": 1375 + }, + { + "epoch": 0.16, + "learning_rate": 2.5627999531780406e-07, + "logits/chosen": -2.2076172828674316, + "logits/rejected": -2.2845635414123535, + "logps/chosen": -210.56265258789062, + "logps/rejected": -323.75592041015625, + "loss": 0.3524, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08914777636528015, + "rewards/margins": 2.110025405883789, + "rewards/rejected": -2.0208778381347656, + "step": 1376 + }, + { + "epoch": 0.16, + "learning_rate": 2.5624487884817976e-07, + "logits/chosen": -2.753430128097534, + "logits/rejected": -2.758307933807373, + "logps/chosen": -153.0931396484375, + "logps/rejected": -264.295654296875, + "loss": 0.1896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.672450602054596, + "rewards/margins": 2.696049690246582, + "rewards/rejected": -3.368500232696533, + "step": 1377 + }, + { + "epoch": 0.16, + "learning_rate": 2.562097623785555e-07, + "logits/chosen": -2.625208616256714, + "logits/rejected": -2.6999616622924805, + "logps/chosen": -210.9528350830078, + "logps/rejected": -285.80963134765625, + "loss": 0.1623, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.732681930065155, + "rewards/margins": 1.9134355783462524, + "rewards/rejected": -2.6461174488067627, + "step": 1378 + }, + { + "epoch": 0.16, + "learning_rate": 2.5617464590893127e-07, + "logits/chosen": -2.059051990509033, + "logits/rejected": -2.2505507469177246, + "logps/chosen": -169.0402374267578, + "logps/rejected": -193.36265563964844, + "loss": 0.3138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27253082394599915, + "rewards/margins": 1.996809482574463, + "rewards/rejected": -2.2693402767181396, + "step": 1379 + }, + { + "epoch": 0.16, + "learning_rate": 2.56139529439307e-07, + "logits/chosen": -2.372788667678833, + "logits/rejected": -2.4418373107910156, + "logps/chosen": -211.27313232421875, + "logps/rejected": -329.1675109863281, + "loss": 0.3605, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04842524230480194, + "rewards/margins": 2.817836046218872, + "rewards/rejected": -2.8662612438201904, + "step": 1380 + }, + { + "epoch": 0.16, + "learning_rate": 2.561044129696828e-07, + "logits/chosen": -2.839876174926758, + "logits/rejected": -2.841144561767578, + "logps/chosen": -128.20249938964844, + "logps/rejected": -197.3280029296875, + "loss": 0.9682, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0985867977142334, + "rewards/margins": 0.4189739525318146, + "rewards/rejected": -1.5175608396530151, + "step": 1381 + }, + { + "epoch": 0.16, + "learning_rate": 2.5606929650005853e-07, + "logits/chosen": -2.094752073287964, + "logits/rejected": -2.1393065452575684, + "logps/chosen": -436.8076171875, + "logps/rejected": -268.1700134277344, + "loss": 0.3223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9193074703216553, + "rewards/margins": 1.102339506149292, + "rewards/rejected": -2.0216472148895264, + "step": 1382 + }, + { + "epoch": 0.16, + "learning_rate": 2.5603418003043423e-07, + "logits/chosen": -2.5660834312438965, + "logits/rejected": -2.66941499710083, + "logps/chosen": -297.9750671386719, + "logps/rejected": -277.3825378417969, + "loss": 0.2987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46636366844177246, + "rewards/margins": 2.512587785720825, + "rewards/rejected": -2.9789514541625977, + "step": 1383 + }, + { + "epoch": 0.16, + "learning_rate": 2.5599906356081004e-07, + "logits/chosen": -1.888533592224121, + "logits/rejected": -1.6215496063232422, + "logps/chosen": -220.8387451171875, + "logps/rejected": -347.01385498046875, + "loss": 0.2778, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12870433926582336, + "rewards/margins": 2.237316608428955, + "rewards/rejected": -2.366020917892456, + "step": 1384 + }, + { + "epoch": 0.16, + "learning_rate": 2.5596394709118574e-07, + "logits/chosen": -2.2462706565856934, + "logits/rejected": -2.1120505332946777, + "logps/chosen": -219.19387817382812, + "logps/rejected": -349.8387145996094, + "loss": 0.2659, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7076311111450195, + "rewards/margins": 2.489677906036377, + "rewards/rejected": -3.1973090171813965, + "step": 1385 + }, + { + "epoch": 0.16, + "learning_rate": 2.559288306215615e-07, + "logits/chosen": -2.5982728004455566, + "logits/rejected": -2.6567435264587402, + "logps/chosen": -120.80181884765625, + "logps/rejected": -147.64674377441406, + "loss": 0.3096, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3887515068054199, + "rewards/margins": 2.2278125286102295, + "rewards/rejected": -2.6165640354156494, + "step": 1386 + }, + { + "epoch": 0.16, + "learning_rate": 2.5589371415193725e-07, + "logits/chosen": -2.098019599914551, + "logits/rejected": -1.9276540279388428, + "logps/chosen": -288.920166015625, + "logps/rejected": -286.7950134277344, + "loss": 0.5354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9646452069282532, + "rewards/margins": 1.565093755722046, + "rewards/rejected": -2.5297389030456543, + "step": 1387 + }, + { + "epoch": 0.16, + "learning_rate": 2.55858597682313e-07, + "logits/chosen": -2.9463882446289062, + "logits/rejected": -2.8459177017211914, + "logps/chosen": -291.2607116699219, + "logps/rejected": -177.9037628173828, + "loss": 1.3472, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7704172134399414, + "rewards/margins": 0.3339099586009979, + "rewards/rejected": -2.1043272018432617, + "step": 1388 + }, + { + "epoch": 0.16, + "learning_rate": 2.5582348121268875e-07, + "logits/chosen": -2.4269909858703613, + "logits/rejected": -2.3275957107543945, + "logps/chosen": -210.14501953125, + "logps/rejected": -319.9818115234375, + "loss": 0.4659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9826197624206543, + "rewards/margins": 1.4314978122711182, + "rewards/rejected": -2.4141173362731934, + "step": 1389 + }, + { + "epoch": 0.16, + "learning_rate": 2.557883647430645e-07, + "logits/chosen": -2.084977626800537, + "logits/rejected": -2.050074577331543, + "logps/chosen": -240.43551635742188, + "logps/rejected": -263.3756103515625, + "loss": 0.4778, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8507969379425049, + "rewards/margins": 1.506279468536377, + "rewards/rejected": -2.357076406478882, + "step": 1390 + }, + { + "epoch": 0.16, + "learning_rate": 2.557532482734402e-07, + "logits/chosen": -2.3698596954345703, + "logits/rejected": -2.2614755630493164, + "logps/chosen": -406.29791259765625, + "logps/rejected": -362.904296875, + "loss": 0.5719, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0059120655059814, + "rewards/margins": 1.16517972946167, + "rewards/rejected": -2.1710917949676514, + "step": 1391 + }, + { + "epoch": 0.16, + "learning_rate": 2.5571813180381596e-07, + "logits/chosen": -3.0563833713531494, + "logits/rejected": -3.0014090538024902, + "logps/chosen": -132.59683227539062, + "logps/rejected": -149.25555419921875, + "loss": 0.4744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46655261516571045, + "rewards/margins": 1.5050218105316162, + "rewards/rejected": -1.9715744256973267, + "step": 1392 + }, + { + "epoch": 0.16, + "learning_rate": 2.556830153341917e-07, + "logits/chosen": -2.169797658920288, + "logits/rejected": -2.305006265640259, + "logps/chosen": -368.77734375, + "logps/rejected": -279.5542297363281, + "loss": 0.4324, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.743003249168396, + "rewards/margins": 1.2848583459854126, + "rewards/rejected": -2.0278615951538086, + "step": 1393 + }, + { + "epoch": 0.16, + "learning_rate": 2.5564789886456747e-07, + "logits/chosen": -1.6536558866500854, + "logits/rejected": -1.4968795776367188, + "logps/chosen": -302.1551513671875, + "logps/rejected": -280.76788330078125, + "loss": 0.7849, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6141526699066162, + "rewards/margins": 0.213454470038414, + "rewards/rejected": -0.8276070356369019, + "step": 1394 + }, + { + "epoch": 0.16, + "learning_rate": 2.556127823949432e-07, + "logits/chosen": -2.113344192504883, + "logits/rejected": -2.2187817096710205, + "logps/chosen": -359.59454345703125, + "logps/rejected": -364.0041809082031, + "loss": 0.3614, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9749746918678284, + "rewards/margins": 1.5940353870391846, + "rewards/rejected": -2.569010019302368, + "step": 1395 + }, + { + "epoch": 0.16, + "learning_rate": 2.55577665925319e-07, + "logits/chosen": -2.3582892417907715, + "logits/rejected": -2.4117493629455566, + "logps/chosen": -335.68927001953125, + "logps/rejected": -255.28570556640625, + "loss": 0.9221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8970046043395996, + "rewards/margins": 2.1137139797210693, + "rewards/rejected": -3.010718584060669, + "step": 1396 + }, + { + "epoch": 0.16, + "learning_rate": 2.5554254945569473e-07, + "logits/chosen": -2.1513659954071045, + "logits/rejected": -2.2413244247436523, + "logps/chosen": -312.60430908203125, + "logps/rejected": -276.0049743652344, + "loss": 0.5783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5285664796829224, + "rewards/margins": 1.3245357275009155, + "rewards/rejected": -1.853102207183838, + "step": 1397 + }, + { + "epoch": 0.16, + "learning_rate": 2.555074329860705e-07, + "logits/chosen": -2.1707003116607666, + "logits/rejected": -1.946972370147705, + "logps/chosen": -347.11505126953125, + "logps/rejected": -401.64984130859375, + "loss": 0.73, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0813257694244385, + "rewards/margins": 0.7325404286384583, + "rewards/rejected": -1.8138660192489624, + "step": 1398 + }, + { + "epoch": 0.16, + "learning_rate": 2.554723165164462e-07, + "logits/chosen": -2.1488189697265625, + "logits/rejected": -2.128652572631836, + "logps/chosen": -372.94049072265625, + "logps/rejected": -258.134765625, + "loss": 1.139, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.520717978477478, + "rewards/margins": -0.40772441029548645, + "rewards/rejected": -1.1129937171936035, + "step": 1399 + }, + { + "epoch": 0.16, + "learning_rate": 2.5543720004682194e-07, + "logits/chosen": -1.5973044633865356, + "logits/rejected": -2.037468433380127, + "logps/chosen": -421.3184509277344, + "logps/rejected": -413.2462158203125, + "loss": 0.2666, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7560914754867554, + "rewards/margins": 1.516695499420166, + "rewards/rejected": -2.272786855697632, + "step": 1400 + }, + { + "epoch": 0.16, + "learning_rate": 2.554020835771977e-07, + "logits/chosen": -2.2191150188446045, + "logits/rejected": -2.34666109085083, + "logps/chosen": -151.86099243164062, + "logps/rejected": -254.05648803710938, + "loss": 0.285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3743820786476135, + "rewards/margins": 2.3725433349609375, + "rewards/rejected": -2.7469253540039062, + "step": 1401 + }, + { + "epoch": 0.16, + "learning_rate": 2.5536696710757345e-07, + "logits/chosen": -2.1713554859161377, + "logits/rejected": -2.5709593296051025, + "logps/chosen": -383.6734924316406, + "logps/rejected": -240.9033966064453, + "loss": 6.4962, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.635958194732666, + "rewards/margins": -5.106226444244385, + "rewards/rejected": -1.5297311544418335, + "step": 1402 + }, + { + "epoch": 0.16, + "learning_rate": 2.553318506379492e-07, + "logits/chosen": -1.8974180221557617, + "logits/rejected": -2.1302545070648193, + "logps/chosen": -457.1479187011719, + "logps/rejected": -312.1859130859375, + "loss": 0.6593, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8748663663864136, + "rewards/margins": 0.4773463308811188, + "rewards/rejected": -1.35221266746521, + "step": 1403 + }, + { + "epoch": 0.16, + "learning_rate": 2.552967341683249e-07, + "logits/chosen": -2.453695774078369, + "logits/rejected": -2.6477935314178467, + "logps/chosen": -311.2273254394531, + "logps/rejected": -128.4039306640625, + "loss": 0.6064, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6093982458114624, + "rewards/margins": 0.9124983549118042, + "rewards/rejected": -1.5218966007232666, + "step": 1404 + }, + { + "epoch": 0.16, + "learning_rate": 2.5526161769870066e-07, + "logits/chosen": -2.1930830478668213, + "logits/rejected": -1.7604418992996216, + "logps/chosen": -252.33856201171875, + "logps/rejected": -367.5047912597656, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5042179226875305, + "rewards/margins": 1.9251999855041504, + "rewards/rejected": -2.429417848587036, + "step": 1405 + }, + { + "epoch": 0.16, + "learning_rate": 2.5522650122907646e-07, + "logits/chosen": -2.0289976596832275, + "logits/rejected": -2.0780200958251953, + "logps/chosen": -308.0894775390625, + "logps/rejected": -231.80770874023438, + "loss": 0.5399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.766431450843811, + "rewards/margins": 0.8090716004371643, + "rewards/rejected": -1.5755031108856201, + "step": 1406 + }, + { + "epoch": 0.16, + "learning_rate": 2.5519138475945216e-07, + "logits/chosen": -2.501469850540161, + "logits/rejected": -2.682569742202759, + "logps/chosen": -182.4380340576172, + "logps/rejected": -254.89590454101562, + "loss": 0.43, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7104778289794922, + "rewards/margins": 2.104900598526001, + "rewards/rejected": -2.8153786659240723, + "step": 1407 + }, + { + "epoch": 0.16, + "learning_rate": 2.551562682898279e-07, + "logits/chosen": -2.2681736946105957, + "logits/rejected": -2.2292494773864746, + "logps/chosen": -290.347412109375, + "logps/rejected": -367.5643310546875, + "loss": 0.4335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3730715215206146, + "rewards/margins": 1.119286298751831, + "rewards/rejected": -1.492357850074768, + "step": 1408 + }, + { + "epoch": 0.16, + "learning_rate": 2.5512115182020367e-07, + "logits/chosen": -2.2200005054473877, + "logits/rejected": -2.3557002544403076, + "logps/chosen": -303.14825439453125, + "logps/rejected": -220.3629608154297, + "loss": 0.6003, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20965047180652618, + "rewards/margins": 1.2158863544464111, + "rewards/rejected": -1.425536870956421, + "step": 1409 + }, + { + "epoch": 0.16, + "learning_rate": 2.550860353505794e-07, + "logits/chosen": -2.1608426570892334, + "logits/rejected": -2.113377571105957, + "logps/chosen": -545.691650390625, + "logps/rejected": -491.25006103515625, + "loss": 0.2571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08682768791913986, + "rewards/margins": 1.51934814453125, + "rewards/rejected": -1.6061757802963257, + "step": 1410 + }, + { + "epoch": 0.16, + "learning_rate": 2.550509188809552e-07, + "logits/chosen": -2.0782687664031982, + "logits/rejected": -2.2979743480682373, + "logps/chosen": -308.3525085449219, + "logps/rejected": -187.4889373779297, + "loss": 0.656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8075929880142212, + "rewards/margins": 0.8312444090843201, + "rewards/rejected": -1.6388373374938965, + "step": 1411 + }, + { + "epoch": 0.16, + "learning_rate": 2.550158024113309e-07, + "logits/chosen": -2.969982862472534, + "logits/rejected": -2.7904579639434814, + "logps/chosen": -373.90765380859375, + "logps/rejected": -423.75054931640625, + "loss": 0.6737, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8540161848068237, + "rewards/margins": 1.3113317489624023, + "rewards/rejected": -2.1653480529785156, + "step": 1412 + }, + { + "epoch": 0.16, + "learning_rate": 2.5498068594170663e-07, + "logits/chosen": -2.4990768432617188, + "logits/rejected": -2.3301312923431396, + "logps/chosen": -284.4039611816406, + "logps/rejected": -363.80230712890625, + "loss": 0.211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7997649908065796, + "rewards/margins": 2.568753242492676, + "rewards/rejected": -3.368518114089966, + "step": 1413 + }, + { + "epoch": 0.16, + "learning_rate": 2.549455694720824e-07, + "logits/chosen": -2.61908221244812, + "logits/rejected": -2.5099196434020996, + "logps/chosen": -251.26239013671875, + "logps/rejected": -296.427001953125, + "loss": 0.2073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5585911273956299, + "rewards/margins": 2.0440616607666016, + "rewards/rejected": -2.6026525497436523, + "step": 1414 + }, + { + "epoch": 0.16, + "learning_rate": 2.5491045300245814e-07, + "logits/chosen": -2.3159899711608887, + "logits/rejected": -2.467446804046631, + "logps/chosen": -319.263427734375, + "logps/rejected": -305.6278076171875, + "loss": 0.5087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3144819140434265, + "rewards/margins": 1.3179569244384766, + "rewards/rejected": -1.6324388980865479, + "step": 1415 + }, + { + "epoch": 0.16, + "learning_rate": 2.548753365328339e-07, + "logits/chosen": -2.521547555923462, + "logits/rejected": -2.4065380096435547, + "logps/chosen": -337.61346435546875, + "logps/rejected": -364.20574951171875, + "loss": 0.4019, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2588958740234375, + "rewards/margins": 2.8249869346618652, + "rewards/rejected": -3.0838825702667236, + "step": 1416 + }, + { + "epoch": 0.16, + "learning_rate": 2.548402200632096e-07, + "logits/chosen": -2.287353515625, + "logits/rejected": -2.1838231086730957, + "logps/chosen": -184.32623291015625, + "logps/rejected": -211.8885498046875, + "loss": 0.4102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6375664472579956, + "rewards/margins": 1.4357351064682007, + "rewards/rejected": -2.073301315307617, + "step": 1417 + }, + { + "epoch": 0.16, + "learning_rate": 2.548051035935854e-07, + "logits/chosen": -2.418119192123413, + "logits/rejected": -2.1481521129608154, + "logps/chosen": -353.79595947265625, + "logps/rejected": -397.22479248046875, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7184217572212219, + "rewards/margins": 1.5203888416290283, + "rewards/rejected": -2.2388105392456055, + "step": 1418 + }, + { + "epoch": 0.16, + "learning_rate": 2.5476998712396116e-07, + "logits/chosen": -1.7474031448364258, + "logits/rejected": -1.909925937652588, + "logps/chosen": -373.1262512207031, + "logps/rejected": -441.387451171875, + "loss": 0.4166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7585400342941284, + "rewards/margins": 1.1221015453338623, + "rewards/rejected": -1.8806415796279907, + "step": 1419 + }, + { + "epoch": 0.16, + "learning_rate": 2.5473487065433686e-07, + "logits/chosen": -1.9791593551635742, + "logits/rejected": -2.06954026222229, + "logps/chosen": -379.489990234375, + "logps/rejected": -322.26556396484375, + "loss": 0.3619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.636778712272644, + "rewards/margins": 1.4372100830078125, + "rewards/rejected": -2.073988676071167, + "step": 1420 + }, + { + "epoch": 0.16, + "learning_rate": 2.546997541847126e-07, + "logits/chosen": -2.4118824005126953, + "logits/rejected": -2.2230160236358643, + "logps/chosen": -280.47869873046875, + "logps/rejected": -221.1510009765625, + "loss": 0.535, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8343365788459778, + "rewards/margins": 1.6825274229049683, + "rewards/rejected": -2.5168638229370117, + "step": 1421 + }, + { + "epoch": 0.16, + "learning_rate": 2.5466463771508837e-07, + "logits/chosen": -2.6528196334838867, + "logits/rejected": -2.664328098297119, + "logps/chosen": -300.49139404296875, + "logps/rejected": -244.88296508789062, + "loss": 0.3996, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1962132453918457, + "rewards/margins": 1.1296696662902832, + "rewards/rejected": -2.325882911682129, + "step": 1422 + }, + { + "epoch": 0.16, + "learning_rate": 2.546295212454641e-07, + "logits/chosen": -2.0707650184631348, + "logits/rejected": -2.449338436126709, + "logps/chosen": -332.2935791015625, + "logps/rejected": -258.3976135253906, + "loss": 0.4108, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0447063073515892, + "rewards/margins": 1.6464836597442627, + "rewards/rejected": -1.6017773151397705, + "step": 1423 + }, + { + "epoch": 0.16, + "learning_rate": 2.545944047758399e-07, + "logits/chosen": -2.781097173690796, + "logits/rejected": -2.615995407104492, + "logps/chosen": -267.7669677734375, + "logps/rejected": -232.24774169921875, + "loss": 0.2114, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35735273361206055, + "rewards/margins": 3.160003185272217, + "rewards/rejected": -3.5173561573028564, + "step": 1424 + }, + { + "epoch": 0.16, + "learning_rate": 2.545592883062156e-07, + "logits/chosen": -2.0462217330932617, + "logits/rejected": -1.8761403560638428, + "logps/chosen": -564.3485717773438, + "logps/rejected": -587.2445068359375, + "loss": 0.1841, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7523335814476013, + "rewards/margins": 2.2978458404541016, + "rewards/rejected": -3.0501794815063477, + "step": 1425 + }, + { + "epoch": 0.16, + "learning_rate": 2.5452417183659133e-07, + "logits/chosen": -2.0584867000579834, + "logits/rejected": -1.9652116298675537, + "logps/chosen": -495.0593566894531, + "logps/rejected": -288.56634521484375, + "loss": 0.3109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.273908406496048, + "rewards/margins": 2.724517583847046, + "rewards/rejected": -2.9984259605407715, + "step": 1426 + }, + { + "epoch": 0.16, + "learning_rate": 2.5448905536696714e-07, + "logits/chosen": -3.079730987548828, + "logits/rejected": -2.9317402839660645, + "logps/chosen": -328.8702392578125, + "logps/rejected": -222.0450439453125, + "loss": 0.3889, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2409555912017822, + "rewards/margins": 1.0532455444335938, + "rewards/rejected": -2.294201135635376, + "step": 1427 + }, + { + "epoch": 0.16, + "learning_rate": 2.5445393889734284e-07, + "logits/chosen": -2.6951913833618164, + "logits/rejected": -2.589536666870117, + "logps/chosen": -275.505615234375, + "logps/rejected": -364.4836730957031, + "loss": 0.1289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5188965201377869, + "rewards/margins": 2.8423805236816406, + "rewards/rejected": -3.3612771034240723, + "step": 1428 + }, + { + "epoch": 0.16, + "learning_rate": 2.544188224277186e-07, + "logits/chosen": -2.3689723014831543, + "logits/rejected": -2.3747832775115967, + "logps/chosen": -153.8204345703125, + "logps/rejected": -179.10763549804688, + "loss": 0.3591, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32041919231414795, + "rewards/margins": 1.3696577548980713, + "rewards/rejected": -1.6900769472122192, + "step": 1429 + }, + { + "epoch": 0.16, + "learning_rate": 2.5438370595809434e-07, + "logits/chosen": -2.5295653343200684, + "logits/rejected": -2.764094829559326, + "logps/chosen": -326.60308837890625, + "logps/rejected": -274.0861511230469, + "loss": 0.6933, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4750195741653442, + "rewards/margins": 1.0032870769500732, + "rewards/rejected": -2.478306531906128, + "step": 1430 + }, + { + "epoch": 0.16, + "learning_rate": 2.543485894884701e-07, + "logits/chosen": -2.666820526123047, + "logits/rejected": -2.7429378032684326, + "logps/chosen": -216.80532836914062, + "logps/rejected": -226.7815704345703, + "loss": 0.2005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8438336849212646, + "rewards/margins": 1.8383017778396606, + "rewards/rejected": -2.6821353435516357, + "step": 1431 + }, + { + "epoch": 0.17, + "learning_rate": 2.5431347301884585e-07, + "logits/chosen": -2.051516532897949, + "logits/rejected": -2.274461030960083, + "logps/chosen": -289.9241638183594, + "logps/rejected": -219.32083129882812, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2265232801437378, + "rewards/margins": 1.6901204586029053, + "rewards/rejected": -2.9166438579559326, + "step": 1432 + }, + { + "epoch": 0.17, + "learning_rate": 2.5427835654922155e-07, + "logits/chosen": -2.6106014251708984, + "logits/rejected": -2.8181068897247314, + "logps/chosen": -178.3297119140625, + "logps/rejected": -188.83120727539062, + "loss": 0.3222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46786612272262573, + "rewards/margins": 1.9386241436004639, + "rewards/rejected": -2.4064903259277344, + "step": 1433 + }, + { + "epoch": 0.17, + "learning_rate": 2.542432400795973e-07, + "logits/chosen": -2.2804489135742188, + "logits/rejected": -2.1724250316619873, + "logps/chosen": -120.26394653320312, + "logps/rejected": -237.08346557617188, + "loss": 0.3418, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5946686267852783, + "rewards/margins": 1.3180874586105347, + "rewards/rejected": -1.9127562046051025, + "step": 1434 + }, + { + "epoch": 0.17, + "learning_rate": 2.5420812360997306e-07, + "logits/chosen": -2.652851104736328, + "logits/rejected": -2.442258358001709, + "logps/chosen": -243.79164123535156, + "logps/rejected": -220.58290100097656, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1872429847717285, + "rewards/margins": 1.2356387376785278, + "rewards/rejected": -2.4228813648223877, + "step": 1435 + }, + { + "epoch": 0.17, + "learning_rate": 2.541730071403488e-07, + "logits/chosen": -2.5068931579589844, + "logits/rejected": -2.7336201667785645, + "logps/chosen": -130.79116821289062, + "logps/rejected": -260.77362060546875, + "loss": 1.4014, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2342493534088135, + "rewards/margins": 1.2723853588104248, + "rewards/rejected": -3.5066347122192383, + "step": 1436 + }, + { + "epoch": 0.17, + "learning_rate": 2.5413789067072457e-07, + "logits/chosen": -2.4566216468811035, + "logits/rejected": -2.381026268005371, + "logps/chosen": -256.9278259277344, + "logps/rejected": -263.6143493652344, + "loss": 0.4938, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7920569777488708, + "rewards/margins": 1.6117010116577148, + "rewards/rejected": -2.4037580490112305, + "step": 1437 + }, + { + "epoch": 0.17, + "learning_rate": 2.5410277420110027e-07, + "logits/chosen": -2.3038506507873535, + "logits/rejected": -2.393563985824585, + "logps/chosen": -307.79443359375, + "logps/rejected": -203.1729736328125, + "loss": 0.576, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.294897437095642, + "rewards/margins": 0.8769643306732178, + "rewards/rejected": -2.1718616485595703, + "step": 1438 + }, + { + "epoch": 0.17, + "learning_rate": 2.54067657731476e-07, + "logits/chosen": -2.13333797454834, + "logits/rejected": -1.9121031761169434, + "logps/chosen": -338.3038330078125, + "logps/rejected": -330.3493347167969, + "loss": 0.557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5625773072242737, + "rewards/margins": 1.192483901977539, + "rewards/rejected": -1.755061388015747, + "step": 1439 + }, + { + "epoch": 0.17, + "learning_rate": 2.5403254126185183e-07, + "logits/chosen": -2.5868947505950928, + "logits/rejected": -2.7001559734344482, + "logps/chosen": -274.6092834472656, + "logps/rejected": -233.22943115234375, + "loss": 0.3323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4902866780757904, + "rewards/margins": 2.550830602645874, + "rewards/rejected": -3.0411171913146973, + "step": 1440 + }, + { + "epoch": 0.17, + "learning_rate": 2.5399742479222753e-07, + "logits/chosen": -1.8609799146652222, + "logits/rejected": -1.5829317569732666, + "logps/chosen": -198.66696166992188, + "logps/rejected": -354.5980224609375, + "loss": 0.2232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4805727005004883, + "rewards/margins": 1.9000868797302246, + "rewards/rejected": -2.380659580230713, + "step": 1441 + }, + { + "epoch": 0.17, + "learning_rate": 2.539623083226033e-07, + "logits/chosen": -2.522505283355713, + "logits/rejected": -2.287116765975952, + "logps/chosen": -190.0419158935547, + "logps/rejected": -273.3203125, + "loss": 0.3819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7399119138717651, + "rewards/margins": 1.6592246294021606, + "rewards/rejected": -2.399136543273926, + "step": 1442 + }, + { + "epoch": 0.17, + "learning_rate": 2.5392719185297904e-07, + "logits/chosen": -2.81309175491333, + "logits/rejected": -2.902364492416382, + "logps/chosen": -319.22918701171875, + "logps/rejected": -291.9774475097656, + "loss": 0.7573, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.085676908493042, + "rewards/margins": 0.1829744279384613, + "rewards/rejected": -1.2686512470245361, + "step": 1443 + }, + { + "epoch": 0.17, + "learning_rate": 2.538920753833548e-07, + "logits/chosen": -2.7252182960510254, + "logits/rejected": -2.752556800842285, + "logps/chosen": -207.37319946289062, + "logps/rejected": -208.93130493164062, + "loss": 0.5103, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7129666209220886, + "rewards/margins": 0.829361617565155, + "rewards/rejected": -1.5423283576965332, + "step": 1444 + }, + { + "epoch": 0.17, + "learning_rate": 2.5385695891373055e-07, + "logits/chosen": -2.3940515518188477, + "logits/rejected": -2.660245895385742, + "logps/chosen": -479.3466796875, + "logps/rejected": -279.8632507324219, + "loss": 0.5257, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.047689437866211, + "rewards/margins": 1.7742440700531006, + "rewards/rejected": -2.8219335079193115, + "step": 1445 + }, + { + "epoch": 0.17, + "learning_rate": 2.5382184244410625e-07, + "logits/chosen": -2.1846282482147217, + "logits/rejected": -2.3383095264434814, + "logps/chosen": -131.99722290039062, + "logps/rejected": -107.700439453125, + "loss": 0.5903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5443601012229919, + "rewards/margins": 0.9724735617637634, + "rewards/rejected": -1.5168334245681763, + "step": 1446 + }, + { + "epoch": 0.17, + "learning_rate": 2.53786725974482e-07, + "logits/chosen": -2.1061289310455322, + "logits/rejected": -2.4880852699279785, + "logps/chosen": -458.800048828125, + "logps/rejected": -299.39605712890625, + "loss": 0.4137, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.249666452407837, + "rewards/margins": 1.523927092552185, + "rewards/rejected": -2.7735934257507324, + "step": 1447 + }, + { + "epoch": 0.17, + "learning_rate": 2.5375160950485775e-07, + "logits/chosen": -2.791276693344116, + "logits/rejected": -2.699596881866455, + "logps/chosen": -248.54176330566406, + "logps/rejected": -279.98486328125, + "loss": 0.2898, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0293649435043335, + "rewards/margins": 1.7807517051696777, + "rewards/rejected": -2.810116767883301, + "step": 1448 + }, + { + "epoch": 0.17, + "learning_rate": 2.537164930352335e-07, + "logits/chosen": -1.9808800220489502, + "logits/rejected": -1.862312912940979, + "logps/chosen": -182.97088623046875, + "logps/rejected": -220.57797241210938, + "loss": 0.3719, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6753281354904175, + "rewards/margins": 1.5731937885284424, + "rewards/rejected": -2.2485218048095703, + "step": 1449 + }, + { + "epoch": 0.17, + "learning_rate": 2.5368137656560926e-07, + "logits/chosen": -2.7715158462524414, + "logits/rejected": -2.525665283203125, + "logps/chosen": -101.81318664550781, + "logps/rejected": -162.01724243164062, + "loss": 0.4372, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.508377194404602, + "rewards/margins": 1.2227338552474976, + "rewards/rejected": -1.73111093044281, + "step": 1450 + }, + { + "epoch": 0.17, + "learning_rate": 2.53646260095985e-07, + "logits/chosen": -2.1502315998077393, + "logits/rejected": -2.2793116569519043, + "logps/chosen": -339.6414794921875, + "logps/rejected": -211.42543029785156, + "loss": 0.3183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6104637384414673, + "rewards/margins": 1.2935353517532349, + "rewards/rejected": -1.9039990901947021, + "step": 1451 + }, + { + "epoch": 0.17, + "learning_rate": 2.5361114362636077e-07, + "logits/chosen": -2.689669370651245, + "logits/rejected": -2.402830123901367, + "logps/chosen": -316.29742431640625, + "logps/rejected": -294.33648681640625, + "loss": 0.0852, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05798250809311867, + "rewards/margins": 2.9540576934814453, + "rewards/rejected": -3.012040138244629, + "step": 1452 + }, + { + "epoch": 0.17, + "learning_rate": 2.535760271567365e-07, + "logits/chosen": -2.5478687286376953, + "logits/rejected": -2.4862771034240723, + "logps/chosen": -140.24554443359375, + "logps/rejected": -257.26947021484375, + "loss": 0.4514, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8154121041297913, + "rewards/margins": 1.7048099040985107, + "rewards/rejected": -2.5202221870422363, + "step": 1453 + }, + { + "epoch": 0.17, + "learning_rate": 2.535409106871122e-07, + "logits/chosen": -2.0611002445220947, + "logits/rejected": -2.426750898361206, + "logps/chosen": -449.2939453125, + "logps/rejected": -314.9515075683594, + "loss": 0.3759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34815770387649536, + "rewards/margins": 1.5584619045257568, + "rewards/rejected": -1.906619668006897, + "step": 1454 + }, + { + "epoch": 0.17, + "learning_rate": 2.53505794217488e-07, + "logits/chosen": -2.0905263423919678, + "logits/rejected": -2.058293342590332, + "logps/chosen": -418.3416748046875, + "logps/rejected": -405.9171447753906, + "loss": 0.3834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7426719069480896, + "rewards/margins": 1.7358003854751587, + "rewards/rejected": -2.4784722328186035, + "step": 1455 + }, + { + "epoch": 0.17, + "learning_rate": 2.5347067774786373e-07, + "logits/chosen": -2.1991074085235596, + "logits/rejected": -2.0831186771392822, + "logps/chosen": -208.51644897460938, + "logps/rejected": -271.2768249511719, + "loss": 0.8948, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6621257066726685, + "rewards/margins": 1.4068602323532104, + "rewards/rejected": -2.0689857006073, + "step": 1456 + }, + { + "epoch": 0.17, + "learning_rate": 2.534355612782395e-07, + "logits/chosen": -2.047623634338379, + "logits/rejected": -2.009730339050293, + "logps/chosen": -323.537841796875, + "logps/rejected": -320.63555908203125, + "loss": 0.5102, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2886481285095215, + "rewards/margins": 1.1147687435150146, + "rewards/rejected": -2.403416633605957, + "step": 1457 + }, + { + "epoch": 0.17, + "learning_rate": 2.5340044480861524e-07, + "logits/chosen": -2.405355215072632, + "logits/rejected": -2.2230451107025146, + "logps/chosen": -202.04840087890625, + "logps/rejected": -266.70745849609375, + "loss": 0.4679, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1368156671524048, + "rewards/margins": 1.232297420501709, + "rewards/rejected": -2.3691132068634033, + "step": 1458 + }, + { + "epoch": 0.17, + "learning_rate": 2.53365328338991e-07, + "logits/chosen": -2.237816572189331, + "logits/rejected": -2.301853895187378, + "logps/chosen": -272.28033447265625, + "logps/rejected": -316.1462097167969, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8022960424423218, + "rewards/margins": 2.1576015949249268, + "rewards/rejected": -2.959897518157959, + "step": 1459 + }, + { + "epoch": 0.17, + "learning_rate": 2.533302118693667e-07, + "logits/chosen": -1.6898976564407349, + "logits/rejected": -2.0771236419677734, + "logps/chosen": -256.1647644042969, + "logps/rejected": -179.3927001953125, + "loss": 0.9411, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2242012023925781, + "rewards/margins": 0.5768178701400757, + "rewards/rejected": -1.8010190725326538, + "step": 1460 + }, + { + "epoch": 0.17, + "learning_rate": 2.532950953997425e-07, + "logits/chosen": -1.918241024017334, + "logits/rejected": -2.113833427429199, + "logps/chosen": -287.7816162109375, + "logps/rejected": -255.79994201660156, + "loss": 0.3519, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35684239864349365, + "rewards/margins": 1.0952796936035156, + "rewards/rejected": -1.4521219730377197, + "step": 1461 + }, + { + "epoch": 0.17, + "learning_rate": 2.532599789301182e-07, + "logits/chosen": -2.4950191974639893, + "logits/rejected": -2.200892210006714, + "logps/chosen": -232.8123779296875, + "logps/rejected": -311.8076171875, + "loss": 0.2875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7747405767440796, + "rewards/margins": 2.0732250213623047, + "rewards/rejected": -2.847965717315674, + "step": 1462 + }, + { + "epoch": 0.17, + "learning_rate": 2.5322486246049396e-07, + "logits/chosen": -3.0239222049713135, + "logits/rejected": -3.014061450958252, + "logps/chosen": -413.5745849609375, + "logps/rejected": -412.91229248046875, + "loss": 0.4055, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3325994610786438, + "rewards/margins": 2.1693167686462402, + "rewards/rejected": -2.5019164085388184, + "step": 1463 + }, + { + "epoch": 0.17, + "learning_rate": 2.531897459908697e-07, + "logits/chosen": -2.185575485229492, + "logits/rejected": -2.183332920074463, + "logps/chosen": -357.06689453125, + "logps/rejected": -325.5648498535156, + "loss": 0.4979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9377651214599609, + "rewards/margins": 1.27694833278656, + "rewards/rejected": -2.2147133350372314, + "step": 1464 + }, + { + "epoch": 0.17, + "learning_rate": 2.5315462952124546e-07, + "logits/chosen": -2.079784393310547, + "logits/rejected": -2.276216983795166, + "logps/chosen": -289.32781982421875, + "logps/rejected": -287.7579650878906, + "loss": 0.4363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5824805498123169, + "rewards/margins": 1.4183870553970337, + "rewards/rejected": -2.0008676052093506, + "step": 1465 + }, + { + "epoch": 0.17, + "learning_rate": 2.531195130516212e-07, + "logits/chosen": -2.9817087650299072, + "logits/rejected": -2.7195425033569336, + "logps/chosen": -286.46563720703125, + "logps/rejected": -291.26171875, + "loss": 0.1203, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0437575578689575, + "rewards/margins": 3.656738042831421, + "rewards/rejected": -4.700495719909668, + "step": 1466 + }, + { + "epoch": 0.17, + "learning_rate": 2.5308439658199697e-07, + "logits/chosen": -2.6500039100646973, + "logits/rejected": -2.8530073165893555, + "logps/chosen": -259.6836853027344, + "logps/rejected": -263.1490478515625, + "loss": 0.2537, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9069244861602783, + "rewards/margins": 2.776585578918457, + "rewards/rejected": -3.6835103034973145, + "step": 1467 + }, + { + "epoch": 0.17, + "learning_rate": 2.5304928011237267e-07, + "logits/chosen": -2.2007908821105957, + "logits/rejected": -2.0457992553710938, + "logps/chosen": -127.44408416748047, + "logps/rejected": -189.06320190429688, + "loss": 0.5043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6237907409667969, + "rewards/margins": 1.0050709247589111, + "rewards/rejected": -1.6288617849349976, + "step": 1468 + }, + { + "epoch": 0.17, + "learning_rate": 2.5301416364274843e-07, + "logits/chosen": -2.837238073348999, + "logits/rejected": -2.499772548675537, + "logps/chosen": -161.15367126464844, + "logps/rejected": -195.20578002929688, + "loss": 0.426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05034524202346802, + "rewards/margins": 1.5147403478622437, + "rewards/rejected": -1.5650856494903564, + "step": 1469 + }, + { + "epoch": 0.17, + "learning_rate": 2.529790471731242e-07, + "logits/chosen": -2.144737482070923, + "logits/rejected": -2.1501965522766113, + "logps/chosen": -431.30364990234375, + "logps/rejected": -248.87770080566406, + "loss": 1.6573, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.13703989982605, + "rewards/margins": -0.7797569632530212, + "rewards/rejected": -1.3572829961776733, + "step": 1470 + }, + { + "epoch": 0.17, + "learning_rate": 2.5294393070349993e-07, + "logits/chosen": -2.260286569595337, + "logits/rejected": -2.2041425704956055, + "logps/chosen": -259.46063232421875, + "logps/rejected": -217.04664611816406, + "loss": 0.5886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8728224635124207, + "rewards/margins": 0.8983784914016724, + "rewards/rejected": -1.7712010145187378, + "step": 1471 + }, + { + "epoch": 0.17, + "learning_rate": 2.529088142338757e-07, + "logits/chosen": -2.183725357055664, + "logits/rejected": -2.3162431716918945, + "logps/chosen": -352.0470275878906, + "logps/rejected": -355.71826171875, + "loss": 0.4368, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.002232551574707, + "rewards/margins": 1.578204870223999, + "rewards/rejected": -2.580437421798706, + "step": 1472 + }, + { + "epoch": 0.17, + "learning_rate": 2.528736977642514e-07, + "logits/chosen": -2.1868815422058105, + "logits/rejected": -2.271221160888672, + "logps/chosen": -343.32275390625, + "logps/rejected": -348.3271484375, + "loss": 0.2301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27067774534225464, + "rewards/margins": 1.7345961332321167, + "rewards/rejected": -2.0052738189697266, + "step": 1473 + }, + { + "epoch": 0.17, + "learning_rate": 2.528385812946272e-07, + "logits/chosen": -2.7356133460998535, + "logits/rejected": -2.6315805912017822, + "logps/chosen": -350.7005615234375, + "logps/rejected": -276.1532897949219, + "loss": 0.5868, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7299551963806152, + "rewards/margins": 0.48312902450561523, + "rewards/rejected": -2.2130842208862305, + "step": 1474 + }, + { + "epoch": 0.17, + "learning_rate": 2.528034648250029e-07, + "logits/chosen": -2.121943950653076, + "logits/rejected": -2.4069366455078125, + "logps/chosen": -335.56121826171875, + "logps/rejected": -409.346923828125, + "loss": 0.7743, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.013409972190857, + "rewards/margins": 0.5931792259216309, + "rewards/rejected": -1.6065889596939087, + "step": 1475 + }, + { + "epoch": 0.17, + "learning_rate": 2.5276834835537865e-07, + "logits/chosen": -2.6400935649871826, + "logits/rejected": -2.433692455291748, + "logps/chosen": -117.56189727783203, + "logps/rejected": -231.05679321289062, + "loss": 0.9022, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6586323976516724, + "rewards/margins": 1.1608939170837402, + "rewards/rejected": -2.819526195526123, + "step": 1476 + }, + { + "epoch": 0.17, + "learning_rate": 2.527332318857544e-07, + "logits/chosen": -2.356785774230957, + "logits/rejected": -2.5544486045837402, + "logps/chosen": -223.3623046875, + "logps/rejected": -220.1260986328125, + "loss": 0.6458, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1552656888961792, + "rewards/margins": 1.250363826751709, + "rewards/rejected": -2.4056296348571777, + "step": 1477 + }, + { + "epoch": 0.17, + "learning_rate": 2.5269811541613016e-07, + "logits/chosen": -2.5402843952178955, + "logits/rejected": -2.7233588695526123, + "logps/chosen": -328.81414794921875, + "logps/rejected": -173.1459503173828, + "loss": 0.6443, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3267948627471924, + "rewards/margins": 1.1733636856079102, + "rewards/rejected": -2.5001583099365234, + "step": 1478 + }, + { + "epoch": 0.17, + "learning_rate": 2.526629989465059e-07, + "logits/chosen": -2.12280011177063, + "logits/rejected": -2.4290876388549805, + "logps/chosen": -320.98675537109375, + "logps/rejected": -262.6634826660156, + "loss": 0.2497, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5268356800079346, + "rewards/margins": 2.347543478012085, + "rewards/rejected": -2.8743789196014404, + "step": 1479 + }, + { + "epoch": 0.17, + "learning_rate": 2.5262788247688167e-07, + "logits/chosen": -1.750935673713684, + "logits/rejected": -2.177607536315918, + "logps/chosen": -497.40643310546875, + "logps/rejected": -324.4679870605469, + "loss": 0.2476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4698697328567505, + "rewards/margins": 1.5603771209716797, + "rewards/rejected": -2.0302469730377197, + "step": 1480 + }, + { + "epoch": 0.17, + "learning_rate": 2.5259276600725737e-07, + "logits/chosen": -2.5945091247558594, + "logits/rejected": -2.802032470703125, + "logps/chosen": -249.27456665039062, + "logps/rejected": -182.26950073242188, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.053958535194397, + "rewards/margins": 2.87036395072937, + "rewards/rejected": -3.9243228435516357, + "step": 1481 + }, + { + "epoch": 0.17, + "learning_rate": 2.525576495376331e-07, + "logits/chosen": -2.7544069290161133, + "logits/rejected": -2.8370490074157715, + "logps/chosen": -362.31866455078125, + "logps/rejected": -236.46829223632812, + "loss": 0.5575, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.489181786775589, + "rewards/margins": 0.9001466631889343, + "rewards/rejected": -1.3893283605575562, + "step": 1482 + }, + { + "epoch": 0.17, + "learning_rate": 2.525225330680089e-07, + "logits/chosen": -2.4800589084625244, + "logits/rejected": -2.4145984649658203, + "logps/chosen": -567.6148681640625, + "logps/rejected": -415.4486083984375, + "loss": 0.3637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5022991299629211, + "rewards/margins": 1.590904712677002, + "rewards/rejected": -2.0932037830352783, + "step": 1483 + }, + { + "epoch": 0.17, + "learning_rate": 2.5248741659838463e-07, + "logits/chosen": -1.87093186378479, + "logits/rejected": -2.203968048095703, + "logps/chosen": -367.1369323730469, + "logps/rejected": -299.3676452636719, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09238994121551514, + "rewards/margins": 2.5262022018432617, + "rewards/rejected": -2.6185920238494873, + "step": 1484 + }, + { + "epoch": 0.17, + "learning_rate": 2.524523001287604e-07, + "logits/chosen": -1.6788561344146729, + "logits/rejected": -2.0267133712768555, + "logps/chosen": -446.69305419921875, + "logps/rejected": -295.22607421875, + "loss": 0.8824, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9284944534301758, + "rewards/margins": 0.2866774797439575, + "rewards/rejected": -1.2151719331741333, + "step": 1485 + }, + { + "epoch": 0.17, + "learning_rate": 2.5241718365913614e-07, + "logits/chosen": -2.2229199409484863, + "logits/rejected": -2.0325074195861816, + "logps/chosen": -163.58041381835938, + "logps/rejected": -317.24676513671875, + "loss": 0.4339, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3193521499633789, + "rewards/margins": 1.1435627937316895, + "rewards/rejected": -1.4629149436950684, + "step": 1486 + }, + { + "epoch": 0.17, + "learning_rate": 2.523820671895119e-07, + "logits/chosen": -2.400300979614258, + "logits/rejected": -2.6190855503082275, + "logps/chosen": -267.4810791015625, + "logps/rejected": -237.98696899414062, + "loss": 0.6314, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0247948169708252, + "rewards/margins": 1.064338207244873, + "rewards/rejected": -2.089132785797119, + "step": 1487 + }, + { + "epoch": 0.17, + "learning_rate": 2.5234695071988764e-07, + "logits/chosen": -1.717982292175293, + "logits/rejected": -2.0953235626220703, + "logps/chosen": -531.468994140625, + "logps/rejected": -273.0938720703125, + "loss": 0.3967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7443298101425171, + "rewards/margins": 1.3125721216201782, + "rewards/rejected": -2.0569019317626953, + "step": 1488 + }, + { + "epoch": 0.17, + "learning_rate": 2.5231183425026335e-07, + "logits/chosen": -2.743671417236328, + "logits/rejected": -2.7621986865997314, + "logps/chosen": -302.71966552734375, + "logps/rejected": -259.0999755859375, + "loss": 0.1653, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3674342930316925, + "rewards/margins": 4.502589702606201, + "rewards/rejected": -4.87002420425415, + "step": 1489 + }, + { + "epoch": 0.17, + "learning_rate": 2.522767177806391e-07, + "logits/chosen": -2.4020228385925293, + "logits/rejected": -2.6062426567077637, + "logps/chosen": -401.56109619140625, + "logps/rejected": -214.11264038085938, + "loss": 0.8034, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.970498263835907, + "rewards/margins": 0.163606196641922, + "rewards/rejected": -1.1341044902801514, + "step": 1490 + }, + { + "epoch": 0.17, + "learning_rate": 2.5224160131101485e-07, + "logits/chosen": -2.582087516784668, + "logits/rejected": -2.6101441383361816, + "logps/chosen": -130.47023010253906, + "logps/rejected": -178.03341674804688, + "loss": 0.3835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6833630800247192, + "rewards/margins": 1.9894368648529053, + "rewards/rejected": -2.672799825668335, + "step": 1491 + }, + { + "epoch": 0.17, + "learning_rate": 2.522064848413906e-07, + "logits/chosen": -2.1899256706237793, + "logits/rejected": -2.4205644130706787, + "logps/chosen": -361.39581298828125, + "logps/rejected": -201.8250732421875, + "loss": 0.8052, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.027609944343567, + "rewards/margins": 0.3820081651210785, + "rewards/rejected": -1.4096180200576782, + "step": 1492 + }, + { + "epoch": 0.17, + "learning_rate": 2.5217136837176636e-07, + "logits/chosen": -2.2818233966827393, + "logits/rejected": -2.267136573791504, + "logps/chosen": -612.71142578125, + "logps/rejected": -556.1237182617188, + "loss": 0.2417, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2190237045288086, + "rewards/margins": 2.359898090362549, + "rewards/rejected": -3.5789220333099365, + "step": 1493 + }, + { + "epoch": 0.17, + "learning_rate": 2.5213625190214206e-07, + "logits/chosen": -2.0867607593536377, + "logits/rejected": -2.2640206813812256, + "logps/chosen": -313.6826477050781, + "logps/rejected": -248.52105712890625, + "loss": 0.8814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7575123310089111, + "rewards/margins": 1.2565319538116455, + "rewards/rejected": -2.0140445232391357, + "step": 1494 + }, + { + "epoch": 0.17, + "learning_rate": 2.5210113543251787e-07, + "logits/chosen": -2.59749174118042, + "logits/rejected": -2.674429416656494, + "logps/chosen": -335.1714782714844, + "logps/rejected": -176.70425415039062, + "loss": 0.6559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8021615743637085, + "rewards/margins": 1.2342896461486816, + "rewards/rejected": -2.0364513397216797, + "step": 1495 + }, + { + "epoch": 0.17, + "learning_rate": 2.520660189628936e-07, + "logits/chosen": -2.266188383102417, + "logits/rejected": -2.2081103324890137, + "logps/chosen": -159.84576416015625, + "logps/rejected": -281.5212097167969, + "loss": 0.2207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5655825138092041, + "rewards/margins": 2.724238157272339, + "rewards/rejected": -3.289820671081543, + "step": 1496 + }, + { + "epoch": 0.17, + "learning_rate": 2.520309024932693e-07, + "logits/chosen": -2.3181092739105225, + "logits/rejected": -2.2372686862945557, + "logps/chosen": -173.49517822265625, + "logps/rejected": -253.54330444335938, + "loss": 0.6156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7238270044326782, + "rewards/margins": 2.102701187133789, + "rewards/rejected": -3.826528549194336, + "step": 1497 + }, + { + "epoch": 0.17, + "learning_rate": 2.519957860236451e-07, + "logits/chosen": -2.383615732192993, + "logits/rejected": -2.4175381660461426, + "logps/chosen": -201.11044311523438, + "logps/rejected": -192.37384033203125, + "loss": 0.4995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6890897750854492, + "rewards/margins": 2.3021199703216553, + "rewards/rejected": -2.9912097454071045, + "step": 1498 + }, + { + "epoch": 0.17, + "learning_rate": 2.5196066955402083e-07, + "logits/chosen": -2.4370129108428955, + "logits/rejected": -2.4099812507629395, + "logps/chosen": -274.5272521972656, + "logps/rejected": -259.6683349609375, + "loss": 0.7516, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1922004222869873, + "rewards/margins": 0.3770023584365845, + "rewards/rejected": -1.5692027807235718, + "step": 1499 + }, + { + "epoch": 0.17, + "learning_rate": 2.519255530843966e-07, + "logits/chosen": -2.4291391372680664, + "logits/rejected": -2.4455552101135254, + "logps/chosen": -186.38543701171875, + "logps/rejected": -224.7554473876953, + "loss": 0.662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1346888542175293, + "rewards/margins": 0.8852522969245911, + "rewards/rejected": -2.0199413299560547, + "step": 1500 + }, + { + "epoch": 0.17, + "learning_rate": 2.5189043661477234e-07, + "logits/chosen": -2.6457228660583496, + "logits/rejected": -2.589989185333252, + "logps/chosen": -143.7434539794922, + "logps/rejected": -187.97845458984375, + "loss": 0.4456, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6070245504379272, + "rewards/margins": 1.0761301517486572, + "rewards/rejected": -1.6831547021865845, + "step": 1501 + }, + { + "epoch": 0.17, + "learning_rate": 2.5185532014514804e-07, + "logits/chosen": -2.902381658554077, + "logits/rejected": -2.6202268600463867, + "logps/chosen": -185.25689697265625, + "logps/rejected": -228.10646057128906, + "loss": 0.4226, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8534721732139587, + "rewards/margins": 1.8697742223739624, + "rewards/rejected": -2.7232463359832764, + "step": 1502 + }, + { + "epoch": 0.17, + "learning_rate": 2.518202036755238e-07, + "logits/chosen": -2.565622568130493, + "logits/rejected": -2.4744882583618164, + "logps/chosen": -100.48835754394531, + "logps/rejected": -185.7887420654297, + "loss": 0.2644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8563423752784729, + "rewards/margins": 2.0156164169311523, + "rewards/rejected": -2.8719587326049805, + "step": 1503 + }, + { + "epoch": 0.17, + "learning_rate": 2.5178508720589955e-07, + "logits/chosen": -2.026784896850586, + "logits/rejected": -1.8989284038543701, + "logps/chosen": -454.0887451171875, + "logps/rejected": -404.7982177734375, + "loss": 0.4163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41011226177215576, + "rewards/margins": 1.4005322456359863, + "rewards/rejected": -1.8106446266174316, + "step": 1504 + }, + { + "epoch": 0.17, + "learning_rate": 2.517499707362753e-07, + "logits/chosen": -2.288508653640747, + "logits/rejected": -2.4336493015289307, + "logps/chosen": -199.82687377929688, + "logps/rejected": -203.02529907226562, + "loss": 0.6993, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9154245853424072, + "rewards/margins": 1.6614516973495483, + "rewards/rejected": -2.576876163482666, + "step": 1505 + }, + { + "epoch": 0.17, + "learning_rate": 2.5171485426665105e-07, + "logits/chosen": -2.83510684967041, + "logits/rejected": -2.878281831741333, + "logps/chosen": -169.41995239257812, + "logps/rejected": -211.17449951171875, + "loss": 0.6586, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8687607049942017, + "rewards/margins": 1.152592420578003, + "rewards/rejected": -2.021353244781494, + "step": 1506 + }, + { + "epoch": 0.17, + "learning_rate": 2.5167973779702676e-07, + "logits/chosen": -1.8529484272003174, + "logits/rejected": -2.077455997467041, + "logps/chosen": -677.58251953125, + "logps/rejected": -403.63330078125, + "loss": 0.3299, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5345842838287354, + "rewards/margins": 2.1660029888153076, + "rewards/rejected": -2.700587272644043, + "step": 1507 + }, + { + "epoch": 0.17, + "learning_rate": 2.5164462132740256e-07, + "logits/chosen": -2.9761455059051514, + "logits/rejected": -2.963833808898926, + "logps/chosen": -115.8536605834961, + "logps/rejected": -127.11263275146484, + "loss": 0.4996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7422603368759155, + "rewards/margins": 0.9672908782958984, + "rewards/rejected": -1.7095513343811035, + "step": 1508 + }, + { + "epoch": 0.17, + "learning_rate": 2.516095048577783e-07, + "logits/chosen": -2.293590784072876, + "logits/rejected": -2.4385581016540527, + "logps/chosen": -415.290283203125, + "logps/rejected": -300.81585693359375, + "loss": 0.486, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7375532388687134, + "rewards/margins": 1.1268913745880127, + "rewards/rejected": -1.8644447326660156, + "step": 1509 + }, + { + "epoch": 0.17, + "learning_rate": 2.51574388388154e-07, + "logits/chosen": -1.5779788494110107, + "logits/rejected": -1.607736587524414, + "logps/chosen": -242.29684448242188, + "logps/rejected": -221.63787841796875, + "loss": 0.6782, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8334588408470154, + "rewards/margins": 0.5896294713020325, + "rewards/rejected": -1.4230883121490479, + "step": 1510 + }, + { + "epoch": 0.17, + "learning_rate": 2.5153927191852977e-07, + "logits/chosen": -2.4199178218841553, + "logits/rejected": -2.6325342655181885, + "logps/chosen": -269.6604309082031, + "logps/rejected": -204.7611846923828, + "loss": 0.5625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9793627858161926, + "rewards/margins": 1.2248787879943848, + "rewards/rejected": -2.2042415142059326, + "step": 1511 + }, + { + "epoch": 0.17, + "learning_rate": 2.515041554489055e-07, + "logits/chosen": -2.5723795890808105, + "logits/rejected": -2.764373779296875, + "logps/chosen": -416.37005615234375, + "logps/rejected": -210.18475341796875, + "loss": 0.6445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32551461458206177, + "rewards/margins": 0.5047193169593811, + "rewards/rejected": -0.8302339911460876, + "step": 1512 + }, + { + "epoch": 0.17, + "learning_rate": 2.514690389792813e-07, + "logits/chosen": -2.044833183288574, + "logits/rejected": -2.276482105255127, + "logps/chosen": -342.4708251953125, + "logps/rejected": -353.476806640625, + "loss": 0.4537, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.250699758529663, + "rewards/margins": 1.7532479763031006, + "rewards/rejected": -3.0039477348327637, + "step": 1513 + }, + { + "epoch": 0.17, + "learning_rate": 2.5143392250965703e-07, + "logits/chosen": -2.765519618988037, + "logits/rejected": -2.8107495307922363, + "logps/chosen": -173.07467651367188, + "logps/rejected": -265.0313415527344, + "loss": 0.3801, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1696910858154297, + "rewards/margins": 1.5302252769470215, + "rewards/rejected": -2.699916362762451, + "step": 1514 + }, + { + "epoch": 0.17, + "learning_rate": 2.5139880604003273e-07, + "logits/chosen": -2.879636287689209, + "logits/rejected": -2.598350763320923, + "logps/chosen": -309.3323059082031, + "logps/rejected": -189.40219116210938, + "loss": 0.5481, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1885062456130981, + "rewards/margins": 1.0944628715515137, + "rewards/rejected": -2.2829692363739014, + "step": 1515 + }, + { + "epoch": 0.17, + "learning_rate": 2.513636895704085e-07, + "logits/chosen": -2.653984785079956, + "logits/rejected": -2.6193649768829346, + "logps/chosen": -369.2281494140625, + "logps/rejected": -258.5390625, + "loss": 0.1835, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0122499465942383, + "rewards/margins": 2.2103946208953857, + "rewards/rejected": -3.222644805908203, + "step": 1516 + }, + { + "epoch": 0.17, + "learning_rate": 2.513285731007843e-07, + "logits/chosen": -2.742457866668701, + "logits/rejected": -2.7537779808044434, + "logps/chosen": -318.4855651855469, + "logps/rejected": -169.52439880371094, + "loss": 0.5213, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4410058557987213, + "rewards/margins": 1.1944735050201416, + "rewards/rejected": -1.6354793310165405, + "step": 1517 + }, + { + "epoch": 0.17, + "learning_rate": 2.5129345663116e-07, + "logits/chosen": -2.4890570640563965, + "logits/rejected": -2.6782941818237305, + "logps/chosen": -318.1689758300781, + "logps/rejected": -227.27899169921875, + "loss": 0.5348, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1846905946731567, + "rewards/margins": 1.118526816368103, + "rewards/rejected": -2.3032174110412598, + "step": 1518 + }, + { + "epoch": 0.18, + "learning_rate": 2.5125834016153575e-07, + "logits/chosen": -2.055201530456543, + "logits/rejected": -1.9765236377716064, + "logps/chosen": -494.007080078125, + "logps/rejected": -271.2669372558594, + "loss": 0.2853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10674116015434265, + "rewards/margins": 1.9446046352386475, + "rewards/rejected": -2.0513458251953125, + "step": 1519 + }, + { + "epoch": 0.18, + "learning_rate": 2.512232236919115e-07, + "logits/chosen": -1.7968982458114624, + "logits/rejected": -2.1774258613586426, + "logps/chosen": -206.50265502929688, + "logps/rejected": -167.2733612060547, + "loss": 0.6962, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7994339466094971, + "rewards/margins": 0.4959246516227722, + "rewards/rejected": -1.295358657836914, + "step": 1520 + }, + { + "epoch": 0.18, + "learning_rate": 2.5118810722228726e-07, + "logits/chosen": -2.0321855545043945, + "logits/rejected": -1.9211211204528809, + "logps/chosen": -326.3836669921875, + "logps/rejected": -380.6479797363281, + "loss": 0.5269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6979181170463562, + "rewards/margins": 1.624923586845398, + "rewards/rejected": -2.3228416442871094, + "step": 1521 + }, + { + "epoch": 0.18, + "learning_rate": 2.51152990752663e-07, + "logits/chosen": -2.5439274311065674, + "logits/rejected": -2.28711199760437, + "logps/chosen": -187.217041015625, + "logps/rejected": -236.4950408935547, + "loss": 0.3058, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18444910645484924, + "rewards/margins": 1.5871895551681519, + "rewards/rejected": -1.7716388702392578, + "step": 1522 + }, + { + "epoch": 0.18, + "learning_rate": 2.511178742830387e-07, + "logits/chosen": -2.907531261444092, + "logits/rejected": -2.754185199737549, + "logps/chosen": -286.95404052734375, + "logps/rejected": -309.8794250488281, + "loss": 0.2945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.505256175994873, + "rewards/margins": 2.1244397163391113, + "rewards/rejected": -2.6296958923339844, + "step": 1523 + }, + { + "epoch": 0.18, + "learning_rate": 2.5108275781341447e-07, + "logits/chosen": -2.05380916595459, + "logits/rejected": -2.195934295654297, + "logps/chosen": -213.34909057617188, + "logps/rejected": -193.28001403808594, + "loss": 0.6849, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2171998023986816, + "rewards/margins": 0.9047720432281494, + "rewards/rejected": -2.121971845626831, + "step": 1524 + }, + { + "epoch": 0.18, + "learning_rate": 2.510476413437902e-07, + "logits/chosen": -2.068610191345215, + "logits/rejected": -2.090682029724121, + "logps/chosen": -317.5472717285156, + "logps/rejected": -284.917724609375, + "loss": 0.6758, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3815022706985474, + "rewards/margins": 0.8914804458618164, + "rewards/rejected": -2.2729828357696533, + "step": 1525 + }, + { + "epoch": 0.18, + "learning_rate": 2.5101252487416597e-07, + "logits/chosen": -2.3966753482818604, + "logits/rejected": -2.560401439666748, + "logps/chosen": -339.21539306640625, + "logps/rejected": -217.06546020507812, + "loss": 0.2546, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4790683388710022, + "rewards/margins": 2.072140693664551, + "rewards/rejected": -2.551208972930908, + "step": 1526 + }, + { + "epoch": 0.18, + "learning_rate": 2.5097740840454173e-07, + "logits/chosen": -2.3164217472076416, + "logits/rejected": -1.982287049293518, + "logps/chosen": -135.1831512451172, + "logps/rejected": -274.124755859375, + "loss": 0.2675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7100237011909485, + "rewards/margins": 1.9353067874908447, + "rewards/rejected": -2.6453304290771484, + "step": 1527 + }, + { + "epoch": 0.18, + "learning_rate": 2.5094229193491743e-07, + "logits/chosen": -2.6175553798675537, + "logits/rejected": -2.6481902599334717, + "logps/chosen": -451.3737487792969, + "logps/rejected": -291.0794982910156, + "loss": 0.2821, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22280937433242798, + "rewards/margins": 1.947709321975708, + "rewards/rejected": -1.7248998880386353, + "step": 1528 + }, + { + "epoch": 0.18, + "learning_rate": 2.5090717546529323e-07, + "logits/chosen": -2.05659556388855, + "logits/rejected": -1.9805394411087036, + "logps/chosen": -385.59698486328125, + "logps/rejected": -340.3865966796875, + "loss": 0.3786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5624405741691589, + "rewards/margins": 1.8018486499786377, + "rewards/rejected": -2.3642892837524414, + "step": 1529 + }, + { + "epoch": 0.18, + "learning_rate": 2.50872058995669e-07, + "logits/chosen": -1.8102757930755615, + "logits/rejected": -2.0495643615722656, + "logps/chosen": -304.3263854980469, + "logps/rejected": -220.1307373046875, + "loss": 0.5054, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.208686351776123, + "rewards/margins": 1.4434365034103394, + "rewards/rejected": -2.652122974395752, + "step": 1530 + }, + { + "epoch": 0.18, + "learning_rate": 2.508369425260447e-07, + "logits/chosen": -2.4502358436584473, + "logits/rejected": -2.1790506839752197, + "logps/chosen": -81.51646423339844, + "logps/rejected": -162.7396697998047, + "loss": 0.3075, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.538788080215454, + "rewards/margins": 2.7633860111236572, + "rewards/rejected": -4.302174091339111, + "step": 1531 + }, + { + "epoch": 0.18, + "learning_rate": 2.5080182605642044e-07, + "logits/chosen": -2.6307830810546875, + "logits/rejected": -2.5338830947875977, + "logps/chosen": -186.46148681640625, + "logps/rejected": -149.55955505371094, + "loss": 0.2551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2514161467552185, + "rewards/margins": 1.53307044506073, + "rewards/rejected": -1.7844866514205933, + "step": 1532 + }, + { + "epoch": 0.18, + "learning_rate": 2.507667095867962e-07, + "logits/chosen": -2.4868228435516357, + "logits/rejected": -2.4353103637695312, + "logps/chosen": -364.79144287109375, + "logps/rejected": -403.99224853515625, + "loss": 0.6051, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0374011993408203, + "rewards/margins": 1.105783224105835, + "rewards/rejected": -2.1431844234466553, + "step": 1533 + }, + { + "epoch": 0.18, + "learning_rate": 2.5073159311717195e-07, + "logits/chosen": -2.6679153442382812, + "logits/rejected": -2.6328914165496826, + "logps/chosen": -140.71360778808594, + "logps/rejected": -139.76443481445312, + "loss": 0.4384, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2574614882469177, + "rewards/margins": 1.0303488969802856, + "rewards/rejected": -1.2878103256225586, + "step": 1534 + }, + { + "epoch": 0.18, + "learning_rate": 2.506964766475477e-07, + "logits/chosen": -2.3959171772003174, + "logits/rejected": -2.4407997131347656, + "logps/chosen": -208.7578887939453, + "logps/rejected": -257.5549621582031, + "loss": 0.4681, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1453748941421509, + "rewards/margins": 1.1838172674179077, + "rewards/rejected": -2.3291921615600586, + "step": 1535 + }, + { + "epoch": 0.18, + "learning_rate": 2.506613601779234e-07, + "logits/chosen": -2.154247283935547, + "logits/rejected": -2.3391964435577393, + "logps/chosen": -285.8394470214844, + "logps/rejected": -200.63729858398438, + "loss": 0.4934, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8229033946990967, + "rewards/margins": 1.780815839767456, + "rewards/rejected": -2.6037192344665527, + "step": 1536 + }, + { + "epoch": 0.18, + "learning_rate": 2.5062624370829916e-07, + "logits/chosen": -2.0855345726013184, + "logits/rejected": -2.2738709449768066, + "logps/chosen": -285.11767578125, + "logps/rejected": -208.68035888671875, + "loss": 0.9931, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4296092987060547, + "rewards/margins": 1.0893975496292114, + "rewards/rejected": -2.5190067291259766, + "step": 1537 + }, + { + "epoch": 0.18, + "learning_rate": 2.505911272386749e-07, + "logits/chosen": -1.9329721927642822, + "logits/rejected": -2.2149569988250732, + "logps/chosen": -290.8802490234375, + "logps/rejected": -250.74378967285156, + "loss": 0.7964, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8103494048118591, + "rewards/margins": 1.059680461883545, + "rewards/rejected": -1.8700298070907593, + "step": 1538 + }, + { + "epoch": 0.18, + "learning_rate": 2.5055601076905067e-07, + "logits/chosen": -2.173102378845215, + "logits/rejected": -2.0136613845825195, + "logps/chosen": -420.189697265625, + "logps/rejected": -467.72210693359375, + "loss": 0.5036, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0542739629745483, + "rewards/margins": 1.5393060445785522, + "rewards/rejected": -2.5935800075531006, + "step": 1539 + }, + { + "epoch": 0.18, + "learning_rate": 2.505208942994264e-07, + "logits/chosen": -2.6846680641174316, + "logits/rejected": -2.6805546283721924, + "logps/chosen": -485.8791198730469, + "logps/rejected": -229.34799194335938, + "loss": 0.8554, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6731443405151367, + "rewards/margins": 0.8011288046836853, + "rewards/rejected": -2.474273204803467, + "step": 1540 + }, + { + "epoch": 0.18, + "learning_rate": 2.504857778298022e-07, + "logits/chosen": -2.629429340362549, + "logits/rejected": -2.525007486343384, + "logps/chosen": -229.12088012695312, + "logps/rejected": -300.17974853515625, + "loss": 0.4552, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4043630361557007, + "rewards/margins": 1.9502670764923096, + "rewards/rejected": -2.3546299934387207, + "step": 1541 + }, + { + "epoch": 0.18, + "learning_rate": 2.5045066136017793e-07, + "logits/chosen": -2.942347526550293, + "logits/rejected": -2.890491485595703, + "logps/chosen": -410.1006774902344, + "logps/rejected": -231.36703491210938, + "loss": 0.6521, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.32585608959198, + "rewards/margins": 0.8723167181015015, + "rewards/rejected": -2.1981728076934814, + "step": 1542 + }, + { + "epoch": 0.18, + "learning_rate": 2.504155448905537e-07, + "logits/chosen": -2.2827978134155273, + "logits/rejected": -2.301215171813965, + "logps/chosen": -272.2730712890625, + "logps/rejected": -343.989990234375, + "loss": 0.7425, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0510903596878052, + "rewards/margins": 1.4585121870040894, + "rewards/rejected": -2.5096027851104736, + "step": 1543 + }, + { + "epoch": 0.18, + "learning_rate": 2.503804284209294e-07, + "logits/chosen": -2.2732362747192383, + "logits/rejected": -2.582937240600586, + "logps/chosen": -343.7640380859375, + "logps/rejected": -361.8148498535156, + "loss": 0.2129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3317148685455322, + "rewards/margins": 2.3844175338745117, + "rewards/rejected": -2.716132402420044, + "step": 1544 + }, + { + "epoch": 0.18, + "learning_rate": 2.5034531195130514e-07, + "logits/chosen": -2.428518056869507, + "logits/rejected": -2.6231911182403564, + "logps/chosen": -496.81982421875, + "logps/rejected": -209.0087890625, + "loss": 0.1783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2984122037887573, + "rewards/margins": 2.6578333377838135, + "rewards/rejected": -2.9562454223632812, + "step": 1545 + }, + { + "epoch": 0.18, + "learning_rate": 2.503101954816809e-07, + "logits/chosen": -2.558397054672241, + "logits/rejected": -2.501084804534912, + "logps/chosen": -191.78929138183594, + "logps/rejected": -168.882568359375, + "loss": 0.5471, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2358651161193848, + "rewards/margins": 0.8981565237045288, + "rewards/rejected": -2.134021759033203, + "step": 1546 + }, + { + "epoch": 0.18, + "learning_rate": 2.5027507901205665e-07, + "logits/chosen": -2.323981761932373, + "logits/rejected": -1.8915328979492188, + "logps/chosen": -282.4101867675781, + "logps/rejected": -304.0682373046875, + "loss": 0.3075, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31685054302215576, + "rewards/margins": 1.8054734468460083, + "rewards/rejected": -2.122323989868164, + "step": 1547 + }, + { + "epoch": 0.18, + "learning_rate": 2.502399625424324e-07, + "logits/chosen": -1.731626272201538, + "logits/rejected": -2.003450870513916, + "logps/chosen": -292.78875732421875, + "logps/rejected": -267.60546875, + "loss": 0.404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22206822037696838, + "rewards/margins": 2.1102633476257324, + "rewards/rejected": -2.332331657409668, + "step": 1548 + }, + { + "epoch": 0.18, + "learning_rate": 2.5020484607280815e-07, + "logits/chosen": -1.9372670650482178, + "logits/rejected": -2.1983771324157715, + "logps/chosen": -375.668212890625, + "logps/rejected": -306.9173278808594, + "loss": 0.3856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6179848909378052, + "rewards/margins": 2.105454206466675, + "rewards/rejected": -2.7234392166137695, + "step": 1549 + }, + { + "epoch": 0.18, + "learning_rate": 2.5016972960318385e-07, + "logits/chosen": -2.238081932067871, + "logits/rejected": -2.3096156120300293, + "logps/chosen": -320.4313049316406, + "logps/rejected": -299.1278076171875, + "loss": 0.2321, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4374780058860779, + "rewards/margins": 1.6220752000808716, + "rewards/rejected": -2.0595531463623047, + "step": 1550 + }, + { + "epoch": 0.18, + "learning_rate": 2.5013461313355966e-07, + "logits/chosen": -1.738297700881958, + "logits/rejected": -1.8667881488800049, + "logps/chosen": -329.7322998046875, + "logps/rejected": -265.6767883300781, + "loss": 0.4815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5749979019165039, + "rewards/margins": 1.7215700149536133, + "rewards/rejected": -2.296567916870117, + "step": 1551 + }, + { + "epoch": 0.18, + "learning_rate": 2.5009949666393536e-07, + "logits/chosen": -1.970654010772705, + "logits/rejected": -1.9667433500289917, + "logps/chosen": -266.9859619140625, + "logps/rejected": -298.43084716796875, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7003635168075562, + "rewards/margins": 2.9521048069000244, + "rewards/rejected": -3.652468204498291, + "step": 1552 + }, + { + "epoch": 0.18, + "learning_rate": 2.500643801943111e-07, + "logits/chosen": -2.0663697719573975, + "logits/rejected": -2.185148239135742, + "logps/chosen": -294.646728515625, + "logps/rejected": -270.54296875, + "loss": 0.1788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0974542647600174, + "rewards/margins": 2.053542375564575, + "rewards/rejected": -2.150996685028076, + "step": 1553 + }, + { + "epoch": 0.18, + "learning_rate": 2.5002926372468687e-07, + "logits/chosen": -2.4250242710113525, + "logits/rejected": -2.366454839706421, + "logps/chosen": -214.58642578125, + "logps/rejected": -321.79168701171875, + "loss": 0.3698, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42891329526901245, + "rewards/margins": 1.8211779594421387, + "rewards/rejected": -2.250091075897217, + "step": 1554 + }, + { + "epoch": 0.18, + "learning_rate": 2.499941472550626e-07, + "logits/chosen": -2.4593453407287598, + "logits/rejected": -2.655930757522583, + "logps/chosen": -218.3666534423828, + "logps/rejected": -175.93743896484375, + "loss": 0.2027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5891812443733215, + "rewards/margins": 2.3856008052825928, + "rewards/rejected": -2.9747819900512695, + "step": 1555 + }, + { + "epoch": 0.18, + "learning_rate": 2.499590307854384e-07, + "logits/chosen": -2.8143563270568848, + "logits/rejected": -2.6714484691619873, + "logps/chosen": -233.66993713378906, + "logps/rejected": -174.26492309570312, + "loss": 0.5111, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2812843322753906, + "rewards/margins": 1.0636931657791138, + "rewards/rejected": -2.344977378845215, + "step": 1556 + }, + { + "epoch": 0.18, + "learning_rate": 2.4992391431581413e-07, + "logits/chosen": -2.350741386413574, + "logits/rejected": -1.9745099544525146, + "logps/chosen": -254.73846435546875, + "logps/rejected": -376.6320495605469, + "loss": 0.5865, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1073479652404785, + "rewards/margins": 1.2839926481246948, + "rewards/rejected": -2.391340732574463, + "step": 1557 + }, + { + "epoch": 0.18, + "learning_rate": 2.4988879784618983e-07, + "logits/chosen": -2.618100166320801, + "logits/rejected": -2.5141637325286865, + "logps/chosen": -89.48007202148438, + "logps/rejected": -243.0951385498047, + "loss": 0.2574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.550902783870697, + "rewards/margins": 1.8070201873779297, + "rewards/rejected": -2.3579230308532715, + "step": 1558 + }, + { + "epoch": 0.18, + "learning_rate": 2.498536813765656e-07, + "logits/chosen": -2.8834917545318604, + "logits/rejected": -2.8859174251556396, + "logps/chosen": -246.24415588378906, + "logps/rejected": -399.62060546875, + "loss": 1.0428, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3589619398117065, + "rewards/margins": 1.189483880996704, + "rewards/rejected": -2.5484459400177, + "step": 1559 + }, + { + "epoch": 0.18, + "learning_rate": 2.4981856490694134e-07, + "logits/chosen": -2.447308301925659, + "logits/rejected": -2.4498462677001953, + "logps/chosen": -355.4088134765625, + "logps/rejected": -228.6422576904297, + "loss": 0.5282, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4280747175216675, + "rewards/margins": 1.4356765747070312, + "rewards/rejected": -2.863751173019409, + "step": 1560 + }, + { + "epoch": 0.18, + "learning_rate": 2.497834484373171e-07, + "logits/chosen": -2.5722382068634033, + "logits/rejected": -2.579864025115967, + "logps/chosen": -271.6571044921875, + "logps/rejected": -313.1540832519531, + "loss": 0.1753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.411030650138855, + "rewards/margins": 2.1864538192749023, + "rewards/rejected": -2.597484588623047, + "step": 1561 + }, + { + "epoch": 0.18, + "learning_rate": 2.4974833196769285e-07, + "logits/chosen": -2.3632500171661377, + "logits/rejected": -2.576626777648926, + "logps/chosen": -240.87672424316406, + "logps/rejected": -193.84201049804688, + "loss": 0.5182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9120696187019348, + "rewards/margins": 1.0650715827941895, + "rewards/rejected": -1.9771413803100586, + "step": 1562 + }, + { + "epoch": 0.18, + "learning_rate": 2.4971321549806855e-07, + "logits/chosen": -1.986755132675171, + "logits/rejected": -2.1307475566864014, + "logps/chosen": -258.3935852050781, + "logps/rejected": -268.0465393066406, + "loss": 0.2526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12291165441274643, + "rewards/margins": 2.851883888244629, + "rewards/rejected": -2.9747955799102783, + "step": 1563 + }, + { + "epoch": 0.18, + "learning_rate": 2.4967809902844435e-07, + "logits/chosen": -2.7763619422912598, + "logits/rejected": -2.865114688873291, + "logps/chosen": -293.1759033203125, + "logps/rejected": -298.12725830078125, + "loss": 0.4206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7800661325454712, + "rewards/margins": 1.3049534559249878, + "rewards/rejected": -2.085019588470459, + "step": 1564 + }, + { + "epoch": 0.18, + "learning_rate": 2.4964298255882006e-07, + "logits/chosen": -2.3393173217773438, + "logits/rejected": -2.049412488937378, + "logps/chosen": -226.77743530273438, + "logps/rejected": -335.5418395996094, + "loss": 0.1626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.417337566614151, + "rewards/margins": 3.291382312774658, + "rewards/rejected": -3.7087202072143555, + "step": 1565 + }, + { + "epoch": 0.18, + "learning_rate": 2.496078660891958e-07, + "logits/chosen": -2.6303038597106934, + "logits/rejected": -2.6511764526367188, + "logps/chosen": -403.8421325683594, + "logps/rejected": -218.98870849609375, + "loss": 0.7746, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7978515625, + "rewards/margins": 0.8190271258354187, + "rewards/rejected": -1.6168787479400635, + "step": 1566 + }, + { + "epoch": 0.18, + "learning_rate": 2.4957274961957156e-07, + "logits/chosen": -2.4714596271514893, + "logits/rejected": -2.593947172164917, + "logps/chosen": -329.6327819824219, + "logps/rejected": -327.4980163574219, + "loss": 0.166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4789884388446808, + "rewards/margins": 2.7559492588043213, + "rewards/rejected": -3.234937906265259, + "step": 1567 + }, + { + "epoch": 0.18, + "learning_rate": 2.495376331499473e-07, + "logits/chosen": -2.1664676666259766, + "logits/rejected": -2.1482248306274414, + "logps/chosen": -195.79818725585938, + "logps/rejected": -296.3406982421875, + "loss": 0.8418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9111372232437134, + "rewards/margins": 1.2076343297958374, + "rewards/rejected": -2.118771553039551, + "step": 1568 + }, + { + "epoch": 0.18, + "learning_rate": 2.4950251668032307e-07, + "logits/chosen": -2.2394986152648926, + "logits/rejected": -2.34440541267395, + "logps/chosen": -274.90423583984375, + "logps/rejected": -230.89431762695312, + "loss": 0.7382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9322136640548706, + "rewards/margins": 1.049633502960205, + "rewards/rejected": -1.9818472862243652, + "step": 1569 + }, + { + "epoch": 0.18, + "learning_rate": 2.494674002106988e-07, + "logits/chosen": -2.6476640701293945, + "logits/rejected": -2.92221736907959, + "logps/chosen": -375.49102783203125, + "logps/rejected": -199.76919555664062, + "loss": 0.2792, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06325289607048035, + "rewards/margins": 2.0336685180664062, + "rewards/rejected": -1.9704155921936035, + "step": 1570 + }, + { + "epoch": 0.18, + "learning_rate": 2.494322837410745e-07, + "logits/chosen": -2.4153177738189697, + "logits/rejected": -2.6009716987609863, + "logps/chosen": -236.72479248046875, + "logps/rejected": -236.26397705078125, + "loss": 0.3475, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0892460346221924, + "rewards/margins": 2.4860308170318604, + "rewards/rejected": -3.5752768516540527, + "step": 1571 + }, + { + "epoch": 0.18, + "learning_rate": 2.493971672714503e-07, + "logits/chosen": -2.6196742057800293, + "logits/rejected": -2.7034215927124023, + "logps/chosen": -214.83251953125, + "logps/rejected": -221.11614990234375, + "loss": 0.4035, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9342131018638611, + "rewards/margins": 1.8506947755813599, + "rewards/rejected": -2.784907817840576, + "step": 1572 + }, + { + "epoch": 0.18, + "learning_rate": 2.4936205080182603e-07, + "logits/chosen": -2.655503273010254, + "logits/rejected": -2.601104736328125, + "logps/chosen": -168.76524353027344, + "logps/rejected": -321.79248046875, + "loss": 0.1837, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04222029447555542, + "rewards/margins": 2.645620822906494, + "rewards/rejected": -2.603400468826294, + "step": 1573 + }, + { + "epoch": 0.18, + "learning_rate": 2.493269343322018e-07, + "logits/chosen": -2.2997570037841797, + "logits/rejected": -2.5109736919403076, + "logps/chosen": -354.3572998046875, + "logps/rejected": -346.29693603515625, + "loss": 0.2653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37164074182510376, + "rewards/margins": 1.6210947036743164, + "rewards/rejected": -1.9927353858947754, + "step": 1574 + }, + { + "epoch": 0.18, + "learning_rate": 2.4929181786257754e-07, + "logits/chosen": -1.7473480701446533, + "logits/rejected": -2.0759286880493164, + "logps/chosen": -387.485107421875, + "logps/rejected": -275.6123352050781, + "loss": 0.86, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7933194637298584, + "rewards/margins": 0.6175913214683533, + "rewards/rejected": -2.4109108448028564, + "step": 1575 + }, + { + "epoch": 0.18, + "learning_rate": 2.492567013929533e-07, + "logits/chosen": -2.5610671043395996, + "logits/rejected": -2.767871141433716, + "logps/chosen": -317.3907165527344, + "logps/rejected": -288.82049560546875, + "loss": 0.5075, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3731324076652527, + "rewards/margins": 0.9910906553268433, + "rewards/rejected": -1.3642230033874512, + "step": 1576 + }, + { + "epoch": 0.18, + "learning_rate": 2.4922158492332905e-07, + "logits/chosen": -2.113171100616455, + "logits/rejected": -2.403785228729248, + "logps/chosen": -260.40887451171875, + "logps/rejected": -252.0282440185547, + "loss": 0.7102, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.15816330909729, + "rewards/margins": 1.6426575183868408, + "rewards/rejected": -2.80082106590271, + "step": 1577 + }, + { + "epoch": 0.18, + "learning_rate": 2.491864684537048e-07, + "logits/chosen": -2.4259884357452393, + "logits/rejected": -2.456664562225342, + "logps/chosen": -331.7919006347656, + "logps/rejected": -265.6513366699219, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6949257254600525, + "rewards/margins": 1.7441328763961792, + "rewards/rejected": -2.439058542251587, + "step": 1578 + }, + { + "epoch": 0.18, + "learning_rate": 2.491513519840805e-07, + "logits/chosen": -2.086388111114502, + "logits/rejected": -2.3958938121795654, + "logps/chosen": -185.11160278320312, + "logps/rejected": -286.8905029296875, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25552424788475037, + "rewards/margins": 4.072237968444824, + "rewards/rejected": -4.327762603759766, + "step": 1579 + }, + { + "epoch": 0.18, + "learning_rate": 2.4911623551445626e-07, + "logits/chosen": -2.5096664428710938, + "logits/rejected": -2.5874392986297607, + "logps/chosen": -397.56805419921875, + "logps/rejected": -307.50885009765625, + "loss": 0.4869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9852223992347717, + "rewards/margins": 1.5412932634353638, + "rewards/rejected": -2.5265157222747803, + "step": 1580 + }, + { + "epoch": 0.18, + "learning_rate": 2.49081119044832e-07, + "logits/chosen": -2.388885259628296, + "logits/rejected": -2.3426740169525146, + "logps/chosen": -150.44696044921875, + "logps/rejected": -267.1241455078125, + "loss": 0.2133, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6513890027999878, + "rewards/margins": 2.275169610977173, + "rewards/rejected": -2.92655873298645, + "step": 1581 + }, + { + "epoch": 0.18, + "learning_rate": 2.4904600257520777e-07, + "logits/chosen": -2.6207008361816406, + "logits/rejected": -2.6579458713531494, + "logps/chosen": -244.21644592285156, + "logps/rejected": -176.27476501464844, + "loss": 0.324, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.397479772567749, + "rewards/margins": 1.394016981124878, + "rewards/rejected": -2.791496753692627, + "step": 1582 + }, + { + "epoch": 0.18, + "learning_rate": 2.490108861055835e-07, + "logits/chosen": -1.691648006439209, + "logits/rejected": -2.059812545776367, + "logps/chosen": -471.63134765625, + "logps/rejected": -338.01531982421875, + "loss": 0.3765, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.65696120262146, + "rewards/margins": 2.03448486328125, + "rewards/rejected": -2.69144606590271, + "step": 1583 + }, + { + "epoch": 0.18, + "learning_rate": 2.489757696359592e-07, + "logits/chosen": -1.976773738861084, + "logits/rejected": -2.135854721069336, + "logps/chosen": -354.87896728515625, + "logps/rejected": -384.957275390625, + "loss": 0.5716, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1565277576446533, + "rewards/margins": 1.414727807044983, + "rewards/rejected": -2.571255683898926, + "step": 1584 + }, + { + "epoch": 0.18, + "learning_rate": 2.4894065316633503e-07, + "logits/chosen": -2.1526198387145996, + "logits/rejected": -2.2484657764434814, + "logps/chosen": -359.0385437011719, + "logps/rejected": -242.36534118652344, + "loss": 0.477, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9705810546875, + "rewards/margins": 1.7949731349945068, + "rewards/rejected": -3.765554189682007, + "step": 1585 + }, + { + "epoch": 0.18, + "learning_rate": 2.489055366967108e-07, + "logits/chosen": -1.8768374919891357, + "logits/rejected": -2.0421371459960938, + "logps/chosen": -365.1597595214844, + "logps/rejected": -302.6670837402344, + "loss": 0.4257, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6820964813232422, + "rewards/margins": 1.1083189249038696, + "rewards/rejected": -1.7904152870178223, + "step": 1586 + }, + { + "epoch": 0.18, + "learning_rate": 2.488704202270865e-07, + "logits/chosen": -2.251528263092041, + "logits/rejected": -2.328104019165039, + "logps/chosen": -338.1918029785156, + "logps/rejected": -309.84124755859375, + "loss": 0.492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8963527679443359, + "rewards/margins": 1.0317957401275635, + "rewards/rejected": -1.9281483888626099, + "step": 1587 + }, + { + "epoch": 0.18, + "learning_rate": 2.4883530375746224e-07, + "logits/chosen": -2.676071882247925, + "logits/rejected": -2.6901721954345703, + "logps/chosen": -395.193603515625, + "logps/rejected": -215.9188232421875, + "loss": 0.5894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9509929418563843, + "rewards/margins": 0.7755737900733948, + "rewards/rejected": -1.7265666723251343, + "step": 1588 + }, + { + "epoch": 0.18, + "learning_rate": 2.48800187287838e-07, + "logits/chosen": -2.5658817291259766, + "logits/rejected": -2.423936128616333, + "logps/chosen": -213.04466247558594, + "logps/rejected": -303.71722412109375, + "loss": 0.5295, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9586235880851746, + "rewards/margins": 1.2644596099853516, + "rewards/rejected": -2.223083257675171, + "step": 1589 + }, + { + "epoch": 0.18, + "learning_rate": 2.4876507081821374e-07, + "logits/chosen": -2.771980047225952, + "logits/rejected": -2.4057822227478027, + "logps/chosen": -227.24313354492188, + "logps/rejected": -332.2438659667969, + "loss": 0.1848, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2028518915176392, + "rewards/margins": 2.233734607696533, + "rewards/rejected": -3.436586380004883, + "step": 1590 + }, + { + "epoch": 0.18, + "learning_rate": 2.487299543485895e-07, + "logits/chosen": -2.125380754470825, + "logits/rejected": -2.257431745529175, + "logps/chosen": -389.59173583984375, + "logps/rejected": -269.22393798828125, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4245780110359192, + "rewards/margins": 2.151998519897461, + "rewards/rejected": -2.5765767097473145, + "step": 1591 + }, + { + "epoch": 0.18, + "learning_rate": 2.486948378789652e-07, + "logits/chosen": -2.4924890995025635, + "logits/rejected": -2.5220859050750732, + "logps/chosen": -228.2388458251953, + "logps/rejected": -250.18447875976562, + "loss": 0.4205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18530797958374023, + "rewards/margins": 0.9995994567871094, + "rewards/rejected": -1.18490731716156, + "step": 1592 + }, + { + "epoch": 0.18, + "learning_rate": 2.4865972140934095e-07, + "logits/chosen": -2.6848888397216797, + "logits/rejected": -2.812047004699707, + "logps/chosen": -463.3500061035156, + "logps/rejected": -323.1795654296875, + "loss": 0.3097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42052823305130005, + "rewards/margins": 1.8500845432281494, + "rewards/rejected": -2.270612955093384, + "step": 1593 + }, + { + "epoch": 0.18, + "learning_rate": 2.486246049397167e-07, + "logits/chosen": -2.011300802230835, + "logits/rejected": -1.8140312433242798, + "logps/chosen": -372.18804931640625, + "logps/rejected": -403.79510498046875, + "loss": 0.9463, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0452680587768555, + "rewards/margins": 0.2024253010749817, + "rewards/rejected": -1.2476933002471924, + "step": 1594 + }, + { + "epoch": 0.18, + "learning_rate": 2.4858948847009246e-07, + "logits/chosen": -2.1811537742614746, + "logits/rejected": -2.1072661876678467, + "logps/chosen": -206.28103637695312, + "logps/rejected": -239.5809326171875, + "loss": 0.3235, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38367336988449097, + "rewards/margins": 1.8679237365722656, + "rewards/rejected": -2.2515971660614014, + "step": 1595 + }, + { + "epoch": 0.18, + "learning_rate": 2.485543720004682e-07, + "logits/chosen": -2.529676675796509, + "logits/rejected": -2.434189796447754, + "logps/chosen": -98.12133026123047, + "logps/rejected": -134.86871337890625, + "loss": 0.3779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34668976068496704, + "rewards/margins": 1.984108328819275, + "rewards/rejected": -2.3307981491088867, + "step": 1596 + }, + { + "epoch": 0.18, + "learning_rate": 2.485192555308439e-07, + "logits/chosen": -2.0895235538482666, + "logits/rejected": -1.9475890398025513, + "logps/chosen": -248.64553833007812, + "logps/rejected": -192.42373657226562, + "loss": 0.3396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6281726360321045, + "rewards/margins": 1.3557543754577637, + "rewards/rejected": -1.9839270114898682, + "step": 1597 + }, + { + "epoch": 0.18, + "learning_rate": 2.484841390612197e-07, + "logits/chosen": -2.5610153675079346, + "logits/rejected": -2.6000680923461914, + "logps/chosen": -279.54046630859375, + "logps/rejected": -180.24600219726562, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1927214115858078, + "rewards/margins": 2.543883800506592, + "rewards/rejected": -2.736605167388916, + "step": 1598 + }, + { + "epoch": 0.18, + "learning_rate": 2.484490225915955e-07, + "logits/chosen": -2.536159038543701, + "logits/rejected": -2.3591384887695312, + "logps/chosen": -132.79864501953125, + "logps/rejected": -227.48828125, + "loss": 0.2252, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2503281831741333, + "rewards/margins": 2.7149477005004883, + "rewards/rejected": -2.4646191596984863, + "step": 1599 + }, + { + "epoch": 0.18, + "learning_rate": 2.484139061219712e-07, + "logits/chosen": -2.8721923828125, + "logits/rejected": -2.6675915718078613, + "logps/chosen": -242.64599609375, + "logps/rejected": -211.2529296875, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3560851812362671, + "rewards/margins": 1.8630754947662354, + "rewards/rejected": -2.219160556793213, + "step": 1600 + }, + { + "epoch": 0.18, + "learning_rate": 2.4837878965234693e-07, + "logits/chosen": -2.104865550994873, + "logits/rejected": -2.4444220066070557, + "logps/chosen": -334.0702819824219, + "logps/rejected": -240.02285766601562, + "loss": 0.1808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7663949728012085, + "rewards/margins": 2.1721458435058594, + "rewards/rejected": -2.9385409355163574, + "step": 1601 + }, + { + "epoch": 0.18, + "learning_rate": 2.483436731827227e-07, + "logits/chosen": -2.135657787322998, + "logits/rejected": -2.4569554328918457, + "logps/chosen": -355.5649108886719, + "logps/rejected": -290.55023193359375, + "loss": 0.6667, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7493431568145752, + "rewards/margins": 0.4642691910266876, + "rewards/rejected": -2.2136125564575195, + "step": 1602 + }, + { + "epoch": 0.18, + "learning_rate": 2.4830855671309844e-07, + "logits/chosen": -2.0864479541778564, + "logits/rejected": -2.5903849601745605, + "logps/chosen": -486.1943359375, + "logps/rejected": -193.07342529296875, + "loss": 0.281, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6871283650398254, + "rewards/margins": 2.4403343200683594, + "rewards/rejected": -3.127462863922119, + "step": 1603 + }, + { + "epoch": 0.18, + "learning_rate": 2.482734402434742e-07, + "logits/chosen": -2.300405979156494, + "logits/rejected": -2.33910870552063, + "logps/chosen": -353.6005859375, + "logps/rejected": -335.34478759765625, + "loss": 0.1309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08446812629699707, + "rewards/margins": 3.2242591381073, + "rewards/rejected": -3.308727264404297, + "step": 1604 + }, + { + "epoch": 0.19, + "learning_rate": 2.482383237738499e-07, + "logits/chosen": -2.4810831546783447, + "logits/rejected": -2.422015905380249, + "logps/chosen": -412.3768310546875, + "logps/rejected": -290.76544189453125, + "loss": 0.2855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7046505212783813, + "rewards/margins": 1.5903699398040771, + "rewards/rejected": -2.295020580291748, + "step": 1605 + }, + { + "epoch": 0.19, + "learning_rate": 2.4820320730422565e-07, + "logits/chosen": -2.6389353275299072, + "logits/rejected": -2.5060887336730957, + "logps/chosen": -136.4227294921875, + "logps/rejected": -209.56942749023438, + "loss": 0.2284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9015709757804871, + "rewards/margins": 1.7485371828079224, + "rewards/rejected": -2.6501083374023438, + "step": 1606 + }, + { + "epoch": 0.19, + "learning_rate": 2.4816809083460145e-07, + "logits/chosen": -2.6659352779388428, + "logits/rejected": -2.680722236633301, + "logps/chosen": -149.87326049804688, + "logps/rejected": -190.61463928222656, + "loss": 0.2795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7040975689888, + "rewards/margins": 1.6010167598724365, + "rewards/rejected": -2.305114507675171, + "step": 1607 + }, + { + "epoch": 0.19, + "learning_rate": 2.4813297436497715e-07, + "logits/chosen": -2.1014554500579834, + "logits/rejected": -2.350635528564453, + "logps/chosen": -176.55520629882812, + "logps/rejected": -231.7963409423828, + "loss": 0.8825, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3984493017196655, + "rewards/margins": 0.1927037090063095, + "rewards/rejected": -1.5911529064178467, + "step": 1608 + }, + { + "epoch": 0.19, + "learning_rate": 2.480978578953529e-07, + "logits/chosen": -1.9788249731063843, + "logits/rejected": -1.7486951351165771, + "logps/chosen": -313.07012939453125, + "logps/rejected": -381.09039306640625, + "loss": 0.355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7179294228553772, + "rewards/margins": 1.7483484745025635, + "rewards/rejected": -2.466277837753296, + "step": 1609 + }, + { + "epoch": 0.19, + "learning_rate": 2.4806274142572866e-07, + "logits/chosen": -2.2455368041992188, + "logits/rejected": -2.54561710357666, + "logps/chosen": -245.8131103515625, + "logps/rejected": -257.0814208984375, + "loss": 0.5108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8450612425804138, + "rewards/margins": 0.8502280712127686, + "rewards/rejected": -1.6952892541885376, + "step": 1610 + }, + { + "epoch": 0.19, + "learning_rate": 2.480276249561044e-07, + "logits/chosen": -1.8863894939422607, + "logits/rejected": -2.3769845962524414, + "logps/chosen": -457.47418212890625, + "logps/rejected": -264.5491943359375, + "loss": 0.8527, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.77010577917099, + "rewards/margins": -0.06987576186656952, + "rewards/rejected": -0.7002299427986145, + "step": 1611 + }, + { + "epoch": 0.19, + "learning_rate": 2.4799250848648017e-07, + "logits/chosen": -2.307905673980713, + "logits/rejected": -2.459299087524414, + "logps/chosen": -286.9676513671875, + "logps/rejected": -217.8809814453125, + "loss": 0.6284, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.362645149230957, + "rewards/margins": 0.513384997844696, + "rewards/rejected": -1.8760302066802979, + "step": 1612 + }, + { + "epoch": 0.19, + "learning_rate": 2.4795739201685587e-07, + "logits/chosen": -2.1472556591033936, + "logits/rejected": -2.6570193767547607, + "logps/chosen": -226.44393920898438, + "logps/rejected": -164.27615356445312, + "loss": 0.5428, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7657410502433777, + "rewards/margins": 1.232421875, + "rewards/rejected": -1.9981629848480225, + "step": 1613 + }, + { + "epoch": 0.19, + "learning_rate": 2.479222755472316e-07, + "logits/chosen": -2.5835893154144287, + "logits/rejected": -2.4753384590148926, + "logps/chosen": -311.63458251953125, + "logps/rejected": -209.61996459960938, + "loss": 0.5848, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0252050161361694, + "rewards/margins": 0.5399571657180786, + "rewards/rejected": -1.5651623010635376, + "step": 1614 + }, + { + "epoch": 0.19, + "learning_rate": 2.478871590776074e-07, + "logits/chosen": -2.365985155105591, + "logits/rejected": -2.410783052444458, + "logps/chosen": -319.0787353515625, + "logps/rejected": -245.58563232421875, + "loss": 0.4476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9494000673294067, + "rewards/margins": 1.712419033050537, + "rewards/rejected": -2.6618189811706543, + "step": 1615 + }, + { + "epoch": 0.19, + "learning_rate": 2.4785204260798313e-07, + "logits/chosen": -2.213170051574707, + "logits/rejected": -1.9422953128814697, + "logps/chosen": -137.2555389404297, + "logps/rejected": -255.68984985351562, + "loss": 0.4127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.053580932319164276, + "rewards/margins": 0.9333127737045288, + "rewards/rejected": -0.9868937134742737, + "step": 1616 + }, + { + "epoch": 0.19, + "learning_rate": 2.478169261383589e-07, + "logits/chosen": -2.4313442707061768, + "logits/rejected": -2.542314052581787, + "logps/chosen": -319.5057067871094, + "logps/rejected": -330.90313720703125, + "loss": 0.2738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25396060943603516, + "rewards/margins": 1.5020954608917236, + "rewards/rejected": -1.7560560703277588, + "step": 1617 + }, + { + "epoch": 0.19, + "learning_rate": 2.477818096687346e-07, + "logits/chosen": -2.5109364986419678, + "logits/rejected": -2.5238029956817627, + "logps/chosen": -273.1617431640625, + "logps/rejected": -269.08367919921875, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6901732087135315, + "rewards/margins": 3.2606537342071533, + "rewards/rejected": -3.950827121734619, + "step": 1618 + }, + { + "epoch": 0.19, + "learning_rate": 2.477466931991104e-07, + "logits/chosen": -2.3413519859313965, + "logits/rejected": -2.268707752227783, + "logps/chosen": -340.1884765625, + "logps/rejected": -310.6493225097656, + "loss": 0.2343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37714338302612305, + "rewards/margins": 1.7587429285049438, + "rewards/rejected": -2.1358864307403564, + "step": 1619 + }, + { + "epoch": 0.19, + "learning_rate": 2.4771157672948615e-07, + "logits/chosen": -2.683689594268799, + "logits/rejected": -2.4884133338928223, + "logps/chosen": -284.24298095703125, + "logps/rejected": -364.2023620605469, + "loss": 0.562, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3038368225097656, + "rewards/margins": 0.5899007320404053, + "rewards/rejected": -1.893737554550171, + "step": 1620 + }, + { + "epoch": 0.19, + "learning_rate": 2.4767646025986185e-07, + "logits/chosen": -2.4448297023773193, + "logits/rejected": -2.6226067543029785, + "logps/chosen": -194.15191650390625, + "logps/rejected": -219.8006591796875, + "loss": 0.6003, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.155550479888916, + "rewards/margins": 1.411703109741211, + "rewards/rejected": -2.567253589630127, + "step": 1621 + }, + { + "epoch": 0.19, + "learning_rate": 2.476413437902376e-07, + "logits/chosen": -2.3497695922851562, + "logits/rejected": -2.19209885597229, + "logps/chosen": -198.54660034179688, + "logps/rejected": -288.7807922363281, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6086780428886414, + "rewards/margins": 1.952029824256897, + "rewards/rejected": -2.5607080459594727, + "step": 1622 + }, + { + "epoch": 0.19, + "learning_rate": 2.4760622732061336e-07, + "logits/chosen": -2.402900457382202, + "logits/rejected": -2.2815380096435547, + "logps/chosen": -298.33489990234375, + "logps/rejected": -309.80419921875, + "loss": 0.6034, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4263073205947876, + "rewards/margins": 0.757486879825592, + "rewards/rejected": -2.1837942600250244, + "step": 1623 + }, + { + "epoch": 0.19, + "learning_rate": 2.475711108509891e-07, + "logits/chosen": -1.7035391330718994, + "logits/rejected": -2.052853584289551, + "logps/chosen": -417.5054626464844, + "logps/rejected": -296.0009765625, + "loss": 0.6941, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9532411694526672, + "rewards/margins": 0.5683322548866272, + "rewards/rejected": -1.521573543548584, + "step": 1624 + }, + { + "epoch": 0.19, + "learning_rate": 2.4753599438136486e-07, + "logits/chosen": -2.1668882369995117, + "logits/rejected": -2.253288745880127, + "logps/chosen": -339.59722900390625, + "logps/rejected": -287.0159912109375, + "loss": 0.4398, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5999407172203064, + "rewards/margins": 1.65914785861969, + "rewards/rejected": -2.2590885162353516, + "step": 1625 + }, + { + "epoch": 0.19, + "learning_rate": 2.4750087791174056e-07, + "logits/chosen": -2.375866651535034, + "logits/rejected": -2.506442070007324, + "logps/chosen": -206.1678466796875, + "logps/rejected": -271.0548400878906, + "loss": 0.4347, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5144889950752258, + "rewards/margins": 1.656104564666748, + "rewards/rejected": -2.170593500137329, + "step": 1626 + }, + { + "epoch": 0.19, + "learning_rate": 2.474657614421163e-07, + "logits/chosen": -2.3770291805267334, + "logits/rejected": -2.373950242996216, + "logps/chosen": -234.7476806640625, + "logps/rejected": -213.23580932617188, + "loss": 0.4487, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8171393275260925, + "rewards/margins": 1.5520994663238525, + "rewards/rejected": -2.36923885345459, + "step": 1627 + }, + { + "epoch": 0.19, + "learning_rate": 2.4743064497249207e-07, + "logits/chosen": -2.468428134918213, + "logits/rejected": -2.3722894191741943, + "logps/chosen": -124.58255004882812, + "logps/rejected": -338.41552734375, + "loss": 0.0869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3274645209312439, + "rewards/margins": 5.153946876525879, + "rewards/rejected": -5.481411457061768, + "step": 1628 + }, + { + "epoch": 0.19, + "learning_rate": 2.473955285028678e-07, + "logits/chosen": -2.7061808109283447, + "logits/rejected": -2.646496057510376, + "logps/chosen": -174.12887573242188, + "logps/rejected": -223.999267578125, + "loss": 0.435, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1718182563781738, + "rewards/margins": 1.578212022781372, + "rewards/rejected": -2.750030040740967, + "step": 1629 + }, + { + "epoch": 0.19, + "learning_rate": 2.473604120332436e-07, + "logits/chosen": -2.265134572982788, + "logits/rejected": -1.9445531368255615, + "logps/chosen": -225.20216369628906, + "logps/rejected": -207.21434020996094, + "loss": 0.3801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4328833818435669, + "rewards/margins": 0.9040175080299377, + "rewards/rejected": -1.3369008302688599, + "step": 1630 + }, + { + "epoch": 0.19, + "learning_rate": 2.4732529556361933e-07, + "logits/chosen": -2.107004165649414, + "logits/rejected": -2.3984975814819336, + "logps/chosen": -279.7205505371094, + "logps/rejected": -225.84042358398438, + "loss": 0.4453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9535292387008667, + "rewards/margins": 2.171532392501831, + "rewards/rejected": -3.1250617504119873, + "step": 1631 + }, + { + "epoch": 0.19, + "learning_rate": 2.472901790939951e-07, + "logits/chosen": -3.106919527053833, + "logits/rejected": -2.9135146141052246, + "logps/chosen": -244.0538330078125, + "logps/rejected": -188.32647705078125, + "loss": 0.5141, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7566425800323486, + "rewards/margins": 0.9003368020057678, + "rewards/rejected": -2.656979560852051, + "step": 1632 + }, + { + "epoch": 0.19, + "learning_rate": 2.4725506262437084e-07, + "logits/chosen": -2.629652976989746, + "logits/rejected": -2.6703238487243652, + "logps/chosen": -129.33877563476562, + "logps/rejected": -155.73577880859375, + "loss": 0.7649, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.235552191734314, + "rewards/margins": 0.7950652837753296, + "rewards/rejected": -2.0306172370910645, + "step": 1633 + }, + { + "epoch": 0.19, + "learning_rate": 2.4721994615474654e-07, + "logits/chosen": -1.6399822235107422, + "logits/rejected": -2.1640777587890625, + "logps/chosen": -331.4449768066406, + "logps/rejected": -155.2622833251953, + "loss": 0.4002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26821720600128174, + "rewards/margins": 0.9701189994812012, + "rewards/rejected": -1.238336205482483, + "step": 1634 + }, + { + "epoch": 0.19, + "learning_rate": 2.471848296851223e-07, + "logits/chosen": -2.2290070056915283, + "logits/rejected": -2.360722541809082, + "logps/chosen": -402.66485595703125, + "logps/rejected": -379.3473205566406, + "loss": 0.3349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1091859340667725, + "rewards/margins": 2.020179510116577, + "rewards/rejected": -3.1293654441833496, + "step": 1635 + }, + { + "epoch": 0.19, + "learning_rate": 2.4714971321549805e-07, + "logits/chosen": -2.7512502670288086, + "logits/rejected": -2.6447699069976807, + "logps/chosen": -174.3161163330078, + "logps/rejected": -169.1768798828125, + "loss": 0.3793, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0038352906703948975, + "rewards/margins": 2.335407018661499, + "rewards/rejected": -2.331571578979492, + "step": 1636 + }, + { + "epoch": 0.19, + "learning_rate": 2.471145967458738e-07, + "logits/chosen": -2.4241299629211426, + "logits/rejected": -2.418266773223877, + "logps/chosen": -180.0232696533203, + "logps/rejected": -256.8774108886719, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4957810044288635, + "rewards/margins": 2.536133050918579, + "rewards/rejected": -3.031913995742798, + "step": 1637 + }, + { + "epoch": 0.19, + "learning_rate": 2.4707948027624956e-07, + "logits/chosen": -2.5167899131774902, + "logits/rejected": -2.5759501457214355, + "logps/chosen": -354.9949951171875, + "logps/rejected": -259.24237060546875, + "loss": 0.6138, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9498881101608276, + "rewards/margins": 0.4558666944503784, + "rewards/rejected": -1.405754804611206, + "step": 1638 + }, + { + "epoch": 0.19, + "learning_rate": 2.470443638066253e-07, + "logits/chosen": -2.051009178161621, + "logits/rejected": -2.0049567222595215, + "logps/chosen": -175.4530029296875, + "logps/rejected": -199.77554321289062, + "loss": 0.3876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8560108542442322, + "rewards/margins": 0.9264050126075745, + "rewards/rejected": -1.7824158668518066, + "step": 1639 + }, + { + "epoch": 0.19, + "learning_rate": 2.47009247337001e-07, + "logits/chosen": -1.9231789112091064, + "logits/rejected": -1.8469712734222412, + "logps/chosen": -267.821044921875, + "logps/rejected": -348.5379943847656, + "loss": 0.4755, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6886563301086426, + "rewards/margins": 1.3950438499450684, + "rewards/rejected": -3.083700180053711, + "step": 1640 + }, + { + "epoch": 0.19, + "learning_rate": 2.469741308673768e-07, + "logits/chosen": -2.437217950820923, + "logits/rejected": -2.237262725830078, + "logps/chosen": -311.115966796875, + "logps/rejected": -285.3014221191406, + "loss": 0.5621, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27061524987220764, + "rewards/margins": 1.5207929611206055, + "rewards/rejected": -1.7914083003997803, + "step": 1641 + }, + { + "epoch": 0.19, + "learning_rate": 2.469390143977525e-07, + "logits/chosen": -2.366332769393921, + "logits/rejected": -2.2427303791046143, + "logps/chosen": -409.5748291015625, + "logps/rejected": -369.886962890625, + "loss": 1.1083, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1697607040405273, + "rewards/margins": 0.9381856322288513, + "rewards/rejected": -3.1079463958740234, + "step": 1642 + }, + { + "epoch": 0.19, + "learning_rate": 2.469038979281283e-07, + "logits/chosen": -2.3992550373077393, + "logits/rejected": -2.7268521785736084, + "logps/chosen": -422.0608215332031, + "logps/rejected": -255.6181640625, + "loss": 0.1625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9723900556564331, + "rewards/margins": 2.744877338409424, + "rewards/rejected": -3.7172675132751465, + "step": 1643 + }, + { + "epoch": 0.19, + "learning_rate": 2.4686878145850403e-07, + "logits/chosen": -2.4404866695404053, + "logits/rejected": -2.3518965244293213, + "logps/chosen": -231.1533660888672, + "logps/rejected": -272.1662292480469, + "loss": 0.3772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24092474579811096, + "rewards/margins": 1.1074117422103882, + "rewards/rejected": -1.3483364582061768, + "step": 1644 + }, + { + "epoch": 0.19, + "learning_rate": 2.468336649888798e-07, + "logits/chosen": -1.5965030193328857, + "logits/rejected": -1.5142111778259277, + "logps/chosen": -493.17950439453125, + "logps/rejected": -393.5079650878906, + "loss": 0.4013, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2134250402450562, + "rewards/margins": 1.2532627582550049, + "rewards/rejected": -2.4666879177093506, + "step": 1645 + }, + { + "epoch": 0.19, + "learning_rate": 2.4679854851925554e-07, + "logits/chosen": -2.294180154800415, + "logits/rejected": -2.1904845237731934, + "logps/chosen": -326.2323913574219, + "logps/rejected": -360.0575256347656, + "loss": 0.5155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4195312261581421, + "rewards/margins": 0.9031838774681091, + "rewards/rejected": -1.3227150440216064, + "step": 1646 + }, + { + "epoch": 0.19, + "learning_rate": 2.467634320496313e-07, + "logits/chosen": -2.3005247116088867, + "logits/rejected": -2.6660897731781006, + "logps/chosen": -379.7894592285156, + "logps/rejected": -287.7015380859375, + "loss": 0.2971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.961382269859314, + "rewards/margins": 1.8994444608688354, + "rewards/rejected": -2.8608269691467285, + "step": 1647 + }, + { + "epoch": 0.19, + "learning_rate": 2.46728315580007e-07, + "logits/chosen": -2.737966537475586, + "logits/rejected": -2.6705100536346436, + "logps/chosen": -323.8995361328125, + "logps/rejected": -298.1430358886719, + "loss": 0.0973, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0068901777267456, + "rewards/margins": 3.079782009124756, + "rewards/rejected": -4.086671829223633, + "step": 1648 + }, + { + "epoch": 0.19, + "learning_rate": 2.4669319911038274e-07, + "logits/chosen": -1.8747671842575073, + "logits/rejected": -2.0971555709838867, + "logps/chosen": -386.82257080078125, + "logps/rejected": -269.41461181640625, + "loss": 0.4022, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9647752642631531, + "rewards/margins": 1.5528807640075684, + "rewards/rejected": -2.517655849456787, + "step": 1649 + }, + { + "epoch": 0.19, + "learning_rate": 2.466580826407585e-07, + "logits/chosen": -1.8551548719406128, + "logits/rejected": -2.0948874950408936, + "logps/chosen": -352.95953369140625, + "logps/rejected": -254.83905029296875, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21497753262519836, + "rewards/margins": 2.153275489807129, + "rewards/rejected": -2.368252992630005, + "step": 1650 + }, + { + "epoch": 0.19, + "learning_rate": 2.4662296617113425e-07, + "logits/chosen": -2.5001089572906494, + "logits/rejected": -2.4687538146972656, + "logps/chosen": -192.6848907470703, + "logps/rejected": -295.28704833984375, + "loss": 0.5149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5082302093505859, + "rewards/margins": 2.6597609519958496, + "rewards/rejected": -3.1679911613464355, + "step": 1651 + }, + { + "epoch": 0.19, + "learning_rate": 2.4658784970151e-07, + "logits/chosen": -1.9616495370864868, + "logits/rejected": -1.873363971710205, + "logps/chosen": -388.7721252441406, + "logps/rejected": -351.1454772949219, + "loss": 0.4009, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35772523283958435, + "rewards/margins": 1.2997812032699585, + "rewards/rejected": -1.6575064659118652, + "step": 1652 + }, + { + "epoch": 0.19, + "learning_rate": 2.4655273323188576e-07, + "logits/chosen": -2.4057974815368652, + "logits/rejected": -2.354614496231079, + "logps/chosen": -194.2860107421875, + "logps/rejected": -222.1971893310547, + "loss": 0.2027, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1374562531709671, + "rewards/margins": 2.522706985473633, + "rewards/rejected": -2.660163164138794, + "step": 1653 + }, + { + "epoch": 0.19, + "learning_rate": 2.465176167622615e-07, + "logits/chosen": -2.612335681915283, + "logits/rejected": -2.747063636779785, + "logps/chosen": -393.3033142089844, + "logps/rejected": -293.7920837402344, + "loss": 0.4666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4444802701473236, + "rewards/margins": 1.4901174306869507, + "rewards/rejected": -1.9345977306365967, + "step": 1654 + }, + { + "epoch": 0.19, + "learning_rate": 2.4648250029263727e-07, + "logits/chosen": -1.7948569059371948, + "logits/rejected": -2.0631155967712402, + "logps/chosen": -416.0630187988281, + "logps/rejected": -444.4719543457031, + "loss": 0.6783, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1603527069091797, + "rewards/margins": 0.4352602958679199, + "rewards/rejected": -1.5956127643585205, + "step": 1655 + }, + { + "epoch": 0.19, + "learning_rate": 2.4644738382301297e-07, + "logits/chosen": -1.9643170833587646, + "logits/rejected": -2.0727524757385254, + "logps/chosen": -441.3453674316406, + "logps/rejected": -350.16876220703125, + "loss": 0.6444, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4236165285110474, + "rewards/margins": 0.6577773690223694, + "rewards/rejected": -2.0813939571380615, + "step": 1656 + }, + { + "epoch": 0.19, + "learning_rate": 2.464122673533887e-07, + "logits/chosen": -2.7060413360595703, + "logits/rejected": -2.7931885719299316, + "logps/chosen": -317.31280517578125, + "logps/rejected": -256.8948059082031, + "loss": 0.398, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3795194625854492, + "rewards/margins": 1.7859705686569214, + "rewards/rejected": -3.165489912033081, + "step": 1657 + }, + { + "epoch": 0.19, + "learning_rate": 2.463771508837645e-07, + "logits/chosen": -1.530039668083191, + "logits/rejected": -1.5539894104003906, + "logps/chosen": -265.99200439453125, + "logps/rejected": -326.17510986328125, + "loss": 1.0694, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2524030208587646, + "rewards/margins": 0.9094754457473755, + "rewards/rejected": -3.1618785858154297, + "step": 1658 + }, + { + "epoch": 0.19, + "learning_rate": 2.4634203441414023e-07, + "logits/chosen": -2.7942750453948975, + "logits/rejected": -2.7807302474975586, + "logps/chosen": -243.26585388183594, + "logps/rejected": -213.7657470703125, + "loss": 0.4224, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1798081398010254, + "rewards/margins": 1.6476800441741943, + "rewards/rejected": -2.8274879455566406, + "step": 1659 + }, + { + "epoch": 0.19, + "learning_rate": 2.46306917944516e-07, + "logits/chosen": -2.199354648590088, + "logits/rejected": -2.3158462047576904, + "logps/chosen": -387.3917236328125, + "logps/rejected": -271.7964172363281, + "loss": 0.3554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6297986507415771, + "rewards/margins": 1.6133058071136475, + "rewards/rejected": -2.2431044578552246, + "step": 1660 + }, + { + "epoch": 0.19, + "learning_rate": 2.462718014748917e-07, + "logits/chosen": -2.0115151405334473, + "logits/rejected": -2.003386974334717, + "logps/chosen": -323.5269775390625, + "logps/rejected": -286.18756103515625, + "loss": 0.4396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4327174723148346, + "rewards/margins": 1.3804211616516113, + "rewards/rejected": -1.813138723373413, + "step": 1661 + }, + { + "epoch": 0.19, + "learning_rate": 2.4623668500526744e-07, + "logits/chosen": -1.7074058055877686, + "logits/rejected": -1.71212637424469, + "logps/chosen": -246.99436950683594, + "logps/rejected": -240.94583129882812, + "loss": 0.702, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7788909077644348, + "rewards/margins": 0.38739579916000366, + "rewards/rejected": -1.1662867069244385, + "step": 1662 + }, + { + "epoch": 0.19, + "learning_rate": 2.462015685356432e-07, + "logits/chosen": -2.003859519958496, + "logits/rejected": -1.9894918203353882, + "logps/chosen": -369.6249084472656, + "logps/rejected": -206.77235412597656, + "loss": 0.3448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2745112180709839, + "rewards/margins": 1.0475661754608154, + "rewards/rejected": -1.3220775127410889, + "step": 1663 + }, + { + "epoch": 0.19, + "learning_rate": 2.4616645206601895e-07, + "logits/chosen": -2.4101474285125732, + "logits/rejected": -2.706096887588501, + "logps/chosen": -550.04736328125, + "logps/rejected": -355.16021728515625, + "loss": 0.1456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5533633828163147, + "rewards/margins": 2.5719263553619385, + "rewards/rejected": -3.1252896785736084, + "step": 1664 + }, + { + "epoch": 0.19, + "learning_rate": 2.461313355963947e-07, + "logits/chosen": -2.4041643142700195, + "logits/rejected": -2.5563316345214844, + "logps/chosen": -231.99383544921875, + "logps/rejected": -166.5591278076172, + "loss": 0.456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37038999795913696, + "rewards/margins": 1.3655312061309814, + "rewards/rejected": -1.7359211444854736, + "step": 1665 + }, + { + "epoch": 0.19, + "learning_rate": 2.4609621912677045e-07, + "logits/chosen": -2.3210535049438477, + "logits/rejected": -2.395526170730591, + "logps/chosen": -286.67852783203125, + "logps/rejected": -188.63861083984375, + "loss": 0.6068, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6769633293151855, + "rewards/margins": 0.6684436202049255, + "rewards/rejected": -2.345407247543335, + "step": 1666 + }, + { + "epoch": 0.19, + "learning_rate": 2.460611026571462e-07, + "logits/chosen": -2.4113802909851074, + "logits/rejected": -2.4127159118652344, + "logps/chosen": -301.1612548828125, + "logps/rejected": -206.12432861328125, + "loss": 0.4788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8137022256851196, + "rewards/margins": 0.8798563480377197, + "rewards/rejected": -1.693558692932129, + "step": 1667 + }, + { + "epoch": 0.19, + "learning_rate": 2.4602598618752196e-07, + "logits/chosen": -1.8898708820343018, + "logits/rejected": -2.028468132019043, + "logps/chosen": -200.9937744140625, + "logps/rejected": -194.676025390625, + "loss": 0.4036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6696151494979858, + "rewards/margins": 1.2188224792480469, + "rewards/rejected": -1.8884377479553223, + "step": 1668 + }, + { + "epoch": 0.19, + "learning_rate": 2.4599086971789766e-07, + "logits/chosen": -2.2706053256988525, + "logits/rejected": -2.3401026725769043, + "logps/chosen": -343.4836120605469, + "logps/rejected": -358.9771423339844, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6574480533599854, + "rewards/margins": 1.6647543907165527, + "rewards/rejected": -2.322202444076538, + "step": 1669 + }, + { + "epoch": 0.19, + "learning_rate": 2.459557532482734e-07, + "logits/chosen": -2.2114243507385254, + "logits/rejected": -2.175487995147705, + "logps/chosen": -300.85186767578125, + "logps/rejected": -396.8050537109375, + "loss": 0.3669, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7905483841896057, + "rewards/margins": 2.3121018409729004, + "rewards/rejected": -3.1026501655578613, + "step": 1670 + }, + { + "epoch": 0.19, + "learning_rate": 2.4592063677864917e-07, + "logits/chosen": -2.6457266807556152, + "logits/rejected": -2.604395627975464, + "logps/chosen": -186.39077758789062, + "logps/rejected": -147.95993041992188, + "loss": 0.9878, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4130977392196655, + "rewards/margins": 0.31521332263946533, + "rewards/rejected": -1.7283110618591309, + "step": 1671 + }, + { + "epoch": 0.19, + "learning_rate": 2.458855203090249e-07, + "logits/chosen": -1.8430728912353516, + "logits/rejected": -1.8018715381622314, + "logps/chosen": -365.3834228515625, + "logps/rejected": -340.943603515625, + "loss": 0.6687, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.241715431213379, + "rewards/margins": 0.8190874457359314, + "rewards/rejected": -2.060802936553955, + "step": 1672 + }, + { + "epoch": 0.19, + "learning_rate": 2.458504038394007e-07, + "logits/chosen": -2.2198641300201416, + "logits/rejected": -2.1186063289642334, + "logps/chosen": -273.84844970703125, + "logps/rejected": -314.7901306152344, + "loss": 0.6637, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.571068048477173, + "rewards/margins": 0.4904053211212158, + "rewards/rejected": -3.0614731311798096, + "step": 1673 + }, + { + "epoch": 0.19, + "learning_rate": 2.458152873697764e-07, + "logits/chosen": -1.8776648044586182, + "logits/rejected": -2.2868847846984863, + "logps/chosen": -441.974365234375, + "logps/rejected": -324.28826904296875, + "loss": 0.3192, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24136623740196228, + "rewards/margins": 1.2856117486953735, + "rewards/rejected": -1.0442456007003784, + "step": 1674 + }, + { + "epoch": 0.19, + "learning_rate": 2.457801709001522e-07, + "logits/chosen": -2.0283474922180176, + "logits/rejected": -2.1645395755767822, + "logps/chosen": -504.73822021484375, + "logps/rejected": -357.3897705078125, + "loss": 0.3635, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3307660222053528, + "rewards/margins": 1.7391588687896729, + "rewards/rejected": -2.06992506980896, + "step": 1675 + }, + { + "epoch": 0.19, + "learning_rate": 2.4574505443052794e-07, + "logits/chosen": -2.1316652297973633, + "logits/rejected": -2.4404311180114746, + "logps/chosen": -405.3109130859375, + "logps/rejected": -321.13787841796875, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6755092144012451, + "rewards/margins": 1.8010756969451904, + "rewards/rejected": -2.4765849113464355, + "step": 1676 + }, + { + "epoch": 0.19, + "learning_rate": 2.4570993796090364e-07, + "logits/chosen": -2.8500266075134277, + "logits/rejected": -2.64805269241333, + "logps/chosen": -181.67950439453125, + "logps/rejected": -184.87022399902344, + "loss": 0.4843, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6696137189865112, + "rewards/margins": 1.1002678871154785, + "rewards/rejected": -1.7698814868927002, + "step": 1677 + }, + { + "epoch": 0.19, + "learning_rate": 2.456748214912794e-07, + "logits/chosen": -1.7793786525726318, + "logits/rejected": -2.0320913791656494, + "logps/chosen": -358.8062438964844, + "logps/rejected": -249.18511962890625, + "loss": 0.4995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6655104756355286, + "rewards/margins": 0.8143996000289917, + "rewards/rejected": -1.479910135269165, + "step": 1678 + }, + { + "epoch": 0.19, + "learning_rate": 2.4563970502165515e-07, + "logits/chosen": -2.1867642402648926, + "logits/rejected": -2.208285093307495, + "logps/chosen": -410.2413024902344, + "logps/rejected": -296.1038818359375, + "loss": 0.7746, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5434332489967346, + "rewards/margins": 0.7795232534408569, + "rewards/rejected": -1.3229565620422363, + "step": 1679 + }, + { + "epoch": 0.19, + "learning_rate": 2.456045885520309e-07, + "logits/chosen": -2.537358045578003, + "logits/rejected": -2.4376025199890137, + "logps/chosen": -115.1676025390625, + "logps/rejected": -191.4144287109375, + "loss": 0.2856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7194069027900696, + "rewards/margins": 1.9491679668426514, + "rewards/rejected": -2.668574810028076, + "step": 1680 + }, + { + "epoch": 0.19, + "learning_rate": 2.4556947208240666e-07, + "logits/chosen": -2.40211820602417, + "logits/rejected": -2.311521530151367, + "logps/chosen": -392.0479736328125, + "logps/rejected": -306.52264404296875, + "loss": 0.772, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2102307081222534, + "rewards/margins": 1.8760223388671875, + "rewards/rejected": -3.0862529277801514, + "step": 1681 + }, + { + "epoch": 0.19, + "learning_rate": 2.4553435561278236e-07, + "logits/chosen": -1.809664011001587, + "logits/rejected": -2.187422037124634, + "logps/chosen": -359.3562927246094, + "logps/rejected": -247.45068359375, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27084076404571533, + "rewards/margins": 2.594494342803955, + "rewards/rejected": -2.865334987640381, + "step": 1682 + }, + { + "epoch": 0.19, + "learning_rate": 2.454992391431581e-07, + "logits/chosen": -2.450502395629883, + "logits/rejected": -2.2425665855407715, + "logps/chosen": -339.5242614746094, + "logps/rejected": -400.39166259765625, + "loss": 1.2193, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.2639663219451904, + "rewards/margins": -0.4978523254394531, + "rewards/rejected": -0.7661139965057373, + "step": 1683 + }, + { + "epoch": 0.19, + "learning_rate": 2.4546412267353386e-07, + "logits/chosen": -2.056119918823242, + "logits/rejected": -1.7343114614486694, + "logps/chosen": -190.46487426757812, + "logps/rejected": -390.5182800292969, + "loss": 0.4726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9539757370948792, + "rewards/margins": 2.0407700538635254, + "rewards/rejected": -2.9947457313537598, + "step": 1684 + }, + { + "epoch": 0.19, + "learning_rate": 2.454290062039096e-07, + "logits/chosen": -2.293154239654541, + "logits/rejected": -2.4260339736938477, + "logps/chosen": -331.97918701171875, + "logps/rejected": -294.1297302246094, + "loss": 0.3738, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25170791149139404, + "rewards/margins": 1.8995006084442139, + "rewards/rejected": -2.1512086391448975, + "step": 1685 + }, + { + "epoch": 0.19, + "learning_rate": 2.4539388973428537e-07, + "logits/chosen": -2.119298219680786, + "logits/rejected": -1.8990097045898438, + "logps/chosen": -310.5208740234375, + "logps/rejected": -344.4461669921875, + "loss": 0.5134, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3304263353347778, + "rewards/margins": 1.0611884593963623, + "rewards/rejected": -2.3916149139404297, + "step": 1686 + }, + { + "epoch": 0.19, + "learning_rate": 2.453587732646611e-07, + "logits/chosen": -2.365549325942993, + "logits/rejected": -2.449434280395508, + "logps/chosen": -296.79461669921875, + "logps/rejected": -286.59185791015625, + "loss": 0.2708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2956508696079254, + "rewards/margins": 1.580264925956726, + "rewards/rejected": -1.875915765762329, + "step": 1687 + }, + { + "epoch": 0.19, + "learning_rate": 2.453236567950369e-07, + "logits/chosen": -2.8984904289245605, + "logits/rejected": -2.823639392852783, + "logps/chosen": -291.45184326171875, + "logps/rejected": -296.52197265625, + "loss": 0.235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42719876766204834, + "rewards/margins": 2.465261936187744, + "rewards/rejected": -2.892460823059082, + "step": 1688 + }, + { + "epoch": 0.19, + "learning_rate": 2.4528854032541263e-07, + "logits/chosen": -2.241706371307373, + "logits/rejected": -2.393019199371338, + "logps/chosen": -326.21881103515625, + "logps/rejected": -234.22959899902344, + "loss": 0.2976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020430788397789, + "rewards/margins": 1.7605199813842773, + "rewards/rejected": -1.7809507846832275, + "step": 1689 + }, + { + "epoch": 0.19, + "learning_rate": 2.4525342385578833e-07, + "logits/chosen": -2.6290791034698486, + "logits/rejected": -2.5110068321228027, + "logps/chosen": -205.25506591796875, + "logps/rejected": -248.4744873046875, + "loss": 0.2367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3906041979789734, + "rewards/margins": 3.242579460144043, + "rewards/rejected": -3.633183717727661, + "step": 1690 + }, + { + "epoch": 0.19, + "learning_rate": 2.452183073861641e-07, + "logits/chosen": -1.9336727857589722, + "logits/rejected": -2.296276569366455, + "logps/chosen": -226.8034210205078, + "logps/rejected": -195.668701171875, + "loss": 0.7165, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7115800380706787, + "rewards/margins": 1.729042410850525, + "rewards/rejected": -3.4406228065490723, + "step": 1691 + }, + { + "epoch": 0.2, + "learning_rate": 2.4518319091653984e-07, + "logits/chosen": -2.6071548461914062, + "logits/rejected": -2.4399938583374023, + "logps/chosen": -444.005126953125, + "logps/rejected": -384.8004150390625, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8279662132263184, + "rewards/margins": 2.580198287963867, + "rewards/rejected": -3.4081645011901855, + "step": 1692 + }, + { + "epoch": 0.2, + "learning_rate": 2.451480744469156e-07, + "logits/chosen": -2.1144001483917236, + "logits/rejected": -2.3097872734069824, + "logps/chosen": -329.2967834472656, + "logps/rejected": -183.11183166503906, + "loss": 0.2642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1214052140712738, + "rewards/margins": 1.9378547668457031, + "rewards/rejected": -2.0592598915100098, + "step": 1693 + }, + { + "epoch": 0.2, + "learning_rate": 2.4511295797729135e-07, + "logits/chosen": -2.701531410217285, + "logits/rejected": -2.6456167697906494, + "logps/chosen": -168.7894287109375, + "logps/rejected": -176.9418487548828, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5063639879226685, + "rewards/margins": 1.5046662092208862, + "rewards/rejected": -2.0110301971435547, + "step": 1694 + }, + { + "epoch": 0.2, + "learning_rate": 2.4507784150766705e-07, + "logits/chosen": -2.779634952545166, + "logits/rejected": -2.784498453140259, + "logps/chosen": -252.96893310546875, + "logps/rejected": -333.27056884765625, + "loss": 0.2231, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.037149667739868164, + "rewards/margins": 2.9898862838745117, + "rewards/rejected": -2.9527366161346436, + "step": 1695 + }, + { + "epoch": 0.2, + "learning_rate": 2.450427250380428e-07, + "logits/chosen": -2.538682460784912, + "logits/rejected": -2.5000290870666504, + "logps/chosen": -152.70806884765625, + "logps/rejected": -197.42250061035156, + "loss": 0.2224, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06885167211294174, + "rewards/margins": 2.3313193321228027, + "rewards/rejected": -2.262467622756958, + "step": 1696 + }, + { + "epoch": 0.2, + "learning_rate": 2.450076085684186e-07, + "logits/chosen": -1.9589228630065918, + "logits/rejected": -2.333251953125, + "logps/chosen": -363.0921630859375, + "logps/rejected": -320.5115966796875, + "loss": 0.4432, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9461827278137207, + "rewards/margins": 1.8873931169509888, + "rewards/rejected": -3.833575963973999, + "step": 1697 + }, + { + "epoch": 0.2, + "learning_rate": 2.449724920987943e-07, + "logits/chosen": -2.0397963523864746, + "logits/rejected": -2.2200992107391357, + "logps/chosen": -216.2469482421875, + "logps/rejected": -168.41531372070312, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2252253293991089, + "rewards/margins": 0.7076501846313477, + "rewards/rejected": -1.9328755140304565, + "step": 1698 + }, + { + "epoch": 0.2, + "learning_rate": 2.4493737562917007e-07, + "logits/chosen": -2.7151670455932617, + "logits/rejected": -2.770413637161255, + "logps/chosen": -360.1499938964844, + "logps/rejected": -215.9287567138672, + "loss": 0.3984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.55147385597229, + "rewards/margins": 1.3876838684082031, + "rewards/rejected": -1.9391577243804932, + "step": 1699 + }, + { + "epoch": 0.2, + "learning_rate": 2.449022591595458e-07, + "logits/chosen": -2.6553688049316406, + "logits/rejected": -2.7054600715637207, + "logps/chosen": -131.82205200195312, + "logps/rejected": -139.0560302734375, + "loss": 0.4321, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3292228877544403, + "rewards/margins": 1.2753883600234985, + "rewards/rejected": -1.6046112775802612, + "step": 1700 + }, + { + "epoch": 0.2, + "learning_rate": 2.448671426899216e-07, + "logits/chosen": -2.59659481048584, + "logits/rejected": -2.5668022632598877, + "logps/chosen": -280.8060302734375, + "logps/rejected": -286.406494140625, + "loss": 0.5089, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7263594269752502, + "rewards/margins": 1.6281771659851074, + "rewards/rejected": -2.354536533355713, + "step": 1701 + }, + { + "epoch": 0.2, + "learning_rate": 2.4483202622029733e-07, + "logits/chosen": -2.371030569076538, + "logits/rejected": -2.2981457710266113, + "logps/chosen": -121.21649169921875, + "logps/rejected": -196.2156982421875, + "loss": 0.3651, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4020608067512512, + "rewards/margins": 1.2781299352645874, + "rewards/rejected": -1.6801908016204834, + "step": 1702 + }, + { + "epoch": 0.2, + "learning_rate": 2.4479690975067303e-07, + "logits/chosen": -2.2784030437469482, + "logits/rejected": -2.114086627960205, + "logps/chosen": -310.0711669921875, + "logps/rejected": -293.06414794921875, + "loss": 0.4216, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7252213358879089, + "rewards/margins": 1.0779478549957275, + "rewards/rejected": -1.8031691312789917, + "step": 1703 + }, + { + "epoch": 0.2, + "learning_rate": 2.447617932810488e-07, + "logits/chosen": -1.822288990020752, + "logits/rejected": -2.081564426422119, + "logps/chosen": -342.31597900390625, + "logps/rejected": -299.51861572265625, + "loss": 0.159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5357794165611267, + "rewards/margins": 3.38871693611145, + "rewards/rejected": -3.9244964122772217, + "step": 1704 + }, + { + "epoch": 0.2, + "learning_rate": 2.4472667681142454e-07, + "logits/chosen": -2.0031228065490723, + "logits/rejected": -2.257960557937622, + "logps/chosen": -394.279541015625, + "logps/rejected": -215.11349487304688, + "loss": 0.5724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9167016744613647, + "rewards/margins": 1.2031726837158203, + "rewards/rejected": -2.1198744773864746, + "step": 1705 + }, + { + "epoch": 0.2, + "learning_rate": 2.446915603418003e-07, + "logits/chosen": -2.0385193824768066, + "logits/rejected": -2.1922261714935303, + "logps/chosen": -314.12939453125, + "logps/rejected": -358.5167236328125, + "loss": 0.3373, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.026443377137184143, + "rewards/margins": 2.665680170059204, + "rewards/rejected": -2.6392369270324707, + "step": 1706 + }, + { + "epoch": 0.2, + "learning_rate": 2.4465644387217604e-07, + "logits/chosen": -2.48319935798645, + "logits/rejected": -2.356574296951294, + "logps/chosen": -177.25210571289062, + "logps/rejected": -174.63021850585938, + "loss": 0.4273, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2354623079299927, + "rewards/margins": 1.5407429933547974, + "rewards/rejected": -2.776205539703369, + "step": 1707 + }, + { + "epoch": 0.2, + "learning_rate": 2.4462132740255175e-07, + "logits/chosen": -2.605435609817505, + "logits/rejected": -2.727900743484497, + "logps/chosen": -318.33856201171875, + "logps/rejected": -280.92486572265625, + "loss": 0.4942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9802804589271545, + "rewards/margins": 1.4560797214508057, + "rewards/rejected": -2.4363598823547363, + "step": 1708 + }, + { + "epoch": 0.2, + "learning_rate": 2.4458621093292755e-07, + "logits/chosen": -2.378760814666748, + "logits/rejected": -2.501760959625244, + "logps/chosen": -329.2039489746094, + "logps/rejected": -262.5628662109375, + "loss": 0.2287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4294587969779968, + "rewards/margins": 3.4281249046325684, + "rewards/rejected": -3.857583999633789, + "step": 1709 + }, + { + "epoch": 0.2, + "learning_rate": 2.445510944633033e-07, + "logits/chosen": -2.5029690265655518, + "logits/rejected": -2.720283031463623, + "logps/chosen": -310.1170959472656, + "logps/rejected": -200.28213500976562, + "loss": 0.1055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39008811116218567, + "rewards/margins": 2.798238754272461, + "rewards/rejected": -2.4081506729125977, + "step": 1710 + }, + { + "epoch": 0.2, + "learning_rate": 2.44515977993679e-07, + "logits/chosen": -2.2072975635528564, + "logits/rejected": -2.238217830657959, + "logps/chosen": -227.14498901367188, + "logps/rejected": -283.04119873046875, + "loss": 0.2631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7197704315185547, + "rewards/margins": 3.0032641887664795, + "rewards/rejected": -3.723034381866455, + "step": 1711 + }, + { + "epoch": 0.2, + "learning_rate": 2.4448086152405476e-07, + "logits/chosen": -2.496455669403076, + "logits/rejected": -2.5020833015441895, + "logps/chosen": -154.414306640625, + "logps/rejected": -205.02691650390625, + "loss": 0.6039, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.912485659122467, + "rewards/margins": 1.255630373954773, + "rewards/rejected": -2.1681160926818848, + "step": 1712 + }, + { + "epoch": 0.2, + "learning_rate": 2.444457450544305e-07, + "logits/chosen": -2.178593158721924, + "logits/rejected": -2.2579548358917236, + "logps/chosen": -231.26112365722656, + "logps/rejected": -335.9486083984375, + "loss": 0.2045, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8184113502502441, + "rewards/margins": 3.661334991455078, + "rewards/rejected": -4.479746341705322, + "step": 1713 + }, + { + "epoch": 0.2, + "learning_rate": 2.4441062858480627e-07, + "logits/chosen": -2.377840995788574, + "logits/rejected": -2.4630322456359863, + "logps/chosen": -358.5934143066406, + "logps/rejected": -528.326171875, + "loss": 0.2329, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3511918783187866, + "rewards/margins": 2.3192222118377686, + "rewards/rejected": -3.6704142093658447, + "step": 1714 + }, + { + "epoch": 0.2, + "learning_rate": 2.44375512115182e-07, + "logits/chosen": -2.741769790649414, + "logits/rejected": -2.536747932434082, + "logps/chosen": -230.64300537109375, + "logps/rejected": -312.9796142578125, + "loss": 0.3473, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44741618633270264, + "rewards/margins": 1.5937018394470215, + "rewards/rejected": -2.0411179065704346, + "step": 1715 + }, + { + "epoch": 0.2, + "learning_rate": 2.443403956455577e-07, + "logits/chosen": -2.0459036827087402, + "logits/rejected": -2.200958728790283, + "logps/chosen": -491.86236572265625, + "logps/rejected": -296.08453369140625, + "loss": 0.6765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1739169359207153, + "rewards/margins": 1.5414820909500122, + "rewards/rejected": -2.7153987884521484, + "step": 1716 + }, + { + "epoch": 0.2, + "learning_rate": 2.443052791759335e-07, + "logits/chosen": -2.399399757385254, + "logits/rejected": -2.6250038146972656, + "logps/chosen": -289.4229431152344, + "logps/rejected": -334.8854675292969, + "loss": 0.4083, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6532971858978271, + "rewards/margins": 1.8591563701629639, + "rewards/rejected": -3.512453556060791, + "step": 1717 + }, + { + "epoch": 0.2, + "learning_rate": 2.4427016270630923e-07, + "logits/chosen": -2.4645230770111084, + "logits/rejected": -2.6755383014678955, + "logps/chosen": -334.1551513671875, + "logps/rejected": -330.78118896484375, + "loss": 0.8247, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0622506141662598, + "rewards/margins": 1.60166335105896, + "rewards/rejected": -2.663914203643799, + "step": 1718 + }, + { + "epoch": 0.2, + "learning_rate": 2.44235046236685e-07, + "logits/chosen": -2.3340842723846436, + "logits/rejected": -2.3594353199005127, + "logps/chosen": -236.51641845703125, + "logps/rejected": -239.3765106201172, + "loss": 0.5427, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7254666090011597, + "rewards/margins": 2.0596656799316406, + "rewards/rejected": -2.7851321697235107, + "step": 1719 + }, + { + "epoch": 0.2, + "learning_rate": 2.4419992976706074e-07, + "logits/chosen": -2.1056020259857178, + "logits/rejected": -1.7980005741119385, + "logps/chosen": -109.90374755859375, + "logps/rejected": -351.3470458984375, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7057602405548096, + "rewards/margins": 7.043138027191162, + "rewards/rejected": -7.748898029327393, + "step": 1720 + }, + { + "epoch": 0.2, + "learning_rate": 2.441648132974365e-07, + "logits/chosen": -1.7556910514831543, + "logits/rejected": -2.051677703857422, + "logps/chosen": -364.8359375, + "logps/rejected": -295.87786865234375, + "loss": 0.4531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9517868757247925, + "rewards/margins": 1.4087963104248047, + "rewards/rejected": -2.3605830669403076, + "step": 1721 + }, + { + "epoch": 0.2, + "learning_rate": 2.4412969682781225e-07, + "logits/chosen": -2.636255979537964, + "logits/rejected": -2.5261337757110596, + "logps/chosen": -282.74993896484375, + "logps/rejected": -291.51116943359375, + "loss": 0.4174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2786960005760193, + "rewards/margins": 1.8646764755249023, + "rewards/rejected": -2.1433725357055664, + "step": 1722 + }, + { + "epoch": 0.2, + "learning_rate": 2.44094580358188e-07, + "logits/chosen": -2.6597354412078857, + "logits/rejected": -2.754729747772217, + "logps/chosen": -254.7504119873047, + "logps/rejected": -272.520751953125, + "loss": 1.0144, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7554665803909302, + "rewards/margins": -0.14292779564857483, + "rewards/rejected": -1.6125388145446777, + "step": 1723 + }, + { + "epoch": 0.2, + "learning_rate": 2.440594638885637e-07, + "logits/chosen": -2.4080801010131836, + "logits/rejected": -2.2782528400421143, + "logps/chosen": -172.232421875, + "logps/rejected": -294.8476867675781, + "loss": 0.3437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24506109952926636, + "rewards/margins": 1.6944336891174316, + "rewards/rejected": -1.9394947290420532, + "step": 1724 + }, + { + "epoch": 0.2, + "learning_rate": 2.4402434741893945e-07, + "logits/chosen": -2.325031280517578, + "logits/rejected": -2.279486656188965, + "logps/chosen": -381.1370849609375, + "logps/rejected": -319.84686279296875, + "loss": 0.3881, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.287662386894226, + "rewards/margins": 1.5051501989364624, + "rewards/rejected": -2.7928128242492676, + "step": 1725 + }, + { + "epoch": 0.2, + "learning_rate": 2.439892309493152e-07, + "logits/chosen": -2.126049757003784, + "logits/rejected": -2.4639079570770264, + "logps/chosen": -331.1451416015625, + "logps/rejected": -232.11387634277344, + "loss": 0.3969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3480038344860077, + "rewards/margins": 1.1950262784957886, + "rewards/rejected": -1.5430301427841187, + "step": 1726 + }, + { + "epoch": 0.2, + "learning_rate": 2.4395411447969096e-07, + "logits/chosen": -2.3334786891937256, + "logits/rejected": -2.482663154602051, + "logps/chosen": -343.96319580078125, + "logps/rejected": -396.93475341796875, + "loss": 0.4888, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7618715167045593, + "rewards/margins": 0.9998738765716553, + "rewards/rejected": -1.7617453336715698, + "step": 1727 + }, + { + "epoch": 0.2, + "learning_rate": 2.439189980100667e-07, + "logits/chosen": -2.24041748046875, + "logits/rejected": -2.6645185947418213, + "logps/chosen": -334.53338623046875, + "logps/rejected": -180.63819885253906, + "loss": 1.6877, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.741896867752075, + "rewards/margins": -0.36592888832092285, + "rewards/rejected": -2.3759682178497314, + "step": 1728 + }, + { + "epoch": 0.2, + "learning_rate": 2.4388388154044247e-07, + "logits/chosen": -2.1916751861572266, + "logits/rejected": -2.0338306427001953, + "logps/chosen": -200.7450408935547, + "logps/rejected": -227.89060974121094, + "loss": 0.6668, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9457224607467651, + "rewards/margins": 1.8475219011306763, + "rewards/rejected": -3.7932443618774414, + "step": 1729 + }, + { + "epoch": 0.2, + "learning_rate": 2.4384876507081817e-07, + "logits/chosen": -2.36079740524292, + "logits/rejected": -2.2427823543548584, + "logps/chosen": -411.69256591796875, + "logps/rejected": -346.8800354003906, + "loss": 0.3376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31345808506011963, + "rewards/margins": 1.3622002601623535, + "rewards/rejected": -1.6756583452224731, + "step": 1730 + }, + { + "epoch": 0.2, + "learning_rate": 2.43813648601194e-07, + "logits/chosen": -2.680518388748169, + "logits/rejected": -2.8768396377563477, + "logps/chosen": -241.541259765625, + "logps/rejected": -183.81761169433594, + "loss": 0.5216, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9292201995849609, + "rewards/margins": 2.1879825592041016, + "rewards/rejected": -3.1172029972076416, + "step": 1731 + }, + { + "epoch": 0.2, + "learning_rate": 2.437785321315697e-07, + "logits/chosen": -2.0787432193756104, + "logits/rejected": -2.198596239089966, + "logps/chosen": -193.3130645751953, + "logps/rejected": -236.795166015625, + "loss": 0.3139, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6974092721939087, + "rewards/margins": 1.7104650735855103, + "rewards/rejected": -2.40787410736084, + "step": 1732 + }, + { + "epoch": 0.2, + "learning_rate": 2.4374341566194543e-07, + "logits/chosen": -2.3125498294830322, + "logits/rejected": -2.0349812507629395, + "logps/chosen": -218.67901611328125, + "logps/rejected": -234.73826599121094, + "loss": 0.6187, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.016689658164978, + "rewards/margins": 1.5541775226593018, + "rewards/rejected": -2.5708670616149902, + "step": 1733 + }, + { + "epoch": 0.2, + "learning_rate": 2.437082991923212e-07, + "logits/chosen": -2.1569406986236572, + "logits/rejected": -2.3461289405822754, + "logps/chosen": -256.1600036621094, + "logps/rejected": -189.77915954589844, + "loss": 0.5028, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.411794662475586, + "rewards/margins": 0.7822133898735046, + "rewards/rejected": -2.1940083503723145, + "step": 1734 + }, + { + "epoch": 0.2, + "learning_rate": 2.4367318272269694e-07, + "logits/chosen": -2.4292428493499756, + "logits/rejected": -2.270761251449585, + "logps/chosen": -190.79615783691406, + "logps/rejected": -248.9860076904297, + "loss": 0.4846, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.510852575302124, + "rewards/margins": 1.7824156284332275, + "rewards/rejected": -2.2932682037353516, + "step": 1735 + }, + { + "epoch": 0.2, + "learning_rate": 2.436380662530727e-07, + "logits/chosen": -2.09334397315979, + "logits/rejected": -2.1722002029418945, + "logps/chosen": -333.06817626953125, + "logps/rejected": -300.3266296386719, + "loss": 0.4411, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8473309278488159, + "rewards/margins": 1.2285804748535156, + "rewards/rejected": -2.075911521911621, + "step": 1736 + }, + { + "epoch": 0.2, + "learning_rate": 2.4360294978344845e-07, + "logits/chosen": -2.0814008712768555, + "logits/rejected": -2.0238864421844482, + "logps/chosen": -222.44595336914062, + "logps/rejected": -242.28240966796875, + "loss": 0.4051, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23186321556568146, + "rewards/margins": 2.01662015914917, + "rewards/rejected": -2.248483419418335, + "step": 1737 + }, + { + "epoch": 0.2, + "learning_rate": 2.4356783331382415e-07, + "logits/chosen": -2.3256967067718506, + "logits/rejected": -2.1414883136749268, + "logps/chosen": -211.45733642578125, + "logps/rejected": -259.9481201171875, + "loss": 0.4572, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9053998589515686, + "rewards/margins": 2.391582489013672, + "rewards/rejected": -3.2969820499420166, + "step": 1738 + }, + { + "epoch": 0.2, + "learning_rate": 2.435327168441999e-07, + "logits/chosen": -2.2753002643585205, + "logits/rejected": -2.3119897842407227, + "logps/chosen": -185.42034912109375, + "logps/rejected": -278.974853515625, + "loss": 0.398, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6061793565750122, + "rewards/margins": 2.0134007930755615, + "rewards/rejected": -2.6195802688598633, + "step": 1739 + }, + { + "epoch": 0.2, + "learning_rate": 2.4349760037457566e-07, + "logits/chosen": -1.6190807819366455, + "logits/rejected": -1.8711836338043213, + "logps/chosen": -259.576171875, + "logps/rejected": -153.11032104492188, + "loss": 0.7476, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2452973127365112, + "rewards/margins": 1.253401517868042, + "rewards/rejected": -2.4986987113952637, + "step": 1740 + }, + { + "epoch": 0.2, + "learning_rate": 2.434624839049514e-07, + "logits/chosen": -2.0201616287231445, + "logits/rejected": -2.159672260284424, + "logps/chosen": -169.3037872314453, + "logps/rejected": -225.90756225585938, + "loss": 0.2545, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7282040119171143, + "rewards/margins": 2.361823081970215, + "rewards/rejected": -3.090027093887329, + "step": 1741 + }, + { + "epoch": 0.2, + "learning_rate": 2.4342736743532716e-07, + "logits/chosen": -2.552170991897583, + "logits/rejected": -2.742809772491455, + "logps/chosen": -299.8990783691406, + "logps/rejected": -290.2048645019531, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.05312180519104, + "rewards/margins": 1.4450024366378784, + "rewards/rejected": -2.498124122619629, + "step": 1742 + }, + { + "epoch": 0.2, + "learning_rate": 2.433922509657029e-07, + "logits/chosen": -2.610746383666992, + "logits/rejected": -2.70027756690979, + "logps/chosen": -331.77362060546875, + "logps/rejected": -351.33294677734375, + "loss": 0.3316, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6354208588600159, + "rewards/margins": 2.6105034351348877, + "rewards/rejected": -3.245924472808838, + "step": 1743 + }, + { + "epoch": 0.2, + "learning_rate": 2.4335713449607867e-07, + "logits/chosen": -1.833812952041626, + "logits/rejected": -2.079632520675659, + "logps/chosen": -336.558349609375, + "logps/rejected": -313.8045654296875, + "loss": 0.4383, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4967085123062134, + "rewards/margins": 2.259167194366455, + "rewards/rejected": -3.755875825881958, + "step": 1744 + }, + { + "epoch": 0.2, + "learning_rate": 2.433220180264544e-07, + "logits/chosen": -2.568652391433716, + "logits/rejected": -2.4552550315856934, + "logps/chosen": -331.4395751953125, + "logps/rejected": -344.45440673828125, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15808454155921936, + "rewards/margins": 2.2793197631835938, + "rewards/rejected": -2.437404155731201, + "step": 1745 + }, + { + "epoch": 0.2, + "learning_rate": 2.4328690155683013e-07, + "logits/chosen": -2.3492214679718018, + "logits/rejected": -2.0677237510681152, + "logps/chosen": -181.12709045410156, + "logps/rejected": -206.2689971923828, + "loss": 1.1086, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7794108390808105, + "rewards/margins": 0.944946825504303, + "rewards/rejected": -2.7243576049804688, + "step": 1746 + }, + { + "epoch": 0.2, + "learning_rate": 2.432517850872059e-07, + "logits/chosen": -2.5833277702331543, + "logits/rejected": -2.6295151710510254, + "logps/chosen": -269.1124572753906, + "logps/rejected": -266.5947265625, + "loss": 0.2356, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3264881670475006, + "rewards/margins": 2.0838327407836914, + "rewards/rejected": -2.410320997238159, + "step": 1747 + }, + { + "epoch": 0.2, + "learning_rate": 2.4321666861758163e-07, + "logits/chosen": -2.228693962097168, + "logits/rejected": -2.133988857269287, + "logps/chosen": -122.92444610595703, + "logps/rejected": -200.03208923339844, + "loss": 0.5043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8491935133934021, + "rewards/margins": 1.709801197052002, + "rewards/rejected": -2.558994770050049, + "step": 1748 + }, + { + "epoch": 0.2, + "learning_rate": 2.431815521479574e-07, + "logits/chosen": -2.5841774940490723, + "logits/rejected": -2.6444497108459473, + "logps/chosen": -306.74090576171875, + "logps/rejected": -279.3201599121094, + "loss": 0.3618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5426611304283142, + "rewards/margins": 3.052628755569458, + "rewards/rejected": -3.595290184020996, + "step": 1749 + }, + { + "epoch": 0.2, + "learning_rate": 2.4314643567833314e-07, + "logits/chosen": -2.2098028659820557, + "logits/rejected": -2.3476827144622803, + "logps/chosen": -276.5292663574219, + "logps/rejected": -209.67750549316406, + "loss": 0.4344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6844555735588074, + "rewards/margins": 1.2801694869995117, + "rewards/rejected": -1.9646251201629639, + "step": 1750 + }, + { + "epoch": 0.2, + "learning_rate": 2.4311131920870884e-07, + "logits/chosen": -2.681504726409912, + "logits/rejected": -2.5552115440368652, + "logps/chosen": -419.37103271484375, + "logps/rejected": -262.3865966796875, + "loss": 0.232, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4580800533294678, + "rewards/margins": 2.6926169395446777, + "rewards/rejected": -3.1506972312927246, + "step": 1751 + }, + { + "epoch": 0.2, + "learning_rate": 2.430762027390846e-07, + "logits/chosen": -1.9505836963653564, + "logits/rejected": -2.1135611534118652, + "logps/chosen": -371.6445617675781, + "logps/rejected": -358.106201171875, + "loss": 0.5198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4653635025024414, + "rewards/margins": 1.3207805156707764, + "rewards/rejected": -1.7861440181732178, + "step": 1752 + }, + { + "epoch": 0.2, + "learning_rate": 2.430410862694604e-07, + "logits/chosen": -2.5854766368865967, + "logits/rejected": -2.5742404460906982, + "logps/chosen": -224.77935791015625, + "logps/rejected": -197.81336975097656, + "loss": 0.3786, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8665859699249268, + "rewards/margins": 1.4786651134490967, + "rewards/rejected": -3.3452513217926025, + "step": 1753 + }, + { + "epoch": 0.2, + "learning_rate": 2.430059697998361e-07, + "logits/chosen": -2.135680675506592, + "logits/rejected": -2.2765469551086426, + "logps/chosen": -421.134521484375, + "logps/rejected": -334.38507080078125, + "loss": 0.5037, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8152784109115601, + "rewards/margins": 2.0857605934143066, + "rewards/rejected": -2.9010391235351562, + "step": 1754 + }, + { + "epoch": 0.2, + "learning_rate": 2.4297085333021186e-07, + "logits/chosen": -2.1040186882019043, + "logits/rejected": -2.250098943710327, + "logps/chosen": -229.74444580078125, + "logps/rejected": -226.99652099609375, + "loss": 0.5266, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3941288888454437, + "rewards/margins": 1.7816402912139893, + "rewards/rejected": -2.175769329071045, + "step": 1755 + }, + { + "epoch": 0.2, + "learning_rate": 2.429357368605876e-07, + "logits/chosen": -2.44799542427063, + "logits/rejected": -2.171790838241577, + "logps/chosen": -135.63546752929688, + "logps/rejected": -222.56983947753906, + "loss": 0.3466, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2550673484802246, + "rewards/margins": 1.2499346733093262, + "rewards/rejected": -2.505002021789551, + "step": 1756 + }, + { + "epoch": 0.2, + "learning_rate": 2.4290062039096337e-07, + "logits/chosen": -1.8681683540344238, + "logits/rejected": -1.5247111320495605, + "logps/chosen": -193.55950927734375, + "logps/rejected": -354.5760803222656, + "loss": 0.7522, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1021552085876465, + "rewards/margins": 1.566741704940796, + "rewards/rejected": -2.6688966751098633, + "step": 1757 + }, + { + "epoch": 0.2, + "learning_rate": 2.428655039213391e-07, + "logits/chosen": -2.3932785987854004, + "logits/rejected": -2.404676675796509, + "logps/chosen": -255.96713256835938, + "logps/rejected": -295.5830993652344, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3065069317817688, + "rewards/margins": 1.5144323110580444, + "rewards/rejected": -1.820939302444458, + "step": 1758 + }, + { + "epoch": 0.2, + "learning_rate": 2.428303874517148e-07, + "logits/chosen": -2.124232530593872, + "logits/rejected": -2.4297683238983154, + "logps/chosen": -418.47900390625, + "logps/rejected": -238.34991455078125, + "loss": 0.5302, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4730449914932251, + "rewards/margins": 1.524375081062317, + "rewards/rejected": -1.9974199533462524, + "step": 1759 + }, + { + "epoch": 0.2, + "learning_rate": 2.427952709820906e-07, + "logits/chosen": -2.6231722831726074, + "logits/rejected": -2.6488304138183594, + "logps/chosen": -177.74688720703125, + "logps/rejected": -280.68701171875, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.479947030544281, + "rewards/margins": 3.837874412536621, + "rewards/rejected": -4.317821502685547, + "step": 1760 + }, + { + "epoch": 0.2, + "learning_rate": 2.4276015451246633e-07, + "logits/chosen": -2.221482276916504, + "logits/rejected": -2.529491662979126, + "logps/chosen": -206.96026611328125, + "logps/rejected": -149.4646759033203, + "loss": 0.3537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.722180962562561, + "rewards/margins": 1.3843914270401, + "rewards/rejected": -2.106572389602661, + "step": 1761 + }, + { + "epoch": 0.2, + "learning_rate": 2.427250380428421e-07, + "logits/chosen": -2.0428264141082764, + "logits/rejected": -2.4575612545013428, + "logps/chosen": -462.4838562011719, + "logps/rejected": -299.2469177246094, + "loss": 0.2149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3926375210285187, + "rewards/margins": 1.7989957332611084, + "rewards/rejected": -2.191633462905884, + "step": 1762 + }, + { + "epoch": 0.2, + "learning_rate": 2.4268992157321784e-07, + "logits/chosen": -2.6813459396362305, + "logits/rejected": -2.6883792877197266, + "logps/chosen": -383.4931945800781, + "logps/rejected": -254.85067749023438, + "loss": 0.3405, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8303241729736328, + "rewards/margins": 2.057558298110962, + "rewards/rejected": -2.887882709503174, + "step": 1763 + }, + { + "epoch": 0.2, + "learning_rate": 2.4265480510359354e-07, + "logits/chosen": -2.5236926078796387, + "logits/rejected": -2.3449180126190186, + "logps/chosen": -185.0291748046875, + "logps/rejected": -245.60537719726562, + "loss": 0.3593, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7103353142738342, + "rewards/margins": 1.808946132659912, + "rewards/rejected": -2.5192813873291016, + "step": 1764 + }, + { + "epoch": 0.2, + "learning_rate": 2.4261968863396934e-07, + "logits/chosen": -2.3723483085632324, + "logits/rejected": -2.3448874950408936, + "logps/chosen": -164.13735961914062, + "logps/rejected": -227.38287353515625, + "loss": 0.3439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2856420874595642, + "rewards/margins": 2.262371063232422, + "rewards/rejected": -2.5480127334594727, + "step": 1765 + }, + { + "epoch": 0.2, + "learning_rate": 2.425845721643451e-07, + "logits/chosen": -1.9555854797363281, + "logits/rejected": -1.9693328142166138, + "logps/chosen": -318.9788818359375, + "logps/rejected": -258.8931884765625, + "loss": 0.4547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42855486273765564, + "rewards/margins": 0.9614914655685425, + "rewards/rejected": -1.3900463581085205, + "step": 1766 + }, + { + "epoch": 0.2, + "learning_rate": 2.425494556947208e-07, + "logits/chosen": -2.420905828475952, + "logits/rejected": -2.6264822483062744, + "logps/chosen": -260.62548828125, + "logps/rejected": -257.72149658203125, + "loss": 0.6214, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9284664392471313, + "rewards/margins": 0.8872129917144775, + "rewards/rejected": -2.8156793117523193, + "step": 1767 + }, + { + "epoch": 0.2, + "learning_rate": 2.4251433922509655e-07, + "logits/chosen": -2.1570210456848145, + "logits/rejected": -2.289433479309082, + "logps/chosen": -334.9596862792969, + "logps/rejected": -227.67214965820312, + "loss": 0.2608, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8243740200996399, + "rewards/margins": 2.112914800643921, + "rewards/rejected": -2.937288761138916, + "step": 1768 + }, + { + "epoch": 0.2, + "learning_rate": 2.424792227554723e-07, + "logits/chosen": -2.511112928390503, + "logits/rejected": -2.401444673538208, + "logps/chosen": -307.02716064453125, + "logps/rejected": -230.8401336669922, + "loss": 0.3221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5600714683532715, + "rewards/margins": 1.6293562650680542, + "rewards/rejected": -2.1894278526306152, + "step": 1769 + }, + { + "epoch": 0.2, + "learning_rate": 2.4244410628584806e-07, + "logits/chosen": -2.486511468887329, + "logits/rejected": -2.3909525871276855, + "logps/chosen": -306.2787170410156, + "logps/rejected": -319.7153625488281, + "loss": 0.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8575354814529419, + "rewards/margins": 1.0319043397903442, + "rewards/rejected": -1.8894397020339966, + "step": 1770 + }, + { + "epoch": 0.2, + "learning_rate": 2.424089898162238e-07, + "logits/chosen": -1.9402285814285278, + "logits/rejected": -2.3144943714141846, + "logps/chosen": -339.2012023925781, + "logps/rejected": -246.93553161621094, + "loss": 0.5374, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1347287893295288, + "rewards/margins": 1.3245265483856201, + "rewards/rejected": -2.4592552185058594, + "step": 1771 + }, + { + "epoch": 0.2, + "learning_rate": 2.423738733465995e-07, + "logits/chosen": -3.0927343368530273, + "logits/rejected": -3.050722122192383, + "logps/chosen": -157.57029724121094, + "logps/rejected": -114.6935043334961, + "loss": 0.1345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3065428137779236, + "rewards/margins": 2.601393461227417, + "rewards/rejected": -2.2948508262634277, + "step": 1772 + }, + { + "epoch": 0.2, + "learning_rate": 2.4233875687697527e-07, + "logits/chosen": -2.2124030590057373, + "logits/rejected": -2.354235887527466, + "logps/chosen": -234.85366821289062, + "logps/rejected": -201.39944458007812, + "loss": 0.3422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7997814416885376, + "rewards/margins": 1.2346489429473877, + "rewards/rejected": -2.034430503845215, + "step": 1773 + }, + { + "epoch": 0.2, + "learning_rate": 2.423036404073511e-07, + "logits/chosen": -2.458998441696167, + "logits/rejected": -2.287609100341797, + "logps/chosen": -139.34812927246094, + "logps/rejected": -210.18064880371094, + "loss": 0.6658, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.086080551147461, + "rewards/margins": 1.9787540435791016, + "rewards/rejected": -4.0648345947265625, + "step": 1774 + }, + { + "epoch": 0.2, + "learning_rate": 2.422685239377268e-07, + "logits/chosen": -2.646285057067871, + "logits/rejected": -2.6356959342956543, + "logps/chosen": -69.55776977539062, + "logps/rejected": -121.72145080566406, + "loss": 0.5736, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5833102464675903, + "rewards/margins": 1.0404503345489502, + "rewards/rejected": -2.623760461807251, + "step": 1775 + }, + { + "epoch": 0.2, + "learning_rate": 2.4223340746810253e-07, + "logits/chosen": -2.5046730041503906, + "logits/rejected": -2.5802478790283203, + "logps/chosen": -201.23463439941406, + "logps/rejected": -267.6932067871094, + "loss": 0.2569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6419551968574524, + "rewards/margins": 1.9627639055252075, + "rewards/rejected": -2.6047191619873047, + "step": 1776 + }, + { + "epoch": 0.2, + "learning_rate": 2.421982909984783e-07, + "logits/chosen": -2.2254951000213623, + "logits/rejected": -2.2110233306884766, + "logps/chosen": -267.57757568359375, + "logps/rejected": -321.55712890625, + "loss": 0.5521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6848902702331543, + "rewards/margins": 1.2213973999023438, + "rewards/rejected": -1.906287670135498, + "step": 1777 + }, + { + "epoch": 0.2, + "learning_rate": 2.4216317452885404e-07, + "logits/chosen": -2.1894211769104004, + "logits/rejected": -1.8789453506469727, + "logps/chosen": -164.52195739746094, + "logps/rejected": -280.75384521484375, + "loss": 0.9178, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7581601142883301, + "rewards/margins": 0.3447125256061554, + "rewards/rejected": -1.1028727293014526, + "step": 1778 + }, + { + "epoch": 0.21, + "learning_rate": 2.421280580592298e-07, + "logits/chosen": -2.575571060180664, + "logits/rejected": -2.566420078277588, + "logps/chosen": -173.634033203125, + "logps/rejected": -363.5577087402344, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3773621916770935, + "rewards/margins": 2.1931533813476562, + "rewards/rejected": -2.5705151557922363, + "step": 1779 + }, + { + "epoch": 0.21, + "learning_rate": 2.420929415896055e-07, + "logits/chosen": -2.373101234436035, + "logits/rejected": -2.644155263900757, + "logps/chosen": -346.7199401855469, + "logps/rejected": -227.94235229492188, + "loss": 0.2993, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3509218096733093, + "rewards/margins": 1.5014338493347168, + "rewards/rejected": -1.852355718612671, + "step": 1780 + }, + { + "epoch": 0.21, + "learning_rate": 2.4205782511998125e-07, + "logits/chosen": -2.3395309448242188, + "logits/rejected": -2.3789806365966797, + "logps/chosen": -329.92120361328125, + "logps/rejected": -315.98529052734375, + "loss": 0.5902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9802923798561096, + "rewards/margins": 0.67120361328125, + "rewards/rejected": -1.6514959335327148, + "step": 1781 + }, + { + "epoch": 0.21, + "learning_rate": 2.42022708650357e-07, + "logits/chosen": -2.1035473346710205, + "logits/rejected": -2.1316823959350586, + "logps/chosen": -205.3935089111328, + "logps/rejected": -275.32684326171875, + "loss": 0.1809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41354674100875854, + "rewards/margins": 3.011805534362793, + "rewards/rejected": -3.4253523349761963, + "step": 1782 + }, + { + "epoch": 0.21, + "learning_rate": 2.4198759218073276e-07, + "logits/chosen": -1.8259491920471191, + "logits/rejected": -2.301915168762207, + "logps/chosen": -376.2497863769531, + "logps/rejected": -329.9544677734375, + "loss": 0.4429, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8414648175239563, + "rewards/margins": 1.8623450994491577, + "rewards/rejected": -2.703809976577759, + "step": 1783 + }, + { + "epoch": 0.21, + "learning_rate": 2.419524757111085e-07, + "logits/chosen": -2.0355398654937744, + "logits/rejected": -2.022796869277954, + "logps/chosen": -258.0689697265625, + "logps/rejected": -293.19598388671875, + "loss": 0.2761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1973160356283188, + "rewards/margins": 2.007171154022217, + "rewards/rejected": -2.2044870853424072, + "step": 1784 + }, + { + "epoch": 0.21, + "learning_rate": 2.419173592414842e-07, + "logits/chosen": -2.2629284858703613, + "logits/rejected": -2.348555088043213, + "logps/chosen": -322.455810546875, + "logps/rejected": -302.9672546386719, + "loss": 0.5632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9223359823226929, + "rewards/margins": 1.9979419708251953, + "rewards/rejected": -2.9202780723571777, + "step": 1785 + }, + { + "epoch": 0.21, + "learning_rate": 2.4188224277185996e-07, + "logits/chosen": -2.7162113189697266, + "logits/rejected": -2.6274147033691406, + "logps/chosen": -214.3272705078125, + "logps/rejected": -261.32391357421875, + "loss": 0.5869, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0990519523620605, + "rewards/margins": 1.84844970703125, + "rewards/rejected": -2.9475014209747314, + "step": 1786 + }, + { + "epoch": 0.21, + "learning_rate": 2.4184712630223577e-07, + "logits/chosen": -2.1852288246154785, + "logits/rejected": -1.9386789798736572, + "logps/chosen": -317.7003479003906, + "logps/rejected": -351.7269287109375, + "loss": 0.6744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7457543015480042, + "rewards/margins": 0.45279234647750854, + "rewards/rejected": -1.1985466480255127, + "step": 1787 + }, + { + "epoch": 0.21, + "learning_rate": 2.4181200983261147e-07, + "logits/chosen": -2.236231565475464, + "logits/rejected": -2.2091407775878906, + "logps/chosen": -357.65771484375, + "logps/rejected": -362.53369140625, + "loss": 0.2855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5614519119262695, + "rewards/margins": 2.2296364307403564, + "rewards/rejected": -2.791088342666626, + "step": 1788 + }, + { + "epoch": 0.21, + "learning_rate": 2.417768933629872e-07, + "logits/chosen": -2.7946019172668457, + "logits/rejected": -2.881884813308716, + "logps/chosen": -256.2186584472656, + "logps/rejected": -284.451904296875, + "loss": 0.7472, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1961045265197754, + "rewards/margins": 1.7397974729537964, + "rewards/rejected": -3.9359018802642822, + "step": 1789 + }, + { + "epoch": 0.21, + "learning_rate": 2.41741776893363e-07, + "logits/chosen": -2.6429176330566406, + "logits/rejected": -2.7480409145355225, + "logps/chosen": -248.5714569091797, + "logps/rejected": -173.4696044921875, + "loss": 0.3587, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4284107685089111, + "rewards/margins": 1.0821868181228638, + "rewards/rejected": -2.5105977058410645, + "step": 1790 + }, + { + "epoch": 0.21, + "learning_rate": 2.4170666042373873e-07, + "logits/chosen": -2.9532175064086914, + "logits/rejected": -3.0040040016174316, + "logps/chosen": -97.298095703125, + "logps/rejected": -140.52049255371094, + "loss": 0.3716, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43905651569366455, + "rewards/margins": 2.0606741905212402, + "rewards/rejected": -2.4997305870056152, + "step": 1791 + }, + { + "epoch": 0.21, + "learning_rate": 2.416715439541145e-07, + "logits/chosen": -2.4668447971343994, + "logits/rejected": -2.5253424644470215, + "logps/chosen": -253.27166748046875, + "logps/rejected": -303.1908874511719, + "loss": 0.1404, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11521568894386292, + "rewards/margins": 3.5801000595092773, + "rewards/rejected": -3.4648842811584473, + "step": 1792 + }, + { + "epoch": 0.21, + "learning_rate": 2.416364274844902e-07, + "logits/chosen": -2.188927173614502, + "logits/rejected": -2.470747709274292, + "logps/chosen": -421.71844482421875, + "logps/rejected": -324.5322265625, + "loss": 0.2491, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3583441376686096, + "rewards/margins": 2.0396909713745117, + "rewards/rejected": -2.3980350494384766, + "step": 1793 + }, + { + "epoch": 0.21, + "learning_rate": 2.4160131101486594e-07, + "logits/chosen": -2.329000949859619, + "logits/rejected": -2.342432975769043, + "logps/chosen": -236.6280975341797, + "logps/rejected": -266.0938720703125, + "loss": 0.5362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7843548059463501, + "rewards/margins": 1.2237898111343384, + "rewards/rejected": -2.0081446170806885, + "step": 1794 + }, + { + "epoch": 0.21, + "learning_rate": 2.415661945452417e-07, + "logits/chosen": -2.6628270149230957, + "logits/rejected": -2.5077524185180664, + "logps/chosen": -146.09300231933594, + "logps/rejected": -234.31204223632812, + "loss": 0.2519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8664930462837219, + "rewards/margins": 1.7976312637329102, + "rewards/rejected": -2.6641242504119873, + "step": 1795 + }, + { + "epoch": 0.21, + "learning_rate": 2.4153107807561745e-07, + "logits/chosen": -2.768127679824829, + "logits/rejected": -2.846605062484741, + "logps/chosen": -228.61199951171875, + "logps/rejected": -137.86581420898438, + "loss": 0.5897, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1061137914657593, + "rewards/margins": 1.1476207971572876, + "rewards/rejected": -2.253734588623047, + "step": 1796 + }, + { + "epoch": 0.21, + "learning_rate": 2.414959616059932e-07, + "logits/chosen": -1.978359580039978, + "logits/rejected": -2.342259168624878, + "logps/chosen": -470.91888427734375, + "logps/rejected": -244.8557891845703, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.883604884147644, + "rewards/margins": 2.023746967315674, + "rewards/rejected": -2.9073519706726074, + "step": 1797 + }, + { + "epoch": 0.21, + "learning_rate": 2.4146084513636896e-07, + "logits/chosen": -2.2000839710235596, + "logits/rejected": -2.2219579219818115, + "logps/chosen": -409.43548583984375, + "logps/rejected": -393.48590087890625, + "loss": 0.5657, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5532636642456055, + "rewards/margins": 0.6455321907997131, + "rewards/rejected": -2.198795795440674, + "step": 1798 + }, + { + "epoch": 0.21, + "learning_rate": 2.414257286667447e-07, + "logits/chosen": -2.7172412872314453, + "logits/rejected": -2.519165515899658, + "logps/chosen": -200.7042694091797, + "logps/rejected": -288.9590148925781, + "loss": 0.5608, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5502127408981323, + "rewards/margins": 1.0636193752288818, + "rewards/rejected": -2.6138319969177246, + "step": 1799 + }, + { + "epoch": 0.21, + "learning_rate": 2.4139061219712046e-07, + "logits/chosen": -2.554253339767456, + "logits/rejected": -2.6799848079681396, + "logps/chosen": -369.16192626953125, + "logps/rejected": -244.37620544433594, + "loss": 0.44, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8423911929130554, + "rewards/margins": 1.5787194967269897, + "rewards/rejected": -2.4211108684539795, + "step": 1800 + }, + { + "epoch": 0.21, + "learning_rate": 2.4135549572749617e-07, + "logits/chosen": -2.3045454025268555, + "logits/rejected": -2.3510231971740723, + "logps/chosen": -316.7146301269531, + "logps/rejected": -362.8817138671875, + "loss": 0.6946, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5878779888153076, + "rewards/margins": 0.19104835391044617, + "rewards/rejected": -1.7789263725280762, + "step": 1801 + }, + { + "epoch": 0.21, + "learning_rate": 2.413203792578719e-07, + "logits/chosen": -2.315331220626831, + "logits/rejected": -2.36541748046875, + "logps/chosen": -320.300537109375, + "logps/rejected": -405.12750244140625, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04066745936870575, + "rewards/margins": 3.3427140712738037, + "rewards/rejected": -3.3833816051483154, + "step": 1802 + }, + { + "epoch": 0.21, + "learning_rate": 2.4128526278824767e-07, + "logits/chosen": -2.137524127960205, + "logits/rejected": -2.2736082077026367, + "logps/chosen": -311.60205078125, + "logps/rejected": -255.68707275390625, + "loss": 0.3899, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2629343569278717, + "rewards/margins": 1.673101544380188, + "rewards/rejected": -1.9360359907150269, + "step": 1803 + }, + { + "epoch": 0.21, + "learning_rate": 2.4125014631862343e-07, + "logits/chosen": -2.2231509685516357, + "logits/rejected": -2.4469685554504395, + "logps/chosen": -346.947021484375, + "logps/rejected": -270.1446838378906, + "loss": 0.5691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8234492540359497, + "rewards/margins": 1.8452134132385254, + "rewards/rejected": -2.6686627864837646, + "step": 1804 + }, + { + "epoch": 0.21, + "learning_rate": 2.412150298489992e-07, + "logits/chosen": -2.4685311317443848, + "logits/rejected": -2.3603768348693848, + "logps/chosen": -372.0101318359375, + "logps/rejected": -216.2967529296875, + "loss": 0.5285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8867498636245728, + "rewards/margins": 1.8500795364379883, + "rewards/rejected": -2.7368295192718506, + "step": 1805 + }, + { + "epoch": 0.21, + "learning_rate": 2.411799133793749e-07, + "logits/chosen": -2.637765407562256, + "logits/rejected": -2.7238240242004395, + "logps/chosen": -152.45553588867188, + "logps/rejected": -330.9868469238281, + "loss": 0.1723, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9044240713119507, + "rewards/margins": 2.6248416900634766, + "rewards/rejected": -4.529265880584717, + "step": 1806 + }, + { + "epoch": 0.21, + "learning_rate": 2.4114479690975064e-07, + "logits/chosen": -2.319606304168701, + "logits/rejected": -2.5022192001342773, + "logps/chosen": -530.2899169921875, + "logps/rejected": -454.20220947265625, + "loss": 0.3054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2443646490573883, + "rewards/margins": 2.226039409637451, + "rewards/rejected": -2.4704041481018066, + "step": 1807 + }, + { + "epoch": 0.21, + "learning_rate": 2.4110968044012644e-07, + "logits/chosen": -2.191018581390381, + "logits/rejected": -1.8595538139343262, + "logps/chosen": -100.25410461425781, + "logps/rejected": -217.74363708496094, + "loss": 0.2753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11879362910985947, + "rewards/margins": 1.8906481266021729, + "rewards/rejected": -2.00944185256958, + "step": 1808 + }, + { + "epoch": 0.21, + "learning_rate": 2.4107456397050214e-07, + "logits/chosen": -2.6279125213623047, + "logits/rejected": -2.6955978870391846, + "logps/chosen": -123.75236511230469, + "logps/rejected": -132.3733367919922, + "loss": 0.4802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5866652727127075, + "rewards/margins": 1.3903452157974243, + "rewards/rejected": -1.9770104885101318, + "step": 1809 + }, + { + "epoch": 0.21, + "learning_rate": 2.410394475008779e-07, + "logits/chosen": -2.362215518951416, + "logits/rejected": -2.2505364418029785, + "logps/chosen": -163.34716796875, + "logps/rejected": -190.34915161132812, + "loss": 0.5405, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5480863451957703, + "rewards/margins": 0.4924197196960449, + "rewards/rejected": -1.0405060052871704, + "step": 1810 + }, + { + "epoch": 0.21, + "learning_rate": 2.4100433103125365e-07, + "logits/chosen": -2.2219297885894775, + "logits/rejected": -1.9046905040740967, + "logps/chosen": -245.92041015625, + "logps/rejected": -314.56787109375, + "loss": 0.5865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3341430127620697, + "rewards/margins": 0.9090607166290283, + "rewards/rejected": -1.24320387840271, + "step": 1811 + }, + { + "epoch": 0.21, + "learning_rate": 2.409692145616294e-07, + "logits/chosen": -2.5836071968078613, + "logits/rejected": -2.224562406539917, + "logps/chosen": -209.3885955810547, + "logps/rejected": -269.3098449707031, + "loss": 0.3108, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0600848197937012, + "rewards/margins": 2.122804641723633, + "rewards/rejected": -3.182889461517334, + "step": 1812 + }, + { + "epoch": 0.21, + "learning_rate": 2.4093409809200516e-07, + "logits/chosen": -1.7962520122528076, + "logits/rejected": -1.5811491012573242, + "logps/chosen": -174.4542694091797, + "logps/rejected": -298.541748046875, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5451214909553528, + "rewards/margins": 3.4634621143341064, + "rewards/rejected": -4.0085835456848145, + "step": 1813 + }, + { + "epoch": 0.21, + "learning_rate": 2.4089898162238086e-07, + "logits/chosen": -2.1973788738250732, + "logits/rejected": -2.566818952560425, + "logps/chosen": -258.24371337890625, + "logps/rejected": -219.5543212890625, + "loss": 0.2237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3669402301311493, + "rewards/margins": 3.201093912124634, + "rewards/rejected": -3.5680341720581055, + "step": 1814 + }, + { + "epoch": 0.21, + "learning_rate": 2.408638651527566e-07, + "logits/chosen": -2.1165785789489746, + "logits/rejected": -1.9662740230560303, + "logps/chosen": -405.5699157714844, + "logps/rejected": -337.3160400390625, + "loss": 0.2526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37189897894859314, + "rewards/margins": 2.939746856689453, + "rewards/rejected": -3.311645746231079, + "step": 1815 + }, + { + "epoch": 0.21, + "learning_rate": 2.4082874868313237e-07, + "logits/chosen": -2.209811210632324, + "logits/rejected": -2.1097090244293213, + "logps/chosen": -217.95095825195312, + "logps/rejected": -257.5393981933594, + "loss": 0.359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42594024538993835, + "rewards/margins": 1.5000426769256592, + "rewards/rejected": -1.92598295211792, + "step": 1816 + }, + { + "epoch": 0.21, + "learning_rate": 2.407936322135081e-07, + "logits/chosen": -1.843414306640625, + "logits/rejected": -1.822927713394165, + "logps/chosen": -254.34536743164062, + "logps/rejected": -257.618896484375, + "loss": 0.5465, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1044185161590576, + "rewards/margins": 0.5055357813835144, + "rewards/rejected": -1.6099543571472168, + "step": 1817 + }, + { + "epoch": 0.21, + "learning_rate": 2.407585157438839e-07, + "logits/chosen": -2.4756710529327393, + "logits/rejected": -2.364346504211426, + "logps/chosen": -285.5262756347656, + "logps/rejected": -239.14051818847656, + "loss": 0.1987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3315022587776184, + "rewards/margins": 3.2714178562164307, + "rewards/rejected": -3.6029202938079834, + "step": 1818 + }, + { + "epoch": 0.21, + "learning_rate": 2.4072339927425963e-07, + "logits/chosen": -2.4623899459838867, + "logits/rejected": -2.617771625518799, + "logps/chosen": -347.8703308105469, + "logps/rejected": -235.4872589111328, + "loss": 0.4126, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7668251991271973, + "rewards/margins": 1.8350064754486084, + "rewards/rejected": -2.6018314361572266, + "step": 1819 + }, + { + "epoch": 0.21, + "learning_rate": 2.4068828280463533e-07, + "logits/chosen": -2.2549028396606445, + "logits/rejected": -2.343865156173706, + "logps/chosen": -296.2115173339844, + "logps/rejected": -346.1871032714844, + "loss": 0.5697, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3473693132400513, + "rewards/margins": 0.5590673685073853, + "rewards/rejected": -1.9064366817474365, + "step": 1820 + }, + { + "epoch": 0.21, + "learning_rate": 2.4065316633501114e-07, + "logits/chosen": -2.270474433898926, + "logits/rejected": -2.539175033569336, + "logps/chosen": -254.89706420898438, + "logps/rejected": -232.81982421875, + "loss": 0.5562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3592070937156677, + "rewards/margins": 1.1130919456481934, + "rewards/rejected": -1.4722989797592163, + "step": 1821 + }, + { + "epoch": 0.21, + "learning_rate": 2.4061804986538684e-07, + "logits/chosen": -2.638720989227295, + "logits/rejected": -2.509942054748535, + "logps/chosen": -203.815185546875, + "logps/rejected": -354.7755432128906, + "loss": 0.2994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2578689455986023, + "rewards/margins": 1.7970194816589355, + "rewards/rejected": -2.0548884868621826, + "step": 1822 + }, + { + "epoch": 0.21, + "learning_rate": 2.405829333957626e-07, + "logits/chosen": -1.9494028091430664, + "logits/rejected": -2.2757153511047363, + "logps/chosen": -459.8182067871094, + "logps/rejected": -274.93817138671875, + "loss": 0.2717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1083526611328125, + "rewards/margins": 1.8156578540802002, + "rewards/rejected": -1.9240105152130127, + "step": 1823 + }, + { + "epoch": 0.21, + "learning_rate": 2.4054781692613835e-07, + "logits/chosen": -2.8849143981933594, + "logits/rejected": -2.7396836280822754, + "logps/chosen": -266.1773681640625, + "logps/rejected": -267.2488708496094, + "loss": 0.3054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20645365118980408, + "rewards/margins": 1.853005051612854, + "rewards/rejected": -2.0594587326049805, + "step": 1824 + }, + { + "epoch": 0.21, + "learning_rate": 2.405127004565141e-07, + "logits/chosen": -2.294161081314087, + "logits/rejected": -2.4439170360565186, + "logps/chosen": -202.70309448242188, + "logps/rejected": -153.807373046875, + "loss": 0.339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7729872465133667, + "rewards/margins": 1.0224289894104004, + "rewards/rejected": -1.795416235923767, + "step": 1825 + }, + { + "epoch": 0.21, + "learning_rate": 2.4047758398688985e-07, + "logits/chosen": -2.085132122039795, + "logits/rejected": -2.269338369369507, + "logps/chosen": -511.542724609375, + "logps/rejected": -344.50836181640625, + "loss": 0.9073, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1011176109313965, + "rewards/margins": 0.7822319269180298, + "rewards/rejected": -1.8833494186401367, + "step": 1826 + }, + { + "epoch": 0.21, + "learning_rate": 2.404424675172656e-07, + "logits/chosen": -2.3740110397338867, + "logits/rejected": -2.583950996398926, + "logps/chosen": -253.4356231689453, + "logps/rejected": -393.92694091796875, + "loss": 0.1987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4944526255130768, + "rewards/margins": 3.462804079055786, + "rewards/rejected": -3.957256555557251, + "step": 1827 + }, + { + "epoch": 0.21, + "learning_rate": 2.404073510476413e-07, + "logits/chosen": -2.1198649406433105, + "logits/rejected": -2.1163909435272217, + "logps/chosen": -325.30712890625, + "logps/rejected": -299.1681213378906, + "loss": 0.6413, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5168971419334412, + "rewards/margins": 0.3567184805870056, + "rewards/rejected": -0.8736156225204468, + "step": 1828 + }, + { + "epoch": 0.21, + "learning_rate": 2.4037223457801706e-07, + "logits/chosen": -2.4335238933563232, + "logits/rejected": -2.5726499557495117, + "logps/chosen": -438.6887512207031, + "logps/rejected": -247.02969360351562, + "loss": 0.7317, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1438512802124023, + "rewards/margins": 0.7394155263900757, + "rewards/rejected": -1.883266806602478, + "step": 1829 + }, + { + "epoch": 0.21, + "learning_rate": 2.403371181083928e-07, + "logits/chosen": -2.0390870571136475, + "logits/rejected": -1.8236382007598877, + "logps/chosen": -186.35226440429688, + "logps/rejected": -241.319091796875, + "loss": 0.9133, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7628560662269592, + "rewards/margins": 0.17575323581695557, + "rewards/rejected": -0.9386093020439148, + "step": 1830 + }, + { + "epoch": 0.21, + "learning_rate": 2.4030200163876857e-07, + "logits/chosen": -2.3294034004211426, + "logits/rejected": -2.4230706691741943, + "logps/chosen": -484.8609924316406, + "logps/rejected": -249.93405151367188, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8977600336074829, + "rewards/margins": 1.388117790222168, + "rewards/rejected": -2.2858777046203613, + "step": 1831 + }, + { + "epoch": 0.21, + "learning_rate": 2.402668851691443e-07, + "logits/chosen": -2.142631769180298, + "logits/rejected": -2.6007702350616455, + "logps/chosen": -435.2957763671875, + "logps/rejected": -295.79241943359375, + "loss": 0.3311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15247294306755066, + "rewards/margins": 1.7341649532318115, + "rewards/rejected": -1.8866381645202637, + "step": 1832 + }, + { + "epoch": 0.21, + "learning_rate": 2.402317686995201e-07, + "logits/chosen": -2.0428545475006104, + "logits/rejected": -2.2203900814056396, + "logps/chosen": -317.36334228515625, + "logps/rejected": -331.79766845703125, + "loss": 0.5698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.937549889087677, + "rewards/margins": 1.6160224676132202, + "rewards/rejected": -2.553572416305542, + "step": 1833 + }, + { + "epoch": 0.21, + "learning_rate": 2.4019665222989583e-07, + "logits/chosen": -2.670987606048584, + "logits/rejected": -2.690349817276001, + "logps/chosen": -173.01126098632812, + "logps/rejected": -153.3483123779297, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4232531189918518, + "rewards/margins": 1.0211623907089233, + "rewards/rejected": -1.4444154500961304, + "step": 1834 + }, + { + "epoch": 0.21, + "learning_rate": 2.401615357602716e-07, + "logits/chosen": -2.2585439682006836, + "logits/rejected": -2.29978609085083, + "logps/chosen": -328.8022766113281, + "logps/rejected": -364.46575927734375, + "loss": 0.3149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.375353068113327, + "rewards/margins": 1.6392464637756348, + "rewards/rejected": -2.014599561691284, + "step": 1835 + }, + { + "epoch": 0.21, + "learning_rate": 2.401264192906473e-07, + "logits/chosen": -2.7342753410339355, + "logits/rejected": -2.80782413482666, + "logps/chosen": -150.330078125, + "logps/rejected": -239.19635009765625, + "loss": 0.1496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5046515464782715, + "rewards/margins": 4.591501712799072, + "rewards/rejected": -5.096153736114502, + "step": 1836 + }, + { + "epoch": 0.21, + "learning_rate": 2.4009130282102304e-07, + "logits/chosen": -2.389190912246704, + "logits/rejected": -2.28358793258667, + "logps/chosen": -294.3597106933594, + "logps/rejected": -397.3401184082031, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7631075978279114, + "rewards/margins": 4.107019424438477, + "rewards/rejected": -4.870127201080322, + "step": 1837 + }, + { + "epoch": 0.21, + "learning_rate": 2.400561863513988e-07, + "logits/chosen": -2.803971767425537, + "logits/rejected": -2.8231942653656006, + "logps/chosen": -584.712646484375, + "logps/rejected": -272.1742858886719, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1188843250274658, + "rewards/margins": 0.9615636467933655, + "rewards/rejected": -2.0804479122161865, + "step": 1838 + }, + { + "epoch": 0.21, + "learning_rate": 2.4002106988177455e-07, + "logits/chosen": -2.0155904293060303, + "logits/rejected": -2.0305089950561523, + "logps/chosen": -362.81646728515625, + "logps/rejected": -305.6097412109375, + "loss": 0.7501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6948003768920898, + "rewards/margins": 1.3827661275863647, + "rewards/rejected": -2.077566385269165, + "step": 1839 + }, + { + "epoch": 0.21, + "learning_rate": 2.399859534121503e-07, + "logits/chosen": -2.6109914779663086, + "logits/rejected": -2.769300699234009, + "logps/chosen": -431.7980651855469, + "logps/rejected": -352.08819580078125, + "loss": 0.3615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15211333334445953, + "rewards/margins": 2.302943706512451, + "rewards/rejected": -2.455057382583618, + "step": 1840 + }, + { + "epoch": 0.21, + "learning_rate": 2.39950836942526e-07, + "logits/chosen": -2.71331524848938, + "logits/rejected": -2.9174654483795166, + "logps/chosen": -455.1396789550781, + "logps/rejected": -302.7078857421875, + "loss": 0.2503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29086220264434814, + "rewards/margins": 1.810834288597107, + "rewards/rejected": -2.101696491241455, + "step": 1841 + }, + { + "epoch": 0.21, + "learning_rate": 2.399157204729018e-07, + "logits/chosen": -2.4864182472229004, + "logits/rejected": -2.345564842224121, + "logps/chosen": -169.8216094970703, + "logps/rejected": -227.474365234375, + "loss": 0.3477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6689885854721069, + "rewards/margins": 2.4252569675445557, + "rewards/rejected": -3.094245433807373, + "step": 1842 + }, + { + "epoch": 0.21, + "learning_rate": 2.3988060400327756e-07, + "logits/chosen": -2.1708197593688965, + "logits/rejected": -2.2426257133483887, + "logps/chosen": -333.90771484375, + "logps/rejected": -310.000732421875, + "loss": 0.2104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0202468633651733, + "rewards/margins": 2.8155453205108643, + "rewards/rejected": -3.835792064666748, + "step": 1843 + }, + { + "epoch": 0.21, + "learning_rate": 2.3984548753365326e-07, + "logits/chosen": -2.7320258617401123, + "logits/rejected": -2.576889753341675, + "logps/chosen": -202.027587890625, + "logps/rejected": -218.84954833984375, + "loss": 0.4488, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2960587739944458, + "rewards/margins": 1.5322586297988892, + "rewards/rejected": -2.828317403793335, + "step": 1844 + }, + { + "epoch": 0.21, + "learning_rate": 2.39810371064029e-07, + "logits/chosen": -2.479710102081299, + "logits/rejected": -2.4629504680633545, + "logps/chosen": -253.3799285888672, + "logps/rejected": -267.59564208984375, + "loss": 0.3479, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9314900636672974, + "rewards/margins": 1.275425910949707, + "rewards/rejected": -2.206915855407715, + "step": 1845 + }, + { + "epoch": 0.21, + "learning_rate": 2.3977525459440477e-07, + "logits/chosen": -2.6950764656066895, + "logits/rejected": -2.52616548538208, + "logps/chosen": -116.18327331542969, + "logps/rejected": -201.73939514160156, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.564406156539917, + "rewards/margins": 3.15159010887146, + "rewards/rejected": -3.715996265411377, + "step": 1846 + }, + { + "epoch": 0.21, + "learning_rate": 2.397401381247805e-07, + "logits/chosen": -2.6310906410217285, + "logits/rejected": -2.29799485206604, + "logps/chosen": -244.01779174804688, + "logps/rejected": -246.12350463867188, + "loss": 0.5373, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5784494876861572, + "rewards/margins": 1.1061046123504639, + "rewards/rejected": -2.684554100036621, + "step": 1847 + }, + { + "epoch": 0.21, + "learning_rate": 2.397050216551563e-07, + "logits/chosen": -2.289623260498047, + "logits/rejected": -2.6085715293884277, + "logps/chosen": -510.60760498046875, + "logps/rejected": -266.3421630859375, + "loss": 0.5873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.632077157497406, + "rewards/margins": 1.9324402809143066, + "rewards/rejected": -2.5645174980163574, + "step": 1848 + }, + { + "epoch": 0.21, + "learning_rate": 2.39669905185532e-07, + "logits/chosen": -2.3586080074310303, + "logits/rejected": -2.5472564697265625, + "logps/chosen": -290.1748046875, + "logps/rejected": -236.11117553710938, + "loss": 0.5507, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9356369972229004, + "rewards/margins": 1.848745346069336, + "rewards/rejected": -2.7843823432922363, + "step": 1849 + }, + { + "epoch": 0.21, + "learning_rate": 2.3963478871590773e-07, + "logits/chosen": -2.355426073074341, + "logits/rejected": -2.1652634143829346, + "logps/chosen": -299.80169677734375, + "logps/rejected": -316.3283996582031, + "loss": 0.2273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5115973353385925, + "rewards/margins": 2.836256504058838, + "rewards/rejected": -3.347853899002075, + "step": 1850 + }, + { + "epoch": 0.21, + "learning_rate": 2.395996722462835e-07, + "logits/chosen": -2.024786949157715, + "logits/rejected": -1.9620697498321533, + "logps/chosen": -253.3379364013672, + "logps/rejected": -521.573974609375, + "loss": 0.4272, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5354397296905518, + "rewards/margins": 1.6614373922348022, + "rewards/rejected": -2.1968770027160645, + "step": 1851 + }, + { + "epoch": 0.21, + "learning_rate": 2.3956455577665924e-07, + "logits/chosen": -2.5610949993133545, + "logits/rejected": -2.3942856788635254, + "logps/chosen": -229.45223999023438, + "logps/rejected": -278.906494140625, + "loss": 0.4063, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6653168201446533, + "rewards/margins": 1.4570622444152832, + "rewards/rejected": -3.1223790645599365, + "step": 1852 + }, + { + "epoch": 0.21, + "learning_rate": 2.39529439307035e-07, + "logits/chosen": -2.4222569465637207, + "logits/rejected": -2.6758177280426025, + "logps/chosen": -248.36875915527344, + "logps/rejected": -189.25315856933594, + "loss": 0.5483, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9683021306991577, + "rewards/margins": 1.1099299192428589, + "rewards/rejected": -2.0782322883605957, + "step": 1853 + }, + { + "epoch": 0.21, + "learning_rate": 2.394943228374107e-07, + "logits/chosen": -2.4238028526306152, + "logits/rejected": -2.4678142070770264, + "logps/chosen": -166.52767944335938, + "logps/rejected": -132.4582977294922, + "loss": 0.513, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3685823678970337, + "rewards/margins": 0.8679710626602173, + "rewards/rejected": -2.236553430557251, + "step": 1854 + }, + { + "epoch": 0.21, + "learning_rate": 2.394592063677865e-07, + "logits/chosen": -1.5106371641159058, + "logits/rejected": -1.8524574041366577, + "logps/chosen": -595.8264770507812, + "logps/rejected": -474.02789306640625, + "loss": 0.939, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.7333848476409912, + "rewards/margins": -0.0742100328207016, + "rewards/rejected": -1.6591746807098389, + "step": 1855 + }, + { + "epoch": 0.21, + "learning_rate": 2.3942408989816226e-07, + "logits/chosen": -1.588418960571289, + "logits/rejected": -1.8836404085159302, + "logps/chosen": -460.8817138671875, + "logps/rejected": -286.1075744628906, + "loss": 0.3175, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31306061148643494, + "rewards/margins": 1.4500211477279663, + "rewards/rejected": -1.7630817890167236, + "step": 1856 + }, + { + "epoch": 0.21, + "learning_rate": 2.3938897342853796e-07, + "logits/chosen": -2.6855361461639404, + "logits/rejected": -2.6367058753967285, + "logps/chosen": -189.21080017089844, + "logps/rejected": -160.44863891601562, + "loss": 0.291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6179702281951904, + "rewards/margins": 2.253661632537842, + "rewards/rejected": -2.8716318607330322, + "step": 1857 + }, + { + "epoch": 0.21, + "learning_rate": 2.393538569589137e-07, + "logits/chosen": -1.9359877109527588, + "logits/rejected": -2.3835482597351074, + "logps/chosen": -208.3299102783203, + "logps/rejected": -145.3487548828125, + "loss": 0.5617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7792482376098633, + "rewards/margins": 1.3679909706115723, + "rewards/rejected": -2.1472392082214355, + "step": 1858 + }, + { + "epoch": 0.21, + "learning_rate": 2.3931874048928947e-07, + "logits/chosen": -2.347647190093994, + "logits/rejected": -2.2046337127685547, + "logps/chosen": -231.47166442871094, + "logps/rejected": -278.29595947265625, + "loss": 0.3007, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2441967725753784, + "rewards/margins": 2.5454421043395996, + "rewards/rejected": -3.7896392345428467, + "step": 1859 + }, + { + "epoch": 0.21, + "learning_rate": 2.392836240196652e-07, + "logits/chosen": -2.065943956375122, + "logits/rejected": -2.494417428970337, + "logps/chosen": -264.3790588378906, + "logps/rejected": -154.78439331054688, + "loss": 0.4166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5152082443237305, + "rewards/margins": 0.8737697601318359, + "rewards/rejected": -1.3889780044555664, + "step": 1860 + }, + { + "epoch": 0.21, + "learning_rate": 2.3924850755004097e-07, + "logits/chosen": -2.2565863132476807, + "logits/rejected": -2.3298287391662598, + "logps/chosen": -272.32257080078125, + "logps/rejected": -175.30575561523438, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.018544912338257, + "rewards/margins": 1.2345197200775146, + "rewards/rejected": -3.2530648708343506, + "step": 1861 + }, + { + "epoch": 0.21, + "learning_rate": 2.392133910804167e-07, + "logits/chosen": -2.52831768989563, + "logits/rejected": -2.4556314945220947, + "logps/chosen": -357.67730712890625, + "logps/rejected": -493.5612487792969, + "loss": 0.3941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5954177975654602, + "rewards/margins": 2.9332666397094727, + "rewards/rejected": -3.528684616088867, + "step": 1862 + }, + { + "epoch": 0.21, + "learning_rate": 2.3917827461079243e-07, + "logits/chosen": -2.3656246662139893, + "logits/rejected": -2.683575391769409, + "logps/chosen": -295.96600341796875, + "logps/rejected": -160.37713623046875, + "loss": 0.6281, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0369027853012085, + "rewards/margins": 0.625745952129364, + "rewards/rejected": -1.6626487970352173, + "step": 1863 + }, + { + "epoch": 0.21, + "learning_rate": 2.3914315814116823e-07, + "logits/chosen": -2.4059722423553467, + "logits/rejected": -2.521225929260254, + "logps/chosen": -340.10211181640625, + "logps/rejected": -156.39857482910156, + "loss": 0.8566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8288462162017822, + "rewards/margins": 0.1903054416179657, + "rewards/rejected": -1.0191516876220703, + "step": 1864 + }, + { + "epoch": 0.21, + "learning_rate": 2.3910804167154394e-07, + "logits/chosen": -2.7985055446624756, + "logits/rejected": -2.874561071395874, + "logps/chosen": -206.85316467285156, + "logps/rejected": -246.80133056640625, + "loss": 0.2134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6820824146270752, + "rewards/margins": 1.7005670070648193, + "rewards/rejected": -2.3826494216918945, + "step": 1865 + }, + { + "epoch": 0.22, + "learning_rate": 2.390729252019197e-07, + "logits/chosen": -2.4475882053375244, + "logits/rejected": -2.615083932876587, + "logps/chosen": -456.030517578125, + "logps/rejected": -261.5002746582031, + "loss": 0.3771, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2463514804840088, + "rewards/margins": 1.3695555925369263, + "rewards/rejected": -2.6159071922302246, + "step": 1866 + }, + { + "epoch": 0.22, + "learning_rate": 2.3903780873229544e-07, + "logits/chosen": -2.9152491092681885, + "logits/rejected": -2.7439687252044678, + "logps/chosen": -331.9688415527344, + "logps/rejected": -334.018310546875, + "loss": 0.1009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.456352174282074, + "rewards/margins": 3.2346761226654053, + "rewards/rejected": -3.691028118133545, + "step": 1867 + }, + { + "epoch": 0.22, + "learning_rate": 2.390026922626712e-07, + "logits/chosen": -2.3545193672180176, + "logits/rejected": -2.2795567512512207, + "logps/chosen": -305.1525573730469, + "logps/rejected": -317.0721435546875, + "loss": 0.5723, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4993679523468018, + "rewards/margins": 1.6834255456924438, + "rewards/rejected": -3.182793617248535, + "step": 1868 + }, + { + "epoch": 0.22, + "learning_rate": 2.3896757579304695e-07, + "logits/chosen": -2.019439697265625, + "logits/rejected": -2.1618916988372803, + "logps/chosen": -281.16876220703125, + "logps/rejected": -242.90037536621094, + "loss": 0.3631, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7490295171737671, + "rewards/margins": 1.6545015573501587, + "rewards/rejected": -2.403531074523926, + "step": 1869 + }, + { + "epoch": 0.22, + "learning_rate": 2.3893245932342265e-07, + "logits/chosen": -2.1238675117492676, + "logits/rejected": -2.114086866378784, + "logps/chosen": -350.7742004394531, + "logps/rejected": -367.63336181640625, + "loss": 0.4721, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.690460205078125, + "rewards/margins": 1.222798466682434, + "rewards/rejected": -1.9132585525512695, + "step": 1870 + }, + { + "epoch": 0.22, + "learning_rate": 2.388973428537984e-07, + "logits/chosen": -2.301699161529541, + "logits/rejected": -2.2965750694274902, + "logps/chosen": -282.3863220214844, + "logps/rejected": -231.1263427734375, + "loss": 0.5268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8227315545082092, + "rewards/margins": 1.0579973459243774, + "rewards/rejected": -1.8807289600372314, + "step": 1871 + }, + { + "epoch": 0.22, + "learning_rate": 2.3886222638417416e-07, + "logits/chosen": -2.4789013862609863, + "logits/rejected": -2.400385618209839, + "logps/chosen": -191.66282653808594, + "logps/rejected": -188.58224487304688, + "loss": 0.4114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0588653087615967, + "rewards/margins": 2.318835973739624, + "rewards/rejected": -3.3777012825012207, + "step": 1872 + }, + { + "epoch": 0.22, + "learning_rate": 2.388271099145499e-07, + "logits/chosen": -2.1242103576660156, + "logits/rejected": -2.5868172645568848, + "logps/chosen": -445.590576171875, + "logps/rejected": -434.6290283203125, + "loss": 0.421, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.141395926475525, + "rewards/margins": 2.4441840648651123, + "rewards/rejected": -3.585580348968506, + "step": 1873 + }, + { + "epoch": 0.22, + "learning_rate": 2.3879199344492567e-07, + "logits/chosen": -2.8470587730407715, + "logits/rejected": -2.9112415313720703, + "logps/chosen": -350.70697021484375, + "logps/rejected": -200.61993408203125, + "loss": 0.6181, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3324568271636963, + "rewards/margins": 1.6064081192016602, + "rewards/rejected": -2.9388649463653564, + "step": 1874 + }, + { + "epoch": 0.22, + "learning_rate": 2.3875687697530137e-07, + "logits/chosen": -2.2719407081604004, + "logits/rejected": -2.3003954887390137, + "logps/chosen": -277.0545959472656, + "logps/rejected": -247.36782836914062, + "loss": 0.4243, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6434794068336487, + "rewards/margins": 1.4241735935211182, + "rewards/rejected": -2.067652702331543, + "step": 1875 + }, + { + "epoch": 0.22, + "learning_rate": 2.387217605056772e-07, + "logits/chosen": -2.5301342010498047, + "logits/rejected": -2.482360363006592, + "logps/chosen": -155.30410766601562, + "logps/rejected": -122.61559295654297, + "loss": 0.5504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7509889006614685, + "rewards/margins": 0.5781077146530151, + "rewards/rejected": -1.3290965557098389, + "step": 1876 + }, + { + "epoch": 0.22, + "learning_rate": 2.3868664403605293e-07, + "logits/chosen": -2.1837570667266846, + "logits/rejected": -2.1048529148101807, + "logps/chosen": -427.37017822265625, + "logps/rejected": -482.6123046875, + "loss": 0.1128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3941292464733124, + "rewards/margins": 2.8542916774749756, + "rewards/rejected": -3.2484209537506104, + "step": 1877 + }, + { + "epoch": 0.22, + "learning_rate": 2.3865152756642863e-07, + "logits/chosen": -1.848222017288208, + "logits/rejected": -1.8821041584014893, + "logps/chosen": -320.1489562988281, + "logps/rejected": -380.77783203125, + "loss": 0.4211, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1483875513076782, + "rewards/margins": 1.3667548894882202, + "rewards/rejected": -2.5151424407958984, + "step": 1878 + }, + { + "epoch": 0.22, + "learning_rate": 2.386164110968044e-07, + "logits/chosen": -2.5507004261016846, + "logits/rejected": -2.671041250228882, + "logps/chosen": -389.99359130859375, + "logps/rejected": -209.98233032226562, + "loss": 0.4117, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23897847533226013, + "rewards/margins": 1.638261318206787, + "rewards/rejected": -1.87723970413208, + "step": 1879 + }, + { + "epoch": 0.22, + "learning_rate": 2.3858129462718014e-07, + "logits/chosen": -1.5736688375473022, + "logits/rejected": -1.5351197719573975, + "logps/chosen": -299.72430419921875, + "logps/rejected": -379.10137939453125, + "loss": 0.9883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5595716238021851, + "rewards/margins": 0.7256186008453369, + "rewards/rejected": -1.285190224647522, + "step": 1880 + }, + { + "epoch": 0.22, + "learning_rate": 2.385461781575559e-07, + "logits/chosen": -2.4531192779541016, + "logits/rejected": -2.856436252593994, + "logps/chosen": -405.98504638671875, + "logps/rejected": -219.47216796875, + "loss": 0.6201, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9499214291572571, + "rewards/margins": 1.161475419998169, + "rewards/rejected": -2.1113967895507812, + "step": 1881 + }, + { + "epoch": 0.22, + "learning_rate": 2.3851106168793165e-07, + "logits/chosen": -2.290518283843994, + "logits/rejected": -2.3575279712677, + "logps/chosen": -458.14959716796875, + "logps/rejected": -300.36224365234375, + "loss": 0.3403, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0021748542785645, + "rewards/margins": 1.8670697212219238, + "rewards/rejected": -2.869244337081909, + "step": 1882 + }, + { + "epoch": 0.22, + "learning_rate": 2.3847594521830735e-07, + "logits/chosen": -2.2858974933624268, + "logits/rejected": -2.2261266708374023, + "logps/chosen": -250.77146911621094, + "logps/rejected": -189.79019165039062, + "loss": 0.3676, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7008834481239319, + "rewards/margins": 1.4297807216644287, + "rewards/rejected": -2.130664110183716, + "step": 1883 + }, + { + "epoch": 0.22, + "learning_rate": 2.384408287486831e-07, + "logits/chosen": -2.313026189804077, + "logits/rejected": -2.113095283508301, + "logps/chosen": -271.5746765136719, + "logps/rejected": -310.00885009765625, + "loss": 0.3006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43464893102645874, + "rewards/margins": 1.9812484979629517, + "rewards/rejected": -2.4158976078033447, + "step": 1884 + }, + { + "epoch": 0.22, + "learning_rate": 2.3840571227905888e-07, + "logits/chosen": -2.2848453521728516, + "logits/rejected": -2.5046277046203613, + "logps/chosen": -357.8350524902344, + "logps/rejected": -242.469482421875, + "loss": 0.4549, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8568126559257507, + "rewards/margins": 0.6999093890190125, + "rewards/rejected": -1.5567220449447632, + "step": 1885 + }, + { + "epoch": 0.22, + "learning_rate": 2.383705958094346e-07, + "logits/chosen": -1.5252692699432373, + "logits/rejected": -1.8526209592819214, + "logps/chosen": -471.7545166015625, + "logps/rejected": -341.34832763671875, + "loss": 0.69, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1692421436309814, + "rewards/margins": 2.1831202507019043, + "rewards/rejected": -3.3523621559143066, + "step": 1886 + }, + { + "epoch": 0.22, + "learning_rate": 2.3833547933981036e-07, + "logits/chosen": -2.0212998390197754, + "logits/rejected": -1.9984877109527588, + "logps/chosen": -241.70095825195312, + "logps/rejected": -227.55850219726562, + "loss": 0.3959, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22019599378108978, + "rewards/margins": 1.458804965019226, + "rewards/rejected": -1.6790008544921875, + "step": 1887 + }, + { + "epoch": 0.22, + "learning_rate": 2.3830036287018612e-07, + "logits/chosen": -2.2609665393829346, + "logits/rejected": -2.045670509338379, + "logps/chosen": -211.99148559570312, + "logps/rejected": -286.6866149902344, + "loss": 0.5692, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2994937896728516, + "rewards/margins": 0.8295958638191223, + "rewards/rejected": -2.129089593887329, + "step": 1888 + }, + { + "epoch": 0.22, + "learning_rate": 2.3826524640056184e-07, + "logits/chosen": -1.35108482837677, + "logits/rejected": -1.6038410663604736, + "logps/chosen": -508.49554443359375, + "logps/rejected": -445.4385986328125, + "loss": 0.4945, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5622736215591431, + "rewards/margins": 0.8248645663261414, + "rewards/rejected": -1.3871381282806396, + "step": 1889 + }, + { + "epoch": 0.22, + "learning_rate": 2.382301299309376e-07, + "logits/chosen": -2.7410054206848145, + "logits/rejected": -2.8464250564575195, + "logps/chosen": -371.87554931640625, + "logps/rejected": -231.36199951171875, + "loss": 0.4058, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4819505214691162, + "rewards/margins": 1.4424998760223389, + "rewards/rejected": -1.9244505167007446, + "step": 1890 + }, + { + "epoch": 0.22, + "learning_rate": 2.3819501346131332e-07, + "logits/chosen": -1.8762645721435547, + "logits/rejected": -1.8666222095489502, + "logps/chosen": -182.05599975585938, + "logps/rejected": -231.2353057861328, + "loss": 0.5807, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0911301374435425, + "rewards/margins": 1.2785953283309937, + "rewards/rejected": -2.369725465774536, + "step": 1891 + }, + { + "epoch": 0.22, + "learning_rate": 2.3815989699168908e-07, + "logits/chosen": -2.5054473876953125, + "logits/rejected": -2.3704183101654053, + "logps/chosen": -183.8208770751953, + "logps/rejected": -239.50747680664062, + "loss": 0.7933, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.013898491859436, + "rewards/margins": 0.9446990489959717, + "rewards/rejected": -1.9585976600646973, + "step": 1892 + }, + { + "epoch": 0.22, + "learning_rate": 2.3812478052206486e-07, + "logits/chosen": -1.9569039344787598, + "logits/rejected": -1.6428297758102417, + "logps/chosen": -311.79925537109375, + "logps/rejected": -322.50244140625, + "loss": 0.2969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1574065089225769, + "rewards/margins": 1.7341423034667969, + "rewards/rejected": -1.891548752784729, + "step": 1893 + }, + { + "epoch": 0.22, + "learning_rate": 2.3808966405244059e-07, + "logits/chosen": -2.113281726837158, + "logits/rejected": -2.456411600112915, + "logps/chosen": -332.5094909667969, + "logps/rejected": -223.7294921875, + "loss": 0.873, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0014100074768066, + "rewards/margins": -0.12159928679466248, + "rewards/rejected": -1.8798109292984009, + "step": 1894 + }, + { + "epoch": 0.22, + "learning_rate": 2.3805454758281634e-07, + "logits/chosen": -2.5311086177825928, + "logits/rejected": -2.371384859085083, + "logps/chosen": -267.2041320800781, + "logps/rejected": -344.3773498535156, + "loss": 0.4211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6509290933609009, + "rewards/margins": 2.2796194553375244, + "rewards/rejected": -2.930548667907715, + "step": 1895 + }, + { + "epoch": 0.22, + "learning_rate": 2.380194311131921e-07, + "logits/chosen": -2.3317456245422363, + "logits/rejected": -2.1700072288513184, + "logps/chosen": -130.6432342529297, + "logps/rejected": -150.59249877929688, + "loss": 0.3559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9232619404792786, + "rewards/margins": 1.4981898069381714, + "rewards/rejected": -2.4214518070220947, + "step": 1896 + }, + { + "epoch": 0.22, + "learning_rate": 2.3798431464356782e-07, + "logits/chosen": -2.7127742767333984, + "logits/rejected": -2.7307872772216797, + "logps/chosen": -374.97772216796875, + "logps/rejected": -177.1351318359375, + "loss": 0.5043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8739426136016846, + "rewards/margins": 1.8577213287353516, + "rewards/rejected": -2.731663942337036, + "step": 1897 + }, + { + "epoch": 0.22, + "learning_rate": 2.3794919817394357e-07, + "logits/chosen": -2.7658023834228516, + "logits/rejected": -2.677561044692993, + "logps/chosen": -249.3415069580078, + "logps/rejected": -234.80064392089844, + "loss": 0.4089, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4149497747421265, + "rewards/margins": 1.9419695138931274, + "rewards/rejected": -3.356919288635254, + "step": 1898 + }, + { + "epoch": 0.22, + "learning_rate": 2.379140817043193e-07, + "logits/chosen": -1.5645498037338257, + "logits/rejected": -1.845759630203247, + "logps/chosen": -470.34228515625, + "logps/rejected": -372.2930603027344, + "loss": 1.5097, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5694715976715088, + "rewards/margins": -0.20141857862472534, + "rewards/rejected": -1.3680534362792969, + "step": 1899 + }, + { + "epoch": 0.22, + "learning_rate": 2.3787896523469506e-07, + "logits/chosen": -2.8688364028930664, + "logits/rejected": -2.844528913497925, + "logps/chosen": -178.7355499267578, + "logps/rejected": -350.0583190917969, + "loss": 0.1887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3062291741371155, + "rewards/margins": 2.730888605117798, + "rewards/rejected": -3.0371179580688477, + "step": 1900 + }, + { + "epoch": 0.22, + "learning_rate": 2.378438487650708e-07, + "logits/chosen": -1.752523422241211, + "logits/rejected": -1.9994432926177979, + "logps/chosen": -238.67042541503906, + "logps/rejected": -223.97378540039062, + "loss": 0.2791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6448785066604614, + "rewards/margins": 1.8912856578826904, + "rewards/rejected": -2.5361642837524414, + "step": 1901 + }, + { + "epoch": 0.22, + "learning_rate": 2.3780873229544654e-07, + "logits/chosen": -2.097412586212158, + "logits/rejected": -2.3026952743530273, + "logps/chosen": -396.99127197265625, + "logps/rejected": -245.67063903808594, + "loss": 0.459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29619261622428894, + "rewards/margins": 2.0875585079193115, + "rewards/rejected": -2.383751392364502, + "step": 1902 + }, + { + "epoch": 0.22, + "learning_rate": 2.3777361582582232e-07, + "logits/chosen": -2.394010066986084, + "logits/rejected": -2.4040775299072266, + "logps/chosen": -248.24700927734375, + "logps/rejected": -200.97467041015625, + "loss": 0.3434, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7006380558013916, + "rewards/margins": 1.358536720275879, + "rewards/rejected": -3.0591745376586914, + "step": 1903 + }, + { + "epoch": 0.22, + "learning_rate": 2.3773849935619802e-07, + "logits/chosen": -2.392418384552002, + "logits/rejected": -2.515596389770508, + "logps/chosen": -441.71173095703125, + "logps/rejected": -381.23309326171875, + "loss": 0.3254, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8221156597137451, + "rewards/margins": 1.6403483152389526, + "rewards/rejected": -2.462463855743408, + "step": 1904 + }, + { + "epoch": 0.22, + "learning_rate": 2.377033828865738e-07, + "logits/chosen": -2.4650213718414307, + "logits/rejected": -2.030588150024414, + "logps/chosen": -279.48406982421875, + "logps/rejected": -278.06243896484375, + "loss": 0.3386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7578955292701721, + "rewards/margins": 1.4631208181381226, + "rewards/rejected": -2.2210164070129395, + "step": 1905 + }, + { + "epoch": 0.22, + "learning_rate": 2.3766826641694955e-07, + "logits/chosen": -2.5566630363464355, + "logits/rejected": -2.3980259895324707, + "logps/chosen": -161.35476684570312, + "logps/rejected": -278.8696594238281, + "loss": 0.3055, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.050922051072120667, + "rewards/margins": 1.5828845500946045, + "rewards/rejected": -1.6338067054748535, + "step": 1906 + }, + { + "epoch": 0.22, + "learning_rate": 2.3763314994732528e-07, + "logits/chosen": -2.9047882556915283, + "logits/rejected": -2.623699188232422, + "logps/chosen": -343.2689208984375, + "logps/rejected": -394.3851623535156, + "loss": 0.2466, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7409877777099609, + "rewards/margins": 2.3692898750305176, + "rewards/rejected": -3.1102776527404785, + "step": 1907 + }, + { + "epoch": 0.22, + "learning_rate": 2.3759803347770103e-07, + "logits/chosen": -2.5122785568237305, + "logits/rejected": -2.5440080165863037, + "logps/chosen": -439.81512451171875, + "logps/rejected": -285.51068115234375, + "loss": 0.2464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6214703917503357, + "rewards/margins": 1.848227620124817, + "rewards/rejected": -2.469698190689087, + "step": 1908 + }, + { + "epoch": 0.22, + "learning_rate": 2.375629170080768e-07, + "logits/chosen": -2.1882824897766113, + "logits/rejected": -1.7533613443374634, + "logps/chosen": -109.2203140258789, + "logps/rejected": -250.05348205566406, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8098776340484619, + "rewards/margins": 2.31542706489563, + "rewards/rejected": -3.125304937362671, + "step": 1909 + }, + { + "epoch": 0.22, + "learning_rate": 2.3752780053845252e-07, + "logits/chosen": -2.763434886932373, + "logits/rejected": -2.7696609497070312, + "logps/chosen": -210.70211791992188, + "logps/rejected": -237.04580688476562, + "loss": 0.4152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9539262056350708, + "rewards/margins": 1.5120006799697876, + "rewards/rejected": -2.4659268856048584, + "step": 1910 + }, + { + "epoch": 0.22, + "learning_rate": 2.3749268406882827e-07, + "logits/chosen": -2.497451066970825, + "logits/rejected": -2.7112011909484863, + "logps/chosen": -301.8977355957031, + "logps/rejected": -365.4998474121094, + "loss": 0.5683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8907111287117004, + "rewards/margins": 2.4184865951538086, + "rewards/rejected": -3.3091981410980225, + "step": 1911 + }, + { + "epoch": 0.22, + "learning_rate": 2.37457567599204e-07, + "logits/chosen": -2.3488211631774902, + "logits/rejected": -2.6241226196289062, + "logps/chosen": -382.78240966796875, + "logps/rejected": -145.00521850585938, + "loss": 0.461, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8262939453125, + "rewards/margins": 1.242485761642456, + "rewards/rejected": -2.068779468536377, + "step": 1912 + }, + { + "epoch": 0.22, + "learning_rate": 2.3742245112957975e-07, + "logits/chosen": -2.6089062690734863, + "logits/rejected": -2.833162546157837, + "logps/chosen": -115.28060150146484, + "logps/rejected": -204.83999633789062, + "loss": 0.3713, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18395186960697174, + "rewards/margins": 1.9975945949554443, + "rewards/rejected": -2.181546449661255, + "step": 1913 + }, + { + "epoch": 0.22, + "learning_rate": 2.3738733465995553e-07, + "logits/chosen": -2.206120491027832, + "logits/rejected": -2.593902826309204, + "logps/chosen": -353.7024230957031, + "logps/rejected": -224.37562561035156, + "loss": 0.4605, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9997351169586182, + "rewards/margins": 1.2490673065185547, + "rewards/rejected": -2.2488021850585938, + "step": 1914 + }, + { + "epoch": 0.22, + "learning_rate": 2.3735221819033123e-07, + "logits/chosen": -2.626748561859131, + "logits/rejected": -2.481984853744507, + "logps/chosen": -295.6531677246094, + "logps/rejected": -324.2381286621094, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22671173512935638, + "rewards/margins": 3.3701651096343994, + "rewards/rejected": -3.596876859664917, + "step": 1915 + }, + { + "epoch": 0.22, + "learning_rate": 2.37317101720707e-07, + "logits/chosen": -2.0796804428100586, + "logits/rejected": -2.238292932510376, + "logps/chosen": -302.81890869140625, + "logps/rejected": -314.2767639160156, + "loss": 0.6429, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9771907329559326, + "rewards/margins": 6.486603260040283, + "rewards/rejected": -8.463793754577637, + "step": 1916 + }, + { + "epoch": 0.22, + "learning_rate": 2.3728198525108277e-07, + "logits/chosen": -1.8705644607543945, + "logits/rejected": -1.8647785186767578, + "logps/chosen": -205.74134826660156, + "logps/rejected": -241.17037963867188, + "loss": 0.5169, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0455278158187866, + "rewards/margins": 1.4651747941970825, + "rewards/rejected": -2.510702610015869, + "step": 1917 + }, + { + "epoch": 0.22, + "learning_rate": 2.372468687814585e-07, + "logits/chosen": -2.568869113922119, + "logits/rejected": -2.6214704513549805, + "logps/chosen": -404.2718505859375, + "logps/rejected": -454.71612548828125, + "loss": 0.353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32936638593673706, + "rewards/margins": 2.376239776611328, + "rewards/rejected": -2.70560622215271, + "step": 1918 + }, + { + "epoch": 0.22, + "learning_rate": 2.3721175231183425e-07, + "logits/chosen": -2.398444890975952, + "logits/rejected": -2.41296648979187, + "logps/chosen": -244.3339080810547, + "logps/rejected": -222.6495361328125, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0391370058059692, + "rewards/margins": 0.7772130966186523, + "rewards/rejected": -1.8163502216339111, + "step": 1919 + }, + { + "epoch": 0.22, + "learning_rate": 2.3717663584220997e-07, + "logits/chosen": -1.7673308849334717, + "logits/rejected": -1.6245951652526855, + "logps/chosen": -214.42919921875, + "logps/rejected": -298.52362060546875, + "loss": 0.6949, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.778707265853882, + "rewards/margins": 2.04055118560791, + "rewards/rejected": -4.819258689880371, + "step": 1920 + }, + { + "epoch": 0.22, + "learning_rate": 2.3714151937258573e-07, + "logits/chosen": -2.2565979957580566, + "logits/rejected": -2.266058921813965, + "logps/chosen": -505.3515625, + "logps/rejected": -426.5019836425781, + "loss": 0.114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11233696341514587, + "rewards/margins": 3.2726030349731445, + "rewards/rejected": -3.160266160964966, + "step": 1921 + }, + { + "epoch": 0.22, + "learning_rate": 2.3710640290296148e-07, + "logits/chosen": -2.290102005004883, + "logits/rejected": -2.5889883041381836, + "logps/chosen": -463.08367919921875, + "logps/rejected": -341.1163024902344, + "loss": 0.2836, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3004825711250305, + "rewards/margins": 2.1983306407928467, + "rewards/rejected": -2.4988131523132324, + "step": 1922 + }, + { + "epoch": 0.22, + "learning_rate": 2.370712864333372e-07, + "logits/chosen": -2.2449748516082764, + "logits/rejected": -2.401122808456421, + "logps/chosen": -477.89471435546875, + "logps/rejected": -342.2121276855469, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3617750406265259, + "rewards/margins": 3.114492416381836, + "rewards/rejected": -4.476267337799072, + "step": 1923 + }, + { + "epoch": 0.22, + "learning_rate": 2.3703616996371296e-07, + "logits/chosen": -1.7508846521377563, + "logits/rejected": -2.02301025390625, + "logps/chosen": -459.27764892578125, + "logps/rejected": -297.5425720214844, + "loss": 0.2159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4538509249687195, + "rewards/margins": 1.8754949569702148, + "rewards/rejected": -2.329345941543579, + "step": 1924 + }, + { + "epoch": 0.22, + "learning_rate": 2.3700105349408874e-07, + "logits/chosen": -2.304309606552124, + "logits/rejected": -2.3895626068115234, + "logps/chosen": -383.1131591796875, + "logps/rejected": -295.4279479980469, + "loss": 0.3236, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34107133746147156, + "rewards/margins": 1.5796128511428833, + "rewards/rejected": -1.9206840991973877, + "step": 1925 + }, + { + "epoch": 0.22, + "learning_rate": 2.3696593702446444e-07, + "logits/chosen": -2.4601430892944336, + "logits/rejected": -2.774451732635498, + "logps/chosen": -222.17994689941406, + "logps/rejected": -180.8857421875, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3049226403236389, + "rewards/margins": 2.5716471672058105, + "rewards/rejected": -2.8765697479248047, + "step": 1926 + }, + { + "epoch": 0.22, + "learning_rate": 2.3693082055484022e-07, + "logits/chosen": -2.5503077507019043, + "logits/rejected": -2.681102991104126, + "logps/chosen": -174.80398559570312, + "logps/rejected": -173.7435760498047, + "loss": 0.1935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6254729628562927, + "rewards/margins": 2.0759644508361816, + "rewards/rejected": -2.701437473297119, + "step": 1927 + }, + { + "epoch": 0.22, + "learning_rate": 2.3689570408521595e-07, + "logits/chosen": -2.211763381958008, + "logits/rejected": -2.2956995964050293, + "logps/chosen": -390.3135986328125, + "logps/rejected": -322.8627014160156, + "loss": 0.4811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6517125964164734, + "rewards/margins": 1.1754566431045532, + "rewards/rejected": -1.8271691799163818, + "step": 1928 + }, + { + "epoch": 0.22, + "learning_rate": 2.368605876155917e-07, + "logits/chosen": -2.386246919631958, + "logits/rejected": -2.425492763519287, + "logps/chosen": -164.89869689941406, + "logps/rejected": -231.7255096435547, + "loss": 0.9004, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7343862056732178, + "rewards/margins": 0.847625195980072, + "rewards/rejected": -2.5820114612579346, + "step": 1929 + }, + { + "epoch": 0.22, + "learning_rate": 2.3682547114596746e-07, + "logits/chosen": -2.4144933223724365, + "logits/rejected": -2.1635801792144775, + "logps/chosen": -330.1566467285156, + "logps/rejected": -392.3240661621094, + "loss": 0.2213, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3356000185012817, + "rewards/margins": 2.8797450065612793, + "rewards/rejected": -4.2153449058532715, + "step": 1930 + }, + { + "epoch": 0.22, + "learning_rate": 2.367903546763432e-07, + "logits/chosen": -2.335484027862549, + "logits/rejected": -2.24849534034729, + "logps/chosen": -262.5552062988281, + "logps/rejected": -276.7803955078125, + "loss": 0.3889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30858826637268066, + "rewards/margins": 1.1574946641921997, + "rewards/rejected": -1.46608304977417, + "step": 1931 + }, + { + "epoch": 0.22, + "learning_rate": 2.3675523820671894e-07, + "logits/chosen": -2.1723060607910156, + "logits/rejected": -2.52302622795105, + "logps/chosen": -397.95074462890625, + "logps/rejected": -251.8291778564453, + "loss": 0.2792, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23114918172359467, + "rewards/margins": 1.700951337814331, + "rewards/rejected": -1.932100534439087, + "step": 1932 + }, + { + "epoch": 0.22, + "learning_rate": 2.367201217370947e-07, + "logits/chosen": -2.1208667755126953, + "logits/rejected": -2.336517810821533, + "logps/chosen": -393.4767761230469, + "logps/rejected": -302.7851257324219, + "loss": 0.2957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3682857155799866, + "rewards/margins": 2.2937803268432617, + "rewards/rejected": -2.6620659828186035, + "step": 1933 + }, + { + "epoch": 0.22, + "learning_rate": 2.3668500526747042e-07, + "logits/chosen": -2.4714572429656982, + "logits/rejected": -2.3573951721191406, + "logps/chosen": -393.90411376953125, + "logps/rejected": -251.21151733398438, + "loss": 1.1032, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.419532537460327, + "rewards/margins": 0.47808390855789185, + "rewards/rejected": -2.897616386413574, + "step": 1934 + }, + { + "epoch": 0.22, + "learning_rate": 2.3664988879784618e-07, + "logits/chosen": -1.7405102252960205, + "logits/rejected": -1.7089413404464722, + "logps/chosen": -313.14947509765625, + "logps/rejected": -267.23406982421875, + "loss": 0.4807, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7113149166107178, + "rewards/margins": 0.9909139275550842, + "rewards/rejected": -1.7022289037704468, + "step": 1935 + }, + { + "epoch": 0.22, + "learning_rate": 2.366147723282219e-07, + "logits/chosen": -1.914962887763977, + "logits/rejected": -2.100762367248535, + "logps/chosen": -434.3963623046875, + "logps/rejected": -331.1971435546875, + "loss": 0.2044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47234082221984863, + "rewards/margins": 2.492448329925537, + "rewards/rejected": -2.9647889137268066, + "step": 1936 + }, + { + "epoch": 0.22, + "learning_rate": 2.3657965585859768e-07, + "logits/chosen": -2.7205419540405273, + "logits/rejected": -2.6053991317749023, + "logps/chosen": -244.78634643554688, + "logps/rejected": -258.19781494140625, + "loss": 0.3957, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40990519523620605, + "rewards/margins": 1.3704789876937866, + "rewards/rejected": -1.7803840637207031, + "step": 1937 + }, + { + "epoch": 0.22, + "learning_rate": 2.3654453938897344e-07, + "logits/chosen": -2.2714152336120605, + "logits/rejected": -2.2087714672088623, + "logps/chosen": -318.769287109375, + "logps/rejected": -372.8142395019531, + "loss": 0.3433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7401076555252075, + "rewards/margins": 2.213914632797241, + "rewards/rejected": -2.9540224075317383, + "step": 1938 + }, + { + "epoch": 0.22, + "learning_rate": 2.3650942291934917e-07, + "logits/chosen": -2.374457836151123, + "logits/rejected": -2.4318108558654785, + "logps/chosen": -462.9864501953125, + "logps/rejected": -331.56884765625, + "loss": 0.2754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4479324221611023, + "rewards/margins": 1.719651460647583, + "rewards/rejected": -2.16758394241333, + "step": 1939 + }, + { + "epoch": 0.22, + "learning_rate": 2.3647430644972492e-07, + "logits/chosen": -2.4643986225128174, + "logits/rejected": -2.595179557800293, + "logps/chosen": -260.3790588378906, + "logps/rejected": -216.77867126464844, + "loss": 0.6852, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8316831588745117, + "rewards/margins": 0.9855020046234131, + "rewards/rejected": -1.8171851634979248, + "step": 1940 + }, + { + "epoch": 0.22, + "learning_rate": 2.3643918998010067e-07, + "logits/chosen": -2.582648515701294, + "logits/rejected": -2.4003753662109375, + "logps/chosen": -128.98489379882812, + "logps/rejected": -280.87164306640625, + "loss": 0.6379, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39407044649124146, + "rewards/margins": 1.444550633430481, + "rewards/rejected": -1.8386210203170776, + "step": 1941 + }, + { + "epoch": 0.22, + "learning_rate": 2.364040735104764e-07, + "logits/chosen": -2.368260145187378, + "logits/rejected": -2.5174076557159424, + "logps/chosen": -262.4990539550781, + "logps/rejected": -286.8766784667969, + "loss": 0.1717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21069781482219696, + "rewards/margins": 2.676140785217285, + "rewards/rejected": -2.886838436126709, + "step": 1942 + }, + { + "epoch": 0.22, + "learning_rate": 2.3636895704085215e-07, + "logits/chosen": -2.2715065479278564, + "logits/rejected": -2.152174711227417, + "logps/chosen": -394.388916015625, + "logps/rejected": -486.4097900390625, + "loss": 0.3499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36254265904426575, + "rewards/margins": 2.553584098815918, + "rewards/rejected": -2.9161267280578613, + "step": 1943 + }, + { + "epoch": 0.22, + "learning_rate": 2.3633384057122788e-07, + "logits/chosen": -2.423379898071289, + "logits/rejected": -2.601919412612915, + "logps/chosen": -580.2423095703125, + "logps/rejected": -402.0279541015625, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1883525848388672, + "rewards/margins": 1.7517054080963135, + "rewards/rejected": -2.9400579929351807, + "step": 1944 + }, + { + "epoch": 0.22, + "learning_rate": 2.3629872410160364e-07, + "logits/chosen": -2.5029258728027344, + "logits/rejected": -2.166471242904663, + "logps/chosen": -132.900390625, + "logps/rejected": -214.26962280273438, + "loss": 0.2311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10296076536178589, + "rewards/margins": 1.6950552463531494, + "rewards/rejected": -1.798015832901001, + "step": 1945 + }, + { + "epoch": 0.22, + "learning_rate": 2.362636076319794e-07, + "logits/chosen": -2.4214107990264893, + "logits/rejected": -2.468675374984741, + "logps/chosen": -225.47740173339844, + "logps/rejected": -198.9693603515625, + "loss": 0.3671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6075953841209412, + "rewards/margins": 1.2183351516723633, + "rewards/rejected": -1.8259304761886597, + "step": 1946 + }, + { + "epoch": 0.22, + "learning_rate": 2.3622849116235512e-07, + "logits/chosen": -1.459368109703064, + "logits/rejected": -2.2584891319274902, + "logps/chosen": -501.2056884765625, + "logps/rejected": -212.09942626953125, + "loss": 0.2521, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3858056664466858, + "rewards/margins": 1.4859222173690796, + "rewards/rejected": -1.8717279434204102, + "step": 1947 + }, + { + "epoch": 0.22, + "learning_rate": 2.361933746927309e-07, + "logits/chosen": -2.3808846473693848, + "logits/rejected": -2.498265027999878, + "logps/chosen": -200.16683959960938, + "logps/rejected": -236.39483642578125, + "loss": 0.313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5515216588973999, + "rewards/margins": 1.7621972560882568, + "rewards/rejected": -2.313718795776367, + "step": 1948 + }, + { + "epoch": 0.22, + "learning_rate": 2.361582582231066e-07, + "logits/chosen": -2.300076723098755, + "logits/rejected": -2.394552707672119, + "logps/chosen": -301.8490905761719, + "logps/rejected": -509.9092712402344, + "loss": 0.7767, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2944258451461792, + "rewards/margins": 1.3418323993682861, + "rewards/rejected": -2.636258363723755, + "step": 1949 + }, + { + "epoch": 0.22, + "learning_rate": 2.3612314175348238e-07, + "logits/chosen": -2.523480176925659, + "logits/rejected": -2.419995069503784, + "logps/chosen": -175.84193420410156, + "logps/rejected": -233.68206787109375, + "loss": 0.733, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5904350280761719, + "rewards/margins": 1.0675584077835083, + "rewards/rejected": -2.6579935550689697, + "step": 1950 + }, + { + "epoch": 0.22, + "learning_rate": 2.3608802528385813e-07, + "logits/chosen": -2.372199773788452, + "logits/rejected": -2.478088140487671, + "logps/chosen": -369.3635559082031, + "logps/rejected": -275.5186767578125, + "loss": 0.4225, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4290860891342163, + "rewards/margins": 1.6593959331512451, + "rewards/rejected": -2.088481903076172, + "step": 1951 + }, + { + "epoch": 0.23, + "learning_rate": 2.3605290881423386e-07, + "logits/chosen": -2.0455310344696045, + "logits/rejected": -2.3507816791534424, + "logps/chosen": -454.6167297363281, + "logps/rejected": -246.07127380371094, + "loss": 0.4064, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5473623871803284, + "rewards/margins": 0.9683802127838135, + "rewards/rejected": -1.5157426595687866, + "step": 1952 + }, + { + "epoch": 0.23, + "learning_rate": 2.3601779234460961e-07, + "logits/chosen": -2.386018753051758, + "logits/rejected": -2.4219655990600586, + "logps/chosen": -211.44839477539062, + "logps/rejected": -450.72454833984375, + "loss": 0.5339, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5534512996673584, + "rewards/margins": 2.0204367637634277, + "rewards/rejected": -3.5738883018493652, + "step": 1953 + }, + { + "epoch": 0.23, + "learning_rate": 2.3598267587498537e-07, + "logits/chosen": -1.965113878250122, + "logits/rejected": -2.2214183807373047, + "logps/chosen": -364.84173583984375, + "logps/rejected": -274.5127868652344, + "loss": 0.9095, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6936922073364258, + "rewards/margins": 0.5874741673469543, + "rewards/rejected": -1.2811663150787354, + "step": 1954 + }, + { + "epoch": 0.23, + "learning_rate": 2.359475594053611e-07, + "logits/chosen": -1.937732219696045, + "logits/rejected": -1.8982000350952148, + "logps/chosen": -406.55914306640625, + "logps/rejected": -379.31341552734375, + "loss": 0.3467, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.876366138458252, + "rewards/margins": 1.8007502555847168, + "rewards/rejected": -2.6771163940429688, + "step": 1955 + }, + { + "epoch": 0.23, + "learning_rate": 2.3591244293573685e-07, + "logits/chosen": -2.3728601932525635, + "logits/rejected": -2.3123903274536133, + "logps/chosen": -313.4127197265625, + "logps/rejected": -328.407470703125, + "loss": 0.4143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8842582106590271, + "rewards/margins": 2.055964708328247, + "rewards/rejected": -2.940222978591919, + "step": 1956 + }, + { + "epoch": 0.23, + "learning_rate": 2.3587732646611258e-07, + "logits/chosen": -2.162320137023926, + "logits/rejected": -2.042060136795044, + "logps/chosen": -762.530029296875, + "logps/rejected": -320.49176025390625, + "loss": 0.4146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5093681812286377, + "rewards/margins": 1.3413792848587036, + "rewards/rejected": -1.8507475852966309, + "step": 1957 + }, + { + "epoch": 0.23, + "learning_rate": 2.3584220999648833e-07, + "logits/chosen": -2.028496503829956, + "logits/rejected": -2.3230361938476562, + "logps/chosen": -361.4127197265625, + "logps/rejected": -335.9014587402344, + "loss": 0.4143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22937695682048798, + "rewards/margins": 1.1757146120071411, + "rewards/rejected": -1.405091643333435, + "step": 1958 + }, + { + "epoch": 0.23, + "learning_rate": 2.358070935268641e-07, + "logits/chosen": -2.386280059814453, + "logits/rejected": -2.4690678119659424, + "logps/chosen": -278.3632507324219, + "logps/rejected": -232.03363037109375, + "loss": 0.707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8277366757392883, + "rewards/margins": 1.044783115386963, + "rewards/rejected": -1.872519850730896, + "step": 1959 + }, + { + "epoch": 0.23, + "learning_rate": 2.357719770572398e-07, + "logits/chosen": -2.472212791442871, + "logits/rejected": -2.7196600437164307, + "logps/chosen": -308.03936767578125, + "logps/rejected": -269.74346923828125, + "loss": 0.6511, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2547545433044434, + "rewards/margins": 1.6583049297332764, + "rewards/rejected": -2.9130594730377197, + "step": 1960 + }, + { + "epoch": 0.23, + "learning_rate": 2.357368605876156e-07, + "logits/chosen": -2.1934690475463867, + "logits/rejected": -2.1828103065490723, + "logps/chosen": -523.0359497070312, + "logps/rejected": -367.8273010253906, + "loss": 0.3188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6968042850494385, + "rewards/margins": 2.2751870155334473, + "rewards/rejected": -2.9719913005828857, + "step": 1961 + }, + { + "epoch": 0.23, + "learning_rate": 2.3570174411799135e-07, + "logits/chosen": -2.571540355682373, + "logits/rejected": -2.779082775115967, + "logps/chosen": -239.26779174804688, + "logps/rejected": -156.90830993652344, + "loss": 0.3992, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1832841634750366, + "rewards/margins": 1.228523850440979, + "rewards/rejected": -2.4118080139160156, + "step": 1962 + }, + { + "epoch": 0.23, + "learning_rate": 2.3566662764836707e-07, + "logits/chosen": -2.376046657562256, + "logits/rejected": -2.102412700653076, + "logps/chosen": -217.5296630859375, + "logps/rejected": -294.4634094238281, + "loss": 0.4192, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.757961094379425, + "rewards/margins": 2.0576395988464355, + "rewards/rejected": -2.8156003952026367, + "step": 1963 + }, + { + "epoch": 0.23, + "learning_rate": 2.3563151117874283e-07, + "logits/chosen": -2.4320077896118164, + "logits/rejected": -2.516369581222534, + "logps/chosen": -122.76754760742188, + "logps/rejected": -110.14886474609375, + "loss": 0.4406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5575506687164307, + "rewards/margins": 0.7030497789382935, + "rewards/rejected": -1.2606005668640137, + "step": 1964 + }, + { + "epoch": 0.23, + "learning_rate": 2.3559639470911855e-07, + "logits/chosen": -2.8815340995788574, + "logits/rejected": -2.732465982437134, + "logps/chosen": -305.2342834472656, + "logps/rejected": -307.55303955078125, + "loss": 0.7572, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9190911054611206, + "rewards/margins": 0.9719529747962952, + "rewards/rejected": -1.891044020652771, + "step": 1965 + }, + { + "epoch": 0.23, + "learning_rate": 2.355612782394943e-07, + "logits/chosen": -2.502680778503418, + "logits/rejected": -2.505929946899414, + "logps/chosen": -254.4038848876953, + "logps/rejected": -302.8228454589844, + "loss": 0.451, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.525383472442627, + "rewards/margins": 1.572631597518921, + "rewards/rejected": -2.0980148315429688, + "step": 1966 + }, + { + "epoch": 0.23, + "learning_rate": 2.3552616176987006e-07, + "logits/chosen": -1.8319963216781616, + "logits/rejected": -1.875575065612793, + "logps/chosen": -265.66180419921875, + "logps/rejected": -236.0842742919922, + "loss": 0.4724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04295589029788971, + "rewards/margins": 1.349731683731079, + "rewards/rejected": -1.3926875591278076, + "step": 1967 + }, + { + "epoch": 0.23, + "learning_rate": 2.354910453002458e-07, + "logits/chosen": -2.533541679382324, + "logits/rejected": -2.1051456928253174, + "logps/chosen": -111.55944061279297, + "logps/rejected": -305.96661376953125, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31900569796562195, + "rewards/margins": 1.6830320358276367, + "rewards/rejected": -2.002037763595581, + "step": 1968 + }, + { + "epoch": 0.23, + "learning_rate": 2.3545592883062154e-07, + "logits/chosen": -2.4996085166931152, + "logits/rejected": -2.5102782249450684, + "logps/chosen": -232.036865234375, + "logps/rejected": -403.07080078125, + "loss": 0.6285, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1440582275390625, + "rewards/margins": 1.8845829963684082, + "rewards/rejected": -3.0286412239074707, + "step": 1969 + }, + { + "epoch": 0.23, + "learning_rate": 2.3542081236099732e-07, + "logits/chosen": -2.3567826747894287, + "logits/rejected": -2.4269609451293945, + "logps/chosen": -438.93927001953125, + "logps/rejected": -380.4429016113281, + "loss": 1.0035, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.206601619720459, + "rewards/margins": 0.934065043926239, + "rewards/rejected": -2.1406664848327637, + "step": 1970 + }, + { + "epoch": 0.23, + "learning_rate": 2.3538569589137305e-07, + "logits/chosen": -2.7113218307495117, + "logits/rejected": -2.799600601196289, + "logps/chosen": -396.60577392578125, + "logps/rejected": -382.4775085449219, + "loss": 1.3049, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2533249855041504, + "rewards/margins": 0.178491473197937, + "rewards/rejected": -2.431816577911377, + "step": 1971 + }, + { + "epoch": 0.23, + "learning_rate": 2.353505794217488e-07, + "logits/chosen": -1.9782609939575195, + "logits/rejected": -2.1743533611297607, + "logps/chosen": -384.4525146484375, + "logps/rejected": -247.1460723876953, + "loss": 0.4884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8262224793434143, + "rewards/margins": 1.581484079360962, + "rewards/rejected": -2.4077064990997314, + "step": 1972 + }, + { + "epoch": 0.23, + "learning_rate": 2.3531546295212453e-07, + "logits/chosen": -2.509157657623291, + "logits/rejected": -2.3532333374023438, + "logps/chosen": -284.65045166015625, + "logps/rejected": -272.29559326171875, + "loss": 0.3201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8714724779129028, + "rewards/margins": 1.845981240272522, + "rewards/rejected": -2.717453718185425, + "step": 1973 + }, + { + "epoch": 0.23, + "learning_rate": 2.3528034648250029e-07, + "logits/chosen": -2.595470905303955, + "logits/rejected": -2.418696880340576, + "logps/chosen": -423.7886962890625, + "logps/rejected": -511.995849609375, + "loss": 0.407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5911397337913513, + "rewards/margins": 2.0984721183776855, + "rewards/rejected": -2.6896119117736816, + "step": 1974 + }, + { + "epoch": 0.23, + "learning_rate": 2.3524523001287604e-07, + "logits/chosen": -2.175691843032837, + "logits/rejected": -2.138941764831543, + "logps/chosen": -186.9246826171875, + "logps/rejected": -153.0587158203125, + "loss": 1.142, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2859337329864502, + "rewards/margins": 0.3908098340034485, + "rewards/rejected": -1.676743507385254, + "step": 1975 + }, + { + "epoch": 0.23, + "learning_rate": 2.3521011354325177e-07, + "logits/chosen": -2.6464107036590576, + "logits/rejected": -2.638455867767334, + "logps/chosen": -287.6993408203125, + "logps/rejected": -252.62258911132812, + "loss": 0.1973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.454526424407959, + "rewards/margins": 1.7738425731658936, + "rewards/rejected": -2.2283689975738525, + "step": 1976 + }, + { + "epoch": 0.23, + "learning_rate": 2.3517499707362752e-07, + "logits/chosen": -1.7986927032470703, + "logits/rejected": -2.31923770904541, + "logps/chosen": -362.57415771484375, + "logps/rejected": -315.9385986328125, + "loss": 0.4436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4635082185268402, + "rewards/margins": 2.2279770374298096, + "rewards/rejected": -2.6914854049682617, + "step": 1977 + }, + { + "epoch": 0.23, + "learning_rate": 2.3513988060400327e-07, + "logits/chosen": -1.9946835041046143, + "logits/rejected": -1.7276904582977295, + "logps/chosen": -188.160400390625, + "logps/rejected": -279.70330810546875, + "loss": 0.635, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9565601944923401, + "rewards/margins": 0.5670727491378784, + "rewards/rejected": -1.5236328840255737, + "step": 1978 + }, + { + "epoch": 0.23, + "learning_rate": 2.35104764134379e-07, + "logits/chosen": -1.9155139923095703, + "logits/rejected": -2.28821063041687, + "logps/chosen": -622.371337890625, + "logps/rejected": -365.6530456542969, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15614941716194153, + "rewards/margins": 1.9689702987670898, + "rewards/rejected": -1.8128209114074707, + "step": 1979 + }, + { + "epoch": 0.23, + "learning_rate": 2.3506964766475476e-07, + "logits/chosen": -2.149944305419922, + "logits/rejected": -1.8282915353775024, + "logps/chosen": -376.0434265136719, + "logps/rejected": -442.9112548828125, + "loss": 0.3244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.930666446685791, + "rewards/margins": 2.1931588649749756, + "rewards/rejected": -3.1238253116607666, + "step": 1980 + }, + { + "epoch": 0.23, + "learning_rate": 2.3503453119513048e-07, + "logits/chosen": -2.6275792121887207, + "logits/rejected": -2.595062017440796, + "logps/chosen": -231.8612060546875, + "logps/rejected": -375.1664733886719, + "loss": 0.5252, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3810646533966064, + "rewards/margins": 1.380710482597351, + "rewards/rejected": -2.761775255203247, + "step": 1981 + }, + { + "epoch": 0.23, + "learning_rate": 2.3499941472550626e-07, + "logits/chosen": -2.3515586853027344, + "logits/rejected": -2.3470749855041504, + "logps/chosen": -166.99069213867188, + "logps/rejected": -246.911376953125, + "loss": 0.2449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19884975254535675, + "rewards/margins": 2.228792667388916, + "rewards/rejected": -2.427642345428467, + "step": 1982 + }, + { + "epoch": 0.23, + "learning_rate": 2.3496429825588202e-07, + "logits/chosen": -2.8128771781921387, + "logits/rejected": -2.6478142738342285, + "logps/chosen": -204.72950744628906, + "logps/rejected": -355.74200439453125, + "loss": 0.5356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6694948673248291, + "rewards/margins": 1.1517361402511597, + "rewards/rejected": -1.8212311267852783, + "step": 1983 + }, + { + "epoch": 0.23, + "learning_rate": 2.3492918178625774e-07, + "logits/chosen": -1.8757727146148682, + "logits/rejected": -1.9975954294204712, + "logps/chosen": -511.50543212890625, + "logps/rejected": -392.6116638183594, + "loss": 0.2407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18150383234024048, + "rewards/margins": 2.655104875564575, + "rewards/rejected": -2.836608648300171, + "step": 1984 + }, + { + "epoch": 0.23, + "learning_rate": 2.348940653166335e-07, + "logits/chosen": -2.7502026557922363, + "logits/rejected": -2.5639257431030273, + "logps/chosen": -153.01736450195312, + "logps/rejected": -201.20713806152344, + "loss": 0.7313, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8136663436889648, + "rewards/margins": 0.6135688424110413, + "rewards/rejected": -1.4272351264953613, + "step": 1985 + }, + { + "epoch": 0.23, + "learning_rate": 2.3485894884700925e-07, + "logits/chosen": -2.0349268913269043, + "logits/rejected": -2.0098695755004883, + "logps/chosen": -297.9768981933594, + "logps/rejected": -246.61685180664062, + "loss": 0.2798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7456446290016174, + "rewards/margins": 1.6828105449676514, + "rewards/rejected": -2.428455352783203, + "step": 1986 + }, + { + "epoch": 0.23, + "learning_rate": 2.3482383237738498e-07, + "logits/chosen": -1.733748197555542, + "logits/rejected": -2.0367090702056885, + "logps/chosen": -297.2588195800781, + "logps/rejected": -208.91299438476562, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9199774265289307, + "rewards/margins": 1.6514222621917725, + "rewards/rejected": -2.571399688720703, + "step": 1987 + }, + { + "epoch": 0.23, + "learning_rate": 2.3478871590776073e-07, + "logits/chosen": -2.3531064987182617, + "logits/rejected": -2.3023228645324707, + "logps/chosen": -259.28924560546875, + "logps/rejected": -253.40560913085938, + "loss": 0.3298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13697053492069244, + "rewards/margins": 1.5367332696914673, + "rewards/rejected": -1.673703908920288, + "step": 1988 + }, + { + "epoch": 0.23, + "learning_rate": 2.3475359943813646e-07, + "logits/chosen": -2.4385061264038086, + "logits/rejected": -2.7172091007232666, + "logps/chosen": -378.22698974609375, + "logps/rejected": -320.69927978515625, + "loss": 0.2478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2576143741607666, + "rewards/margins": 2.61124324798584, + "rewards/rejected": -2.8688576221466064, + "step": 1989 + }, + { + "epoch": 0.23, + "learning_rate": 2.3471848296851221e-07, + "logits/chosen": -2.095858097076416, + "logits/rejected": -2.533675193786621, + "logps/chosen": -345.3042297363281, + "logps/rejected": -259.6285705566406, + "loss": 0.5422, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3303771018981934, + "rewards/margins": 1.3528432846069336, + "rewards/rejected": -2.683220386505127, + "step": 1990 + }, + { + "epoch": 0.23, + "learning_rate": 2.3468336649888797e-07, + "logits/chosen": -2.0029094219207764, + "logits/rejected": -2.1897456645965576, + "logps/chosen": -250.78268432617188, + "logps/rejected": -252.89254760742188, + "loss": 0.4195, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8036196231842041, + "rewards/margins": 2.0728611946105957, + "rewards/rejected": -2.8764808177948, + "step": 1991 + }, + { + "epoch": 0.23, + "learning_rate": 2.346482500292637e-07, + "logits/chosen": -2.486280918121338, + "logits/rejected": -2.571074962615967, + "logps/chosen": -129.9585723876953, + "logps/rejected": -194.0367889404297, + "loss": 0.2573, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8111022710800171, + "rewards/margins": 2.9335062503814697, + "rewards/rejected": -3.7446084022521973, + "step": 1992 + }, + { + "epoch": 0.23, + "learning_rate": 2.3461313355963948e-07, + "logits/chosen": -2.5795183181762695, + "logits/rejected": -2.8139448165893555, + "logps/chosen": -202.10125732421875, + "logps/rejected": -330.1205139160156, + "loss": 1.7759, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.630275249481201, + "rewards/margins": 0.23708665370941162, + "rewards/rejected": -2.8673620223999023, + "step": 1993 + }, + { + "epoch": 0.23, + "learning_rate": 2.3457801709001518e-07, + "logits/chosen": -1.7240784168243408, + "logits/rejected": -2.0110809803009033, + "logps/chosen": -498.65631103515625, + "logps/rejected": -501.06219482421875, + "loss": 0.5559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6033498048782349, + "rewards/margins": 1.3070893287658691, + "rewards/rejected": -1.910439133644104, + "step": 1994 + }, + { + "epoch": 0.23, + "learning_rate": 2.3454290062039096e-07, + "logits/chosen": -2.0295450687408447, + "logits/rejected": -2.2217793464660645, + "logps/chosen": -233.358642578125, + "logps/rejected": -219.3469696044922, + "loss": 0.5832, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.232958436012268, + "rewards/margins": 0.9427558183670044, + "rewards/rejected": -2.1757144927978516, + "step": 1995 + }, + { + "epoch": 0.23, + "learning_rate": 2.345077841507667e-07, + "logits/chosen": -2.46107816696167, + "logits/rejected": -2.5453429222106934, + "logps/chosen": -396.77886962890625, + "logps/rejected": -376.5209045410156, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3216157555580139, + "rewards/margins": 2.700773000717163, + "rewards/rejected": -2.379157543182373, + "step": 1996 + }, + { + "epoch": 0.23, + "learning_rate": 2.3447266768114244e-07, + "logits/chosen": -2.7785627841949463, + "logits/rejected": -2.833679676055908, + "logps/chosen": -150.55783081054688, + "logps/rejected": -163.50204467773438, + "loss": 0.4356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5496151447296143, + "rewards/margins": 1.53055739402771, + "rewards/rejected": -2.0801727771759033, + "step": 1997 + }, + { + "epoch": 0.23, + "learning_rate": 2.344375512115182e-07, + "logits/chosen": -2.874781608581543, + "logits/rejected": -2.69158935546875, + "logps/chosen": -156.19290161132812, + "logps/rejected": -282.837890625, + "loss": 0.2521, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38345450162887573, + "rewards/margins": 2.3760294914245605, + "rewards/rejected": -2.759483814239502, + "step": 1998 + }, + { + "epoch": 0.23, + "learning_rate": 2.3440243474189395e-07, + "logits/chosen": -2.141495704650879, + "logits/rejected": -2.6641416549682617, + "logps/chosen": -215.69656372070312, + "logps/rejected": -168.34719848632812, + "loss": 0.705, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.803201675415039, + "rewards/margins": 1.0512347221374512, + "rewards/rejected": -2.8544363975524902, + "step": 1999 + }, + { + "epoch": 0.23, + "learning_rate": 2.3436731827226967e-07, + "logits/chosen": -2.467311143875122, + "logits/rejected": -2.435605764389038, + "logps/chosen": -242.74374389648438, + "logps/rejected": -309.1101379394531, + "loss": 0.4053, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.249307632446289, + "rewards/margins": 1.8323636054992676, + "rewards/rejected": -3.0816712379455566, + "step": 2000 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -1.6921049356460571, + "eval_logits/rejected": -1.571356177330017, + "eval_logps/chosen": -299.4908447265625, + "eval_logps/rejected": -270.1173400878906, + "eval_loss": 0.359094500541687, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": -0.6810671091079712, + "eval_rewards/margins": 1.8119232654571533, + "eval_rewards/rejected": -2.492990255355835, + "eval_runtime": 24.3183, + "eval_samples_per_second": 2.878, + "eval_steps_per_second": 1.439, + "step": 2000 + }, + { + "epoch": 0.23, + "learning_rate": 2.3433220180264543e-07, + "logits/chosen": -1.7790385484695435, + "logits/rejected": -1.8947688341140747, + "logps/chosen": -556.252685546875, + "logps/rejected": -481.175048828125, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14951622486114502, + "rewards/margins": 3.4832587242126465, + "rewards/rejected": -3.333742380142212, + "step": 2001 + }, + { + "epoch": 0.23, + "learning_rate": 2.3429708533302116e-07, + "logits/chosen": -2.248112916946411, + "logits/rejected": -2.11100435256958, + "logps/chosen": -426.8817443847656, + "logps/rejected": -446.09088134765625, + "loss": 0.3337, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09570322930812836, + "rewards/margins": 1.2901027202606201, + "rewards/rejected": -1.1943994760513306, + "step": 2002 + }, + { + "epoch": 0.23, + "learning_rate": 2.342619688633969e-07, + "logits/chosen": -2.946103096008301, + "logits/rejected": -3.039189338684082, + "logps/chosen": -227.60891723632812, + "logps/rejected": -214.59539794921875, + "loss": 0.3385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8936328291893005, + "rewards/margins": 1.790069341659546, + "rewards/rejected": -2.6837024688720703, + "step": 2003 + }, + { + "epoch": 0.23, + "learning_rate": 2.342268523937727e-07, + "logits/chosen": -2.07792329788208, + "logits/rejected": -2.309405565261841, + "logps/chosen": -247.9423828125, + "logps/rejected": -252.0446014404297, + "loss": 0.3415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8610216379165649, + "rewards/margins": 2.6314268112182617, + "rewards/rejected": -3.492448329925537, + "step": 2004 + }, + { + "epoch": 0.23, + "learning_rate": 2.3419173592414842e-07, + "logits/chosen": -1.8518024682998657, + "logits/rejected": -2.0797829627990723, + "logps/chosen": -545.822021484375, + "logps/rejected": -413.49371337890625, + "loss": 0.3065, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40104103088378906, + "rewards/margins": 2.776850938796997, + "rewards/rejected": -3.177891731262207, + "step": 2005 + }, + { + "epoch": 0.23, + "learning_rate": 2.3415661945452417e-07, + "logits/chosen": -2.8556175231933594, + "logits/rejected": -2.7479023933410645, + "logps/chosen": -410.1650695800781, + "logps/rejected": -199.66676330566406, + "loss": 0.5552, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0293148756027222, + "rewards/margins": 0.5292457342147827, + "rewards/rejected": -1.5585603713989258, + "step": 2006 + }, + { + "epoch": 0.23, + "learning_rate": 2.3412150298489992e-07, + "logits/chosen": -2.5313830375671387, + "logits/rejected": -2.5224740505218506, + "logps/chosen": -182.52008056640625, + "logps/rejected": -293.78973388671875, + "loss": 0.3727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18304431438446045, + "rewards/margins": 3.484940528869629, + "rewards/rejected": -3.667984962463379, + "step": 2007 + }, + { + "epoch": 0.23, + "learning_rate": 2.3408638651527565e-07, + "logits/chosen": -1.9920117855072021, + "logits/rejected": -1.7684900760650635, + "logps/chosen": -387.7423400878906, + "logps/rejected": -413.16064453125, + "loss": 0.4354, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0887737274169922, + "rewards/margins": 1.0375900268554688, + "rewards/rejected": -2.126363754272461, + "step": 2008 + }, + { + "epoch": 0.23, + "learning_rate": 2.340512700456514e-07, + "logits/chosen": -2.745958089828491, + "logits/rejected": -2.5880367755889893, + "logps/chosen": -303.3743896484375, + "logps/rejected": -297.41705322265625, + "loss": 0.581, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3885982036590576, + "rewards/margins": 1.0470027923583984, + "rewards/rejected": -2.435600996017456, + "step": 2009 + }, + { + "epoch": 0.23, + "learning_rate": 2.3401615357602713e-07, + "logits/chosen": -2.3926854133605957, + "logits/rejected": -2.4082322120666504, + "logps/chosen": -174.591552734375, + "logps/rejected": -222.683349609375, + "loss": 0.6728, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.150158166885376, + "rewards/margins": 0.6741565465927124, + "rewards/rejected": -1.824314832687378, + "step": 2010 + }, + { + "epoch": 0.23, + "learning_rate": 2.339810371064029e-07, + "logits/chosen": -2.6557767391204834, + "logits/rejected": -2.5112130641937256, + "logps/chosen": -201.94137573242188, + "logps/rejected": -294.8179626464844, + "loss": 0.1785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33591777086257935, + "rewards/margins": 2.3809399604797363, + "rewards/rejected": -2.71685791015625, + "step": 2011 + }, + { + "epoch": 0.23, + "learning_rate": 2.3394592063677864e-07, + "logits/chosen": -2.4727110862731934, + "logits/rejected": -2.4528377056121826, + "logps/chosen": -192.64817810058594, + "logps/rejected": -248.19049072265625, + "loss": 0.5038, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6997347474098206, + "rewards/margins": 1.7580904960632324, + "rewards/rejected": -2.457825183868408, + "step": 2012 + }, + { + "epoch": 0.23, + "learning_rate": 2.3391080416715437e-07, + "logits/chosen": -2.282580614089966, + "logits/rejected": -2.3858346939086914, + "logps/chosen": -259.53326416015625, + "logps/rejected": -257.5293273925781, + "loss": 0.3384, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45699119567871094, + "rewards/margins": 2.2107925415039062, + "rewards/rejected": -2.667783737182617, + "step": 2013 + }, + { + "epoch": 0.23, + "learning_rate": 2.3387568769753012e-07, + "logits/chosen": -2.9069948196411133, + "logits/rejected": -2.8442189693450928, + "logps/chosen": -233.66970825195312, + "logps/rejected": -235.14830017089844, + "loss": 0.588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9140127301216125, + "rewards/margins": 1.107361078262329, + "rewards/rejected": -2.021373748779297, + "step": 2014 + }, + { + "epoch": 0.23, + "learning_rate": 2.338405712279059e-07, + "logits/chosen": -2.0091865062713623, + "logits/rejected": -1.986814022064209, + "logps/chosen": -167.42190551757812, + "logps/rejected": -219.81666564941406, + "loss": 0.1231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1376221626996994, + "rewards/margins": 2.4923341274261475, + "rewards/rejected": -2.6299562454223633, + "step": 2015 + }, + { + "epoch": 0.23, + "learning_rate": 2.3380545475828163e-07, + "logits/chosen": -2.2340469360351562, + "logits/rejected": -2.366847038269043, + "logps/chosen": -356.82781982421875, + "logps/rejected": -230.961181640625, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1343199908733368, + "rewards/margins": 1.7407879829406738, + "rewards/rejected": -1.875108003616333, + "step": 2016 + }, + { + "epoch": 0.23, + "learning_rate": 2.3377033828865738e-07, + "logits/chosen": -2.056260108947754, + "logits/rejected": -2.3278870582580566, + "logps/chosen": -304.4798583984375, + "logps/rejected": -209.89601135253906, + "loss": 0.3903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41902920603752136, + "rewards/margins": 1.7991535663604736, + "rewards/rejected": -2.2181828022003174, + "step": 2017 + }, + { + "epoch": 0.23, + "learning_rate": 2.337352218190331e-07, + "logits/chosen": -2.0293400287628174, + "logits/rejected": -1.961611032485962, + "logps/chosen": -341.6667175292969, + "logps/rejected": -244.5962371826172, + "loss": 0.5769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26319241523742676, + "rewards/margins": 0.5177469849586487, + "rewards/rejected": -0.7809394598007202, + "step": 2018 + }, + { + "epoch": 0.23, + "learning_rate": 2.3370010534940886e-07, + "logits/chosen": -2.680678606033325, + "logits/rejected": -2.5264573097229004, + "logps/chosen": -202.15965270996094, + "logps/rejected": -112.11998748779297, + "loss": 0.6409, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.467767596244812, + "rewards/margins": 0.5070608854293823, + "rewards/rejected": -1.9748284816741943, + "step": 2019 + }, + { + "epoch": 0.23, + "learning_rate": 2.3366498887978462e-07, + "logits/chosen": -1.6975352764129639, + "logits/rejected": -1.9706246852874756, + "logps/chosen": -296.3081359863281, + "logps/rejected": -257.4990234375, + "loss": 0.494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7155330181121826, + "rewards/margins": 1.552065372467041, + "rewards/rejected": -2.2675981521606445, + "step": 2020 + }, + { + "epoch": 0.23, + "learning_rate": 2.3362987241016035e-07, + "logits/chosen": -3.0453715324401855, + "logits/rejected": -3.1000452041625977, + "logps/chosen": -232.31626892089844, + "logps/rejected": -258.0870056152344, + "loss": 0.1407, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1795642375946045, + "rewards/margins": 3.083956718444824, + "rewards/rejected": -4.26352071762085, + "step": 2021 + }, + { + "epoch": 0.23, + "learning_rate": 2.335947559405361e-07, + "logits/chosen": -2.669597625732422, + "logits/rejected": -2.8826241493225098, + "logps/chosen": -452.9148254394531, + "logps/rejected": -315.73370361328125, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29545706510543823, + "rewards/margins": 3.843745231628418, + "rewards/rejected": -4.139202117919922, + "step": 2022 + }, + { + "epoch": 0.23, + "learning_rate": 2.3355963947091185e-07, + "logits/chosen": -2.350368022918701, + "logits/rejected": -2.254840850830078, + "logps/chosen": -367.1036071777344, + "logps/rejected": -357.34259033203125, + "loss": 0.23, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43972575664520264, + "rewards/margins": 2.3870532512664795, + "rewards/rejected": -2.8267788887023926, + "step": 2023 + }, + { + "epoch": 0.23, + "learning_rate": 2.3352452300128758e-07, + "logits/chosen": -2.6922571659088135, + "logits/rejected": -2.8643031120300293, + "logps/chosen": -192.5399169921875, + "logps/rejected": -163.46665954589844, + "loss": 0.5002, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1051019430160522, + "rewards/margins": 1.2698945999145508, + "rewards/rejected": -2.3749964237213135, + "step": 2024 + }, + { + "epoch": 0.23, + "learning_rate": 2.3348940653166334e-07, + "logits/chosen": -2.646204710006714, + "logits/rejected": -2.641751289367676, + "logps/chosen": -215.78883361816406, + "logps/rejected": -242.49072265625, + "loss": 0.3311, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13756245374679565, + "rewards/margins": 3.051102638244629, + "rewards/rejected": -3.1886651515960693, + "step": 2025 + }, + { + "epoch": 0.23, + "learning_rate": 2.3345429006203906e-07, + "logits/chosen": -2.3321220874786377, + "logits/rejected": -2.1239845752716064, + "logps/chosen": -191.0900115966797, + "logps/rejected": -250.50198364257812, + "loss": 0.4913, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5005408525466919, + "rewards/margins": 0.991843581199646, + "rewards/rejected": -1.492384433746338, + "step": 2026 + }, + { + "epoch": 0.23, + "learning_rate": 2.3341917359241484e-07, + "logits/chosen": -1.4442226886749268, + "logits/rejected": -1.7361663579940796, + "logps/chosen": -413.3050842285156, + "logps/rejected": -331.0401306152344, + "loss": 0.2314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05885671079158783, + "rewards/margins": 2.1329994201660156, + "rewards/rejected": -2.0741424560546875, + "step": 2027 + }, + { + "epoch": 0.23, + "learning_rate": 2.333840571227906e-07, + "logits/chosen": -2.689075469970703, + "logits/rejected": -2.59431529045105, + "logps/chosen": -285.1614990234375, + "logps/rejected": -231.83993530273438, + "loss": 0.3273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.720324695110321, + "rewards/margins": 2.2176241874694824, + "rewards/rejected": -2.9379489421844482, + "step": 2028 + }, + { + "epoch": 0.23, + "learning_rate": 2.3334894065316632e-07, + "logits/chosen": -2.5839219093322754, + "logits/rejected": -2.4136803150177, + "logps/chosen": -280.6333312988281, + "logps/rejected": -250.08819580078125, + "loss": 0.2856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7549763321876526, + "rewards/margins": 2.495753288269043, + "rewards/rejected": -3.250729560852051, + "step": 2029 + }, + { + "epoch": 0.23, + "learning_rate": 2.3331382418354208e-07, + "logits/chosen": -1.9264485836029053, + "logits/rejected": -2.1608963012695312, + "logps/chosen": -426.3909912109375, + "logps/rejected": -366.1254577636719, + "loss": 0.2467, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1973673105239868, + "rewards/margins": 1.7343595027923584, + "rewards/rejected": -2.9317266941070557, + "step": 2030 + }, + { + "epoch": 0.23, + "learning_rate": 2.3327870771391783e-07, + "logits/chosen": -2.522263526916504, + "logits/rejected": -2.446497917175293, + "logps/chosen": -276.6327209472656, + "logps/rejected": -350.4698486328125, + "loss": 0.1539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7724612951278687, + "rewards/margins": 3.545032024383545, + "rewards/rejected": -4.317493438720703, + "step": 2031 + }, + { + "epoch": 0.23, + "learning_rate": 2.3324359124429356e-07, + "logits/chosen": -2.31435489654541, + "logits/rejected": -2.474722146987915, + "logps/chosen": -304.4292907714844, + "logps/rejected": -268.6459655761719, + "loss": 1.0447, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.556679606437683, + "rewards/margins": 0.907171905040741, + "rewards/rejected": -2.4638514518737793, + "step": 2032 + }, + { + "epoch": 0.23, + "learning_rate": 2.332084747746693e-07, + "logits/chosen": -2.3158321380615234, + "logits/rejected": -2.2322473526000977, + "logps/chosen": -171.86203002929688, + "logps/rejected": -142.6539306640625, + "loss": 0.3704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2784310579299927, + "rewards/margins": 2.16532301902771, + "rewards/rejected": -2.443754196166992, + "step": 2033 + }, + { + "epoch": 0.23, + "learning_rate": 2.3317335830504504e-07, + "logits/chosen": -2.0910074710845947, + "logits/rejected": -2.283140182495117, + "logps/chosen": -334.46746826171875, + "logps/rejected": -240.9962158203125, + "loss": 0.5521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7174403071403503, + "rewards/margins": 0.5903948545455933, + "rewards/rejected": -1.3078352212905884, + "step": 2034 + }, + { + "epoch": 0.23, + "learning_rate": 2.331382418354208e-07, + "logits/chosen": -2.291804075241089, + "logits/rejected": -2.2588584423065186, + "logps/chosen": -179.39309692382812, + "logps/rejected": -324.6695556640625, + "loss": 0.4394, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6219616532325745, + "rewards/margins": 1.5224987268447876, + "rewards/rejected": -2.144460439682007, + "step": 2035 + }, + { + "epoch": 0.23, + "learning_rate": 2.3310312536579655e-07, + "logits/chosen": -2.403940439224243, + "logits/rejected": -2.5425524711608887, + "logps/chosen": -87.75865173339844, + "logps/rejected": -165.9769744873047, + "loss": 0.2177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.308604896068573, + "rewards/margins": 2.1215713024139404, + "rewards/rejected": -2.430176019668579, + "step": 2036 + }, + { + "epoch": 0.23, + "learning_rate": 2.3306800889617228e-07, + "logits/chosen": -2.37600040435791, + "logits/rejected": -2.6390907764434814, + "logps/chosen": -320.4034729003906, + "logps/rejected": -307.2439270019531, + "loss": 0.389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6058390140533447, + "rewards/margins": 1.6207244396209717, + "rewards/rejected": -3.2265634536743164, + "step": 2037 + }, + { + "epoch": 0.23, + "learning_rate": 2.3303289242654806e-07, + "logits/chosen": -2.778308868408203, + "logits/rejected": -2.871840476989746, + "logps/chosen": -112.79014587402344, + "logps/rejected": -230.55813598632812, + "loss": 0.2212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27859970927238464, + "rewards/margins": 2.888615131378174, + "rewards/rejected": -3.167214870452881, + "step": 2038 + }, + { + "epoch": 0.24, + "learning_rate": 2.329977759569238e-07, + "logits/chosen": -2.2826619148254395, + "logits/rejected": -2.512758731842041, + "logps/chosen": -184.08099365234375, + "logps/rejected": -219.99220275878906, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.837226152420044, + "rewards/margins": 1.4426050186157227, + "rewards/rejected": -2.2798311710357666, + "step": 2039 + }, + { + "epoch": 0.24, + "learning_rate": 2.3296265948729954e-07, + "logits/chosen": -2.6827948093414307, + "logits/rejected": -2.45835542678833, + "logps/chosen": -277.99462890625, + "logps/rejected": -217.04049682617188, + "loss": 0.2485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44636380672454834, + "rewards/margins": 2.0016074180603027, + "rewards/rejected": -2.4479711055755615, + "step": 2040 + }, + { + "epoch": 0.24, + "learning_rate": 2.329275430176753e-07, + "logits/chosen": -2.6760525703430176, + "logits/rejected": -2.836432695388794, + "logps/chosen": -377.44891357421875, + "logps/rejected": -221.2686004638672, + "loss": 0.3823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8402947783470154, + "rewards/margins": 1.6055898666381836, + "rewards/rejected": -2.4458847045898438, + "step": 2041 + }, + { + "epoch": 0.24, + "learning_rate": 2.3289242654805102e-07, + "logits/chosen": -2.36391544342041, + "logits/rejected": -2.5093278884887695, + "logps/chosen": -222.2734832763672, + "logps/rejected": -376.3247985839844, + "loss": 0.242, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1995302438735962, + "rewards/margins": 2.980637788772583, + "rewards/rejected": -4.180168151855469, + "step": 2042 + }, + { + "epoch": 0.24, + "learning_rate": 2.3285731007842677e-07, + "logits/chosen": -2.5876734256744385, + "logits/rejected": -2.517984390258789, + "logps/chosen": -105.7376480102539, + "logps/rejected": -188.00547790527344, + "loss": 0.6614, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5085170269012451, + "rewards/margins": 0.8655695915222168, + "rewards/rejected": -2.374086380004883, + "step": 2043 + }, + { + "epoch": 0.24, + "learning_rate": 2.3282219360880253e-07, + "logits/chosen": -2.7603349685668945, + "logits/rejected": -2.675790786743164, + "logps/chosen": -333.260498046875, + "logps/rejected": -232.6012420654297, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32046347856521606, + "rewards/margins": 3.260402202606201, + "rewards/rejected": -3.5808656215667725, + "step": 2044 + }, + { + "epoch": 0.24, + "learning_rate": 2.3278707713917825e-07, + "logits/chosen": -1.9182002544403076, + "logits/rejected": -1.8181111812591553, + "logps/chosen": -201.50038146972656, + "logps/rejected": -289.38763427734375, + "loss": 0.4352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7607442140579224, + "rewards/margins": 2.546203851699829, + "rewards/rejected": -3.306947946548462, + "step": 2045 + }, + { + "epoch": 0.24, + "learning_rate": 2.32751960669554e-07, + "logits/chosen": -2.3982720375061035, + "logits/rejected": -2.429731845855713, + "logps/chosen": -260.5220642089844, + "logps/rejected": -295.8516540527344, + "loss": 0.7896, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1562211513519287, + "rewards/margins": 0.8470181226730347, + "rewards/rejected": -2.003239393234253, + "step": 2046 + }, + { + "epoch": 0.24, + "learning_rate": 2.3271684419992973e-07, + "logits/chosen": -2.5509495735168457, + "logits/rejected": -2.3867907524108887, + "logps/chosen": -416.5439758300781, + "logps/rejected": -531.12158203125, + "loss": 0.7089, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3120025396347046, + "rewards/margins": 0.759832501411438, + "rewards/rejected": -2.0718350410461426, + "step": 2047 + }, + { + "epoch": 0.24, + "learning_rate": 2.326817277303055e-07, + "logits/chosen": -1.890044093132019, + "logits/rejected": -1.933626413345337, + "logps/chosen": -366.7159118652344, + "logps/rejected": -281.23321533203125, + "loss": 0.2065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7397196292877197, + "rewards/margins": 1.877143383026123, + "rewards/rejected": -2.6168630123138428, + "step": 2048 + }, + { + "epoch": 0.24, + "learning_rate": 2.3264661126068127e-07, + "logits/chosen": -2.7309679985046387, + "logits/rejected": -2.7206835746765137, + "logps/chosen": -310.72198486328125, + "logps/rejected": -221.763427734375, + "loss": 0.9737, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.776484489440918, + "rewards/margins": 0.06394314765930176, + "rewards/rejected": -1.8404275178909302, + "step": 2049 + }, + { + "epoch": 0.24, + "learning_rate": 2.32611494791057e-07, + "logits/chosen": -1.7542911767959595, + "logits/rejected": -1.7589101791381836, + "logps/chosen": -261.95916748046875, + "logps/rejected": -267.0566711425781, + "loss": 0.4033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42033666372299194, + "rewards/margins": 2.925725221633911, + "rewards/rejected": -3.346061944961548, + "step": 2050 + }, + { + "epoch": 0.24, + "learning_rate": 2.3257637832143275e-07, + "logits/chosen": -2.7031397819519043, + "logits/rejected": -2.5936312675476074, + "logps/chosen": -224.62948608398438, + "logps/rejected": -264.6529541015625, + "loss": 0.3901, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.076116919517517, + "rewards/margins": 1.8821451663970947, + "rewards/rejected": -2.9582619667053223, + "step": 2051 + }, + { + "epoch": 0.24, + "learning_rate": 2.325412618518085e-07, + "logits/chosen": -2.6792945861816406, + "logits/rejected": -2.6980020999908447, + "logps/chosen": -294.56390380859375, + "logps/rejected": -225.63348388671875, + "loss": 0.5714, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3376182317733765, + "rewards/margins": 1.8962024450302124, + "rewards/rejected": -3.233820676803589, + "step": 2052 + }, + { + "epoch": 0.24, + "learning_rate": 2.3250614538218423e-07, + "logits/chosen": -2.6064059734344482, + "logits/rejected": -2.2328877449035645, + "logps/chosen": -306.9275817871094, + "logps/rejected": -294.63861083984375, + "loss": 0.6487, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3829573392868042, + "rewards/margins": 0.3196561336517334, + "rewards/rejected": -1.702613353729248, + "step": 2053 + }, + { + "epoch": 0.24, + "learning_rate": 2.3247102891255999e-07, + "logits/chosen": -2.417876720428467, + "logits/rejected": -2.5602195262908936, + "logps/chosen": -213.20265197753906, + "logps/rejected": -235.607666015625, + "loss": 0.2414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3530226945877075, + "rewards/margins": 1.9866244792938232, + "rewards/rejected": -2.3396472930908203, + "step": 2054 + }, + { + "epoch": 0.24, + "learning_rate": 2.324359124429357e-07, + "logits/chosen": -1.5474905967712402, + "logits/rejected": -2.057279109954834, + "logps/chosen": -561.7322387695312, + "logps/rejected": -345.41632080078125, + "loss": 1.9208, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4967939853668213, + "rewards/margins": -0.6867049932479858, + "rewards/rejected": -1.8100889921188354, + "step": 2055 + }, + { + "epoch": 0.24, + "learning_rate": 2.3240079597331147e-07, + "logits/chosen": -2.4599404335021973, + "logits/rejected": -2.580504894256592, + "logps/chosen": -305.04180908203125, + "logps/rejected": -369.6037292480469, + "loss": 0.2514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5875897407531738, + "rewards/margins": 2.9850046634674072, + "rewards/rejected": -3.57259464263916, + "step": 2056 + }, + { + "epoch": 0.24, + "learning_rate": 2.3236567950368722e-07, + "logits/chosen": -2.9396400451660156, + "logits/rejected": -3.0190601348876953, + "logps/chosen": -266.38482666015625, + "logps/rejected": -272.163330078125, + "loss": 0.2499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.997867226600647, + "rewards/margins": 2.728816509246826, + "rewards/rejected": -3.7266838550567627, + "step": 2057 + }, + { + "epoch": 0.24, + "learning_rate": 2.3233056303406295e-07, + "logits/chosen": -2.6821343898773193, + "logits/rejected": -2.6857070922851562, + "logps/chosen": -513.8663330078125, + "logps/rejected": -397.7135314941406, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8125686645507812, + "rewards/margins": 1.90025794506073, + "rewards/rejected": -2.712826728820801, + "step": 2058 + }, + { + "epoch": 0.24, + "learning_rate": 2.322954465644387e-07, + "logits/chosen": -2.2916927337646484, + "logits/rejected": -2.614318609237671, + "logps/chosen": -237.07044982910156, + "logps/rejected": -215.4291534423828, + "loss": 0.2323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20523639023303986, + "rewards/margins": 2.3537847995758057, + "rewards/rejected": -2.559021234512329, + "step": 2059 + }, + { + "epoch": 0.24, + "learning_rate": 2.3226033009481448e-07, + "logits/chosen": -2.552412986755371, + "logits/rejected": -2.4979443550109863, + "logps/chosen": -96.84933471679688, + "logps/rejected": -205.59060668945312, + "loss": 0.4835, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9218706488609314, + "rewards/margins": 0.8586660623550415, + "rewards/rejected": -1.7805367708206177, + "step": 2060 + }, + { + "epoch": 0.24, + "learning_rate": 2.322252136251902e-07, + "logits/chosen": -1.789186716079712, + "logits/rejected": -1.8565659523010254, + "logps/chosen": -343.2366027832031, + "logps/rejected": -348.9019470214844, + "loss": 0.9413, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7317560911178589, + "rewards/margins": 0.6849212646484375, + "rewards/rejected": -2.416677236557007, + "step": 2061 + }, + { + "epoch": 0.24, + "learning_rate": 2.3219009715556596e-07, + "logits/chosen": -2.422027349472046, + "logits/rejected": -2.3176791667938232, + "logps/chosen": -212.50083923339844, + "logps/rejected": -237.0706329345703, + "loss": 0.4071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.651311993598938, + "rewards/margins": 1.1054627895355225, + "rewards/rejected": -1.7567747831344604, + "step": 2062 + }, + { + "epoch": 0.24, + "learning_rate": 2.321549806859417e-07, + "logits/chosen": -2.7440648078918457, + "logits/rejected": -2.7969465255737305, + "logps/chosen": -202.78656005859375, + "logps/rejected": -197.65921020507812, + "loss": 0.3839, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1195645332336426, + "rewards/margins": 2.183605432510376, + "rewards/rejected": -3.3031699657440186, + "step": 2063 + }, + { + "epoch": 0.24, + "learning_rate": 2.3211986421631744e-07, + "logits/chosen": -2.581341028213501, + "logits/rejected": -2.62532114982605, + "logps/chosen": -212.75022888183594, + "logps/rejected": -224.93490600585938, + "loss": 0.6946, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2166786193847656, + "rewards/margins": 1.1537095308303833, + "rewards/rejected": -2.3703880310058594, + "step": 2064 + }, + { + "epoch": 0.24, + "learning_rate": 2.320847477466932e-07, + "logits/chosen": -2.4976043701171875, + "logits/rejected": -2.6514768600463867, + "logps/chosen": -226.36265563964844, + "logps/rejected": -229.37681579589844, + "loss": 0.389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5195232629776001, + "rewards/margins": 1.1852178573608398, + "rewards/rejected": -1.7047410011291504, + "step": 2065 + }, + { + "epoch": 0.24, + "learning_rate": 2.3204963127706893e-07, + "logits/chosen": -2.2903895378112793, + "logits/rejected": -2.372159004211426, + "logps/chosen": -389.42333984375, + "logps/rejected": -328.5714416503906, + "loss": 0.591, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7745757699012756, + "rewards/margins": 1.0372556447982788, + "rewards/rejected": -1.8118314743041992, + "step": 2066 + }, + { + "epoch": 0.24, + "learning_rate": 2.3201451480744468e-07, + "logits/chosen": -2.119826316833496, + "logits/rejected": -1.962324857711792, + "logps/chosen": -236.36143493652344, + "logps/rejected": -545.439697265625, + "loss": 0.7218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7570024728775024, + "rewards/margins": 1.3010624647140503, + "rewards/rejected": -2.0580649375915527, + "step": 2067 + }, + { + "epoch": 0.24, + "learning_rate": 2.3197939833782043e-07, + "logits/chosen": -2.260939836502075, + "logits/rejected": -1.994672179222107, + "logps/chosen": -325.42755126953125, + "logps/rejected": -334.3144836425781, + "loss": 0.2762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30078229308128357, + "rewards/margins": 2.5843777656555176, + "rewards/rejected": -2.885160207748413, + "step": 2068 + }, + { + "epoch": 0.24, + "learning_rate": 2.3194428186819616e-07, + "logits/chosen": -2.2362072467803955, + "logits/rejected": -2.3380534648895264, + "logps/chosen": -300.52801513671875, + "logps/rejected": -313.3498840332031, + "loss": 0.2044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.594745934009552, + "rewards/margins": 3.170754909515381, + "rewards/rejected": -3.765500545501709, + "step": 2069 + }, + { + "epoch": 0.24, + "learning_rate": 2.3190916539857191e-07, + "logits/chosen": -2.431251049041748, + "logits/rejected": -2.252298593521118, + "logps/chosen": -311.2998046875, + "logps/rejected": -363.5993347167969, + "loss": 0.4194, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8618488311767578, + "rewards/margins": 1.2641868591308594, + "rewards/rejected": -2.126035690307617, + "step": 2070 + }, + { + "epoch": 0.24, + "learning_rate": 2.3187404892894764e-07, + "logits/chosen": -2.276965856552124, + "logits/rejected": -2.6483383178710938, + "logps/chosen": -347.76837158203125, + "logps/rejected": -238.72230529785156, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7164626121520996, + "rewards/margins": 1.1837342977523804, + "rewards/rejected": -2.9001970291137695, + "step": 2071 + }, + { + "epoch": 0.24, + "learning_rate": 2.3183893245932342e-07, + "logits/chosen": -2.3029308319091797, + "logits/rejected": -2.5286660194396973, + "logps/chosen": -388.489501953125, + "logps/rejected": -195.45816040039062, + "loss": 0.504, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.310542345046997, + "rewards/margins": 1.6094493865966797, + "rewards/rejected": -2.919991970062256, + "step": 2072 + }, + { + "epoch": 0.24, + "learning_rate": 2.3180381598969918e-07, + "logits/chosen": -1.8831549882888794, + "logits/rejected": -2.098508358001709, + "logps/chosen": -299.55511474609375, + "logps/rejected": -274.69891357421875, + "loss": 1.3334, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.4901580810546875, + "rewards/margins": -0.5840638875961304, + "rewards/rejected": -1.906093955039978, + "step": 2073 + }, + { + "epoch": 0.24, + "learning_rate": 2.317686995200749e-07, + "logits/chosen": -2.4059009552001953, + "logits/rejected": -2.2607784271240234, + "logps/chosen": -231.588623046875, + "logps/rejected": -244.87013244628906, + "loss": 0.3132, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9634074568748474, + "rewards/margins": 2.0708556175231934, + "rewards/rejected": -3.0342628955841064, + "step": 2074 + }, + { + "epoch": 0.24, + "learning_rate": 2.3173358305045066e-07, + "logits/chosen": -2.6842703819274902, + "logits/rejected": -2.8770573139190674, + "logps/chosen": -446.5592956542969, + "logps/rejected": -214.85983276367188, + "loss": 0.4415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6115028858184814, + "rewards/margins": 1.1856800317764282, + "rewards/rejected": -1.7971830368041992, + "step": 2075 + }, + { + "epoch": 0.24, + "learning_rate": 2.316984665808264e-07, + "logits/chosen": -2.726522207260132, + "logits/rejected": -2.534874677658081, + "logps/chosen": -174.7887420654297, + "logps/rejected": -217.2879638671875, + "loss": 0.7811, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3878953456878662, + "rewards/margins": 0.9031063318252563, + "rewards/rejected": -2.291001796722412, + "step": 2076 + }, + { + "epoch": 0.24, + "learning_rate": 2.3166335011120214e-07, + "logits/chosen": -2.869041681289673, + "logits/rejected": -2.608940362930298, + "logps/chosen": -340.78460693359375, + "logps/rejected": -278.44781494140625, + "loss": 0.2825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5030579566955566, + "rewards/margins": 2.558773994445801, + "rewards/rejected": -3.0618321895599365, + "step": 2077 + }, + { + "epoch": 0.24, + "learning_rate": 2.316282336415779e-07, + "logits/chosen": -2.106515884399414, + "logits/rejected": -2.165519952774048, + "logps/chosen": -314.98779296875, + "logps/rejected": -286.674072265625, + "loss": 0.2935, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7109346985816956, + "rewards/margins": 1.8670594692230225, + "rewards/rejected": -2.5779941082000732, + "step": 2078 + }, + { + "epoch": 0.24, + "learning_rate": 2.3159311717195362e-07, + "logits/chosen": -2.4994571208953857, + "logits/rejected": -2.4951584339141846, + "logps/chosen": -123.33877563476562, + "logps/rejected": -310.8464660644531, + "loss": 0.5946, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9060128927230835, + "rewards/margins": 1.920460820198059, + "rewards/rejected": -2.8264737129211426, + "step": 2079 + }, + { + "epoch": 0.24, + "learning_rate": 2.3155800070232937e-07, + "logits/chosen": -3.043156147003174, + "logits/rejected": -2.912261724472046, + "logps/chosen": -144.26702880859375, + "logps/rejected": -125.86354064941406, + "loss": 0.91, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3024930953979492, + "rewards/margins": 0.5913704037666321, + "rewards/rejected": -1.8938636779785156, + "step": 2080 + }, + { + "epoch": 0.24, + "learning_rate": 2.3152288423270513e-07, + "logits/chosen": -1.5214011669158936, + "logits/rejected": -1.4597680568695068, + "logps/chosen": -356.3072814941406, + "logps/rejected": -352.19537353515625, + "loss": 0.6771, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.168100357055664, + "rewards/margins": 0.4443114101886749, + "rewards/rejected": -1.6124117374420166, + "step": 2081 + }, + { + "epoch": 0.24, + "learning_rate": 2.3148776776308085e-07, + "logits/chosen": -1.8701611757278442, + "logits/rejected": -2.307325839996338, + "logps/chosen": -464.8197937011719, + "logps/rejected": -304.87164306640625, + "loss": 0.3416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5626438856124878, + "rewards/margins": 1.2787227630615234, + "rewards/rejected": -1.8413667678833008, + "step": 2082 + }, + { + "epoch": 0.24, + "learning_rate": 2.3145265129345664e-07, + "logits/chosen": -2.3583357334136963, + "logits/rejected": -2.369119882583618, + "logps/chosen": -280.4969482421875, + "logps/rejected": -211.0676727294922, + "loss": 0.4743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7230374813079834, + "rewards/margins": 0.9068661332130432, + "rewards/rejected": -1.6299035549163818, + "step": 2083 + }, + { + "epoch": 0.24, + "learning_rate": 2.314175348238324e-07, + "logits/chosen": -1.975927710533142, + "logits/rejected": -2.3776769638061523, + "logps/chosen": -259.8713684082031, + "logps/rejected": -179.64744567871094, + "loss": 0.2082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39793962240219116, + "rewards/margins": 2.152679681777954, + "rewards/rejected": -1.7547401189804077, + "step": 2084 + }, + { + "epoch": 0.24, + "learning_rate": 2.3138241835420812e-07, + "logits/chosen": -2.53578519821167, + "logits/rejected": -2.789731740951538, + "logps/chosen": -185.47210693359375, + "logps/rejected": -214.30140686035156, + "loss": 0.4014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4336702823638916, + "rewards/margins": 1.2805569171905518, + "rewards/rejected": -1.7142270803451538, + "step": 2085 + }, + { + "epoch": 0.24, + "learning_rate": 2.3134730188458387e-07, + "logits/chosen": -2.139890432357788, + "logits/rejected": -2.0854690074920654, + "logps/chosen": -275.0867004394531, + "logps/rejected": -246.2828826904297, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06977781653404236, + "rewards/margins": 3.2623391151428223, + "rewards/rejected": -3.192561149597168, + "step": 2086 + }, + { + "epoch": 0.24, + "learning_rate": 2.313121854149596e-07, + "logits/chosen": -1.4889240264892578, + "logits/rejected": -1.5879204273223877, + "logps/chosen": -453.2201843261719, + "logps/rejected": -342.4081726074219, + "loss": 0.1109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34574270248413086, + "rewards/margins": 2.776885986328125, + "rewards/rejected": -3.122628688812256, + "step": 2087 + }, + { + "epoch": 0.24, + "learning_rate": 2.3127706894533535e-07, + "logits/chosen": -2.279474973678589, + "logits/rejected": -2.3617618083953857, + "logps/chosen": -222.66346740722656, + "logps/rejected": -200.31166076660156, + "loss": 0.3684, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0853629112243652, + "rewards/margins": 2.5398306846618652, + "rewards/rejected": -4.6251935958862305, + "step": 2088 + }, + { + "epoch": 0.24, + "learning_rate": 2.312419524757111e-07, + "logits/chosen": -2.0199954509735107, + "logits/rejected": -2.073617458343506, + "logps/chosen": -289.8877258300781, + "logps/rejected": -448.46343994140625, + "loss": 0.2156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7704249620437622, + "rewards/margins": 2.961160182952881, + "rewards/rejected": -3.7315850257873535, + "step": 2089 + }, + { + "epoch": 0.24, + "learning_rate": 2.3120683600608683e-07, + "logits/chosen": -2.447052478790283, + "logits/rejected": -2.586151599884033, + "logps/chosen": -332.45135498046875, + "logps/rejected": -278.8399658203125, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.099564552307129, + "rewards/margins": 2.1727705001831055, + "rewards/rejected": -3.2723350524902344, + "step": 2090 + }, + { + "epoch": 0.24, + "learning_rate": 2.3117171953646259e-07, + "logits/chosen": -2.3496949672698975, + "logits/rejected": -2.1483447551727295, + "logps/chosen": -184.26402282714844, + "logps/rejected": -259.50030517578125, + "loss": 0.4516, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43569689989089966, + "rewards/margins": 2.244008779525757, + "rewards/rejected": -2.6797056198120117, + "step": 2091 + }, + { + "epoch": 0.24, + "learning_rate": 2.3113660306683831e-07, + "logits/chosen": -2.2584421634674072, + "logits/rejected": -2.0580320358276367, + "logps/chosen": -184.95413208007812, + "logps/rejected": -252.0009002685547, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47169554233551025, + "rewards/margins": 1.8446511030197144, + "rewards/rejected": -2.3163464069366455, + "step": 2092 + }, + { + "epoch": 0.24, + "learning_rate": 2.3110148659721407e-07, + "logits/chosen": -2.1412241458892822, + "logits/rejected": -2.280299186706543, + "logps/chosen": -241.42813110351562, + "logps/rejected": -261.9281005859375, + "loss": 0.7026, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5223655700683594, + "rewards/margins": 1.319974422454834, + "rewards/rejected": -2.8423402309417725, + "step": 2093 + }, + { + "epoch": 0.24, + "learning_rate": 2.3106637012758985e-07, + "logits/chosen": -2.016657590866089, + "logits/rejected": -2.123046875, + "logps/chosen": -145.28379821777344, + "logps/rejected": -161.1443328857422, + "loss": 0.43, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5105044841766357, + "rewards/margins": 1.1639925241470337, + "rewards/rejected": -1.6744970083236694, + "step": 2094 + }, + { + "epoch": 0.24, + "learning_rate": 2.3103125365796558e-07, + "logits/chosen": -2.566293716430664, + "logits/rejected": -2.479867458343506, + "logps/chosen": -231.1259765625, + "logps/rejected": -322.29388427734375, + "loss": 0.5548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5726456046104431, + "rewards/margins": 1.3355894088745117, + "rewards/rejected": -1.9082350730895996, + "step": 2095 + }, + { + "epoch": 0.24, + "learning_rate": 2.3099613718834133e-07, + "logits/chosen": -2.3636951446533203, + "logits/rejected": -2.6183528900146484, + "logps/chosen": -413.4312744140625, + "logps/rejected": -252.80638122558594, + "loss": 0.2098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46879836916923523, + "rewards/margins": 2.189828395843506, + "rewards/rejected": -2.6586265563964844, + "step": 2096 + }, + { + "epoch": 0.24, + "learning_rate": 2.3096102071871708e-07, + "logits/chosen": -2.814319372177124, + "logits/rejected": -2.7884953022003174, + "logps/chosen": -405.660400390625, + "logps/rejected": -410.98114013671875, + "loss": 0.3444, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6264143586158752, + "rewards/margins": 2.7561750411987305, + "rewards/rejected": -3.382589340209961, + "step": 2097 + }, + { + "epoch": 0.24, + "learning_rate": 2.309259042490928e-07, + "logits/chosen": -2.282388925552368, + "logits/rejected": -2.2469842433929443, + "logps/chosen": -204.1607666015625, + "logps/rejected": -279.9303894042969, + "loss": 0.329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.583763837814331, + "rewards/margins": 2.5438079833984375, + "rewards/rejected": -3.1275720596313477, + "step": 2098 + }, + { + "epoch": 0.24, + "learning_rate": 2.3089078777946856e-07, + "logits/chosen": -2.6545562744140625, + "logits/rejected": -2.42716646194458, + "logps/chosen": -204.04251098632812, + "logps/rejected": -308.3486633300781, + "loss": 0.4094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5022817850112915, + "rewards/margins": 1.1027623414993286, + "rewards/rejected": -1.6050441265106201, + "step": 2099 + }, + { + "epoch": 0.24, + "learning_rate": 2.308556713098443e-07, + "logits/chosen": -2.2975974082946777, + "logits/rejected": -2.4768004417419434, + "logps/chosen": -270.687744140625, + "logps/rejected": -256.20428466796875, + "loss": 0.4147, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.552063226699829, + "rewards/margins": 2.7174344062805176, + "rewards/rejected": -4.269497871398926, + "step": 2100 + }, + { + "epoch": 0.24, + "learning_rate": 2.3082055484022005e-07, + "logits/chosen": -2.2653183937072754, + "logits/rejected": -2.14664363861084, + "logps/chosen": -212.6588134765625, + "logps/rejected": -336.67340087890625, + "loss": 0.1147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0817475318908691, + "rewards/margins": 3.7562460899353027, + "rewards/rejected": -4.837993621826172, + "step": 2101 + }, + { + "epoch": 0.24, + "learning_rate": 2.307854383705958e-07, + "logits/chosen": -1.804966688156128, + "logits/rejected": -1.8164665699005127, + "logps/chosen": -245.80514526367188, + "logps/rejected": -325.1024169921875, + "loss": 0.482, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8485467433929443, + "rewards/margins": 2.4191298484802246, + "rewards/rejected": -3.267676591873169, + "step": 2102 + }, + { + "epoch": 0.24, + "learning_rate": 2.3075032190097153e-07, + "logits/chosen": -1.9140384197235107, + "logits/rejected": -1.6792799234390259, + "logps/chosen": -258.35540771484375, + "logps/rejected": -380.22698974609375, + "loss": 0.3589, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8819519281387329, + "rewards/margins": 1.8299620151519775, + "rewards/rejected": -2.7119140625, + "step": 2103 + }, + { + "epoch": 0.24, + "learning_rate": 2.3071520543134728e-07, + "logits/chosen": -2.31289005279541, + "logits/rejected": -2.295884370803833, + "logps/chosen": -260.166748046875, + "logps/rejected": -259.4739685058594, + "loss": 0.4504, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.627981424331665, + "rewards/margins": 2.2685980796813965, + "rewards/rejected": -2.8965792655944824, + "step": 2104 + }, + { + "epoch": 0.24, + "learning_rate": 2.3068008896172306e-07, + "logits/chosen": -2.0031585693359375, + "logits/rejected": -2.184422016143799, + "logps/chosen": -207.90359497070312, + "logps/rejected": -177.83180236816406, + "loss": 0.3196, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.791415810585022, + "rewards/margins": 1.31174898147583, + "rewards/rejected": -2.1031646728515625, + "step": 2105 + }, + { + "epoch": 0.24, + "learning_rate": 2.306449724920988e-07, + "logits/chosen": -2.1581358909606934, + "logits/rejected": -2.005885124206543, + "logps/chosen": -299.4461975097656, + "logps/rejected": -247.80307006835938, + "loss": 0.3619, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4568287134170532, + "rewards/margins": 1.4149315357208252, + "rewards/rejected": -2.871760129928589, + "step": 2106 + }, + { + "epoch": 0.24, + "learning_rate": 2.3060985602247454e-07, + "logits/chosen": -2.6625893115997314, + "logits/rejected": -2.6291704177856445, + "logps/chosen": -324.05035400390625, + "logps/rejected": -584.6098022460938, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7054082155227661, + "rewards/margins": 3.3572611808776855, + "rewards/rejected": -4.062669277191162, + "step": 2107 + }, + { + "epoch": 0.24, + "learning_rate": 2.3057473955285027e-07, + "logits/chosen": -2.4224889278411865, + "logits/rejected": -2.216273069381714, + "logps/chosen": -326.57635498046875, + "logps/rejected": -321.1375732421875, + "loss": 0.5978, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.008002758026123, + "rewards/margins": 0.8634126782417297, + "rewards/rejected": -1.871415615081787, + "step": 2108 + }, + { + "epoch": 0.24, + "learning_rate": 2.3053962308322602e-07, + "logits/chosen": -2.1006484031677246, + "logits/rejected": -1.8500256538391113, + "logps/chosen": -244.46820068359375, + "logps/rejected": -278.4122314453125, + "loss": 0.2912, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7424764037132263, + "rewards/margins": 2.024942398071289, + "rewards/rejected": -2.76741886138916, + "step": 2109 + }, + { + "epoch": 0.24, + "learning_rate": 2.3050450661360178e-07, + "logits/chosen": -2.2432472705841064, + "logits/rejected": -2.246281623840332, + "logps/chosen": -129.9912872314453, + "logps/rejected": -193.57192993164062, + "loss": 0.7742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3064398765563965, + "rewards/margins": 1.0101935863494873, + "rewards/rejected": -2.316633701324463, + "step": 2110 + }, + { + "epoch": 0.24, + "learning_rate": 2.304693901439775e-07, + "logits/chosen": -2.823352575302124, + "logits/rejected": -2.9282965660095215, + "logps/chosen": -153.49868774414062, + "logps/rejected": -163.3184814453125, + "loss": 0.419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4129364490509033, + "rewards/margins": 0.8772532343864441, + "rewards/rejected": -1.2901897430419922, + "step": 2111 + }, + { + "epoch": 0.24, + "learning_rate": 2.3043427367435326e-07, + "logits/chosen": -2.099560260772705, + "logits/rejected": -2.0211877822875977, + "logps/chosen": -185.35879516601562, + "logps/rejected": -218.4176025390625, + "loss": 0.3856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7238250970840454, + "rewards/margins": 1.3459022045135498, + "rewards/rejected": -2.0697274208068848, + "step": 2112 + }, + { + "epoch": 0.24, + "learning_rate": 2.30399157204729e-07, + "logits/chosen": -2.6121327877044678, + "logits/rejected": -2.4001364707946777, + "logps/chosen": -115.26852416992188, + "logps/rejected": -230.12515258789062, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2944125831127167, + "rewards/margins": 2.5428035259246826, + "rewards/rejected": -2.8372161388397217, + "step": 2113 + }, + { + "epoch": 0.24, + "learning_rate": 2.3036404073510474e-07, + "logits/chosen": -1.6540617942810059, + "logits/rejected": -2.2547309398651123, + "logps/chosen": -273.4012451171875, + "logps/rejected": -246.07997131347656, + "loss": 1.5099, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.307286262512207, + "rewards/margins": 0.11977410316467285, + "rewards/rejected": -2.42706036567688, + "step": 2114 + }, + { + "epoch": 0.24, + "learning_rate": 2.303289242654805e-07, + "logits/chosen": -2.5445637702941895, + "logits/rejected": -2.598358392715454, + "logps/chosen": -307.6643371582031, + "logps/rejected": -305.31732177734375, + "loss": 0.3143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3874647617340088, + "rewards/margins": 2.9252240657806396, + "rewards/rejected": -3.3126888275146484, + "step": 2115 + }, + { + "epoch": 0.24, + "learning_rate": 2.3029380779585622e-07, + "logits/chosen": -3.0256876945495605, + "logits/rejected": -2.9019815921783447, + "logps/chosen": -197.33877563476562, + "logps/rejected": -176.56927490234375, + "loss": 0.7541, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6152705550193787, + "rewards/margins": 2.352562427520752, + "rewards/rejected": -2.9678330421447754, + "step": 2116 + }, + { + "epoch": 0.24, + "learning_rate": 2.30258691326232e-07, + "logits/chosen": -2.4848718643188477, + "logits/rejected": -2.3859472274780273, + "logps/chosen": -423.62408447265625, + "logps/rejected": -332.81982421875, + "loss": 0.5011, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0064294338226318, + "rewards/margins": 2.1442201137542725, + "rewards/rejected": -3.1506495475769043, + "step": 2117 + }, + { + "epoch": 0.24, + "learning_rate": 2.3022357485660776e-07, + "logits/chosen": -2.1736981868743896, + "logits/rejected": -2.4135968685150146, + "logps/chosen": -283.55096435546875, + "logps/rejected": -214.8936767578125, + "loss": 0.4553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.89214026927948, + "rewards/margins": 1.2071549892425537, + "rewards/rejected": -2.0992953777313232, + "step": 2118 + }, + { + "epoch": 0.24, + "learning_rate": 2.3018845838698348e-07, + "logits/chosen": -2.633059024810791, + "logits/rejected": -2.6250696182250977, + "logps/chosen": -305.02520751953125, + "logps/rejected": -260.8602294921875, + "loss": 0.4996, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3863584995269775, + "rewards/margins": 1.8958349227905273, + "rewards/rejected": -3.282193422317505, + "step": 2119 + }, + { + "epoch": 0.24, + "learning_rate": 2.3015334191735924e-07, + "logits/chosen": -2.0331573486328125, + "logits/rejected": -2.3348286151885986, + "logps/chosen": -366.939697265625, + "logps/rejected": -338.7797546386719, + "loss": 0.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0473852157592773, + "rewards/margins": 2.6087756156921387, + "rewards/rejected": -3.656160831451416, + "step": 2120 + }, + { + "epoch": 0.24, + "learning_rate": 2.30118225447735e-07, + "logits/chosen": -2.7266321182250977, + "logits/rejected": -2.6116387844085693, + "logps/chosen": -199.89620971679688, + "logps/rejected": -307.0933532714844, + "loss": 1.2248, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.457505226135254, + "rewards/margins": 0.369192510843277, + "rewards/rejected": -1.826697587966919, + "step": 2121 + }, + { + "epoch": 0.24, + "learning_rate": 2.3008310897811072e-07, + "logits/chosen": -1.988570213317871, + "logits/rejected": -1.6650278568267822, + "logps/chosen": -361.4005432128906, + "logps/rejected": -418.1929931640625, + "loss": 0.4012, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5312680602073669, + "rewards/margins": 2.0936641693115234, + "rewards/rejected": -2.624932289123535, + "step": 2122 + }, + { + "epoch": 0.24, + "learning_rate": 2.3004799250848647e-07, + "logits/chosen": -2.2324063777923584, + "logits/rejected": -2.372621536254883, + "logps/chosen": -252.10423278808594, + "logps/rejected": -334.24554443359375, + "loss": 0.5625, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.859520673751831, + "rewards/margins": 1.6936100721359253, + "rewards/rejected": -2.553130865097046, + "step": 2123 + }, + { + "epoch": 0.24, + "learning_rate": 2.300128760388622e-07, + "logits/chosen": -1.5897985696792603, + "logits/rejected": -1.7709054946899414, + "logps/chosen": -324.63372802734375, + "logps/rejected": -301.4697265625, + "loss": 0.1748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26489248871803284, + "rewards/margins": 1.969963550567627, + "rewards/rejected": -2.234856128692627, + "step": 2124 + }, + { + "epoch": 0.24, + "learning_rate": 2.2997775956923795e-07, + "logits/chosen": -2.634610176086426, + "logits/rejected": -2.927626132965088, + "logps/chosen": -159.97299194335938, + "logps/rejected": -196.91722106933594, + "loss": 0.0873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1861828863620758, + "rewards/margins": 3.04480242729187, + "rewards/rejected": -3.230985403060913, + "step": 2125 + }, + { + "epoch": 0.25, + "learning_rate": 2.2994264309961373e-07, + "logits/chosen": -2.1752426624298096, + "logits/rejected": -2.2387938499450684, + "logps/chosen": -256.4096374511719, + "logps/rejected": -186.1280517578125, + "loss": 0.6183, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.878975510597229, + "rewards/margins": 0.6278867721557617, + "rewards/rejected": -1.5068622827529907, + "step": 2126 + }, + { + "epoch": 0.25, + "learning_rate": 2.2990752662998943e-07, + "logits/chosen": -2.9769818782806396, + "logits/rejected": -2.8627548217773438, + "logps/chosen": -321.88909912109375, + "logps/rejected": -230.29592895507812, + "loss": 0.203, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3672430515289307, + "rewards/margins": 2.4349870681762695, + "rewards/rejected": -3.8022303581237793, + "step": 2127 + }, + { + "epoch": 0.25, + "learning_rate": 2.2987241016036521e-07, + "logits/chosen": -2.5951969623565674, + "logits/rejected": -2.3110008239746094, + "logps/chosen": -111.15687561035156, + "logps/rejected": -200.88768005371094, + "loss": 0.2, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37778931856155396, + "rewards/margins": 2.3377580642700195, + "rewards/rejected": -2.7155473232269287, + "step": 2128 + }, + { + "epoch": 0.25, + "learning_rate": 2.2983729369074097e-07, + "logits/chosen": -2.5659382343292236, + "logits/rejected": -2.4258546829223633, + "logps/chosen": -187.66534423828125, + "logps/rejected": -275.92889404296875, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1855487823486328, + "rewards/margins": 2.4834563732147217, + "rewards/rejected": -3.6690051555633545, + "step": 2129 + }, + { + "epoch": 0.25, + "learning_rate": 2.298021772211167e-07, + "logits/chosen": -2.7656664848327637, + "logits/rejected": -2.671539783477783, + "logps/chosen": -166.08444213867188, + "logps/rejected": -219.959228515625, + "loss": 0.2857, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09244897961616516, + "rewards/margins": 2.5177817344665527, + "rewards/rejected": -2.6102306842803955, + "step": 2130 + }, + { + "epoch": 0.25, + "learning_rate": 2.2976706075149245e-07, + "logits/chosen": -2.431182861328125, + "logits/rejected": -2.4999618530273438, + "logps/chosen": -359.16070556640625, + "logps/rejected": -169.10000610351562, + "loss": 1.3686, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.534382939338684, + "rewards/margins": -0.502703070640564, + "rewards/rejected": -1.0316798686981201, + "step": 2131 + }, + { + "epoch": 0.25, + "learning_rate": 2.2973194428186818e-07, + "logits/chosen": -1.8438916206359863, + "logits/rejected": -1.898488998413086, + "logps/chosen": -264.24237060546875, + "logps/rejected": -504.3713684082031, + "loss": 0.363, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1459095478057861, + "rewards/margins": 2.7132575511932373, + "rewards/rejected": -3.8591668605804443, + "step": 2132 + }, + { + "epoch": 0.25, + "learning_rate": 2.2969682781224393e-07, + "logits/chosen": -2.559990644454956, + "logits/rejected": -2.440889596939087, + "logps/chosen": -151.61167907714844, + "logps/rejected": -219.76800537109375, + "loss": 0.2719, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4510394334793091, + "rewards/margins": 2.117908477783203, + "rewards/rejected": -2.5689477920532227, + "step": 2133 + }, + { + "epoch": 0.25, + "learning_rate": 2.2966171134261968e-07, + "logits/chosen": -1.8160655498504639, + "logits/rejected": -1.7539594173431396, + "logps/chosen": -345.2242736816406, + "logps/rejected": -355.834228515625, + "loss": 0.4812, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5019738674163818, + "rewards/margins": 1.2838658094406128, + "rewards/rejected": -1.785839557647705, + "step": 2134 + }, + { + "epoch": 0.25, + "learning_rate": 2.296265948729954e-07, + "logits/chosen": -2.96675181388855, + "logits/rejected": -2.8878817558288574, + "logps/chosen": -193.98500061035156, + "logps/rejected": -198.00497436523438, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2627636194229126, + "rewards/margins": 2.81650972366333, + "rewards/rejected": -4.079273700714111, + "step": 2135 + }, + { + "epoch": 0.25, + "learning_rate": 2.2959147840337117e-07, + "logits/chosen": -2.0566866397857666, + "logits/rejected": -2.3594532012939453, + "logps/chosen": -302.9581604003906, + "logps/rejected": -277.4969482421875, + "loss": 0.2709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7662238478660583, + "rewards/margins": 2.4707019329071045, + "rewards/rejected": -3.2369258403778076, + "step": 2136 + }, + { + "epoch": 0.25, + "learning_rate": 2.295563619337469e-07, + "logits/chosen": -2.2696008682250977, + "logits/rejected": -2.3720805644989014, + "logps/chosen": -486.67535400390625, + "logps/rejected": -313.2351989746094, + "loss": 0.4946, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0846604108810425, + "rewards/margins": 1.0144519805908203, + "rewards/rejected": -2.0991125106811523, + "step": 2137 + }, + { + "epoch": 0.25, + "learning_rate": 2.2952124546412265e-07, + "logits/chosen": -2.156534433364868, + "logits/rejected": -2.314697742462158, + "logps/chosen": -257.9364013671875, + "logps/rejected": -220.67933654785156, + "loss": 0.712, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3523802757263184, + "rewards/margins": 0.6178972721099854, + "rewards/rejected": -1.9702774286270142, + "step": 2138 + }, + { + "epoch": 0.25, + "learning_rate": 2.2948612899449843e-07, + "logits/chosen": -2.5637588500976562, + "logits/rejected": -2.4644200801849365, + "logps/chosen": -340.2685241699219, + "logps/rejected": -258.4241027832031, + "loss": 0.435, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2321381568908691, + "rewards/margins": 1.1444960832595825, + "rewards/rejected": -2.376634359359741, + "step": 2139 + }, + { + "epoch": 0.25, + "learning_rate": 2.2945101252487415e-07, + "logits/chosen": -2.00388503074646, + "logits/rejected": -2.2069931030273438, + "logps/chosen": -231.91558837890625, + "logps/rejected": -187.88902282714844, + "loss": 0.8128, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4369027614593506, + "rewards/margins": 0.6287804245948792, + "rewards/rejected": -2.065683126449585, + "step": 2140 + }, + { + "epoch": 0.25, + "learning_rate": 2.294158960552499e-07, + "logits/chosen": -2.4953508377075195, + "logits/rejected": -2.608699321746826, + "logps/chosen": -294.0816955566406, + "logps/rejected": -211.91636657714844, + "loss": 0.5086, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.795653223991394, + "rewards/margins": 2.9891223907470703, + "rewards/rejected": -3.784775733947754, + "step": 2141 + }, + { + "epoch": 0.25, + "learning_rate": 2.2938077958562566e-07, + "logits/chosen": -2.9263665676116943, + "logits/rejected": -2.7941975593566895, + "logps/chosen": -241.4775390625, + "logps/rejected": -295.2959899902344, + "loss": 0.3579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8574913144111633, + "rewards/margins": 1.813427209854126, + "rewards/rejected": -2.6709184646606445, + "step": 2142 + }, + { + "epoch": 0.25, + "learning_rate": 2.293456631160014e-07, + "logits/chosen": -2.1165032386779785, + "logits/rejected": -2.2451529502868652, + "logps/chosen": -251.24478149414062, + "logps/rejected": -234.64808654785156, + "loss": 0.5211, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7403535842895508, + "rewards/margins": 0.9946044683456421, + "rewards/rejected": -1.7349581718444824, + "step": 2143 + }, + { + "epoch": 0.25, + "learning_rate": 2.2931054664637714e-07, + "logits/chosen": -2.219383478164673, + "logits/rejected": -2.0386950969696045, + "logps/chosen": -651.148193359375, + "logps/rejected": -540.8717041015625, + "loss": 0.4739, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.270650625228882, + "rewards/margins": 1.3032728433609009, + "rewards/rejected": -3.5739235877990723, + "step": 2144 + }, + { + "epoch": 0.25, + "learning_rate": 2.2927543017675287e-07, + "logits/chosen": -1.7448681592941284, + "logits/rejected": -2.0208396911621094, + "logps/chosen": -360.540283203125, + "logps/rejected": -322.985595703125, + "loss": 0.6559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.724504292011261, + "rewards/margins": 1.11068594455719, + "rewards/rejected": -1.8351902961730957, + "step": 2145 + }, + { + "epoch": 0.25, + "learning_rate": 2.2924031370712863e-07, + "logits/chosen": -2.2468819618225098, + "logits/rejected": -2.314243793487549, + "logps/chosen": -166.72467041015625, + "logps/rejected": -136.14321899414062, + "loss": 0.614, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2251262664794922, + "rewards/margins": 1.2828034162521362, + "rewards/rejected": -2.507929801940918, + "step": 2146 + }, + { + "epoch": 0.25, + "learning_rate": 2.2920519723750438e-07, + "logits/chosen": -2.890131950378418, + "logits/rejected": -2.9648475646972656, + "logps/chosen": -133.8203887939453, + "logps/rejected": -210.58673095703125, + "loss": 0.2179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0498781204223633, + "rewards/margins": 1.7171906232833862, + "rewards/rejected": -2.767068862915039, + "step": 2147 + }, + { + "epoch": 0.25, + "learning_rate": 2.291700807678801e-07, + "logits/chosen": -2.5876944065093994, + "logits/rejected": -2.6854097843170166, + "logps/chosen": -323.89031982421875, + "logps/rejected": -237.5315704345703, + "loss": 0.1537, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4093825817108154, + "rewards/margins": 2.912339687347412, + "rewards/rejected": -4.321722030639648, + "step": 2148 + }, + { + "epoch": 0.25, + "learning_rate": 2.2913496429825586e-07, + "logits/chosen": -2.526411533355713, + "logits/rejected": -2.4868125915527344, + "logps/chosen": -370.7405700683594, + "logps/rejected": -338.829345703125, + "loss": 0.3117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7842663526535034, + "rewards/margins": 3.0689711570739746, + "rewards/rejected": -3.8532373905181885, + "step": 2149 + }, + { + "epoch": 0.25, + "learning_rate": 2.2909984782863164e-07, + "logits/chosen": -1.8433563709259033, + "logits/rejected": -1.7350354194641113, + "logps/chosen": -305.3700866699219, + "logps/rejected": -427.8119201660156, + "loss": 0.3926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6144111156463623, + "rewards/margins": 2.102290391921997, + "rewards/rejected": -2.7167015075683594, + "step": 2150 + }, + { + "epoch": 0.25, + "learning_rate": 2.2906473135900737e-07, + "logits/chosen": -2.37888240814209, + "logits/rejected": -2.585021734237671, + "logps/chosen": -268.1743469238281, + "logps/rejected": -224.16445922851562, + "loss": 0.4165, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31060466170310974, + "rewards/margins": 1.676125168800354, + "rewards/rejected": -1.9867298603057861, + "step": 2151 + }, + { + "epoch": 0.25, + "learning_rate": 2.2902961488938312e-07, + "logits/chosen": -2.489877223968506, + "logits/rejected": -2.2091851234436035, + "logps/chosen": -146.8065185546875, + "logps/rejected": -188.99969482421875, + "loss": 0.5826, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1511781215667725, + "rewards/margins": 0.7608761787414551, + "rewards/rejected": -1.912054419517517, + "step": 2152 + }, + { + "epoch": 0.25, + "learning_rate": 2.2899449841975885e-07, + "logits/chosen": -2.7740306854248047, + "logits/rejected": -2.797565221786499, + "logps/chosen": -163.9853515625, + "logps/rejected": -374.656005859375, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8057582974433899, + "rewards/margins": 3.892406463623047, + "rewards/rejected": -4.698164939880371, + "step": 2153 + }, + { + "epoch": 0.25, + "learning_rate": 2.289593819501346e-07, + "logits/chosen": -2.6651418209075928, + "logits/rejected": -2.5458855628967285, + "logps/chosen": -353.21954345703125, + "logps/rejected": -219.3905487060547, + "loss": 0.3244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4204689860343933, + "rewards/margins": 2.2069427967071533, + "rewards/rejected": -2.6274118423461914, + "step": 2154 + }, + { + "epoch": 0.25, + "learning_rate": 2.2892426548051036e-07, + "logits/chosen": -2.331178665161133, + "logits/rejected": -2.2882087230682373, + "logps/chosen": -391.15240478515625, + "logps/rejected": -430.4087219238281, + "loss": 0.3194, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.766477644443512, + "rewards/margins": 1.7934269905090332, + "rewards/rejected": -2.5599045753479004, + "step": 2155 + }, + { + "epoch": 0.25, + "learning_rate": 2.2888914901088608e-07, + "logits/chosen": -2.5624523162841797, + "logits/rejected": -2.600701093673706, + "logps/chosen": -260.4036865234375, + "logps/rejected": -309.06402587890625, + "loss": 0.181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.676392138004303, + "rewards/margins": 3.2037923336029053, + "rewards/rejected": -3.8801846504211426, + "step": 2156 + }, + { + "epoch": 0.25, + "learning_rate": 2.2885403254126184e-07, + "logits/chosen": -2.29673433303833, + "logits/rejected": -2.325726270675659, + "logps/chosen": -407.91412353515625, + "logps/rejected": -265.1788330078125, + "loss": 0.7831, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4366540908813477, + "rewards/margins": 0.7861186265945435, + "rewards/rejected": -2.2227725982666016, + "step": 2157 + }, + { + "epoch": 0.25, + "learning_rate": 2.288189160716376e-07, + "logits/chosen": -2.5164120197296143, + "logits/rejected": -2.4538967609405518, + "logps/chosen": -190.43289184570312, + "logps/rejected": -287.2008056640625, + "loss": 0.3235, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23897600173950195, + "rewards/margins": 1.9959158897399902, + "rewards/rejected": -2.234891891479492, + "step": 2158 + }, + { + "epoch": 0.25, + "learning_rate": 2.2878379960201332e-07, + "logits/chosen": -2.320096015930176, + "logits/rejected": -2.2617030143737793, + "logps/chosen": -315.0549621582031, + "logps/rejected": -335.3157043457031, + "loss": 0.3749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.846903383731842, + "rewards/margins": 1.740452766418457, + "rewards/rejected": -2.5873563289642334, + "step": 2159 + }, + { + "epoch": 0.25, + "learning_rate": 2.287486831323891e-07, + "logits/chosen": -2.0706288814544678, + "logits/rejected": -2.377373456954956, + "logps/chosen": -153.08383178710938, + "logps/rejected": -172.8192901611328, + "loss": 1.8085, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.3096330165863037, + "rewards/margins": 0.08087730407714844, + "rewards/rejected": -2.390510320663452, + "step": 2160 + }, + { + "epoch": 0.25, + "learning_rate": 2.287135666627648e-07, + "logits/chosen": -2.3419995307922363, + "logits/rejected": -2.3792362213134766, + "logps/chosen": -451.536865234375, + "logps/rejected": -278.18511962890625, + "loss": 0.1419, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04309183731675148, + "rewards/margins": 2.9720969200134277, + "rewards/rejected": -2.9290053844451904, + "step": 2161 + }, + { + "epoch": 0.25, + "learning_rate": 2.2867845019314058e-07, + "logits/chosen": -2.2131614685058594, + "logits/rejected": -2.234508991241455, + "logps/chosen": -568.9154663085938, + "logps/rejected": -409.59844970703125, + "loss": 0.3884, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.823638916015625, + "rewards/margins": 2.079824447631836, + "rewards/rejected": -2.90346360206604, + "step": 2162 + }, + { + "epoch": 0.25, + "learning_rate": 2.2864333372351633e-07, + "logits/chosen": -2.2687594890594482, + "logits/rejected": -2.3336355686187744, + "logps/chosen": -237.43016052246094, + "logps/rejected": -309.6058349609375, + "loss": 0.2013, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.057565394788980484, + "rewards/margins": 3.5137808322906494, + "rewards/rejected": -3.4562153816223145, + "step": 2163 + }, + { + "epoch": 0.25, + "learning_rate": 2.2860821725389206e-07, + "logits/chosen": -2.778045415878296, + "logits/rejected": -2.648719310760498, + "logps/chosen": -371.699951171875, + "logps/rejected": -275.727783203125, + "loss": 0.2741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9863066673278809, + "rewards/margins": 2.0297799110412598, + "rewards/rejected": -3.0160863399505615, + "step": 2164 + }, + { + "epoch": 0.25, + "learning_rate": 2.2857310078426782e-07, + "logits/chosen": -2.6467537879943848, + "logits/rejected": -2.775279998779297, + "logps/chosen": -372.4508056640625, + "logps/rejected": -132.208251953125, + "loss": 0.562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.633305549621582, + "rewards/margins": 0.5468835830688477, + "rewards/rejected": -2.1801891326904297, + "step": 2165 + }, + { + "epoch": 0.25, + "learning_rate": 2.2853798431464357e-07, + "logits/chosen": -2.280777931213379, + "logits/rejected": -2.518096446990967, + "logps/chosen": -417.097412109375, + "logps/rejected": -320.51141357421875, + "loss": 0.4898, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2546709775924683, + "rewards/margins": 1.6476610898971558, + "rewards/rejected": -2.902332067489624, + "step": 2166 + }, + { + "epoch": 0.25, + "learning_rate": 2.285028678450193e-07, + "logits/chosen": -2.414416790008545, + "logits/rejected": -2.5517380237579346, + "logps/chosen": -253.46466064453125, + "logps/rejected": -346.93585205078125, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19261577725410461, + "rewards/margins": 4.373015403747559, + "rewards/rejected": -4.56563138961792, + "step": 2167 + }, + { + "epoch": 0.25, + "learning_rate": 2.2846775137539505e-07, + "logits/chosen": -1.995978593826294, + "logits/rejected": -2.284385919570923, + "logps/chosen": -217.027587890625, + "logps/rejected": -199.35423278808594, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19793012738227844, + "rewards/margins": 2.5905239582061768, + "rewards/rejected": -2.788454055786133, + "step": 2168 + }, + { + "epoch": 0.25, + "learning_rate": 2.2843263490577078e-07, + "logits/chosen": -2.710930347442627, + "logits/rejected": -2.566281795501709, + "logps/chosen": -395.1485595703125, + "logps/rejected": -327.1627197265625, + "loss": 0.1824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2915126383304596, + "rewards/margins": 2.3080058097839355, + "rewards/rejected": -2.599518299102783, + "step": 2169 + }, + { + "epoch": 0.25, + "learning_rate": 2.2839751843614653e-07, + "logits/chosen": -2.4664440155029297, + "logits/rejected": -2.699550151824951, + "logps/chosen": -236.91348266601562, + "logps/rejected": -159.5465545654297, + "loss": 0.3609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47279971837997437, + "rewards/margins": 1.237705111503601, + "rewards/rejected": -1.7105047702789307, + "step": 2170 + }, + { + "epoch": 0.25, + "learning_rate": 2.283624019665223e-07, + "logits/chosen": -2.741270065307617, + "logits/rejected": -2.9558050632476807, + "logps/chosen": -341.7490539550781, + "logps/rejected": -245.62771606445312, + "loss": 0.3317, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2481052875518799, + "rewards/margins": 1.4011597633361816, + "rewards/rejected": -2.6492652893066406, + "step": 2171 + }, + { + "epoch": 0.25, + "learning_rate": 2.2832728549689801e-07, + "logits/chosen": -2.686901092529297, + "logits/rejected": -2.7992939949035645, + "logps/chosen": -181.1145782470703, + "logps/rejected": -338.56243896484375, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2670942544937134, + "rewards/margins": 3.4106240272521973, + "rewards/rejected": -3.677718162536621, + "step": 2172 + }, + { + "epoch": 0.25, + "learning_rate": 2.282921690272738e-07, + "logits/chosen": -1.9996020793914795, + "logits/rejected": -1.9335988759994507, + "logps/chosen": -420.49554443359375, + "logps/rejected": -381.27410888671875, + "loss": 0.1776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32942765951156616, + "rewards/margins": 2.1244654655456543, + "rewards/rejected": -2.4538931846618652, + "step": 2173 + }, + { + "epoch": 0.25, + "learning_rate": 2.2825705255764955e-07, + "logits/chosen": -2.5702621936798096, + "logits/rejected": -2.5736498832702637, + "logps/chosen": -317.3036804199219, + "logps/rejected": -297.6592712402344, + "loss": 0.9239, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.47245192527771, + "rewards/margins": 1.0736494064331055, + "rewards/rejected": -2.5461010932922363, + "step": 2174 + }, + { + "epoch": 0.25, + "learning_rate": 2.2822193608802528e-07, + "logits/chosen": -2.7451353073120117, + "logits/rejected": -2.784454822540283, + "logps/chosen": -357.574462890625, + "logps/rejected": -364.77490234375, + "loss": 0.2502, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24149170517921448, + "rewards/margins": 2.1592679023742676, + "rewards/rejected": -2.400759696960449, + "step": 2175 + }, + { + "epoch": 0.25, + "learning_rate": 2.2818681961840103e-07, + "logits/chosen": -2.291383981704712, + "logits/rejected": -2.195844888687134, + "logps/chosen": -443.36773681640625, + "logps/rejected": -289.94512939453125, + "loss": 0.4799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8244742751121521, + "rewards/margins": 1.1117501258850098, + "rewards/rejected": -1.9362244606018066, + "step": 2176 + }, + { + "epoch": 0.25, + "learning_rate": 2.2815170314877676e-07, + "logits/chosen": -2.2110390663146973, + "logits/rejected": -2.4895036220550537, + "logps/chosen": -385.68377685546875, + "logps/rejected": -243.64047241210938, + "loss": 0.3323, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3235129714012146, + "rewards/margins": 1.2205898761749268, + "rewards/rejected": -1.5441029071807861, + "step": 2177 + }, + { + "epoch": 0.25, + "learning_rate": 2.281165866791525e-07, + "logits/chosen": -1.9985201358795166, + "logits/rejected": -2.3249964714050293, + "logps/chosen": -211.37887573242188, + "logps/rejected": -207.96768188476562, + "loss": 0.4477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7879316806793213, + "rewards/margins": 1.3240244388580322, + "rewards/rejected": -2.1119561195373535, + "step": 2178 + }, + { + "epoch": 0.25, + "learning_rate": 2.2808147020952826e-07, + "logits/chosen": -2.589808702468872, + "logits/rejected": -2.6949543952941895, + "logps/chosen": -157.1138916015625, + "logps/rejected": -280.11328125, + "loss": 0.2063, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0014812089502811432, + "rewards/margins": 2.364112615585327, + "rewards/rejected": -2.365593910217285, + "step": 2179 + }, + { + "epoch": 0.25, + "learning_rate": 2.28046353739904e-07, + "logits/chosen": -1.8858221769332886, + "logits/rejected": -1.5918757915496826, + "logps/chosen": -359.46575927734375, + "logps/rejected": -530.8740234375, + "loss": 0.5718, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7349845767021179, + "rewards/margins": 1.0497065782546997, + "rewards/rejected": -1.7846910953521729, + "step": 2180 + }, + { + "epoch": 0.25, + "learning_rate": 2.2801123727027975e-07, + "logits/chosen": -2.406284809112549, + "logits/rejected": -2.1415646076202393, + "logps/chosen": -313.5418395996094, + "logps/rejected": -270.8910827636719, + "loss": 0.2523, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3925786018371582, + "rewards/margins": 2.154162645339966, + "rewards/rejected": -3.546741008758545, + "step": 2181 + }, + { + "epoch": 0.25, + "learning_rate": 2.2797612080065553e-07, + "logits/chosen": -2.6141457557678223, + "logits/rejected": -2.5640761852264404, + "logps/chosen": -225.07034301757812, + "logps/rejected": -272.32720947265625, + "loss": 0.6901, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5576341152191162, + "rewards/margins": 2.3573718070983887, + "rewards/rejected": -3.915005683898926, + "step": 2182 + }, + { + "epoch": 0.25, + "learning_rate": 2.2794100433103123e-07, + "logits/chosen": -2.8898186683654785, + "logits/rejected": -2.978609561920166, + "logps/chosen": -154.03778076171875, + "logps/rejected": -182.7021484375, + "loss": 0.5304, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5547807812690735, + "rewards/margins": 1.620025873184204, + "rewards/rejected": -2.174806594848633, + "step": 2183 + }, + { + "epoch": 0.25, + "learning_rate": 2.27905887861407e-07, + "logits/chosen": -2.5118207931518555, + "logits/rejected": -2.5360429286956787, + "logps/chosen": -187.09872436523438, + "logps/rejected": -273.7371826171875, + "loss": 0.2382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1867975890636444, + "rewards/margins": 2.23638653755188, + "rewards/rejected": -2.4231841564178467, + "step": 2184 + }, + { + "epoch": 0.25, + "learning_rate": 2.2787077139178273e-07, + "logits/chosen": -1.8717663288116455, + "logits/rejected": -2.0914955139160156, + "logps/chosen": -279.21356201171875, + "logps/rejected": -301.0054931640625, + "loss": 0.3123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.77950119972229, + "rewards/margins": 2.0064706802368164, + "rewards/rejected": -2.7859718799591064, + "step": 2185 + }, + { + "epoch": 0.25, + "learning_rate": 2.278356549221585e-07, + "logits/chosen": -2.4827027320861816, + "logits/rejected": -2.548184633255005, + "logps/chosen": -336.70989990234375, + "logps/rejected": -293.4603576660156, + "loss": 0.3209, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0426172018051147, + "rewards/margins": 1.4961719512939453, + "rewards/rejected": -2.5387890338897705, + "step": 2186 + }, + { + "epoch": 0.25, + "learning_rate": 2.2780053845253424e-07, + "logits/chosen": -1.9988884925842285, + "logits/rejected": -1.8921189308166504, + "logps/chosen": -257.94964599609375, + "logps/rejected": -263.231689453125, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5293799042701721, + "rewards/margins": 0.8988750576972961, + "rewards/rejected": -1.4282548427581787, + "step": 2187 + }, + { + "epoch": 0.25, + "learning_rate": 2.2776542198290997e-07, + "logits/chosen": -1.897525429725647, + "logits/rejected": -2.197244167327881, + "logps/chosen": -227.3651580810547, + "logps/rejected": -168.87136840820312, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8391298651695251, + "rewards/margins": 1.186692714691162, + "rewards/rejected": -2.025822639465332, + "step": 2188 + }, + { + "epoch": 0.25, + "learning_rate": 2.2773030551328572e-07, + "logits/chosen": -1.8842270374298096, + "logits/rejected": -1.9441007375717163, + "logps/chosen": -216.64678955078125, + "logps/rejected": -254.74395751953125, + "loss": 0.2817, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9840049743652344, + "rewards/margins": 2.951446056365967, + "rewards/rejected": -3.9354512691497803, + "step": 2189 + }, + { + "epoch": 0.25, + "learning_rate": 2.2769518904366145e-07, + "logits/chosen": -2.4292328357696533, + "logits/rejected": -2.6131961345672607, + "logps/chosen": -404.407470703125, + "logps/rejected": -308.9150390625, + "loss": 0.2222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4966660737991333, + "rewards/margins": 2.468308925628662, + "rewards/rejected": -2.964974880218506, + "step": 2190 + }, + { + "epoch": 0.25, + "learning_rate": 2.276600725740372e-07, + "logits/chosen": -2.9005961418151855, + "logits/rejected": -3.0003390312194824, + "logps/chosen": -201.327880859375, + "logps/rejected": -281.0473327636719, + "loss": 0.8218, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6131361126899719, + "rewards/margins": 1.011600136756897, + "rewards/rejected": -1.6247363090515137, + "step": 2191 + }, + { + "epoch": 0.25, + "learning_rate": 2.2762495610441296e-07, + "logits/chosen": -2.5906479358673096, + "logits/rejected": -2.3755249977111816, + "logps/chosen": -139.36709594726562, + "logps/rejected": -303.1982421875, + "loss": 0.252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2790626883506775, + "rewards/margins": 2.448540210723877, + "rewards/rejected": -2.72760272026062, + "step": 2192 + }, + { + "epoch": 0.25, + "learning_rate": 2.2758983963478869e-07, + "logits/chosen": -2.6897077560424805, + "logits/rejected": -2.658822774887085, + "logps/chosen": -102.4159164428711, + "logps/rejected": -181.8770294189453, + "loss": 0.3818, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12628832459449768, + "rewards/margins": 2.0792460441589355, + "rewards/rejected": -1.9529579877853394, + "step": 2193 + }, + { + "epoch": 0.25, + "learning_rate": 2.2755472316516447e-07, + "logits/chosen": -2.9801039695739746, + "logits/rejected": -2.936737298965454, + "logps/chosen": -305.2618408203125, + "logps/rejected": -249.82139587402344, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4836479425430298, + "rewards/margins": 2.811335802078247, + "rewards/rejected": -3.2949838638305664, + "step": 2194 + }, + { + "epoch": 0.25, + "learning_rate": 2.2751960669554022e-07, + "logits/chosen": -2.189192771911621, + "logits/rejected": -1.9933547973632812, + "logps/chosen": -293.85455322265625, + "logps/rejected": -258.478759765625, + "loss": 0.3311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23748581111431122, + "rewards/margins": 1.681031584739685, + "rewards/rejected": -1.9185173511505127, + "step": 2195 + }, + { + "epoch": 0.25, + "learning_rate": 2.2748449022591595e-07, + "logits/chosen": -2.0826656818389893, + "logits/rejected": -2.246556043624878, + "logps/chosen": -305.775390625, + "logps/rejected": -275.0023193359375, + "loss": 0.8081, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1401089429855347, + "rewards/margins": 0.697281539440155, + "rewards/rejected": -1.8373905420303345, + "step": 2196 + }, + { + "epoch": 0.25, + "learning_rate": 2.274493737562917e-07, + "logits/chosen": -2.704299211502075, + "logits/rejected": -2.703348398208618, + "logps/chosen": -149.81204223632812, + "logps/rejected": -204.10321044921875, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4634874165058136, + "rewards/margins": 1.7077198028564453, + "rewards/rejected": -2.1712071895599365, + "step": 2197 + }, + { + "epoch": 0.25, + "learning_rate": 2.2741425728666743e-07, + "logits/chosen": -2.502751588821411, + "logits/rejected": -2.2347514629364014, + "logps/chosen": -164.726318359375, + "logps/rejected": -293.240234375, + "loss": 0.2253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6673293113708496, + "rewards/margins": 2.575063705444336, + "rewards/rejected": -3.2423930168151855, + "step": 2198 + }, + { + "epoch": 0.25, + "learning_rate": 2.2737914081704318e-07, + "logits/chosen": -2.3819899559020996, + "logits/rejected": -2.0845775604248047, + "logps/chosen": -215.49362182617188, + "logps/rejected": -321.4194030761719, + "loss": 0.1767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9228205680847168, + "rewards/margins": 2.721497058868408, + "rewards/rejected": -3.644317626953125, + "step": 2199 + }, + { + "epoch": 0.25, + "learning_rate": 2.2734402434741894e-07, + "logits/chosen": -2.454486846923828, + "logits/rejected": -2.558410167694092, + "logps/chosen": -328.66766357421875, + "logps/rejected": -280.2633056640625, + "loss": 0.2365, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07374158501625061, + "rewards/margins": 1.6190273761749268, + "rewards/rejected": -1.5452858209609985, + "step": 2200 + }, + { + "epoch": 0.25, + "learning_rate": 2.2730890787779466e-07, + "logits/chosen": -1.8954136371612549, + "logits/rejected": -1.8803954124450684, + "logps/chosen": -293.26129150390625, + "logps/rejected": -354.80670166015625, + "loss": 0.363, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7367128133773804, + "rewards/margins": 1.9143033027648926, + "rewards/rejected": -2.6510159969329834, + "step": 2201 + }, + { + "epoch": 0.25, + "learning_rate": 2.2727379140817042e-07, + "logits/chosen": -2.250760555267334, + "logits/rejected": -2.389836311340332, + "logps/chosen": -262.85321044921875, + "logps/rejected": -230.4604949951172, + "loss": 0.3257, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.331120491027832, + "rewards/margins": 2.3500864505767822, + "rewards/rejected": -3.6812069416046143, + "step": 2202 + }, + { + "epoch": 0.25, + "learning_rate": 2.2723867493854617e-07, + "logits/chosen": -2.8847131729125977, + "logits/rejected": -2.8923873901367188, + "logps/chosen": -289.2641296386719, + "logps/rejected": -235.70571899414062, + "loss": 2.3339, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8018996715545654, + "rewards/margins": -0.15208536386489868, + "rewards/rejected": -2.6498146057128906, + "step": 2203 + }, + { + "epoch": 0.25, + "learning_rate": 2.272035584689219e-07, + "logits/chosen": -2.4815711975097656, + "logits/rejected": -2.5252275466918945, + "logps/chosen": -231.23011779785156, + "logps/rejected": -215.052490234375, + "loss": 0.3015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6518980264663696, + "rewards/margins": 2.373849868774414, + "rewards/rejected": -3.025747776031494, + "step": 2204 + }, + { + "epoch": 0.25, + "learning_rate": 2.2716844199929768e-07, + "logits/chosen": -2.0595781803131104, + "logits/rejected": -1.9901418685913086, + "logps/chosen": -176.14599609375, + "logps/rejected": -257.51361083984375, + "loss": 0.513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6091567873954773, + "rewards/margins": 1.566720962524414, + "rewards/rejected": -2.175877809524536, + "step": 2205 + }, + { + "epoch": 0.25, + "learning_rate": 2.2713332552967338e-07, + "logits/chosen": -2.172255277633667, + "logits/rejected": -2.432976007461548, + "logps/chosen": -310.589599609375, + "logps/rejected": -273.9168701171875, + "loss": 0.3826, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8380687236785889, + "rewards/margins": 3.4124319553375244, + "rewards/rejected": -4.250500679016113, + "step": 2206 + }, + { + "epoch": 0.25, + "learning_rate": 2.2709820906004916e-07, + "logits/chosen": -2.08125638961792, + "logits/rejected": -1.8509718179702759, + "logps/chosen": -246.8500518798828, + "logps/rejected": -268.3567199707031, + "loss": 0.5759, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3141766786575317, + "rewards/margins": 1.421684980392456, + "rewards/rejected": -2.7358615398406982, + "step": 2207 + }, + { + "epoch": 0.25, + "learning_rate": 2.2706309259042491e-07, + "logits/chosen": -2.0880682468414307, + "logits/rejected": -2.4774227142333984, + "logps/chosen": -458.8408203125, + "logps/rejected": -290.752197265625, + "loss": 0.3782, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6205048561096191, + "rewards/margins": 1.230960726737976, + "rewards/rejected": -1.8514655828475952, + "step": 2208 + }, + { + "epoch": 0.25, + "learning_rate": 2.2702797612080064e-07, + "logits/chosen": -2.1038243770599365, + "logits/rejected": -1.8908843994140625, + "logps/chosen": -326.9769592285156, + "logps/rejected": -283.62579345703125, + "loss": 0.5576, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8518015146255493, + "rewards/margins": 2.541496992111206, + "rewards/rejected": -3.393298625946045, + "step": 2209 + }, + { + "epoch": 0.25, + "learning_rate": 2.269928596511764e-07, + "logits/chosen": -2.2056922912597656, + "logits/rejected": -2.2321202754974365, + "logps/chosen": -349.6673278808594, + "logps/rejected": -319.37493896484375, + "loss": 0.6133, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1195483207702637, + "rewards/margins": 2.592050075531006, + "rewards/rejected": -3.7115983963012695, + "step": 2210 + }, + { + "epoch": 0.25, + "learning_rate": 2.2695774318155215e-07, + "logits/chosen": -2.2709410190582275, + "logits/rejected": -2.159325361251831, + "logps/chosen": -263.0042724609375, + "logps/rejected": -304.56805419921875, + "loss": 0.2728, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1563175916671753, + "rewards/margins": 1.7396442890167236, + "rewards/rejected": -2.8959619998931885, + "step": 2211 + }, + { + "epoch": 0.26, + "learning_rate": 2.2692262671192788e-07, + "logits/chosen": -2.4458324909210205, + "logits/rejected": -2.4285788536071777, + "logps/chosen": -167.41665649414062, + "logps/rejected": -193.1162872314453, + "loss": 0.2976, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6716150045394897, + "rewards/margins": 1.9933518171310425, + "rewards/rejected": -2.6649670600891113, + "step": 2212 + }, + { + "epoch": 0.26, + "learning_rate": 2.2688751024230363e-07, + "logits/chosen": -2.533505439758301, + "logits/rejected": -2.4310872554779053, + "logps/chosen": -202.62185668945312, + "logps/rejected": -236.2889862060547, + "loss": 0.5483, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1702721118927002, + "rewards/margins": 1.410127878189087, + "rewards/rejected": -2.580399990081787, + "step": 2213 + }, + { + "epoch": 0.26, + "learning_rate": 2.2685239377267936e-07, + "logits/chosen": -2.488128185272217, + "logits/rejected": -2.7384605407714844, + "logps/chosen": -360.41815185546875, + "logps/rejected": -372.1272888183594, + "loss": 0.2253, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4945213794708252, + "rewards/margins": 2.8333582878112793, + "rewards/rejected": -4.327879905700684, + "step": 2214 + }, + { + "epoch": 0.26, + "learning_rate": 2.268172773030551e-07, + "logits/chosen": -2.6305978298187256, + "logits/rejected": -2.5485143661499023, + "logps/chosen": -197.92266845703125, + "logps/rejected": -262.80108642578125, + "loss": 0.6885, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4539265632629395, + "rewards/margins": 1.3874056339263916, + "rewards/rejected": -2.841331958770752, + "step": 2215 + }, + { + "epoch": 0.26, + "learning_rate": 2.267821608334309e-07, + "logits/chosen": -2.2380292415618896, + "logits/rejected": -2.276827812194824, + "logps/chosen": -349.20025634765625, + "logps/rejected": -351.10894775390625, + "loss": 0.206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2762598991394043, + "rewards/margins": 3.1579532623291016, + "rewards/rejected": -4.434213638305664, + "step": 2216 + }, + { + "epoch": 0.26, + "learning_rate": 2.267470443638066e-07, + "logits/chosen": -2.0844902992248535, + "logits/rejected": -2.073503255844116, + "logps/chosen": -364.03302001953125, + "logps/rejected": -243.65438842773438, + "loss": 0.7998, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0993998050689697, + "rewards/margins": 0.10004572570323944, + "rewards/rejected": -1.1994454860687256, + "step": 2217 + }, + { + "epoch": 0.26, + "learning_rate": 2.2671192789418237e-07, + "logits/chosen": -2.7217941284179688, + "logits/rejected": -2.5525622367858887, + "logps/chosen": -385.0354919433594, + "logps/rejected": -354.8494567871094, + "loss": 0.4292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6086193323135376, + "rewards/margins": 2.6179616451263428, + "rewards/rejected": -3.226580858230591, + "step": 2218 + }, + { + "epoch": 0.26, + "learning_rate": 2.2667681142455813e-07, + "logits/chosen": -2.020508289337158, + "logits/rejected": -2.0246634483337402, + "logps/chosen": -316.2764587402344, + "logps/rejected": -382.3546447753906, + "loss": 0.509, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.497926115989685, + "rewards/margins": 3.0811710357666016, + "rewards/rejected": -4.579096794128418, + "step": 2219 + }, + { + "epoch": 0.26, + "learning_rate": 2.2664169495493385e-07, + "logits/chosen": -2.5504205226898193, + "logits/rejected": -2.6110527515411377, + "logps/chosen": -210.48289489746094, + "logps/rejected": -292.6040344238281, + "loss": 0.2638, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7978392243385315, + "rewards/margins": 4.33643102645874, + "rewards/rejected": -5.134270668029785, + "step": 2220 + }, + { + "epoch": 0.26, + "learning_rate": 2.266065784853096e-07, + "logits/chosen": -2.32761287689209, + "logits/rejected": -2.168503522872925, + "logps/chosen": -281.3006286621094, + "logps/rejected": -331.4777526855469, + "loss": 0.5244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8266622424125671, + "rewards/margins": 1.1203733682632446, + "rewards/rejected": -1.9470356702804565, + "step": 2221 + }, + { + "epoch": 0.26, + "learning_rate": 2.2657146201568534e-07, + "logits/chosen": -2.194918155670166, + "logits/rejected": -1.8714255094528198, + "logps/chosen": -454.07537841796875, + "logps/rejected": -561.680908203125, + "loss": 0.9979, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7404269576072693, + "rewards/margins": -0.17692312598228455, + "rewards/rejected": -0.5635038614273071, + "step": 2222 + }, + { + "epoch": 0.26, + "learning_rate": 2.265363455460611e-07, + "logits/chosen": -2.2223381996154785, + "logits/rejected": -2.3805418014526367, + "logps/chosen": -473.6243896484375, + "logps/rejected": -422.9305114746094, + "loss": 0.2683, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37520283460617065, + "rewards/margins": 2.3115711212158203, + "rewards/rejected": -2.6867737770080566, + "step": 2223 + }, + { + "epoch": 0.26, + "learning_rate": 2.2650122907643684e-07, + "logits/chosen": -2.134105920791626, + "logits/rejected": -2.0122783184051514, + "logps/chosen": -499.69537353515625, + "logps/rejected": -425.0669860839844, + "loss": 0.7979, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0060995817184448, + "rewards/margins": 0.6832513213157654, + "rewards/rejected": -1.6893508434295654, + "step": 2224 + }, + { + "epoch": 0.26, + "learning_rate": 2.2646611260681257e-07, + "logits/chosen": -2.663489818572998, + "logits/rejected": -2.524508237838745, + "logps/chosen": -285.52001953125, + "logps/rejected": -346.2835693359375, + "loss": 0.4369, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1255663633346558, + "rewards/margins": 1.605156660079956, + "rewards/rejected": -2.7307231426239014, + "step": 2225 + }, + { + "epoch": 0.26, + "learning_rate": 2.2643099613718832e-07, + "logits/chosen": -2.2637743949890137, + "logits/rejected": -2.5158333778381348, + "logps/chosen": -117.32723999023438, + "logps/rejected": -76.70851135253906, + "loss": 0.7612, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.009706974029541, + "rewards/margins": 0.054044753313064575, + "rewards/rejected": -1.0637516975402832, + "step": 2226 + }, + { + "epoch": 0.26, + "learning_rate": 2.263958796675641e-07, + "logits/chosen": -2.5190846920013428, + "logits/rejected": -2.4611334800720215, + "logps/chosen": -266.4818420410156, + "logps/rejected": -364.839599609375, + "loss": 1.2085, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5865319967269897, + "rewards/margins": -0.015738099813461304, + "rewards/rejected": -1.570793867111206, + "step": 2227 + }, + { + "epoch": 0.26, + "learning_rate": 2.2636076319793983e-07, + "logits/chosen": -2.344900608062744, + "logits/rejected": -2.4051449298858643, + "logps/chosen": -202.3939208984375, + "logps/rejected": -219.4901885986328, + "loss": 0.1539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010417759418487549, + "rewards/margins": 2.4821927547454834, + "rewards/rejected": -2.492610454559326, + "step": 2228 + }, + { + "epoch": 0.26, + "learning_rate": 2.2632564672831559e-07, + "logits/chosen": -2.0183660984039307, + "logits/rejected": -1.907470941543579, + "logps/chosen": -265.6978759765625, + "logps/rejected": -133.4683074951172, + "loss": 0.6316, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4014554023742676, + "rewards/margins": 0.3596300482749939, + "rewards/rejected": -2.761085271835327, + "step": 2229 + }, + { + "epoch": 0.26, + "learning_rate": 2.2629053025869131e-07, + "logits/chosen": -1.9977116584777832, + "logits/rejected": -2.0782885551452637, + "logps/chosen": -462.04510498046875, + "logps/rejected": -395.01983642578125, + "loss": 0.4209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9215266108512878, + "rewards/margins": 2.44720196723938, + "rewards/rejected": -3.3687286376953125, + "step": 2230 + }, + { + "epoch": 0.26, + "learning_rate": 2.2625541378906707e-07, + "logits/chosen": -2.1023099422454834, + "logits/rejected": -2.5275750160217285, + "logps/chosen": -403.79248046875, + "logps/rejected": -215.13406372070312, + "loss": 0.3854, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9978212714195251, + "rewards/margins": 1.739734411239624, + "rewards/rejected": -2.737555742263794, + "step": 2231 + }, + { + "epoch": 0.26, + "learning_rate": 2.2622029731944282e-07, + "logits/chosen": -2.46182918548584, + "logits/rejected": -2.348270893096924, + "logps/chosen": -279.75115966796875, + "logps/rejected": -292.0926513671875, + "loss": 0.6592, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8649715185165405, + "rewards/margins": 1.320577621459961, + "rewards/rejected": -3.185549259185791, + "step": 2232 + }, + { + "epoch": 0.26, + "learning_rate": 2.2618518084981855e-07, + "logits/chosen": -1.805450677871704, + "logits/rejected": -1.8691151142120361, + "logps/chosen": -379.521240234375, + "logps/rejected": -325.72314453125, + "loss": 0.3339, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7991194128990173, + "rewards/margins": 2.12682843208313, + "rewards/rejected": -2.925947666168213, + "step": 2233 + }, + { + "epoch": 0.26, + "learning_rate": 2.261500643801943e-07, + "logits/chosen": -2.683330535888672, + "logits/rejected": -2.774916172027588, + "logps/chosen": -259.7132873535156, + "logps/rejected": -264.0387268066406, + "loss": 0.2475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.531401515007019, + "rewards/margins": 2.620683431625366, + "rewards/rejected": -3.152085304260254, + "step": 2234 + }, + { + "epoch": 0.26, + "learning_rate": 2.2611494791057003e-07, + "logits/chosen": -2.302001714706421, + "logits/rejected": -2.267861843109131, + "logps/chosen": -263.1690673828125, + "logps/rejected": -277.19635009765625, + "loss": 0.3661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4307973384857178, + "rewards/margins": 3.0199432373046875, + "rewards/rejected": -3.4507405757904053, + "step": 2235 + }, + { + "epoch": 0.26, + "learning_rate": 2.2607983144094578e-07, + "logits/chosen": -2.195368528366089, + "logits/rejected": -2.2817115783691406, + "logps/chosen": -244.78176879882812, + "logps/rejected": -198.5033416748047, + "loss": 0.6557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.823684811592102, + "rewards/margins": 0.9806311130523682, + "rewards/rejected": -1.8043158054351807, + "step": 2236 + }, + { + "epoch": 0.26, + "learning_rate": 2.2604471497132154e-07, + "logits/chosen": -2.8578264713287354, + "logits/rejected": -2.77803897857666, + "logps/chosen": -336.9159240722656, + "logps/rejected": -334.9861755371094, + "loss": 0.3209, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1099797487258911, + "rewards/margins": 2.1383790969848633, + "rewards/rejected": -3.248358964920044, + "step": 2237 + }, + { + "epoch": 0.26, + "learning_rate": 2.2600959850169727e-07, + "logits/chosen": -2.566683530807495, + "logits/rejected": -2.5466136932373047, + "logps/chosen": -295.76666259765625, + "logps/rejected": -276.3410949707031, + "loss": 0.1433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.543363094329834, + "rewards/margins": 3.4112606048583984, + "rewards/rejected": -3.9546236991882324, + "step": 2238 + }, + { + "epoch": 0.26, + "learning_rate": 2.2597448203207305e-07, + "logits/chosen": -2.6764535903930664, + "logits/rejected": -2.5750579833984375, + "logps/chosen": -231.25515747070312, + "logps/rejected": -243.18533325195312, + "loss": 0.4233, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36216771602630615, + "rewards/margins": 2.362420082092285, + "rewards/rejected": -2.7245876789093018, + "step": 2239 + }, + { + "epoch": 0.26, + "learning_rate": 2.259393655624488e-07, + "logits/chosen": -2.483665704727173, + "logits/rejected": -2.476867198944092, + "logps/chosen": -215.96255493164062, + "logps/rejected": -255.22067260742188, + "loss": 0.3269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6021734476089478, + "rewards/margins": 1.7149639129638672, + "rewards/rejected": -2.3171372413635254, + "step": 2240 + }, + { + "epoch": 0.26, + "learning_rate": 2.2590424909282453e-07, + "logits/chosen": -2.54134202003479, + "logits/rejected": -2.260183334350586, + "logps/chosen": -284.07501220703125, + "logps/rejected": -336.33782958984375, + "loss": 0.2497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.991229772567749, + "rewards/margins": 1.622743010520935, + "rewards/rejected": -2.6139729022979736, + "step": 2241 + }, + { + "epoch": 0.26, + "learning_rate": 2.2586913262320028e-07, + "logits/chosen": -2.148655891418457, + "logits/rejected": -2.5290918350219727, + "logps/chosen": -421.0660400390625, + "logps/rejected": -219.8131561279297, + "loss": 1.5428, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.731522798538208, + "rewards/margins": -0.23115086555480957, + "rewards/rejected": -1.5003719329833984, + "step": 2242 + }, + { + "epoch": 0.26, + "learning_rate": 2.25834016153576e-07, + "logits/chosen": -1.2877280712127686, + "logits/rejected": -1.5238641500473022, + "logps/chosen": -227.34283447265625, + "logps/rejected": -160.6664581298828, + "loss": 1.6581, + "rewards/accuracies": 0.375, + "rewards/chosen": -3.4182891845703125, + "rewards/margins": -0.7835178375244141, + "rewards/rejected": -2.6347713470458984, + "step": 2243 + }, + { + "epoch": 0.26, + "learning_rate": 2.2579889968395176e-07, + "logits/chosen": -2.8280069828033447, + "logits/rejected": -2.8185315132141113, + "logps/chosen": -235.7269744873047, + "logps/rejected": -230.8612060546875, + "loss": 0.3959, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0277900695800781, + "rewards/margins": 2.1550745964050293, + "rewards/rejected": -3.1828649044036865, + "step": 2244 + }, + { + "epoch": 0.26, + "learning_rate": 2.2576378321432752e-07, + "logits/chosen": -2.748713731765747, + "logits/rejected": -2.5205492973327637, + "logps/chosen": -200.1702880859375, + "logps/rejected": -319.0989685058594, + "loss": 0.2362, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3236339092254639, + "rewards/margins": 2.780439853668213, + "rewards/rejected": -4.104073524475098, + "step": 2245 + }, + { + "epoch": 0.26, + "learning_rate": 2.2572866674470324e-07, + "logits/chosen": -2.2483901977539062, + "logits/rejected": -2.3432037830352783, + "logps/chosen": -166.9786376953125, + "logps/rejected": -188.03880310058594, + "loss": 0.4454, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9543991684913635, + "rewards/margins": 2.8224542140960693, + "rewards/rejected": -3.776853322982788, + "step": 2246 + }, + { + "epoch": 0.26, + "learning_rate": 2.25693550275079e-07, + "logits/chosen": -1.73361337184906, + "logits/rejected": -1.8552707433700562, + "logps/chosen": -413.1579895019531, + "logps/rejected": -359.30303955078125, + "loss": 0.437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9791973829269409, + "rewards/margins": 2.3464856147766113, + "rewards/rejected": -3.325683116912842, + "step": 2247 + }, + { + "epoch": 0.26, + "learning_rate": 2.2565843380545475e-07, + "logits/chosen": -2.556810140609741, + "logits/rejected": -2.5402307510375977, + "logps/chosen": -223.0094451904297, + "logps/rejected": -211.70388793945312, + "loss": 0.7455, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.076109766960144, + "rewards/margins": 1.0280345678329468, + "rewards/rejected": -2.104144334793091, + "step": 2248 + }, + { + "epoch": 0.26, + "learning_rate": 2.2562331733583048e-07, + "logits/chosen": -2.4058258533477783, + "logits/rejected": -2.0460567474365234, + "logps/chosen": -179.923095703125, + "logps/rejected": -376.508544921875, + "loss": 0.527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5160271525382996, + "rewards/margins": 1.422189474105835, + "rewards/rejected": -1.9382166862487793, + "step": 2249 + }, + { + "epoch": 0.26, + "learning_rate": 2.2558820086620626e-07, + "logits/chosen": -2.432602882385254, + "logits/rejected": -2.445802927017212, + "logps/chosen": -265.0552978515625, + "logps/rejected": -292.20867919921875, + "loss": 0.3917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45697858929634094, + "rewards/margins": 1.7890219688415527, + "rewards/rejected": -2.2460007667541504, + "step": 2250 + }, + { + "epoch": 0.26, + "learning_rate": 2.2555308439658196e-07, + "logits/chosen": -2.0320591926574707, + "logits/rejected": -2.0751476287841797, + "logps/chosen": -211.3302459716797, + "logps/rejected": -217.095703125, + "loss": 0.5633, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1769897937774658, + "rewards/margins": 0.5024628639221191, + "rewards/rejected": -1.679452657699585, + "step": 2251 + }, + { + "epoch": 0.26, + "learning_rate": 2.2551796792695774e-07, + "logits/chosen": -2.053224802017212, + "logits/rejected": -2.0477089881896973, + "logps/chosen": -292.2361145019531, + "logps/rejected": -335.9307556152344, + "loss": 0.7648, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9387906193733215, + "rewards/margins": 0.18496252596378326, + "rewards/rejected": -1.1237531900405884, + "step": 2252 + }, + { + "epoch": 0.26, + "learning_rate": 2.254828514573335e-07, + "logits/chosen": -2.4991958141326904, + "logits/rejected": -2.515364408493042, + "logps/chosen": -229.63511657714844, + "logps/rejected": -267.18609619140625, + "loss": 0.7465, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5916694402694702, + "rewards/margins": 1.1933248043060303, + "rewards/rejected": -2.78499436378479, + "step": 2253 + }, + { + "epoch": 0.26, + "learning_rate": 2.2544773498770922e-07, + "logits/chosen": -2.059314727783203, + "logits/rejected": -2.28560733795166, + "logps/chosen": -469.7930908203125, + "logps/rejected": -414.8186950683594, + "loss": 0.4795, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.810397744178772, + "rewards/margins": 2.6020185947418213, + "rewards/rejected": -4.412416458129883, + "step": 2254 + }, + { + "epoch": 0.26, + "learning_rate": 2.2541261851808497e-07, + "logits/chosen": -2.3722612857818604, + "logits/rejected": -2.2605087757110596, + "logps/chosen": -252.37001037597656, + "logps/rejected": -267.3238525390625, + "loss": 0.3932, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7877167463302612, + "rewards/margins": 1.638225793838501, + "rewards/rejected": -2.4259424209594727, + "step": 2255 + }, + { + "epoch": 0.26, + "learning_rate": 2.2537750204846073e-07, + "logits/chosen": -2.1190149784088135, + "logits/rejected": -2.2510359287261963, + "logps/chosen": -257.1693115234375, + "logps/rejected": -281.3985595703125, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06573310494422913, + "rewards/margins": 2.4446520805358887, + "rewards/rejected": -2.510385274887085, + "step": 2256 + }, + { + "epoch": 0.26, + "learning_rate": 2.2534238557883646e-07, + "logits/chosen": -2.8223180770874023, + "logits/rejected": -2.611210584640503, + "logps/chosen": -337.3150634765625, + "logps/rejected": -150.1987762451172, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.83826744556427, + "rewards/margins": 2.0269689559936523, + "rewards/rejected": -2.865236282348633, + "step": 2257 + }, + { + "epoch": 0.26, + "learning_rate": 2.253072691092122e-07, + "logits/chosen": -1.8732280731201172, + "logits/rejected": -1.9835916757583618, + "logps/chosen": -409.5480651855469, + "logps/rejected": -434.6846923828125, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6137060523033142, + "rewards/margins": 1.3201711177825928, + "rewards/rejected": -1.9338772296905518, + "step": 2258 + }, + { + "epoch": 0.26, + "learning_rate": 2.2527215263958794e-07, + "logits/chosen": -2.5586318969726562, + "logits/rejected": -2.563234567642212, + "logps/chosen": -199.15480041503906, + "logps/rejected": -207.5373077392578, + "loss": 0.3097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5117397308349609, + "rewards/margins": 3.1597325801849365, + "rewards/rejected": -3.6714720726013184, + "step": 2259 + }, + { + "epoch": 0.26, + "learning_rate": 2.252370361699637e-07, + "logits/chosen": -2.1437699794769287, + "logits/rejected": -2.510099411010742, + "logps/chosen": -382.01947021484375, + "logps/rejected": -393.64385986328125, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3816282749176025, + "rewards/margins": 3.8137497901916504, + "rewards/rejected": -5.195377826690674, + "step": 2260 + }, + { + "epoch": 0.26, + "learning_rate": 2.2520191970033947e-07, + "logits/chosen": -2.1251718997955322, + "logits/rejected": -1.885507345199585, + "logps/chosen": -241.91583251953125, + "logps/rejected": -252.27691650390625, + "loss": 0.3967, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0383270978927612, + "rewards/margins": 2.4071242809295654, + "rewards/rejected": -3.445451498031616, + "step": 2261 + }, + { + "epoch": 0.26, + "learning_rate": 2.2516680323071517e-07, + "logits/chosen": -1.813192367553711, + "logits/rejected": -2.0497214794158936, + "logps/chosen": -409.6390686035156, + "logps/rejected": -280.0774841308594, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2122009992599487, + "rewards/margins": 0.36297136545181274, + "rewards/rejected": -1.5751723051071167, + "step": 2262 + }, + { + "epoch": 0.26, + "learning_rate": 2.2513168676109095e-07, + "logits/chosen": -2.738481044769287, + "logits/rejected": -2.6586685180664062, + "logps/chosen": -226.96994018554688, + "logps/rejected": -368.1844177246094, + "loss": 0.6783, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5595406293869019, + "rewards/margins": 1.983555793762207, + "rewards/rejected": -3.5430963039398193, + "step": 2263 + }, + { + "epoch": 0.26, + "learning_rate": 2.250965702914667e-07, + "logits/chosen": -2.3816540241241455, + "logits/rejected": -2.3341639041900635, + "logps/chosen": -428.5921630859375, + "logps/rejected": -472.55255126953125, + "loss": 0.2919, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4752645492553711, + "rewards/margins": 1.885058045387268, + "rewards/rejected": -2.3603224754333496, + "step": 2264 + }, + { + "epoch": 0.26, + "learning_rate": 2.2506145382184243e-07, + "logits/chosen": -2.2582757472991943, + "logits/rejected": -2.348330020904541, + "logps/chosen": -351.8621826171875, + "logps/rejected": -266.9876708984375, + "loss": 0.1391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2239549607038498, + "rewards/margins": 3.2934086322784424, + "rewards/rejected": -3.5173635482788086, + "step": 2265 + }, + { + "epoch": 0.26, + "learning_rate": 2.250263373522182e-07, + "logits/chosen": -1.9933277368545532, + "logits/rejected": -2.041881799697876, + "logps/chosen": -334.5544128417969, + "logps/rejected": -453.60455322265625, + "loss": 0.4838, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3076850473880768, + "rewards/margins": 1.7824726104736328, + "rewards/rejected": -2.0901575088500977, + "step": 2266 + }, + { + "epoch": 0.26, + "learning_rate": 2.2499122088259392e-07, + "logits/chosen": -2.1002180576324463, + "logits/rejected": -2.2076234817504883, + "logps/chosen": -275.7345886230469, + "logps/rejected": -237.62136840820312, + "loss": 0.4048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6758148074150085, + "rewards/margins": 1.9600071907043457, + "rewards/rejected": -2.63582181930542, + "step": 2267 + }, + { + "epoch": 0.26, + "learning_rate": 2.2495610441296967e-07, + "logits/chosen": -2.366680383682251, + "logits/rejected": -2.2353787422180176, + "logps/chosen": -424.6832275390625, + "logps/rejected": -388.5319519042969, + "loss": 0.4567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9283133745193481, + "rewards/margins": 1.1150176525115967, + "rewards/rejected": -2.0433311462402344, + "step": 2268 + }, + { + "epoch": 0.26, + "learning_rate": 2.2492098794334542e-07, + "logits/chosen": -2.7634990215301514, + "logits/rejected": -2.7076973915100098, + "logps/chosen": -275.1183166503906, + "logps/rejected": -399.7446594238281, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8585118651390076, + "rewards/margins": 2.3676629066467285, + "rewards/rejected": -3.226174831390381, + "step": 2269 + }, + { + "epoch": 0.26, + "learning_rate": 2.2488587147372115e-07, + "logits/chosen": -2.353712320327759, + "logits/rejected": -2.0701448917388916, + "logps/chosen": -215.70123291015625, + "logps/rejected": -371.11749267578125, + "loss": 0.5061, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1103910207748413, + "rewards/margins": 1.5172895193099976, + "rewards/rejected": -2.627680540084839, + "step": 2270 + }, + { + "epoch": 0.26, + "learning_rate": 2.248507550040969e-07, + "logits/chosen": -2.9926109313964844, + "logits/rejected": -3.0612947940826416, + "logps/chosen": -447.4653625488281, + "logps/rejected": -268.1744384765625, + "loss": 0.3318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4716314971446991, + "rewards/margins": 1.6591455936431885, + "rewards/rejected": -2.13077712059021, + "step": 2271 + }, + { + "epoch": 0.26, + "learning_rate": 2.2481563853447268e-07, + "logits/chosen": -2.3245882987976074, + "logits/rejected": -2.1982107162475586, + "logps/chosen": -173.55502319335938, + "logps/rejected": -185.87403869628906, + "loss": 0.4457, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2846007347106934, + "rewards/margins": 0.7955055236816406, + "rewards/rejected": -2.080106258392334, + "step": 2272 + }, + { + "epoch": 0.26, + "learning_rate": 2.247805220648484e-07, + "logits/chosen": -2.4028496742248535, + "logits/rejected": -2.463146924972534, + "logps/chosen": -140.8910675048828, + "logps/rejected": -232.3765411376953, + "loss": 0.3399, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4039210081100464, + "rewards/margins": 1.9021832942962646, + "rewards/rejected": -3.3061041831970215, + "step": 2273 + }, + { + "epoch": 0.26, + "learning_rate": 2.2474540559522417e-07, + "logits/chosen": -2.6708669662475586, + "logits/rejected": -2.540605306625366, + "logps/chosen": -330.9004821777344, + "logps/rejected": -230.07608032226562, + "loss": 0.2349, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5394020080566406, + "rewards/margins": 2.2763657569885254, + "rewards/rejected": -2.815768003463745, + "step": 2274 + }, + { + "epoch": 0.26, + "learning_rate": 2.247102891255999e-07, + "logits/chosen": -1.968406081199646, + "logits/rejected": -2.2426979541778564, + "logps/chosen": -705.433837890625, + "logps/rejected": -269.03582763671875, + "loss": 0.9792, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.967134714126587, + "rewards/margins": 0.056145548820495605, + "rewards/rejected": -2.023280143737793, + "step": 2275 + }, + { + "epoch": 0.26, + "learning_rate": 2.2467517265597565e-07, + "logits/chosen": -2.4695234298706055, + "logits/rejected": -2.2441482543945312, + "logps/chosen": -218.1055908203125, + "logps/rejected": -308.49383544921875, + "loss": 0.8069, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0038831233978271, + "rewards/margins": 0.6253352165222168, + "rewards/rejected": -1.6292182207107544, + "step": 2276 + }, + { + "epoch": 0.26, + "learning_rate": 2.246400561863514e-07, + "logits/chosen": -2.4166879653930664, + "logits/rejected": -2.140460968017578, + "logps/chosen": -317.08148193359375, + "logps/rejected": -310.53533935546875, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14571547508239746, + "rewards/margins": 3.421292543411255, + "rewards/rejected": -3.5670080184936523, + "step": 2277 + }, + { + "epoch": 0.26, + "learning_rate": 2.2460493971672713e-07, + "logits/chosen": -2.3448681831359863, + "logits/rejected": -2.406852960586548, + "logps/chosen": -305.42193603515625, + "logps/rejected": -228.23980712890625, + "loss": 0.6091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5581749677658081, + "rewards/margins": 1.4744784832000732, + "rewards/rejected": -2.032653570175171, + "step": 2278 + }, + { + "epoch": 0.26, + "learning_rate": 2.2456982324710288e-07, + "logits/chosen": -1.8381083011627197, + "logits/rejected": -2.007352352142334, + "logps/chosen": -404.5364685058594, + "logps/rejected": -288.34765625, + "loss": 0.5053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46484002470970154, + "rewards/margins": 1.1840687990188599, + "rewards/rejected": -1.6489088535308838, + "step": 2279 + }, + { + "epoch": 0.26, + "learning_rate": 2.245347067774786e-07, + "logits/chosen": -2.6665923595428467, + "logits/rejected": -2.3432705402374268, + "logps/chosen": -215.8267059326172, + "logps/rejected": -228.8168182373047, + "loss": 0.2297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7158694267272949, + "rewards/margins": 3.1237807273864746, + "rewards/rejected": -3.8396501541137695, + "step": 2280 + }, + { + "epoch": 0.26, + "learning_rate": 2.2449959030785436e-07, + "logits/chosen": -2.86283802986145, + "logits/rejected": -2.945611000061035, + "logps/chosen": -268.91790771484375, + "logps/rejected": -307.39117431640625, + "loss": 0.3244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2657206952571869, + "rewards/margins": 1.4507615566253662, + "rewards/rejected": -1.716482162475586, + "step": 2281 + }, + { + "epoch": 0.26, + "learning_rate": 2.2446447383823012e-07, + "logits/chosen": -2.532721996307373, + "logits/rejected": -2.552459478378296, + "logps/chosen": -306.69140625, + "logps/rejected": -354.48724365234375, + "loss": 0.5436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8859436511993408, + "rewards/margins": 1.8608611822128296, + "rewards/rejected": -2.74680495262146, + "step": 2282 + }, + { + "epoch": 0.26, + "learning_rate": 2.2442935736860584e-07, + "logits/chosen": -2.463515520095825, + "logits/rejected": -2.431800603866577, + "logps/chosen": -196.6565704345703, + "logps/rejected": -356.2034912109375, + "loss": 0.1158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16443848609924316, + "rewards/margins": 3.133028030395508, + "rewards/rejected": -3.297466278076172, + "step": 2283 + }, + { + "epoch": 0.26, + "learning_rate": 2.2439424089898162e-07, + "logits/chosen": -2.0878043174743652, + "logits/rejected": -2.266265869140625, + "logps/chosen": -287.73638916015625, + "logps/rejected": -280.81585693359375, + "loss": 0.5597, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2283624410629272, + "rewards/margins": 0.7225887775421143, + "rewards/rejected": -1.950951337814331, + "step": 2284 + }, + { + "epoch": 0.26, + "learning_rate": 2.2435912442935738e-07, + "logits/chosen": -2.427318811416626, + "logits/rejected": -2.3449769020080566, + "logps/chosen": -306.5158996582031, + "logps/rejected": -291.2677307128906, + "loss": 0.2026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9571738839149475, + "rewards/margins": 2.644435167312622, + "rewards/rejected": -3.601609230041504, + "step": 2285 + }, + { + "epoch": 0.26, + "learning_rate": 2.243240079597331e-07, + "logits/chosen": -1.920284628868103, + "logits/rejected": -2.1212446689605713, + "logps/chosen": -489.4121398925781, + "logps/rejected": -343.0995788574219, + "loss": 0.604, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2017711400985718, + "rewards/margins": 1.3093774318695068, + "rewards/rejected": -2.511148691177368, + "step": 2286 + }, + { + "epoch": 0.26, + "learning_rate": 2.2428889149010886e-07, + "logits/chosen": -2.6886789798736572, + "logits/rejected": -2.4126405715942383, + "logps/chosen": -184.68453979492188, + "logps/rejected": -247.46534729003906, + "loss": 0.5119, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3938630223274231, + "rewards/margins": 1.396337628364563, + "rewards/rejected": -1.7902005910873413, + "step": 2287 + }, + { + "epoch": 0.26, + "learning_rate": 2.242537750204846e-07, + "logits/chosen": -2.359935760498047, + "logits/rejected": -2.7681009769439697, + "logps/chosen": -292.91912841796875, + "logps/rejected": -232.16526794433594, + "loss": 0.4391, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49203768372535706, + "rewards/margins": 1.3013534545898438, + "rewards/rejected": -1.793391227722168, + "step": 2288 + }, + { + "epoch": 0.26, + "learning_rate": 2.2421865855086034e-07, + "logits/chosen": -2.223177433013916, + "logits/rejected": -2.157022714614868, + "logps/chosen": -369.63739013671875, + "logps/rejected": -416.05474853515625, + "loss": 0.3456, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5456516742706299, + "rewards/margins": 1.6957921981811523, + "rewards/rejected": -2.2414438724517822, + "step": 2289 + }, + { + "epoch": 0.26, + "learning_rate": 2.241835420812361e-07, + "logits/chosen": -2.460305690765381, + "logits/rejected": -2.3285558223724365, + "logps/chosen": -275.8722839355469, + "logps/rejected": -311.869140625, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1793022155761719, + "rewards/margins": 1.3604178428649902, + "rewards/rejected": -2.539720058441162, + "step": 2290 + }, + { + "epoch": 0.26, + "learning_rate": 2.2414842561161182e-07, + "logits/chosen": -2.18436861038208, + "logits/rejected": -1.9877527952194214, + "logps/chosen": -317.1431884765625, + "logps/rejected": -475.9342041015625, + "loss": 0.2656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.781062662601471, + "rewards/margins": 2.6664934158325195, + "rewards/rejected": -3.4475557804107666, + "step": 2291 + }, + { + "epoch": 0.26, + "learning_rate": 2.2411330914198758e-07, + "logits/chosen": -2.660113573074341, + "logits/rejected": -2.6705880165100098, + "logps/chosen": -213.30751037597656, + "logps/rejected": -225.20748901367188, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.824638843536377, + "rewards/margins": 2.3586018085479736, + "rewards/rejected": -3.1832406520843506, + "step": 2292 + }, + { + "epoch": 0.26, + "learning_rate": 2.2407819267236333e-07, + "logits/chosen": -2.453216075897217, + "logits/rejected": -2.424966812133789, + "logps/chosen": -157.35028076171875, + "logps/rejected": -195.12095642089844, + "loss": 0.2649, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6951532363891602, + "rewards/margins": 2.281580686569214, + "rewards/rejected": -2.976733684539795, + "step": 2293 + }, + { + "epoch": 0.26, + "learning_rate": 2.2404307620273906e-07, + "logits/chosen": -2.178356170654297, + "logits/rejected": -2.1869704723358154, + "logps/chosen": -154.49891662597656, + "logps/rejected": -207.7222137451172, + "loss": 0.4779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4176507890224457, + "rewards/margins": 2.483114719390869, + "rewards/rejected": -2.9007654190063477, + "step": 2294 + }, + { + "epoch": 0.26, + "learning_rate": 2.2400795973311484e-07, + "logits/chosen": -3.002077341079712, + "logits/rejected": -2.999166965484619, + "logps/chosen": -318.514892578125, + "logps/rejected": -241.68246459960938, + "loss": 0.3192, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2972984313964844, + "rewards/margins": 1.91385018825531, + "rewards/rejected": -2.211148738861084, + "step": 2295 + }, + { + "epoch": 0.26, + "learning_rate": 2.2397284326349054e-07, + "logits/chosen": -2.4522838592529297, + "logits/rejected": -2.4074230194091797, + "logps/chosen": -521.423828125, + "logps/rejected": -442.538818359375, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7846482992172241, + "rewards/margins": 2.6255691051483154, + "rewards/rejected": -3.41021728515625, + "step": 2296 + }, + { + "epoch": 0.26, + "learning_rate": 2.2393772679386632e-07, + "logits/chosen": -2.623943567276001, + "logits/rejected": -2.6641714572906494, + "logps/chosen": -265.8018798828125, + "logps/rejected": -280.6538391113281, + "loss": 0.3896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.639595091342926, + "rewards/margins": 1.441237211227417, + "rewards/rejected": -2.0808324813842773, + "step": 2297 + }, + { + "epoch": 0.26, + "learning_rate": 2.2390261032424207e-07, + "logits/chosen": -1.8983862400054932, + "logits/rejected": -1.9559857845306396, + "logps/chosen": -443.1282958984375, + "logps/rejected": -426.87945556640625, + "loss": 0.2478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8515753746032715, + "rewards/margins": 1.6433076858520508, + "rewards/rejected": -2.4948830604553223, + "step": 2298 + }, + { + "epoch": 0.27, + "learning_rate": 2.238674938546178e-07, + "logits/chosen": -2.591737747192383, + "logits/rejected": -2.275867223739624, + "logps/chosen": -260.3439636230469, + "logps/rejected": -291.15106201171875, + "loss": 0.3248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5943571329116821, + "rewards/margins": 2.475369453430176, + "rewards/rejected": -3.0697264671325684, + "step": 2299 + }, + { + "epoch": 0.27, + "learning_rate": 2.2383237738499355e-07, + "logits/chosen": -2.796947479248047, + "logits/rejected": -2.7315571308135986, + "logps/chosen": -356.1314392089844, + "logps/rejected": -477.3152770996094, + "loss": 1.1513, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5625977516174316, + "rewards/margins": 0.5846811532974243, + "rewards/rejected": -2.1472790241241455, + "step": 2300 + }, + { + "epoch": 0.27, + "learning_rate": 2.237972609153693e-07, + "logits/chosen": -2.5650041103363037, + "logits/rejected": -2.73481822013855, + "logps/chosen": -141.1398162841797, + "logps/rejected": -162.7211151123047, + "loss": 0.9503, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2347817420959473, + "rewards/margins": 0.9814023375511169, + "rewards/rejected": -2.216184139251709, + "step": 2301 + }, + { + "epoch": 0.27, + "learning_rate": 2.2376214444574504e-07, + "logits/chosen": -2.200770854949951, + "logits/rejected": -2.078885555267334, + "logps/chosen": -250.42803955078125, + "logps/rejected": -346.9822692871094, + "loss": 0.6259, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9829134345054626, + "rewards/margins": 1.7736806869506836, + "rewards/rejected": -2.756594181060791, + "step": 2302 + }, + { + "epoch": 0.27, + "learning_rate": 2.237270279761208e-07, + "logits/chosen": -2.18190336227417, + "logits/rejected": -1.9216426610946655, + "logps/chosen": -241.0242462158203, + "logps/rejected": -456.33734130859375, + "loss": 0.4551, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0567049980163574, + "rewards/margins": 2.30957293510437, + "rewards/rejected": -3.3662776947021484, + "step": 2303 + }, + { + "epoch": 0.27, + "learning_rate": 2.2369191150649652e-07, + "logits/chosen": -2.079880714416504, + "logits/rejected": -2.4438958168029785, + "logps/chosen": -402.67791748046875, + "logps/rejected": -281.0281066894531, + "loss": 0.4708, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.178862452507019, + "rewards/margins": 1.4791995286941528, + "rewards/rejected": -2.658061981201172, + "step": 2304 + }, + { + "epoch": 0.27, + "learning_rate": 2.2365679503687227e-07, + "logits/chosen": -2.0920510292053223, + "logits/rejected": -2.373713970184326, + "logps/chosen": -271.32861328125, + "logps/rejected": -159.08926391601562, + "loss": 0.4567, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6714006066322327, + "rewards/margins": 1.2365280389785767, + "rewards/rejected": -1.9079285860061646, + "step": 2305 + }, + { + "epoch": 0.27, + "learning_rate": 2.2362167856724805e-07, + "logits/chosen": -2.106142044067383, + "logits/rejected": -2.335698366165161, + "logps/chosen": -476.6043701171875, + "logps/rejected": -319.1282653808594, + "loss": 0.4662, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6081722974777222, + "rewards/margins": 1.5238926410675049, + "rewards/rejected": -2.1320648193359375, + "step": 2306 + }, + { + "epoch": 0.27, + "learning_rate": 2.2358656209762378e-07, + "logits/chosen": -2.2473981380462646, + "logits/rejected": -1.8442399501800537, + "logps/chosen": -170.47744750976562, + "logps/rejected": -324.4100646972656, + "loss": 0.4129, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7301777601242065, + "rewards/margins": 4.687042236328125, + "rewards/rejected": -5.417220115661621, + "step": 2307 + }, + { + "epoch": 0.27, + "learning_rate": 2.2355144562799953e-07, + "logits/chosen": -2.119511365890503, + "logits/rejected": -1.8453165292739868, + "logps/chosen": -245.50885009765625, + "logps/rejected": -335.636474609375, + "loss": 0.6295, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1505224704742432, + "rewards/margins": 0.6518568396568298, + "rewards/rejected": -1.8023793697357178, + "step": 2308 + }, + { + "epoch": 0.27, + "learning_rate": 2.2351632915837529e-07, + "logits/chosen": -2.078742027282715, + "logits/rejected": -2.053044319152832, + "logps/chosen": -315.56329345703125, + "logps/rejected": -325.4400634765625, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22723597288131714, + "rewards/margins": 2.253606081008911, + "rewards/rejected": -2.026370048522949, + "step": 2309 + }, + { + "epoch": 0.27, + "learning_rate": 2.23481212688751e-07, + "logits/chosen": -2.210249662399292, + "logits/rejected": -2.285341739654541, + "logps/chosen": -289.9751281738281, + "logps/rejected": -317.10943603515625, + "loss": 0.4492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.71319180727005, + "rewards/margins": 2.376112461090088, + "rewards/rejected": -3.0893044471740723, + "step": 2310 + }, + { + "epoch": 0.27, + "learning_rate": 2.2344609621912677e-07, + "logits/chosen": -2.562786340713501, + "logits/rejected": -2.763011932373047, + "logps/chosen": -208.00778198242188, + "logps/rejected": -203.8603515625, + "loss": 0.299, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6673613786697388, + "rewards/margins": 1.2805215120315552, + "rewards/rejected": -1.9478827714920044, + "step": 2311 + }, + { + "epoch": 0.27, + "learning_rate": 2.234109797495025e-07, + "logits/chosen": -1.894564151763916, + "logits/rejected": -2.2139601707458496, + "logps/chosen": -317.636962890625, + "logps/rejected": -322.27703857421875, + "loss": 0.3371, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0014984607696533, + "rewards/margins": 1.8308156728744507, + "rewards/rejected": -2.8323140144348145, + "step": 2312 + }, + { + "epoch": 0.27, + "learning_rate": 2.2337586327987825e-07, + "logits/chosen": -2.138413667678833, + "logits/rejected": -2.728994369506836, + "logps/chosen": -505.8308410644531, + "logps/rejected": -289.9732666015625, + "loss": 0.7953, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4618186950683594, + "rewards/margins": 0.5140074491500854, + "rewards/rejected": -1.9758260250091553, + "step": 2313 + }, + { + "epoch": 0.27, + "learning_rate": 2.23340746810254e-07, + "logits/chosen": -2.3780786991119385, + "logits/rejected": -1.8643308877944946, + "logps/chosen": -336.46673583984375, + "logps/rejected": -409.38922119140625, + "loss": 1.1986, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9732460975646973, + "rewards/margins": -0.19970451295375824, + "rewards/rejected": -1.7735415697097778, + "step": 2314 + }, + { + "epoch": 0.27, + "learning_rate": 2.2330563034062973e-07, + "logits/chosen": -1.968796968460083, + "logits/rejected": -2.332825183868408, + "logps/chosen": -305.62200927734375, + "logps/rejected": -198.396240234375, + "loss": 0.3929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37233757972717285, + "rewards/margins": 1.2895704507827759, + "rewards/rejected": -1.6619079113006592, + "step": 2315 + }, + { + "epoch": 0.27, + "learning_rate": 2.2327051387100548e-07, + "logits/chosen": -2.076939105987549, + "logits/rejected": -2.3907361030578613, + "logps/chosen": -361.1890869140625, + "logps/rejected": -278.1602783203125, + "loss": 0.41, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8468712568283081, + "rewards/margins": 1.278102159500122, + "rewards/rejected": -2.1249732971191406, + "step": 2316 + }, + { + "epoch": 0.27, + "learning_rate": 2.2323539740138126e-07, + "logits/chosen": -2.780182123184204, + "logits/rejected": -2.7071564197540283, + "logps/chosen": -147.98878479003906, + "logps/rejected": -268.60308837890625, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6603364944458008, + "rewards/margins": 1.8329744338989258, + "rewards/rejected": -2.4933106899261475, + "step": 2317 + }, + { + "epoch": 0.27, + "learning_rate": 2.23200280931757e-07, + "logits/chosen": -2.5820188522338867, + "logits/rejected": -2.29664945602417, + "logps/chosen": -233.2395477294922, + "logps/rejected": -263.7537841796875, + "loss": 0.5973, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7560647130012512, + "rewards/margins": 0.8999089002609253, + "rewards/rejected": -1.6559734344482422, + "step": 2318 + }, + { + "epoch": 0.27, + "learning_rate": 2.2316516446213274e-07, + "logits/chosen": -2.3411216735839844, + "logits/rejected": -2.5725257396698, + "logps/chosen": -356.65338134765625, + "logps/rejected": -324.53704833984375, + "loss": 0.0547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45386746525764465, + "rewards/margins": 4.3435845375061035, + "rewards/rejected": -4.797451972961426, + "step": 2319 + }, + { + "epoch": 0.27, + "learning_rate": 2.2313004799250847e-07, + "logits/chosen": -2.657813549041748, + "logits/rejected": -2.778353691101074, + "logps/chosen": -437.9046936035156, + "logps/rejected": -227.24488830566406, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1547164916992188, + "rewards/margins": 2.6632399559020996, + "rewards/rejected": -3.8179566860198975, + "step": 2320 + }, + { + "epoch": 0.27, + "learning_rate": 2.2309493152288423e-07, + "logits/chosen": -2.574470043182373, + "logits/rejected": -2.386849880218506, + "logps/chosen": -130.22857666015625, + "logps/rejected": -226.09107971191406, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7153923511505127, + "rewards/margins": 0.7283886671066284, + "rewards/rejected": -1.4437808990478516, + "step": 2321 + }, + { + "epoch": 0.27, + "learning_rate": 2.2305981505325998e-07, + "logits/chosen": -2.7455382347106934, + "logits/rejected": -2.686337471008301, + "logps/chosen": -351.7063903808594, + "logps/rejected": -211.05392456054688, + "loss": 0.1146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9456315040588379, + "rewards/margins": 2.3047869205474854, + "rewards/rejected": -3.2504184246063232, + "step": 2322 + }, + { + "epoch": 0.27, + "learning_rate": 2.230246985836357e-07, + "logits/chosen": -2.218808650970459, + "logits/rejected": -2.451542854309082, + "logps/chosen": -380.93133544921875, + "logps/rejected": -227.14796447753906, + "loss": 0.5072, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0960239171981812, + "rewards/margins": 1.0132875442504883, + "rewards/rejected": -2.109311580657959, + "step": 2323 + }, + { + "epoch": 0.27, + "learning_rate": 2.2298958211401146e-07, + "logits/chosen": -2.117636203765869, + "logits/rejected": -2.0617213249206543, + "logps/chosen": -279.4798278808594, + "logps/rejected": -299.8276672363281, + "loss": 0.5293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8020843267440796, + "rewards/margins": 1.8667795658111572, + "rewards/rejected": -2.6688640117645264, + "step": 2324 + }, + { + "epoch": 0.27, + "learning_rate": 2.2295446564438722e-07, + "logits/chosen": -2.7677531242370605, + "logits/rejected": -2.831096649169922, + "logps/chosen": -162.5196990966797, + "logps/rejected": -125.515380859375, + "loss": 0.5473, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5780829191207886, + "rewards/margins": 0.9753417372703552, + "rewards/rejected": -1.553424596786499, + "step": 2325 + }, + { + "epoch": 0.27, + "learning_rate": 2.2291934917476294e-07, + "logits/chosen": -1.9911671876907349, + "logits/rejected": -1.772268295288086, + "logps/chosen": -225.91726684570312, + "logps/rejected": -259.5470275878906, + "loss": 0.3903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2543535828590393, + "rewards/margins": 1.0614268779754639, + "rewards/rejected": -1.315780520439148, + "step": 2326 + }, + { + "epoch": 0.27, + "learning_rate": 2.228842327051387e-07, + "logits/chosen": -2.652519702911377, + "logits/rejected": -2.481191873550415, + "logps/chosen": -225.17340087890625, + "logps/rejected": -366.6904296875, + "loss": 0.5371, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1778266429901123, + "rewards/margins": 2.0115153789520264, + "rewards/rejected": -3.1893417835235596, + "step": 2327 + }, + { + "epoch": 0.27, + "learning_rate": 2.2284911623551442e-07, + "logits/chosen": -2.6167397499084473, + "logits/rejected": -2.601109504699707, + "logps/chosen": -268.5242919921875, + "logps/rejected": -394.97589111328125, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5056313276290894, + "rewards/margins": 4.603331565856934, + "rewards/rejected": -5.1089630126953125, + "step": 2328 + }, + { + "epoch": 0.27, + "learning_rate": 2.228139997658902e-07, + "logits/chosen": -2.644454002380371, + "logits/rejected": -2.814751625061035, + "logps/chosen": -241.59786987304688, + "logps/rejected": -237.32212829589844, + "loss": 0.5099, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5417253971099854, + "rewards/margins": 1.072310447692871, + "rewards/rejected": -2.6140358448028564, + "step": 2329 + }, + { + "epoch": 0.27, + "learning_rate": 2.2277888329626596e-07, + "logits/chosen": -2.914194345474243, + "logits/rejected": -2.8407115936279297, + "logps/chosen": -248.74652099609375, + "logps/rejected": -257.2648010253906, + "loss": 0.2054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8307681083679199, + "rewards/margins": 3.121891498565674, + "rewards/rejected": -3.952660083770752, + "step": 2330 + }, + { + "epoch": 0.27, + "learning_rate": 2.2274376682664169e-07, + "logits/chosen": -2.1120777130126953, + "logits/rejected": -2.1309189796447754, + "logps/chosen": -396.2763671875, + "logps/rejected": -333.07720947265625, + "loss": 0.3835, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4352627992630005, + "rewards/margins": 1.9320886135101318, + "rewards/rejected": -3.367351531982422, + "step": 2331 + }, + { + "epoch": 0.27, + "learning_rate": 2.2270865035701744e-07, + "logits/chosen": -1.9145435094833374, + "logits/rejected": -2.3583173751831055, + "logps/chosen": -369.1493225097656, + "logps/rejected": -305.28631591796875, + "loss": 0.4741, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8936169743537903, + "rewards/margins": 2.3734569549560547, + "rewards/rejected": -3.2670741081237793, + "step": 2332 + }, + { + "epoch": 0.27, + "learning_rate": 2.2267353388739317e-07, + "logits/chosen": -1.8098840713500977, + "logits/rejected": -1.8379486799240112, + "logps/chosen": -433.43060302734375, + "logps/rejected": -434.41033935546875, + "loss": 0.2508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22187723219394684, + "rewards/margins": 1.805922269821167, + "rewards/rejected": -2.027799367904663, + "step": 2333 + }, + { + "epoch": 0.27, + "learning_rate": 2.2263841741776892e-07, + "logits/chosen": -2.394463539123535, + "logits/rejected": -2.6185858249664307, + "logps/chosen": -240.04795837402344, + "logps/rejected": -303.8496398925781, + "loss": 0.6522, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5267966985702515, + "rewards/margins": 1.3232052326202393, + "rewards/rejected": -1.8500018119812012, + "step": 2334 + }, + { + "epoch": 0.27, + "learning_rate": 2.2260330094814467e-07, + "logits/chosen": -2.6977767944335938, + "logits/rejected": -2.6940994262695312, + "logps/chosen": -171.8856658935547, + "logps/rejected": -203.81832885742188, + "loss": 0.4471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6009179949760437, + "rewards/margins": 1.0224062204360962, + "rewards/rejected": -1.6233241558074951, + "step": 2335 + }, + { + "epoch": 0.27, + "learning_rate": 2.225681844785204e-07, + "logits/chosen": -2.371288776397705, + "logits/rejected": -2.394127368927002, + "logps/chosen": -386.759521484375, + "logps/rejected": -290.6531982421875, + "loss": 0.3394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5208035111427307, + "rewards/margins": 2.6835129261016846, + "rewards/rejected": -3.2043166160583496, + "step": 2336 + }, + { + "epoch": 0.27, + "learning_rate": 2.2253306800889616e-07, + "logits/chosen": -1.8388299942016602, + "logits/rejected": -2.0180723667144775, + "logps/chosen": -430.312255859375, + "logps/rejected": -302.556640625, + "loss": 0.329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3034083843231201, + "rewards/margins": 2.0435354709625244, + "rewards/rejected": -2.3469438552856445, + "step": 2337 + }, + { + "epoch": 0.27, + "learning_rate": 2.224979515392719e-07, + "logits/chosen": -2.4800219535827637, + "logits/rejected": -2.643282890319824, + "logps/chosen": -366.2763671875, + "logps/rejected": -345.3385009765625, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39767876267433167, + "rewards/margins": 2.935542583465576, + "rewards/rejected": -3.333221197128296, + "step": 2338 + }, + { + "epoch": 0.27, + "learning_rate": 2.2246283506964764e-07, + "logits/chosen": -1.9973983764648438, + "logits/rejected": -2.101494550704956, + "logps/chosen": -306.080810546875, + "logps/rejected": -306.88629150390625, + "loss": 0.4586, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7141282558441162, + "rewards/margins": 0.9203584790229797, + "rewards/rejected": -1.6344866752624512, + "step": 2339 + }, + { + "epoch": 0.27, + "learning_rate": 2.2242771860002342e-07, + "logits/chosen": -2.285529375076294, + "logits/rejected": -2.2523622512817383, + "logps/chosen": -180.410400390625, + "logps/rejected": -217.8936767578125, + "loss": 0.6062, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1861073970794678, + "rewards/margins": 0.7254897356033325, + "rewards/rejected": -1.9115971326828003, + "step": 2340 + }, + { + "epoch": 0.27, + "learning_rate": 2.2239260213039914e-07, + "logits/chosen": -2.208009719848633, + "logits/rejected": -2.2298834323883057, + "logps/chosen": -271.63800048828125, + "logps/rejected": -357.0938720703125, + "loss": 0.1985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4209343492984772, + "rewards/margins": 2.879362106323242, + "rewards/rejected": -3.3002967834472656, + "step": 2341 + }, + { + "epoch": 0.27, + "learning_rate": 2.223574856607749e-07, + "logits/chosen": -2.9570510387420654, + "logits/rejected": -2.9251291751861572, + "logps/chosen": -138.8157958984375, + "logps/rejected": -132.87847900390625, + "loss": 0.5251, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0708885192871094, + "rewards/margins": 1.3592755794525146, + "rewards/rejected": -2.430164337158203, + "step": 2342 + }, + { + "epoch": 0.27, + "learning_rate": 2.2232236919115065e-07, + "logits/chosen": -2.9475908279418945, + "logits/rejected": -2.93341064453125, + "logps/chosen": -152.5248260498047, + "logps/rejected": -163.50439453125, + "loss": 0.5626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4463386833667755, + "rewards/margins": 1.3527623414993286, + "rewards/rejected": -1.7991009950637817, + "step": 2343 + }, + { + "epoch": 0.27, + "learning_rate": 2.2228725272152638e-07, + "logits/chosen": -2.4209444522857666, + "logits/rejected": -2.452770709991455, + "logps/chosen": -199.39947509765625, + "logps/rejected": -310.3177795410156, + "loss": 0.8801, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.441758155822754, + "rewards/margins": 1.4588096141815186, + "rewards/rejected": -2.9005680084228516, + "step": 2344 + }, + { + "epoch": 0.27, + "learning_rate": 2.2225213625190213e-07, + "logits/chosen": -1.8815453052520752, + "logits/rejected": -2.137511730194092, + "logps/chosen": -323.1527404785156, + "logps/rejected": -234.67410278320312, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5032964944839478, + "rewards/margins": 2.114691972732544, + "rewards/rejected": -2.617988348007202, + "step": 2345 + }, + { + "epoch": 0.27, + "learning_rate": 2.222170197822779e-07, + "logits/chosen": -2.1829748153686523, + "logits/rejected": -2.1844396591186523, + "logps/chosen": -281.1679992675781, + "logps/rejected": -281.53570556640625, + "loss": 0.4576, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45150789618492126, + "rewards/margins": 1.6352264881134033, + "rewards/rejected": -2.0867342948913574, + "step": 2346 + }, + { + "epoch": 0.27, + "learning_rate": 2.2218190331265361e-07, + "logits/chosen": -2.47847580909729, + "logits/rejected": -2.564591884613037, + "logps/chosen": -140.65565490722656, + "logps/rejected": -257.238525390625, + "loss": 0.2177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1115285158157349, + "rewards/margins": 2.1177432537078857, + "rewards/rejected": -3.22927188873291, + "step": 2347 + }, + { + "epoch": 0.27, + "learning_rate": 2.2214678684302937e-07, + "logits/chosen": -2.467628240585327, + "logits/rejected": -2.5309293270111084, + "logps/chosen": -307.7852783203125, + "logps/rejected": -254.46783447265625, + "loss": 0.7072, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8163573741912842, + "rewards/margins": 1.8994379043579102, + "rewards/rejected": -3.7157955169677734, + "step": 2348 + }, + { + "epoch": 0.27, + "learning_rate": 2.221116703734051e-07, + "logits/chosen": -2.259749174118042, + "logits/rejected": -2.572483539581299, + "logps/chosen": -296.1116943359375, + "logps/rejected": -179.5673828125, + "loss": 0.6076, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1013505458831787, + "rewards/margins": 1.6502444744110107, + "rewards/rejected": -2.7515952587127686, + "step": 2349 + }, + { + "epoch": 0.27, + "learning_rate": 2.2207655390378085e-07, + "logits/chosen": -2.642601251602173, + "logits/rejected": -2.683359146118164, + "logps/chosen": -157.21798706054688, + "logps/rejected": -220.0619659423828, + "loss": 0.3045, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7170559167861938, + "rewards/margins": 2.254762887954712, + "rewards/rejected": -2.9718189239501953, + "step": 2350 + }, + { + "epoch": 0.27, + "learning_rate": 2.2204143743415663e-07, + "logits/chosen": -2.580897092819214, + "logits/rejected": -2.539088487625122, + "logps/chosen": -443.7417297363281, + "logps/rejected": -274.9808349609375, + "loss": 0.1995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01636028289794922, + "rewards/margins": 2.2727341651916504, + "rewards/rejected": -2.2890946865081787, + "step": 2351 + }, + { + "epoch": 0.27, + "learning_rate": 2.2200632096453236e-07, + "logits/chosen": -2.133272171020508, + "logits/rejected": -2.1605677604675293, + "logps/chosen": -376.1703796386719, + "logps/rejected": -351.35589599609375, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1748581826686859, + "rewards/margins": 2.7170557975769043, + "rewards/rejected": -2.891913890838623, + "step": 2352 + }, + { + "epoch": 0.27, + "learning_rate": 2.219712044949081e-07, + "logits/chosen": -2.3492956161499023, + "logits/rejected": -2.486117362976074, + "logps/chosen": -234.74758911132812, + "logps/rejected": -157.24978637695312, + "loss": 0.8016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.966786801815033, + "rewards/margins": 0.2763550281524658, + "rewards/rejected": -1.2431418895721436, + "step": 2353 + }, + { + "epoch": 0.27, + "learning_rate": 2.2193608802528387e-07, + "logits/chosen": -2.987596035003662, + "logits/rejected": -2.89559006690979, + "logps/chosen": -347.63916015625, + "logps/rejected": -263.3689270019531, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9938812255859375, + "rewards/margins": 2.3228108882904053, + "rewards/rejected": -3.316692352294922, + "step": 2354 + }, + { + "epoch": 0.27, + "learning_rate": 2.219009715556596e-07, + "logits/chosen": -2.335207462310791, + "logits/rejected": -2.370251178741455, + "logps/chosen": -387.9110107421875, + "logps/rejected": -395.8760986328125, + "loss": 0.4766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5802103281021118, + "rewards/margins": 1.7760004997253418, + "rewards/rejected": -2.356210708618164, + "step": 2355 + }, + { + "epoch": 0.27, + "learning_rate": 2.2186585508603535e-07, + "logits/chosen": -2.1861867904663086, + "logits/rejected": -2.133204460144043, + "logps/chosen": -467.4580078125, + "logps/rejected": -421.5353698730469, + "loss": 0.1708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7164255380630493, + "rewards/margins": 2.378657817840576, + "rewards/rejected": -3.095083475112915, + "step": 2356 + }, + { + "epoch": 0.27, + "learning_rate": 2.2183073861641107e-07, + "logits/chosen": -2.493612766265869, + "logits/rejected": -2.34818434715271, + "logps/chosen": -200.64572143554688, + "logps/rejected": -234.55459594726562, + "loss": 0.4985, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0653644800186157, + "rewards/margins": 1.1047426462173462, + "rewards/rejected": -2.170107126235962, + "step": 2357 + }, + { + "epoch": 0.27, + "learning_rate": 2.2179562214678683e-07, + "logits/chosen": -2.5739963054656982, + "logits/rejected": -2.6181867122650146, + "logps/chosen": -175.628173828125, + "logps/rejected": -223.91250610351562, + "loss": 0.5497, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1274449825286865, + "rewards/margins": 0.6008571982383728, + "rewards/rejected": -1.7283021211624146, + "step": 2358 + }, + { + "epoch": 0.27, + "learning_rate": 2.2176050567716258e-07, + "logits/chosen": -2.1682605743408203, + "logits/rejected": -2.4296984672546387, + "logps/chosen": -297.6495361328125, + "logps/rejected": -283.84869384765625, + "loss": 0.1463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5202710628509521, + "rewards/margins": 2.2745792865753174, + "rewards/rejected": -2.7948505878448486, + "step": 2359 + }, + { + "epoch": 0.27, + "learning_rate": 2.217253892075383e-07, + "logits/chosen": -2.383439064025879, + "logits/rejected": -2.644294023513794, + "logps/chosen": -396.9078369140625, + "logps/rejected": -257.13751220703125, + "loss": 0.3052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8673213124275208, + "rewards/margins": 1.902815580368042, + "rewards/rejected": -2.770136833190918, + "step": 2360 + }, + { + "epoch": 0.27, + "learning_rate": 2.2169027273791406e-07, + "logits/chosen": -2.9486501216888428, + "logits/rejected": -3.0238823890686035, + "logps/chosen": -267.8651428222656, + "logps/rejected": -237.68438720703125, + "loss": 0.2651, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27045825123786926, + "rewards/margins": 1.8422611951828003, + "rewards/rejected": -2.1127192974090576, + "step": 2361 + }, + { + "epoch": 0.27, + "learning_rate": 2.2165515626828984e-07, + "logits/chosen": -2.575305938720703, + "logits/rejected": -2.5817079544067383, + "logps/chosen": -151.13548278808594, + "logps/rejected": -151.58734130859375, + "loss": 0.5313, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.060714662075042725, + "rewards/margins": 1.6217812299728394, + "rewards/rejected": -1.6824959516525269, + "step": 2362 + }, + { + "epoch": 0.27, + "learning_rate": 2.2162003979866557e-07, + "logits/chosen": -2.4389002323150635, + "logits/rejected": -2.595860004425049, + "logps/chosen": -283.9049377441406, + "logps/rejected": -183.95474243164062, + "loss": 0.5309, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4264901280403137, + "rewards/margins": 1.2987791299819946, + "rewards/rejected": -1.7252693176269531, + "step": 2363 + }, + { + "epoch": 0.27, + "learning_rate": 2.2158492332904132e-07, + "logits/chosen": -1.6870752573013306, + "logits/rejected": -1.6001951694488525, + "logps/chosen": -473.3648376464844, + "logps/rejected": -631.05615234375, + "loss": 0.909, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3850433826446533, + "rewards/margins": 0.9849206209182739, + "rewards/rejected": -1.3699640035629272, + "step": 2364 + }, + { + "epoch": 0.27, + "learning_rate": 2.2154980685941705e-07, + "logits/chosen": -2.262080192565918, + "logits/rejected": -2.430208683013916, + "logps/chosen": -251.68316650390625, + "logps/rejected": -303.1956787109375, + "loss": 0.4859, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0219839811325073, + "rewards/margins": 1.8706679344177246, + "rewards/rejected": -2.8926520347595215, + "step": 2365 + }, + { + "epoch": 0.27, + "learning_rate": 2.215146903897928e-07, + "logits/chosen": -2.0109810829162598, + "logits/rejected": -2.124664306640625, + "logps/chosen": -279.01361083984375, + "logps/rejected": -321.35791015625, + "loss": 1.2333, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5159120559692383, + "rewards/margins": 1.1507490873336792, + "rewards/rejected": -2.666661024093628, + "step": 2366 + }, + { + "epoch": 0.27, + "learning_rate": 2.2147957392016856e-07, + "logits/chosen": -2.646251916885376, + "logits/rejected": -2.5695509910583496, + "logps/chosen": -272.1863098144531, + "logps/rejected": -293.919189453125, + "loss": 0.2832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9350436925888062, + "rewards/margins": 1.649257779121399, + "rewards/rejected": -2.584301471710205, + "step": 2367 + }, + { + "epoch": 0.27, + "learning_rate": 2.214444574505443e-07, + "logits/chosen": -2.077392101287842, + "logits/rejected": -2.0770273208618164, + "logps/chosen": -369.7158203125, + "logps/rejected": -410.0614929199219, + "loss": 0.7045, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44733789563179016, + "rewards/margins": 1.2229883670806885, + "rewards/rejected": -1.6703262329101562, + "step": 2368 + }, + { + "epoch": 0.27, + "learning_rate": 2.2140934098092004e-07, + "logits/chosen": -2.368267297744751, + "logits/rejected": -2.3903355598449707, + "logps/chosen": -302.8724365234375, + "logps/rejected": -302.426025390625, + "loss": 0.1017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20706647634506226, + "rewards/margins": 3.374300003051758, + "rewards/rejected": -3.581366539001465, + "step": 2369 + }, + { + "epoch": 0.27, + "learning_rate": 2.213742245112958e-07, + "logits/chosen": -2.6683804988861084, + "logits/rejected": -2.8517556190490723, + "logps/chosen": -483.73541259765625, + "logps/rejected": -363.65673828125, + "loss": 0.1468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29239723086357117, + "rewards/margins": 5.1540679931640625, + "rewards/rejected": -5.446465015411377, + "step": 2370 + }, + { + "epoch": 0.27, + "learning_rate": 2.2133910804167152e-07, + "logits/chosen": -1.8696742057800293, + "logits/rejected": -2.435608148574829, + "logps/chosen": -491.0266418457031, + "logps/rejected": -285.97845458984375, + "loss": 0.3963, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6071027517318726, + "rewards/margins": 1.2514262199401855, + "rewards/rejected": -1.8585290908813477, + "step": 2371 + }, + { + "epoch": 0.27, + "learning_rate": 2.2130399157204728e-07, + "logits/chosen": -2.2201337814331055, + "logits/rejected": -2.5258193016052246, + "logps/chosen": -524.08642578125, + "logps/rejected": -211.4732666015625, + "loss": 0.6742, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5142760276794434, + "rewards/margins": 1.3403037786483765, + "rewards/rejected": -1.8545796871185303, + "step": 2372 + }, + { + "epoch": 0.27, + "learning_rate": 2.21268875102423e-07, + "logits/chosen": -2.5254735946655273, + "logits/rejected": -2.59126615524292, + "logps/chosen": -214.21389770507812, + "logps/rejected": -194.32034301757812, + "loss": 0.5094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0030393600463867, + "rewards/margins": 1.1658135652542114, + "rewards/rejected": -2.1688528060913086, + "step": 2373 + }, + { + "epoch": 0.27, + "learning_rate": 2.2123375863279878e-07, + "logits/chosen": -2.528968572616577, + "logits/rejected": -2.5495333671569824, + "logps/chosen": -322.10498046875, + "logps/rejected": -249.1849822998047, + "loss": 0.0978, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07190589606761932, + "rewards/margins": 2.5882861614227295, + "rewards/rejected": -2.5163800716400146, + "step": 2374 + }, + { + "epoch": 0.27, + "learning_rate": 2.2119864216317454e-07, + "logits/chosen": -1.9059357643127441, + "logits/rejected": -2.136474847793579, + "logps/chosen": -306.00262451171875, + "logps/rejected": -272.22222900390625, + "loss": 0.3993, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1895300149917603, + "rewards/margins": 1.8036973476409912, + "rewards/rejected": -2.993227481842041, + "step": 2375 + }, + { + "epoch": 0.27, + "learning_rate": 2.2116352569355026e-07, + "logits/chosen": -2.0937342643737793, + "logits/rejected": -2.0928173065185547, + "logps/chosen": -387.49920654296875, + "logps/rejected": -412.2734375, + "loss": 0.1961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8239294290542603, + "rewards/margins": 3.5010881423950195, + "rewards/rejected": -4.32501745223999, + "step": 2376 + }, + { + "epoch": 0.27, + "learning_rate": 2.2112840922392602e-07, + "logits/chosen": -1.8557180166244507, + "logits/rejected": -2.0374646186828613, + "logps/chosen": -395.20709228515625, + "logps/rejected": -332.66265869140625, + "loss": 0.1865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5466362237930298, + "rewards/margins": 3.022686004638672, + "rewards/rejected": -3.5693225860595703, + "step": 2377 + }, + { + "epoch": 0.27, + "learning_rate": 2.2109329275430175e-07, + "logits/chosen": -1.7497583627700806, + "logits/rejected": -1.920516014099121, + "logps/chosen": -292.8885498046875, + "logps/rejected": -269.3721923828125, + "loss": 0.5238, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5246318578720093, + "rewards/margins": 1.5935754776000977, + "rewards/rejected": -2.1182072162628174, + "step": 2378 + }, + { + "epoch": 0.27, + "learning_rate": 2.210581762846775e-07, + "logits/chosen": -2.25970458984375, + "logits/rejected": -2.405925750732422, + "logps/chosen": -363.86407470703125, + "logps/rejected": -188.42758178710938, + "loss": 1.0261, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6829233169555664, + "rewards/margins": 0.17885711789131165, + "rewards/rejected": -0.8617804050445557, + "step": 2379 + }, + { + "epoch": 0.27, + "learning_rate": 2.2102305981505325e-07, + "logits/chosen": -2.1219100952148438, + "logits/rejected": -2.564016580581665, + "logps/chosen": -322.753662109375, + "logps/rejected": -436.9297180175781, + "loss": 0.3487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13112500309944153, + "rewards/margins": 1.2841987609863281, + "rewards/rejected": -1.4153237342834473, + "step": 2380 + }, + { + "epoch": 0.27, + "learning_rate": 2.2098794334542898e-07, + "logits/chosen": -2.961305618286133, + "logits/rejected": -2.953859806060791, + "logps/chosen": -179.4132537841797, + "logps/rejected": -188.67416381835938, + "loss": 0.2547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7422632575035095, + "rewards/margins": 1.8796485662460327, + "rewards/rejected": -2.6219117641448975, + "step": 2381 + }, + { + "epoch": 0.27, + "learning_rate": 2.2095282687580473e-07, + "logits/chosen": -2.3975234031677246, + "logits/rejected": -2.487639904022217, + "logps/chosen": -325.08526611328125, + "logps/rejected": -244.52444458007812, + "loss": 0.7361, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2794928550720215, + "rewards/margins": 0.3635224997997284, + "rewards/rejected": -1.6430151462554932, + "step": 2382 + }, + { + "epoch": 0.27, + "learning_rate": 2.209177104061805e-07, + "logits/chosen": -1.6325751543045044, + "logits/rejected": -1.8027052879333496, + "logps/chosen": -243.8502197265625, + "logps/rejected": -194.97836303710938, + "loss": 0.2526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08689990639686584, + "rewards/margins": 1.801526427268982, + "rewards/rejected": -1.8884263038635254, + "step": 2383 + }, + { + "epoch": 0.27, + "learning_rate": 2.2088259393655622e-07, + "logits/chosen": -2.0785253047943115, + "logits/rejected": -2.09299898147583, + "logps/chosen": -318.3966369628906, + "logps/rejected": -496.0134582519531, + "loss": 0.1003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8018190860748291, + "rewards/margins": 3.352431058883667, + "rewards/rejected": -4.154250621795654, + "step": 2384 + }, + { + "epoch": 0.27, + "learning_rate": 2.20847477466932e-07, + "logits/chosen": -2.3991806507110596, + "logits/rejected": -2.109764575958252, + "logps/chosen": -189.80503845214844, + "logps/rejected": -297.18035888671875, + "loss": 0.2634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.728205680847168, + "rewards/margins": 2.5195915699005127, + "rewards/rejected": -3.2477972507476807, + "step": 2385 + }, + { + "epoch": 0.28, + "learning_rate": 2.2081236099730772e-07, + "logits/chosen": -2.1951541900634766, + "logits/rejected": -2.139547109603882, + "logps/chosen": -313.56622314453125, + "logps/rejected": -343.3870849609375, + "loss": 0.4433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6855318546295166, + "rewards/margins": 1.6657932996749878, + "rewards/rejected": -2.351325273513794, + "step": 2386 + }, + { + "epoch": 0.28, + "learning_rate": 2.2077724452768348e-07, + "logits/chosen": -2.0304999351501465, + "logits/rejected": -1.8919687271118164, + "logps/chosen": -181.15602111816406, + "logps/rejected": -240.35995483398438, + "loss": 0.7679, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.224722146987915, + "rewards/margins": 1.4663596153259277, + "rewards/rejected": -2.6910817623138428, + "step": 2387 + }, + { + "epoch": 0.28, + "learning_rate": 2.2074212805805923e-07, + "logits/chosen": -2.410897970199585, + "logits/rejected": -2.0534427165985107, + "logps/chosen": -291.44970703125, + "logps/rejected": -346.6551818847656, + "loss": 0.4822, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7325148582458496, + "rewards/margins": 1.1019924879074097, + "rewards/rejected": -1.8345073461532593, + "step": 2388 + }, + { + "epoch": 0.28, + "learning_rate": 2.2070701158843496e-07, + "logits/chosen": -1.971901297569275, + "logits/rejected": -1.751090168952942, + "logps/chosen": -217.89456176757812, + "logps/rejected": -313.3287658691406, + "loss": 0.3593, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38292092084884644, + "rewards/margins": 1.514974594116211, + "rewards/rejected": -1.897895336151123, + "step": 2389 + }, + { + "epoch": 0.28, + "learning_rate": 2.206718951188107e-07, + "logits/chosen": -2.1593892574310303, + "logits/rejected": -1.879225492477417, + "logps/chosen": -340.843505859375, + "logps/rejected": -315.10498046875, + "loss": 0.2253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9217020273208618, + "rewards/margins": 1.8935468196868896, + "rewards/rejected": -2.815248966217041, + "step": 2390 + }, + { + "epoch": 0.28, + "learning_rate": 2.2063677864918647e-07, + "logits/chosen": -2.774963855743408, + "logits/rejected": -2.5040993690490723, + "logps/chosen": -223.13287353515625, + "logps/rejected": -332.9956359863281, + "loss": 0.0892, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04034671559929848, + "rewards/margins": 4.278153419494629, + "rewards/rejected": -4.237806797027588, + "step": 2391 + }, + { + "epoch": 0.28, + "learning_rate": 2.206016621795622e-07, + "logits/chosen": -2.330514907836914, + "logits/rejected": -2.516277313232422, + "logps/chosen": -279.4637145996094, + "logps/rejected": -284.7630310058594, + "loss": 0.3804, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.860355019569397, + "rewards/margins": 2.676506280899048, + "rewards/rejected": -3.5368614196777344, + "step": 2392 + }, + { + "epoch": 0.28, + "learning_rate": 2.2056654570993795e-07, + "logits/chosen": -2.8402087688446045, + "logits/rejected": -2.6616387367248535, + "logps/chosen": -125.63406372070312, + "logps/rejected": -282.2242431640625, + "loss": 0.6151, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.299598217010498, + "rewards/margins": 1.0049552917480469, + "rewards/rejected": -2.304553508758545, + "step": 2393 + }, + { + "epoch": 0.28, + "learning_rate": 2.2053142924031368e-07, + "logits/chosen": -2.4620003700256348, + "logits/rejected": -2.121260643005371, + "logps/chosen": -212.80453491210938, + "logps/rejected": -220.99609375, + "loss": 0.4673, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04905782639980316, + "rewards/margins": 2.2365262508392334, + "rewards/rejected": -2.1874685287475586, + "step": 2394 + }, + { + "epoch": 0.28, + "learning_rate": 2.2049631277068943e-07, + "logits/chosen": -2.9918112754821777, + "logits/rejected": -3.004377841949463, + "logps/chosen": -217.65628051757812, + "logps/rejected": -155.64474487304688, + "loss": 0.2227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5616525411605835, + "rewards/margins": 2.136996269226074, + "rewards/rejected": -2.698648691177368, + "step": 2395 + }, + { + "epoch": 0.28, + "learning_rate": 2.204611963010652e-07, + "logits/chosen": -1.8859658241271973, + "logits/rejected": -2.1740736961364746, + "logps/chosen": -190.14622497558594, + "logps/rejected": -84.29082489013672, + "loss": 0.4129, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014460060745477676, + "rewards/margins": 0.7915611863136292, + "rewards/rejected": -0.8060212135314941, + "step": 2396 + }, + { + "epoch": 0.28, + "learning_rate": 2.2042607983144094e-07, + "logits/chosen": -2.394643783569336, + "logits/rejected": -2.3400518894195557, + "logps/chosen": -222.10736083984375, + "logps/rejected": -240.57183837890625, + "loss": 0.4487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8692567348480225, + "rewards/margins": 1.2994678020477295, + "rewards/rejected": -2.168724536895752, + "step": 2397 + }, + { + "epoch": 0.28, + "learning_rate": 2.203909633618167e-07, + "logits/chosen": -1.3488788604736328, + "logits/rejected": -1.9583373069763184, + "logps/chosen": -488.72076416015625, + "logps/rejected": -263.3193359375, + "loss": 0.6876, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8467933535575867, + "rewards/margins": 0.7025682926177979, + "rewards/rejected": -1.5493617057800293, + "step": 2398 + }, + { + "epoch": 0.28, + "learning_rate": 2.2035584689219244e-07, + "logits/chosen": -2.2155797481536865, + "logits/rejected": -1.8532993793487549, + "logps/chosen": -160.47857666015625, + "logps/rejected": -247.35321044921875, + "loss": 0.4468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5911130905151367, + "rewards/margins": 0.850537896156311, + "rewards/rejected": -1.4416511058807373, + "step": 2399 + }, + { + "epoch": 0.28, + "learning_rate": 2.2032073042256817e-07, + "logits/chosen": -1.683570146560669, + "logits/rejected": -2.0189461708068848, + "logps/chosen": -249.63784790039062, + "logps/rejected": -171.45147705078125, + "loss": 0.34, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9943113327026367, + "rewards/margins": 1.6675384044647217, + "rewards/rejected": -2.6618497371673584, + "step": 2400 + }, + { + "epoch": 0.28, + "learning_rate": 2.2028561395294393e-07, + "logits/chosen": -2.3118672370910645, + "logits/rejected": -2.599602222442627, + "logps/chosen": -199.7015380859375, + "logps/rejected": -198.45233154296875, + "loss": 0.3083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8504457473754883, + "rewards/margins": 1.9321576356887817, + "rewards/rejected": -2.7826035022735596, + "step": 2401 + }, + { + "epoch": 0.28, + "learning_rate": 2.2025049748331965e-07, + "logits/chosen": -2.6827046871185303, + "logits/rejected": -2.617259979248047, + "logps/chosen": -198.2112274169922, + "logps/rejected": -223.09483337402344, + "loss": 0.3617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3909270763397217, + "rewards/margins": 3.3199141025543213, + "rewards/rejected": -3.710841178894043, + "step": 2402 + }, + { + "epoch": 0.28, + "learning_rate": 2.202153810136954e-07, + "logits/chosen": -2.203894853591919, + "logits/rejected": -2.3268704414367676, + "logps/chosen": -323.5906982421875, + "logps/rejected": -365.87451171875, + "loss": 0.7085, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0424226522445679, + "rewards/margins": 1.400367021560669, + "rewards/rejected": -2.4427895545959473, + "step": 2403 + }, + { + "epoch": 0.28, + "learning_rate": 2.2018026454407116e-07, + "logits/chosen": -2.4256062507629395, + "logits/rejected": -2.4161722660064697, + "logps/chosen": -155.8611297607422, + "logps/rejected": -178.993408203125, + "loss": 0.6483, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9338624477386475, + "rewards/margins": 0.9607514142990112, + "rewards/rejected": -1.8946138620376587, + "step": 2404 + }, + { + "epoch": 0.28, + "learning_rate": 2.201451480744469e-07, + "logits/chosen": -2.0329463481903076, + "logits/rejected": -2.176722764968872, + "logps/chosen": -445.83856201171875, + "logps/rejected": -273.5535888671875, + "loss": 0.5122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7735891938209534, + "rewards/margins": 1.4234522581100464, + "rewards/rejected": -2.1970412731170654, + "step": 2405 + }, + { + "epoch": 0.28, + "learning_rate": 2.2011003160482264e-07, + "logits/chosen": -2.3549611568450928, + "logits/rejected": -1.9520107507705688, + "logps/chosen": -180.7008514404297, + "logps/rejected": -297.4730224609375, + "loss": 0.6425, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9192309379577637, + "rewards/margins": 1.973548412322998, + "rewards/rejected": -3.8927793502807617, + "step": 2406 + }, + { + "epoch": 0.28, + "learning_rate": 2.2007491513519842e-07, + "logits/chosen": -2.8712069988250732, + "logits/rejected": -2.8810136318206787, + "logps/chosen": -257.8601379394531, + "logps/rejected": -280.24432373046875, + "loss": 1.4825, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7947702407836914, + "rewards/margins": -0.4868800640106201, + "rewards/rejected": -1.3078901767730713, + "step": 2407 + }, + { + "epoch": 0.28, + "learning_rate": 2.2003979866557415e-07, + "logits/chosen": -2.460440158843994, + "logits/rejected": -2.625044107437134, + "logps/chosen": -335.6321716308594, + "logps/rejected": -267.79449462890625, + "loss": 0.1422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5189148783683777, + "rewards/margins": 2.5779902935028076, + "rewards/rejected": -3.09690523147583, + "step": 2408 + }, + { + "epoch": 0.28, + "learning_rate": 2.200046821959499e-07, + "logits/chosen": -2.7236154079437256, + "logits/rejected": -2.7522871494293213, + "logps/chosen": -183.46408081054688, + "logps/rejected": -212.1078338623047, + "loss": 0.3491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4583776891231537, + "rewards/margins": 1.3655753135681152, + "rewards/rejected": -1.8239531517028809, + "step": 2409 + }, + { + "epoch": 0.28, + "learning_rate": 2.1996956572632563e-07, + "logits/chosen": -1.6842350959777832, + "logits/rejected": -1.8193742036819458, + "logps/chosen": -246.3204345703125, + "logps/rejected": -244.92266845703125, + "loss": 0.4591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.253162682056427, + "rewards/margins": 1.3929805755615234, + "rewards/rejected": -1.6461431980133057, + "step": 2410 + }, + { + "epoch": 0.28, + "learning_rate": 2.1993444925670138e-07, + "logits/chosen": -2.4054393768310547, + "logits/rejected": -2.5681209564208984, + "logps/chosen": -295.2700500488281, + "logps/rejected": -311.75543212890625, + "loss": 0.5096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6978799700737, + "rewards/margins": 1.1598490476608276, + "rewards/rejected": -1.857729196548462, + "step": 2411 + }, + { + "epoch": 0.28, + "learning_rate": 2.1989933278707714e-07, + "logits/chosen": -2.130776882171631, + "logits/rejected": -2.03635311126709, + "logps/chosen": -333.10638427734375, + "logps/rejected": -406.77972412109375, + "loss": 0.2945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6551268696784973, + "rewards/margins": 2.522439956665039, + "rewards/rejected": -3.1775670051574707, + "step": 2412 + }, + { + "epoch": 0.28, + "learning_rate": 2.1986421631745287e-07, + "logits/chosen": -2.6250510215759277, + "logits/rejected": -2.6702487468719482, + "logps/chosen": -427.0072021484375, + "logps/rejected": -258.33685302734375, + "loss": 0.2862, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5218257308006287, + "rewards/margins": 2.186528444290161, + "rewards/rejected": -2.7083539962768555, + "step": 2413 + }, + { + "epoch": 0.28, + "learning_rate": 2.1982909984782862e-07, + "logits/chosen": -2.1830947399139404, + "logits/rejected": -2.325137138366699, + "logps/chosen": -152.19631958007812, + "logps/rejected": -304.04412841796875, + "loss": 0.5052, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2557132244110107, + "rewards/margins": 1.6683268547058105, + "rewards/rejected": -2.9240400791168213, + "step": 2414 + }, + { + "epoch": 0.28, + "learning_rate": 2.1979398337820437e-07, + "logits/chosen": -2.0917606353759766, + "logits/rejected": -2.217893600463867, + "logps/chosen": -233.08987426757812, + "logps/rejected": -160.8056640625, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1018292903900146, + "rewards/margins": 0.9712811708450317, + "rewards/rejected": -2.073110342025757, + "step": 2415 + }, + { + "epoch": 0.28, + "learning_rate": 2.197588669085801e-07, + "logits/chosen": -2.596494197845459, + "logits/rejected": -2.569225788116455, + "logps/chosen": -135.07293701171875, + "logps/rejected": -182.13418579101562, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29864072799682617, + "rewards/margins": 2.088331699371338, + "rewards/rejected": -2.386972427368164, + "step": 2416 + }, + { + "epoch": 0.28, + "learning_rate": 2.1972375043895586e-07, + "logits/chosen": -2.9223780632019043, + "logits/rejected": -2.917766809463501, + "logps/chosen": -332.3289794921875, + "logps/rejected": -284.0693359375, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.61895751953125, + "rewards/margins": 3.4867966175079346, + "rewards/rejected": -4.1057538986206055, + "step": 2417 + }, + { + "epoch": 0.28, + "learning_rate": 2.1968863396933158e-07, + "logits/chosen": -2.79864764213562, + "logits/rejected": -2.7675697803497314, + "logps/chosen": -112.07160949707031, + "logps/rejected": -163.26168823242188, + "loss": 0.5702, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3436325788497925, + "rewards/margins": 2.519044876098633, + "rewards/rejected": -3.8626773357391357, + "step": 2418 + }, + { + "epoch": 0.28, + "learning_rate": 2.1965351749970736e-07, + "logits/chosen": -2.0322842597961426, + "logits/rejected": -2.0304574966430664, + "logps/chosen": -216.74896240234375, + "logps/rejected": -267.59088134765625, + "loss": 0.4033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8641211986541748, + "rewards/margins": 2.2202508449554443, + "rewards/rejected": -3.084372043609619, + "step": 2419 + }, + { + "epoch": 0.28, + "learning_rate": 2.1961840103008312e-07, + "logits/chosen": -2.416510820388794, + "logits/rejected": -2.744535207748413, + "logps/chosen": -203.13851928710938, + "logps/rejected": -227.20318603515625, + "loss": 0.6051, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.683647096157074, + "rewards/margins": 1.2355327606201172, + "rewards/rejected": -1.919179916381836, + "step": 2420 + }, + { + "epoch": 0.28, + "learning_rate": 2.1958328456045884e-07, + "logits/chosen": -2.388878345489502, + "logits/rejected": -2.472952365875244, + "logps/chosen": -234.2938690185547, + "logps/rejected": -270.3072509765625, + "loss": 0.2998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6629488468170166, + "rewards/margins": 3.0960304737091064, + "rewards/rejected": -3.758979082107544, + "step": 2421 + }, + { + "epoch": 0.28, + "learning_rate": 2.195481680908346e-07, + "logits/chosen": -2.253048896789551, + "logits/rejected": -1.8686836957931519, + "logps/chosen": -131.77569580078125, + "logps/rejected": -325.99212646484375, + "loss": 0.3492, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3972211480140686, + "rewards/margins": 2.3480305671691895, + "rewards/rejected": -2.7452516555786133, + "step": 2422 + }, + { + "epoch": 0.28, + "learning_rate": 2.1951305162121033e-07, + "logits/chosen": -2.3312301635742188, + "logits/rejected": -2.228127956390381, + "logps/chosen": -189.81216430664062, + "logps/rejected": -273.5445556640625, + "loss": 0.6573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7640566825866699, + "rewards/margins": 1.445904016494751, + "rewards/rejected": -2.209960699081421, + "step": 2423 + }, + { + "epoch": 0.28, + "learning_rate": 2.1947793515158608e-07, + "logits/chosen": -2.819701671600342, + "logits/rejected": -2.9214327335357666, + "logps/chosen": -182.7758331298828, + "logps/rejected": -195.84042358398438, + "loss": 0.273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24844267964363098, + "rewards/margins": 1.8445758819580078, + "rewards/rejected": -2.0930185317993164, + "step": 2424 + }, + { + "epoch": 0.28, + "learning_rate": 2.1944281868196183e-07, + "logits/chosen": -2.261744260787964, + "logits/rejected": -2.7065818309783936, + "logps/chosen": -474.8184814453125, + "logps/rejected": -443.3789367675781, + "loss": 0.3635, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0342570543289185, + "rewards/margins": 3.486926794052124, + "rewards/rejected": -4.521183967590332, + "step": 2425 + }, + { + "epoch": 0.28, + "learning_rate": 2.1940770221233756e-07, + "logits/chosen": -2.8755526542663574, + "logits/rejected": -3.0474071502685547, + "logps/chosen": -286.1255187988281, + "logps/rejected": -246.12203979492188, + "loss": 0.2778, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7944788932800293, + "rewards/margins": 1.830936074256897, + "rewards/rejected": -2.625415086746216, + "step": 2426 + }, + { + "epoch": 0.28, + "learning_rate": 2.1937258574271331e-07, + "logits/chosen": -2.78344988822937, + "logits/rejected": -2.5247726440429688, + "logps/chosen": -291.579833984375, + "logps/rejected": -257.131103515625, + "loss": 0.3695, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2823655605316162, + "rewards/margins": 1.8880443572998047, + "rewards/rejected": -3.170409679412842, + "step": 2427 + }, + { + "epoch": 0.28, + "learning_rate": 2.193374692730891e-07, + "logits/chosen": -2.335329532623291, + "logits/rejected": -2.5869433879852295, + "logps/chosen": -262.25714111328125, + "logps/rejected": -221.13279724121094, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28689390420913696, + "rewards/margins": 3.053713798522949, + "rewards/rejected": -3.3406078815460205, + "step": 2428 + }, + { + "epoch": 0.28, + "learning_rate": 2.193023528034648e-07, + "logits/chosen": -2.139686346054077, + "logits/rejected": -2.252156972885132, + "logps/chosen": -208.79652404785156, + "logps/rejected": -253.7653045654297, + "loss": 0.3305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9268945455551147, + "rewards/margins": 1.8459022045135498, + "rewards/rejected": -2.772796869277954, + "step": 2429 + }, + { + "epoch": 0.28, + "learning_rate": 2.1926723633384058e-07, + "logits/chosen": -1.9936323165893555, + "logits/rejected": -2.082871198654175, + "logps/chosen": -210.2786407470703, + "logps/rejected": -234.03958129882812, + "loss": 0.7456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9410578012466431, + "rewards/margins": 0.944747269153595, + "rewards/rejected": -1.8858050107955933, + "step": 2430 + }, + { + "epoch": 0.28, + "learning_rate": 2.192321198642163e-07, + "logits/chosen": -2.458162546157837, + "logits/rejected": -2.6097891330718994, + "logps/chosen": -281.7242736816406, + "logps/rejected": -313.25738525390625, + "loss": 0.5373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6458357572555542, + "rewards/margins": 2.4309496879577637, + "rewards/rejected": -3.076785087585449, + "step": 2431 + }, + { + "epoch": 0.28, + "learning_rate": 2.1919700339459206e-07, + "logits/chosen": -2.3160173892974854, + "logits/rejected": -2.1864781379699707, + "logps/chosen": -297.7412414550781, + "logps/rejected": -382.972412109375, + "loss": 0.5761, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.722222089767456, + "rewards/margins": 2.2620432376861572, + "rewards/rejected": -2.9842653274536133, + "step": 2432 + }, + { + "epoch": 0.28, + "learning_rate": 2.191618869249678e-07, + "logits/chosen": -2.1915712356567383, + "logits/rejected": -2.437457323074341, + "logps/chosen": -259.9017333984375, + "logps/rejected": -197.81629943847656, + "loss": 0.2566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21500888466835022, + "rewards/margins": 2.290107250213623, + "rewards/rejected": -2.5051162242889404, + "step": 2433 + }, + { + "epoch": 0.28, + "learning_rate": 2.1912677045534354e-07, + "logits/chosen": -1.9252753257751465, + "logits/rejected": -1.9840455055236816, + "logps/chosen": -273.67474365234375, + "logps/rejected": -291.2041320800781, + "loss": 0.2499, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40986132621765137, + "rewards/margins": 1.7023776769638062, + "rewards/rejected": -2.112239122390747, + "step": 2434 + }, + { + "epoch": 0.28, + "learning_rate": 2.190916539857193e-07, + "logits/chosen": -2.6135244369506836, + "logits/rejected": -2.609269380569458, + "logps/chosen": -322.63043212890625, + "logps/rejected": -249.57261657714844, + "loss": 0.9283, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4705978631973267, + "rewards/margins": 2.307497978210449, + "rewards/rejected": -3.7780959606170654, + "step": 2435 + }, + { + "epoch": 0.28, + "learning_rate": 2.1905653751609505e-07, + "logits/chosen": -1.8433232307434082, + "logits/rejected": -2.1052639484405518, + "logps/chosen": -374.66009521484375, + "logps/rejected": -337.7814025878906, + "loss": 0.3262, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.007842868566513062, + "rewards/margins": 2.4846675395965576, + "rewards/rejected": -2.4925105571746826, + "step": 2436 + }, + { + "epoch": 0.28, + "learning_rate": 2.1902142104647077e-07, + "logits/chosen": -2.2914087772369385, + "logits/rejected": -2.242795944213867, + "logps/chosen": -436.4996643066406, + "logps/rejected": -470.3254699707031, + "loss": 0.5543, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2544445991516113, + "rewards/margins": 1.9846495389938354, + "rewards/rejected": -3.2390942573547363, + "step": 2437 + }, + { + "epoch": 0.28, + "learning_rate": 2.1898630457684653e-07, + "logits/chosen": -2.7188189029693604, + "logits/rejected": -2.7371444702148438, + "logps/chosen": -333.9867858886719, + "logps/rejected": -170.19891357421875, + "loss": 0.3629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0541421175003052, + "rewards/margins": 2.5370748043060303, + "rewards/rejected": -3.591216564178467, + "step": 2438 + }, + { + "epoch": 0.28, + "learning_rate": 2.1895118810722225e-07, + "logits/chosen": -2.6582319736480713, + "logits/rejected": -2.659515857696533, + "logps/chosen": -271.9305419921875, + "logps/rejected": -260.5155029296875, + "loss": 0.5211, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1317111849784851, + "rewards/margins": 1.7377021312713623, + "rewards/rejected": -1.8694133758544922, + "step": 2439 + }, + { + "epoch": 0.28, + "learning_rate": 2.18916071637598e-07, + "logits/chosen": -2.1363348960876465, + "logits/rejected": -2.149104595184326, + "logps/chosen": -521.8580322265625, + "logps/rejected": -371.12896728515625, + "loss": 0.7371, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3837132453918457, + "rewards/margins": 0.8112233877182007, + "rewards/rejected": -2.194936752319336, + "step": 2440 + }, + { + "epoch": 0.28, + "learning_rate": 2.188809551679738e-07, + "logits/chosen": -2.544259786605835, + "logits/rejected": -2.3334155082702637, + "logps/chosen": -406.29705810546875, + "logps/rejected": -457.612548828125, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5515165328979492, + "rewards/margins": 1.5565167665481567, + "rewards/rejected": -3.1080331802368164, + "step": 2441 + }, + { + "epoch": 0.28, + "learning_rate": 2.1884583869834952e-07, + "logits/chosen": -2.3073742389678955, + "logits/rejected": -1.9386804103851318, + "logps/chosen": -101.67427825927734, + "logps/rejected": -326.46923828125, + "loss": 0.3026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.725997805595398, + "rewards/margins": 2.1295580863952637, + "rewards/rejected": -2.855555772781372, + "step": 2442 + }, + { + "epoch": 0.28, + "learning_rate": 2.1881072222872527e-07, + "logits/chosen": -2.701620101928711, + "logits/rejected": -2.734715223312378, + "logps/chosen": -271.55242919921875, + "logps/rejected": -319.9792785644531, + "loss": 0.3756, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.338201880455017, + "rewards/margins": 1.9652851819992065, + "rewards/rejected": -3.3034873008728027, + "step": 2443 + }, + { + "epoch": 0.28, + "learning_rate": 2.1877560575910102e-07, + "logits/chosen": -2.145052433013916, + "logits/rejected": -2.0508599281311035, + "logps/chosen": -182.49302673339844, + "logps/rejected": -279.00177001953125, + "loss": 0.795, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1125556230545044, + "rewards/margins": 0.8905218839645386, + "rewards/rejected": -2.003077507019043, + "step": 2444 + }, + { + "epoch": 0.28, + "learning_rate": 2.1874048928947675e-07, + "logits/chosen": -2.3046319484710693, + "logits/rejected": -2.6178691387176514, + "logps/chosen": -455.47528076171875, + "logps/rejected": -318.9527282714844, + "loss": 0.2466, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29416751861572266, + "rewards/margins": 2.660541534423828, + "rewards/rejected": -2.954709053039551, + "step": 2445 + }, + { + "epoch": 0.28, + "learning_rate": 2.187053728198525e-07, + "logits/chosen": -2.073875904083252, + "logits/rejected": -2.1162052154541016, + "logps/chosen": -370.3212890625, + "logps/rejected": -387.3321838378906, + "loss": 0.6316, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7475825548171997, + "rewards/margins": 0.8660964965820312, + "rewards/rejected": -1.6136791706085205, + "step": 2446 + }, + { + "epoch": 0.28, + "learning_rate": 2.1867025635022823e-07, + "logits/chosen": -1.6834455728530884, + "logits/rejected": -1.8639960289001465, + "logps/chosen": -416.44952392578125, + "logps/rejected": -341.13287353515625, + "loss": 0.3097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5562046766281128, + "rewards/margins": 1.6488511562347412, + "rewards/rejected": -2.2050559520721436, + "step": 2447 + }, + { + "epoch": 0.28, + "learning_rate": 2.1863513988060399e-07, + "logits/chosen": -2.5492734909057617, + "logits/rejected": -2.3526597023010254, + "logps/chosen": -256.4029235839844, + "logps/rejected": -303.84259033203125, + "loss": 0.3263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1527530997991562, + "rewards/margins": 1.7873058319091797, + "rewards/rejected": -1.9400588274002075, + "step": 2448 + }, + { + "epoch": 0.28, + "learning_rate": 2.1860002341097974e-07, + "logits/chosen": -2.80826473236084, + "logits/rejected": -2.639101028442383, + "logps/chosen": -263.2886962890625, + "logps/rejected": -394.19573974609375, + "loss": 0.6294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5196126699447632, + "rewards/margins": 1.8430471420288086, + "rewards/rejected": -2.3626596927642822, + "step": 2449 + }, + { + "epoch": 0.28, + "learning_rate": 2.1856490694135547e-07, + "logits/chosen": -2.910186529159546, + "logits/rejected": -2.946876287460327, + "logps/chosen": -192.00843811035156, + "logps/rejected": -244.6717529296875, + "loss": 0.2614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35963767766952515, + "rewards/margins": 2.8837549686431885, + "rewards/rejected": -3.2433929443359375, + "step": 2450 + }, + { + "epoch": 0.28, + "learning_rate": 2.1852979047173122e-07, + "logits/chosen": -1.871061086654663, + "logits/rejected": -2.0845985412597656, + "logps/chosen": -343.3778381347656, + "logps/rejected": -335.8085021972656, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8323565721511841, + "rewards/margins": 3.5186619758605957, + "rewards/rejected": -4.351018905639648, + "step": 2451 + }, + { + "epoch": 0.28, + "learning_rate": 2.18494674002107e-07, + "logits/chosen": -2.309882640838623, + "logits/rejected": -2.4285776615142822, + "logps/chosen": -225.70838928222656, + "logps/rejected": -245.5386962890625, + "loss": 0.3438, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4114147424697876, + "rewards/margins": 1.8642867803573608, + "rewards/rejected": -2.2757015228271484, + "step": 2452 + }, + { + "epoch": 0.28, + "learning_rate": 2.1845955753248273e-07, + "logits/chosen": -2.0365006923675537, + "logits/rejected": -2.1992223262786865, + "logps/chosen": -355.3990783691406, + "logps/rejected": -288.09375, + "loss": 0.3655, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4440334737300873, + "rewards/margins": 1.93074369430542, + "rewards/rejected": -2.37477707862854, + "step": 2453 + }, + { + "epoch": 0.28, + "learning_rate": 2.1842444106285848e-07, + "logits/chosen": -2.441605567932129, + "logits/rejected": -2.526972770690918, + "logps/chosen": -338.7271728515625, + "logps/rejected": -315.3823547363281, + "loss": 0.9788, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.9229191541671753, + "rewards/margins": 0.1503865122795105, + "rewards/rejected": -2.073305606842041, + "step": 2454 + }, + { + "epoch": 0.28, + "learning_rate": 2.183893245932342e-07, + "logits/chosen": -2.1004157066345215, + "logits/rejected": -1.921632170677185, + "logps/chosen": -292.177734375, + "logps/rejected": -287.2791748046875, + "loss": 0.324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30867642164230347, + "rewards/margins": 2.2386605739593506, + "rewards/rejected": -2.547337055206299, + "step": 2455 + }, + { + "epoch": 0.28, + "learning_rate": 2.1835420812360996e-07, + "logits/chosen": -1.9977962970733643, + "logits/rejected": -1.9703657627105713, + "logps/chosen": -433.9224853515625, + "logps/rejected": -351.951171875, + "loss": 0.3925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.603542149066925, + "rewards/margins": 1.4247784614562988, + "rewards/rejected": -2.028320550918579, + "step": 2456 + }, + { + "epoch": 0.28, + "learning_rate": 2.1831909165398572e-07, + "logits/chosen": -2.3126280307769775, + "logits/rejected": -2.189213752746582, + "logps/chosen": -309.74603271484375, + "logps/rejected": -532.9531860351562, + "loss": 0.4335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.490110844373703, + "rewards/margins": 2.7322986125946045, + "rewards/rejected": -3.22240948677063, + "step": 2457 + }, + { + "epoch": 0.28, + "learning_rate": 2.1828397518436145e-07, + "logits/chosen": -2.4101052284240723, + "logits/rejected": -2.382307291030884, + "logps/chosen": -286.77197265625, + "logps/rejected": -342.23553466796875, + "loss": 0.3186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3374767005443573, + "rewards/margins": 2.2863833904266357, + "rewards/rejected": -2.6238598823547363, + "step": 2458 + }, + { + "epoch": 0.28, + "learning_rate": 2.182488587147372e-07, + "logits/chosen": -2.77170991897583, + "logits/rejected": -2.906287670135498, + "logps/chosen": -290.781982421875, + "logps/rejected": -228.37890625, + "loss": 0.2657, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.321796178817749, + "rewards/margins": 2.2169530391693115, + "rewards/rejected": -3.5387492179870605, + "step": 2459 + }, + { + "epoch": 0.28, + "learning_rate": 2.1821374224511295e-07, + "logits/chosen": -2.108874559402466, + "logits/rejected": -2.196535110473633, + "logps/chosen": -375.1902160644531, + "logps/rejected": -266.01446533203125, + "loss": 0.3485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6460241079330444, + "rewards/margins": 3.054100513458252, + "rewards/rejected": -3.700124502182007, + "step": 2460 + }, + { + "epoch": 0.28, + "learning_rate": 2.1817862577548868e-07, + "logits/chosen": -2.220829963684082, + "logits/rejected": -1.9351346492767334, + "logps/chosen": -254.77487182617188, + "logps/rejected": -352.6654357910156, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47597724199295044, + "rewards/margins": 2.628523111343384, + "rewards/rejected": -3.1045002937316895, + "step": 2461 + }, + { + "epoch": 0.28, + "learning_rate": 2.1814350930586446e-07, + "logits/chosen": -2.548750400543213, + "logits/rejected": -2.772766590118408, + "logps/chosen": -129.17416381835938, + "logps/rejected": -164.41690063476562, + "loss": 0.5452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6442351341247559, + "rewards/margins": 1.938225507736206, + "rewards/rejected": -2.582460641860962, + "step": 2462 + }, + { + "epoch": 0.28, + "learning_rate": 2.1810839283624016e-07, + "logits/chosen": -1.951967477798462, + "logits/rejected": -1.9708914756774902, + "logps/chosen": -378.47943115234375, + "logps/rejected": -268.89459228515625, + "loss": 0.604, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5575700998306274, + "rewards/margins": 1.9665517807006836, + "rewards/rejected": -3.5241219997406006, + "step": 2463 + }, + { + "epoch": 0.28, + "learning_rate": 2.1807327636661594e-07, + "logits/chosen": -2.3427653312683105, + "logits/rejected": -2.2895588874816895, + "logps/chosen": -104.25944519042969, + "logps/rejected": -122.43787384033203, + "loss": 0.1462, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.150942862033844, + "rewards/margins": 2.4883530139923096, + "rewards/rejected": -2.3374102115631104, + "step": 2464 + }, + { + "epoch": 0.28, + "learning_rate": 2.180381598969917e-07, + "logits/chosen": -2.625643730163574, + "logits/rejected": -2.69903826713562, + "logps/chosen": -284.5074462890625, + "logps/rejected": -271.4203796386719, + "loss": 0.1811, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19656433165073395, + "rewards/margins": 4.133691787719727, + "rewards/rejected": -3.937127113342285, + "step": 2465 + }, + { + "epoch": 0.28, + "learning_rate": 2.1800304342736742e-07, + "logits/chosen": -2.1715316772460938, + "logits/rejected": -2.1443653106689453, + "logps/chosen": -202.24307250976562, + "logps/rejected": -240.83047485351562, + "loss": 0.4853, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08304794132709503, + "rewards/margins": 1.8087880611419678, + "rewards/rejected": -1.8918360471725464, + "step": 2466 + }, + { + "epoch": 0.28, + "learning_rate": 2.1796792695774318e-07, + "logits/chosen": -1.9826703071594238, + "logits/rejected": -1.9146407842636108, + "logps/chosen": -337.1289978027344, + "logps/rejected": -319.1853332519531, + "loss": 0.3386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7358119487762451, + "rewards/margins": 2.091644287109375, + "rewards/rejected": -2.827456474304199, + "step": 2467 + }, + { + "epoch": 0.28, + "learning_rate": 2.1793281048811893e-07, + "logits/chosen": -2.3507347106933594, + "logits/rejected": -2.0925941467285156, + "logps/chosen": -220.15530395507812, + "logps/rejected": -396.07806396484375, + "loss": 0.7453, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6252729892730713, + "rewards/margins": 1.6866545677185059, + "rewards/rejected": -2.3119277954101562, + "step": 2468 + }, + { + "epoch": 0.28, + "learning_rate": 2.1789769401849466e-07, + "logits/chosen": -2.174959182739258, + "logits/rejected": -1.769439935684204, + "logps/chosen": -162.14407348632812, + "logps/rejected": -285.76007080078125, + "loss": 0.6444, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.920740008354187, + "rewards/margins": 3.4025726318359375, + "rewards/rejected": -5.323312759399414, + "step": 2469 + }, + { + "epoch": 0.28, + "learning_rate": 2.178625775488704e-07, + "logits/chosen": -3.0497822761535645, + "logits/rejected": -2.99080228805542, + "logps/chosen": -277.1462707519531, + "logps/rejected": -370.75823974609375, + "loss": 0.3185, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.255558729171753, + "rewards/margins": 2.567613363265991, + "rewards/rejected": -3.8231723308563232, + "step": 2470 + }, + { + "epoch": 0.28, + "learning_rate": 2.1782746107924614e-07, + "logits/chosen": -2.4754772186279297, + "logits/rejected": -2.2093708515167236, + "logps/chosen": -303.27447509765625, + "logps/rejected": -331.6673583984375, + "loss": 0.4769, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3520278930664062, + "rewards/margins": 2.4878695011138916, + "rewards/rejected": -3.839897394180298, + "step": 2471 + }, + { + "epoch": 0.28, + "learning_rate": 2.177923446096219e-07, + "logits/chosen": -2.9556033611297607, + "logits/rejected": -2.9414756298065186, + "logps/chosen": -159.44468688964844, + "logps/rejected": -172.62277221679688, + "loss": 0.5592, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3006386756896973, + "rewards/margins": 1.1853097677230835, + "rewards/rejected": -3.4859485626220703, + "step": 2472 + }, + { + "epoch": 0.29, + "learning_rate": 2.1775722813999767e-07, + "logits/chosen": -2.5305275917053223, + "logits/rejected": -2.457031011581421, + "logps/chosen": -234.55902099609375, + "logps/rejected": -223.69874572753906, + "loss": 0.6834, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0443546772003174, + "rewards/margins": 1.0818841457366943, + "rewards/rejected": -3.1262388229370117, + "step": 2473 + }, + { + "epoch": 0.29, + "learning_rate": 2.1772211167037337e-07, + "logits/chosen": -2.4753174781799316, + "logits/rejected": -2.5601484775543213, + "logps/chosen": -174.16722106933594, + "logps/rejected": -218.34542846679688, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6479533910751343, + "rewards/margins": 0.8859540224075317, + "rewards/rejected": -1.533907413482666, + "step": 2474 + }, + { + "epoch": 0.29, + "learning_rate": 2.1768699520074916e-07, + "logits/chosen": -1.917624831199646, + "logits/rejected": -2.2149758338928223, + "logps/chosen": -443.4742736816406, + "logps/rejected": -377.65667724609375, + "loss": 0.2432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24862027168273926, + "rewards/margins": 2.1499271392822266, + "rewards/rejected": -2.3985471725463867, + "step": 2475 + }, + { + "epoch": 0.29, + "learning_rate": 2.1765187873112488e-07, + "logits/chosen": -2.435507297515869, + "logits/rejected": -2.412458896636963, + "logps/chosen": -239.40869140625, + "logps/rejected": -223.6599884033203, + "loss": 0.2973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7124112248420715, + "rewards/margins": 2.1439900398254395, + "rewards/rejected": -2.856401205062866, + "step": 2476 + }, + { + "epoch": 0.29, + "learning_rate": 2.1761676226150064e-07, + "logits/chosen": -2.811694622039795, + "logits/rejected": -2.6999518871307373, + "logps/chosen": -291.4810791015625, + "logps/rejected": -263.38934326171875, + "loss": 0.2268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3172162175178528, + "rewards/margins": 3.106454849243164, + "rewards/rejected": -3.423671007156372, + "step": 2477 + }, + { + "epoch": 0.29, + "learning_rate": 2.175816457918764e-07, + "logits/chosen": -2.7458229064941406, + "logits/rejected": -2.5749449729919434, + "logps/chosen": -232.617919921875, + "logps/rejected": -395.4639892578125, + "loss": 0.5124, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0600087642669678, + "rewards/margins": 2.11266827583313, + "rewards/rejected": -3.1726770401000977, + "step": 2478 + }, + { + "epoch": 0.29, + "learning_rate": 2.1754652932225212e-07, + "logits/chosen": -2.669684886932373, + "logits/rejected": -2.802110195159912, + "logps/chosen": -261.8310546875, + "logps/rejected": -277.34130859375, + "loss": 0.1643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5377234220504761, + "rewards/margins": 2.5686492919921875, + "rewards/rejected": -3.106372833251953, + "step": 2479 + }, + { + "epoch": 0.29, + "learning_rate": 2.1751141285262787e-07, + "logits/chosen": -2.1284191608428955, + "logits/rejected": -2.3569302558898926, + "logps/chosen": -363.6933288574219, + "logps/rejected": -330.7926025390625, + "loss": 0.6742, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9072563648223877, + "rewards/margins": 1.2650668621063232, + "rewards/rejected": -2.172322988510132, + "step": 2480 + }, + { + "epoch": 0.29, + "learning_rate": 2.1747629638300363e-07, + "logits/chosen": -2.751878499984741, + "logits/rejected": -2.577134609222412, + "logps/chosen": -214.94998168945312, + "logps/rejected": -200.85498046875, + "loss": 0.2102, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1448129117488861, + "rewards/margins": 3.737525463104248, + "rewards/rejected": -3.59271240234375, + "step": 2481 + }, + { + "epoch": 0.29, + "learning_rate": 2.1744117991337935e-07, + "logits/chosen": -2.5389645099639893, + "logits/rejected": -2.6797401905059814, + "logps/chosen": -219.65228271484375, + "logps/rejected": -150.40414428710938, + "loss": 1.2345, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5204906463623047, + "rewards/margins": 0.08823424577713013, + "rewards/rejected": -2.60872483253479, + "step": 2482 + }, + { + "epoch": 0.29, + "learning_rate": 2.174060634437551e-07, + "logits/chosen": -2.5378336906433105, + "logits/rejected": -2.5552456378936768, + "logps/chosen": -178.38906860351562, + "logps/rejected": -213.99484252929688, + "loss": 0.4633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43561533093452454, + "rewards/margins": 0.7418243885040283, + "rewards/rejected": -1.1774396896362305, + "step": 2483 + }, + { + "epoch": 0.29, + "learning_rate": 2.1737094697413083e-07, + "logits/chosen": -2.402651071548462, + "logits/rejected": -2.6140613555908203, + "logps/chosen": -260.8829345703125, + "logps/rejected": -237.16262817382812, + "loss": 0.4737, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8896075487136841, + "rewards/margins": 1.5813407897949219, + "rewards/rejected": -2.4709484577178955, + "step": 2484 + }, + { + "epoch": 0.29, + "learning_rate": 2.173358305045066e-07, + "logits/chosen": -2.6688385009765625, + "logits/rejected": -2.783121109008789, + "logps/chosen": -375.25341796875, + "logps/rejected": -202.53280639648438, + "loss": 0.2169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.043233923614025116, + "rewards/margins": 2.044422149658203, + "rewards/rejected": -2.087656021118164, + "step": 2485 + }, + { + "epoch": 0.29, + "learning_rate": 2.1730071403488237e-07, + "logits/chosen": -2.263840913772583, + "logits/rejected": -2.3802826404571533, + "logps/chosen": -419.69903564453125, + "logps/rejected": -278.6414794921875, + "loss": 0.2107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44679608941078186, + "rewards/margins": 3.3118674755096436, + "rewards/rejected": -3.7586636543273926, + "step": 2486 + }, + { + "epoch": 0.29, + "learning_rate": 2.172655975652581e-07, + "logits/chosen": -2.3917436599731445, + "logits/rejected": -2.548600196838379, + "logps/chosen": -279.53924560546875, + "logps/rejected": -263.5947570800781, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30328693985939026, + "rewards/margins": 2.1714484691619873, + "rewards/rejected": -2.4747354984283447, + "step": 2487 + }, + { + "epoch": 0.29, + "learning_rate": 2.1723048109563385e-07, + "logits/chosen": -2.561877727508545, + "logits/rejected": -2.500582218170166, + "logps/chosen": -397.6167297363281, + "logps/rejected": -356.45892333984375, + "loss": 0.3786, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8215648531913757, + "rewards/margins": 1.483477234840393, + "rewards/rejected": -2.305042028427124, + "step": 2488 + }, + { + "epoch": 0.29, + "learning_rate": 2.171953646260096e-07, + "logits/chosen": -1.850279450416565, + "logits/rejected": -1.9309972524642944, + "logps/chosen": -327.1192626953125, + "logps/rejected": -373.1898193359375, + "loss": 0.3052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45602142810821533, + "rewards/margins": 2.5521066188812256, + "rewards/rejected": -3.0081281661987305, + "step": 2489 + }, + { + "epoch": 0.29, + "learning_rate": 2.1716024815638533e-07, + "logits/chosen": -2.056248664855957, + "logits/rejected": -2.277829885482788, + "logps/chosen": -277.70703125, + "logps/rejected": -317.2289123535156, + "loss": 1.1334, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8710474967956543, + "rewards/margins": 0.02635599672794342, + "rewards/rejected": -1.897403359413147, + "step": 2490 + }, + { + "epoch": 0.29, + "learning_rate": 2.1712513168676108e-07, + "logits/chosen": -2.759068250656128, + "logits/rejected": -2.582048177719116, + "logps/chosen": -135.1173858642578, + "logps/rejected": -234.766357421875, + "loss": 0.2442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31361085176467896, + "rewards/margins": 2.1228721141815186, + "rewards/rejected": -2.4364829063415527, + "step": 2491 + }, + { + "epoch": 0.29, + "learning_rate": 2.170900152171368e-07, + "logits/chosen": -1.8766114711761475, + "logits/rejected": -2.162444829940796, + "logps/chosen": -242.281982421875, + "logps/rejected": -245.81317138671875, + "loss": 0.2811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8675585985183716, + "rewards/margins": 2.257131814956665, + "rewards/rejected": -3.124690532684326, + "step": 2492 + }, + { + "epoch": 0.29, + "learning_rate": 2.1705489874751257e-07, + "logits/chosen": -2.432751178741455, + "logits/rejected": -2.368889808654785, + "logps/chosen": -100.38070678710938, + "logps/rejected": -128.7706298828125, + "loss": 0.1883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2082517445087433, + "rewards/margins": 2.113367795944214, + "rewards/rejected": -2.3216195106506348, + "step": 2493 + }, + { + "epoch": 0.29, + "learning_rate": 2.1701978227788832e-07, + "logits/chosen": -2.143502712249756, + "logits/rejected": -1.9460036754608154, + "logps/chosen": -239.843017578125, + "logps/rejected": -294.60247802734375, + "loss": 1.2144, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.957772970199585, + "rewards/margins": 0.2772822380065918, + "rewards/rejected": -2.2350552082061768, + "step": 2494 + }, + { + "epoch": 0.29, + "learning_rate": 2.1698466580826405e-07, + "logits/chosen": -2.5868897438049316, + "logits/rejected": -2.5826215744018555, + "logps/chosen": -397.13946533203125, + "logps/rejected": -262.56732177734375, + "loss": 0.3427, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7931755781173706, + "rewards/margins": 1.9120322465896606, + "rewards/rejected": -2.7052078247070312, + "step": 2495 + }, + { + "epoch": 0.29, + "learning_rate": 2.1694954933863983e-07, + "logits/chosen": -2.4368553161621094, + "logits/rejected": -2.409830331802368, + "logps/chosen": -384.52587890625, + "logps/rejected": -395.92376708984375, + "loss": 0.3201, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01948792114853859, + "rewards/margins": 2.015312433242798, + "rewards/rejected": -2.0348002910614014, + "step": 2496 + }, + { + "epoch": 0.29, + "learning_rate": 2.1691443286901558e-07, + "logits/chosen": -1.5018794536590576, + "logits/rejected": -1.878424882888794, + "logps/chosen": -495.81842041015625, + "logps/rejected": -231.50152587890625, + "loss": 0.5554, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3720028400421143, + "rewards/margins": 0.790477991104126, + "rewards/rejected": -2.1624808311462402, + "step": 2497 + }, + { + "epoch": 0.29, + "learning_rate": 2.168793163993913e-07, + "logits/chosen": -1.6203193664550781, + "logits/rejected": -1.942190408706665, + "logps/chosen": -404.5362854003906, + "logps/rejected": -263.76318359375, + "loss": 0.3077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19167043268680573, + "rewards/margins": 1.2711715698242188, + "rewards/rejected": -1.4628421068191528, + "step": 2498 + }, + { + "epoch": 0.29, + "learning_rate": 2.1684419992976706e-07, + "logits/chosen": -2.6677346229553223, + "logits/rejected": -2.8319919109344482, + "logps/chosen": -190.47784423828125, + "logps/rejected": -185.7202606201172, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1268101930618286, + "rewards/margins": 1.9127918481826782, + "rewards/rejected": -3.0396018028259277, + "step": 2499 + }, + { + "epoch": 0.29, + "learning_rate": 2.168090834601428e-07, + "logits/chosen": -2.209604024887085, + "logits/rejected": -2.6302175521850586, + "logps/chosen": -233.34629821777344, + "logps/rejected": -148.5356903076172, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7165534496307373, + "rewards/margins": 1.6547045707702637, + "rewards/rejected": -2.371258020401001, + "step": 2500 + }, + { + "epoch": 0.29, + "learning_rate": 2.1677396699051854e-07, + "logits/chosen": -2.09555721282959, + "logits/rejected": -2.0162136554718018, + "logps/chosen": -458.00506591796875, + "logps/rejected": -466.8785400390625, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31705227494239807, + "rewards/margins": 4.102997303009033, + "rewards/rejected": -4.420049667358398, + "step": 2501 + }, + { + "epoch": 0.29, + "learning_rate": 2.167388505208943e-07, + "logits/chosen": -2.4131948947906494, + "logits/rejected": -2.494016647338867, + "logps/chosen": -145.25401306152344, + "logps/rejected": -211.8494110107422, + "loss": 0.7219, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.793150782585144, + "rewards/margins": 0.9947319030761719, + "rewards/rejected": -1.7878828048706055, + "step": 2502 + }, + { + "epoch": 0.29, + "learning_rate": 2.1670373405127002e-07, + "logits/chosen": -2.5805275440216064, + "logits/rejected": -2.5910987854003906, + "logps/chosen": -375.61285400390625, + "logps/rejected": -169.49935913085938, + "loss": 0.4291, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1756057739257812, + "rewards/margins": 1.1319878101348877, + "rewards/rejected": -2.307593584060669, + "step": 2503 + }, + { + "epoch": 0.29, + "learning_rate": 2.1666861758164578e-07, + "logits/chosen": -2.593813896179199, + "logits/rejected": -2.5796639919281006, + "logps/chosen": -231.72543334960938, + "logps/rejected": -239.4080352783203, + "loss": 0.384, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2376893758773804, + "rewards/margins": 1.881191372871399, + "rewards/rejected": -3.1188807487487793, + "step": 2504 + }, + { + "epoch": 0.29, + "learning_rate": 2.1663350111202153e-07, + "logits/chosen": -2.807332992553711, + "logits/rejected": -2.8836371898651123, + "logps/chosen": -192.61215209960938, + "logps/rejected": -148.04144287109375, + "loss": 0.4247, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5157902240753174, + "rewards/margins": 1.446847677230835, + "rewards/rejected": -1.9626379013061523, + "step": 2505 + }, + { + "epoch": 0.29, + "learning_rate": 2.1659838464239726e-07, + "logits/chosen": -2.0801820755004883, + "logits/rejected": -2.078746795654297, + "logps/chosen": -381.7381286621094, + "logps/rejected": -509.00628662109375, + "loss": 0.3945, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7054781317710876, + "rewards/margins": 2.730571985244751, + "rewards/rejected": -3.4360501766204834, + "step": 2506 + }, + { + "epoch": 0.29, + "learning_rate": 2.1656326817277304e-07, + "logits/chosen": -2.272266387939453, + "logits/rejected": -2.2271904945373535, + "logps/chosen": -287.78302001953125, + "logps/rejected": -207.20150756835938, + "loss": 0.5201, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.482036828994751, + "rewards/margins": 1.4008641242980957, + "rewards/rejected": -1.8829009532928467, + "step": 2507 + }, + { + "epoch": 0.29, + "learning_rate": 2.1652815170314874e-07, + "logits/chosen": -2.6665329933166504, + "logits/rejected": -2.649542808532715, + "logps/chosen": -120.35103607177734, + "logps/rejected": -144.86740112304688, + "loss": 0.8492, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0906883478164673, + "rewards/margins": 0.26724767684936523, + "rewards/rejected": -1.3579360246658325, + "step": 2508 + }, + { + "epoch": 0.29, + "learning_rate": 2.1649303523352452e-07, + "logits/chosen": -2.401902675628662, + "logits/rejected": -2.362541675567627, + "logps/chosen": -257.30426025390625, + "logps/rejected": -187.57640075683594, + "loss": 0.4863, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6017701029777527, + "rewards/margins": 1.120435118675232, + "rewards/rejected": -1.7222051620483398, + "step": 2509 + }, + { + "epoch": 0.29, + "learning_rate": 2.1645791876390028e-07, + "logits/chosen": -2.6592843532562256, + "logits/rejected": -2.763338327407837, + "logps/chosen": -173.8151092529297, + "logps/rejected": -181.837158203125, + "loss": 0.7274, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3095707893371582, + "rewards/margins": 1.2134181261062622, + "rewards/rejected": -2.52298903465271, + "step": 2510 + }, + { + "epoch": 0.29, + "learning_rate": 2.16422802294276e-07, + "logits/chosen": -2.266633987426758, + "logits/rejected": -2.3071794509887695, + "logps/chosen": -141.38037109375, + "logps/rejected": -230.65882873535156, + "loss": 1.6615, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.8996896743774414, + "rewards/margins": 0.18647271394729614, + "rewards/rejected": -3.0861620903015137, + "step": 2511 + }, + { + "epoch": 0.29, + "learning_rate": 2.1638768582465176e-07, + "logits/chosen": -2.2041213512420654, + "logits/rejected": -2.1145517826080322, + "logps/chosen": -198.2747344970703, + "logps/rejected": -258.576904296875, + "loss": 0.5824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.924216628074646, + "rewards/margins": 1.2111117839813232, + "rewards/rejected": -2.1353282928466797, + "step": 2512 + }, + { + "epoch": 0.29, + "learning_rate": 2.163525693550275e-07, + "logits/chosen": -2.2236173152923584, + "logits/rejected": -2.187129020690918, + "logps/chosen": -372.6287841796875, + "logps/rejected": -323.5574035644531, + "loss": 0.3486, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14380623400211334, + "rewards/margins": 1.1171057224273682, + "rewards/rejected": -1.2609119415283203, + "step": 2513 + }, + { + "epoch": 0.29, + "learning_rate": 2.1631745288540324e-07, + "logits/chosen": -2.6132054328918457, + "logits/rejected": -2.907196044921875, + "logps/chosen": -333.1739501953125, + "logps/rejected": -323.17352294921875, + "loss": 0.211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3021363317966461, + "rewards/margins": 2.398979663848877, + "rewards/rejected": -2.701115846633911, + "step": 2514 + }, + { + "epoch": 0.29, + "learning_rate": 2.16282336415779e-07, + "logits/chosen": -2.0204129219055176, + "logits/rejected": -1.9572527408599854, + "logps/chosen": -323.29302978515625, + "logps/rejected": -320.7595520019531, + "loss": 0.4199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6280992031097412, + "rewards/margins": 1.021571397781372, + "rewards/rejected": -1.6496704816818237, + "step": 2515 + }, + { + "epoch": 0.29, + "learning_rate": 2.1624721994615472e-07, + "logits/chosen": -2.314279794692993, + "logits/rejected": -2.408789873123169, + "logps/chosen": -283.04522705078125, + "logps/rejected": -231.9971160888672, + "loss": 0.3994, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0411102771759033, + "rewards/margins": 1.1273887157440186, + "rewards/rejected": -3.168498992919922, + "step": 2516 + }, + { + "epoch": 0.29, + "learning_rate": 2.1621210347653047e-07, + "logits/chosen": -2.403594732284546, + "logits/rejected": -2.355090379714966, + "logps/chosen": -410.1916198730469, + "logps/rejected": -350.2491455078125, + "loss": 0.4056, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4909313917160034, + "rewards/margins": 1.8510679006576538, + "rewards/rejected": -3.341999053955078, + "step": 2517 + }, + { + "epoch": 0.29, + "learning_rate": 2.1617698700690625e-07, + "logits/chosen": -2.5920825004577637, + "logits/rejected": -2.775888442993164, + "logps/chosen": -180.34765625, + "logps/rejected": -246.8555145263672, + "loss": 0.2839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7689103484153748, + "rewards/margins": 3.467231273651123, + "rewards/rejected": -4.236141681671143, + "step": 2518 + }, + { + "epoch": 0.29, + "learning_rate": 2.1614187053728195e-07, + "logits/chosen": -2.0616137981414795, + "logits/rejected": -2.621957540512085, + "logps/chosen": -469.2490234375, + "logps/rejected": -278.4892578125, + "loss": 0.1544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13410687446594238, + "rewards/margins": 2.886653423309326, + "rewards/rejected": -3.0207602977752686, + "step": 2519 + }, + { + "epoch": 0.29, + "learning_rate": 2.1610675406765773e-07, + "logits/chosen": -2.3180224895477295, + "logits/rejected": -2.5208468437194824, + "logps/chosen": -364.3057556152344, + "logps/rejected": -335.2153625488281, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13761138916015625, + "rewards/margins": 3.3883371353149414, + "rewards/rejected": -3.5259482860565186, + "step": 2520 + }, + { + "epoch": 0.29, + "learning_rate": 2.1607163759803346e-07, + "logits/chosen": -2.2120072841644287, + "logits/rejected": -2.2962582111358643, + "logps/chosen": -178.77207946777344, + "logps/rejected": -182.6817626953125, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07194246351718903, + "rewards/margins": 2.556936502456665, + "rewards/rejected": -2.6288790702819824, + "step": 2521 + }, + { + "epoch": 0.29, + "learning_rate": 2.1603652112840922e-07, + "logits/chosen": -1.941472053527832, + "logits/rejected": -2.135868787765503, + "logps/chosen": -385.17724609375, + "logps/rejected": -251.96353149414062, + "loss": 0.9163, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0661958456039429, + "rewards/margins": 0.14143329858779907, + "rewards/rejected": -1.2076290845870972, + "step": 2522 + }, + { + "epoch": 0.29, + "learning_rate": 2.1600140465878497e-07, + "logits/chosen": -3.026573419570923, + "logits/rejected": -2.9602766036987305, + "logps/chosen": -207.67279052734375, + "logps/rejected": -225.2896270751953, + "loss": 0.3831, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002776235342025757, + "rewards/margins": 2.6906983852386475, + "rewards/rejected": -2.687922477722168, + "step": 2523 + }, + { + "epoch": 0.29, + "learning_rate": 2.159662881891607e-07, + "logits/chosen": -2.5447452068328857, + "logits/rejected": -2.5308167934417725, + "logps/chosen": -229.90420532226562, + "logps/rejected": -277.4078063964844, + "loss": 0.2439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.716253399848938, + "rewards/margins": 1.5303927659988403, + "rewards/rejected": -2.2466464042663574, + "step": 2524 + }, + { + "epoch": 0.29, + "learning_rate": 2.1593117171953645e-07, + "logits/chosen": -2.491633892059326, + "logits/rejected": -2.41025710105896, + "logps/chosen": -362.05303955078125, + "logps/rejected": -386.0615539550781, + "loss": 0.3566, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6463236808776855, + "rewards/margins": 2.0414326190948486, + "rewards/rejected": -2.687756299972534, + "step": 2525 + }, + { + "epoch": 0.29, + "learning_rate": 2.158960552499122e-07, + "logits/chosen": -1.8470110893249512, + "logits/rejected": -2.2411136627197266, + "logps/chosen": -372.7100830078125, + "logps/rejected": -205.47361755371094, + "loss": 0.7733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5208802223205566, + "rewards/margins": 0.5984212160110474, + "rewards/rejected": -1.119301438331604, + "step": 2526 + }, + { + "epoch": 0.29, + "learning_rate": 2.1586093878028793e-07, + "logits/chosen": -2.630983352661133, + "logits/rejected": -2.339719772338867, + "logps/chosen": -219.19004821777344, + "logps/rejected": -263.7445983886719, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4666021168231964, + "rewards/margins": 4.09747838973999, + "rewards/rejected": -4.564080238342285, + "step": 2527 + }, + { + "epoch": 0.29, + "learning_rate": 2.1582582231066369e-07, + "logits/chosen": -2.3292646408081055, + "logits/rejected": -2.1583759784698486, + "logps/chosen": -486.8126525878906, + "logps/rejected": -328.5509948730469, + "loss": 0.8789, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5193840265274048, + "rewards/margins": 0.8933296203613281, + "rewards/rejected": -2.4127137660980225, + "step": 2528 + }, + { + "epoch": 0.29, + "learning_rate": 2.1579070584103941e-07, + "logits/chosen": -2.3094847202301025, + "logits/rejected": -2.5998241901397705, + "logps/chosen": -244.00936889648438, + "logps/rejected": -203.90426635742188, + "loss": 0.4778, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0074963569641113, + "rewards/margins": 0.9043752551078796, + "rewards/rejected": -1.9118717908859253, + "step": 2529 + }, + { + "epoch": 0.29, + "learning_rate": 2.157555893714152e-07, + "logits/chosen": -1.9506800174713135, + "logits/rejected": -2.2867136001586914, + "logps/chosen": -293.65643310546875, + "logps/rejected": -181.15602111816406, + "loss": 0.5066, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2802982032299042, + "rewards/margins": 1.7155166864395142, + "rewards/rejected": -1.9958148002624512, + "step": 2530 + }, + { + "epoch": 0.29, + "learning_rate": 2.1572047290179095e-07, + "logits/chosen": -2.3501811027526855, + "logits/rejected": -2.579098701477051, + "logps/chosen": -226.302734375, + "logps/rejected": -172.85804748535156, + "loss": 0.4368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6173152327537537, + "rewards/margins": 1.7641758918762207, + "rewards/rejected": -2.381491184234619, + "step": 2531 + }, + { + "epoch": 0.29, + "learning_rate": 2.1568535643216667e-07, + "logits/chosen": -2.6448915004730225, + "logits/rejected": -2.663639545440674, + "logps/chosen": -232.97772216796875, + "logps/rejected": -147.52154541015625, + "loss": 0.4671, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5582476854324341, + "rewards/margins": 1.0900155305862427, + "rewards/rejected": -1.6482632160186768, + "step": 2532 + }, + { + "epoch": 0.29, + "learning_rate": 2.1565023996254243e-07, + "logits/chosen": -2.249283790588379, + "logits/rejected": -2.3865199089050293, + "logps/chosen": -276.03765869140625, + "logps/rejected": -236.76296997070312, + "loss": 0.3899, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5000569820404053, + "rewards/margins": 1.5978496074676514, + "rewards/rejected": -2.0979065895080566, + "step": 2533 + }, + { + "epoch": 0.29, + "learning_rate": 2.1561512349291818e-07, + "logits/chosen": -2.4983503818511963, + "logits/rejected": -2.367893695831299, + "logps/chosen": -342.3955078125, + "logps/rejected": -275.3607482910156, + "loss": 0.3453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9140653610229492, + "rewards/margins": 2.1655869483947754, + "rewards/rejected": -3.0796525478363037, + "step": 2534 + }, + { + "epoch": 0.29, + "learning_rate": 2.155800070232939e-07, + "logits/chosen": -2.3417129516601562, + "logits/rejected": -2.514291763305664, + "logps/chosen": -201.36428833007812, + "logps/rejected": -274.4166564941406, + "loss": 0.5141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7897626161575317, + "rewards/margins": 1.0796482563018799, + "rewards/rejected": -1.869410753250122, + "step": 2535 + }, + { + "epoch": 0.29, + "learning_rate": 2.1554489055366966e-07, + "logits/chosen": -2.4388468265533447, + "logits/rejected": -2.1271839141845703, + "logps/chosen": -298.814697265625, + "logps/rejected": -398.1164855957031, + "loss": 0.3389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4512139856815338, + "rewards/margins": 2.1147053241729736, + "rewards/rejected": -2.5659191608428955, + "step": 2536 + }, + { + "epoch": 0.29, + "learning_rate": 2.155097740840454e-07, + "logits/chosen": -2.7565269470214844, + "logits/rejected": -2.9070534706115723, + "logps/chosen": -390.2788391113281, + "logps/rejected": -317.38916015625, + "loss": 0.3774, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08782517910003662, + "rewards/margins": 2.0570647716522217, + "rewards/rejected": -1.9692394733428955, + "step": 2537 + }, + { + "epoch": 0.29, + "learning_rate": 2.1547465761442115e-07, + "logits/chosen": -1.7183775901794434, + "logits/rejected": -2.1113641262054443, + "logps/chosen": -302.2200927734375, + "logps/rejected": -185.28549194335938, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8673450946807861, + "rewards/margins": 0.35566064715385437, + "rewards/rejected": -1.223005771636963, + "step": 2538 + }, + { + "epoch": 0.29, + "learning_rate": 2.154395411447969e-07, + "logits/chosen": -1.8362398147583008, + "logits/rejected": -1.9546258449554443, + "logps/chosen": -217.7335662841797, + "logps/rejected": -241.28103637695312, + "loss": 0.4375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6049925088882446, + "rewards/margins": 1.8566267490386963, + "rewards/rejected": -2.4616191387176514, + "step": 2539 + }, + { + "epoch": 0.29, + "learning_rate": 2.1540442467517263e-07, + "logits/chosen": -1.7456085681915283, + "logits/rejected": -1.6231859922409058, + "logps/chosen": -291.3787841796875, + "logps/rejected": -313.04229736328125, + "loss": 0.7225, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.948020875453949, + "rewards/margins": 1.4856127500534058, + "rewards/rejected": -2.433633804321289, + "step": 2540 + }, + { + "epoch": 0.29, + "learning_rate": 2.153693082055484e-07, + "logits/chosen": -2.1474153995513916, + "logits/rejected": -2.039696216583252, + "logps/chosen": -160.99066162109375, + "logps/rejected": -194.1378173828125, + "loss": 0.4474, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0936031341552734, + "rewards/margins": 1.1387839317321777, + "rewards/rejected": -2.232387065887451, + "step": 2541 + }, + { + "epoch": 0.29, + "learning_rate": 2.1533419173592416e-07, + "logits/chosen": -2.7870709896087646, + "logits/rejected": -2.745330333709717, + "logps/chosen": -178.3558807373047, + "logps/rejected": -214.946044921875, + "loss": 0.3748, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.104017734527588, + "rewards/margins": 2.3137288093566895, + "rewards/rejected": -3.4177465438842773, + "step": 2542 + }, + { + "epoch": 0.29, + "learning_rate": 2.152990752662999e-07, + "logits/chosen": -2.4536306858062744, + "logits/rejected": -2.4765725135803223, + "logps/chosen": -216.52684020996094, + "logps/rejected": -152.92518615722656, + "loss": 0.8915, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7870988845825195, + "rewards/margins": 0.9438062906265259, + "rewards/rejected": -2.730905294418335, + "step": 2543 + }, + { + "epoch": 0.29, + "learning_rate": 2.1526395879667564e-07, + "logits/chosen": -2.3612635135650635, + "logits/rejected": -2.3186137676239014, + "logps/chosen": -192.87791442871094, + "logps/rejected": -256.9006042480469, + "loss": 0.4447, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0633541345596313, + "rewards/margins": 1.3554961681365967, + "rewards/rejected": -2.4188504219055176, + "step": 2544 + }, + { + "epoch": 0.29, + "learning_rate": 2.1522884232705137e-07, + "logits/chosen": -2.6956262588500977, + "logits/rejected": -2.7539496421813965, + "logps/chosen": -203.59365844726562, + "logps/rejected": -146.110107421875, + "loss": 0.9825, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4322446584701538, + "rewards/margins": 0.42081543803215027, + "rewards/rejected": -1.853060245513916, + "step": 2545 + }, + { + "epoch": 0.29, + "learning_rate": 2.1519372585742712e-07, + "logits/chosen": -2.6271469593048096, + "logits/rejected": -2.7570557594299316, + "logps/chosen": -266.27490234375, + "logps/rejected": -354.26934814453125, + "loss": 0.5443, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8520601987838745, + "rewards/margins": 1.5307130813598633, + "rewards/rejected": -2.3827731609344482, + "step": 2546 + }, + { + "epoch": 0.29, + "learning_rate": 2.1515860938780288e-07, + "logits/chosen": -2.0272984504699707, + "logits/rejected": -2.018709421157837, + "logps/chosen": -316.25750732421875, + "logps/rejected": -213.12034606933594, + "loss": 0.6839, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7010682821273804, + "rewards/margins": 0.665378749370575, + "rewards/rejected": -1.3664469718933105, + "step": 2547 + }, + { + "epoch": 0.29, + "learning_rate": 2.151234929181786e-07, + "logits/chosen": -2.92584490776062, + "logits/rejected": -2.918818473815918, + "logps/chosen": -252.51821899414062, + "logps/rejected": -233.9752655029297, + "loss": 0.5788, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2425473928451538, + "rewards/margins": 0.6697263717651367, + "rewards/rejected": -1.91227388381958, + "step": 2548 + }, + { + "epoch": 0.29, + "learning_rate": 2.1508837644855436e-07, + "logits/chosen": -2.370844602584839, + "logits/rejected": -2.4070169925689697, + "logps/chosen": -370.2458801269531, + "logps/rejected": -322.5394592285156, + "loss": 0.314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11856741458177567, + "rewards/margins": 1.8314080238342285, + "rewards/rejected": -1.7128405570983887, + "step": 2549 + }, + { + "epoch": 0.29, + "learning_rate": 2.150532599789301e-07, + "logits/chosen": -2.200908660888672, + "logits/rejected": -1.9785445928573608, + "logps/chosen": -136.29002380371094, + "logps/rejected": -255.61473083496094, + "loss": 0.2175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7358732223510742, + "rewards/margins": 2.3933403491973877, + "rewards/rejected": -3.129213809967041, + "step": 2550 + }, + { + "epoch": 0.29, + "learning_rate": 2.1501814350930584e-07, + "logits/chosen": -2.508871555328369, + "logits/rejected": -2.447932720184326, + "logps/chosen": -332.3914794921875, + "logps/rejected": -320.4074401855469, + "loss": 0.8941, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.197753667831421, + "rewards/margins": 1.4799952507019043, + "rewards/rejected": -2.677748918533325, + "step": 2551 + }, + { + "epoch": 0.29, + "learning_rate": 2.1498302703968162e-07, + "logits/chosen": -2.146695375442505, + "logits/rejected": -2.0144145488739014, + "logps/chosen": -223.9160614013672, + "logps/rejected": -300.8786315917969, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.99459308385849, + "rewards/margins": 2.3456974029541016, + "rewards/rejected": -3.3402905464172363, + "step": 2552 + }, + { + "epoch": 0.29, + "learning_rate": 2.1494791057005732e-07, + "logits/chosen": -2.24899959564209, + "logits/rejected": -2.347180128097534, + "logps/chosen": -326.0503845214844, + "logps/rejected": -238.3859405517578, + "loss": 0.8751, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4566758871078491, + "rewards/margins": 0.459940105676651, + "rewards/rejected": -1.9166159629821777, + "step": 2553 + }, + { + "epoch": 0.29, + "learning_rate": 2.149127941004331e-07, + "logits/chosen": -1.6566991806030273, + "logits/rejected": -1.551725149154663, + "logps/chosen": -165.79483032226562, + "logps/rejected": -220.8845977783203, + "loss": 0.397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13907665014266968, + "rewards/margins": 1.2074376344680786, + "rewards/rejected": -1.3465142250061035, + "step": 2554 + }, + { + "epoch": 0.29, + "learning_rate": 2.1487767763080885e-07, + "logits/chosen": -2.1681439876556396, + "logits/rejected": -1.996596336364746, + "logps/chosen": -297.8442077636719, + "logps/rejected": -883.7598876953125, + "loss": 0.2846, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0602171421051025, + "rewards/margins": 2.5240607261657715, + "rewards/rejected": -3.584277868270874, + "step": 2555 + }, + { + "epoch": 0.29, + "learning_rate": 2.1484256116118458e-07, + "logits/chosen": -1.9422223567962646, + "logits/rejected": -2.355135440826416, + "logps/chosen": -377.27593994140625, + "logps/rejected": -195.30133056640625, + "loss": 0.6185, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.883689284324646, + "rewards/margins": 1.1390266418457031, + "rewards/rejected": -2.0227160453796387, + "step": 2556 + }, + { + "epoch": 0.29, + "learning_rate": 2.1480744469156034e-07, + "logits/chosen": -2.606349468231201, + "logits/rejected": -2.4334778785705566, + "logps/chosen": -85.15834045410156, + "logps/rejected": -217.05355834960938, + "loss": 0.27, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3548019826412201, + "rewards/margins": 2.131492853164673, + "rewards/rejected": -2.486294984817505, + "step": 2557 + }, + { + "epoch": 0.29, + "learning_rate": 2.147723282219361e-07, + "logits/chosen": -2.6095046997070312, + "logits/rejected": -2.6245908737182617, + "logps/chosen": -278.53826904296875, + "logps/rejected": -209.469482421875, + "loss": 0.3095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0997484922409058, + "rewards/margins": 1.4632453918457031, + "rewards/rejected": -2.5629940032958984, + "step": 2558 + }, + { + "epoch": 0.3, + "learning_rate": 2.1473721175231182e-07, + "logits/chosen": -2.182626485824585, + "logits/rejected": -2.3340649604797363, + "logps/chosen": -230.0453643798828, + "logps/rejected": -235.95318603515625, + "loss": 0.3441, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0601955652236938, + "rewards/margins": 2.227254867553711, + "rewards/rejected": -3.2874507904052734, + "step": 2559 + }, + { + "epoch": 0.3, + "learning_rate": 2.1470209528268757e-07, + "logits/chosen": -2.2831196784973145, + "logits/rejected": -2.4012303352355957, + "logps/chosen": -194.14620971679688, + "logps/rejected": -246.6404571533203, + "loss": 0.3928, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.378558486700058, + "rewards/margins": 2.549515962600708, + "rewards/rejected": -2.170957326889038, + "step": 2560 + }, + { + "epoch": 0.3, + "learning_rate": 2.146669788130633e-07, + "logits/chosen": -2.115734100341797, + "logits/rejected": -2.146048069000244, + "logps/chosen": -372.7840881347656, + "logps/rejected": -313.6387634277344, + "loss": 0.9239, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.932619333267212, + "rewards/margins": 0.7172772288322449, + "rewards/rejected": -2.6498966217041016, + "step": 2561 + }, + { + "epoch": 0.3, + "learning_rate": 2.1463186234343905e-07, + "logits/chosen": -2.095701217651367, + "logits/rejected": -1.7190190553665161, + "logps/chosen": -230.24147033691406, + "logps/rejected": -371.2140197753906, + "loss": 0.1804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09440355747938156, + "rewards/margins": 4.239691257476807, + "rewards/rejected": -4.334094524383545, + "step": 2562 + }, + { + "epoch": 0.3, + "learning_rate": 2.1459674587381483e-07, + "logits/chosen": -2.3635783195495605, + "logits/rejected": -2.383683443069458, + "logps/chosen": -228.9805908203125, + "logps/rejected": -348.91778564453125, + "loss": 0.1974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7728286981582642, + "rewards/margins": 2.5836336612701416, + "rewards/rejected": -3.3564624786376953, + "step": 2563 + }, + { + "epoch": 0.3, + "learning_rate": 2.1456162940419056e-07, + "logits/chosen": -2.5250556468963623, + "logits/rejected": -1.9877848625183105, + "logps/chosen": -289.58648681640625, + "logps/rejected": -378.22625732421875, + "loss": 0.2478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6952117085456848, + "rewards/margins": 2.0706710815429688, + "rewards/rejected": -2.765882968902588, + "step": 2564 + }, + { + "epoch": 0.3, + "learning_rate": 2.1452651293456631e-07, + "logits/chosen": -2.225236654281616, + "logits/rejected": -2.4570021629333496, + "logps/chosen": -375.5469055175781, + "logps/rejected": -342.50732421875, + "loss": 0.6762, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1570258140563965, + "rewards/margins": 2.055387258529663, + "rewards/rejected": -3.2124128341674805, + "step": 2565 + }, + { + "epoch": 0.3, + "learning_rate": 2.1449139646494204e-07, + "logits/chosen": -2.0320286750793457, + "logits/rejected": -2.1444990634918213, + "logps/chosen": -234.5349884033203, + "logps/rejected": -226.81393432617188, + "loss": 0.5014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7647957801818848, + "rewards/margins": 0.5484486818313599, + "rewards/rejected": -1.3132444620132446, + "step": 2566 + }, + { + "epoch": 0.3, + "learning_rate": 2.144562799953178e-07, + "logits/chosen": -2.7414910793304443, + "logits/rejected": -2.6452620029449463, + "logps/chosen": -243.4730224609375, + "logps/rejected": -257.66827392578125, + "loss": 0.3861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6543620824813843, + "rewards/margins": 2.019935369491577, + "rewards/rejected": -2.674297332763672, + "step": 2567 + }, + { + "epoch": 0.3, + "learning_rate": 2.1442116352569355e-07, + "logits/chosen": -2.5585527420043945, + "logits/rejected": -2.3212594985961914, + "logps/chosen": -166.16070556640625, + "logps/rejected": -353.0267028808594, + "loss": 0.1805, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.628671407699585, + "rewards/margins": 3.3978047370910645, + "rewards/rejected": -4.0264763832092285, + "step": 2568 + }, + { + "epoch": 0.3, + "learning_rate": 2.1438604705606928e-07, + "logits/chosen": -2.769984245300293, + "logits/rejected": -2.787576913833618, + "logps/chosen": -309.31536865234375, + "logps/rejected": -158.00650024414062, + "loss": 0.4139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8874894976615906, + "rewards/margins": 0.8937040567398071, + "rewards/rejected": -1.781193733215332, + "step": 2569 + }, + { + "epoch": 0.3, + "learning_rate": 2.1435093058644503e-07, + "logits/chosen": -2.373570680618286, + "logits/rejected": -2.637197971343994, + "logps/chosen": -356.01800537109375, + "logps/rejected": -226.5042266845703, + "loss": 0.3562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43357980251312256, + "rewards/margins": 1.5528373718261719, + "rewards/rejected": -1.9864170551300049, + "step": 2570 + }, + { + "epoch": 0.3, + "learning_rate": 2.1431581411682078e-07, + "logits/chosen": -1.5355597734451294, + "logits/rejected": -2.2677805423736572, + "logps/chosen": -323.65802001953125, + "logps/rejected": -262.9590148925781, + "loss": 0.629, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8531265258789062, + "rewards/margins": 3.0200488567352295, + "rewards/rejected": -3.8731753826141357, + "step": 2571 + }, + { + "epoch": 0.3, + "learning_rate": 2.142806976471965e-07, + "logits/chosen": -2.3755943775177, + "logits/rejected": -2.6560959815979004, + "logps/chosen": -297.16693115234375, + "logps/rejected": -237.81092834472656, + "loss": 0.6228, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7923929691314697, + "rewards/margins": 1.0257508754730225, + "rewards/rejected": -1.8181438446044922, + "step": 2572 + }, + { + "epoch": 0.3, + "learning_rate": 2.1424558117757227e-07, + "logits/chosen": -2.3643221855163574, + "logits/rejected": -2.2899680137634277, + "logps/chosen": -347.4182434082031, + "logps/rejected": -220.04275512695312, + "loss": 0.4184, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7549554705619812, + "rewards/margins": 1.3204834461212158, + "rewards/rejected": -2.075438976287842, + "step": 2573 + }, + { + "epoch": 0.3, + "learning_rate": 2.14210464707948e-07, + "logits/chosen": -2.6605629920959473, + "logits/rejected": -2.273625373840332, + "logps/chosen": -225.638671875, + "logps/rejected": -385.0511474609375, + "loss": 0.3841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6282962560653687, + "rewards/margins": 1.1144015789031982, + "rewards/rejected": -1.742697834968567, + "step": 2574 + }, + { + "epoch": 0.3, + "learning_rate": 2.1417534823832377e-07, + "logits/chosen": -2.1543970108032227, + "logits/rejected": -2.0708248615264893, + "logps/chosen": -356.2413024902344, + "logps/rejected": -305.69647216796875, + "loss": 0.4913, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7101609706878662, + "rewards/margins": 1.2247068881988525, + "rewards/rejected": -1.9348678588867188, + "step": 2575 + }, + { + "epoch": 0.3, + "learning_rate": 2.1414023176869953e-07, + "logits/chosen": -2.258042812347412, + "logits/rejected": -2.2353463172912598, + "logps/chosen": -143.45115661621094, + "logps/rejected": -157.47186279296875, + "loss": 0.3935, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4094882607460022, + "rewards/margins": 1.284521460533142, + "rewards/rejected": -1.694009780883789, + "step": 2576 + }, + { + "epoch": 0.3, + "learning_rate": 2.1410511529907525e-07, + "logits/chosen": -2.5698893070220947, + "logits/rejected": -2.4105913639068604, + "logps/chosen": -301.963623046875, + "logps/rejected": -278.2498779296875, + "loss": 0.2372, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5190140008926392, + "rewards/margins": 2.2784314155578613, + "rewards/rejected": -3.79744553565979, + "step": 2577 + }, + { + "epoch": 0.3, + "learning_rate": 2.14069998829451e-07, + "logits/chosen": -2.058884859085083, + "logits/rejected": -2.105940103530884, + "logps/chosen": -344.0167541503906, + "logps/rejected": -278.63665771484375, + "loss": 0.1901, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7676056623458862, + "rewards/margins": 2.761768341064453, + "rewards/rejected": -3.529374122619629, + "step": 2578 + }, + { + "epoch": 0.3, + "learning_rate": 2.1403488235982676e-07, + "logits/chosen": -2.3690474033355713, + "logits/rejected": -2.3309667110443115, + "logps/chosen": -200.96136474609375, + "logps/rejected": -292.0684814453125, + "loss": 1.2969, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1939997673034668, + "rewards/margins": 0.7213768362998962, + "rewards/rejected": -1.9153766632080078, + "step": 2579 + }, + { + "epoch": 0.3, + "learning_rate": 2.139997658902025e-07, + "logits/chosen": -2.5289924144744873, + "logits/rejected": -2.438127040863037, + "logps/chosen": -160.40982055664062, + "logps/rejected": -161.41114807128906, + "loss": 0.6085, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8303722143173218, + "rewards/margins": 1.0500810146331787, + "rewards/rejected": -1.88045334815979, + "step": 2580 + }, + { + "epoch": 0.3, + "learning_rate": 2.1396464942057824e-07, + "logits/chosen": -2.5328564643859863, + "logits/rejected": -2.20155930519104, + "logps/chosen": -263.8122253417969, + "logps/rejected": -337.48724365234375, + "loss": 0.4002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7500419616699219, + "rewards/margins": 1.4351928234100342, + "rewards/rejected": -2.185234546661377, + "step": 2581 + }, + { + "epoch": 0.3, + "learning_rate": 2.1392953295095397e-07, + "logits/chosen": -1.3791530132293701, + "logits/rejected": -1.55387544631958, + "logps/chosen": -493.3209228515625, + "logps/rejected": -449.2333068847656, + "loss": 1.2025, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.6340569257736206, + "rewards/margins": 0.5556802153587341, + "rewards/rejected": -2.189736843109131, + "step": 2582 + }, + { + "epoch": 0.3, + "learning_rate": 2.1389441648132972e-07, + "logits/chosen": -2.163374185562134, + "logits/rejected": -2.047855854034424, + "logps/chosen": -274.93756103515625, + "logps/rejected": -279.260009765625, + "loss": 0.1473, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5982179641723633, + "rewards/margins": 3.6204378604888916, + "rewards/rejected": -5.218656063079834, + "step": 2583 + }, + { + "epoch": 0.3, + "learning_rate": 2.1385930001170548e-07, + "logits/chosen": -2.254971742630005, + "logits/rejected": -2.387099027633667, + "logps/chosen": -303.5189208984375, + "logps/rejected": -193.14468383789062, + "loss": 0.222, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.595818281173706, + "rewards/margins": 2.169334888458252, + "rewards/rejected": -1.5735163688659668, + "step": 2584 + }, + { + "epoch": 0.3, + "learning_rate": 2.138241835420812e-07, + "logits/chosen": -2.6492137908935547, + "logits/rejected": -2.4057648181915283, + "logps/chosen": -297.9609680175781, + "logps/rejected": -235.61851501464844, + "loss": 0.3429, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1788300275802612, + "rewards/margins": 2.4224472045898438, + "rewards/rejected": -3.6012773513793945, + "step": 2585 + }, + { + "epoch": 0.3, + "learning_rate": 2.1378906707245699e-07, + "logits/chosen": -2.5962271690368652, + "logits/rejected": -2.629340648651123, + "logps/chosen": -255.43247985839844, + "logps/rejected": -214.9901580810547, + "loss": 0.2279, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.923166036605835, + "rewards/margins": 2.0501532554626465, + "rewards/rejected": -2.9733190536499023, + "step": 2586 + }, + { + "epoch": 0.3, + "learning_rate": 2.1375395060283274e-07, + "logits/chosen": -2.475533962249756, + "logits/rejected": -2.593223810195923, + "logps/chosen": -110.12882995605469, + "logps/rejected": -162.8800811767578, + "loss": 0.5766, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9236884117126465, + "rewards/margins": 1.156229853630066, + "rewards/rejected": -3.079918146133423, + "step": 2587 + }, + { + "epoch": 0.3, + "learning_rate": 2.1371883413320847e-07, + "logits/chosen": -2.3971827030181885, + "logits/rejected": -2.5594022274017334, + "logps/chosen": -356.9502258300781, + "logps/rejected": -329.6301574707031, + "loss": 0.3785, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1045767217874527, + "rewards/margins": 2.272799491882324, + "rewards/rejected": -2.377375841140747, + "step": 2588 + }, + { + "epoch": 0.3, + "learning_rate": 2.1368371766358422e-07, + "logits/chosen": -2.7008275985717773, + "logits/rejected": -2.772785186767578, + "logps/chosen": -372.66558837890625, + "logps/rejected": -381.1258850097656, + "loss": 0.272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7198525667190552, + "rewards/margins": 2.709686279296875, + "rewards/rejected": -3.4295389652252197, + "step": 2589 + }, + { + "epoch": 0.3, + "learning_rate": 2.1364860119395995e-07, + "logits/chosen": -2.410445213317871, + "logits/rejected": -2.4388060569763184, + "logps/chosen": -491.2521057128906, + "logps/rejected": -283.9703063964844, + "loss": 0.4253, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9640113711357117, + "rewards/margins": 1.3394439220428467, + "rewards/rejected": -2.303455114364624, + "step": 2590 + }, + { + "epoch": 0.3, + "learning_rate": 2.136134847243357e-07, + "logits/chosen": -2.1992552280426025, + "logits/rejected": -2.2931647300720215, + "logps/chosen": -383.8555603027344, + "logps/rejected": -291.9119873046875, + "loss": 0.3937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35741686820983887, + "rewards/margins": 2.4620559215545654, + "rewards/rejected": -2.8194727897644043, + "step": 2591 + }, + { + "epoch": 0.3, + "learning_rate": 2.1357836825471146e-07, + "logits/chosen": -1.8572465181350708, + "logits/rejected": -1.9377610683441162, + "logps/chosen": -316.83892822265625, + "logps/rejected": -439.40869140625, + "loss": 0.4423, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8385158181190491, + "rewards/margins": 1.8560843467712402, + "rewards/rejected": -2.6946001052856445, + "step": 2592 + }, + { + "epoch": 0.3, + "learning_rate": 2.1354325178508718e-07, + "logits/chosen": -2.603959083557129, + "logits/rejected": -2.5741491317749023, + "logps/chosen": -217.260498046875, + "logps/rejected": -206.06500244140625, + "loss": 0.2711, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1386228799819946, + "rewards/margins": 1.9924290180206299, + "rewards/rejected": -3.131051778793335, + "step": 2593 + }, + { + "epoch": 0.3, + "learning_rate": 2.1350813531546294e-07, + "logits/chosen": -1.9308924674987793, + "logits/rejected": -2.068061590194702, + "logps/chosen": -287.1161804199219, + "logps/rejected": -301.7713623046875, + "loss": 0.3458, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0179734230041504, + "rewards/margins": 1.4243574142456055, + "rewards/rejected": -3.442330837249756, + "step": 2594 + }, + { + "epoch": 0.3, + "learning_rate": 2.134730188458387e-07, + "logits/chosen": -2.3490757942199707, + "logits/rejected": -2.4538846015930176, + "logps/chosen": -259.2518005371094, + "logps/rejected": -328.69482421875, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6455906629562378, + "rewards/margins": 1.692778468132019, + "rewards/rejected": -3.338369131088257, + "step": 2595 + }, + { + "epoch": 0.3, + "learning_rate": 2.1343790237621442e-07, + "logits/chosen": -2.575906753540039, + "logits/rejected": -2.6684114933013916, + "logps/chosen": -339.8548583984375, + "logps/rejected": -281.6839294433594, + "loss": 0.4361, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6283900141716003, + "rewards/margins": 2.564577579498291, + "rewards/rejected": -3.1929678916931152, + "step": 2596 + }, + { + "epoch": 0.3, + "learning_rate": 2.134027859065902e-07, + "logits/chosen": -1.9912227392196655, + "logits/rejected": -2.2634572982788086, + "logps/chosen": -310.95965576171875, + "logps/rejected": -225.76919555664062, + "loss": 0.7573, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9686962366104126, + "rewards/margins": 0.2178526520729065, + "rewards/rejected": -1.1865488290786743, + "step": 2597 + }, + { + "epoch": 0.3, + "learning_rate": 2.1336766943696593e-07, + "logits/chosen": -2.224790096282959, + "logits/rejected": -2.294203281402588, + "logps/chosen": -217.87098693847656, + "logps/rejected": -333.2406005859375, + "loss": 0.4359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8916549682617188, + "rewards/margins": 2.1289777755737305, + "rewards/rejected": -3.020632743835449, + "step": 2598 + }, + { + "epoch": 0.3, + "learning_rate": 2.1333255296734168e-07, + "logits/chosen": -2.793613910675049, + "logits/rejected": -2.7816762924194336, + "logps/chosen": -146.545166015625, + "logps/rejected": -130.0364532470703, + "loss": 0.267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15303859114646912, + "rewards/margins": 1.401059627532959, + "rewards/rejected": -1.5540982484817505, + "step": 2599 + }, + { + "epoch": 0.3, + "learning_rate": 2.1329743649771743e-07, + "logits/chosen": -1.8489799499511719, + "logits/rejected": -1.975867509841919, + "logps/chosen": -462.5130615234375, + "logps/rejected": -407.5189208984375, + "loss": 0.2458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7635207176208496, + "rewards/margins": 1.7430291175842285, + "rewards/rejected": -2.5065500736236572, + "step": 2600 + }, + { + "epoch": 0.3, + "learning_rate": 2.1326232002809316e-07, + "logits/chosen": -2.919853925704956, + "logits/rejected": -2.777526378631592, + "logps/chosen": -176.41690063476562, + "logps/rejected": -282.2330627441406, + "loss": 0.2775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2507912516593933, + "rewards/margins": 2.2875442504882812, + "rewards/rejected": -2.5383355617523193, + "step": 2601 + }, + { + "epoch": 0.3, + "learning_rate": 2.1322720355846892e-07, + "logits/chosen": -2.522873878479004, + "logits/rejected": -2.755542039871216, + "logps/chosen": -249.7965545654297, + "logps/rejected": -147.0345916748047, + "loss": 0.4892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2927474081516266, + "rewards/margins": 1.6375150680541992, + "rewards/rejected": -1.9302623271942139, + "step": 2602 + }, + { + "epoch": 0.3, + "learning_rate": 2.1319208708884467e-07, + "logits/chosen": -2.375728130340576, + "logits/rejected": -2.3667490482330322, + "logps/chosen": -248.91961669921875, + "logps/rejected": -211.90455627441406, + "loss": 0.2661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3632280230522156, + "rewards/margins": 2.477334499359131, + "rewards/rejected": -2.840562582015991, + "step": 2603 + }, + { + "epoch": 0.3, + "learning_rate": 2.131569706192204e-07, + "logits/chosen": -2.4729201793670654, + "logits/rejected": -2.3965916633605957, + "logps/chosen": -146.0297088623047, + "logps/rejected": -173.49656677246094, + "loss": 0.5012, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24834415316581726, + "rewards/margins": 1.0113152265548706, + "rewards/rejected": -1.2596594095230103, + "step": 2604 + }, + { + "epoch": 0.3, + "learning_rate": 2.1312185414959615e-07, + "logits/chosen": -2.48311185836792, + "logits/rejected": -2.359038829803467, + "logps/chosen": -431.4444580078125, + "logps/rejected": -374.1597900390625, + "loss": 0.8456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.855518102645874, + "rewards/margins": 1.4114460945129395, + "rewards/rejected": -2.2669644355773926, + "step": 2605 + }, + { + "epoch": 0.3, + "learning_rate": 2.1308673767997188e-07, + "logits/chosen": -2.8996620178222656, + "logits/rejected": -2.8458547592163086, + "logps/chosen": -139.0168914794922, + "logps/rejected": -198.68121337890625, + "loss": 0.2339, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09638398885726929, + "rewards/margins": 3.2075748443603516, + "rewards/rejected": -3.3039588928222656, + "step": 2606 + }, + { + "epoch": 0.3, + "learning_rate": 2.1305162121034763e-07, + "logits/chosen": -2.6255104541778564, + "logits/rejected": -2.539196014404297, + "logps/chosen": -109.65428924560547, + "logps/rejected": -259.5006103515625, + "loss": 0.2046, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2652370929718018, + "rewards/margins": 3.369107723236084, + "rewards/rejected": -4.634345054626465, + "step": 2607 + }, + { + "epoch": 0.3, + "learning_rate": 2.130165047407234e-07, + "logits/chosen": -2.666707992553711, + "logits/rejected": -2.622971773147583, + "logps/chosen": -235.65328979492188, + "logps/rejected": -256.0414123535156, + "loss": 0.4328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7671242952346802, + "rewards/margins": 1.0434988737106323, + "rewards/rejected": -1.8106231689453125, + "step": 2608 + }, + { + "epoch": 0.3, + "learning_rate": 2.1298138827109914e-07, + "logits/chosen": -2.4186582565307617, + "logits/rejected": -2.3984577655792236, + "logps/chosen": -208.7000732421875, + "logps/rejected": -253.18936157226562, + "loss": 0.3992, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5626397132873535, + "rewards/margins": 2.4552602767944336, + "rewards/rejected": -3.017899990081787, + "step": 2609 + }, + { + "epoch": 0.3, + "learning_rate": 2.129462718014749e-07, + "logits/chosen": -2.2479000091552734, + "logits/rejected": -2.448456287384033, + "logps/chosen": -236.75396728515625, + "logps/rejected": -239.6634521484375, + "loss": 0.396, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0938267707824707, + "rewards/margins": 1.4842513799667358, + "rewards/rejected": -2.578078269958496, + "step": 2610 + }, + { + "epoch": 0.3, + "learning_rate": 2.1291115533185065e-07, + "logits/chosen": -2.2075705528259277, + "logits/rejected": -2.030015230178833, + "logps/chosen": -231.40765380859375, + "logps/rejected": -369.9085998535156, + "loss": 0.1974, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.354914903640747, + "rewards/margins": 2.374941110610962, + "rewards/rejected": -3.729856014251709, + "step": 2611 + }, + { + "epoch": 0.3, + "learning_rate": 2.1287603886222637e-07, + "logits/chosen": -2.0757060050964355, + "logits/rejected": -1.8724305629730225, + "logps/chosen": -274.9360046386719, + "logps/rejected": -276.18060302734375, + "loss": 0.1935, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0379600003361702, + "rewards/margins": 2.8242146968841553, + "rewards/rejected": -2.786254644393921, + "step": 2612 + }, + { + "epoch": 0.3, + "learning_rate": 2.1284092239260213e-07, + "logits/chosen": -2.5873827934265137, + "logits/rejected": -2.5443227291107178, + "logps/chosen": -174.42295837402344, + "logps/rejected": -212.17364501953125, + "loss": 0.377, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5471092462539673, + "rewards/margins": 2.1604220867156982, + "rewards/rejected": -2.707531213760376, + "step": 2613 + }, + { + "epoch": 0.3, + "learning_rate": 2.1280580592297786e-07, + "logits/chosen": -2.400352954864502, + "logits/rejected": -2.4961462020874023, + "logps/chosen": -147.81983947753906, + "logps/rejected": -194.00119018554688, + "loss": 0.9576, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33033567667007446, + "rewards/margins": 0.9162499308586121, + "rewards/rejected": -1.2465856075286865, + "step": 2614 + }, + { + "epoch": 0.3, + "learning_rate": 2.127706894533536e-07, + "logits/chosen": -1.9121122360229492, + "logits/rejected": -2.355313777923584, + "logps/chosen": -563.5240478515625, + "logps/rejected": -447.7594909667969, + "loss": 0.2132, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23296716809272766, + "rewards/margins": 2.0936920642852783, + "rewards/rejected": -2.3266592025756836, + "step": 2615 + }, + { + "epoch": 0.3, + "learning_rate": 2.1273557298372936e-07, + "logits/chosen": -2.548430919647217, + "logits/rejected": -2.6004695892333984, + "logps/chosen": -204.60911560058594, + "logps/rejected": -198.34475708007812, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6191200613975525, + "rewards/margins": 2.963716506958008, + "rewards/rejected": -3.582836389541626, + "step": 2616 + }, + { + "epoch": 0.3, + "learning_rate": 2.127004565141051e-07, + "logits/chosen": -2.7607405185699463, + "logits/rejected": -2.7131142616271973, + "logps/chosen": -251.22598266601562, + "logps/rejected": -333.57427978515625, + "loss": 0.4533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.369859516620636, + "rewards/margins": 0.8223172426223755, + "rewards/rejected": -1.1921766996383667, + "step": 2617 + }, + { + "epoch": 0.3, + "learning_rate": 2.1266534004448084e-07, + "logits/chosen": -2.4876208305358887, + "logits/rejected": -2.4472198486328125, + "logps/chosen": -163.41796875, + "logps/rejected": -254.348876953125, + "loss": 0.2603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7181317806243896, + "rewards/margins": 2.1750082969665527, + "rewards/rejected": -2.8931403160095215, + "step": 2618 + }, + { + "epoch": 0.3, + "learning_rate": 2.1263022357485657e-07, + "logits/chosen": -2.2187187671661377, + "logits/rejected": -2.1812844276428223, + "logps/chosen": -334.4085693359375, + "logps/rejected": -289.5395812988281, + "loss": 0.2725, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6845586895942688, + "rewards/margins": 2.437798023223877, + "rewards/rejected": -3.122356414794922, + "step": 2619 + }, + { + "epoch": 0.3, + "learning_rate": 2.1259510710523235e-07, + "logits/chosen": -2.3199331760406494, + "logits/rejected": -2.3162457942962646, + "logps/chosen": -141.6297607421875, + "logps/rejected": -227.0218048095703, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7425480484962463, + "rewards/margins": 2.027024269104004, + "rewards/rejected": -2.7695724964141846, + "step": 2620 + }, + { + "epoch": 0.3, + "learning_rate": 2.125599906356081e-07, + "logits/chosen": -2.313141345977783, + "logits/rejected": -2.4537386894226074, + "logps/chosen": -351.9814147949219, + "logps/rejected": -332.6109619140625, + "loss": 0.3465, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8128359317779541, + "rewards/margins": 1.7909976243972778, + "rewards/rejected": -2.6038336753845215, + "step": 2621 + }, + { + "epoch": 0.3, + "learning_rate": 2.1252487416598383e-07, + "logits/chosen": -2.0178945064544678, + "logits/rejected": -2.0685923099517822, + "logps/chosen": -450.2137145996094, + "logps/rejected": -399.78997802734375, + "loss": 0.5043, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8989354372024536, + "rewards/margins": 1.3992360830307007, + "rewards/rejected": -2.2981715202331543, + "step": 2622 + }, + { + "epoch": 0.3, + "learning_rate": 2.124897576963596e-07, + "logits/chosen": -2.1621270179748535, + "logits/rejected": -1.8494329452514648, + "logps/chosen": -273.3951416015625, + "logps/rejected": -386.08447265625, + "loss": 0.5365, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.125458002090454, + "rewards/margins": 1.2439154386520386, + "rewards/rejected": -2.369373321533203, + "step": 2623 + }, + { + "epoch": 0.3, + "learning_rate": 2.1245464122673534e-07, + "logits/chosen": -2.515730381011963, + "logits/rejected": -2.4944028854370117, + "logps/chosen": -291.9100646972656, + "logps/rejected": -248.60873413085938, + "loss": 0.2937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28042834997177124, + "rewards/margins": 1.799802303314209, + "rewards/rejected": -2.080230712890625, + "step": 2624 + }, + { + "epoch": 0.3, + "learning_rate": 2.1241952475711107e-07, + "logits/chosen": -2.154043197631836, + "logits/rejected": -1.966158151626587, + "logps/chosen": -424.2607727050781, + "logps/rejected": -407.46136474609375, + "loss": 0.0679, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11252972483634949, + "rewards/margins": 3.471208095550537, + "rewards/rejected": -3.358678102493286, + "step": 2625 + }, + { + "epoch": 0.3, + "learning_rate": 2.1238440828748682e-07, + "logits/chosen": -2.1669654846191406, + "logits/rejected": -1.905188798904419, + "logps/chosen": -177.36305236816406, + "logps/rejected": -270.06219482421875, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2884536981582642, + "rewards/margins": 2.2090048789978027, + "rewards/rejected": -3.4974584579467773, + "step": 2626 + }, + { + "epoch": 0.3, + "learning_rate": 2.1234929181786255e-07, + "logits/chosen": -2.396131992340088, + "logits/rejected": -2.5254201889038086, + "logps/chosen": -263.7838134765625, + "logps/rejected": -271.5378723144531, + "loss": 0.3881, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7954407930374146, + "rewards/margins": 2.6368002891540527, + "rewards/rejected": -3.4322409629821777, + "step": 2627 + }, + { + "epoch": 0.3, + "learning_rate": 2.123141753482383e-07, + "logits/chosen": -2.405212640762329, + "logits/rejected": -2.633493423461914, + "logps/chosen": -330.7557373046875, + "logps/rejected": -209.66134643554688, + "loss": 0.3379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8353760838508606, + "rewards/margins": 1.4883044958114624, + "rewards/rejected": -2.3236804008483887, + "step": 2628 + }, + { + "epoch": 0.3, + "learning_rate": 2.1227905887861406e-07, + "logits/chosen": -2.577300786972046, + "logits/rejected": -2.3546814918518066, + "logps/chosen": -257.873291015625, + "logps/rejected": -202.2041473388672, + "loss": 0.2876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5898175835609436, + "rewards/margins": 2.1038475036621094, + "rewards/rejected": -2.693665027618408, + "step": 2629 + }, + { + "epoch": 0.3, + "learning_rate": 2.1224394240898979e-07, + "logits/chosen": -2.5046286582946777, + "logits/rejected": -2.5746583938598633, + "logps/chosen": -198.41903686523438, + "logps/rejected": -172.739990234375, + "loss": 0.1468, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.102842926979065, + "rewards/margins": 2.359375, + "rewards/rejected": -3.4622180461883545, + "step": 2630 + }, + { + "epoch": 0.3, + "learning_rate": 2.1220882593936557e-07, + "logits/chosen": -2.3147716522216797, + "logits/rejected": -1.4618935585021973, + "logps/chosen": -148.53988647460938, + "logps/rejected": -572.7411499023438, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3859986364841461, + "rewards/margins": 2.5637621879577637, + "rewards/rejected": -2.949760913848877, + "step": 2631 + }, + { + "epoch": 0.3, + "learning_rate": 2.1217370946974132e-07, + "logits/chosen": -2.7466623783111572, + "logits/rejected": -2.6903951168060303, + "logps/chosen": -345.8239440917969, + "logps/rejected": -302.8529052734375, + "loss": 0.9204, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.399731397628784, + "rewards/margins": 1.1598894596099854, + "rewards/rejected": -3.5596210956573486, + "step": 2632 + }, + { + "epoch": 0.3, + "learning_rate": 2.1213859300011705e-07, + "logits/chosen": -2.2052345275878906, + "logits/rejected": -2.2617297172546387, + "logps/chosen": -317.3245544433594, + "logps/rejected": -351.31756591796875, + "loss": 0.3851, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9218258857727051, + "rewards/margins": 1.5823955535888672, + "rewards/rejected": -2.5042214393615723, + "step": 2633 + }, + { + "epoch": 0.3, + "learning_rate": 2.121034765304928e-07, + "logits/chosen": -2.360642433166504, + "logits/rejected": -2.3233656883239746, + "logps/chosen": -251.32186889648438, + "logps/rejected": -206.4317626953125, + "loss": 0.5109, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4761550426483154, + "rewards/margins": 1.2677556276321411, + "rewards/rejected": -2.743910551071167, + "step": 2634 + }, + { + "epoch": 0.3, + "learning_rate": 2.1206836006086853e-07, + "logits/chosen": -2.7285284996032715, + "logits/rejected": -2.74623966217041, + "logps/chosen": -259.1949462890625, + "logps/rejected": -303.7427978515625, + "loss": 0.2361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7278380393981934, + "rewards/margins": 2.4763238430023193, + "rewards/rejected": -3.204162120819092, + "step": 2635 + }, + { + "epoch": 0.3, + "learning_rate": 2.1203324359124428e-07, + "logits/chosen": -2.676199436187744, + "logits/rejected": -2.8835091590881348, + "logps/chosen": -214.58111572265625, + "logps/rejected": -207.50485229492188, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0701870918273926, + "rewards/margins": 1.6853992938995361, + "rewards/rejected": -2.7555863857269287, + "step": 2636 + }, + { + "epoch": 0.3, + "learning_rate": 2.1199812712162004e-07, + "logits/chosen": -2.529000759124756, + "logits/rejected": -2.425200939178467, + "logps/chosen": -332.3180847167969, + "logps/rejected": -312.75897216796875, + "loss": 0.2879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9330446720123291, + "rewards/margins": 2.0024664402008057, + "rewards/rejected": -2.935511350631714, + "step": 2637 + }, + { + "epoch": 0.3, + "learning_rate": 2.1196301065199576e-07, + "logits/chosen": -2.7907731533050537, + "logits/rejected": -2.6504554748535156, + "logps/chosen": -263.6886291503906, + "logps/rejected": -360.96197509765625, + "loss": 0.1859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1685313731431961, + "rewards/margins": 2.3399226665496826, + "rewards/rejected": -2.5084540843963623, + "step": 2638 + }, + { + "epoch": 0.3, + "learning_rate": 2.1192789418237152e-07, + "logits/chosen": -1.928160309791565, + "logits/rejected": -2.105875015258789, + "logps/chosen": -352.5225830078125, + "logps/rejected": -258.01324462890625, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6013691425323486, + "rewards/margins": 2.2547926902770996, + "rewards/rejected": -2.8561618328094482, + "step": 2639 + }, + { + "epoch": 0.3, + "learning_rate": 2.1189277771274727e-07, + "logits/chosen": -2.4836575984954834, + "logits/rejected": -2.586505174636841, + "logps/chosen": -182.0884246826172, + "logps/rejected": -199.50831604003906, + "loss": 0.6725, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6217401027679443, + "rewards/margins": 1.2423304319381714, + "rewards/rejected": -2.8640706539154053, + "step": 2640 + }, + { + "epoch": 0.3, + "learning_rate": 2.11857661243123e-07, + "logits/chosen": -2.4074208736419678, + "logits/rejected": -2.3233866691589355, + "logps/chosen": -225.94146728515625, + "logps/rejected": -276.94500732421875, + "loss": 0.3201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4459531009197235, + "rewards/margins": 2.1060128211975098, + "rewards/rejected": -2.5519659519195557, + "step": 2641 + }, + { + "epoch": 0.3, + "learning_rate": 2.1182254477349878e-07, + "logits/chosen": -2.3607428073883057, + "logits/rejected": -2.717050075531006, + "logps/chosen": -353.8540954589844, + "logps/rejected": -226.31793212890625, + "loss": 0.5381, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.083139181137085, + "rewards/margins": 1.223055362701416, + "rewards/rejected": -2.306194543838501, + "step": 2642 + }, + { + "epoch": 0.3, + "learning_rate": 2.117874283038745e-07, + "logits/chosen": -2.390833854675293, + "logits/rejected": -2.434299945831299, + "logps/chosen": -338.81396484375, + "logps/rejected": -332.17376708984375, + "loss": 0.5226, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0806701183319092, + "rewards/margins": 1.6234418153762817, + "rewards/rejected": -2.7041118144989014, + "step": 2643 + }, + { + "epoch": 0.3, + "learning_rate": 2.1175231183425026e-07, + "logits/chosen": -1.9663317203521729, + "logits/rejected": -2.117114782333374, + "logps/chosen": -143.51947021484375, + "logps/rejected": -174.2673797607422, + "loss": 0.4008, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6106756329536438, + "rewards/margins": 1.0680378675460815, + "rewards/rejected": -1.6787135601043701, + "step": 2644 + }, + { + "epoch": 0.3, + "learning_rate": 2.1171719536462601e-07, + "logits/chosen": -2.2771759033203125, + "logits/rejected": -2.292076587677002, + "logps/chosen": -188.79766845703125, + "logps/rejected": -175.3734130859375, + "loss": 1.9737, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.885798931121826, + "rewards/margins": -0.9239658713340759, + "rewards/rejected": -1.961832880973816, + "step": 2645 + }, + { + "epoch": 0.31, + "learning_rate": 2.1168207889500174e-07, + "logits/chosen": -1.8664822578430176, + "logits/rejected": -2.1586406230926514, + "logps/chosen": -314.60260009765625, + "logps/rejected": -156.48345947265625, + "loss": 0.4888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4102492332458496, + "rewards/margins": 0.8043211102485657, + "rewards/rejected": -1.21457040309906, + "step": 2646 + }, + { + "epoch": 0.31, + "learning_rate": 2.116469624253775e-07, + "logits/chosen": -2.293513298034668, + "logits/rejected": -2.5479836463928223, + "logps/chosen": -375.71417236328125, + "logps/rejected": -215.96051025390625, + "loss": 0.2097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5037028193473816, + "rewards/margins": 2.436265707015991, + "rewards/rejected": -2.9399685859680176, + "step": 2647 + }, + { + "epoch": 0.31, + "learning_rate": 2.1161184595575325e-07, + "logits/chosen": -2.6817355155944824, + "logits/rejected": -2.719938278198242, + "logps/chosen": -305.412841796875, + "logps/rejected": -216.09619140625, + "loss": 0.2698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2409363090991974, + "rewards/margins": 2.2861814498901367, + "rewards/rejected": -2.5271177291870117, + "step": 2648 + }, + { + "epoch": 0.31, + "learning_rate": 2.1157672948612898e-07, + "logits/chosen": -2.4290525913238525, + "logits/rejected": -2.1486380100250244, + "logps/chosen": -265.5804748535156, + "logps/rejected": -274.55255126953125, + "loss": 0.4541, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9480488300323486, + "rewards/margins": 3.832843780517578, + "rewards/rejected": -4.780892848968506, + "step": 2649 + }, + { + "epoch": 0.31, + "learning_rate": 2.1154161301650473e-07, + "logits/chosen": -2.5135176181793213, + "logits/rejected": -2.63594913482666, + "logps/chosen": -325.3284912109375, + "logps/rejected": -273.1072998046875, + "loss": 0.2844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9635632038116455, + "rewards/margins": 2.5451765060424805, + "rewards/rejected": -3.508739709854126, + "step": 2650 + }, + { + "epoch": 0.31, + "learning_rate": 2.1150649654688046e-07, + "logits/chosen": -2.7630844116210938, + "logits/rejected": -2.8550615310668945, + "logps/chosen": -118.67991638183594, + "logps/rejected": -236.557373046875, + "loss": 0.1799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6240607500076294, + "rewards/margins": 3.029646873474121, + "rewards/rejected": -3.653707504272461, + "step": 2651 + }, + { + "epoch": 0.31, + "learning_rate": 2.114713800772562e-07, + "logits/chosen": -2.594730854034424, + "logits/rejected": -2.4862120151519775, + "logps/chosen": -266.2283020019531, + "logps/rejected": -250.740234375, + "loss": 0.3067, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3173115253448486, + "rewards/margins": 1.5077677965164185, + "rewards/rejected": -2.8250796794891357, + "step": 2652 + }, + { + "epoch": 0.31, + "learning_rate": 2.11436263607632e-07, + "logits/chosen": -2.3456361293792725, + "logits/rejected": -2.698631525039673, + "logps/chosen": -313.92999267578125, + "logps/rejected": -398.459716796875, + "loss": 0.069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3468824028968811, + "rewards/margins": 4.278311729431152, + "rewards/rejected": -4.625194072723389, + "step": 2653 + }, + { + "epoch": 0.31, + "learning_rate": 2.1140114713800772e-07, + "logits/chosen": -2.408447742462158, + "logits/rejected": -2.5559964179992676, + "logps/chosen": -231.0034637451172, + "logps/rejected": -283.11572265625, + "loss": 0.24, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.721239447593689, + "rewards/margins": 2.4491429328918457, + "rewards/rejected": -3.170382499694824, + "step": 2654 + }, + { + "epoch": 0.31, + "learning_rate": 2.1136603066838347e-07, + "logits/chosen": -2.677835464477539, + "logits/rejected": -2.876169204711914, + "logps/chosen": -251.8473663330078, + "logps/rejected": -324.6608581542969, + "loss": 0.1758, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2601782083511353, + "rewards/margins": 3.104574680328369, + "rewards/rejected": -4.364752292633057, + "step": 2655 + }, + { + "epoch": 0.31, + "learning_rate": 2.1133091419875923e-07, + "logits/chosen": -2.7875289916992188, + "logits/rejected": -2.478060007095337, + "logps/chosen": -291.86328125, + "logps/rejected": -285.09423828125, + "loss": 0.3649, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3558965921401978, + "rewards/margins": 2.9269161224365234, + "rewards/rejected": -4.28281307220459, + "step": 2656 + }, + { + "epoch": 0.31, + "learning_rate": 2.1129579772913495e-07, + "logits/chosen": -1.7215982675552368, + "logits/rejected": -2.0809426307678223, + "logps/chosen": -316.2393798828125, + "logps/rejected": -217.2515869140625, + "loss": 0.4891, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5396252274513245, + "rewards/margins": 0.8392350077629089, + "rewards/rejected": -1.3788602352142334, + "step": 2657 + }, + { + "epoch": 0.31, + "learning_rate": 2.112606812595107e-07, + "logits/chosen": -2.475426435470581, + "logits/rejected": -2.52268648147583, + "logps/chosen": -186.45211791992188, + "logps/rejected": -191.1383819580078, + "loss": 0.458, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27756741642951965, + "rewards/margins": 1.4676331281661987, + "rewards/rejected": -1.7452006340026855, + "step": 2658 + }, + { + "epoch": 0.31, + "learning_rate": 2.1122556478988644e-07, + "logits/chosen": -2.3097965717315674, + "logits/rejected": -2.379911184310913, + "logps/chosen": -285.2305908203125, + "logps/rejected": -234.89340209960938, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0317630767822266, + "rewards/margins": 1.802616834640503, + "rewards/rejected": -2.8343801498413086, + "step": 2659 + }, + { + "epoch": 0.31, + "learning_rate": 2.111904483202622e-07, + "logits/chosen": -2.425274610519409, + "logits/rejected": -2.5562305450439453, + "logps/chosen": -300.2313537597656, + "logps/rejected": -303.1878967285156, + "loss": 0.5367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5352650880813599, + "rewards/margins": 2.033252000808716, + "rewards/rejected": -2.5685172080993652, + "step": 2660 + }, + { + "epoch": 0.31, + "learning_rate": 2.1115533185063794e-07, + "logits/chosen": -2.1584692001342773, + "logits/rejected": -1.8958284854888916, + "logps/chosen": -441.7482604980469, + "logps/rejected": -457.6128234863281, + "loss": 1.1733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8463637828826904, + "rewards/margins": 1.4699749946594238, + "rewards/rejected": -3.316338539123535, + "step": 2661 + }, + { + "epoch": 0.31, + "learning_rate": 2.1112021538101367e-07, + "logits/chosen": -2.2137269973754883, + "logits/rejected": -1.7913331985473633, + "logps/chosen": -329.7873840332031, + "logps/rejected": -386.8814392089844, + "loss": 0.3965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8464211821556091, + "rewards/margins": 1.8240278959274292, + "rewards/rejected": -2.6704492568969727, + "step": 2662 + }, + { + "epoch": 0.31, + "learning_rate": 2.1108509891138942e-07, + "logits/chosen": -2.3253800868988037, + "logits/rejected": -2.186415910720825, + "logps/chosen": -174.31332397460938, + "logps/rejected": -154.66647338867188, + "loss": 0.3395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43049386143684387, + "rewards/margins": 2.0665295124053955, + "rewards/rejected": -2.497023344039917, + "step": 2663 + }, + { + "epoch": 0.31, + "learning_rate": 2.1104998244176515e-07, + "logits/chosen": -2.0262861251831055, + "logits/rejected": -2.0376806259155273, + "logps/chosen": -190.77296447753906, + "logps/rejected": -356.9886169433594, + "loss": 0.1095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39385294914245605, + "rewards/margins": 3.493417739868164, + "rewards/rejected": -3.887270450592041, + "step": 2664 + }, + { + "epoch": 0.31, + "learning_rate": 2.1101486597214093e-07, + "logits/chosen": -1.6349941492080688, + "logits/rejected": -1.8527214527130127, + "logps/chosen": -352.5596923828125, + "logps/rejected": -256.28570556640625, + "loss": 0.4635, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6673945784568787, + "rewards/margins": 1.1905378103256226, + "rewards/rejected": -1.857932448387146, + "step": 2665 + }, + { + "epoch": 0.31, + "learning_rate": 2.1097974950251669e-07, + "logits/chosen": -2.4061317443847656, + "logits/rejected": -2.4059600830078125, + "logps/chosen": -465.4176025390625, + "logps/rejected": -414.1136474609375, + "loss": 0.4565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8753427863121033, + "rewards/margins": 1.2864933013916016, + "rewards/rejected": -2.1618361473083496, + "step": 2666 + }, + { + "epoch": 0.31, + "learning_rate": 2.109446330328924e-07, + "logits/chosen": -2.337282180786133, + "logits/rejected": -1.9083099365234375, + "logps/chosen": -293.8794860839844, + "logps/rejected": -345.5462646484375, + "loss": 0.4487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6182281970977783, + "rewards/margins": 1.2854362726211548, + "rewards/rejected": -1.903664469718933, + "step": 2667 + }, + { + "epoch": 0.31, + "learning_rate": 2.1090951656326817e-07, + "logits/chosen": -2.2448344230651855, + "logits/rejected": -2.267833709716797, + "logps/chosen": -378.932373046875, + "logps/rejected": -267.947265625, + "loss": 0.3459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23521766066551208, + "rewards/margins": 1.8922317028045654, + "rewards/rejected": -2.1274495124816895, + "step": 2668 + }, + { + "epoch": 0.31, + "learning_rate": 2.1087440009364392e-07, + "logits/chosen": -2.0786287784576416, + "logits/rejected": -2.112837791442871, + "logps/chosen": -283.2190856933594, + "logps/rejected": -229.65582275390625, + "loss": 0.4232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6354347467422485, + "rewards/margins": 0.7139739990234375, + "rewards/rejected": -1.3494086265563965, + "step": 2669 + }, + { + "epoch": 0.31, + "learning_rate": 2.1083928362401965e-07, + "logits/chosen": -2.4076595306396484, + "logits/rejected": -2.2392642498016357, + "logps/chosen": -178.7041015625, + "logps/rejected": -268.007080078125, + "loss": 0.4076, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.322270154953003, + "rewards/margins": 2.0917115211486816, + "rewards/rejected": -3.4139814376831055, + "step": 2670 + }, + { + "epoch": 0.31, + "learning_rate": 2.108041671543954e-07, + "logits/chosen": -2.5661749839782715, + "logits/rejected": -2.4302430152893066, + "logps/chosen": -183.1044158935547, + "logps/rejected": -185.04820251464844, + "loss": 0.4134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4387527108192444, + "rewards/margins": 1.4786995649337769, + "rewards/rejected": -1.9174522161483765, + "step": 2671 + }, + { + "epoch": 0.31, + "learning_rate": 2.1076905068477113e-07, + "logits/chosen": -2.0058560371398926, + "logits/rejected": -2.4493296146392822, + "logps/chosen": -330.7136535644531, + "logps/rejected": -170.99703979492188, + "loss": 1.1807, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.84823477268219, + "rewards/margins": 0.37193113565444946, + "rewards/rejected": -2.220165967941284, + "step": 2672 + }, + { + "epoch": 0.31, + "learning_rate": 2.1073393421514688e-07, + "logits/chosen": -2.213886022567749, + "logits/rejected": -2.3294167518615723, + "logps/chosen": -307.2398681640625, + "logps/rejected": -250.2476348876953, + "loss": 0.2046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33338621258735657, + "rewards/margins": 1.8693264722824097, + "rewards/rejected": -1.535940170288086, + "step": 2673 + }, + { + "epoch": 0.31, + "learning_rate": 2.1069881774552264e-07, + "logits/chosen": -2.337857961654663, + "logits/rejected": -2.175901412963867, + "logps/chosen": -274.3938903808594, + "logps/rejected": -304.00579833984375, + "loss": 0.2266, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01674852892756462, + "rewards/margins": 3.0927302837371826, + "rewards/rejected": -3.075981855392456, + "step": 2674 + }, + { + "epoch": 0.31, + "learning_rate": 2.1066370127589836e-07, + "logits/chosen": -2.2349300384521484, + "logits/rejected": -2.3033981323242188, + "logps/chosen": -202.41432189941406, + "logps/rejected": -213.71566772460938, + "loss": 0.9528, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3840010166168213, + "rewards/margins": 1.2767547369003296, + "rewards/rejected": -2.6607558727264404, + "step": 2675 + }, + { + "epoch": 0.31, + "learning_rate": 2.1062858480627414e-07, + "logits/chosen": -2.5747227668762207, + "logits/rejected": -2.8205642700195312, + "logps/chosen": -304.2304382324219, + "logps/rejected": -207.74349975585938, + "loss": 0.8222, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.256429672241211, + "rewards/margins": 0.6492285132408142, + "rewards/rejected": -1.9056583642959595, + "step": 2676 + }, + { + "epoch": 0.31, + "learning_rate": 2.105934683366499e-07, + "logits/chosen": -2.265155792236328, + "logits/rejected": -2.611119508743286, + "logps/chosen": -483.6532287597656, + "logps/rejected": -275.76776123046875, + "loss": 0.389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5924148559570312, + "rewards/margins": 2.1407573223114014, + "rewards/rejected": -2.7331719398498535, + "step": 2677 + }, + { + "epoch": 0.31, + "learning_rate": 2.1055835186702563e-07, + "logits/chosen": -2.5164742469787598, + "logits/rejected": -2.655330181121826, + "logps/chosen": -159.45858764648438, + "logps/rejected": -290.87884521484375, + "loss": 0.3097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8560831546783447, + "rewards/margins": 2.6763811111450195, + "rewards/rejected": -3.5324642658233643, + "step": 2678 + }, + { + "epoch": 0.31, + "learning_rate": 2.1052323539740138e-07, + "logits/chosen": -1.8867506980895996, + "logits/rejected": -2.2343859672546387, + "logps/chosen": -388.21453857421875, + "logps/rejected": -229.72146606445312, + "loss": 0.6958, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0253158807754517, + "rewards/margins": 1.6649425029754639, + "rewards/rejected": -2.690258502960205, + "step": 2679 + }, + { + "epoch": 0.31, + "learning_rate": 2.104881189277771e-07, + "logits/chosen": -1.9955968856811523, + "logits/rejected": -2.13688063621521, + "logps/chosen": -256.93231201171875, + "logps/rejected": -249.73147583007812, + "loss": 0.3811, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5619848966598511, + "rewards/margins": 1.9664710760116577, + "rewards/rejected": -2.528455972671509, + "step": 2680 + }, + { + "epoch": 0.31, + "learning_rate": 2.1045300245815286e-07, + "logits/chosen": -2.7268261909484863, + "logits/rejected": -2.3735930919647217, + "logps/chosen": -202.59188842773438, + "logps/rejected": -298.4215087890625, + "loss": 0.0909, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36385923624038696, + "rewards/margins": 4.367283821105957, + "rewards/rejected": -4.731142997741699, + "step": 2681 + }, + { + "epoch": 0.31, + "learning_rate": 2.1041788598852861e-07, + "logits/chosen": -2.2289280891418457, + "logits/rejected": -1.9846476316452026, + "logps/chosen": -230.74893188476562, + "logps/rejected": -377.64727783203125, + "loss": 0.3618, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.053034782409668, + "rewards/margins": 2.5064404010772705, + "rewards/rejected": -3.5594749450683594, + "step": 2682 + }, + { + "epoch": 0.31, + "learning_rate": 2.1038276951890434e-07, + "logits/chosen": -2.6235461235046387, + "logits/rejected": -2.5404767990112305, + "logps/chosen": -183.88058471679688, + "logps/rejected": -251.8575897216797, + "loss": 0.4761, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3199506998062134, + "rewards/margins": 1.7000582218170166, + "rewards/rejected": -3.0200088024139404, + "step": 2683 + }, + { + "epoch": 0.31, + "learning_rate": 2.103476530492801e-07, + "logits/chosen": -2.119997501373291, + "logits/rejected": -2.297714948654175, + "logps/chosen": -441.2605285644531, + "logps/rejected": -246.57164001464844, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5883786082267761, + "rewards/margins": 1.805064082145691, + "rewards/rejected": -2.3934426307678223, + "step": 2684 + }, + { + "epoch": 0.31, + "learning_rate": 2.1031253657965588e-07, + "logits/chosen": -2.5267300605773926, + "logits/rejected": -2.438732385635376, + "logps/chosen": -267.87176513671875, + "logps/rejected": -390.383544921875, + "loss": 0.294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45737224817276, + "rewards/margins": 2.691026449203491, + "rewards/rejected": -3.1483981609344482, + "step": 2685 + }, + { + "epoch": 0.31, + "learning_rate": 2.1027742011003158e-07, + "logits/chosen": -2.0445985794067383, + "logits/rejected": -1.7737438678741455, + "logps/chosen": -355.00958251953125, + "logps/rejected": -546.4495239257812, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49453607201576233, + "rewards/margins": 2.8976244926452637, + "rewards/rejected": -3.392160415649414, + "step": 2686 + }, + { + "epoch": 0.31, + "learning_rate": 2.1024230364040736e-07, + "logits/chosen": -2.2629618644714355, + "logits/rejected": -2.1795401573181152, + "logps/chosen": -242.2452392578125, + "logps/rejected": -371.6811218261719, + "loss": 0.416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.891493558883667, + "rewards/margins": 1.273648738861084, + "rewards/rejected": -2.165142297744751, + "step": 2687 + }, + { + "epoch": 0.31, + "learning_rate": 2.1020718717078309e-07, + "logits/chosen": -2.2650556564331055, + "logits/rejected": -2.0789215564727783, + "logps/chosen": -194.5699920654297, + "logps/rejected": -253.68807983398438, + "loss": 0.3916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6614631414413452, + "rewards/margins": 2.2367515563964844, + "rewards/rejected": -2.898214817047119, + "step": 2688 + }, + { + "epoch": 0.31, + "learning_rate": 2.1017207070115884e-07, + "logits/chosen": -2.022006034851074, + "logits/rejected": -2.079972743988037, + "logps/chosen": -230.59417724609375, + "logps/rejected": -303.08233642578125, + "loss": 0.343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3723102807998657, + "rewards/margins": 3.3613576889038086, + "rewards/rejected": -3.7336678504943848, + "step": 2689 + }, + { + "epoch": 0.31, + "learning_rate": 2.101369542315346e-07, + "logits/chosen": -2.330974817276001, + "logits/rejected": -2.699890613555908, + "logps/chosen": -508.80914306640625, + "logps/rejected": -368.0151062011719, + "loss": 0.1909, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021362125873565674, + "rewards/margins": 4.264474868774414, + "rewards/rejected": -4.243112564086914, + "step": 2690 + }, + { + "epoch": 0.31, + "learning_rate": 2.1010183776191032e-07, + "logits/chosen": -2.022221088409424, + "logits/rejected": -2.3977761268615723, + "logps/chosen": -376.4938049316406, + "logps/rejected": -291.922119140625, + "loss": 0.4629, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.522978663444519, + "rewards/margins": 1.1800854206085205, + "rewards/rejected": -1.703064203262329, + "step": 2691 + }, + { + "epoch": 0.31, + "learning_rate": 2.1006672129228607e-07, + "logits/chosen": -2.490233898162842, + "logits/rejected": -2.504859447479248, + "logps/chosen": -189.27133178710938, + "logps/rejected": -277.3870849609375, + "loss": 0.2649, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.718280553817749, + "rewards/margins": 2.271272659301758, + "rewards/rejected": -2.989553213119507, + "step": 2692 + }, + { + "epoch": 0.31, + "learning_rate": 2.1003160482266183e-07, + "logits/chosen": -2.6440367698669434, + "logits/rejected": -2.601635217666626, + "logps/chosen": -270.5170593261719, + "logps/rejected": -363.4310302734375, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5777231454849243, + "rewards/margins": 2.003716468811035, + "rewards/rejected": -2.581439733505249, + "step": 2693 + }, + { + "epoch": 0.31, + "learning_rate": 2.0999648835303756e-07, + "logits/chosen": -2.545621395111084, + "logits/rejected": -2.6131083965301514, + "logps/chosen": -362.5561828613281, + "logps/rejected": -273.7876281738281, + "loss": 0.3541, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7542310953140259, + "rewards/margins": 1.5246241092681885, + "rewards/rejected": -2.278855085372925, + "step": 2694 + }, + { + "epoch": 0.31, + "learning_rate": 2.099613718834133e-07, + "logits/chosen": -2.642129421234131, + "logits/rejected": -2.42096209526062, + "logps/chosen": -220.91038513183594, + "logps/rejected": -365.2979736328125, + "loss": 0.3817, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9526932239532471, + "rewards/margins": 2.015732765197754, + "rewards/rejected": -2.968425989151001, + "step": 2695 + }, + { + "epoch": 0.31, + "learning_rate": 2.0992625541378904e-07, + "logits/chosen": -2.0924696922302246, + "logits/rejected": -2.1211602687835693, + "logps/chosen": -289.21063232421875, + "logps/rejected": -298.26690673828125, + "loss": 0.3899, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1079578772187233, + "rewards/margins": 1.3686314821243286, + "rewards/rejected": -1.476589322090149, + "step": 2696 + }, + { + "epoch": 0.31, + "learning_rate": 2.098911389441648e-07, + "logits/chosen": -2.628222942352295, + "logits/rejected": -2.6865220069885254, + "logps/chosen": -137.60928344726562, + "logps/rejected": -199.04611206054688, + "loss": 0.1096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02506151795387268, + "rewards/margins": 2.953294277191162, + "rewards/rejected": -2.978355646133423, + "step": 2697 + }, + { + "epoch": 0.31, + "learning_rate": 2.0985602247454057e-07, + "logits/chosen": -2.2335619926452637, + "logits/rejected": -1.8673529624938965, + "logps/chosen": -259.51275634765625, + "logps/rejected": -349.8844299316406, + "loss": 0.3176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4514622092247009, + "rewards/margins": 2.1140940189361572, + "rewards/rejected": -2.565556287765503, + "step": 2698 + }, + { + "epoch": 0.31, + "learning_rate": 2.098209060049163e-07, + "logits/chosen": -2.1527788639068604, + "logits/rejected": -2.2582814693450928, + "logps/chosen": -333.36236572265625, + "logps/rejected": -222.23971557617188, + "loss": 0.3357, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.041440486907959, + "rewards/margins": 1.39397394657135, + "rewards/rejected": -2.4354145526885986, + "step": 2699 + }, + { + "epoch": 0.31, + "learning_rate": 2.0978578953529205e-07, + "logits/chosen": -2.124408721923828, + "logits/rejected": -2.287606954574585, + "logps/chosen": -162.00054931640625, + "logps/rejected": -226.88275146484375, + "loss": 0.6015, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3467904329299927, + "rewards/margins": 1.6112120151519775, + "rewards/rejected": -2.9580023288726807, + "step": 2700 + }, + { + "epoch": 0.31, + "learning_rate": 2.097506730656678e-07, + "logits/chosen": -2.551084518432617, + "logits/rejected": -2.537383556365967, + "logps/chosen": -245.57162475585938, + "logps/rejected": -412.14263916015625, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7290098667144775, + "rewards/margins": 3.596701145172119, + "rewards/rejected": -4.325711250305176, + "step": 2701 + }, + { + "epoch": 0.31, + "learning_rate": 2.0971555659604353e-07, + "logits/chosen": -2.3258190155029297, + "logits/rejected": -2.1197402477264404, + "logps/chosen": -143.64712524414062, + "logps/rejected": -150.98989868164062, + "loss": 0.5334, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.95977383852005, + "rewards/margins": 1.9830042123794556, + "rewards/rejected": -2.9427781105041504, + "step": 2702 + }, + { + "epoch": 0.31, + "learning_rate": 2.096804401264193e-07, + "logits/chosen": -2.8817906379699707, + "logits/rejected": -2.6398439407348633, + "logps/chosen": -134.67666625976562, + "logps/rejected": -246.19003295898438, + "loss": 0.1961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9255506992340088, + "rewards/margins": 3.1381678581237793, + "rewards/rejected": -4.063718318939209, + "step": 2703 + }, + { + "epoch": 0.31, + "learning_rate": 2.0964532365679501e-07, + "logits/chosen": -2.350926160812378, + "logits/rejected": -2.4574320316314697, + "logps/chosen": -326.1263122558594, + "logps/rejected": -341.1051025390625, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5990500450134277, + "rewards/margins": 2.7297415733337402, + "rewards/rejected": -3.328791618347168, + "step": 2704 + }, + { + "epoch": 0.31, + "learning_rate": 2.0961020718717077e-07, + "logits/chosen": -1.8162314891815186, + "logits/rejected": -1.771700382232666, + "logps/chosen": -265.86181640625, + "logps/rejected": -241.2607421875, + "loss": 0.5134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.661068856716156, + "rewards/margins": 1.0997123718261719, + "rewards/rejected": -1.7607810497283936, + "step": 2705 + }, + { + "epoch": 0.31, + "learning_rate": 2.0957509071754652e-07, + "logits/chosen": -1.9685288667678833, + "logits/rejected": -2.1765153408050537, + "logps/chosen": -283.3369140625, + "logps/rejected": -322.6715393066406, + "loss": 0.2663, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8392664194107056, + "rewards/margins": 4.224853992462158, + "rewards/rejected": -5.064120292663574, + "step": 2706 + }, + { + "epoch": 0.31, + "learning_rate": 2.0953997424792225e-07, + "logits/chosen": -2.2326467037200928, + "logits/rejected": -1.815114140510559, + "logps/chosen": -236.25115966796875, + "logps/rejected": -334.16412353515625, + "loss": 0.1517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6167525053024292, + "rewards/margins": 3.057589292526245, + "rewards/rejected": -3.6743416786193848, + "step": 2707 + }, + { + "epoch": 0.31, + "learning_rate": 2.09504857778298e-07, + "logits/chosen": -2.1912646293640137, + "logits/rejected": -2.074315071105957, + "logps/chosen": -158.22528076171875, + "logps/rejected": -279.4290466308594, + "loss": 0.3335, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1221184730529785, + "rewards/margins": 3.3731284141540527, + "rewards/rejected": -4.495246887207031, + "step": 2708 + }, + { + "epoch": 0.31, + "learning_rate": 2.0946974130867373e-07, + "logits/chosen": -2.7844607830047607, + "logits/rejected": -2.66595458984375, + "logps/chosen": -184.6405487060547, + "logps/rejected": -215.91387939453125, + "loss": 0.2954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6228742003440857, + "rewards/margins": 1.3692388534545898, + "rewards/rejected": -1.9921131134033203, + "step": 2709 + }, + { + "epoch": 0.31, + "learning_rate": 2.094346248390495e-07, + "logits/chosen": -1.9909663200378418, + "logits/rejected": -2.3342912197113037, + "logps/chosen": -236.8087158203125, + "logps/rejected": -166.46778869628906, + "loss": 0.4677, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42396053671836853, + "rewards/margins": 1.8264594078063965, + "rewards/rejected": -2.250419855117798, + "step": 2710 + }, + { + "epoch": 0.31, + "learning_rate": 2.0939950836942526e-07, + "logits/chosen": -2.10159969329834, + "logits/rejected": -2.095987319946289, + "logps/chosen": -241.5083770751953, + "logps/rejected": -298.3133544921875, + "loss": 0.3427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8668222427368164, + "rewards/margins": 1.657733678817749, + "rewards/rejected": -2.5245559215545654, + "step": 2711 + }, + { + "epoch": 0.31, + "learning_rate": 2.09364391899801e-07, + "logits/chosen": -1.9361860752105713, + "logits/rejected": -2.1457746028900146, + "logps/chosen": -504.0346984863281, + "logps/rejected": -424.36151123046875, + "loss": 0.3647, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11465534567832947, + "rewards/margins": 2.184811592102051, + "rewards/rejected": -2.299466848373413, + "step": 2712 + }, + { + "epoch": 0.31, + "learning_rate": 2.0932927543017675e-07, + "logits/chosen": -2.232431173324585, + "logits/rejected": -2.228703737258911, + "logps/chosen": -216.690185546875, + "logps/rejected": -270.92437744140625, + "loss": 0.2272, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5407695770263672, + "rewards/margins": 3.733837842941284, + "rewards/rejected": -4.2746076583862305, + "step": 2713 + }, + { + "epoch": 0.31, + "learning_rate": 2.092941589605525e-07, + "logits/chosen": -2.647352695465088, + "logits/rejected": -2.724882125854492, + "logps/chosen": -237.39352416992188, + "logps/rejected": -197.9309844970703, + "loss": 0.5798, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2707103490829468, + "rewards/margins": 1.2434394359588623, + "rewards/rejected": -2.5141496658325195, + "step": 2714 + }, + { + "epoch": 0.31, + "learning_rate": 2.0925904249092823e-07, + "logits/chosen": -2.1613473892211914, + "logits/rejected": -2.331233501434326, + "logps/chosen": -262.6318359375, + "logps/rejected": -143.722412109375, + "loss": 0.5387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9277799129486084, + "rewards/margins": 1.116539716720581, + "rewards/rejected": -2.0443196296691895, + "step": 2715 + }, + { + "epoch": 0.31, + "learning_rate": 2.0922392602130398e-07, + "logits/chosen": -2.251851797103882, + "logits/rejected": -2.6025915145874023, + "logps/chosen": -501.49798583984375, + "logps/rejected": -238.54420471191406, + "loss": 0.3607, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6224414110183716, + "rewards/margins": 2.016666889190674, + "rewards/rejected": -2.639108180999756, + "step": 2716 + }, + { + "epoch": 0.31, + "learning_rate": 2.091888095516797e-07, + "logits/chosen": -2.3211722373962402, + "logits/rejected": -2.44064998626709, + "logps/chosen": -303.2532958984375, + "logps/rejected": -279.0360412597656, + "loss": 0.609, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1345791816711426, + "rewards/margins": 1.2592942714691162, + "rewards/rejected": -2.393873453140259, + "step": 2717 + }, + { + "epoch": 0.31, + "learning_rate": 2.0915369308205546e-07, + "logits/chosen": -2.0143377780914307, + "logits/rejected": -2.100159168243408, + "logps/chosen": -308.07861328125, + "logps/rejected": -228.73519897460938, + "loss": 0.8312, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1311016082763672, + "rewards/margins": 0.7115135788917542, + "rewards/rejected": -1.8426152467727661, + "step": 2718 + }, + { + "epoch": 0.31, + "learning_rate": 2.0911857661243124e-07, + "logits/chosen": -2.601602077484131, + "logits/rejected": -2.63686466217041, + "logps/chosen": -337.527099609375, + "logps/rejected": -217.96585083007812, + "loss": 0.4104, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8425840139389038, + "rewards/margins": 1.532795786857605, + "rewards/rejected": -2.375379800796509, + "step": 2719 + }, + { + "epoch": 0.31, + "learning_rate": 2.0908346014280694e-07, + "logits/chosen": -2.394993543624878, + "logits/rejected": -2.614366054534912, + "logps/chosen": -272.63104248046875, + "logps/rejected": -186.74966430664062, + "loss": 0.5987, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0133925676345825, + "rewards/margins": 0.8921200633049011, + "rewards/rejected": -1.9055126905441284, + "step": 2720 + }, + { + "epoch": 0.31, + "learning_rate": 2.0904834367318272e-07, + "logits/chosen": -1.953494906425476, + "logits/rejected": -2.0432114601135254, + "logps/chosen": -289.0672912597656, + "logps/rejected": -346.9803771972656, + "loss": 0.264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5843742489814758, + "rewards/margins": 1.8854373693466187, + "rewards/rejected": -2.46981143951416, + "step": 2721 + }, + { + "epoch": 0.31, + "learning_rate": 2.0901322720355848e-07, + "logits/chosen": -2.060572862625122, + "logits/rejected": -2.006304979324341, + "logps/chosen": -215.57601928710938, + "logps/rejected": -275.193359375, + "loss": 0.2373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5622003674507141, + "rewards/margins": 2.4596996307373047, + "rewards/rejected": -3.021899938583374, + "step": 2722 + }, + { + "epoch": 0.31, + "learning_rate": 2.089781107339342e-07, + "logits/chosen": -2.9378104209899902, + "logits/rejected": -2.943495750427246, + "logps/chosen": -272.3619384765625, + "logps/rejected": -345.31060791015625, + "loss": 0.3664, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3692359924316406, + "rewards/margins": 1.8879637718200684, + "rewards/rejected": -3.257199764251709, + "step": 2723 + }, + { + "epoch": 0.31, + "learning_rate": 2.0894299426430996e-07, + "logits/chosen": -1.9986594915390015, + "logits/rejected": -2.1419107913970947, + "logps/chosen": -305.75140380859375, + "logps/rejected": -342.0673828125, + "loss": 0.2116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0685895681381226, + "rewards/margins": 5.352239608764648, + "rewards/rejected": -6.4208292961120605, + "step": 2724 + }, + { + "epoch": 0.31, + "learning_rate": 2.0890787779468569e-07, + "logits/chosen": -2.2676808834075928, + "logits/rejected": -2.6053974628448486, + "logps/chosen": -340.669189453125, + "logps/rejected": -231.84585571289062, + "loss": 0.6535, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9400598406791687, + "rewards/margins": 1.0811426639556885, + "rewards/rejected": -2.021202564239502, + "step": 2725 + }, + { + "epoch": 0.31, + "learning_rate": 2.0887276132506144e-07, + "logits/chosen": -2.347080945968628, + "logits/rejected": -2.401468515396118, + "logps/chosen": -196.65541076660156, + "logps/rejected": -233.43692016601562, + "loss": 0.3678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7563194632530212, + "rewards/margins": 1.9054776430130005, + "rewards/rejected": -2.661797046661377, + "step": 2726 + }, + { + "epoch": 0.31, + "learning_rate": 2.088376448554372e-07, + "logits/chosen": -2.328263282775879, + "logits/rejected": -1.9847633838653564, + "logps/chosen": -205.73419189453125, + "logps/rejected": -374.55169677734375, + "loss": 0.7273, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9623489379882812, + "rewards/margins": 0.7693599462509155, + "rewards/rejected": -1.7317090034484863, + "step": 2727 + }, + { + "epoch": 0.31, + "learning_rate": 2.0880252838581292e-07, + "logits/chosen": -1.6417322158813477, + "logits/rejected": -2.2685232162475586, + "logps/chosen": -536.3944091796875, + "logps/rejected": -305.2862548828125, + "loss": 0.4947, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.15724515914917, + "rewards/margins": 2.180267333984375, + "rewards/rejected": -3.337512493133545, + "step": 2728 + }, + { + "epoch": 0.31, + "learning_rate": 2.0876741191618868e-07, + "logits/chosen": -1.944338083267212, + "logits/rejected": -1.8918589353561401, + "logps/chosen": -262.43023681640625, + "logps/rejected": -238.13987731933594, + "loss": 0.3212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39932990074157715, + "rewards/margins": 2.058983325958252, + "rewards/rejected": -2.458313226699829, + "step": 2729 + }, + { + "epoch": 0.31, + "learning_rate": 2.0873229544656446e-07, + "logits/chosen": -2.055102825164795, + "logits/rejected": -2.236821174621582, + "logps/chosen": -418.9134521484375, + "logps/rejected": -284.8441162109375, + "loss": 0.5398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8316920399665833, + "rewards/margins": 1.0926799774169922, + "rewards/rejected": -1.9243719577789307, + "step": 2730 + }, + { + "epoch": 0.31, + "learning_rate": 2.0869717897694016e-07, + "logits/chosen": -2.5742173194885254, + "logits/rejected": -2.516545295715332, + "logps/chosen": -129.6215057373047, + "logps/rejected": -171.14810180664062, + "loss": 0.1901, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4571876525878906, + "rewards/margins": 2.8135547637939453, + "rewards/rejected": -3.270742654800415, + "step": 2731 + }, + { + "epoch": 0.31, + "learning_rate": 2.0866206250731594e-07, + "logits/chosen": -2.6020922660827637, + "logits/rejected": -2.676403760910034, + "logps/chosen": -480.387451171875, + "logps/rejected": -247.1926727294922, + "loss": 0.7263, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.950707733631134, + "rewards/margins": 1.1203949451446533, + "rewards/rejected": -2.0711028575897217, + "step": 2732 + }, + { + "epoch": 0.32, + "learning_rate": 2.0862694603769166e-07, + "logits/chosen": -2.3994569778442383, + "logits/rejected": -2.2227377891540527, + "logps/chosen": -172.22000122070312, + "logps/rejected": -416.3756103515625, + "loss": 0.5669, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2062971591949463, + "rewards/margins": 0.9121830463409424, + "rewards/rejected": -2.1184802055358887, + "step": 2733 + }, + { + "epoch": 0.32, + "learning_rate": 2.0859182956806742e-07, + "logits/chosen": -2.46884822845459, + "logits/rejected": -2.301624059677124, + "logps/chosen": -114.80006408691406, + "logps/rejected": -218.3236846923828, + "loss": 0.2953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9670981168746948, + "rewards/margins": 1.5414221286773682, + "rewards/rejected": -2.5085203647613525, + "step": 2734 + }, + { + "epoch": 0.32, + "learning_rate": 2.0855671309844317e-07, + "logits/chosen": -1.8514691591262817, + "logits/rejected": -1.8994131088256836, + "logps/chosen": -232.66973876953125, + "logps/rejected": -239.29757690429688, + "loss": 1.0886, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5016090869903564, + "rewards/margins": 1.506947636604309, + "rewards/rejected": -3.008556842803955, + "step": 2735 + }, + { + "epoch": 0.32, + "learning_rate": 2.085215966288189e-07, + "logits/chosen": -2.7116708755493164, + "logits/rejected": -2.402194023132324, + "logps/chosen": -232.5982666015625, + "logps/rejected": -223.95904541015625, + "loss": 0.386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5125440359115601, + "rewards/margins": 2.3261516094207764, + "rewards/rejected": -2.838695764541626, + "step": 2736 + }, + { + "epoch": 0.32, + "learning_rate": 2.0848648015919465e-07, + "logits/chosen": -2.2201149463653564, + "logits/rejected": -2.4965991973876953, + "logps/chosen": -306.5833740234375, + "logps/rejected": -270.4555358886719, + "loss": 0.558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14577719569206238, + "rewards/margins": 1.9082239866256714, + "rewards/rejected": -2.0540013313293457, + "step": 2737 + }, + { + "epoch": 0.32, + "learning_rate": 2.084513636895704e-07, + "logits/chosen": -2.5986056327819824, + "logits/rejected": -2.631967067718506, + "logps/chosen": -242.1343231201172, + "logps/rejected": -253.40554809570312, + "loss": 0.3861, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.895363211631775, + "rewards/margins": 1.822792649269104, + "rewards/rejected": -3.718155860900879, + "step": 2738 + }, + { + "epoch": 0.32, + "learning_rate": 2.0841624721994613e-07, + "logits/chosen": -2.624380111694336, + "logits/rejected": -2.453558921813965, + "logps/chosen": -276.5224609375, + "logps/rejected": -302.35174560546875, + "loss": 0.8684, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.680402398109436, + "rewards/margins": 0.520300030708313, + "rewards/rejected": -2.200702428817749, + "step": 2739 + }, + { + "epoch": 0.32, + "learning_rate": 2.083811307503219e-07, + "logits/chosen": -2.3849198818206787, + "logits/rejected": -2.4349212646484375, + "logps/chosen": -367.7228088378906, + "logps/rejected": -258.9012451171875, + "loss": 0.4293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9593265056610107, + "rewards/margins": 1.399050235748291, + "rewards/rejected": -2.3583767414093018, + "step": 2740 + }, + { + "epoch": 0.32, + "learning_rate": 2.0834601428069762e-07, + "logits/chosen": -2.6109306812286377, + "logits/rejected": -2.4307403564453125, + "logps/chosen": -186.93511962890625, + "logps/rejected": -313.643798828125, + "loss": 0.2736, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.226152777671814, + "rewards/margins": 2.1503632068634033, + "rewards/rejected": -3.3765158653259277, + "step": 2741 + }, + { + "epoch": 0.32, + "learning_rate": 2.0831089781107337e-07, + "logits/chosen": -2.317697286605835, + "logits/rejected": -2.660194158554077, + "logps/chosen": -277.7393493652344, + "logps/rejected": -245.88320922851562, + "loss": 0.796, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1203806400299072, + "rewards/margins": 2.3183629512786865, + "rewards/rejected": -3.438743829727173, + "step": 2742 + }, + { + "epoch": 0.32, + "learning_rate": 2.0827578134144915e-07, + "logits/chosen": -2.023312568664551, + "logits/rejected": -2.039297103881836, + "logps/chosen": -315.03228759765625, + "logps/rejected": -367.41064453125, + "loss": 1.0472, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.935558557510376, + "rewards/margins": 0.2805784046649933, + "rewards/rejected": -2.216136932373047, + "step": 2743 + }, + { + "epoch": 0.32, + "learning_rate": 2.0824066487182488e-07, + "logits/chosen": -2.0895090103149414, + "logits/rejected": -2.242541790008545, + "logps/chosen": -245.65232849121094, + "logps/rejected": -286.5834045410156, + "loss": 0.5846, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9467366933822632, + "rewards/margins": 0.7918686270713806, + "rewards/rejected": -1.738605260848999, + "step": 2744 + }, + { + "epoch": 0.32, + "learning_rate": 2.0820554840220063e-07, + "logits/chosen": -2.238748073577881, + "logits/rejected": -2.3139922618865967, + "logps/chosen": -322.118408203125, + "logps/rejected": -230.97023010253906, + "loss": 0.5117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8507544994354248, + "rewards/margins": 0.8245752453804016, + "rewards/rejected": -1.6753296852111816, + "step": 2745 + }, + { + "epoch": 0.32, + "learning_rate": 2.0817043193257639e-07, + "logits/chosen": -2.0847713947296143, + "logits/rejected": -2.262535572052002, + "logps/chosen": -500.3838195800781, + "logps/rejected": -279.26409912109375, + "loss": 0.497, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6937698721885681, + "rewards/margins": 2.2063164710998535, + "rewards/rejected": -2.9000864028930664, + "step": 2746 + }, + { + "epoch": 0.32, + "learning_rate": 2.081353154629521e-07, + "logits/chosen": -2.973543405532837, + "logits/rejected": -3.006865978240967, + "logps/chosen": -253.0279541015625, + "logps/rejected": -233.2784423828125, + "loss": 0.3067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6316075325012207, + "rewards/margins": 2.447420835494995, + "rewards/rejected": -3.079028367996216, + "step": 2747 + }, + { + "epoch": 0.32, + "learning_rate": 2.0810019899332787e-07, + "logits/chosen": -2.193147659301758, + "logits/rejected": -2.0648248195648193, + "logps/chosen": -191.94046020507812, + "logps/rejected": -201.48416137695312, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4423988461494446, + "rewards/margins": 3.5172677040100098, + "rewards/rejected": -3.9596662521362305, + "step": 2748 + }, + { + "epoch": 0.32, + "learning_rate": 2.080650825237036e-07, + "logits/chosen": -2.7599709033966064, + "logits/rejected": -2.6813409328460693, + "logps/chosen": -192.7355499267578, + "logps/rejected": -254.32347106933594, + "loss": 0.3859, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3092749118804932, + "rewards/margins": 2.2632055282592773, + "rewards/rejected": -3.5724804401397705, + "step": 2749 + }, + { + "epoch": 0.32, + "learning_rate": 2.0802996605407935e-07, + "logits/chosen": -2.501807928085327, + "logits/rejected": -2.3571627140045166, + "logps/chosen": -382.1121826171875, + "logps/rejected": -306.01922607421875, + "loss": 0.3134, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1057560443878174, + "rewards/margins": 1.2329647541046143, + "rewards/rejected": -2.3387207984924316, + "step": 2750 + }, + { + "epoch": 0.32, + "learning_rate": 2.079948495844551e-07, + "logits/chosen": -2.188143253326416, + "logits/rejected": -2.028346061706543, + "logps/chosen": -112.89546966552734, + "logps/rejected": -270.2606201171875, + "loss": 0.5291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33958375453948975, + "rewards/margins": 1.0814614295959473, + "rewards/rejected": -1.421045184135437, + "step": 2751 + }, + { + "epoch": 0.32, + "learning_rate": 2.0795973311483083e-07, + "logits/chosen": -2.7272815704345703, + "logits/rejected": -2.787787437438965, + "logps/chosen": -199.4695281982422, + "logps/rejected": -214.7635040283203, + "loss": 0.6754, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3849077224731445, + "rewards/margins": 1.310697317123413, + "rewards/rejected": -2.6956048011779785, + "step": 2752 + }, + { + "epoch": 0.32, + "learning_rate": 2.079246166452066e-07, + "logits/chosen": -2.106128215789795, + "logits/rejected": -2.0723392963409424, + "logps/chosen": -399.7431640625, + "logps/rejected": -427.4577941894531, + "loss": 0.2748, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9316126108169556, + "rewards/margins": 1.950298547744751, + "rewards/rejected": -2.881911277770996, + "step": 2753 + }, + { + "epoch": 0.32, + "learning_rate": 2.0788950017558236e-07, + "logits/chosen": -2.5683181285858154, + "logits/rejected": -2.7195355892181396, + "logps/chosen": -302.7029724121094, + "logps/rejected": -228.7387237548828, + "loss": 0.9096, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7234569787979126, + "rewards/margins": 0.3710588216781616, + "rewards/rejected": -2.0945160388946533, + "step": 2754 + }, + { + "epoch": 0.32, + "learning_rate": 2.078543837059581e-07, + "logits/chosen": -2.740368366241455, + "logits/rejected": -2.7632699012756348, + "logps/chosen": -204.57791137695312, + "logps/rejected": -292.07244873046875, + "loss": 0.1828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12654823064804077, + "rewards/margins": 2.7398500442504883, + "rewards/rejected": -2.866398334503174, + "step": 2755 + }, + { + "epoch": 0.32, + "learning_rate": 2.0781926723633384e-07, + "logits/chosen": -2.3467705249786377, + "logits/rejected": -2.6515889167785645, + "logps/chosen": -383.44183349609375, + "logps/rejected": -229.42117309570312, + "loss": 0.4605, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0759053230285645, + "rewards/margins": 1.2502309083938599, + "rewards/rejected": -2.3261361122131348, + "step": 2756 + }, + { + "epoch": 0.32, + "learning_rate": 2.0778415076670957e-07, + "logits/chosen": -1.9917161464691162, + "logits/rejected": -2.28285813331604, + "logps/chosen": -233.2066650390625, + "logps/rejected": -185.7424774169922, + "loss": 0.5325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7452152967453003, + "rewards/margins": 1.1019656658172607, + "rewards/rejected": -1.847180962562561, + "step": 2757 + }, + { + "epoch": 0.32, + "learning_rate": 2.0774903429708533e-07, + "logits/chosen": -2.528407573699951, + "logits/rejected": -2.4345526695251465, + "logps/chosen": -434.24993896484375, + "logps/rejected": -391.218994140625, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0781421661376953, + "rewards/margins": 3.071455240249634, + "rewards/rejected": -4.14959716796875, + "step": 2758 + }, + { + "epoch": 0.32, + "learning_rate": 2.0771391782746108e-07, + "logits/chosen": -2.69781494140625, + "logits/rejected": -2.4046192169189453, + "logps/chosen": -268.54901123046875, + "logps/rejected": -356.9112243652344, + "loss": 0.1087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2248659133911133, + "rewards/margins": 3.7119240760803223, + "rewards/rejected": -4.936789512634277, + "step": 2759 + }, + { + "epoch": 0.32, + "learning_rate": 2.076788013578368e-07, + "logits/chosen": -2.6133298873901367, + "logits/rejected": -2.424560546875, + "logps/chosen": -97.69856262207031, + "logps/rejected": -188.5050811767578, + "loss": 0.6643, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7272077798843384, + "rewards/margins": 1.75462007522583, + "rewards/rejected": -2.481827735900879, + "step": 2760 + }, + { + "epoch": 0.32, + "learning_rate": 2.0764368488821256e-07, + "logits/chosen": -2.3307509422302246, + "logits/rejected": -2.71112060546875, + "logps/chosen": -458.6762390136719, + "logps/rejected": -158.33367919921875, + "loss": 0.5627, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3719395399093628, + "rewards/margins": 1.2655000686645508, + "rewards/rejected": -2.637439489364624, + "step": 2761 + }, + { + "epoch": 0.32, + "learning_rate": 2.076085684185883e-07, + "logits/chosen": -2.395247459411621, + "logits/rejected": -2.0372462272644043, + "logps/chosen": -237.12353515625, + "logps/rejected": -366.0506896972656, + "loss": 0.3744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7172194719314575, + "rewards/margins": 2.6804211139678955, + "rewards/rejected": -3.3976407051086426, + "step": 2762 + }, + { + "epoch": 0.32, + "learning_rate": 2.0757345194896404e-07, + "logits/chosen": -1.637451410293579, + "logits/rejected": -2.0120091438293457, + "logps/chosen": -389.99957275390625, + "logps/rejected": -251.3214569091797, + "loss": 0.3847, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3391643166542053, + "rewards/margins": 1.8211798667907715, + "rewards/rejected": -1.4820154905319214, + "step": 2763 + }, + { + "epoch": 0.32, + "learning_rate": 2.0753833547933982e-07, + "logits/chosen": -1.849151372909546, + "logits/rejected": -2.288752555847168, + "logps/chosen": -527.2423706054688, + "logps/rejected": -407.20819091796875, + "loss": 1.0261, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7410643100738525, + "rewards/margins": 0.9975496530532837, + "rewards/rejected": -2.7386138439178467, + "step": 2764 + }, + { + "epoch": 0.32, + "learning_rate": 2.0750321900971552e-07, + "logits/chosen": -2.2070531845092773, + "logits/rejected": -2.4731287956237793, + "logps/chosen": -369.0859375, + "logps/rejected": -306.2019958496094, + "loss": 0.5924, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.158040165901184, + "rewards/margins": 0.8017956614494324, + "rewards/rejected": -1.9598357677459717, + "step": 2765 + }, + { + "epoch": 0.32, + "learning_rate": 2.074681025400913e-07, + "logits/chosen": -2.0110890865325928, + "logits/rejected": -1.9259997606277466, + "logps/chosen": -222.53143310546875, + "logps/rejected": -229.12338256835938, + "loss": 0.4351, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21723605692386627, + "rewards/margins": 1.768902063369751, + "rewards/rejected": -1.986138105392456, + "step": 2766 + }, + { + "epoch": 0.32, + "learning_rate": 2.0743298607046706e-07, + "logits/chosen": -2.32454776763916, + "logits/rejected": -2.37611985206604, + "logps/chosen": -347.5258483886719, + "logps/rejected": -237.15975952148438, + "loss": 0.7475, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2285419702529907, + "rewards/margins": 0.6522011756896973, + "rewards/rejected": -1.880743145942688, + "step": 2767 + }, + { + "epoch": 0.32, + "learning_rate": 2.0739786960084278e-07, + "logits/chosen": -2.0079104900360107, + "logits/rejected": -2.1880335807800293, + "logps/chosen": -188.06680297851562, + "logps/rejected": -181.37518310546875, + "loss": 0.8629, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1989703178405762, + "rewards/margins": 0.8008305430412292, + "rewards/rejected": -1.9998008012771606, + "step": 2768 + }, + { + "epoch": 0.32, + "learning_rate": 2.0736275313121854e-07, + "logits/chosen": -2.5643725395202637, + "logits/rejected": -2.5310885906219482, + "logps/chosen": -276.04296875, + "logps/rejected": -301.7142639160156, + "loss": 0.4758, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7207544445991516, + "rewards/margins": 2.6717379093170166, + "rewards/rejected": -3.3924922943115234, + "step": 2769 + }, + { + "epoch": 0.32, + "learning_rate": 2.0732763666159427e-07, + "logits/chosen": -2.399465560913086, + "logits/rejected": -2.3339905738830566, + "logps/chosen": -112.52254486083984, + "logps/rejected": -147.8238525390625, + "loss": 1.1553, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2761107683181763, + "rewards/margins": 0.35885024070739746, + "rewards/rejected": -1.6349611282348633, + "step": 2770 + }, + { + "epoch": 0.32, + "learning_rate": 2.0729252019197002e-07, + "logits/chosen": -2.157546043395996, + "logits/rejected": -2.2741739749908447, + "logps/chosen": -300.9049072265625, + "logps/rejected": -354.348388671875, + "loss": 1.5899, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6039460897445679, + "rewards/margins": 1.1743054389953613, + "rewards/rejected": -2.7782514095306396, + "step": 2771 + }, + { + "epoch": 0.32, + "learning_rate": 2.0725740372234577e-07, + "logits/chosen": -2.551260471343994, + "logits/rejected": -2.631087303161621, + "logps/chosen": -241.66171264648438, + "logps/rejected": -245.18785095214844, + "loss": 0.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.738028883934021, + "rewards/margins": 3.6962828636169434, + "rewards/rejected": -4.434311866760254, + "step": 2772 + }, + { + "epoch": 0.32, + "learning_rate": 2.072222872527215e-07, + "logits/chosen": -2.506906509399414, + "logits/rejected": -2.7506422996520996, + "logps/chosen": -378.8830871582031, + "logps/rejected": -202.87547302246094, + "loss": 0.2212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11420813202857971, + "rewards/margins": 2.9172072410583496, + "rewards/rejected": -3.0314154624938965, + "step": 2773 + }, + { + "epoch": 0.32, + "learning_rate": 2.0718717078309725e-07, + "logits/chosen": -3.073991298675537, + "logits/rejected": -3.185063123703003, + "logps/chosen": -267.98529052734375, + "logps/rejected": -273.5995178222656, + "loss": 0.2825, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0001755952835083, + "rewards/margins": 2.5859103202819824, + "rewards/rejected": -3.5860860347747803, + "step": 2774 + }, + { + "epoch": 0.32, + "learning_rate": 2.0715205431347304e-07, + "logits/chosen": -2.132582902908325, + "logits/rejected": -2.022858142852783, + "logps/chosen": -319.1655578613281, + "logps/rejected": -343.0974426269531, + "loss": 0.3834, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3332783281803131, + "rewards/margins": 1.9341181516647339, + "rewards/rejected": -2.2673964500427246, + "step": 2775 + }, + { + "epoch": 0.32, + "learning_rate": 2.0711693784384874e-07, + "logits/chosen": -1.9692986011505127, + "logits/rejected": -2.350828170776367, + "logps/chosen": -437.0403137207031, + "logps/rejected": -293.84136962890625, + "loss": 0.4424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5569376945495605, + "rewards/margins": 1.691957712173462, + "rewards/rejected": -2.2488954067230225, + "step": 2776 + }, + { + "epoch": 0.32, + "learning_rate": 2.0708182137422452e-07, + "logits/chosen": -2.1568572521209717, + "logits/rejected": -2.3813021183013916, + "logps/chosen": -233.3666229248047, + "logps/rejected": -173.92483520507812, + "loss": 0.4917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8573590517044067, + "rewards/margins": 1.6549800634384155, + "rewards/rejected": -2.5123391151428223, + "step": 2777 + }, + { + "epoch": 0.32, + "learning_rate": 2.0704670490460024e-07, + "logits/chosen": -2.5841140747070312, + "logits/rejected": -2.6039505004882812, + "logps/chosen": -236.56124877929688, + "logps/rejected": -334.0174255371094, + "loss": 0.6552, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.159980058670044, + "rewards/margins": 1.1861499547958374, + "rewards/rejected": -2.346129894256592, + "step": 2778 + }, + { + "epoch": 0.32, + "learning_rate": 2.07011588434976e-07, + "logits/chosen": -2.1336822509765625, + "logits/rejected": -1.9822273254394531, + "logps/chosen": -114.61213684082031, + "logps/rejected": -182.45018005371094, + "loss": 0.6075, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0906602144241333, + "rewards/margins": 1.3842613697052002, + "rewards/rejected": -2.474921703338623, + "step": 2779 + }, + { + "epoch": 0.32, + "learning_rate": 2.0697647196535175e-07, + "logits/chosen": -2.2110939025878906, + "logits/rejected": -2.092989683151245, + "logps/chosen": -320.2752380371094, + "logps/rejected": -481.1492004394531, + "loss": 0.4765, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.432222455739975, + "rewards/margins": 1.6904138326644897, + "rewards/rejected": -2.122636318206787, + "step": 2780 + }, + { + "epoch": 0.32, + "learning_rate": 2.0694135549572748e-07, + "logits/chosen": -2.3738765716552734, + "logits/rejected": -2.4376416206359863, + "logps/chosen": -444.3968200683594, + "logps/rejected": -393.78704833984375, + "loss": 0.4113, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.017866611480713, + "rewards/margins": 1.037284016609192, + "rewards/rejected": -2.0551505088806152, + "step": 2781 + }, + { + "epoch": 0.32, + "learning_rate": 2.0690623902610323e-07, + "logits/chosen": -2.5375070571899414, + "logits/rejected": -2.6732919216156006, + "logps/chosen": -214.96127319335938, + "logps/rejected": -174.2030029296875, + "loss": 0.4807, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4709702730178833, + "rewards/margins": 0.9470996856689453, + "rewards/rejected": -2.418070077896118, + "step": 2782 + }, + { + "epoch": 0.32, + "learning_rate": 2.0687112255647899e-07, + "logits/chosen": -2.6494297981262207, + "logits/rejected": -2.408587694168091, + "logps/chosen": -245.55560302734375, + "logps/rejected": -184.8299560546875, + "loss": 0.3306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24943505227565765, + "rewards/margins": 1.4820679426193237, + "rewards/rejected": -1.7315030097961426, + "step": 2783 + }, + { + "epoch": 0.32, + "learning_rate": 2.0683600608685471e-07, + "logits/chosen": -2.4992730617523193, + "logits/rejected": -2.632270097732544, + "logps/chosen": -394.5458984375, + "logps/rejected": -342.995361328125, + "loss": 0.3371, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6904540061950684, + "rewards/margins": 1.6202902793884277, + "rewards/rejected": -3.310744285583496, + "step": 2784 + }, + { + "epoch": 0.32, + "learning_rate": 2.0680088961723047e-07, + "logits/chosen": -1.828450322151184, + "logits/rejected": -2.286813735961914, + "logps/chosen": -584.800537109375, + "logps/rejected": -440.856689453125, + "loss": 0.3148, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7122742533683777, + "rewards/margins": 1.8270806074142456, + "rewards/rejected": -2.5393550395965576, + "step": 2785 + }, + { + "epoch": 0.32, + "learning_rate": 2.067657731476062e-07, + "logits/chosen": -2.3486790657043457, + "logits/rejected": -2.4072928428649902, + "logps/chosen": -306.95831298828125, + "logps/rejected": -177.257080078125, + "loss": 0.2996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0405815988779068, + "rewards/margins": 1.154096245765686, + "rewards/rejected": -1.1946778297424316, + "step": 2786 + }, + { + "epoch": 0.32, + "learning_rate": 2.0673065667798198e-07, + "logits/chosen": -1.9737977981567383, + "logits/rejected": -1.6402769088745117, + "logps/chosen": -195.66514587402344, + "logps/rejected": -220.43714904785156, + "loss": 0.7869, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1850486993789673, + "rewards/margins": 0.8313877582550049, + "rewards/rejected": -2.0164365768432617, + "step": 2787 + }, + { + "epoch": 0.32, + "learning_rate": 2.0669554020835773e-07, + "logits/chosen": -2.216043472290039, + "logits/rejected": -2.3602945804595947, + "logps/chosen": -187.80499267578125, + "logps/rejected": -268.6779479980469, + "loss": 0.4925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.708319902420044, + "rewards/margins": 3.172084331512451, + "rewards/rejected": -3.880404233932495, + "step": 2788 + }, + { + "epoch": 0.32, + "learning_rate": 2.0666042373873346e-07, + "logits/chosen": -2.444149971008301, + "logits/rejected": -2.321715831756592, + "logps/chosen": -253.6239471435547, + "logps/rejected": -261.3524169921875, + "loss": 0.2927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6678363680839539, + "rewards/margins": 2.691293716430664, + "rewards/rejected": -3.3591301441192627, + "step": 2789 + }, + { + "epoch": 0.32, + "learning_rate": 2.066253072691092e-07, + "logits/chosen": -2.5075149536132812, + "logits/rejected": -2.4562602043151855, + "logps/chosen": -242.17733764648438, + "logps/rejected": -292.25634765625, + "loss": 0.5532, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3215270042419434, + "rewards/margins": 1.0539863109588623, + "rewards/rejected": -2.3755133152008057, + "step": 2790 + }, + { + "epoch": 0.32, + "learning_rate": 2.0659019079948496e-07, + "logits/chosen": -2.3591413497924805, + "logits/rejected": -2.0096240043640137, + "logps/chosen": -294.7116394042969, + "logps/rejected": -256.2259521484375, + "loss": 0.6171, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8978222608566284, + "rewards/margins": 1.2585512399673462, + "rewards/rejected": -2.1563735008239746, + "step": 2791 + }, + { + "epoch": 0.32, + "learning_rate": 2.065550743298607e-07, + "logits/chosen": -2.6222338676452637, + "logits/rejected": -2.831596612930298, + "logps/chosen": -421.56756591796875, + "logps/rejected": -368.50360107421875, + "loss": 0.4965, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6074007749557495, + "rewards/margins": 1.0608201026916504, + "rewards/rejected": -1.6682207584381104, + "step": 2792 + }, + { + "epoch": 0.32, + "learning_rate": 2.0651995786023645e-07, + "logits/chosen": -2.0736749172210693, + "logits/rejected": -1.9905030727386475, + "logps/chosen": -143.58346557617188, + "logps/rejected": -274.9546203613281, + "loss": 0.5932, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1062467098236084, + "rewards/margins": 1.9501123428344727, + "rewards/rejected": -3.05635929107666, + "step": 2793 + }, + { + "epoch": 0.32, + "learning_rate": 2.0648484139061217e-07, + "logits/chosen": -1.9212554693222046, + "logits/rejected": -1.7336888313293457, + "logps/chosen": -466.44476318359375, + "logps/rejected": -331.36492919921875, + "loss": 0.4002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2947399616241455, + "rewards/margins": 1.4003524780273438, + "rewards/rejected": -2.6950924396514893, + "step": 2794 + }, + { + "epoch": 0.32, + "learning_rate": 2.0644972492098793e-07, + "logits/chosen": -1.7927520275115967, + "logits/rejected": -2.1273698806762695, + "logps/chosen": -391.6963806152344, + "logps/rejected": -307.1801452636719, + "loss": 0.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7334555387496948, + "rewards/margins": 3.780369520187378, + "rewards/rejected": -4.513824939727783, + "step": 2795 + }, + { + "epoch": 0.32, + "learning_rate": 2.0641460845136368e-07, + "logits/chosen": -1.6313408613204956, + "logits/rejected": -1.5328811407089233, + "logps/chosen": -298.7473449707031, + "logps/rejected": -318.803955078125, + "loss": 0.5633, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4613107442855835, + "rewards/margins": 0.8413810729980469, + "rewards/rejected": -1.30269193649292, + "step": 2796 + }, + { + "epoch": 0.32, + "learning_rate": 2.063794919817394e-07, + "logits/chosen": -2.714010000228882, + "logits/rejected": -2.6394193172454834, + "logps/chosen": -214.1091766357422, + "logps/rejected": -238.49342346191406, + "loss": 0.3447, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3729040622711182, + "rewards/margins": 1.7432029247283936, + "rewards/rejected": -3.116107225418091, + "step": 2797 + }, + { + "epoch": 0.32, + "learning_rate": 2.063443755121152e-07, + "logits/chosen": -2.713062286376953, + "logits/rejected": -2.823172092437744, + "logps/chosen": -303.9522705078125, + "logps/rejected": -133.38128662109375, + "loss": 0.555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2137759923934937, + "rewards/margins": 1.1374900341033936, + "rewards/rejected": -2.3512659072875977, + "step": 2798 + }, + { + "epoch": 0.32, + "learning_rate": 2.0630925904249094e-07, + "logits/chosen": -1.666973352432251, + "logits/rejected": -1.7350399494171143, + "logps/chosen": -322.07232666015625, + "logps/rejected": -299.19775390625, + "loss": 0.4725, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6800762414932251, + "rewards/margins": 1.2280287742614746, + "rewards/rejected": -1.9081051349639893, + "step": 2799 + }, + { + "epoch": 0.32, + "learning_rate": 2.0627414257286667e-07, + "logits/chosen": -2.138179063796997, + "logits/rejected": -2.2214443683624268, + "logps/chosen": -261.5570068359375, + "logps/rejected": -260.4154968261719, + "loss": 0.3388, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5560028553009033, + "rewards/margins": 2.12715744972229, + "rewards/rejected": -2.6831603050231934, + "step": 2800 + }, + { + "epoch": 0.32, + "learning_rate": 2.0623902610324242e-07, + "logits/chosen": -2.261716604232788, + "logits/rejected": -2.24371337890625, + "logps/chosen": -464.9439697265625, + "logps/rejected": -445.88983154296875, + "loss": 0.2491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7503080368041992, + "rewards/margins": 1.9594823122024536, + "rewards/rejected": -2.7097902297973633, + "step": 2801 + }, + { + "epoch": 0.32, + "learning_rate": 2.0620390963361815e-07, + "logits/chosen": -1.8163368701934814, + "logits/rejected": -1.9761792421340942, + "logps/chosen": -285.9539489746094, + "logps/rejected": -308.7342834472656, + "loss": 0.5096, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3578909635543823, + "rewards/margins": 1.1329578161239624, + "rewards/rejected": -2.490849018096924, + "step": 2802 + }, + { + "epoch": 0.32, + "learning_rate": 2.061687931639939e-07, + "logits/chosen": -2.409147262573242, + "logits/rejected": -2.487764358520508, + "logps/chosen": -179.51495361328125, + "logps/rejected": -234.28823852539062, + "loss": 0.4009, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47893163561820984, + "rewards/margins": 1.9303845167160034, + "rewards/rejected": -2.409316301345825, + "step": 2803 + }, + { + "epoch": 0.32, + "learning_rate": 2.0613367669436966e-07, + "logits/chosen": -1.7404470443725586, + "logits/rejected": -1.7209104299545288, + "logps/chosen": -501.8316650390625, + "logps/rejected": -430.7516784667969, + "loss": 0.7456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8086612224578857, + "rewards/margins": 1.3776980638504028, + "rewards/rejected": -2.186359405517578, + "step": 2804 + }, + { + "epoch": 0.32, + "learning_rate": 2.0609856022474539e-07, + "logits/chosen": -2.497485637664795, + "logits/rejected": -2.6663975715637207, + "logps/chosen": -234.064697265625, + "logps/rejected": -181.51541137695312, + "loss": 0.3851, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.560634434223175, + "rewards/margins": 2.7057080268859863, + "rewards/rejected": -3.2663426399230957, + "step": 2805 + }, + { + "epoch": 0.32, + "learning_rate": 2.0606344375512114e-07, + "logits/chosen": -2.2672672271728516, + "logits/rejected": -2.121419906616211, + "logps/chosen": -187.91529846191406, + "logps/rejected": -270.7955322265625, + "loss": 0.4421, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5237075090408325, + "rewards/margins": 1.2457720041275024, + "rewards/rejected": -1.769479513168335, + "step": 2806 + }, + { + "epoch": 0.32, + "learning_rate": 2.0602832728549687e-07, + "logits/chosen": -1.8687100410461426, + "logits/rejected": -2.0408339500427246, + "logps/chosen": -260.23223876953125, + "logps/rejected": -201.67562866210938, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23344728350639343, + "rewards/margins": 1.3902812004089355, + "rewards/rejected": -1.623728632926941, + "step": 2807 + }, + { + "epoch": 0.32, + "learning_rate": 2.0599321081587262e-07, + "logits/chosen": -2.928806781768799, + "logits/rejected": -2.9757349491119385, + "logps/chosen": -446.5527648925781, + "logps/rejected": -213.8297119140625, + "loss": 0.251, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.351070523262024, + "rewards/margins": 1.9877254962921143, + "rewards/rejected": -3.3387959003448486, + "step": 2808 + }, + { + "epoch": 0.32, + "learning_rate": 2.059580943462484e-07, + "logits/chosen": -2.1445350646972656, + "logits/rejected": -2.018825054168701, + "logps/chosen": -164.68650817871094, + "logps/rejected": -160.40101623535156, + "loss": 0.4108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7433695793151855, + "rewards/margins": 1.0805847644805908, + "rewards/rejected": -1.8239542245864868, + "step": 2809 + }, + { + "epoch": 0.32, + "learning_rate": 2.059229778766241e-07, + "logits/chosen": -2.4554994106292725, + "logits/rejected": -2.3978798389434814, + "logps/chosen": -201.5494842529297, + "logps/rejected": -212.23316955566406, + "loss": 0.2712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3049039840698242, + "rewards/margins": 2.2487807273864746, + "rewards/rejected": -2.5536844730377197, + "step": 2810 + }, + { + "epoch": 0.32, + "learning_rate": 2.0588786140699988e-07, + "logits/chosen": -2.4106903076171875, + "logits/rejected": -2.6524386405944824, + "logps/chosen": -317.23468017578125, + "logps/rejected": -254.5279998779297, + "loss": 0.4553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8768071532249451, + "rewards/margins": 1.1348812580108643, + "rewards/rejected": -2.011688470840454, + "step": 2811 + }, + { + "epoch": 0.32, + "learning_rate": 2.0585274493737564e-07, + "logits/chosen": -2.5041868686676025, + "logits/rejected": -2.532857656478882, + "logps/chosen": -285.6216735839844, + "logps/rejected": -291.039306640625, + "loss": 0.3309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48023277521133423, + "rewards/margins": 1.460720181465149, + "rewards/rejected": -1.940953016281128, + "step": 2812 + }, + { + "epoch": 0.32, + "learning_rate": 2.0581762846775136e-07, + "logits/chosen": -1.7581068277359009, + "logits/rejected": -1.9536230564117432, + "logps/chosen": -383.1068115234375, + "logps/rejected": -246.9056854248047, + "loss": 0.4652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6544975638389587, + "rewards/margins": 0.9875466823577881, + "rewards/rejected": -1.6420443058013916, + "step": 2813 + }, + { + "epoch": 0.32, + "learning_rate": 2.0578251199812712e-07, + "logits/chosen": -2.3962833881378174, + "logits/rejected": -2.3548169136047363, + "logps/chosen": -231.21200561523438, + "logps/rejected": -233.15921020507812, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3468739986419678, + "rewards/margins": 1.5498276948928833, + "rewards/rejected": -2.8967018127441406, + "step": 2814 + }, + { + "epoch": 0.32, + "learning_rate": 2.0574739552850285e-07, + "logits/chosen": -2.3479769229888916, + "logits/rejected": -2.5222327709198, + "logps/chosen": -144.77737426757812, + "logps/rejected": -228.61209106445312, + "loss": 0.4666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9404130578041077, + "rewards/margins": 1.5554449558258057, + "rewards/rejected": -2.4958579540252686, + "step": 2815 + }, + { + "epoch": 0.32, + "learning_rate": 2.057122790588786e-07, + "logits/chosen": -2.262535572052002, + "logits/rejected": -2.3920540809631348, + "logps/chosen": -226.24996948242188, + "logps/rejected": -174.08798217773438, + "loss": 0.2482, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6959034204483032, + "rewards/margins": 1.7760910987854004, + "rewards/rejected": -2.471994638442993, + "step": 2816 + }, + { + "epoch": 0.32, + "learning_rate": 2.0567716258925435e-07, + "logits/chosen": -1.866633415222168, + "logits/rejected": -1.9641683101654053, + "logps/chosen": -329.04498291015625, + "logps/rejected": -268.97845458984375, + "loss": 0.3307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8891684412956238, + "rewards/margins": 2.322479724884033, + "rewards/rejected": -3.211648464202881, + "step": 2817 + }, + { + "epoch": 0.32, + "learning_rate": 2.0564204611963008e-07, + "logits/chosen": -2.449808120727539, + "logits/rejected": -2.5047802925109863, + "logps/chosen": -352.43597412109375, + "logps/rejected": -262.5508117675781, + "loss": 0.316, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11598965525627136, + "rewards/margins": 1.7915031909942627, + "rewards/rejected": -1.9074926376342773, + "step": 2818 + }, + { + "epoch": 0.32, + "learning_rate": 2.0560692965000583e-07, + "logits/chosen": -1.7498573064804077, + "logits/rejected": -1.9374022483825684, + "logps/chosen": -325.10302734375, + "logps/rejected": -268.709228515625, + "loss": 0.6289, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0045548677444458, + "rewards/margins": 0.9390447735786438, + "rewards/rejected": -1.9435997009277344, + "step": 2819 + }, + { + "epoch": 0.33, + "learning_rate": 2.0557181318038161e-07, + "logits/chosen": -2.784106492996216, + "logits/rejected": -2.7919187545776367, + "logps/chosen": -125.2245864868164, + "logps/rejected": -190.70556640625, + "loss": 0.4999, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9810779690742493, + "rewards/margins": 1.3581202030181885, + "rewards/rejected": -2.339198112487793, + "step": 2820 + }, + { + "epoch": 0.33, + "learning_rate": 2.0553669671075734e-07, + "logits/chosen": -2.070533037185669, + "logits/rejected": -2.494708776473999, + "logps/chosen": -341.3285217285156, + "logps/rejected": -330.72308349609375, + "loss": 0.1985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19729994237422943, + "rewards/margins": 3.051875114440918, + "rewards/rejected": -3.2491748332977295, + "step": 2821 + }, + { + "epoch": 0.33, + "learning_rate": 2.055015802411331e-07, + "logits/chosen": -1.9524211883544922, + "logits/rejected": -2.0892813205718994, + "logps/chosen": -252.6607666015625, + "logps/rejected": -274.42852783203125, + "loss": 0.3109, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5228258371353149, + "rewards/margins": 2.856675624847412, + "rewards/rejected": -3.3795013427734375, + "step": 2822 + }, + { + "epoch": 0.33, + "learning_rate": 2.0546646377150882e-07, + "logits/chosen": -1.9011764526367188, + "logits/rejected": -1.8191887140274048, + "logps/chosen": -273.37664794921875, + "logps/rejected": -270.3541259765625, + "loss": 0.5043, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5181440711021423, + "rewards/margins": 1.011081576347351, + "rewards/rejected": -1.5292255878448486, + "step": 2823 + }, + { + "epoch": 0.33, + "learning_rate": 2.0543134730188458e-07, + "logits/chosen": -2.011469841003418, + "logits/rejected": -2.0934503078460693, + "logps/chosen": -203.09791564941406, + "logps/rejected": -174.18551635742188, + "loss": 0.8296, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.539879322052002, + "rewards/margins": 0.8402372598648071, + "rewards/rejected": -2.3801167011260986, + "step": 2824 + }, + { + "epoch": 0.33, + "learning_rate": 2.0539623083226033e-07, + "logits/chosen": -2.4901652336120605, + "logits/rejected": -2.1708555221557617, + "logps/chosen": -304.8403015136719, + "logps/rejected": -390.50341796875, + "loss": 0.3175, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0001323223114014, + "rewards/margins": 2.390526294708252, + "rewards/rejected": -3.3906588554382324, + "step": 2825 + }, + { + "epoch": 0.33, + "learning_rate": 2.0536111436263606e-07, + "logits/chosen": -2.2193682193756104, + "logits/rejected": -2.380652666091919, + "logps/chosen": -385.1395568847656, + "logps/rejected": -427.7710266113281, + "loss": 0.3842, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19990544021129608, + "rewards/margins": 2.839085578918457, + "rewards/rejected": -3.0389909744262695, + "step": 2826 + }, + { + "epoch": 0.33, + "learning_rate": 2.053259978930118e-07, + "logits/chosen": -2.0486440658569336, + "logits/rejected": -1.9519922733306885, + "logps/chosen": -268.221435546875, + "logps/rejected": -237.53341674804688, + "loss": 0.4503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6829490661621094, + "rewards/margins": 1.1053416728973389, + "rewards/rejected": -1.7882906198501587, + "step": 2827 + }, + { + "epoch": 0.33, + "learning_rate": 2.0529088142338757e-07, + "logits/chosen": -2.6186039447784424, + "logits/rejected": -2.6589419841766357, + "logps/chosen": -131.64614868164062, + "logps/rejected": -138.74241638183594, + "loss": 0.4494, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2739458978176117, + "rewards/margins": 1.563845157623291, + "rewards/rejected": -1.8377909660339355, + "step": 2828 + }, + { + "epoch": 0.33, + "learning_rate": 2.052557649537633e-07, + "logits/chosen": -2.6231484413146973, + "logits/rejected": -2.699415922164917, + "logps/chosen": -105.48736572265625, + "logps/rejected": -197.88702392578125, + "loss": 0.6855, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24578416347503662, + "rewards/margins": 0.7004776000976562, + "rewards/rejected": -0.9462618231773376, + "step": 2829 + }, + { + "epoch": 0.33, + "learning_rate": 2.0522064848413905e-07, + "logits/chosen": -2.6489882469177246, + "logits/rejected": -2.7951607704162598, + "logps/chosen": -151.13046264648438, + "logps/rejected": -271.62469482421875, + "loss": 0.5554, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.89048433303833, + "rewards/margins": 1.6624692678451538, + "rewards/rejected": -3.5529532432556152, + "step": 2830 + }, + { + "epoch": 0.33, + "learning_rate": 2.0518553201451477e-07, + "logits/chosen": -2.732175827026367, + "logits/rejected": -2.1224193572998047, + "logps/chosen": -223.2967529296875, + "logps/rejected": -313.82080078125, + "loss": 0.4161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8891289234161377, + "rewards/margins": 2.424548387527466, + "rewards/rejected": -3.3136773109436035, + "step": 2831 + }, + { + "epoch": 0.33, + "learning_rate": 2.0515041554489055e-07, + "logits/chosen": -2.070732355117798, + "logits/rejected": -2.2711431980133057, + "logps/chosen": -540.6103515625, + "logps/rejected": -468.838623046875, + "loss": 0.5289, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9957302212715149, + "rewards/margins": 1.1931248903274536, + "rewards/rejected": -2.1888551712036133, + "step": 2832 + }, + { + "epoch": 0.33, + "learning_rate": 2.051152990752663e-07, + "logits/chosen": -2.6039695739746094, + "logits/rejected": -2.682796001434326, + "logps/chosen": -291.67108154296875, + "logps/rejected": -250.29000854492188, + "loss": 0.4467, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6754062175750732, + "rewards/margins": 1.3119275569915771, + "rewards/rejected": -1.9873337745666504, + "step": 2833 + }, + { + "epoch": 0.33, + "learning_rate": 2.0508018260564204e-07, + "logits/chosen": -2.379230499267578, + "logits/rejected": -2.482743501663208, + "logps/chosen": -343.6300048828125, + "logps/rejected": -337.2133483886719, + "loss": 0.1893, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1643267720937729, + "rewards/margins": 2.49362850189209, + "rewards/rejected": -2.6579554080963135, + "step": 2834 + }, + { + "epoch": 0.33, + "learning_rate": 2.050450661360178e-07, + "logits/chosen": -2.0484325885772705, + "logits/rejected": -1.9364073276519775, + "logps/chosen": -203.6300506591797, + "logps/rejected": -249.9018096923828, + "loss": 0.3806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23294204473495483, + "rewards/margins": 2.0436174869537354, + "rewards/rejected": -2.276559591293335, + "step": 2835 + }, + { + "epoch": 0.33, + "learning_rate": 2.0500994966639354e-07, + "logits/chosen": -2.341432571411133, + "logits/rejected": -2.416940212249756, + "logps/chosen": -292.64898681640625, + "logps/rejected": -275.5111389160156, + "loss": 0.2891, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5626662969589233, + "rewards/margins": 1.8369733095169067, + "rewards/rejected": -2.39963960647583, + "step": 2836 + }, + { + "epoch": 0.33, + "learning_rate": 2.0497483319676927e-07, + "logits/chosen": -2.3278791904449463, + "logits/rejected": -2.4663162231445312, + "logps/chosen": -318.59796142578125, + "logps/rejected": -242.07650756835938, + "loss": 0.4324, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8004372119903564, + "rewards/margins": 0.8159778714179993, + "rewards/rejected": -2.616415023803711, + "step": 2837 + }, + { + "epoch": 0.33, + "learning_rate": 2.0493971672714503e-07, + "logits/chosen": -1.904645562171936, + "logits/rejected": -2.2002573013305664, + "logps/chosen": -289.3607482910156, + "logps/rejected": -189.69009399414062, + "loss": 0.5397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7015918493270874, + "rewards/margins": 1.3400883674621582, + "rewards/rejected": -2.041680335998535, + "step": 2838 + }, + { + "epoch": 0.33, + "learning_rate": 2.0490460025752075e-07, + "logits/chosen": -2.6167845726013184, + "logits/rejected": -2.5547518730163574, + "logps/chosen": -248.96849060058594, + "logps/rejected": -174.1318359375, + "loss": 0.7972, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2115131616592407, + "rewards/margins": 1.3982458114624023, + "rewards/rejected": -2.6097590923309326, + "step": 2839 + }, + { + "epoch": 0.33, + "learning_rate": 2.048694837878965e-07, + "logits/chosen": -2.745065689086914, + "logits/rejected": -2.6712639331817627, + "logps/chosen": -147.23486328125, + "logps/rejected": -233.73953247070312, + "loss": 0.7337, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3562252521514893, + "rewards/margins": 1.7394148111343384, + "rewards/rejected": -3.095640182495117, + "step": 2840 + }, + { + "epoch": 0.33, + "learning_rate": 2.0483436731827226e-07, + "logits/chosen": -2.253904104232788, + "logits/rejected": -2.21041202545166, + "logps/chosen": -458.8148193359375, + "logps/rejected": -361.0691223144531, + "loss": 0.3401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29131412506103516, + "rewards/margins": 2.660658121109009, + "rewards/rejected": -2.951972246170044, + "step": 2841 + }, + { + "epoch": 0.33, + "learning_rate": 2.04799250848648e-07, + "logits/chosen": -2.1048202514648438, + "logits/rejected": -2.1107068061828613, + "logps/chosen": -369.31427001953125, + "logps/rejected": -322.2354736328125, + "loss": 0.3181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2911553680896759, + "rewards/margins": 1.4580858945846558, + "rewards/rejected": -1.7492413520812988, + "step": 2842 + }, + { + "epoch": 0.33, + "learning_rate": 2.0476413437902377e-07, + "logits/chosen": -2.187194585800171, + "logits/rejected": -2.0532515048980713, + "logps/chosen": -367.7154541015625, + "logps/rejected": -355.00592041015625, + "loss": 0.4835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22309523820877075, + "rewards/margins": 1.0071748495101929, + "rewards/rejected": -1.2302701473236084, + "step": 2843 + }, + { + "epoch": 0.33, + "learning_rate": 2.0472901790939952e-07, + "logits/chosen": -2.3346993923187256, + "logits/rejected": -2.410022497177124, + "logps/chosen": -348.9272766113281, + "logps/rejected": -349.5078430175781, + "loss": 0.2484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1517561674118042, + "rewards/margins": 1.950699806213379, + "rewards/rejected": -3.1024560928344727, + "step": 2844 + }, + { + "epoch": 0.33, + "learning_rate": 2.0469390143977525e-07, + "logits/chosen": -2.257385015487671, + "logits/rejected": -2.1613974571228027, + "logps/chosen": -334.4905700683594, + "logps/rejected": -232.07308959960938, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8518685102462769, + "rewards/margins": 2.374648332595825, + "rewards/rejected": -3.2265167236328125, + "step": 2845 + }, + { + "epoch": 0.33, + "learning_rate": 2.04658784970151e-07, + "logits/chosen": -2.3396599292755127, + "logits/rejected": -2.1178925037384033, + "logps/chosen": -192.92831420898438, + "logps/rejected": -276.3161315917969, + "loss": 0.5228, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9736926555633545, + "rewards/margins": 0.9151672124862671, + "rewards/rejected": -1.888859748840332, + "step": 2846 + }, + { + "epoch": 0.33, + "learning_rate": 2.0462366850052673e-07, + "logits/chosen": -2.175097703933716, + "logits/rejected": -2.227154016494751, + "logps/chosen": -195.5404815673828, + "logps/rejected": -256.2129211425781, + "loss": 0.5282, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1363568305969238, + "rewards/margins": 0.8726077675819397, + "rewards/rejected": -2.008964776992798, + "step": 2847 + }, + { + "epoch": 0.33, + "learning_rate": 2.0458855203090248e-07, + "logits/chosen": -2.0490989685058594, + "logits/rejected": -2.2978994846343994, + "logps/chosen": -488.8940124511719, + "logps/rejected": -274.25811767578125, + "loss": 0.2378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3807342052459717, + "rewards/margins": 1.6776578426361084, + "rewards/rejected": -2.05839204788208, + "step": 2848 + }, + { + "epoch": 0.33, + "learning_rate": 2.0455343556127824e-07, + "logits/chosen": -2.745628595352173, + "logits/rejected": -2.778799533843994, + "logps/chosen": -117.11520385742188, + "logps/rejected": -160.67776489257812, + "loss": 0.8776, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4219130277633667, + "rewards/margins": 1.3561739921569824, + "rewards/rejected": -2.7780871391296387, + "step": 2849 + }, + { + "epoch": 0.33, + "learning_rate": 2.0451831909165397e-07, + "logits/chosen": -2.4354026317596436, + "logits/rejected": -2.453993558883667, + "logps/chosen": -260.27001953125, + "logps/rejected": -286.08001708984375, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.367820143699646, + "rewards/margins": 2.3529112339019775, + "rewards/rejected": -2.720731496810913, + "step": 2850 + }, + { + "epoch": 0.33, + "learning_rate": 2.0448320262202972e-07, + "logits/chosen": -2.1885712146759033, + "logits/rejected": -1.9731658697128296, + "logps/chosen": -240.25550842285156, + "logps/rejected": -374.0572204589844, + "loss": 0.2794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3478146493434906, + "rewards/margins": 1.7931917905807495, + "rewards/rejected": -2.1410062313079834, + "step": 2851 + }, + { + "epoch": 0.33, + "learning_rate": 2.0444808615240545e-07, + "logits/chosen": -1.753225564956665, + "logits/rejected": -2.175724983215332, + "logps/chosen": -338.32366943359375, + "logps/rejected": -337.73419189453125, + "loss": 0.4828, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3851308822631836, + "rewards/margins": 1.2185851335525513, + "rewards/rejected": -2.6037161350250244, + "step": 2852 + }, + { + "epoch": 0.33, + "learning_rate": 2.044129696827812e-07, + "logits/chosen": -2.3378682136535645, + "logits/rejected": -2.5116302967071533, + "logps/chosen": -326.7639465332031, + "logps/rejected": -302.9674072265625, + "loss": 0.3533, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6194921731948853, + "rewards/margins": 2.33389949798584, + "rewards/rejected": -2.9533917903900146, + "step": 2853 + }, + { + "epoch": 0.33, + "learning_rate": 2.0437785321315698e-07, + "logits/chosen": -1.845043659210205, + "logits/rejected": -1.9659678936004639, + "logps/chosen": -272.596923828125, + "logps/rejected": -253.22601318359375, + "loss": 0.3214, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13737305998802185, + "rewards/margins": 2.391805648803711, + "rewards/rejected": -2.5291788578033447, + "step": 2854 + }, + { + "epoch": 0.33, + "learning_rate": 2.043427367435327e-07, + "logits/chosen": -2.496135711669922, + "logits/rejected": -2.6179635524749756, + "logps/chosen": -376.3818664550781, + "logps/rejected": -439.299072265625, + "loss": 0.16, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4073753356933594, + "rewards/margins": 3.093784809112549, + "rewards/rejected": -3.5011603832244873, + "step": 2855 + }, + { + "epoch": 0.33, + "learning_rate": 2.0430762027390846e-07, + "logits/chosen": -1.9697332382202148, + "logits/rejected": -1.8054184913635254, + "logps/chosen": -277.6934814453125, + "logps/rejected": -385.9830322265625, + "loss": 0.3647, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4848102331161499, + "rewards/margins": 3.300565719604492, + "rewards/rejected": -3.7853758335113525, + "step": 2856 + }, + { + "epoch": 0.33, + "learning_rate": 2.0427250380428422e-07, + "logits/chosen": -2.3381009101867676, + "logits/rejected": -2.4930684566497803, + "logps/chosen": -86.46773529052734, + "logps/rejected": -145.504638671875, + "loss": 0.3461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31245607137680054, + "rewards/margins": 3.1559906005859375, + "rewards/rejected": -3.468446731567383, + "step": 2857 + }, + { + "epoch": 0.33, + "learning_rate": 2.0423738733465994e-07, + "logits/chosen": -2.795861005783081, + "logits/rejected": -2.8424415588378906, + "logps/chosen": -228.2517547607422, + "logps/rejected": -275.5364685058594, + "loss": 0.2872, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0449683666229248, + "rewards/margins": 2.911404609680176, + "rewards/rejected": -3.9563732147216797, + "step": 2858 + }, + { + "epoch": 0.33, + "learning_rate": 2.042022708650357e-07, + "logits/chosen": -2.0974512100219727, + "logits/rejected": -2.2250101566314697, + "logps/chosen": -238.060546875, + "logps/rejected": -194.78965759277344, + "loss": 1.6286, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.625652551651001, + "rewards/margins": -0.6118791103363037, + "rewards/rejected": -2.0137734413146973, + "step": 2859 + }, + { + "epoch": 0.33, + "learning_rate": 2.0416715439541142e-07, + "logits/chosen": -2.581334114074707, + "logits/rejected": -2.2934041023254395, + "logps/chosen": -216.15293884277344, + "logps/rejected": -288.8791198730469, + "loss": 0.766, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7626200318336487, + "rewards/margins": 0.5555888414382935, + "rewards/rejected": -1.3182088136672974, + "step": 2860 + }, + { + "epoch": 0.33, + "learning_rate": 2.0413203792578718e-07, + "logits/chosen": -1.8508278131484985, + "logits/rejected": -1.9090347290039062, + "logps/chosen": -446.4744567871094, + "logps/rejected": -330.3827209472656, + "loss": 0.5084, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2094364166259766, + "rewards/margins": 1.3145465850830078, + "rewards/rejected": -2.5239832401275635, + "step": 2861 + }, + { + "epoch": 0.33, + "learning_rate": 2.0409692145616293e-07, + "logits/chosen": -2.806244134902954, + "logits/rejected": -2.6509361267089844, + "logps/chosen": -194.46466064453125, + "logps/rejected": -227.72421264648438, + "loss": 0.6415, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4848140478134155, + "rewards/margins": 1.6550371646881104, + "rewards/rejected": -3.1398510932922363, + "step": 2862 + }, + { + "epoch": 0.33, + "learning_rate": 2.0406180498653866e-07, + "logits/chosen": -2.380359411239624, + "logits/rejected": -2.2486886978149414, + "logps/chosen": -189.423095703125, + "logps/rejected": -245.34609985351562, + "loss": 0.2572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.854651927947998, + "rewards/margins": 1.9184538125991821, + "rewards/rejected": -2.7731058597564697, + "step": 2863 + }, + { + "epoch": 0.33, + "learning_rate": 2.0402668851691441e-07, + "logits/chosen": -2.2332024574279785, + "logits/rejected": -2.321237087249756, + "logps/chosen": -434.17340087890625, + "logps/rejected": -193.56263732910156, + "loss": 0.56, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6656059622764587, + "rewards/margins": 1.1301661729812622, + "rewards/rejected": -1.7957720756530762, + "step": 2864 + }, + { + "epoch": 0.33, + "learning_rate": 2.039915720472902e-07, + "logits/chosen": -2.080822706222534, + "logits/rejected": -2.238271951675415, + "logps/chosen": -324.06640625, + "logps/rejected": -352.549560546875, + "loss": 0.3828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7526441812515259, + "rewards/margins": 2.4786198139190674, + "rewards/rejected": -3.2312638759613037, + "step": 2865 + }, + { + "epoch": 0.33, + "learning_rate": 2.0395645557766592e-07, + "logits/chosen": -1.9438047409057617, + "logits/rejected": -2.0115480422973633, + "logps/chosen": -303.61407470703125, + "logps/rejected": -282.852783203125, + "loss": 0.3591, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.947278618812561, + "rewards/margins": 2.33723783493042, + "rewards/rejected": -3.2845163345336914, + "step": 2866 + }, + { + "epoch": 0.33, + "learning_rate": 2.0392133910804168e-07, + "logits/chosen": -3.1680166721343994, + "logits/rejected": -3.163942337036133, + "logps/chosen": -390.8086853027344, + "logps/rejected": -313.02630615234375, + "loss": 0.2172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14448480308055878, + "rewards/margins": 2.0962042808532715, + "rewards/rejected": -1.9517192840576172, + "step": 2867 + }, + { + "epoch": 0.33, + "learning_rate": 2.038862226384174e-07, + "logits/chosen": -2.961618423461914, + "logits/rejected": -3.022810935974121, + "logps/chosen": -281.43902587890625, + "logps/rejected": -255.80355834960938, + "loss": 0.4446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9353864789009094, + "rewards/margins": 1.2772279977798462, + "rewards/rejected": -2.2126145362854004, + "step": 2868 + }, + { + "epoch": 0.33, + "learning_rate": 2.0385110616879316e-07, + "logits/chosen": -2.381213665008545, + "logits/rejected": -2.3822734355926514, + "logps/chosen": -298.6895751953125, + "logps/rejected": -258.2838439941406, + "loss": 1.223, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.340366244316101, + "rewards/margins": -0.46181821823120117, + "rewards/rejected": -0.8785480260848999, + "step": 2869 + }, + { + "epoch": 0.33, + "learning_rate": 2.038159896991689e-07, + "logits/chosen": -2.1536262035369873, + "logits/rejected": -2.3405826091766357, + "logps/chosen": -490.6483459472656, + "logps/rejected": -394.0606689453125, + "loss": 0.5519, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2764008045196533, + "rewards/margins": 1.1871336698532104, + "rewards/rejected": -2.4635345935821533, + "step": 2870 + }, + { + "epoch": 0.33, + "learning_rate": 2.0378087322954464e-07, + "logits/chosen": -2.3030900955200195, + "logits/rejected": -1.8744165897369385, + "logps/chosen": -338.2276916503906, + "logps/rejected": -421.44097900390625, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3731119632720947, + "rewards/margins": 3.085303544998169, + "rewards/rejected": -4.458415508270264, + "step": 2871 + }, + { + "epoch": 0.33, + "learning_rate": 2.037457567599204e-07, + "logits/chosen": -2.269686698913574, + "logits/rejected": -2.2825307846069336, + "logps/chosen": -217.49136352539062, + "logps/rejected": -248.01724243164062, + "loss": 0.3782, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4759656488895416, + "rewards/margins": 3.4343581199645996, + "rewards/rejected": -3.9103236198425293, + "step": 2872 + }, + { + "epoch": 0.33, + "learning_rate": 2.0371064029029615e-07, + "logits/chosen": -2.2259130477905273, + "logits/rejected": -2.036548376083374, + "logps/chosen": -198.03732299804688, + "logps/rejected": -259.62176513671875, + "loss": 0.9966, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7704288363456726, + "rewards/margins": 0.2779972553253174, + "rewards/rejected": -1.0484260320663452, + "step": 2873 + }, + { + "epoch": 0.33, + "learning_rate": 2.0367552382067187e-07, + "logits/chosen": -1.810755729675293, + "logits/rejected": -2.185839891433716, + "logps/chosen": -285.3037414550781, + "logps/rejected": -285.96234130859375, + "loss": 0.3826, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0276544094085693, + "rewards/margins": 2.081320285797119, + "rewards/rejected": -3.1089746952056885, + "step": 2874 + }, + { + "epoch": 0.33, + "learning_rate": 2.0364040735104763e-07, + "logits/chosen": -2.200808048248291, + "logits/rejected": -2.297011137008667, + "logps/chosen": -443.4654541015625, + "logps/rejected": -352.13677978515625, + "loss": 0.3525, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5057308673858643, + "rewards/margins": 2.7963171005249023, + "rewards/rejected": -4.3020477294921875, + "step": 2875 + }, + { + "epoch": 0.33, + "learning_rate": 2.0360529088142335e-07, + "logits/chosen": -2.4129703044891357, + "logits/rejected": -2.4712610244750977, + "logps/chosen": -164.4167938232422, + "logps/rejected": -179.05355834960938, + "loss": 0.4273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33208975195884705, + "rewards/margins": 1.6704142093658447, + "rewards/rejected": -2.0025041103363037, + "step": 2876 + }, + { + "epoch": 0.33, + "learning_rate": 2.0357017441179913e-07, + "logits/chosen": -2.4376468658447266, + "logits/rejected": -2.0977187156677246, + "logps/chosen": -186.15892028808594, + "logps/rejected": -260.1129455566406, + "loss": 0.4149, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5838981866836548, + "rewards/margins": 1.592588186264038, + "rewards/rejected": -2.1764862537384033, + "step": 2877 + }, + { + "epoch": 0.33, + "learning_rate": 2.035350579421749e-07, + "logits/chosen": -2.5687224864959717, + "logits/rejected": -2.516174554824829, + "logps/chosen": -498.4281311035156, + "logps/rejected": -675.7018432617188, + "loss": 0.0918, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9777452945709229, + "rewards/margins": 4.863968372344971, + "rewards/rejected": -5.841713905334473, + "step": 2878 + }, + { + "epoch": 0.33, + "learning_rate": 2.0349994147255062e-07, + "logits/chosen": -2.074157476425171, + "logits/rejected": -2.3569087982177734, + "logps/chosen": -263.0862731933594, + "logps/rejected": -256.64794921875, + "loss": 0.2689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5330188870429993, + "rewards/margins": 1.7341855764389038, + "rewards/rejected": -2.267204761505127, + "step": 2879 + }, + { + "epoch": 0.33, + "learning_rate": 2.0346482500292637e-07, + "logits/chosen": -2.315324306488037, + "logits/rejected": -1.810926914215088, + "logps/chosen": -191.78936767578125, + "logps/rejected": -469.3622741699219, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7887303829193115, + "rewards/margins": 3.712217092514038, + "rewards/rejected": -4.50094747543335, + "step": 2880 + }, + { + "epoch": 0.33, + "learning_rate": 2.0342970853330212e-07, + "logits/chosen": -2.226008176803589, + "logits/rejected": -2.237689733505249, + "logps/chosen": -179.20814514160156, + "logps/rejected": -295.11724853515625, + "loss": 0.3246, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2932448387145996, + "rewards/margins": 2.5279927253723145, + "rewards/rejected": -2.821237564086914, + "step": 2881 + }, + { + "epoch": 0.33, + "learning_rate": 2.0339459206367785e-07, + "logits/chosen": -2.331512928009033, + "logits/rejected": -2.4409453868865967, + "logps/chosen": -235.26162719726562, + "logps/rejected": -215.18612670898438, + "loss": 1.2532, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2476351261138916, + "rewards/margins": 0.30464255809783936, + "rewards/rejected": -2.5522775650024414, + "step": 2882 + }, + { + "epoch": 0.33, + "learning_rate": 2.033594755940536e-07, + "logits/chosen": -1.5792428255081177, + "logits/rejected": -1.7675023078918457, + "logps/chosen": -374.759033203125, + "logps/rejected": -341.4237060546875, + "loss": 0.2969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48706063628196716, + "rewards/margins": 1.9326173067092896, + "rewards/rejected": -2.419677734375, + "step": 2883 + }, + { + "epoch": 0.33, + "learning_rate": 2.0332435912442933e-07, + "logits/chosen": -1.969900131225586, + "logits/rejected": -2.1780319213867188, + "logps/chosen": -330.47808837890625, + "logps/rejected": -371.34112548828125, + "loss": 0.6299, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9560855031013489, + "rewards/margins": 2.935173273086548, + "rewards/rejected": -3.891258716583252, + "step": 2884 + }, + { + "epoch": 0.33, + "learning_rate": 2.0328924265480509e-07, + "logits/chosen": -1.8972275257110596, + "logits/rejected": -1.8811726570129395, + "logps/chosen": -296.8240661621094, + "logps/rejected": -273.8106384277344, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.950893759727478, + "rewards/margins": 1.8935203552246094, + "rewards/rejected": -3.844414234161377, + "step": 2885 + }, + { + "epoch": 0.33, + "learning_rate": 2.0325412618518084e-07, + "logits/chosen": -1.9927659034729004, + "logits/rejected": -2.1346895694732666, + "logps/chosen": -426.0872497558594, + "logps/rejected": -392.84539794921875, + "loss": 0.7133, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3249285221099854, + "rewards/margins": 0.7732672095298767, + "rewards/rejected": -2.098195791244507, + "step": 2886 + }, + { + "epoch": 0.33, + "learning_rate": 2.0321900971555657e-07, + "logits/chosen": -2.3951354026794434, + "logits/rejected": -2.318535089492798, + "logps/chosen": -125.57726287841797, + "logps/rejected": -174.06158447265625, + "loss": 0.3291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7714369893074036, + "rewards/margins": 1.761090636253357, + "rewards/rejected": -2.5325276851654053, + "step": 2887 + }, + { + "epoch": 0.33, + "learning_rate": 2.0318389324593235e-07, + "logits/chosen": -2.713977336883545, + "logits/rejected": -2.6822681427001953, + "logps/chosen": -97.8403549194336, + "logps/rejected": -150.67388916015625, + "loss": 0.5591, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7473061680793762, + "rewards/margins": 0.9459558725357056, + "rewards/rejected": -1.6932621002197266, + "step": 2888 + }, + { + "epoch": 0.33, + "learning_rate": 2.031487767763081e-07, + "logits/chosen": -2.1871094703674316, + "logits/rejected": -2.2708442211151123, + "logps/chosen": -243.84341430664062, + "logps/rejected": -217.11572265625, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.196326494216919, + "rewards/margins": 0.9564976692199707, + "rewards/rejected": -2.1528244018554688, + "step": 2889 + }, + { + "epoch": 0.33, + "learning_rate": 2.0311366030668383e-07, + "logits/chosen": -2.2206227779388428, + "logits/rejected": -1.8804659843444824, + "logps/chosen": -232.87551879882812, + "logps/rejected": -299.09136962890625, + "loss": 0.3359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6565624475479126, + "rewards/margins": 1.819218635559082, + "rewards/rejected": -2.475781202316284, + "step": 2890 + }, + { + "epoch": 0.33, + "learning_rate": 2.0307854383705958e-07, + "logits/chosen": -2.4753987789154053, + "logits/rejected": -2.6307809352874756, + "logps/chosen": -293.0467834472656, + "logps/rejected": -251.61428833007812, + "loss": 0.1505, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5427794456481934, + "rewards/margins": 3.091237783432007, + "rewards/rejected": -4.634017467498779, + "step": 2891 + }, + { + "epoch": 0.33, + "learning_rate": 2.030434273674353e-07, + "logits/chosen": -2.073531150817871, + "logits/rejected": -2.146477699279785, + "logps/chosen": -354.5285949707031, + "logps/rejected": -249.786865234375, + "loss": 0.8683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6990206241607666, + "rewards/margins": 0.43133795261383057, + "rewards/rejected": -1.1303586959838867, + "step": 2892 + }, + { + "epoch": 0.33, + "learning_rate": 2.0300831089781106e-07, + "logits/chosen": -2.6291210651397705, + "logits/rejected": -2.715723752975464, + "logps/chosen": -310.57806396484375, + "logps/rejected": -272.7289123535156, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7246794700622559, + "rewards/margins": 2.9239890575408936, + "rewards/rejected": -3.6486685276031494, + "step": 2893 + }, + { + "epoch": 0.33, + "learning_rate": 2.0297319442818682e-07, + "logits/chosen": -1.7472248077392578, + "logits/rejected": -2.0824172496795654, + "logps/chosen": -508.4741516113281, + "logps/rejected": -342.33251953125, + "loss": 0.5354, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8239924907684326, + "rewards/margins": 3.0592052936553955, + "rewards/rejected": -3.883197784423828, + "step": 2894 + }, + { + "epoch": 0.33, + "learning_rate": 2.0293807795856254e-07, + "logits/chosen": -2.657179355621338, + "logits/rejected": -2.794696807861328, + "logps/chosen": -147.1739044189453, + "logps/rejected": -137.89427185058594, + "loss": 0.7328, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4501416683197021, + "rewards/margins": 0.9404469728469849, + "rewards/rejected": -2.3905887603759766, + "step": 2895 + }, + { + "epoch": 0.33, + "learning_rate": 2.029029614889383e-07, + "logits/chosen": -2.2860827445983887, + "logits/rejected": -2.4494717121124268, + "logps/chosen": -273.3448791503906, + "logps/rejected": -286.2121276855469, + "loss": 0.4168, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5561496019363403, + "rewards/margins": 2.810525894165039, + "rewards/rejected": -3.36667537689209, + "step": 2896 + }, + { + "epoch": 0.33, + "learning_rate": 2.0286784501931405e-07, + "logits/chosen": -1.8862025737762451, + "logits/rejected": -1.6897631883621216, + "logps/chosen": -329.7723388671875, + "logps/rejected": -365.7198791503906, + "loss": 0.5159, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2672628164291382, + "rewards/margins": 1.3874125480651855, + "rewards/rejected": -2.654675245285034, + "step": 2897 + }, + { + "epoch": 0.33, + "learning_rate": 2.0283272854968978e-07, + "logits/chosen": -2.6383495330810547, + "logits/rejected": -2.7608842849731445, + "logps/chosen": -224.92970275878906, + "logps/rejected": -213.84469604492188, + "loss": 0.2024, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7110422849655151, + "rewards/margins": 2.2369656562805176, + "rewards/rejected": -2.9480080604553223, + "step": 2898 + }, + { + "epoch": 0.33, + "learning_rate": 2.0279761208006556e-07, + "logits/chosen": -2.1874027252197266, + "logits/rejected": -2.2222721576690674, + "logps/chosen": -156.1935577392578, + "logps/rejected": -182.2160186767578, + "loss": 0.279, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3046856224536896, + "rewards/margins": 2.9209814071655273, + "rewards/rejected": -3.2256669998168945, + "step": 2899 + }, + { + "epoch": 0.33, + "learning_rate": 2.027624956104413e-07, + "logits/chosen": -2.216900587081909, + "logits/rejected": -2.430065631866455, + "logps/chosen": -211.75820922851562, + "logps/rejected": -205.1509552001953, + "loss": 0.2272, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7810420989990234, + "rewards/margins": 3.35176420211792, + "rewards/rejected": -4.132806301116943, + "step": 2900 + }, + { + "epoch": 0.33, + "learning_rate": 2.0272737914081704e-07, + "logits/chosen": -2.523348093032837, + "logits/rejected": -2.5308780670166016, + "logps/chosen": -181.4605255126953, + "logps/rejected": -225.29234313964844, + "loss": 0.256, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6337991952896118, + "rewards/margins": 2.35030198097229, + "rewards/rejected": -2.9841010570526123, + "step": 2901 + }, + { + "epoch": 0.33, + "learning_rate": 2.026922626711928e-07, + "logits/chosen": -2.1673831939697266, + "logits/rejected": -2.2988784313201904, + "logps/chosen": -124.9368896484375, + "logps/rejected": -132.97853088378906, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9305588603019714, + "rewards/margins": 1.1216288805007935, + "rewards/rejected": -2.05218768119812, + "step": 2902 + }, + { + "epoch": 0.33, + "learning_rate": 2.0265714620156852e-07, + "logits/chosen": -1.9933173656463623, + "logits/rejected": -2.139430284500122, + "logps/chosen": -295.7144470214844, + "logps/rejected": -220.30177307128906, + "loss": 0.4555, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13121837377548218, + "rewards/margins": 1.3522040843963623, + "rewards/rejected": -1.4834225177764893, + "step": 2903 + }, + { + "epoch": 0.33, + "learning_rate": 2.0262202973194428e-07, + "logits/chosen": -2.6256613731384277, + "logits/rejected": -2.610067844390869, + "logps/chosen": -289.8770751953125, + "logps/rejected": -321.1546630859375, + "loss": 0.1859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2630246579647064, + "rewards/margins": 2.533738136291504, + "rewards/rejected": -2.796762466430664, + "step": 2904 + }, + { + "epoch": 0.33, + "learning_rate": 2.0258691326232e-07, + "logits/chosen": -2.7111027240753174, + "logits/rejected": -2.7886102199554443, + "logps/chosen": -165.81874084472656, + "logps/rejected": -255.4509735107422, + "loss": 0.2348, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5945852994918823, + "rewards/margins": 2.7341322898864746, + "rewards/rejected": -3.3287174701690674, + "step": 2905 + }, + { + "epoch": 0.34, + "learning_rate": 2.0255179679269576e-07, + "logits/chosen": -2.2917866706848145, + "logits/rejected": -2.077056884765625, + "logps/chosen": -219.60438537597656, + "logps/rejected": -165.8478546142578, + "loss": 0.1518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0847361832857132, + "rewards/margins": 2.8632869720458984, + "rewards/rejected": -2.9480233192443848, + "step": 2906 + }, + { + "epoch": 0.34, + "learning_rate": 2.025166803230715e-07, + "logits/chosen": -2.173067092895508, + "logits/rejected": -2.0331695079803467, + "logps/chosen": -247.88641357421875, + "logps/rejected": -302.5484924316406, + "loss": 0.561, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9106535315513611, + "rewards/margins": 1.1956095695495605, + "rewards/rejected": -2.1062631607055664, + "step": 2907 + }, + { + "epoch": 0.34, + "learning_rate": 2.0248156385344724e-07, + "logits/chosen": -1.464637041091919, + "logits/rejected": -1.9243223667144775, + "logps/chosen": -493.881103515625, + "logps/rejected": -302.5580749511719, + "loss": 0.4066, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2671809196472168, + "rewards/margins": 1.4985589981079102, + "rewards/rejected": -1.765739917755127, + "step": 2908 + }, + { + "epoch": 0.34, + "learning_rate": 2.02446447383823e-07, + "logits/chosen": -2.330472469329834, + "logits/rejected": -2.260132312774658, + "logps/chosen": -234.20083618164062, + "logps/rejected": -210.39712524414062, + "loss": 0.3816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6039802432060242, + "rewards/margins": 1.8791003227233887, + "rewards/rejected": -2.4830803871154785, + "step": 2909 + }, + { + "epoch": 0.34, + "learning_rate": 2.0241133091419877e-07, + "logits/chosen": -2.590664863586426, + "logits/rejected": -2.4770665168762207, + "logps/chosen": -809.5160522460938, + "logps/rejected": -282.4129638671875, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2029718160629272, + "rewards/margins": 1.890859603881836, + "rewards/rejected": -3.0938315391540527, + "step": 2910 + }, + { + "epoch": 0.34, + "learning_rate": 2.023762144445745e-07, + "logits/chosen": -2.7954211235046387, + "logits/rejected": -2.6515445709228516, + "logps/chosen": -127.49833679199219, + "logps/rejected": -255.12225341796875, + "loss": 0.6397, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7768792510032654, + "rewards/margins": 1.5424041748046875, + "rewards/rejected": -2.3192832469940186, + "step": 2911 + }, + { + "epoch": 0.34, + "learning_rate": 2.0234109797495025e-07, + "logits/chosen": -2.566093683242798, + "logits/rejected": -2.604149103164673, + "logps/chosen": -285.0687255859375, + "logps/rejected": -277.7811279296875, + "loss": 0.3411, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2984143495559692, + "rewards/margins": 2.123279094696045, + "rewards/rejected": -3.421693801879883, + "step": 2912 + }, + { + "epoch": 0.34, + "learning_rate": 2.0230598150532598e-07, + "logits/chosen": -2.431004047393799, + "logits/rejected": -2.436439037322998, + "logps/chosen": -423.50054931640625, + "logps/rejected": -362.6194763183594, + "loss": 0.3129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5579596161842346, + "rewards/margins": 1.6996936798095703, + "rewards/rejected": -2.25765323638916, + "step": 2913 + }, + { + "epoch": 0.34, + "learning_rate": 2.0227086503570174e-07, + "logits/chosen": -2.4503514766693115, + "logits/rejected": -2.364471673965454, + "logps/chosen": -235.62286376953125, + "logps/rejected": -234.45706176757812, + "loss": 0.4421, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4368917942047119, + "rewards/margins": 1.5454747676849365, + "rewards/rejected": -1.9823665618896484, + "step": 2914 + }, + { + "epoch": 0.34, + "learning_rate": 2.022357485660775e-07, + "logits/chosen": -2.2521371841430664, + "logits/rejected": -1.9730112552642822, + "logps/chosen": -272.07489013671875, + "logps/rejected": -376.02337646484375, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48520123958587646, + "rewards/margins": 2.923997163772583, + "rewards/rejected": -3.409198760986328, + "step": 2915 + }, + { + "epoch": 0.34, + "learning_rate": 2.0220063209645322e-07, + "logits/chosen": -1.9118479490280151, + "logits/rejected": -2.058582067489624, + "logps/chosen": -304.4320068359375, + "logps/rejected": -241.1009979248047, + "loss": 0.8413, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6240530014038086, + "rewards/margins": 0.8704577684402466, + "rewards/rejected": -2.4945106506347656, + "step": 2916 + }, + { + "epoch": 0.34, + "learning_rate": 2.0216551562682897e-07, + "logits/chosen": -2.4083971977233887, + "logits/rejected": -2.490060329437256, + "logps/chosen": -199.28857421875, + "logps/rejected": -220.71119689941406, + "loss": 0.364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3126411437988281, + "rewards/margins": 2.194912910461426, + "rewards/rejected": -2.507554054260254, + "step": 2917 + }, + { + "epoch": 0.34, + "learning_rate": 2.0213039915720472e-07, + "logits/chosen": -2.116807222366333, + "logits/rejected": -2.05853533744812, + "logps/chosen": -422.19696044921875, + "logps/rejected": -430.434326171875, + "loss": 0.2137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19438910484313965, + "rewards/margins": 2.5156705379486084, + "rewards/rejected": -2.710059642791748, + "step": 2918 + }, + { + "epoch": 0.34, + "learning_rate": 2.0209528268758045e-07, + "logits/chosen": -2.03019380569458, + "logits/rejected": -2.091111660003662, + "logps/chosen": -434.737548828125, + "logps/rejected": -321.59893798828125, + "loss": 0.6842, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7816247940063477, + "rewards/margins": 0.9244297742843628, + "rewards/rejected": -1.7060545682907104, + "step": 2919 + }, + { + "epoch": 0.34, + "learning_rate": 2.020601662179562e-07, + "logits/chosen": -2.560722827911377, + "logits/rejected": -2.698169708251953, + "logps/chosen": -290.9625244140625, + "logps/rejected": -338.27154541015625, + "loss": 0.1381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45291197299957275, + "rewards/margins": 2.8555831909179688, + "rewards/rejected": -3.308495283126831, + "step": 2920 + }, + { + "epoch": 0.34, + "learning_rate": 2.0202504974833193e-07, + "logits/chosen": -2.584834575653076, + "logits/rejected": -2.626580238342285, + "logps/chosen": -420.5867614746094, + "logps/rejected": -332.4095764160156, + "loss": 0.4221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4252674579620361, + "rewards/margins": 1.5309756994247437, + "rewards/rejected": -2.9562430381774902, + "step": 2921 + }, + { + "epoch": 0.34, + "learning_rate": 2.0198993327870771e-07, + "logits/chosen": -2.6393275260925293, + "logits/rejected": -2.714045524597168, + "logps/chosen": -241.0440673828125, + "logps/rejected": -228.06236267089844, + "loss": 0.2918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8619283437728882, + "rewards/margins": 2.505190134048462, + "rewards/rejected": -3.3671183586120605, + "step": 2922 + }, + { + "epoch": 0.34, + "learning_rate": 2.0195481680908347e-07, + "logits/chosen": -2.1997289657592773, + "logits/rejected": -2.2693045139312744, + "logps/chosen": -379.0743408203125, + "logps/rejected": -285.3302307128906, + "loss": 0.8184, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6538026332855225, + "rewards/margins": 1.7499655485153198, + "rewards/rejected": -2.4037680625915527, + "step": 2923 + }, + { + "epoch": 0.34, + "learning_rate": 2.019197003394592e-07, + "logits/chosen": -2.058180332183838, + "logits/rejected": -2.1993892192840576, + "logps/chosen": -392.0657958984375, + "logps/rejected": -321.40203857421875, + "loss": 1.0307, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.81352698802948, + "rewards/margins": -0.009601950645446777, + "rewards/rejected": -0.803925096988678, + "step": 2924 + }, + { + "epoch": 0.34, + "learning_rate": 2.0188458386983495e-07, + "logits/chosen": -2.2789907455444336, + "logits/rejected": -2.310044288635254, + "logps/chosen": -366.73956298828125, + "logps/rejected": -322.8260498046875, + "loss": 0.4658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6824455261230469, + "rewards/margins": 1.3344202041625977, + "rewards/rejected": -2.0168657302856445, + "step": 2925 + }, + { + "epoch": 0.34, + "learning_rate": 2.018494674002107e-07, + "logits/chosen": -2.555955171585083, + "logits/rejected": -2.5424134731292725, + "logps/chosen": -211.7794189453125, + "logps/rejected": -164.55177307128906, + "loss": 0.8417, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1284542083740234, + "rewards/margins": 0.2717787027359009, + "rewards/rejected": -1.4002327919006348, + "step": 2926 + }, + { + "epoch": 0.34, + "learning_rate": 2.0181435093058643e-07, + "logits/chosen": -2.3826260566711426, + "logits/rejected": -2.46633243560791, + "logps/chosen": -246.10635375976562, + "logps/rejected": -232.2978973388672, + "loss": 0.2436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15830621123313904, + "rewards/margins": 2.920222043991089, + "rewards/rejected": -3.0785281658172607, + "step": 2927 + }, + { + "epoch": 0.34, + "learning_rate": 2.0177923446096218e-07, + "logits/chosen": -2.6279282569885254, + "logits/rejected": -2.6414263248443604, + "logps/chosen": -189.71644592285156, + "logps/rejected": -235.67041015625, + "loss": 1.1848, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0430808067321777, + "rewards/margins": 0.9260259866714478, + "rewards/rejected": -1.969106674194336, + "step": 2928 + }, + { + "epoch": 0.34, + "learning_rate": 2.017441179913379e-07, + "logits/chosen": -2.556680679321289, + "logits/rejected": -2.8065967559814453, + "logps/chosen": -284.61083984375, + "logps/rejected": -262.4125061035156, + "loss": 0.7799, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0684318542480469, + "rewards/margins": 2.6672182083129883, + "rewards/rejected": -3.735649585723877, + "step": 2929 + }, + { + "epoch": 0.34, + "learning_rate": 2.0170900152171367e-07, + "logits/chosen": -2.8524386882781982, + "logits/rejected": -2.526980400085449, + "logps/chosen": -254.2386474609375, + "logps/rejected": -375.4646911621094, + "loss": 0.2493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8547059297561646, + "rewards/margins": 2.394578695297241, + "rewards/rejected": -3.2492847442626953, + "step": 2930 + }, + { + "epoch": 0.34, + "learning_rate": 2.0167388505208942e-07, + "logits/chosen": -1.9612207412719727, + "logits/rejected": -1.9290976524353027, + "logps/chosen": -257.4093017578125, + "logps/rejected": -200.22760009765625, + "loss": 0.4582, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.070094108581543, + "rewards/margins": 1.482888102531433, + "rewards/rejected": -2.5529823303222656, + "step": 2931 + }, + { + "epoch": 0.34, + "learning_rate": 2.0163876858246515e-07, + "logits/chosen": -2.1025800704956055, + "logits/rejected": -1.9590498208999634, + "logps/chosen": -331.8348388671875, + "logps/rejected": -175.19723510742188, + "loss": 0.4487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8710097074508667, + "rewards/margins": 1.3137364387512207, + "rewards/rejected": -2.184746026992798, + "step": 2932 + }, + { + "epoch": 0.34, + "learning_rate": 2.0160365211284093e-07, + "logits/chosen": -2.1302459239959717, + "logits/rejected": -2.0218448638916016, + "logps/chosen": -241.24008178710938, + "logps/rejected": -249.74850463867188, + "loss": 0.4613, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7779844999313354, + "rewards/margins": 1.6765066385269165, + "rewards/rejected": -2.454491138458252, + "step": 2933 + }, + { + "epoch": 0.34, + "learning_rate": 2.0156853564321668e-07, + "logits/chosen": -3.0000808238983154, + "logits/rejected": -2.9234073162078857, + "logps/chosen": -462.7967834472656, + "logps/rejected": -287.76019287109375, + "loss": 0.2311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46165773272514343, + "rewards/margins": 1.6673941612243652, + "rewards/rejected": -2.129051685333252, + "step": 2934 + }, + { + "epoch": 0.34, + "learning_rate": 2.015334191735924e-07, + "logits/chosen": -2.6540253162384033, + "logits/rejected": -2.673590660095215, + "logps/chosen": -146.68270874023438, + "logps/rejected": -182.53160095214844, + "loss": 0.2546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2760646641254425, + "rewards/margins": 2.1218137741088867, + "rewards/rejected": -2.397878408432007, + "step": 2935 + }, + { + "epoch": 0.34, + "learning_rate": 2.0149830270396816e-07, + "logits/chosen": -1.9499852657318115, + "logits/rejected": -2.1326053142547607, + "logps/chosen": -244.24087524414062, + "logps/rejected": -224.10084533691406, + "loss": 0.444, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.586181640625, + "rewards/margins": 1.540999174118042, + "rewards/rejected": -2.127180814743042, + "step": 2936 + }, + { + "epoch": 0.34, + "learning_rate": 2.014631862343439e-07, + "logits/chosen": -2.67912220954895, + "logits/rejected": -2.4735968112945557, + "logps/chosen": -290.21014404296875, + "logps/rejected": -266.65814208984375, + "loss": 0.2209, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18159812688827515, + "rewards/margins": 3.252657413482666, + "rewards/rejected": -3.434255599975586, + "step": 2937 + }, + { + "epoch": 0.34, + "learning_rate": 2.0142806976471964e-07, + "logits/chosen": -2.1109397411346436, + "logits/rejected": -2.100490093231201, + "logps/chosen": -171.0454559326172, + "logps/rejected": -211.06668090820312, + "loss": 1.2297, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7827136516571045, + "rewards/margins": 0.5814955830574036, + "rewards/rejected": -2.3642094135284424, + "step": 2938 + }, + { + "epoch": 0.34, + "learning_rate": 2.013929532950954e-07, + "logits/chosen": -1.8310754299163818, + "logits/rejected": -2.1993913650512695, + "logps/chosen": -449.514892578125, + "logps/rejected": -350.6998291015625, + "loss": 0.6973, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8419046998023987, + "rewards/margins": 0.6062421798706055, + "rewards/rejected": -1.4481470584869385, + "step": 2939 + }, + { + "epoch": 0.34, + "learning_rate": 2.0135783682547112e-07, + "logits/chosen": -2.009690999984741, + "logits/rejected": -1.848540186882019, + "logps/chosen": -229.40330505371094, + "logps/rejected": -283.21319580078125, + "loss": 0.6237, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3791273832321167, + "rewards/margins": 1.5139751434326172, + "rewards/rejected": -2.8931026458740234, + "step": 2940 + }, + { + "epoch": 0.34, + "learning_rate": 2.0132272035584688e-07, + "logits/chosen": -2.3914999961853027, + "logits/rejected": -2.6041383743286133, + "logps/chosen": -381.4527587890625, + "logps/rejected": -264.59466552734375, + "loss": 0.5319, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0120337009429932, + "rewards/margins": 0.5798991322517395, + "rewards/rejected": -1.591932773590088, + "step": 2941 + }, + { + "epoch": 0.34, + "learning_rate": 2.0128760388622266e-07, + "logits/chosen": -1.6707358360290527, + "logits/rejected": -2.1406190395355225, + "logps/chosen": -386.645751953125, + "logps/rejected": -309.1229553222656, + "loss": 0.4818, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2674867510795593, + "rewards/margins": 1.087555170059204, + "rewards/rejected": -1.3550419807434082, + "step": 2942 + }, + { + "epoch": 0.34, + "learning_rate": 2.0125248741659836e-07, + "logits/chosen": -2.1842713356018066, + "logits/rejected": -2.363373041152954, + "logps/chosen": -286.4429931640625, + "logps/rejected": -280.3057861328125, + "loss": 0.4004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6122992038726807, + "rewards/margins": 1.8762178421020508, + "rewards/rejected": -2.4885170459747314, + "step": 2943 + }, + { + "epoch": 0.34, + "learning_rate": 2.0121737094697414e-07, + "logits/chosen": -2.7012624740600586, + "logits/rejected": -2.484896659851074, + "logps/chosen": -185.23611450195312, + "logps/rejected": -240.13087463378906, + "loss": 0.5769, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9709539413452148, + "rewards/margins": 1.3942430019378662, + "rewards/rejected": -2.365196943283081, + "step": 2944 + }, + { + "epoch": 0.34, + "learning_rate": 2.0118225447734987e-07, + "logits/chosen": -2.804144859313965, + "logits/rejected": -2.700092315673828, + "logps/chosen": -220.76950073242188, + "logps/rejected": -211.253173828125, + "loss": 0.1928, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6886768937110901, + "rewards/margins": 3.038424253463745, + "rewards/rejected": -3.7271013259887695, + "step": 2945 + }, + { + "epoch": 0.34, + "learning_rate": 2.0114713800772562e-07, + "logits/chosen": -2.1533453464508057, + "logits/rejected": -2.3745460510253906, + "logps/chosen": -255.9151611328125, + "logps/rejected": -265.51513671875, + "loss": 1.3535, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4897685050964355, + "rewards/margins": -0.5017969012260437, + "rewards/rejected": -0.9879716038703918, + "step": 2946 + }, + { + "epoch": 0.34, + "learning_rate": 2.0111202153810137e-07, + "logits/chosen": -2.2514448165893555, + "logits/rejected": -2.2445244789123535, + "logps/chosen": -290.56805419921875, + "logps/rejected": -258.0320739746094, + "loss": 0.5063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25913697481155396, + "rewards/margins": 1.2364563941955566, + "rewards/rejected": -1.4955933094024658, + "step": 2947 + }, + { + "epoch": 0.34, + "learning_rate": 2.010769050684771e-07, + "logits/chosen": -2.460132122039795, + "logits/rejected": -2.7472493648529053, + "logps/chosen": -327.75396728515625, + "logps/rejected": -103.3165283203125, + "loss": 0.3783, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6690380573272705, + "rewards/margins": 1.2939965724945068, + "rewards/rejected": -1.9630346298217773, + "step": 2948 + }, + { + "epoch": 0.34, + "learning_rate": 2.0104178859885286e-07, + "logits/chosen": -2.342625141143799, + "logits/rejected": -2.4326062202453613, + "logps/chosen": -234.56520080566406, + "logps/rejected": -277.104248046875, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9929050207138062, + "rewards/margins": 1.1581388711929321, + "rewards/rejected": -3.1510438919067383, + "step": 2949 + }, + { + "epoch": 0.34, + "learning_rate": 2.0100667212922858e-07, + "logits/chosen": -1.992323875427246, + "logits/rejected": -2.0092992782592773, + "logps/chosen": -412.1931457519531, + "logps/rejected": -289.69488525390625, + "loss": 0.5147, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1385316848754883, + "rewards/margins": 1.5349838733673096, + "rewards/rejected": -2.673515558242798, + "step": 2950 + }, + { + "epoch": 0.34, + "learning_rate": 2.0097155565960434e-07, + "logits/chosen": -1.9274219274520874, + "logits/rejected": -2.270111083984375, + "logps/chosen": -528.1952514648438, + "logps/rejected": -449.05657958984375, + "loss": 0.4904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5853155851364136, + "rewards/margins": 0.9025952219963074, + "rewards/rejected": -2.487910747528076, + "step": 2951 + }, + { + "epoch": 0.34, + "learning_rate": 2.009364391899801e-07, + "logits/chosen": -2.9483282566070557, + "logits/rejected": -2.8120830059051514, + "logps/chosen": -296.1802673339844, + "logps/rejected": -292.20831298828125, + "loss": 0.1015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22206741571426392, + "rewards/margins": 3.363546848297119, + "rewards/rejected": -3.5856144428253174, + "step": 2952 + }, + { + "epoch": 0.34, + "learning_rate": 2.0090132272035582e-07, + "logits/chosen": -2.143364191055298, + "logits/rejected": -2.4619414806365967, + "logps/chosen": -590.9383544921875, + "logps/rejected": -434.5877990722656, + "loss": 0.2107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4156082272529602, + "rewards/margins": 1.8808385133743286, + "rewards/rejected": -2.2964468002319336, + "step": 2953 + }, + { + "epoch": 0.34, + "learning_rate": 2.0086620625073157e-07, + "logits/chosen": -2.125980854034424, + "logits/rejected": -1.8671728372573853, + "logps/chosen": -257.0249328613281, + "logps/rejected": -319.63104248046875, + "loss": 0.4095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7224347591400146, + "rewards/margins": 2.889092445373535, + "rewards/rejected": -3.61152720451355, + "step": 2954 + }, + { + "epoch": 0.34, + "learning_rate": 2.0083108978110735e-07, + "logits/chosen": -2.6174700260162354, + "logits/rejected": -2.586941719055176, + "logps/chosen": -244.5758056640625, + "logps/rejected": -240.80450439453125, + "loss": 0.2006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7271180748939514, + "rewards/margins": 1.897060513496399, + "rewards/rejected": -2.624178409576416, + "step": 2955 + }, + { + "epoch": 0.34, + "learning_rate": 2.0079597331148308e-07, + "logits/chosen": -2.2748210430145264, + "logits/rejected": -2.218663215637207, + "logps/chosen": -223.54119873046875, + "logps/rejected": -207.44161987304688, + "loss": 0.6522, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0588269233703613, + "rewards/margins": 1.4149818420410156, + "rewards/rejected": -2.473808765411377, + "step": 2956 + }, + { + "epoch": 0.34, + "learning_rate": 2.0076085684185883e-07, + "logits/chosen": -2.5383615493774414, + "logits/rejected": -2.5203704833984375, + "logps/chosen": -303.189453125, + "logps/rejected": -452.42822265625, + "loss": 0.2702, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44382941722869873, + "rewards/margins": 3.1592702865600586, + "rewards/rejected": -3.6030995845794678, + "step": 2957 + }, + { + "epoch": 0.34, + "learning_rate": 2.0072574037223456e-07, + "logits/chosen": -2.0256757736206055, + "logits/rejected": -2.1194941997528076, + "logps/chosen": -288.7452392578125, + "logps/rejected": -223.15982055664062, + "loss": 0.4453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7809486389160156, + "rewards/margins": 1.2570796012878418, + "rewards/rejected": -2.0380282402038574, + "step": 2958 + }, + { + "epoch": 0.34, + "learning_rate": 2.0069062390261032e-07, + "logits/chosen": -2.596309185028076, + "logits/rejected": -2.509782552719116, + "logps/chosen": -208.19850158691406, + "logps/rejected": -248.3894500732422, + "loss": 0.2397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37189361453056335, + "rewards/margins": 2.62369704246521, + "rewards/rejected": -2.9955906867980957, + "step": 2959 + }, + { + "epoch": 0.34, + "learning_rate": 2.0065550743298607e-07, + "logits/chosen": -2.2404842376708984, + "logits/rejected": -2.6213788986206055, + "logps/chosen": -320.7882080078125, + "logps/rejected": -204.513427734375, + "loss": 0.6494, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0566522479057312, + "rewards/margins": 2.3629133701324463, + "rewards/rejected": -2.3062610626220703, + "step": 2960 + }, + { + "epoch": 0.34, + "learning_rate": 2.006203909633618e-07, + "logits/chosen": -2.4719009399414062, + "logits/rejected": -2.1396899223327637, + "logps/chosen": -272.8439636230469, + "logps/rejected": -247.20523071289062, + "loss": 0.3379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17000791430473328, + "rewards/margins": 1.8981043100357056, + "rewards/rejected": -2.068112373352051, + "step": 2961 + }, + { + "epoch": 0.34, + "learning_rate": 2.0058527449373755e-07, + "logits/chosen": -2.939335823059082, + "logits/rejected": -2.921649694442749, + "logps/chosen": -218.52589416503906, + "logps/rejected": -226.0620574951172, + "loss": 0.3012, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1652675867080688, + "rewards/margins": 1.843793272972107, + "rewards/rejected": -3.009060859680176, + "step": 2962 + }, + { + "epoch": 0.34, + "learning_rate": 2.005501580241133e-07, + "logits/chosen": -2.4624462127685547, + "logits/rejected": -2.6908211708068848, + "logps/chosen": -273.4818115234375, + "logps/rejected": -235.1024169921875, + "loss": 0.257, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04378552734851837, + "rewards/margins": 2.693044900894165, + "rewards/rejected": -2.649259567260742, + "step": 2963 + }, + { + "epoch": 0.34, + "learning_rate": 2.0051504155448903e-07, + "logits/chosen": -2.1609339714050293, + "logits/rejected": -2.289175510406494, + "logps/chosen": -346.94012451171875, + "logps/rejected": -303.8367919921875, + "loss": 0.3825, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05883491039276123, + "rewards/margins": 2.040123224258423, + "rewards/rejected": -1.9812883138656616, + "step": 2964 + }, + { + "epoch": 0.34, + "learning_rate": 2.0047992508486479e-07, + "logits/chosen": -2.088867664337158, + "logits/rejected": -2.2533552646636963, + "logps/chosen": -138.04928588867188, + "logps/rejected": -146.94363403320312, + "loss": 0.7699, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7427926063537598, + "rewards/margins": 0.23033276200294495, + "rewards/rejected": -0.9731253981590271, + "step": 2965 + }, + { + "epoch": 0.34, + "learning_rate": 2.004448086152405e-07, + "logits/chosen": -2.0017921924591064, + "logits/rejected": -2.0318763256073, + "logps/chosen": -298.3781433105469, + "logps/rejected": -301.95501708984375, + "loss": 0.3647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7749739289283752, + "rewards/margins": 2.8093228340148926, + "rewards/rejected": -3.584296703338623, + "step": 2966 + }, + { + "epoch": 0.34, + "learning_rate": 2.004096921456163e-07, + "logits/chosen": -2.5823159217834473, + "logits/rejected": -2.6096553802490234, + "logps/chosen": -371.4081726074219, + "logps/rejected": -308.0110778808594, + "loss": 0.6172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7770192623138428, + "rewards/margins": 1.4499915838241577, + "rewards/rejected": -2.227010726928711, + "step": 2967 + }, + { + "epoch": 0.34, + "learning_rate": 2.0037457567599205e-07, + "logits/chosen": -2.0697948932647705, + "logits/rejected": -1.7196913957595825, + "logps/chosen": -234.85203552246094, + "logps/rejected": -293.40936279296875, + "loss": 0.3975, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0550150871276855, + "rewards/margins": 1.7246828079223633, + "rewards/rejected": -2.7796976566314697, + "step": 2968 + }, + { + "epoch": 0.34, + "learning_rate": 2.0033945920636777e-07, + "logits/chosen": -2.2420706748962402, + "logits/rejected": -2.1568288803100586, + "logps/chosen": -356.00592041015625, + "logps/rejected": -329.4975280761719, + "loss": 0.4767, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2259436696767807, + "rewards/margins": 1.9143508672714233, + "rewards/rejected": -2.1402945518493652, + "step": 2969 + }, + { + "epoch": 0.34, + "learning_rate": 2.0030434273674353e-07, + "logits/chosen": -2.464918851852417, + "logits/rejected": -2.479679822921753, + "logps/chosen": -424.1470947265625, + "logps/rejected": -376.102783203125, + "loss": 0.3528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3715578317642212, + "rewards/margins": 3.8304781913757324, + "rewards/rejected": -4.202035903930664, + "step": 2970 + }, + { + "epoch": 0.34, + "learning_rate": 2.0026922626711928e-07, + "logits/chosen": -2.0647854804992676, + "logits/rejected": -2.0637552738189697, + "logps/chosen": -247.51300048828125, + "logps/rejected": -304.32867431640625, + "loss": 0.4517, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7142168283462524, + "rewards/margins": 2.2299952507019043, + "rewards/rejected": -2.944211959838867, + "step": 2971 + }, + { + "epoch": 0.34, + "learning_rate": 2.00234109797495e-07, + "logits/chosen": -2.082502841949463, + "logits/rejected": -2.1817357540130615, + "logps/chosen": -139.20692443847656, + "logps/rejected": -267.0343933105469, + "loss": 0.2487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5620020627975464, + "rewards/margins": 1.5614911317825317, + "rewards/rejected": -2.123493194580078, + "step": 2972 + }, + { + "epoch": 0.34, + "learning_rate": 2.0019899332787076e-07, + "logits/chosen": -2.4757604598999023, + "logits/rejected": -2.472005844116211, + "logps/chosen": -172.08096313476562, + "logps/rejected": -214.31057739257812, + "loss": 0.697, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0919561386108398, + "rewards/margins": 1.009584665298462, + "rewards/rejected": -2.1015408039093018, + "step": 2973 + }, + { + "epoch": 0.34, + "learning_rate": 2.001638768582465e-07, + "logits/chosen": -1.8971290588378906, + "logits/rejected": -1.8021618127822876, + "logps/chosen": -463.3720703125, + "logps/rejected": -541.1727294921875, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11689379811286926, + "rewards/margins": 1.4685965776443481, + "rewards/rejected": -1.3517028093338013, + "step": 2974 + }, + { + "epoch": 0.34, + "learning_rate": 2.0012876038862224e-07, + "logits/chosen": -2.2281317710876465, + "logits/rejected": -2.2715160846710205, + "logps/chosen": -382.655029296875, + "logps/rejected": -364.7354736328125, + "loss": 0.4216, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.380295991897583, + "rewards/margins": 2.1909806728363037, + "rewards/rejected": -3.5712766647338867, + "step": 2975 + }, + { + "epoch": 0.34, + "learning_rate": 2.0009364391899802e-07, + "logits/chosen": -2.3822524547576904, + "logits/rejected": -2.406383752822876, + "logps/chosen": -419.20782470703125, + "logps/rejected": -367.33062744140625, + "loss": 0.2093, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4797033667564392, + "rewards/margins": 3.3065476417541504, + "rewards/rejected": -3.7862510681152344, + "step": 2976 + }, + { + "epoch": 0.34, + "learning_rate": 2.0005852744937373e-07, + "logits/chosen": -2.1147921085357666, + "logits/rejected": -2.356842517852783, + "logps/chosen": -326.90264892578125, + "logps/rejected": -213.88865661621094, + "loss": 0.4741, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5815288424491882, + "rewards/margins": 1.9712330102920532, + "rewards/rejected": -2.5527617931365967, + "step": 2977 + }, + { + "epoch": 0.34, + "learning_rate": 2.000234109797495e-07, + "logits/chosen": -1.66511070728302, + "logits/rejected": -1.6746222972869873, + "logps/chosen": -393.19305419921875, + "logps/rejected": -300.40887451171875, + "loss": 0.5078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.75996333360672, + "rewards/margins": 1.6025527715682983, + "rewards/rejected": -2.362515926361084, + "step": 2978 + }, + { + "epoch": 0.34, + "learning_rate": 1.9998829451012526e-07, + "logits/chosen": -2.1640889644622803, + "logits/rejected": -2.239814281463623, + "logps/chosen": -250.43211364746094, + "logps/rejected": -228.86544799804688, + "loss": 0.7624, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.928989052772522, + "rewards/margins": 1.148359775543213, + "rewards/rejected": -2.0773489475250244, + "step": 2979 + }, + { + "epoch": 0.34, + "learning_rate": 1.99953178040501e-07, + "logits/chosen": -2.5319435596466064, + "logits/rejected": -2.6475086212158203, + "logps/chosen": -151.45297241210938, + "logps/rejected": -232.06076049804688, + "loss": 0.5944, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5762830972671509, + "rewards/margins": 0.561385452747345, + "rewards/rejected": -2.1376686096191406, + "step": 2980 + }, + { + "epoch": 0.34, + "learning_rate": 1.9991806157087674e-07, + "logits/chosen": -2.0860586166381836, + "logits/rejected": -2.0041868686676025, + "logps/chosen": -288.67144775390625, + "logps/rejected": -393.8459777832031, + "loss": 0.1595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4761333167552948, + "rewards/margins": 2.4882116317749023, + "rewards/rejected": -2.9643449783325195, + "step": 2981 + }, + { + "epoch": 0.34, + "learning_rate": 1.9988294510125247e-07, + "logits/chosen": -2.495121717453003, + "logits/rejected": -2.8073689937591553, + "logps/chosen": -433.8245849609375, + "logps/rejected": -281.9753723144531, + "loss": 0.2244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.250167965888977, + "rewards/margins": 1.6842856407165527, + "rewards/rejected": -2.9344534873962402, + "step": 2982 + }, + { + "epoch": 0.34, + "learning_rate": 1.9984782863162822e-07, + "logits/chosen": -2.4139151573181152, + "logits/rejected": -2.41741943359375, + "logps/chosen": -200.39996337890625, + "logps/rejected": -212.19760131835938, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7080514430999756, + "rewards/margins": 1.9254976511001587, + "rewards/rejected": -2.633549213409424, + "step": 2983 + }, + { + "epoch": 0.34, + "learning_rate": 1.9981271216200398e-07, + "logits/chosen": -2.142503023147583, + "logits/rejected": -2.210843324661255, + "logps/chosen": -396.3313903808594, + "logps/rejected": -306.06390380859375, + "loss": 0.321, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43086129426956177, + "rewards/margins": 1.5825250148773193, + "rewards/rejected": -2.0133862495422363, + "step": 2984 + }, + { + "epoch": 0.34, + "learning_rate": 1.997775956923797e-07, + "logits/chosen": -2.2501397132873535, + "logits/rejected": -1.751455545425415, + "logps/chosen": -320.1591796875, + "logps/rejected": -398.65869140625, + "loss": 0.2006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8068549633026123, + "rewards/margins": 2.904250144958496, + "rewards/rejected": -3.7111051082611084, + "step": 2985 + }, + { + "epoch": 0.34, + "learning_rate": 1.9974247922275546e-07, + "logits/chosen": -2.291790008544922, + "logits/rejected": -2.2506024837493896, + "logps/chosen": -183.37362670898438, + "logps/rejected": -237.00863647460938, + "loss": 0.1709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2779695987701416, + "rewards/margins": 4.350573539733887, + "rewards/rejected": -4.628542900085449, + "step": 2986 + }, + { + "epoch": 0.34, + "learning_rate": 1.9970736275313124e-07, + "logits/chosen": -2.6828784942626953, + "logits/rejected": -2.786149024963379, + "logps/chosen": -232.23236083984375, + "logps/rejected": -144.09104919433594, + "loss": 0.4041, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5158522129058838, + "rewards/margins": 1.6132475137710571, + "rewards/rejected": -2.1290998458862305, + "step": 2987 + }, + { + "epoch": 0.34, + "learning_rate": 1.9967224628350694e-07, + "logits/chosen": -2.610079050064087, + "logits/rejected": -2.7205543518066406, + "logps/chosen": -172.46456909179688, + "logps/rejected": -267.7138977050781, + "loss": 0.2869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.269132137298584, + "rewards/margins": 2.716641426086426, + "rewards/rejected": -2.9857735633850098, + "step": 2988 + }, + { + "epoch": 0.34, + "learning_rate": 1.9963712981388272e-07, + "logits/chosen": -2.6925506591796875, + "logits/rejected": -2.5016305446624756, + "logps/chosen": -247.37159729003906, + "logps/rejected": -290.95135498046875, + "loss": 0.3769, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2626228332519531, + "rewards/margins": 1.4257864952087402, + "rewards/rejected": -2.6884093284606934, + "step": 2989 + }, + { + "epoch": 0.34, + "learning_rate": 1.9960201334425845e-07, + "logits/chosen": -2.4300639629364014, + "logits/rejected": -2.1103527545928955, + "logps/chosen": -267.7137145996094, + "logps/rejected": -292.6129455566406, + "loss": 0.492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7198680639266968, + "rewards/margins": 1.8381422758102417, + "rewards/rejected": -2.5580103397369385, + "step": 2990 + }, + { + "epoch": 0.34, + "learning_rate": 1.995668968746342e-07, + "logits/chosen": -2.329174041748047, + "logits/rejected": -2.2780370712280273, + "logps/chosen": -339.1028747558594, + "logps/rejected": -339.1431884765625, + "loss": 0.3663, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6539075374603271, + "rewards/margins": 2.2474379539489746, + "rewards/rejected": -2.9013452529907227, + "step": 2991 + }, + { + "epoch": 0.34, + "learning_rate": 1.9953178040500995e-07, + "logits/chosen": -2.3141138553619385, + "logits/rejected": -2.4002737998962402, + "logps/chosen": -264.1679382324219, + "logps/rejected": -316.4339599609375, + "loss": 0.5855, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8847602605819702, + "rewards/margins": 1.01506769657135, + "rewards/rejected": -1.8998280763626099, + "step": 2992 + }, + { + "epoch": 0.35, + "learning_rate": 1.9949666393538568e-07, + "logits/chosen": -1.8873540163040161, + "logits/rejected": -1.999638319015503, + "logps/chosen": -389.1581115722656, + "logps/rejected": -277.2342529296875, + "loss": 0.1287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14616693556308746, + "rewards/margins": 3.148092746734619, + "rewards/rejected": -3.294260025024414, + "step": 2993 + }, + { + "epoch": 0.35, + "learning_rate": 1.9946154746576144e-07, + "logits/chosen": -2.5089974403381348, + "logits/rejected": -2.5420656204223633, + "logps/chosen": -219.68199157714844, + "logps/rejected": -207.28675842285156, + "loss": 0.6704, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0663793087005615, + "rewards/margins": 1.43804931640625, + "rewards/rejected": -2.5044288635253906, + "step": 2994 + }, + { + "epoch": 0.35, + "learning_rate": 1.9942643099613716e-07, + "logits/chosen": -1.9909238815307617, + "logits/rejected": -2.5499496459960938, + "logps/chosen": -282.71990966796875, + "logps/rejected": -202.96603393554688, + "loss": 1.9356, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2757749557495117, + "rewards/margins": 0.45523932576179504, + "rewards/rejected": -2.7310142517089844, + "step": 2995 + }, + { + "epoch": 0.35, + "learning_rate": 1.9939131452651292e-07, + "logits/chosen": -2.134617328643799, + "logits/rejected": -2.3950884342193604, + "logps/chosen": -413.3203125, + "logps/rejected": -298.59088134765625, + "loss": 0.4993, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0402363538742065, + "rewards/margins": 1.8638272285461426, + "rewards/rejected": -2.9040637016296387, + "step": 2996 + }, + { + "epoch": 0.35, + "learning_rate": 1.9935619805688867e-07, + "logits/chosen": -1.9189471006393433, + "logits/rejected": -2.022900104522705, + "logps/chosen": -204.70057678222656, + "logps/rejected": -257.4367980957031, + "loss": 0.4907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8388512134552002, + "rewards/margins": 1.0100154876708984, + "rewards/rejected": -1.8488667011260986, + "step": 2997 + }, + { + "epoch": 0.35, + "learning_rate": 1.993210815872644e-07, + "logits/chosen": -2.195380449295044, + "logits/rejected": -2.3026068210601807, + "logps/chosen": -273.2369384765625, + "logps/rejected": -319.4605407714844, + "loss": 0.2077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5484293699264526, + "rewards/margins": 2.0045547485351562, + "rewards/rejected": -2.5529837608337402, + "step": 2998 + }, + { + "epoch": 0.35, + "learning_rate": 1.9928596511764015e-07, + "logits/chosen": -1.8691952228546143, + "logits/rejected": -2.334585189819336, + "logps/chosen": -525.2801513671875, + "logps/rejected": -241.25250244140625, + "loss": 0.4166, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0434932708740234, + "rewards/margins": 1.5601012706756592, + "rewards/rejected": -2.6035947799682617, + "step": 2999 + }, + { + "epoch": 0.35, + "learning_rate": 1.9925084864801593e-07, + "logits/chosen": -2.633796215057373, + "logits/rejected": -2.773170232772827, + "logps/chosen": -243.52151489257812, + "logps/rejected": -104.81770324707031, + "loss": 0.5157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8212080001831055, + "rewards/margins": 1.4868805408477783, + "rewards/rejected": -2.3080883026123047, + "step": 3000 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -1.7166334390640259, + "eval_logits/rejected": -1.5972741842269897, + "eval_logps/chosen": -299.1963806152344, + "eval_logps/rejected": -270.92608642578125, + "eval_loss": 0.34477534890174866, + "eval_rewards/accuracies": 0.8428571224212646, + "eval_rewards/chosen": -0.651619553565979, + "eval_rewards/margins": 1.9222490787506104, + "eval_rewards/rejected": -2.5738685131073, + "eval_runtime": 24.2332, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 1.444, + "step": 3000 + }, + { + "epoch": 0.35, + "learning_rate": 1.9921573217839166e-07, + "logits/chosen": -1.789344310760498, + "logits/rejected": -1.4893455505371094, + "logps/chosen": -299.3279724121094, + "logps/rejected": -360.33984375, + "loss": 0.6275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4908181130886078, + "rewards/margins": 1.4493180513381958, + "rewards/rejected": -1.9401360750198364, + "step": 3001 + }, + { + "epoch": 0.35, + "learning_rate": 1.9918061570876741e-07, + "logits/chosen": -2.744720220565796, + "logits/rejected": -2.6191134452819824, + "logps/chosen": -236.34432983398438, + "logps/rejected": -196.3087921142578, + "loss": 1.5161, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.677825450897217, + "rewards/margins": 0.74766606092453, + "rewards/rejected": -3.4254913330078125, + "step": 3002 + }, + { + "epoch": 0.35, + "learning_rate": 1.9914549923914314e-07, + "logits/chosen": -1.8557863235473633, + "logits/rejected": -1.982408046722412, + "logps/chosen": -408.7522277832031, + "logps/rejected": -308.29791259765625, + "loss": 0.3401, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8378155827522278, + "rewards/margins": 2.1910598278045654, + "rewards/rejected": -3.0288753509521484, + "step": 3003 + }, + { + "epoch": 0.35, + "learning_rate": 1.991103827695189e-07, + "logits/chosen": -2.271378993988037, + "logits/rejected": -2.060120105743408, + "logps/chosen": -161.41346740722656, + "logps/rejected": -202.39141845703125, + "loss": 0.9593, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5567625761032104, + "rewards/margins": 0.3391309380531311, + "rewards/rejected": -1.8958935737609863, + "step": 3004 + }, + { + "epoch": 0.35, + "learning_rate": 1.9907526629989465e-07, + "logits/chosen": -2.4045937061309814, + "logits/rejected": -2.3009300231933594, + "logps/chosen": -293.4571533203125, + "logps/rejected": -462.8999938964844, + "loss": 0.2805, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42263445258140564, + "rewards/margins": 2.580713987350464, + "rewards/rejected": -3.0033483505249023, + "step": 3005 + }, + { + "epoch": 0.35, + "learning_rate": 1.9904014983027038e-07, + "logits/chosen": -2.451859474182129, + "logits/rejected": -2.4052696228027344, + "logps/chosen": -223.12167358398438, + "logps/rejected": -328.9119873046875, + "loss": 0.3339, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07845830917358398, + "rewards/margins": 2.5996203422546387, + "rewards/rejected": -2.5211620330810547, + "step": 3006 + }, + { + "epoch": 0.35, + "learning_rate": 1.9900503336064613e-07, + "logits/chosen": -1.8816890716552734, + "logits/rejected": -2.004540205001831, + "logps/chosen": -167.40907287597656, + "logps/rejected": -163.78695678710938, + "loss": 0.6581, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8357162475585938, + "rewards/margins": 0.8606334328651428, + "rewards/rejected": -1.6963497400283813, + "step": 3007 + }, + { + "epoch": 0.35, + "learning_rate": 1.9896991689102188e-07, + "logits/chosen": -2.100127696990967, + "logits/rejected": -2.450413465499878, + "logps/chosen": -360.3726806640625, + "logps/rejected": -279.0927734375, + "loss": 0.0868, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3913275897502899, + "rewards/margins": 3.207730770111084, + "rewards/rejected": -3.599058151245117, + "step": 3008 + }, + { + "epoch": 0.35, + "learning_rate": 1.989348004213976e-07, + "logits/chosen": -2.9607675075531006, + "logits/rejected": -2.852139711380005, + "logps/chosen": -175.83885192871094, + "logps/rejected": -199.82408142089844, + "loss": 0.1753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6454261541366577, + "rewards/margins": 3.486232280731201, + "rewards/rejected": -4.131658554077148, + "step": 3009 + }, + { + "epoch": 0.35, + "learning_rate": 1.988996839517734e-07, + "logits/chosen": -2.678169012069702, + "logits/rejected": -2.682683229446411, + "logps/chosen": -234.0556182861328, + "logps/rejected": -282.1955261230469, + "loss": 0.2853, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2918631434440613, + "rewards/margins": 2.4242377281188965, + "rewards/rejected": -2.7161011695861816, + "step": 3010 + }, + { + "epoch": 0.35, + "learning_rate": 1.988645674821491e-07, + "logits/chosen": -2.7531421184539795, + "logits/rejected": -2.6905863285064697, + "logps/chosen": -477.92236328125, + "logps/rejected": -292.7354736328125, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0377458333969116, + "rewards/margins": 2.057008743286133, + "rewards/rejected": -3.094754695892334, + "step": 3011 + }, + { + "epoch": 0.35, + "learning_rate": 1.9882945101252487e-07, + "logits/chosen": -2.4630515575408936, + "logits/rejected": -2.5355727672576904, + "logps/chosen": -324.3270263671875, + "logps/rejected": -242.75184631347656, + "loss": 0.5466, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0043998956680298, + "rewards/margins": 0.9863296747207642, + "rewards/rejected": -1.990729570388794, + "step": 3012 + }, + { + "epoch": 0.35, + "learning_rate": 1.9879433454290063e-07, + "logits/chosen": -2.059372901916504, + "logits/rejected": -2.1642544269561768, + "logps/chosen": -287.0716857910156, + "logps/rejected": -326.9747314453125, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13886182010173798, + "rewards/margins": 3.6509668827056885, + "rewards/rejected": -3.7898287773132324, + "step": 3013 + }, + { + "epoch": 0.35, + "learning_rate": 1.9875921807327635e-07, + "logits/chosen": -2.3891358375549316, + "logits/rejected": -2.550356864929199, + "logps/chosen": -267.4891662597656, + "logps/rejected": -189.1055450439453, + "loss": 0.3365, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.084890604019165, + "rewards/margins": 2.1664576530456543, + "rewards/rejected": -3.2513482570648193, + "step": 3014 + }, + { + "epoch": 0.35, + "learning_rate": 1.987241016036521e-07, + "logits/chosen": -1.895287036895752, + "logits/rejected": -2.456604242324829, + "logps/chosen": -523.04150390625, + "logps/rejected": -246.94757080078125, + "loss": 0.3023, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24534504115581512, + "rewards/margins": 2.0529017448425293, + "rewards/rejected": -2.2982466220855713, + "step": 3015 + }, + { + "epoch": 0.35, + "learning_rate": 1.9868898513402786e-07, + "logits/chosen": -1.94840407371521, + "logits/rejected": -2.090705394744873, + "logps/chosen": -241.648681640625, + "logps/rejected": -284.85272216796875, + "loss": 0.3524, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06959345191717148, + "rewards/margins": 2.1778454780578613, + "rewards/rejected": -2.1082520484924316, + "step": 3016 + }, + { + "epoch": 0.35, + "learning_rate": 1.986538686644036e-07, + "logits/chosen": -2.460684299468994, + "logits/rejected": -2.2983155250549316, + "logps/chosen": -189.37342834472656, + "logps/rejected": -252.16354370117188, + "loss": 0.3549, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6813235282897949, + "rewards/margins": 2.233319044113159, + "rewards/rejected": -2.914642572402954, + "step": 3017 + }, + { + "epoch": 0.35, + "learning_rate": 1.9861875219477934e-07, + "logits/chosen": -2.3377127647399902, + "logits/rejected": -2.1506452560424805, + "logps/chosen": -173.37826538085938, + "logps/rejected": -282.6359558105469, + "loss": 0.2172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4615718722343445, + "rewards/margins": 2.6175129413604736, + "rewards/rejected": -3.079084873199463, + "step": 3018 + }, + { + "epoch": 0.35, + "learning_rate": 1.9858363572515507e-07, + "logits/chosen": -2.7534987926483154, + "logits/rejected": -2.8391380310058594, + "logps/chosen": -155.87136840820312, + "logps/rejected": -236.49903869628906, + "loss": 0.1943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45560821890830994, + "rewards/margins": 2.282127618789673, + "rewards/rejected": -2.7377357482910156, + "step": 3019 + }, + { + "epoch": 0.35, + "learning_rate": 1.9854851925553082e-07, + "logits/chosen": -2.3054213523864746, + "logits/rejected": -2.180716037750244, + "logps/chosen": -291.4873046875, + "logps/rejected": -414.41082763671875, + "loss": 0.3748, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6350138783454895, + "rewards/margins": 1.5918924808502197, + "rewards/rejected": -2.2269062995910645, + "step": 3020 + }, + { + "epoch": 0.35, + "learning_rate": 1.985134027859066e-07, + "logits/chosen": -2.4071128368377686, + "logits/rejected": -2.622795343399048, + "logps/chosen": -251.73825073242188, + "logps/rejected": -279.4241943359375, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5316616296768188, + "rewards/margins": 1.9053313732147217, + "rewards/rejected": -3.436992883682251, + "step": 3021 + }, + { + "epoch": 0.35, + "learning_rate": 1.984782863162823e-07, + "logits/chosen": -2.4879345893859863, + "logits/rejected": -2.3100900650024414, + "logps/chosen": -97.45529174804688, + "logps/rejected": -205.6041717529297, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4581340551376343, + "rewards/margins": 1.6856043338775635, + "rewards/rejected": -2.143738269805908, + "step": 3022 + }, + { + "epoch": 0.35, + "learning_rate": 1.9844316984665809e-07, + "logits/chosen": -2.8303823471069336, + "logits/rejected": -2.5651450157165527, + "logps/chosen": -145.82875061035156, + "logps/rejected": -229.27285766601562, + "loss": 0.5651, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1014127731323242, + "rewards/margins": 1.0699166059494019, + "rewards/rejected": -2.1713294982910156, + "step": 3023 + }, + { + "epoch": 0.35, + "learning_rate": 1.9840805337703384e-07, + "logits/chosen": -2.354445457458496, + "logits/rejected": -2.3588576316833496, + "logps/chosen": -458.847900390625, + "logps/rejected": -356.55426025390625, + "loss": 0.2541, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26197701692581177, + "rewards/margins": 2.77805757522583, + "rewards/rejected": -3.040034532546997, + "step": 3024 + }, + { + "epoch": 0.35, + "learning_rate": 1.9837293690740957e-07, + "logits/chosen": -2.245725393295288, + "logits/rejected": -1.9569470882415771, + "logps/chosen": -394.37249755859375, + "logps/rejected": -357.92291259765625, + "loss": 0.5119, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7189288139343262, + "rewards/margins": 1.6389424800872803, + "rewards/rejected": -2.3578712940216064, + "step": 3025 + }, + { + "epoch": 0.35, + "learning_rate": 1.9833782043778532e-07, + "logits/chosen": -2.0231523513793945, + "logits/rejected": -2.0626349449157715, + "logps/chosen": -265.54638671875, + "logps/rejected": -313.32861328125, + "loss": 0.6439, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4478292763233185, + "rewards/margins": 1.4635992050170898, + "rewards/rejected": -1.9114285707473755, + "step": 3026 + }, + { + "epoch": 0.35, + "learning_rate": 1.9830270396816105e-07, + "logits/chosen": -1.9457857608795166, + "logits/rejected": -1.8936662673950195, + "logps/chosen": -154.83279418945312, + "logps/rejected": -275.530517578125, + "loss": 0.2669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17719599604606628, + "rewards/margins": 3.050405502319336, + "rewards/rejected": -3.2276015281677246, + "step": 3027 + }, + { + "epoch": 0.35, + "learning_rate": 1.982675874985368e-07, + "logits/chosen": -2.028526544570923, + "logits/rejected": -1.9955010414123535, + "logps/chosen": -411.13519287109375, + "logps/rejected": -351.24395751953125, + "loss": 0.4415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5698676705360413, + "rewards/margins": 1.14314603805542, + "rewards/rejected": -1.713013768196106, + "step": 3028 + }, + { + "epoch": 0.35, + "learning_rate": 1.9823247102891256e-07, + "logits/chosen": -2.031445026397705, + "logits/rejected": -2.516256332397461, + "logps/chosen": -366.8381042480469, + "logps/rejected": -232.802734375, + "loss": 0.2399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18715572357177734, + "rewards/margins": 2.7652106285095215, + "rewards/rejected": -2.952366352081299, + "step": 3029 + }, + { + "epoch": 0.35, + "learning_rate": 1.9819735455928828e-07, + "logits/chosen": -2.2605247497558594, + "logits/rejected": -2.3022401332855225, + "logps/chosen": -193.1433563232422, + "logps/rejected": -218.81419372558594, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2972192168235779, + "rewards/margins": 2.2584376335144043, + "rewards/rejected": -2.555656909942627, + "step": 3030 + }, + { + "epoch": 0.35, + "learning_rate": 1.9816223808966404e-07, + "logits/chosen": -2.5075159072875977, + "logits/rejected": -2.5051798820495605, + "logps/chosen": -254.256103515625, + "logps/rejected": -269.2448425292969, + "loss": 0.3095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39885127544403076, + "rewards/margins": 3.1251063346862793, + "rewards/rejected": -3.5239574909210205, + "step": 3031 + }, + { + "epoch": 0.35, + "learning_rate": 1.9812712162003982e-07, + "logits/chosen": -2.3553318977355957, + "logits/rejected": -2.3362338542938232, + "logps/chosen": -404.2259216308594, + "logps/rejected": -272.86383056640625, + "loss": 0.3724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6922221183776855, + "rewards/margins": 1.7269587516784668, + "rewards/rejected": -2.4191808700561523, + "step": 3032 + }, + { + "epoch": 0.35, + "learning_rate": 1.9809200515041552e-07, + "logits/chosen": -2.368151903152466, + "logits/rejected": -2.1834053993225098, + "logps/chosen": -151.51754760742188, + "logps/rejected": -206.33499145507812, + "loss": 0.2959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7295833230018616, + "rewards/margins": 2.266834020614624, + "rewards/rejected": -2.99641752243042, + "step": 3033 + }, + { + "epoch": 0.35, + "learning_rate": 1.980568886807913e-07, + "logits/chosen": -2.508687973022461, + "logits/rejected": -2.6030423641204834, + "logps/chosen": -283.4814147949219, + "logps/rejected": -349.9670104980469, + "loss": 0.1621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8757786750793457, + "rewards/margins": 2.3308658599853516, + "rewards/rejected": -3.2066445350646973, + "step": 3034 + }, + { + "epoch": 0.35, + "learning_rate": 1.9802177221116703e-07, + "logits/chosen": -1.8700343370437622, + "logits/rejected": -2.027663230895996, + "logps/chosen": -303.87908935546875, + "logps/rejected": -249.9828338623047, + "loss": 0.5827, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5779990553855896, + "rewards/margins": 1.0196168422698975, + "rewards/rejected": -1.5976158380508423, + "step": 3035 + }, + { + "epoch": 0.35, + "learning_rate": 1.9798665574154278e-07, + "logits/chosen": -2.6184515953063965, + "logits/rejected": -2.427236795425415, + "logps/chosen": -175.18968200683594, + "logps/rejected": -252.38824462890625, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.083742618560791, + "rewards/margins": 1.8782062530517578, + "rewards/rejected": -2.961948871612549, + "step": 3036 + }, + { + "epoch": 0.35, + "learning_rate": 1.9795153927191853e-07, + "logits/chosen": -2.714168071746826, + "logits/rejected": -2.7034521102905273, + "logps/chosen": -149.73646545410156, + "logps/rejected": -168.17396545410156, + "loss": 0.2546, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1122044324874878, + "rewards/margins": 2.0885491371154785, + "rewards/rejected": -3.200753688812256, + "step": 3037 + }, + { + "epoch": 0.35, + "learning_rate": 1.9791642280229426e-07, + "logits/chosen": -2.7354719638824463, + "logits/rejected": -2.7778382301330566, + "logps/chosen": -203.30392456054688, + "logps/rejected": -201.02496337890625, + "loss": 0.5852, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.659195065498352, + "rewards/margins": 0.8067041635513306, + "rewards/rejected": -2.4658994674682617, + "step": 3038 + }, + { + "epoch": 0.35, + "learning_rate": 1.9788130633267001e-07, + "logits/chosen": -2.059082508087158, + "logits/rejected": -2.04524564743042, + "logps/chosen": -383.1037902832031, + "logps/rejected": -381.7314758300781, + "loss": 0.391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6839225888252258, + "rewards/margins": 1.894096851348877, + "rewards/rejected": -2.578019380569458, + "step": 3039 + }, + { + "epoch": 0.35, + "learning_rate": 1.9784618986304577e-07, + "logits/chosen": -1.9617093801498413, + "logits/rejected": -1.967954397201538, + "logps/chosen": -400.9415283203125, + "logps/rejected": -346.4884948730469, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5711531043052673, + "rewards/margins": 1.905151605606079, + "rewards/rejected": -2.476304769515991, + "step": 3040 + }, + { + "epoch": 0.35, + "learning_rate": 1.978110733934215e-07, + "logits/chosen": -2.5641603469848633, + "logits/rejected": -2.2641048431396484, + "logps/chosen": -184.38449096679688, + "logps/rejected": -253.1776123046875, + "loss": 0.3235, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5379083156585693, + "rewards/margins": 1.8655084371566772, + "rewards/rejected": -2.403416872024536, + "step": 3041 + }, + { + "epoch": 0.35, + "learning_rate": 1.9777595692379725e-07, + "logits/chosen": -2.4306087493896484, + "logits/rejected": -2.204993724822998, + "logps/chosen": -120.1019287109375, + "logps/rejected": -132.63375854492188, + "loss": 0.4617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7607610821723938, + "rewards/margins": 1.5513169765472412, + "rewards/rejected": -2.3120779991149902, + "step": 3042 + }, + { + "epoch": 0.35, + "learning_rate": 1.9774084045417298e-07, + "logits/chosen": -1.9727396965026855, + "logits/rejected": -2.4024806022644043, + "logps/chosen": -384.86846923828125, + "logps/rejected": -198.30233764648438, + "loss": 0.3557, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.979198932647705, + "rewards/margins": 1.5194718837738037, + "rewards/rejected": -3.498670816421509, + "step": 3043 + }, + { + "epoch": 0.35, + "learning_rate": 1.9770572398454876e-07, + "logits/chosen": -2.2516019344329834, + "logits/rejected": -2.2976086139678955, + "logps/chosen": -316.4281005859375, + "logps/rejected": -329.13916015625, + "loss": 0.3139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7456754446029663, + "rewards/margins": 1.1992517709732056, + "rewards/rejected": -1.9449272155761719, + "step": 3044 + }, + { + "epoch": 0.35, + "learning_rate": 1.976706075149245e-07, + "logits/chosen": -2.1720659732818604, + "logits/rejected": -2.423403024673462, + "logps/chosen": -342.974365234375, + "logps/rejected": -206.34664916992188, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8353420495986938, + "rewards/margins": 2.9444260597229004, + "rewards/rejected": -3.779768466949463, + "step": 3045 + }, + { + "epoch": 0.35, + "learning_rate": 1.9763549104530024e-07, + "logits/chosen": -2.470935344696045, + "logits/rejected": -2.5388543605804443, + "logps/chosen": -245.58798217773438, + "logps/rejected": -183.8765869140625, + "loss": 0.5553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6772194504737854, + "rewards/margins": 0.6888043880462646, + "rewards/rejected": -1.3660237789154053, + "step": 3046 + }, + { + "epoch": 0.35, + "learning_rate": 1.97600374575676e-07, + "logits/chosen": -2.118025302886963, + "logits/rejected": -2.315929412841797, + "logps/chosen": -581.44677734375, + "logps/rejected": -408.1746826171875, + "loss": 0.245, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0389560461044312, + "rewards/margins": 2.1920316219329834, + "rewards/rejected": -3.230987787246704, + "step": 3047 + }, + { + "epoch": 0.35, + "learning_rate": 1.9756525810605172e-07, + "logits/chosen": -2.5244383811950684, + "logits/rejected": -2.773857831954956, + "logps/chosen": -171.00390625, + "logps/rejected": -181.39263916015625, + "loss": 0.1365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19933444261550903, + "rewards/margins": 3.053353786468506, + "rewards/rejected": -3.252688407897949, + "step": 3048 + }, + { + "epoch": 0.35, + "learning_rate": 1.9753014163642747e-07, + "logits/chosen": -2.05407452583313, + "logits/rejected": -1.9726907014846802, + "logps/chosen": -416.7938232421875, + "logps/rejected": -307.5511169433594, + "loss": 0.5089, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05655696988105774, + "rewards/margins": 0.8088143467903137, + "rewards/rejected": -0.7522573471069336, + "step": 3049 + }, + { + "epoch": 0.35, + "learning_rate": 1.9749502516680323e-07, + "logits/chosen": -2.238893508911133, + "logits/rejected": -1.9854451417922974, + "logps/chosen": -307.86480712890625, + "logps/rejected": -353.8805236816406, + "loss": 0.3621, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0131475925445557, + "rewards/margins": 1.9116053581237793, + "rewards/rejected": -2.924752950668335, + "step": 3050 + }, + { + "epoch": 0.35, + "learning_rate": 1.9745990869717896e-07, + "logits/chosen": -2.8358078002929688, + "logits/rejected": -2.8740475177764893, + "logps/chosen": -171.73516845703125, + "logps/rejected": -255.96661376953125, + "loss": 0.1455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3181605935096741, + "rewards/margins": 3.705397129058838, + "rewards/rejected": -4.023557662963867, + "step": 3051 + }, + { + "epoch": 0.35, + "learning_rate": 1.974247922275547e-07, + "logits/chosen": -2.624497652053833, + "logits/rejected": -2.4893977642059326, + "logps/chosen": -109.17755126953125, + "logps/rejected": -184.54290771484375, + "loss": 0.1448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5805693864822388, + "rewards/margins": 2.169565200805664, + "rewards/rejected": -2.7501344680786133, + "step": 3052 + }, + { + "epoch": 0.35, + "learning_rate": 1.9738967575793046e-07, + "logits/chosen": -2.0912699699401855, + "logits/rejected": -2.291929244995117, + "logps/chosen": -293.0837707519531, + "logps/rejected": -207.50250244140625, + "loss": 0.8074, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6379456520080566, + "rewards/margins": 1.5711188316345215, + "rewards/rejected": -3.209064483642578, + "step": 3053 + }, + { + "epoch": 0.35, + "learning_rate": 1.973545592883062e-07, + "logits/chosen": -2.83964204788208, + "logits/rejected": -2.786771297454834, + "logps/chosen": -194.36178588867188, + "logps/rejected": -173.10472106933594, + "loss": 0.2458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5045759677886963, + "rewards/margins": 1.5834801197052002, + "rewards/rejected": -2.0880560874938965, + "step": 3054 + }, + { + "epoch": 0.35, + "learning_rate": 1.9731944281868197e-07, + "logits/chosen": -2.121267318725586, + "logits/rejected": -2.2974886894226074, + "logps/chosen": -310.3702087402344, + "logps/rejected": -406.0981750488281, + "loss": 0.3674, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5431795120239258, + "rewards/margins": 2.3805387020111084, + "rewards/rejected": -3.9237184524536133, + "step": 3055 + }, + { + "epoch": 0.35, + "learning_rate": 1.9728432634905767e-07, + "logits/chosen": -2.6816859245300293, + "logits/rejected": -2.6881606578826904, + "logps/chosen": -244.11094665527344, + "logps/rejected": -253.84808349609375, + "loss": 0.6495, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6864670515060425, + "rewards/margins": 1.2999563217163086, + "rewards/rejected": -2.9864232540130615, + "step": 3056 + }, + { + "epoch": 0.35, + "learning_rate": 1.9724920987943345e-07, + "logits/chosen": -1.9079676866531372, + "logits/rejected": -2.382317543029785, + "logps/chosen": -269.9123840332031, + "logps/rejected": -241.493408203125, + "loss": 0.2347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6471890211105347, + "rewards/margins": 2.7325236797332764, + "rewards/rejected": -3.3797125816345215, + "step": 3057 + }, + { + "epoch": 0.35, + "learning_rate": 1.972140934098092e-07, + "logits/chosen": -1.9801772832870483, + "logits/rejected": -2.229264974594116, + "logps/chosen": -395.3722229003906, + "logps/rejected": -308.0810241699219, + "loss": 0.2858, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8033021092414856, + "rewards/margins": 1.9745433330535889, + "rewards/rejected": -2.7778453826904297, + "step": 3058 + }, + { + "epoch": 0.35, + "learning_rate": 1.9717897694018493e-07, + "logits/chosen": -2.182541847229004, + "logits/rejected": -2.2158567905426025, + "logps/chosen": -493.890625, + "logps/rejected": -344.2589416503906, + "loss": 1.7317, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.575181245803833, + "rewards/margins": 0.9135735034942627, + "rewards/rejected": -3.4887545108795166, + "step": 3059 + }, + { + "epoch": 0.35, + "learning_rate": 1.971438604705607e-07, + "logits/chosen": -1.8914275169372559, + "logits/rejected": -2.227060317993164, + "logps/chosen": -454.4715270996094, + "logps/rejected": -310.7157897949219, + "loss": 0.6755, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48597514629364014, + "rewards/margins": 1.505975365638733, + "rewards/rejected": -1.991950511932373, + "step": 3060 + }, + { + "epoch": 0.35, + "learning_rate": 1.9710874400093644e-07, + "logits/chosen": -2.1391828060150146, + "logits/rejected": -1.734690546989441, + "logps/chosen": -229.7568817138672, + "logps/rejected": -395.66314697265625, + "loss": 0.2401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21032458543777466, + "rewards/margins": 2.364283323287964, + "rewards/rejected": -2.5746078491210938, + "step": 3061 + }, + { + "epoch": 0.35, + "learning_rate": 1.9707362753131217e-07, + "logits/chosen": -2.1401472091674805, + "logits/rejected": -1.8224642276763916, + "logps/chosen": -272.4087219238281, + "logps/rejected": -416.36773681640625, + "loss": 0.2002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13835987448692322, + "rewards/margins": 2.0641369819641113, + "rewards/rejected": -2.2024967670440674, + "step": 3062 + }, + { + "epoch": 0.35, + "learning_rate": 1.9703851106168792e-07, + "logits/chosen": -2.6472840309143066, + "logits/rejected": -2.6183671951293945, + "logps/chosen": -145.86767578125, + "logps/rejected": -198.318115234375, + "loss": 0.5155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7184522151947021, + "rewards/margins": 2.464923620223999, + "rewards/rejected": -3.183375835418701, + "step": 3063 + }, + { + "epoch": 0.35, + "learning_rate": 1.9700339459206365e-07, + "logits/chosen": -2.3170313835144043, + "logits/rejected": -2.180136203765869, + "logps/chosen": -245.0532684326172, + "logps/rejected": -305.136474609375, + "loss": 0.3665, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.765383243560791, + "rewards/margins": 3.4176173210144043, + "rewards/rejected": -4.183000564575195, + "step": 3064 + }, + { + "epoch": 0.35, + "learning_rate": 1.969682781224394e-07, + "logits/chosen": -2.1665427684783936, + "logits/rejected": -2.402552366256714, + "logps/chosen": -302.73028564453125, + "logps/rejected": -223.52674865722656, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3773943781852722, + "rewards/margins": 1.3770980834960938, + "rewards/rejected": -1.7544924020767212, + "step": 3065 + }, + { + "epoch": 0.35, + "learning_rate": 1.9693316165281518e-07, + "logits/chosen": -2.431758165359497, + "logits/rejected": -2.2825350761413574, + "logps/chosen": -136.91932678222656, + "logps/rejected": -206.8841552734375, + "loss": 0.5971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9623004198074341, + "rewards/margins": 1.2227402925491333, + "rewards/rejected": -2.1850407123565674, + "step": 3066 + }, + { + "epoch": 0.35, + "learning_rate": 1.9689804518319088e-07, + "logits/chosen": -2.4093406200408936, + "logits/rejected": -2.2813799381256104, + "logps/chosen": -186.6121826171875, + "logps/rejected": -259.08135986328125, + "loss": 0.6702, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5760273933410645, + "rewards/margins": 1.9162355661392212, + "rewards/rejected": -3.492262840270996, + "step": 3067 + }, + { + "epoch": 0.35, + "learning_rate": 1.9686292871356666e-07, + "logits/chosen": -2.0861093997955322, + "logits/rejected": -2.265065908432007, + "logps/chosen": -333.0917053222656, + "logps/rejected": -244.0816650390625, + "loss": 1.1969, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5404502153396606, + "rewards/margins": 0.8860201835632324, + "rewards/rejected": -2.4264705181121826, + "step": 3068 + }, + { + "epoch": 0.35, + "learning_rate": 1.9682781224394242e-07, + "logits/chosen": -2.7882227897644043, + "logits/rejected": -2.95322847366333, + "logps/chosen": -390.3488464355469, + "logps/rejected": -332.47344970703125, + "loss": 0.3246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3415570855140686, + "rewards/margins": 2.126649856567383, + "rewards/rejected": -2.4682068824768066, + "step": 3069 + }, + { + "epoch": 0.35, + "learning_rate": 1.9679269577431815e-07, + "logits/chosen": -2.9196791648864746, + "logits/rejected": -2.7260003089904785, + "logps/chosen": -280.9155578613281, + "logps/rejected": -325.5986328125, + "loss": 0.5735, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3533681035041809, + "rewards/margins": 2.8049263954162598, + "rewards/rejected": -3.158294439315796, + "step": 3070 + }, + { + "epoch": 0.35, + "learning_rate": 1.967575793046939e-07, + "logits/chosen": -2.5152106285095215, + "logits/rejected": -2.5231354236602783, + "logps/chosen": -294.64776611328125, + "logps/rejected": -274.64404296875, + "loss": 0.6495, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8904926776885986, + "rewards/margins": 2.2015767097473145, + "rewards/rejected": -4.092069625854492, + "step": 3071 + }, + { + "epoch": 0.35, + "learning_rate": 1.9672246283506963e-07, + "logits/chosen": -2.4056954383850098, + "logits/rejected": -2.275485038757324, + "logps/chosen": -168.9790802001953, + "logps/rejected": -252.24557495117188, + "loss": 0.1855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48062652349472046, + "rewards/margins": 2.072368860244751, + "rewards/rejected": -2.552995204925537, + "step": 3072 + }, + { + "epoch": 0.35, + "learning_rate": 1.9668734636544538e-07, + "logits/chosen": -2.636580228805542, + "logits/rejected": -2.627159357070923, + "logps/chosen": -213.55746459960938, + "logps/rejected": -267.0981140136719, + "loss": 0.412, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9743032455444336, + "rewards/margins": 2.0324957370758057, + "rewards/rejected": -3.00679874420166, + "step": 3073 + }, + { + "epoch": 0.35, + "learning_rate": 1.9665222989582113e-07, + "logits/chosen": -2.6682705879211426, + "logits/rejected": -2.6263504028320312, + "logps/chosen": -378.329345703125, + "logps/rejected": -269.3019104003906, + "loss": 0.8695, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1582900285720825, + "rewards/margins": 1.2688593864440918, + "rewards/rejected": -2.427149772644043, + "step": 3074 + }, + { + "epoch": 0.35, + "learning_rate": 1.9661711342619686e-07, + "logits/chosen": -2.938185691833496, + "logits/rejected": -2.9499502182006836, + "logps/chosen": -271.43310546875, + "logps/rejected": -241.18173217773438, + "loss": 0.1313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3584938645362854, + "rewards/margins": 2.451694965362549, + "rewards/rejected": -2.8101887702941895, + "step": 3075 + }, + { + "epoch": 0.35, + "learning_rate": 1.9658199695657262e-07, + "logits/chosen": -1.5976073741912842, + "logits/rejected": -1.6936099529266357, + "logps/chosen": -330.6689453125, + "logps/rejected": -326.1499328613281, + "loss": 0.4627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6099236607551575, + "rewards/margins": 1.049404263496399, + "rewards/rejected": -1.6593279838562012, + "step": 3076 + }, + { + "epoch": 0.35, + "learning_rate": 1.965468804869484e-07, + "logits/chosen": -2.397580146789551, + "logits/rejected": -2.3851218223571777, + "logps/chosen": -203.72518920898438, + "logps/rejected": -191.79678344726562, + "loss": 0.2594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0262641906738281, + "rewards/margins": 1.861830234527588, + "rewards/rejected": -2.888094425201416, + "step": 3077 + }, + { + "epoch": 0.35, + "learning_rate": 1.9651176401732412e-07, + "logits/chosen": -2.7409467697143555, + "logits/rejected": -2.678997039794922, + "logps/chosen": -176.47482299804688, + "logps/rejected": -172.74249267578125, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3239107131958008, + "rewards/margins": 1.591695785522461, + "rewards/rejected": -2.915606737136841, + "step": 3078 + }, + { + "epoch": 0.35, + "learning_rate": 1.9647664754769988e-07, + "logits/chosen": -2.6171398162841797, + "logits/rejected": -2.826530933380127, + "logps/chosen": -371.0113525390625, + "logps/rejected": -257.88458251953125, + "loss": 0.2567, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2669713497161865, + "rewards/margins": 2.931901454925537, + "rewards/rejected": -4.1988725662231445, + "step": 3079 + }, + { + "epoch": 0.36, + "learning_rate": 1.964415310780756e-07, + "logits/chosen": -2.6936144828796387, + "logits/rejected": -2.708340883255005, + "logps/chosen": -273.9858093261719, + "logps/rejected": -341.27313232421875, + "loss": 0.1527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9383569955825806, + "rewards/margins": 2.9387459754943848, + "rewards/rejected": -3.877102851867676, + "step": 3080 + }, + { + "epoch": 0.36, + "learning_rate": 1.9640641460845136e-07, + "logits/chosen": -2.8644633293151855, + "logits/rejected": -2.7488958835601807, + "logps/chosen": -93.93399047851562, + "logps/rejected": -189.11016845703125, + "loss": 0.3862, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8116876482963562, + "rewards/margins": 1.4883546829223633, + "rewards/rejected": -2.3000423908233643, + "step": 3081 + }, + { + "epoch": 0.36, + "learning_rate": 1.963712981388271e-07, + "logits/chosen": -2.9760260581970215, + "logits/rejected": -2.951469898223877, + "logps/chosen": -351.1231994628906, + "logps/rejected": -405.6488037109375, + "loss": 0.2185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4345223903656006, + "rewards/margins": 2.7017736434936523, + "rewards/rejected": -3.136296033859253, + "step": 3082 + }, + { + "epoch": 0.36, + "learning_rate": 1.9633618166920284e-07, + "logits/chosen": -2.191518545150757, + "logits/rejected": -2.067218542098999, + "logps/chosen": -301.13775634765625, + "logps/rejected": -385.52099609375, + "loss": 0.2251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8570387363433838, + "rewards/margins": 2.6553385257720947, + "rewards/rejected": -3.5123772621154785, + "step": 3083 + }, + { + "epoch": 0.36, + "learning_rate": 1.963010651995786e-07, + "logits/chosen": -2.499401330947876, + "logits/rejected": -2.5203988552093506, + "logps/chosen": -554.2305908203125, + "logps/rejected": -370.3623962402344, + "loss": 0.4881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8287330865859985, + "rewards/margins": 1.2809057235717773, + "rewards/rejected": -2.1096389293670654, + "step": 3084 + }, + { + "epoch": 0.36, + "learning_rate": 1.9626594872995435e-07, + "logits/chosen": -1.6547162532806396, + "logits/rejected": -1.8190226554870605, + "logps/chosen": -322.9851379394531, + "logps/rejected": -316.5318603515625, + "loss": 0.4983, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3578563332557678, + "rewards/margins": 1.1943999528884888, + "rewards/rejected": -1.5522563457489014, + "step": 3085 + }, + { + "epoch": 0.36, + "learning_rate": 1.9623083226033008e-07, + "logits/chosen": -2.130901575088501, + "logits/rejected": -2.1633713245391846, + "logps/chosen": -335.9841003417969, + "logps/rejected": -392.86199951171875, + "loss": 0.1261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2411862313747406, + "rewards/margins": 2.928396701812744, + "rewards/rejected": -3.1695828437805176, + "step": 3086 + }, + { + "epoch": 0.36, + "learning_rate": 1.9619571579070583e-07, + "logits/chosen": -1.8750816583633423, + "logits/rejected": -1.9302979707717896, + "logps/chosen": -421.949951171875, + "logps/rejected": -296.0797424316406, + "loss": 0.8546, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2667686939239502, + "rewards/margins": 0.6988991498947144, + "rewards/rejected": -1.965667724609375, + "step": 3087 + }, + { + "epoch": 0.36, + "learning_rate": 1.9616059932108156e-07, + "logits/chosen": -2.3375649452209473, + "logits/rejected": -2.357346534729004, + "logps/chosen": -218.85089111328125, + "logps/rejected": -205.3568572998047, + "loss": 0.3744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04795059561729431, + "rewards/margins": 1.9835255146026611, + "rewards/rejected": -2.0314760208129883, + "step": 3088 + }, + { + "epoch": 0.36, + "learning_rate": 1.9612548285145734e-07, + "logits/chosen": -2.1249706745147705, + "logits/rejected": -1.9641821384429932, + "logps/chosen": -438.9639892578125, + "logps/rejected": -417.05023193359375, + "loss": 0.6183, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.972953200340271, + "rewards/margins": 0.8396673798561096, + "rewards/rejected": -1.8126204013824463, + "step": 3089 + }, + { + "epoch": 0.36, + "learning_rate": 1.960903663818331e-07, + "logits/chosen": -2.5766024589538574, + "logits/rejected": -2.539412498474121, + "logps/chosen": -234.88967895507812, + "logps/rejected": -212.0004119873047, + "loss": 0.1801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7632539868354797, + "rewards/margins": 2.1959800720214844, + "rewards/rejected": -2.9592339992523193, + "step": 3090 + }, + { + "epoch": 0.36, + "learning_rate": 1.9605524991220882e-07, + "logits/chosen": -2.994640588760376, + "logits/rejected": -3.0014407634735107, + "logps/chosen": -169.81787109375, + "logps/rejected": -186.98971557617188, + "loss": 0.1379, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.036567091941833496, + "rewards/margins": 3.0206310749053955, + "rewards/rejected": -2.9840641021728516, + "step": 3091 + }, + { + "epoch": 0.36, + "learning_rate": 1.9602013344258457e-07, + "logits/chosen": -2.893916606903076, + "logits/rejected": -2.815884590148926, + "logps/chosen": -197.46664428710938, + "logps/rejected": -232.71583557128906, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8704132437705994, + "rewards/margins": 3.380824089050293, + "rewards/rejected": -4.251236915588379, + "step": 3092 + }, + { + "epoch": 0.36, + "learning_rate": 1.959850169729603e-07, + "logits/chosen": -2.451277732849121, + "logits/rejected": -2.4753665924072266, + "logps/chosen": -280.0081787109375, + "logps/rejected": -217.07664489746094, + "loss": 0.2794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10805094242095947, + "rewards/margins": 1.8000813722610474, + "rewards/rejected": -1.9081323146820068, + "step": 3093 + }, + { + "epoch": 0.36, + "learning_rate": 1.9594990050333605e-07, + "logits/chosen": -2.7866454124450684, + "logits/rejected": -2.801309823989868, + "logps/chosen": -231.65142822265625, + "logps/rejected": -289.06109619140625, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9167325496673584, + "rewards/margins": 1.7122331857681274, + "rewards/rejected": -2.6289656162261963, + "step": 3094 + }, + { + "epoch": 0.36, + "learning_rate": 1.959147840337118e-07, + "logits/chosen": -2.705172538757324, + "logits/rejected": -2.728128671646118, + "logps/chosen": -112.56613159179688, + "logps/rejected": -138.15182495117188, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2755625247955322, + "rewards/margins": 1.3419406414031982, + "rewards/rejected": -3.6175034046173096, + "step": 3095 + }, + { + "epoch": 0.36, + "learning_rate": 1.9587966756408753e-07, + "logits/chosen": -1.7396575212478638, + "logits/rejected": -1.983611822128296, + "logps/chosen": -390.5391845703125, + "logps/rejected": -357.83740234375, + "loss": 0.6411, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0106933116912842, + "rewards/margins": 2.872037172317505, + "rewards/rejected": -3.882730484008789, + "step": 3096 + }, + { + "epoch": 0.36, + "learning_rate": 1.958445510944633e-07, + "logits/chosen": -3.044234037399292, + "logits/rejected": -2.987459182739258, + "logps/chosen": -204.94882202148438, + "logps/rejected": -258.0638122558594, + "loss": 0.2523, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4091903567314148, + "rewards/margins": 1.9944565296173096, + "rewards/rejected": -2.403646945953369, + "step": 3097 + }, + { + "epoch": 0.36, + "learning_rate": 1.9580943462483904e-07, + "logits/chosen": -2.049899101257324, + "logits/rejected": -1.9063458442687988, + "logps/chosen": -278.515380859375, + "logps/rejected": -396.078369140625, + "loss": 0.6808, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3398776054382324, + "rewards/margins": 1.6139492988586426, + "rewards/rejected": -2.953826904296875, + "step": 3098 + }, + { + "epoch": 0.36, + "learning_rate": 1.9577431815521477e-07, + "logits/chosen": -2.2222976684570312, + "logits/rejected": -2.1944265365600586, + "logps/chosen": -266.6556701660156, + "logps/rejected": -310.6433410644531, + "loss": 0.2217, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32735779881477356, + "rewards/margins": 2.1852340698242188, + "rewards/rejected": -1.8578764200210571, + "step": 3099 + }, + { + "epoch": 0.36, + "learning_rate": 1.9573920168559055e-07, + "logits/chosen": -2.733588218688965, + "logits/rejected": -2.7694883346557617, + "logps/chosen": -313.8576354980469, + "logps/rejected": -223.81781005859375, + "loss": 0.3385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7956733107566833, + "rewards/margins": 1.539943814277649, + "rewards/rejected": -2.3356170654296875, + "step": 3100 + }, + { + "epoch": 0.36, + "learning_rate": 1.9570408521596625e-07, + "logits/chosen": -2.6132614612579346, + "logits/rejected": -2.284115791320801, + "logps/chosen": -158.18284606933594, + "logps/rejected": -263.92608642578125, + "loss": 0.4576, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9928967356681824, + "rewards/margins": 0.9434848427772522, + "rewards/rejected": -1.9363815784454346, + "step": 3101 + }, + { + "epoch": 0.36, + "learning_rate": 1.9566896874634203e-07, + "logits/chosen": -2.358706474304199, + "logits/rejected": -2.3560986518859863, + "logps/chosen": -264.48516845703125, + "logps/rejected": -181.67596435546875, + "loss": 0.9106, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.374334692955017, + "rewards/margins": 0.31768298149108887, + "rewards/rejected": -1.6920177936553955, + "step": 3102 + }, + { + "epoch": 0.36, + "learning_rate": 1.9563385227671778e-07, + "logits/chosen": -2.4334137439727783, + "logits/rejected": -2.5610246658325195, + "logps/chosen": -368.0516052246094, + "logps/rejected": -225.9927215576172, + "loss": 0.5819, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5697246193885803, + "rewards/margins": 0.6916254162788391, + "rewards/rejected": -1.2613499164581299, + "step": 3103 + }, + { + "epoch": 0.36, + "learning_rate": 1.955987358070935e-07, + "logits/chosen": -2.1283600330352783, + "logits/rejected": -2.7457265853881836, + "logps/chosen": -471.1671447753906, + "logps/rejected": -256.78045654296875, + "loss": 0.2227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22077012062072754, + "rewards/margins": 1.7984765768051147, + "rewards/rejected": -2.0192465782165527, + "step": 3104 + }, + { + "epoch": 0.36, + "learning_rate": 1.9556361933746927e-07, + "logits/chosen": -1.85600745677948, + "logits/rejected": -2.2078585624694824, + "logps/chosen": -355.10595703125, + "logps/rejected": -268.57586669921875, + "loss": 0.4232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8610683083534241, + "rewards/margins": 1.9228333234786987, + "rewards/rejected": -2.7839014530181885, + "step": 3105 + }, + { + "epoch": 0.36, + "learning_rate": 1.9552850286784502e-07, + "logits/chosen": -2.448249340057373, + "logits/rejected": -2.274521589279175, + "logps/chosen": -216.76406860351562, + "logps/rejected": -201.02037048339844, + "loss": 0.3956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6578423380851746, + "rewards/margins": 0.8620545268058777, + "rewards/rejected": -1.5198968648910522, + "step": 3106 + }, + { + "epoch": 0.36, + "learning_rate": 1.9549338639822075e-07, + "logits/chosen": -2.4913408756256104, + "logits/rejected": -2.5736358165740967, + "logps/chosen": -253.82777404785156, + "logps/rejected": -173.4739990234375, + "loss": 0.2279, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7932449579238892, + "rewards/margins": 2.3583695888519287, + "rewards/rejected": -3.1516144275665283, + "step": 3107 + }, + { + "epoch": 0.36, + "learning_rate": 1.954582699285965e-07, + "logits/chosen": -2.7557973861694336, + "logits/rejected": -2.7469382286071777, + "logps/chosen": -225.00689697265625, + "logps/rejected": -275.73321533203125, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.678947925567627, + "rewards/margins": 2.965020179748535, + "rewards/rejected": -3.643968105316162, + "step": 3108 + }, + { + "epoch": 0.36, + "learning_rate": 1.9542315345897223e-07, + "logits/chosen": -2.953249454498291, + "logits/rejected": -3.0433244705200195, + "logps/chosen": -167.44662475585938, + "logps/rejected": -231.55360412597656, + "loss": 0.2009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33681830763816833, + "rewards/margins": 2.1525216102600098, + "rewards/rejected": -2.48934006690979, + "step": 3109 + }, + { + "epoch": 0.36, + "learning_rate": 1.9538803698934798e-07, + "logits/chosen": -2.11419677734375, + "logits/rejected": -2.148578405380249, + "logps/chosen": -575.5578002929688, + "logps/rejected": -454.27679443359375, + "loss": 0.3975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7819559574127197, + "rewards/margins": 1.7788472175598145, + "rewards/rejected": -2.560802936553955, + "step": 3110 + }, + { + "epoch": 0.36, + "learning_rate": 1.9535292051972376e-07, + "logits/chosen": -1.967519760131836, + "logits/rejected": -2.044466018676758, + "logps/chosen": -324.3872375488281, + "logps/rejected": -340.9453125, + "loss": 0.3938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6307142972946167, + "rewards/margins": 2.123629093170166, + "rewards/rejected": -2.754343271255493, + "step": 3111 + }, + { + "epoch": 0.36, + "learning_rate": 1.953178040500995e-07, + "logits/chosen": -2.2476470470428467, + "logits/rejected": -2.4458794593811035, + "logps/chosen": -317.8234558105469, + "logps/rejected": -318.40771484375, + "loss": 0.5418, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9156367778778076, + "rewards/margins": 0.9835942983627319, + "rewards/rejected": -1.8992310762405396, + "step": 3112 + }, + { + "epoch": 0.36, + "learning_rate": 1.9528268758047524e-07, + "logits/chosen": -2.7319045066833496, + "logits/rejected": -2.5396342277526855, + "logps/chosen": -219.27993774414062, + "logps/rejected": -227.51889038085938, + "loss": 0.2745, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.205019235610962, + "rewards/margins": 1.685842514038086, + "rewards/rejected": -2.890861749649048, + "step": 3113 + }, + { + "epoch": 0.36, + "learning_rate": 1.95247571110851e-07, + "logits/chosen": -2.8659865856170654, + "logits/rejected": -2.930150032043457, + "logps/chosen": -221.16575622558594, + "logps/rejected": -168.13201904296875, + "loss": 0.2526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4587364196777344, + "rewards/margins": 2.0661933422088623, + "rewards/rejected": -2.524930000305176, + "step": 3114 + }, + { + "epoch": 0.36, + "learning_rate": 1.9521245464122673e-07, + "logits/chosen": -2.2860875129699707, + "logits/rejected": -2.3738760948181152, + "logps/chosen": -229.1325225830078, + "logps/rejected": -267.71209716796875, + "loss": 0.3689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7061110734939575, + "rewards/margins": 2.0722599029541016, + "rewards/rejected": -2.7783710956573486, + "step": 3115 + }, + { + "epoch": 0.36, + "learning_rate": 1.9517733817160248e-07, + "logits/chosen": -2.923823356628418, + "logits/rejected": -2.8451125621795654, + "logps/chosen": -198.5912628173828, + "logps/rejected": -259.2249755859375, + "loss": 0.1949, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.085289716720581, + "rewards/margins": 4.506181240081787, + "rewards/rejected": -5.591470718383789, + "step": 3116 + }, + { + "epoch": 0.36, + "learning_rate": 1.951422217019782e-07, + "logits/chosen": -2.1090176105499268, + "logits/rejected": -2.3265223503112793, + "logps/chosen": -344.79241943359375, + "logps/rejected": -266.1082458496094, + "loss": 0.1728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43658360838890076, + "rewards/margins": 2.7204813957214355, + "rewards/rejected": -3.157064914703369, + "step": 3117 + }, + { + "epoch": 0.36, + "learning_rate": 1.9510710523235396e-07, + "logits/chosen": -2.290546178817749, + "logits/rejected": -2.0674407482147217, + "logps/chosen": -69.42042541503906, + "logps/rejected": -163.2729034423828, + "loss": 0.2139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.251050740480423, + "rewards/margins": 3.08308744430542, + "rewards/rejected": -3.3341381549835205, + "step": 3118 + }, + { + "epoch": 0.36, + "learning_rate": 1.9507198876272971e-07, + "logits/chosen": -1.8392027616500854, + "logits/rejected": -2.1754140853881836, + "logps/chosen": -512.7687377929688, + "logps/rejected": -377.109375, + "loss": 0.3184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8156626224517822, + "rewards/margins": 2.5222249031066895, + "rewards/rejected": -3.337887763977051, + "step": 3119 + }, + { + "epoch": 0.36, + "learning_rate": 1.9503687229310544e-07, + "logits/chosen": -2.21523380279541, + "logits/rejected": -2.0741095542907715, + "logps/chosen": -339.77716064453125, + "logps/rejected": -367.8578796386719, + "loss": 0.3035, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.275016188621521, + "rewards/margins": 1.9592798948287964, + "rewards/rejected": -2.2342960834503174, + "step": 3120 + }, + { + "epoch": 0.36, + "learning_rate": 1.950017558234812e-07, + "logits/chosen": -2.4695303440093994, + "logits/rejected": -2.413843870162964, + "logps/chosen": -290.359130859375, + "logps/rejected": -287.7592468261719, + "loss": 0.8369, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5566847324371338, + "rewards/margins": 1.3117501735687256, + "rewards/rejected": -2.8684349060058594, + "step": 3121 + }, + { + "epoch": 0.36, + "learning_rate": 1.9496663935385698e-07, + "logits/chosen": -2.1927175521850586, + "logits/rejected": -2.387068748474121, + "logps/chosen": -592.9461059570312, + "logps/rejected": -414.6092529296875, + "loss": 0.3847, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6630598306655884, + "rewards/margins": 1.2250436544418335, + "rewards/rejected": -2.888103485107422, + "step": 3122 + }, + { + "epoch": 0.36, + "learning_rate": 1.949315228842327e-07, + "logits/chosen": -2.2820403575897217, + "logits/rejected": -2.2921924591064453, + "logps/chosen": -288.0614013671875, + "logps/rejected": -237.62484741210938, + "loss": 0.1989, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6354726552963257, + "rewards/margins": 3.9952552318573, + "rewards/rejected": -5.630727767944336, + "step": 3123 + }, + { + "epoch": 0.36, + "learning_rate": 1.9489640641460846e-07, + "logits/chosen": -2.73960542678833, + "logits/rejected": -2.5148730278015137, + "logps/chosen": -90.53556060791016, + "logps/rejected": -166.40301513671875, + "loss": 0.2807, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7397007346153259, + "rewards/margins": 2.3496108055114746, + "rewards/rejected": -3.0893115997314453, + "step": 3124 + }, + { + "epoch": 0.36, + "learning_rate": 1.9486128994498418e-07, + "logits/chosen": -2.3940839767456055, + "logits/rejected": -2.436276435852051, + "logps/chosen": -259.9599914550781, + "logps/rejected": -273.7616882324219, + "loss": 0.3631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6173297166824341, + "rewards/margins": 2.268190383911133, + "rewards/rejected": -2.8855199813842773, + "step": 3125 + }, + { + "epoch": 0.36, + "learning_rate": 1.9482617347535994e-07, + "logits/chosen": -2.249258041381836, + "logits/rejected": -2.2959647178649902, + "logps/chosen": -259.3506164550781, + "logps/rejected": -266.72509765625, + "loss": 0.5737, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1394498348236084, + "rewards/margins": 2.354887008666992, + "rewards/rejected": -3.4943368434906006, + "step": 3126 + }, + { + "epoch": 0.36, + "learning_rate": 1.947910570057357e-07, + "logits/chosen": -2.2917370796203613, + "logits/rejected": -2.5944464206695557, + "logps/chosen": -372.638671875, + "logps/rejected": -303.4380187988281, + "loss": 0.9097, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8796391487121582, + "rewards/margins": 1.3972399234771729, + "rewards/rejected": -3.276879072189331, + "step": 3127 + }, + { + "epoch": 0.36, + "learning_rate": 1.9475594053611142e-07, + "logits/chosen": -1.7941418886184692, + "logits/rejected": -1.989912986755371, + "logps/chosen": -340.1589660644531, + "logps/rejected": -252.43377685546875, + "loss": 0.6158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8707656860351562, + "rewards/margins": 1.479657769203186, + "rewards/rejected": -2.3504233360290527, + "step": 3128 + }, + { + "epoch": 0.36, + "learning_rate": 1.9472082406648717e-07, + "logits/chosen": -2.2199697494506836, + "logits/rejected": -2.3552141189575195, + "logps/chosen": -267.15625, + "logps/rejected": -248.58587646484375, + "loss": 0.4772, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6287648677825928, + "rewards/margins": 1.9179586172103882, + "rewards/rejected": -2.5467236042022705, + "step": 3129 + }, + { + "epoch": 0.36, + "learning_rate": 1.9468570759686293e-07, + "logits/chosen": -2.32252836227417, + "logits/rejected": -2.3431830406188965, + "logps/chosen": -314.8145446777344, + "logps/rejected": -305.54351806640625, + "loss": 0.4891, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1690998077392578, + "rewards/margins": 1.4230166673660278, + "rewards/rejected": -2.592116594314575, + "step": 3130 + }, + { + "epoch": 0.36, + "learning_rate": 1.9465059112723865e-07, + "logits/chosen": -3.020211696624756, + "logits/rejected": -2.941385269165039, + "logps/chosen": -391.8646545410156, + "logps/rejected": -246.7650146484375, + "loss": 0.5043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9495858550071716, + "rewards/margins": 1.6231026649475098, + "rewards/rejected": -2.572688579559326, + "step": 3131 + }, + { + "epoch": 0.36, + "learning_rate": 1.946154746576144e-07, + "logits/chosen": -2.678713798522949, + "logits/rejected": -2.7858426570892334, + "logps/chosen": -431.4220275878906, + "logps/rejected": -367.8868408203125, + "loss": 0.0657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6591325998306274, + "rewards/margins": 2.8996496200561523, + "rewards/rejected": -3.5587821006774902, + "step": 3132 + }, + { + "epoch": 0.36, + "learning_rate": 1.9458035818799014e-07, + "logits/chosen": -2.159714937210083, + "logits/rejected": -2.200671672821045, + "logps/chosen": -243.55859375, + "logps/rejected": -326.61126708984375, + "loss": 0.722, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3133256435394287, + "rewards/margins": 0.9063308238983154, + "rewards/rejected": -2.219656467437744, + "step": 3133 + }, + { + "epoch": 0.36, + "learning_rate": 1.9454524171836592e-07, + "logits/chosen": -2.6975247859954834, + "logits/rejected": -2.6923155784606934, + "logps/chosen": -206.96994018554688, + "logps/rejected": -193.20193481445312, + "loss": 0.6111, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.543494701385498, + "rewards/margins": 1.743812084197998, + "rewards/rejected": -3.287306785583496, + "step": 3134 + }, + { + "epoch": 0.36, + "learning_rate": 1.9451012524874167e-07, + "logits/chosen": -2.6292169094085693, + "logits/rejected": -2.22845721244812, + "logps/chosen": -185.2086944580078, + "logps/rejected": -293.89154052734375, + "loss": 0.1887, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3486311435699463, + "rewards/margins": 3.3579704761505127, + "rewards/rejected": -4.706602096557617, + "step": 3135 + }, + { + "epoch": 0.36, + "learning_rate": 1.944750087791174e-07, + "logits/chosen": -1.6885020732879639, + "logits/rejected": -1.882878065109253, + "logps/chosen": -634.0895385742188, + "logps/rejected": -589.9483642578125, + "loss": 0.4725, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5981866717338562, + "rewards/margins": 1.6001389026641846, + "rewards/rejected": -2.1983256340026855, + "step": 3136 + }, + { + "epoch": 0.36, + "learning_rate": 1.9443989230949315e-07, + "logits/chosen": -2.122853994369507, + "logits/rejected": -1.928478479385376, + "logps/chosen": -344.93133544921875, + "logps/rejected": -336.27587890625, + "loss": 0.6481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8181113600730896, + "rewards/margins": 0.7755937576293945, + "rewards/rejected": -1.593705177307129, + "step": 3137 + }, + { + "epoch": 0.36, + "learning_rate": 1.9440477583986888e-07, + "logits/chosen": -2.047321081161499, + "logits/rejected": -2.0595061779022217, + "logps/chosen": -266.11065673828125, + "logps/rejected": -245.34120178222656, + "loss": 0.7075, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7644503116607666, + "rewards/margins": 0.30063438415527344, + "rewards/rejected": -2.06508469581604, + "step": 3138 + }, + { + "epoch": 0.36, + "learning_rate": 1.9436965937024463e-07, + "logits/chosen": -2.6129813194274902, + "logits/rejected": -2.2734713554382324, + "logps/chosen": -146.06344604492188, + "logps/rejected": -218.6856231689453, + "loss": 0.3362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.349689781665802, + "rewards/margins": 2.944772243499756, + "rewards/rejected": -3.294462203979492, + "step": 3139 + }, + { + "epoch": 0.36, + "learning_rate": 1.9433454290062039e-07, + "logits/chosen": -1.8588626384735107, + "logits/rejected": -1.5602413415908813, + "logps/chosen": -192.5017547607422, + "logps/rejected": -290.65447998046875, + "loss": 2.205, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7896032333374023, + "rewards/margins": -0.2975519895553589, + "rewards/rejected": -2.492051124572754, + "step": 3140 + }, + { + "epoch": 0.36, + "learning_rate": 1.9429942643099611e-07, + "logits/chosen": -2.4264259338378906, + "logits/rejected": -2.504601001739502, + "logps/chosen": -589.9705200195312, + "logps/rejected": -331.6932373046875, + "loss": 1.0646, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5706617832183838, + "rewards/margins": 0.013243436813354492, + "rewards/rejected": -1.5839052200317383, + "step": 3141 + }, + { + "epoch": 0.36, + "learning_rate": 1.9426430996137187e-07, + "logits/chosen": -2.547720432281494, + "logits/rejected": -2.3032543659210205, + "logps/chosen": -369.7059020996094, + "logps/rejected": -322.2955322265625, + "loss": 0.6087, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2909533977508545, + "rewards/margins": 0.849115252494812, + "rewards/rejected": -2.140068769454956, + "step": 3142 + }, + { + "epoch": 0.36, + "learning_rate": 1.9422919349174762e-07, + "logits/chosen": -2.9959402084350586, + "logits/rejected": -2.957061529159546, + "logps/chosen": -273.4281311035156, + "logps/rejected": -301.74725341796875, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.090683102607727, + "rewards/margins": 2.1137866973876953, + "rewards/rejected": -3.204469919204712, + "step": 3143 + }, + { + "epoch": 0.36, + "learning_rate": 1.9419407702212335e-07, + "logits/chosen": -2.0835139751434326, + "logits/rejected": -2.5321035385131836, + "logps/chosen": -329.7633056640625, + "logps/rejected": -229.16006469726562, + "loss": 0.4924, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0666297674179077, + "rewards/margins": 2.1538100242614746, + "rewards/rejected": -3.220439910888672, + "step": 3144 + }, + { + "epoch": 0.36, + "learning_rate": 1.9415896055249913e-07, + "logits/chosen": -2.727970838546753, + "logits/rejected": -2.6290905475616455, + "logps/chosen": -235.81341552734375, + "logps/rejected": -225.3533935546875, + "loss": 0.1017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4768626093864441, + "rewards/margins": 2.973494529724121, + "rewards/rejected": -3.450357437133789, + "step": 3145 + }, + { + "epoch": 0.36, + "learning_rate": 1.9412384408287486e-07, + "logits/chosen": -2.529512882232666, + "logits/rejected": -2.800849676132202, + "logps/chosen": -197.0749053955078, + "logps/rejected": -202.7888946533203, + "loss": 0.3808, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.621638298034668, + "rewards/margins": 2.4556398391723633, + "rewards/rejected": -3.0772781372070312, + "step": 3146 + }, + { + "epoch": 0.36, + "learning_rate": 1.940887276132506e-07, + "logits/chosen": -2.4865036010742188, + "logits/rejected": -2.534517288208008, + "logps/chosen": -160.1866455078125, + "logps/rejected": -214.58453369140625, + "loss": 0.8181, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6082054376602173, + "rewards/margins": 0.7439761757850647, + "rewards/rejected": -2.3521816730499268, + "step": 3147 + }, + { + "epoch": 0.36, + "learning_rate": 1.9405361114362636e-07, + "logits/chosen": -2.6352553367614746, + "logits/rejected": -2.4044413566589355, + "logps/chosen": -143.63320922851562, + "logps/rejected": -208.7255401611328, + "loss": 0.3096, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6630328893661499, + "rewards/margins": 2.071988821029663, + "rewards/rejected": -2.7350218296051025, + "step": 3148 + }, + { + "epoch": 0.36, + "learning_rate": 1.940184946740021e-07, + "logits/chosen": -2.8205201625823975, + "logits/rejected": -2.860401153564453, + "logps/chosen": -236.8096160888672, + "logps/rejected": -155.61212158203125, + "loss": 0.9594, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.9655389785766602, + "rewards/margins": 0.19093842804431915, + "rewards/rejected": -2.156477451324463, + "step": 3149 + }, + { + "epoch": 0.36, + "learning_rate": 1.9398337820437785e-07, + "logits/chosen": -2.3246572017669678, + "logits/rejected": -2.388625144958496, + "logps/chosen": -240.45541381835938, + "logps/rejected": -207.9353790283203, + "loss": 0.3419, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0307608842849731, + "rewards/margins": 2.0065791606903076, + "rewards/rejected": -3.037339925765991, + "step": 3150 + }, + { + "epoch": 0.36, + "learning_rate": 1.939482617347536e-07, + "logits/chosen": -2.293090581893921, + "logits/rejected": -2.473846197128296, + "logps/chosen": -307.17236328125, + "logps/rejected": -237.19932556152344, + "loss": 0.3857, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.788901686668396, + "rewards/margins": 2.4225940704345703, + "rewards/rejected": -3.2114956378936768, + "step": 3151 + }, + { + "epoch": 0.36, + "learning_rate": 1.9391314526512933e-07, + "logits/chosen": -2.3599483966827393, + "logits/rejected": -2.3356521129608154, + "logps/chosen": -200.3927459716797, + "logps/rejected": -251.86972045898438, + "loss": 0.2443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5437919497489929, + "rewards/margins": 2.74511456489563, + "rewards/rejected": -3.2889063358306885, + "step": 3152 + }, + { + "epoch": 0.36, + "learning_rate": 1.9387802879550508e-07, + "logits/chosen": -2.225285053253174, + "logits/rejected": -2.1645073890686035, + "logps/chosen": -479.7649841308594, + "logps/rejected": -444.18133544921875, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.321206271648407, + "rewards/margins": 3.589596748352051, + "rewards/rejected": -3.9108023643493652, + "step": 3153 + }, + { + "epoch": 0.36, + "learning_rate": 1.938429123258808e-07, + "logits/chosen": -2.553158760070801, + "logits/rejected": -2.6542296409606934, + "logps/chosen": -386.17510986328125, + "logps/rejected": -428.2359924316406, + "loss": 0.1524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6536270976066589, + "rewards/margins": 2.4196243286132812, + "rewards/rejected": -3.073251247406006, + "step": 3154 + }, + { + "epoch": 0.36, + "learning_rate": 1.9380779585625656e-07, + "logits/chosen": -1.8198652267456055, + "logits/rejected": -2.300130605697632, + "logps/chosen": -296.6413879394531, + "logps/rejected": -174.77554321289062, + "loss": 1.2034, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.502197742462158, + "rewards/margins": 1.0123283863067627, + "rewards/rejected": -3.514526128768921, + "step": 3155 + }, + { + "epoch": 0.36, + "learning_rate": 1.9377267938663234e-07, + "logits/chosen": -1.8930219411849976, + "logits/rejected": -2.276884078979492, + "logps/chosen": -486.91943359375, + "logps/rejected": -224.1693572998047, + "loss": 0.5595, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0209718942642212, + "rewards/margins": 0.9149029850959778, + "rewards/rejected": -1.9358747005462646, + "step": 3156 + }, + { + "epoch": 0.36, + "learning_rate": 1.9373756291700807e-07, + "logits/chosen": -2.44633150100708, + "logits/rejected": -2.463796615600586, + "logps/chosen": -242.73623657226562, + "logps/rejected": -195.86614990234375, + "loss": 0.419, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.196081280708313, + "rewards/margins": 0.9482672214508057, + "rewards/rejected": -2.144348621368408, + "step": 3157 + }, + { + "epoch": 0.36, + "learning_rate": 1.9370244644738382e-07, + "logits/chosen": -2.480922222137451, + "logits/rejected": -2.054884433746338, + "logps/chosen": -144.59739685058594, + "logps/rejected": -309.75067138671875, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4461123943328857, + "rewards/margins": 0.8797208666801453, + "rewards/rejected": -2.325833320617676, + "step": 3158 + }, + { + "epoch": 0.36, + "learning_rate": 1.9366732997775958e-07, + "logits/chosen": -2.1165711879730225, + "logits/rejected": -2.5368566513061523, + "logps/chosen": -412.7290344238281, + "logps/rejected": -288.749755859375, + "loss": 0.8718, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4805212020874023, + "rewards/margins": 0.7513979077339172, + "rewards/rejected": -2.231919288635254, + "step": 3159 + }, + { + "epoch": 0.36, + "learning_rate": 1.936322135081353e-07, + "logits/chosen": -1.8120417594909668, + "logits/rejected": -2.1126718521118164, + "logps/chosen": -307.33233642578125, + "logps/rejected": -259.35748291015625, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31942838430404663, + "rewards/margins": 1.3321106433868408, + "rewards/rejected": -1.6515390872955322, + "step": 3160 + }, + { + "epoch": 0.36, + "learning_rate": 1.9359709703851106e-07, + "logits/chosen": -2.270761251449585, + "logits/rejected": -2.5394442081451416, + "logps/chosen": -411.61865234375, + "logps/rejected": -303.97113037109375, + "loss": 0.4731, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3018410205841064, + "rewards/margins": 1.9424198865890503, + "rewards/rejected": -3.244260787963867, + "step": 3161 + }, + { + "epoch": 0.36, + "learning_rate": 1.9356198056888679e-07, + "logits/chosen": -1.7548669576644897, + "logits/rejected": -2.0555050373077393, + "logps/chosen": -289.2738342285156, + "logps/rejected": -237.4573211669922, + "loss": 0.3137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.811498761177063, + "rewards/margins": 1.9747257232666016, + "rewards/rejected": -2.786224603652954, + "step": 3162 + }, + { + "epoch": 0.36, + "learning_rate": 1.9352686409926254e-07, + "logits/chosen": -1.9954984188079834, + "logits/rejected": -2.0585529804229736, + "logps/chosen": -385.6422424316406, + "logps/rejected": -340.9436950683594, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7717919945716858, + "rewards/margins": 2.153120756149292, + "rewards/rejected": -2.924912691116333, + "step": 3163 + }, + { + "epoch": 0.36, + "learning_rate": 1.934917476296383e-07, + "logits/chosen": -2.520843029022217, + "logits/rejected": -2.8270926475524902, + "logps/chosen": -256.1054382324219, + "logps/rejected": -227.4391632080078, + "loss": 0.4475, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6019074320793152, + "rewards/margins": 2.1560440063476562, + "rewards/rejected": -2.757951498031616, + "step": 3164 + }, + { + "epoch": 0.36, + "learning_rate": 1.9345663116001402e-07, + "logits/chosen": -2.5477278232574463, + "logits/rejected": -2.7910988330841064, + "logps/chosen": -184.17681884765625, + "logps/rejected": -241.90834045410156, + "loss": 0.3524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1395983695983887, + "rewards/margins": 3.4331302642822266, + "rewards/rejected": -4.572729110717773, + "step": 3165 + }, + { + "epoch": 0.36, + "learning_rate": 1.9342151469038977e-07, + "logits/chosen": -1.5526047945022583, + "logits/rejected": -1.5861364603042603, + "logps/chosen": -293.877197265625, + "logps/rejected": -411.8284912109375, + "loss": 0.4312, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9234793186187744, + "rewards/margins": 1.4820499420166016, + "rewards/rejected": -2.405529260635376, + "step": 3166 + }, + { + "epoch": 0.37, + "learning_rate": 1.9338639822076556e-07, + "logits/chosen": -2.4230310916900635, + "logits/rejected": -2.4075770378112793, + "logps/chosen": -152.56149291992188, + "logps/rejected": -189.22897338867188, + "loss": 0.5846, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2053780555725098, + "rewards/margins": 0.8751765489578247, + "rewards/rejected": -2.080554485321045, + "step": 3167 + }, + { + "epoch": 0.37, + "learning_rate": 1.9335128175114128e-07, + "logits/chosen": -2.8583149909973145, + "logits/rejected": -2.7324233055114746, + "logps/chosen": -257.2450866699219, + "logps/rejected": -289.2914123535156, + "loss": 0.3224, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4822622537612915, + "rewards/margins": 1.374607801437378, + "rewards/rejected": -1.8568699359893799, + "step": 3168 + }, + { + "epoch": 0.37, + "learning_rate": 1.9331616528151704e-07, + "logits/chosen": -2.385619640350342, + "logits/rejected": -2.6156492233276367, + "logps/chosen": -411.13232421875, + "logps/rejected": -275.4765625, + "loss": 0.3263, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4704168140888214, + "rewards/margins": 2.900026559829712, + "rewards/rejected": -3.370443344116211, + "step": 3169 + }, + { + "epoch": 0.37, + "learning_rate": 1.9328104881189276e-07, + "logits/chosen": -3.0463554859161377, + "logits/rejected": -3.0510611534118652, + "logps/chosen": -246.25254821777344, + "logps/rejected": -286.6976318359375, + "loss": 0.1483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9627786874771118, + "rewards/margins": 3.5651252269744873, + "rewards/rejected": -4.5279035568237305, + "step": 3170 + }, + { + "epoch": 0.37, + "learning_rate": 1.9324593234226852e-07, + "logits/chosen": -2.4321954250335693, + "logits/rejected": -2.3862221240997314, + "logps/chosen": -392.3564453125, + "logps/rejected": -289.9694519042969, + "loss": 0.2663, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.015571117401123, + "rewards/margins": 1.6141798496246338, + "rewards/rejected": -3.629750967025757, + "step": 3171 + }, + { + "epoch": 0.37, + "learning_rate": 1.9321081587264427e-07, + "logits/chosen": -2.4266791343688965, + "logits/rejected": -2.7360610961914062, + "logps/chosen": -222.05633544921875, + "logps/rejected": -142.28854370117188, + "loss": 1.0277, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3004878759384155, + "rewards/margins": 0.8119280934333801, + "rewards/rejected": -2.1124160289764404, + "step": 3172 + }, + { + "epoch": 0.37, + "learning_rate": 1.9317569940302e-07, + "logits/chosen": -2.295703887939453, + "logits/rejected": -2.2835707664489746, + "logps/chosen": -566.7529907226562, + "logps/rejected": -591.857421875, + "loss": 0.2359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4898945689201355, + "rewards/margins": 4.064135551452637, + "rewards/rejected": -4.554029941558838, + "step": 3173 + }, + { + "epoch": 0.37, + "learning_rate": 1.9314058293339575e-07, + "logits/chosen": -2.151926279067993, + "logits/rejected": -2.4233593940734863, + "logps/chosen": -256.9464111328125, + "logps/rejected": -360.1933898925781, + "loss": 0.4145, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8165274858474731, + "rewards/margins": 2.2151405811309814, + "rewards/rejected": -3.031668186187744, + "step": 3174 + }, + { + "epoch": 0.37, + "learning_rate": 1.931054664637715e-07, + "logits/chosen": -2.1272058486938477, + "logits/rejected": -2.2995123863220215, + "logps/chosen": -281.47576904296875, + "logps/rejected": -251.13861083984375, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7271847128868103, + "rewards/margins": 2.5587656497955322, + "rewards/rejected": -3.285950183868408, + "step": 3175 + }, + { + "epoch": 0.37, + "learning_rate": 1.9307034999414723e-07, + "logits/chosen": -2.2978949546813965, + "logits/rejected": -2.7157726287841797, + "logps/chosen": -454.29730224609375, + "logps/rejected": -229.4456787109375, + "loss": 0.4662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8970307111740112, + "rewards/margins": 1.3795546293258667, + "rewards/rejected": -2.276585340499878, + "step": 3176 + }, + { + "epoch": 0.37, + "learning_rate": 1.93035233524523e-07, + "logits/chosen": -2.630176305770874, + "logits/rejected": -2.808920383453369, + "logps/chosen": -225.8303680419922, + "logps/rejected": -114.7637939453125, + "loss": 0.4919, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8557724952697754, + "rewards/margins": 2.07440447807312, + "rewards/rejected": -2.9301769733428955, + "step": 3177 + }, + { + "epoch": 0.37, + "learning_rate": 1.9300011705489872e-07, + "logits/chosen": -1.925976276397705, + "logits/rejected": -2.2158827781677246, + "logps/chosen": -406.74566650390625, + "logps/rejected": -294.28564453125, + "loss": 0.2695, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0678925514221191, + "rewards/margins": 2.835362672805786, + "rewards/rejected": -3.9032554626464844, + "step": 3178 + }, + { + "epoch": 0.37, + "learning_rate": 1.929650005852745e-07, + "logits/chosen": -2.336238384246826, + "logits/rejected": -2.2873973846435547, + "logps/chosen": -406.5097351074219, + "logps/rejected": -228.9411163330078, + "loss": 0.2437, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9119406938552856, + "rewards/margins": 3.1231937408447266, + "rewards/rejected": -5.035134315490723, + "step": 3179 + }, + { + "epoch": 0.37, + "learning_rate": 1.9292988411565025e-07, + "logits/chosen": -1.955564260482788, + "logits/rejected": -2.266599178314209, + "logps/chosen": -290.6675109863281, + "logps/rejected": -269.1775207519531, + "loss": 0.4904, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0227714776992798, + "rewards/margins": 2.2173829078674316, + "rewards/rejected": -3.240154504776001, + "step": 3180 + }, + { + "epoch": 0.37, + "learning_rate": 1.9289476764602598e-07, + "logits/chosen": -2.1222341060638428, + "logits/rejected": -2.464975357055664, + "logps/chosen": -338.0621032714844, + "logps/rejected": -263.79986572265625, + "loss": 0.1953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33961015939712524, + "rewards/margins": 2.494269609451294, + "rewards/rejected": -2.8338797092437744, + "step": 3181 + }, + { + "epoch": 0.37, + "learning_rate": 1.9285965117640173e-07, + "logits/chosen": -1.609241247177124, + "logits/rejected": -2.0293006896972656, + "logps/chosen": -423.5540771484375, + "logps/rejected": -257.517578125, + "loss": 0.5425, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4015200138092041, + "rewards/margins": 1.1126656532287598, + "rewards/rejected": -1.5141856670379639, + "step": 3182 + }, + { + "epoch": 0.37, + "learning_rate": 1.9282453470677748e-07, + "logits/chosen": -2.233884334564209, + "logits/rejected": -2.593989849090576, + "logps/chosen": -226.5009765625, + "logps/rejected": -166.2117919921875, + "loss": 0.3006, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10337947309017181, + "rewards/margins": 1.7184010744094849, + "rewards/rejected": -1.8217806816101074, + "step": 3183 + }, + { + "epoch": 0.37, + "learning_rate": 1.927894182371532e-07, + "logits/chosen": -2.377129077911377, + "logits/rejected": -2.04062557220459, + "logps/chosen": -377.87103271484375, + "logps/rejected": -370.0451965332031, + "loss": 0.7869, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.392898440361023, + "rewards/margins": 0.7684478759765625, + "rewards/rejected": -2.161346197128296, + "step": 3184 + }, + { + "epoch": 0.37, + "learning_rate": 1.9275430176752897e-07, + "logits/chosen": -2.2035787105560303, + "logits/rejected": -2.4938888549804688, + "logps/chosen": -142.78274536132812, + "logps/rejected": -130.8389892578125, + "loss": 0.5657, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0613963603973389, + "rewards/margins": 1.478614330291748, + "rewards/rejected": -2.540010690689087, + "step": 3185 + }, + { + "epoch": 0.37, + "learning_rate": 1.927191852979047e-07, + "logits/chosen": -2.460076332092285, + "logits/rejected": -2.32920503616333, + "logps/chosen": -112.0361099243164, + "logps/rejected": -235.18792724609375, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.117453098297119, + "rewards/margins": 3.3901333808898926, + "rewards/rejected": -5.507586479187012, + "step": 3186 + }, + { + "epoch": 0.37, + "learning_rate": 1.9268406882828045e-07, + "logits/chosen": -2.873163938522339, + "logits/rejected": -2.880739688873291, + "logps/chosen": -238.04754638671875, + "logps/rejected": -319.43597412109375, + "loss": 0.4271, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4845625162124634, + "rewards/margins": 2.04876708984375, + "rewards/rejected": -3.533329486846924, + "step": 3187 + }, + { + "epoch": 0.37, + "learning_rate": 1.926489523586562e-07, + "logits/chosen": -2.2296881675720215, + "logits/rejected": -2.1942460536956787, + "logps/chosen": -361.3300476074219, + "logps/rejected": -315.4604797363281, + "loss": 0.2631, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4950950145721436, + "rewards/margins": 2.7636637687683105, + "rewards/rejected": -4.258758544921875, + "step": 3188 + }, + { + "epoch": 0.37, + "learning_rate": 1.9261383588903193e-07, + "logits/chosen": -2.50163197517395, + "logits/rejected": -2.5528688430786133, + "logps/chosen": -317.81011962890625, + "logps/rejected": -217.2755889892578, + "loss": 0.1912, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0006332397460938, + "rewards/margins": 2.885354995727539, + "rewards/rejected": -3.885988235473633, + "step": 3189 + }, + { + "epoch": 0.37, + "learning_rate": 1.925787194194077e-07, + "logits/chosen": -2.409959077835083, + "logits/rejected": -2.1178719997406006, + "logps/chosen": -153.40869140625, + "logps/rejected": -298.6468505859375, + "loss": 0.5181, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8298629522323608, + "rewards/margins": 2.1785049438476562, + "rewards/rejected": -4.008367538452148, + "step": 3190 + }, + { + "epoch": 0.37, + "learning_rate": 1.9254360294978344e-07, + "logits/chosen": -2.5554070472717285, + "logits/rejected": -2.421757221221924, + "logps/chosen": -227.7171173095703, + "logps/rejected": -249.70437622070312, + "loss": 0.2586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9105822443962097, + "rewards/margins": 1.962022304534912, + "rewards/rejected": -2.8726046085357666, + "step": 3191 + }, + { + "epoch": 0.37, + "learning_rate": 1.925084864801592e-07, + "logits/chosen": -2.765136480331421, + "logits/rejected": -2.4327807426452637, + "logps/chosen": -220.33944702148438, + "logps/rejected": -387.0235290527344, + "loss": 0.1592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.849673330783844, + "rewards/margins": 2.8943309783935547, + "rewards/rejected": -3.744004726409912, + "step": 3192 + }, + { + "epoch": 0.37, + "learning_rate": 1.9247337001053494e-07, + "logits/chosen": -1.6743366718292236, + "logits/rejected": -1.8045693635940552, + "logps/chosen": -327.3912353515625, + "logps/rejected": -356.3032531738281, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6848108768463135, + "rewards/margins": 1.8299883604049683, + "rewards/rejected": -2.5147995948791504, + "step": 3193 + }, + { + "epoch": 0.37, + "learning_rate": 1.9243825354091067e-07, + "logits/chosen": -2.647738456726074, + "logits/rejected": -2.730057954788208, + "logps/chosen": -228.40725708007812, + "logps/rejected": -206.15464782714844, + "loss": 0.9032, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5878829956054688, + "rewards/margins": 1.9105770587921143, + "rewards/rejected": -3.498460054397583, + "step": 3194 + }, + { + "epoch": 0.37, + "learning_rate": 1.9240313707128642e-07, + "logits/chosen": -2.163642406463623, + "logits/rejected": -1.9137470722198486, + "logps/chosen": -120.30136108398438, + "logps/rejected": -171.73301696777344, + "loss": 0.4343, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32558852434158325, + "rewards/margins": 0.7422335147857666, + "rewards/rejected": -1.067821979522705, + "step": 3195 + }, + { + "epoch": 0.37, + "learning_rate": 1.9236802060166218e-07, + "logits/chosen": -2.1842808723449707, + "logits/rejected": -2.424293041229248, + "logps/chosen": -389.4468994140625, + "logps/rejected": -312.48150634765625, + "loss": 0.4521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5881210565567017, + "rewards/margins": 1.0983647108078003, + "rewards/rejected": -1.686485767364502, + "step": 3196 + }, + { + "epoch": 0.37, + "learning_rate": 1.923329041320379e-07, + "logits/chosen": -2.3824918270111084, + "logits/rejected": -2.378804922103882, + "logps/chosen": -193.2406005859375, + "logps/rejected": -254.7410125732422, + "loss": 0.675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.607877790927887, + "rewards/margins": 2.6403748989105225, + "rewards/rejected": -3.2482526302337646, + "step": 3197 + }, + { + "epoch": 0.37, + "learning_rate": 1.9229778766241366e-07, + "logits/chosen": -2.173361301422119, + "logits/rejected": -2.4532816410064697, + "logps/chosen": -421.20159912109375, + "logps/rejected": -265.6947937011719, + "loss": 0.1063, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12034028768539429, + "rewards/margins": 2.270887613296509, + "rewards/rejected": -2.1505472660064697, + "step": 3198 + }, + { + "epoch": 0.37, + "learning_rate": 1.922626711927894e-07, + "logits/chosen": -2.6547937393188477, + "logits/rejected": -2.7644855976104736, + "logps/chosen": -357.8065185546875, + "logps/rejected": -209.68801879882812, + "loss": 0.353, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6381357908248901, + "rewards/margins": 1.61263906955719, + "rewards/rejected": -3.25077486038208, + "step": 3199 + }, + { + "epoch": 0.37, + "learning_rate": 1.9222755472316514e-07, + "logits/chosen": -2.6238749027252197, + "logits/rejected": -2.6273999214172363, + "logps/chosen": -357.6358642578125, + "logps/rejected": -371.8435363769531, + "loss": 0.3555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1801223754882812, + "rewards/margins": 1.8048169612884521, + "rewards/rejected": -2.9849390983581543, + "step": 3200 + }, + { + "epoch": 0.37, + "learning_rate": 1.9219243825354092e-07, + "logits/chosen": -2.1464922428131104, + "logits/rejected": -2.027754783630371, + "logps/chosen": -189.87835693359375, + "logps/rejected": -304.1450500488281, + "loss": 0.5331, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4089159965515137, + "rewards/margins": 2.7268471717834473, + "rewards/rejected": -4.135763168334961, + "step": 3201 + }, + { + "epoch": 0.37, + "learning_rate": 1.9215732178391665e-07, + "logits/chosen": -2.3675038814544678, + "logits/rejected": -2.5716726779937744, + "logps/chosen": -297.38104248046875, + "logps/rejected": -274.4926452636719, + "loss": 0.21, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28225505352020264, + "rewards/margins": 3.6043074131011963, + "rewards/rejected": -3.8865623474121094, + "step": 3202 + }, + { + "epoch": 0.37, + "learning_rate": 1.921222053142924e-07, + "logits/chosen": -2.1412200927734375, + "logits/rejected": -2.1792726516723633, + "logps/chosen": -305.129638671875, + "logps/rejected": -214.63787841796875, + "loss": 0.4223, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8028964400291443, + "rewards/margins": 1.1804964542388916, + "rewards/rejected": -1.9833929538726807, + "step": 3203 + }, + { + "epoch": 0.37, + "learning_rate": 1.9208708884466816e-07, + "logits/chosen": -2.7087883949279785, + "logits/rejected": -2.887730121612549, + "logps/chosen": -293.93792724609375, + "logps/rejected": -279.66851806640625, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3119114339351654, + "rewards/margins": 2.9990925788879395, + "rewards/rejected": -3.3110039234161377, + "step": 3204 + }, + { + "epoch": 0.37, + "learning_rate": 1.9205197237504388e-07, + "logits/chosen": -2.2404468059539795, + "logits/rejected": -2.4750657081604004, + "logps/chosen": -406.7594299316406, + "logps/rejected": -293.9046630859375, + "loss": 0.2918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7959896326065063, + "rewards/margins": 3.271254777908325, + "rewards/rejected": -4.067244529724121, + "step": 3205 + }, + { + "epoch": 0.37, + "learning_rate": 1.9201685590541964e-07, + "logits/chosen": -2.098480701446533, + "logits/rejected": -2.165003776550293, + "logps/chosen": -370.3039855957031, + "logps/rejected": -231.72804260253906, + "loss": 0.3645, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8581496477127075, + "rewards/margins": 1.5684837102890015, + "rewards/rejected": -2.426633358001709, + "step": 3206 + }, + { + "epoch": 0.37, + "learning_rate": 1.9198173943579537e-07, + "logits/chosen": -2.2520320415496826, + "logits/rejected": -2.2654008865356445, + "logps/chosen": -424.7441101074219, + "logps/rejected": -349.6483459472656, + "loss": 0.1864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3030763268470764, + "rewards/margins": 1.975055456161499, + "rewards/rejected": -2.2781317234039307, + "step": 3207 + }, + { + "epoch": 0.37, + "learning_rate": 1.9194662296617112e-07, + "logits/chosen": -2.433889865875244, + "logits/rejected": -2.4706192016601562, + "logps/chosen": -172.10475158691406, + "logps/rejected": -222.01278686523438, + "loss": 0.6224, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0891315937042236, + "rewards/margins": 2.7707250118255615, + "rewards/rejected": -3.859856605529785, + "step": 3208 + }, + { + "epoch": 0.37, + "learning_rate": 1.9191150649654687e-07, + "logits/chosen": -2.031899929046631, + "logits/rejected": -1.6567630767822266, + "logps/chosen": -302.551513671875, + "logps/rejected": -420.7549133300781, + "loss": 0.8984, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5224350690841675, + "rewards/margins": 0.6263744831085205, + "rewards/rejected": -2.1488096714019775, + "step": 3209 + }, + { + "epoch": 0.37, + "learning_rate": 1.918763900269226e-07, + "logits/chosen": -2.2914857864379883, + "logits/rejected": -2.2724661827087402, + "logps/chosen": -248.45919799804688, + "logps/rejected": -262.6332702636719, + "loss": 0.6266, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0414788722991943, + "rewards/margins": 0.8385477066040039, + "rewards/rejected": -1.8800264596939087, + "step": 3210 + }, + { + "epoch": 0.37, + "learning_rate": 1.9184127355729835e-07, + "logits/chosen": -2.1376805305480957, + "logits/rejected": -1.8718832731246948, + "logps/chosen": -337.48876953125, + "logps/rejected": -331.794677734375, + "loss": 0.0889, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.529987096786499, + "rewards/margins": 3.449474334716797, + "rewards/rejected": -3.979461193084717, + "step": 3211 + }, + { + "epoch": 0.37, + "learning_rate": 1.9180615708767413e-07, + "logits/chosen": -1.7753843069076538, + "logits/rejected": -1.6328661441802979, + "logps/chosen": -415.8115234375, + "logps/rejected": -440.1239013671875, + "loss": 0.6238, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9498165845870972, + "rewards/margins": 0.7212100028991699, + "rewards/rejected": -1.671026587486267, + "step": 3212 + }, + { + "epoch": 0.37, + "learning_rate": 1.9177104061804986e-07, + "logits/chosen": -2.3730573654174805, + "logits/rejected": -2.4972641468048096, + "logps/chosen": -245.30010986328125, + "logps/rejected": -227.27261352539062, + "loss": 0.4811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9065850973129272, + "rewards/margins": 2.6513454914093018, + "rewards/rejected": -3.5579304695129395, + "step": 3213 + }, + { + "epoch": 0.37, + "learning_rate": 1.9173592414842562e-07, + "logits/chosen": -2.1196951866149902, + "logits/rejected": -1.875179648399353, + "logps/chosen": -361.92913818359375, + "logps/rejected": -350.62945556640625, + "loss": 0.1626, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7680625319480896, + "rewards/margins": 3.0035080909729004, + "rewards/rejected": -3.7715706825256348, + "step": 3214 + }, + { + "epoch": 0.37, + "learning_rate": 1.9170080767880134e-07, + "logits/chosen": -2.5106160640716553, + "logits/rejected": -2.664267063140869, + "logps/chosen": -275.7483825683594, + "logps/rejected": -169.0465850830078, + "loss": 0.388, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0389065742492676, + "rewards/margins": 1.4092808961868286, + "rewards/rejected": -2.4481875896453857, + "step": 3215 + }, + { + "epoch": 0.37, + "learning_rate": 1.916656912091771e-07, + "logits/chosen": -1.9119139909744263, + "logits/rejected": -1.784617304801941, + "logps/chosen": -262.243408203125, + "logps/rejected": -317.5898742675781, + "loss": 0.6916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9523583650588989, + "rewards/margins": 1.7338262796401978, + "rewards/rejected": -2.6861844062805176, + "step": 3216 + }, + { + "epoch": 0.37, + "learning_rate": 1.9163057473955285e-07, + "logits/chosen": -2.5144448280334473, + "logits/rejected": -2.3084256649017334, + "logps/chosen": -508.58526611328125, + "logps/rejected": -535.469970703125, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6735275983810425, + "rewards/margins": 2.9369266033172607, + "rewards/rejected": -3.6104540824890137, + "step": 3217 + }, + { + "epoch": 0.37, + "learning_rate": 1.9159545826992858e-07, + "logits/chosen": -2.360067844390869, + "logits/rejected": -2.3245906829833984, + "logps/chosen": -358.96142578125, + "logps/rejected": -315.2904052734375, + "loss": 0.7725, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0861793756484985, + "rewards/margins": 0.21861226856708527, + "rewards/rejected": -1.3047916889190674, + "step": 3218 + }, + { + "epoch": 0.37, + "learning_rate": 1.9156034180030433e-07, + "logits/chosen": -2.322202205657959, + "logits/rejected": -2.1176328659057617, + "logps/chosen": -256.536865234375, + "logps/rejected": -252.10548400878906, + "loss": 0.4284, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2660086154937744, + "rewards/margins": 1.2773265838623047, + "rewards/rejected": -2.543335199356079, + "step": 3219 + }, + { + "epoch": 0.37, + "learning_rate": 1.9152522533068009e-07, + "logits/chosen": -1.9050346612930298, + "logits/rejected": -2.105329751968384, + "logps/chosen": -239.60736083984375, + "logps/rejected": -179.34523010253906, + "loss": 0.4706, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8088674545288086, + "rewards/margins": 1.3404514789581299, + "rewards/rejected": -2.1493186950683594, + "step": 3220 + }, + { + "epoch": 0.37, + "learning_rate": 1.9149010886105581e-07, + "logits/chosen": -2.9761149883270264, + "logits/rejected": -2.9548048973083496, + "logps/chosen": -152.12693786621094, + "logps/rejected": -190.60714721679688, + "loss": 0.162, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8543515205383301, + "rewards/margins": 3.038710117340088, + "rewards/rejected": -3.893061399459839, + "step": 3221 + }, + { + "epoch": 0.37, + "learning_rate": 1.9145499239143157e-07, + "logits/chosen": -2.3411781787872314, + "logits/rejected": -2.527927875518799, + "logps/chosen": -308.43170166015625, + "logps/rejected": -246.7545166015625, + "loss": 0.5752, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6483064889907837, + "rewards/margins": 0.9431005120277405, + "rewards/rejected": -1.591407060623169, + "step": 3222 + }, + { + "epoch": 0.37, + "learning_rate": 1.914198759218073e-07, + "logits/chosen": -2.5906662940979004, + "logits/rejected": -2.615285873413086, + "logps/chosen": -170.65048217773438, + "logps/rejected": -155.81692504882812, + "loss": 0.6862, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1002418994903564, + "rewards/margins": 1.9506767988204956, + "rewards/rejected": -3.0509185791015625, + "step": 3223 + }, + { + "epoch": 0.37, + "learning_rate": 1.9138475945218307e-07, + "logits/chosen": -1.992499589920044, + "logits/rejected": -1.858843445777893, + "logps/chosen": -265.7242736816406, + "logps/rejected": -263.9051208496094, + "loss": 0.4733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2267136573791504, + "rewards/margins": 1.0423943996429443, + "rewards/rejected": -2.2691080570220947, + "step": 3224 + }, + { + "epoch": 0.37, + "learning_rate": 1.9134964298255883e-07, + "logits/chosen": -2.2307870388031006, + "logits/rejected": -2.3936221599578857, + "logps/chosen": -363.5297546386719, + "logps/rejected": -394.2835388183594, + "loss": 0.2434, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0915018320083618, + "rewards/margins": 2.28348970413208, + "rewards/rejected": -3.3749914169311523, + "step": 3225 + }, + { + "epoch": 0.37, + "learning_rate": 1.9131452651293456e-07, + "logits/chosen": -2.1070075035095215, + "logits/rejected": -1.9209818840026855, + "logps/chosen": -378.609619140625, + "logps/rejected": -236.52090454101562, + "loss": 0.2873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13527168333530426, + "rewards/margins": 1.8019130229949951, + "rewards/rejected": -1.9371845722198486, + "step": 3226 + }, + { + "epoch": 0.37, + "learning_rate": 1.912794100433103e-07, + "logits/chosen": -2.2773237228393555, + "logits/rejected": -2.606339931488037, + "logps/chosen": -190.42245483398438, + "logps/rejected": -142.29476928710938, + "loss": 0.6957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7956674695014954, + "rewards/margins": 1.7520499229431152, + "rewards/rejected": -2.547717332839966, + "step": 3227 + }, + { + "epoch": 0.37, + "learning_rate": 1.9124429357368606e-07, + "logits/chosen": -2.194204568862915, + "logits/rejected": -2.0882720947265625, + "logps/chosen": -197.5236053466797, + "logps/rejected": -254.85089111328125, + "loss": 0.3315, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6069765686988831, + "rewards/margins": 1.8906821012496948, + "rewards/rejected": -2.4976584911346436, + "step": 3228 + }, + { + "epoch": 0.37, + "learning_rate": 1.912091771040618e-07, + "logits/chosen": -2.349149703979492, + "logits/rejected": -2.2923526763916016, + "logps/chosen": -241.9728546142578, + "logps/rejected": -294.5570373535156, + "loss": 0.2533, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5757079124450684, + "rewards/margins": 2.02740478515625, + "rewards/rejected": -2.6031126976013184, + "step": 3229 + }, + { + "epoch": 0.37, + "learning_rate": 1.9117406063443755e-07, + "logits/chosen": -1.7300277948379517, + "logits/rejected": -2.187500238418579, + "logps/chosen": -461.16046142578125, + "logps/rejected": -270.2566223144531, + "loss": 0.3647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6026450395584106, + "rewards/margins": 3.396928310394287, + "rewards/rejected": -3.999573230743408, + "step": 3230 + }, + { + "epoch": 0.37, + "learning_rate": 1.9113894416481327e-07, + "logits/chosen": -2.5673298835754395, + "logits/rejected": -2.614267349243164, + "logps/chosen": -249.7477264404297, + "logps/rejected": -250.56353759765625, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.602375864982605, + "rewards/margins": 2.5932888984680176, + "rewards/rejected": -3.195664882659912, + "step": 3231 + }, + { + "epoch": 0.37, + "learning_rate": 1.9110382769518903e-07, + "logits/chosen": -2.846735715866089, + "logits/rejected": -2.8511176109313965, + "logps/chosen": -172.0358428955078, + "logps/rejected": -296.25732421875, + "loss": 0.2402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5085477232933044, + "rewards/margins": 2.506248712539673, + "rewards/rejected": -3.014796495437622, + "step": 3232 + }, + { + "epoch": 0.37, + "learning_rate": 1.910687112255648e-07, + "logits/chosen": -2.18988037109375, + "logits/rejected": -2.3484067916870117, + "logps/chosen": -405.2607421875, + "logps/rejected": -315.61236572265625, + "loss": 0.2808, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1109650135040283, + "rewards/margins": 1.9808012247085571, + "rewards/rejected": -3.091765880584717, + "step": 3233 + }, + { + "epoch": 0.37, + "learning_rate": 1.910335947559405e-07, + "logits/chosen": -1.7010945081710815, + "logits/rejected": -1.8567029237747192, + "logps/chosen": -342.3530578613281, + "logps/rejected": -337.22882080078125, + "loss": 0.4257, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2299234867095947, + "rewards/margins": 1.0553739070892334, + "rewards/rejected": -2.285297393798828, + "step": 3234 + }, + { + "epoch": 0.37, + "learning_rate": 1.909984782863163e-07, + "logits/chosen": -2.482731819152832, + "logits/rejected": -2.612645149230957, + "logps/chosen": -139.89390563964844, + "logps/rejected": -245.2103271484375, + "loss": 0.1218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17751257121562958, + "rewards/margins": 3.609588146209717, + "rewards/rejected": -3.7871007919311523, + "step": 3235 + }, + { + "epoch": 0.37, + "learning_rate": 1.9096336181669202e-07, + "logits/chosen": -2.5433292388916016, + "logits/rejected": -2.5879926681518555, + "logps/chosen": -115.07571411132812, + "logps/rejected": -139.15145874023438, + "loss": 0.8069, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6142470836639404, + "rewards/margins": 0.5856675505638123, + "rewards/rejected": -2.1999144554138184, + "step": 3236 + }, + { + "epoch": 0.37, + "learning_rate": 1.9092824534706777e-07, + "logits/chosen": -2.41509747505188, + "logits/rejected": -2.302262544631958, + "logps/chosen": -87.05143737792969, + "logps/rejected": -219.56719970703125, + "loss": 0.3026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5961471199989319, + "rewards/margins": 2.0022144317626953, + "rewards/rejected": -2.5983614921569824, + "step": 3237 + }, + { + "epoch": 0.37, + "learning_rate": 1.9089312887744352e-07, + "logits/chosen": -2.329216957092285, + "logits/rejected": -2.3566784858703613, + "logps/chosen": -235.15176391601562, + "logps/rejected": -183.5102996826172, + "loss": 0.6733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.88872230052948, + "rewards/margins": 0.34474480152130127, + "rewards/rejected": -1.2334673404693604, + "step": 3238 + }, + { + "epoch": 0.37, + "learning_rate": 1.9085801240781925e-07, + "logits/chosen": -2.979156970977783, + "logits/rejected": -3.0226473808288574, + "logps/chosen": -255.14694213867188, + "logps/rejected": -222.02687072753906, + "loss": 0.1413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7458598017692566, + "rewards/margins": 2.823683023452759, + "rewards/rejected": -3.56954288482666, + "step": 3239 + }, + { + "epoch": 0.37, + "learning_rate": 1.90822895938195e-07, + "logits/chosen": -2.1655588150024414, + "logits/rejected": -2.246284008026123, + "logps/chosen": -335.1148681640625, + "logps/rejected": -315.49322509765625, + "loss": 0.6703, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.60196852684021, + "rewards/margins": 0.978794515132904, + "rewards/rejected": -2.580763101577759, + "step": 3240 + }, + { + "epoch": 0.37, + "learning_rate": 1.9078777946857076e-07, + "logits/chosen": -2.3327059745788574, + "logits/rejected": -2.3451225757598877, + "logps/chosen": -258.759521484375, + "logps/rejected": -285.69635009765625, + "loss": 0.3199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5492120981216431, + "rewards/margins": 2.091522455215454, + "rewards/rejected": -2.6407346725463867, + "step": 3241 + }, + { + "epoch": 0.37, + "learning_rate": 1.9075266299894649e-07, + "logits/chosen": -2.2374110221862793, + "logits/rejected": -2.287745952606201, + "logps/chosen": -340.118896484375, + "logps/rejected": -352.6009521484375, + "loss": 0.5145, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4645847678184509, + "rewards/margins": 1.0941646099090576, + "rewards/rejected": -1.5587494373321533, + "step": 3242 + }, + { + "epoch": 0.37, + "learning_rate": 1.9071754652932224e-07, + "logits/chosen": -2.032428741455078, + "logits/rejected": -2.11574125289917, + "logps/chosen": -565.244384765625, + "logps/rejected": -465.6898193359375, + "loss": 0.1672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08293430507183075, + "rewards/margins": 3.1616106033325195, + "rewards/rejected": -3.078676462173462, + "step": 3243 + }, + { + "epoch": 0.37, + "learning_rate": 1.9068243005969797e-07, + "logits/chosen": -2.766401767730713, + "logits/rejected": -2.6206226348876953, + "logps/chosen": -427.8995361328125, + "logps/rejected": -246.79110717773438, + "loss": 0.4729, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1460349559783936, + "rewards/margins": 2.1568691730499268, + "rewards/rejected": -3.3029041290283203, + "step": 3244 + }, + { + "epoch": 0.37, + "learning_rate": 1.9064731359007372e-07, + "logits/chosen": -2.000807046890259, + "logits/rejected": -2.244251012802124, + "logps/chosen": -555.3555297851562, + "logps/rejected": -440.73046875, + "loss": 0.4426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.844902515411377, + "rewards/margins": 1.4045891761779785, + "rewards/rejected": -2.2494916915893555, + "step": 3245 + }, + { + "epoch": 0.37, + "learning_rate": 1.906121971204495e-07, + "logits/chosen": -2.182837724685669, + "logits/rejected": -2.2293341159820557, + "logps/chosen": -264.5556335449219, + "logps/rejected": -288.6997375488281, + "loss": 0.3382, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0167686939239502, + "rewards/margins": 2.40460467338562, + "rewards/rejected": -3.4213733673095703, + "step": 3246 + }, + { + "epoch": 0.37, + "learning_rate": 1.9057708065082523e-07, + "logits/chosen": -2.910900115966797, + "logits/rejected": -2.9760189056396484, + "logps/chosen": -77.23954010009766, + "logps/rejected": -177.10760498046875, + "loss": 0.2079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2990801930427551, + "rewards/margins": 2.33066987991333, + "rewards/rejected": -2.6297500133514404, + "step": 3247 + }, + { + "epoch": 0.37, + "learning_rate": 1.9054196418120098e-07, + "logits/chosen": -2.5930392742156982, + "logits/rejected": -2.8209686279296875, + "logps/chosen": -245.7379150390625, + "logps/rejected": -235.55789184570312, + "loss": 0.7648, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2553261518478394, + "rewards/margins": 1.21134352684021, + "rewards/rejected": -2.466669797897339, + "step": 3248 + }, + { + "epoch": 0.37, + "learning_rate": 1.9050684771157674e-07, + "logits/chosen": -2.242314577102661, + "logits/rejected": -2.355499267578125, + "logps/chosen": -286.13714599609375, + "logps/rejected": -265.9484558105469, + "loss": 0.4646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9577001333236694, + "rewards/margins": 3.0575156211853027, + "rewards/rejected": -4.015215873718262, + "step": 3249 + }, + { + "epoch": 0.37, + "learning_rate": 1.9047173124195246e-07, + "logits/chosen": -2.018367052078247, + "logits/rejected": -1.7440327405929565, + "logps/chosen": -114.18154907226562, + "logps/rejected": -250.44287109375, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6810930967330933, + "rewards/margins": 2.645387649536133, + "rewards/rejected": -3.3264808654785156, + "step": 3250 + }, + { + "epoch": 0.37, + "learning_rate": 1.9043661477232822e-07, + "logits/chosen": -2.4369571208953857, + "logits/rejected": -2.246279716491699, + "logps/chosen": -279.4761962890625, + "logps/rejected": -324.05206298828125, + "loss": 0.3944, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4846415519714355, + "rewards/margins": 2.212261199951172, + "rewards/rejected": -3.6969029903411865, + "step": 3251 + }, + { + "epoch": 0.37, + "learning_rate": 1.9040149830270394e-07, + "logits/chosen": -2.627434730529785, + "logits/rejected": -2.62156343460083, + "logps/chosen": -189.60678100585938, + "logps/rejected": -199.3087921142578, + "loss": 0.4421, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8599783182144165, + "rewards/margins": 1.2236329317092896, + "rewards/rejected": -3.083611011505127, + "step": 3252 + }, + { + "epoch": 0.38, + "learning_rate": 1.903663818330797e-07, + "logits/chosen": -2.0385751724243164, + "logits/rejected": -2.195742130279541, + "logps/chosen": -409.072509765625, + "logps/rejected": -302.34637451171875, + "loss": 0.376, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.61045241355896, + "rewards/margins": 2.276735544204712, + "rewards/rejected": -2.887187957763672, + "step": 3253 + }, + { + "epoch": 0.38, + "learning_rate": 1.9033126536345545e-07, + "logits/chosen": -2.171597957611084, + "logits/rejected": -2.0415902137756348, + "logps/chosen": -234.15426635742188, + "logps/rejected": -398.7038269042969, + "loss": 0.3492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8946021795272827, + "rewards/margins": 2.150768995285034, + "rewards/rejected": -3.0453710556030273, + "step": 3254 + }, + { + "epoch": 0.38, + "learning_rate": 1.9029614889383118e-07, + "logits/chosen": -2.4243149757385254, + "logits/rejected": -2.5489609241485596, + "logps/chosen": -285.04901123046875, + "logps/rejected": -278.707275390625, + "loss": 0.2409, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20623943209648132, + "rewards/margins": 2.5384719371795654, + "rewards/rejected": -2.744711399078369, + "step": 3255 + }, + { + "epoch": 0.38, + "learning_rate": 1.9026103242420693e-07, + "logits/chosen": -2.796550989151001, + "logits/rejected": -2.647268056869507, + "logps/chosen": -211.519775390625, + "logps/rejected": -195.79144287109375, + "loss": 0.5382, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2500070333480835, + "rewards/margins": 2.201030731201172, + "rewards/rejected": -3.451037883758545, + "step": 3256 + }, + { + "epoch": 0.38, + "learning_rate": 1.9022591595458271e-07, + "logits/chosen": -2.0487732887268066, + "logits/rejected": -1.8607107400894165, + "logps/chosen": -404.6496276855469, + "logps/rejected": -346.6065673828125, + "loss": 0.516, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0490622520446777, + "rewards/margins": 1.8883740901947021, + "rewards/rejected": -2.93743634223938, + "step": 3257 + }, + { + "epoch": 0.38, + "learning_rate": 1.9019079948495844e-07, + "logits/chosen": -1.9815906286239624, + "logits/rejected": -1.8658604621887207, + "logps/chosen": -304.461669921875, + "logps/rejected": -248.11541748046875, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.912968397140503, + "rewards/margins": 1.0229235887527466, + "rewards/rejected": -2.935892105102539, + "step": 3258 + }, + { + "epoch": 0.38, + "learning_rate": 1.901556830153342e-07, + "logits/chosen": -2.9059317111968994, + "logits/rejected": -2.762006998062134, + "logps/chosen": -138.8431854248047, + "logps/rejected": -166.42095947265625, + "loss": 0.4595, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2281897068023682, + "rewards/margins": 0.9800556302070618, + "rewards/rejected": -2.208245277404785, + "step": 3259 + }, + { + "epoch": 0.38, + "learning_rate": 1.9012056654570992e-07, + "logits/chosen": -2.066317081451416, + "logits/rejected": -2.2494349479675293, + "logps/chosen": -379.1339111328125, + "logps/rejected": -309.93359375, + "loss": 0.6215, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9046348333358765, + "rewards/margins": 0.9656919240951538, + "rewards/rejected": -1.8703265190124512, + "step": 3260 + }, + { + "epoch": 0.38, + "learning_rate": 1.9008545007608568e-07, + "logits/chosen": -2.9197731018066406, + "logits/rejected": -2.879359722137451, + "logps/chosen": -218.48239135742188, + "logps/rejected": -228.4048614501953, + "loss": 0.1993, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4524841010570526, + "rewards/margins": 2.7182817459106445, + "rewards/rejected": -3.1707658767700195, + "step": 3261 + }, + { + "epoch": 0.38, + "learning_rate": 1.9005033360646143e-07, + "logits/chosen": -2.5232622623443604, + "logits/rejected": -2.6467127799987793, + "logps/chosen": -299.5596008300781, + "logps/rejected": -291.4978942871094, + "loss": 0.6247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9165400266647339, + "rewards/margins": 0.9961432218551636, + "rewards/rejected": -1.9126832485198975, + "step": 3262 + }, + { + "epoch": 0.38, + "learning_rate": 1.9001521713683716e-07, + "logits/chosen": -2.1451892852783203, + "logits/rejected": -2.5514397621154785, + "logps/chosen": -309.1081237792969, + "logps/rejected": -190.04135131835938, + "loss": 0.3846, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6965748071670532, + "rewards/margins": 2.4833319187164307, + "rewards/rejected": -3.1799068450927734, + "step": 3263 + }, + { + "epoch": 0.38, + "learning_rate": 1.899801006672129e-07, + "logits/chosen": -2.013824701309204, + "logits/rejected": -2.199186325073242, + "logps/chosen": -272.1514587402344, + "logps/rejected": -233.813232421875, + "loss": 0.7233, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5101286172866821, + "rewards/margins": 0.4892955422401428, + "rewards/rejected": -1.9994242191314697, + "step": 3264 + }, + { + "epoch": 0.38, + "learning_rate": 1.8994498419758867e-07, + "logits/chosen": -2.5571656227111816, + "logits/rejected": -2.1645755767822266, + "logps/chosen": -586.4765625, + "logps/rejected": -620.69677734375, + "loss": 0.2737, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.429807424545288, + "rewards/margins": 2.4142284393310547, + "rewards/rejected": -3.844036102294922, + "step": 3265 + }, + { + "epoch": 0.38, + "learning_rate": 1.899098677279644e-07, + "logits/chosen": -2.499805450439453, + "logits/rejected": -2.6329874992370605, + "logps/chosen": -375.4840087890625, + "logps/rejected": -281.9699401855469, + "loss": 1.4287, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.8551485538482666, + "rewards/margins": 0.21658855676651, + "rewards/rejected": -3.071737289428711, + "step": 3266 + }, + { + "epoch": 0.38, + "learning_rate": 1.8987475125834017e-07, + "logits/chosen": -2.79392409324646, + "logits/rejected": -2.7466318607330322, + "logps/chosen": -228.12684631347656, + "logps/rejected": -223.784423828125, + "loss": 0.3189, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9385015964508057, + "rewards/margins": 2.062432050704956, + "rewards/rejected": -3.0009336471557617, + "step": 3267 + }, + { + "epoch": 0.38, + "learning_rate": 1.8983963478871587e-07, + "logits/chosen": -2.202502489089966, + "logits/rejected": -2.167891502380371, + "logps/chosen": -247.9764862060547, + "logps/rejected": -191.8134765625, + "loss": 0.5691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.857790470123291, + "rewards/margins": 1.4386940002441406, + "rewards/rejected": -2.2964844703674316, + "step": 3268 + }, + { + "epoch": 0.38, + "learning_rate": 1.8980451831909165e-07, + "logits/chosen": -2.0180411338806152, + "logits/rejected": -2.2204699516296387, + "logps/chosen": -342.6130676269531, + "logps/rejected": -315.2860412597656, + "loss": 0.1502, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35546213388442993, + "rewards/margins": 1.99339759349823, + "rewards/rejected": -1.6379355192184448, + "step": 3269 + }, + { + "epoch": 0.38, + "learning_rate": 1.897694018494674e-07, + "logits/chosen": -1.9868800640106201, + "logits/rejected": -1.7067686319351196, + "logps/chosen": -126.47512817382812, + "logps/rejected": -194.5304412841797, + "loss": 0.3272, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.065507411956787, + "rewards/margins": 1.560427188873291, + "rewards/rejected": -2.625934600830078, + "step": 3270 + }, + { + "epoch": 0.38, + "learning_rate": 1.8973428537984314e-07, + "logits/chosen": -2.42052960395813, + "logits/rejected": -2.519713878631592, + "logps/chosen": -200.5906982421875, + "logps/rejected": -253.52297973632812, + "loss": 0.5449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8511555194854736, + "rewards/margins": 1.112528920173645, + "rewards/rejected": -1.9636844396591187, + "step": 3271 + }, + { + "epoch": 0.38, + "learning_rate": 1.896991689102189e-07, + "logits/chosen": -2.5718493461608887, + "logits/rejected": -2.8372726440429688, + "logps/chosen": -196.5013885498047, + "logps/rejected": -242.30621337890625, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4719979166984558, + "rewards/margins": 2.8414270877838135, + "rewards/rejected": -3.313424825668335, + "step": 3272 + }, + { + "epoch": 0.38, + "learning_rate": 1.8966405244059464e-07, + "logits/chosen": -1.8880492448806763, + "logits/rejected": -2.1127424240112305, + "logps/chosen": -431.2809143066406, + "logps/rejected": -278.9860534667969, + "loss": 0.8728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7656667828559875, + "rewards/margins": 0.3691399097442627, + "rewards/rejected": -1.134806752204895, + "step": 3273 + }, + { + "epoch": 0.38, + "learning_rate": 1.8962893597097037e-07, + "logits/chosen": -2.2403628826141357, + "logits/rejected": -2.0572707653045654, + "logps/chosen": -212.3458251953125, + "logps/rejected": -313.9735107421875, + "loss": 0.7811, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6038944721221924, + "rewards/margins": 0.6007136106491089, + "rewards/rejected": -2.20460844039917, + "step": 3274 + }, + { + "epoch": 0.38, + "learning_rate": 1.8959381950134612e-07, + "logits/chosen": -2.394735097885132, + "logits/rejected": -2.3937597274780273, + "logps/chosen": -336.0953674316406, + "logps/rejected": -235.43731689453125, + "loss": 0.1087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05868007242679596, + "rewards/margins": 3.4519331455230713, + "rewards/rejected": -3.510613203048706, + "step": 3275 + }, + { + "epoch": 0.38, + "learning_rate": 1.8955870303172185e-07, + "logits/chosen": -2.334635019302368, + "logits/rejected": -2.286454677581787, + "logps/chosen": -308.2416687011719, + "logps/rejected": -256.2568054199219, + "loss": 0.1676, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1740318536758423, + "rewards/margins": 3.895336866378784, + "rewards/rejected": -5.069368839263916, + "step": 3276 + }, + { + "epoch": 0.38, + "learning_rate": 1.895235865620976e-07, + "logits/chosen": -2.3084495067596436, + "logits/rejected": -2.2393548488616943, + "logps/chosen": -247.2765350341797, + "logps/rejected": -296.1494140625, + "loss": 0.2506, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5552542805671692, + "rewards/margins": 2.2580173015594482, + "rewards/rejected": -2.8132715225219727, + "step": 3277 + }, + { + "epoch": 0.38, + "learning_rate": 1.8948847009247339e-07, + "logits/chosen": -2.1740665435791016, + "logits/rejected": -2.1926984786987305, + "logps/chosen": -392.8775329589844, + "logps/rejected": -335.9189147949219, + "loss": 0.2847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9376282095909119, + "rewards/margins": 2.2027041912078857, + "rewards/rejected": -3.1403324604034424, + "step": 3278 + }, + { + "epoch": 0.38, + "learning_rate": 1.894533536228491e-07, + "logits/chosen": -2.1219778060913086, + "logits/rejected": -1.762529730796814, + "logps/chosen": -333.1546325683594, + "logps/rejected": -306.08673095703125, + "loss": 0.3911, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.293074369430542, + "rewards/margins": 2.324694871902466, + "rewards/rejected": -3.6177690029144287, + "step": 3279 + }, + { + "epoch": 0.38, + "learning_rate": 1.8941823715322487e-07, + "logits/chosen": -2.498765707015991, + "logits/rejected": -2.495039701461792, + "logps/chosen": -232.97781372070312, + "logps/rejected": -368.4605712890625, + "loss": 0.5472, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.390188455581665, + "rewards/margins": 2.6230008602142334, + "rewards/rejected": -4.013189315795898, + "step": 3280 + }, + { + "epoch": 0.38, + "learning_rate": 1.893831206836006e-07, + "logits/chosen": -2.1934444904327393, + "logits/rejected": -2.2617781162261963, + "logps/chosen": -560.2706298828125, + "logps/rejected": -400.8258056640625, + "loss": 0.3902, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0010812282562256, + "rewards/margins": 1.2497525215148926, + "rewards/rejected": -2.2508339881896973, + "step": 3281 + }, + { + "epoch": 0.38, + "learning_rate": 1.8934800421397635e-07, + "logits/chosen": -2.1603705883026123, + "logits/rejected": -2.1417670249938965, + "logps/chosen": -174.89715576171875, + "logps/rejected": -159.04476928710938, + "loss": 0.2852, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.217457890510559, + "rewards/margins": 1.3908886909484863, + "rewards/rejected": -2.608346700668335, + "step": 3282 + }, + { + "epoch": 0.38, + "learning_rate": 1.893128877443521e-07, + "logits/chosen": -2.839916229248047, + "logits/rejected": -2.9906866550445557, + "logps/chosen": -337.2292175292969, + "logps/rejected": -287.3031921386719, + "loss": 0.5264, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3260172605514526, + "rewards/margins": 1.3733580112457275, + "rewards/rejected": -2.6993753910064697, + "step": 3283 + }, + { + "epoch": 0.38, + "learning_rate": 1.8927777127472783e-07, + "logits/chosen": -2.4616074562072754, + "logits/rejected": -2.325831890106201, + "logps/chosen": -280.2967529296875, + "logps/rejected": -274.264892578125, + "loss": 0.517, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6667501330375671, + "rewards/margins": 1.240465521812439, + "rewards/rejected": -1.9072158336639404, + "step": 3284 + }, + { + "epoch": 0.38, + "learning_rate": 1.8924265480510358e-07, + "logits/chosen": -2.5050852298736572, + "logits/rejected": -2.3771817684173584, + "logps/chosen": -153.4530029296875, + "logps/rejected": -297.0697021484375, + "loss": 0.2234, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0344029664993286, + "rewards/margins": 3.4643092155456543, + "rewards/rejected": -4.498712539672852, + "step": 3285 + }, + { + "epoch": 0.38, + "learning_rate": 1.8920753833547934e-07, + "logits/chosen": -1.988562822341919, + "logits/rejected": -2.1120076179504395, + "logps/chosen": -489.45306396484375, + "logps/rejected": -360.2896728515625, + "loss": 0.5605, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2181516885757446, + "rewards/margins": 1.2479689121246338, + "rewards/rejected": -2.466120719909668, + "step": 3286 + }, + { + "epoch": 0.38, + "learning_rate": 1.8917242186585506e-07, + "logits/chosen": -2.802196502685547, + "logits/rejected": -2.680649757385254, + "logps/chosen": -416.2929992675781, + "logps/rejected": -293.18145751953125, + "loss": 0.165, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7627511024475098, + "rewards/margins": 3.2112112045288086, + "rewards/rejected": -4.973962783813477, + "step": 3287 + }, + { + "epoch": 0.38, + "learning_rate": 1.8913730539623082e-07, + "logits/chosen": -2.457843542098999, + "logits/rejected": -2.3046882152557373, + "logps/chosen": -276.199951171875, + "logps/rejected": -188.7687225341797, + "loss": 0.4107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45737650990486145, + "rewards/margins": 1.1780894994735718, + "rewards/rejected": -1.6354660987854004, + "step": 3288 + }, + { + "epoch": 0.38, + "learning_rate": 1.8910218892660655e-07, + "logits/chosen": -2.686631202697754, + "logits/rejected": -2.7651162147521973, + "logps/chosen": -143.43817138671875, + "logps/rejected": -227.3778533935547, + "loss": 0.4382, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.713526725769043, + "rewards/margins": 2.297795295715332, + "rewards/rejected": -3.011321783065796, + "step": 3289 + }, + { + "epoch": 0.38, + "learning_rate": 1.890670724569823e-07, + "logits/chosen": -2.2581381797790527, + "logits/rejected": -2.7220706939697266, + "logps/chosen": -237.01841735839844, + "logps/rejected": -249.83648681640625, + "loss": 0.3854, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.115190029144287, + "rewards/margins": 1.9746378660202026, + "rewards/rejected": -3.0898277759552, + "step": 3290 + }, + { + "epoch": 0.38, + "learning_rate": 1.8903195598735808e-07, + "logits/chosen": -2.5228893756866455, + "logits/rejected": -2.7385380268096924, + "logps/chosen": -201.47885131835938, + "logps/rejected": -183.1134490966797, + "loss": 0.4879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5332611203193665, + "rewards/margins": 1.488957166671753, + "rewards/rejected": -2.0222184658050537, + "step": 3291 + }, + { + "epoch": 0.38, + "learning_rate": 1.889968395177338e-07, + "logits/chosen": -2.079832077026367, + "logits/rejected": -2.30779767036438, + "logps/chosen": -265.66314697265625, + "logps/rejected": -250.14683532714844, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6451374888420105, + "rewards/margins": 2.112771511077881, + "rewards/rejected": -2.757909059524536, + "step": 3292 + }, + { + "epoch": 0.38, + "learning_rate": 1.8896172304810956e-07, + "logits/chosen": -2.1696033477783203, + "logits/rejected": -2.422642230987549, + "logps/chosen": -397.308349609375, + "logps/rejected": -280.12152099609375, + "loss": 0.5308, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.091752529144287, + "rewards/margins": 2.192356586456299, + "rewards/rejected": -3.284109115600586, + "step": 3293 + }, + { + "epoch": 0.38, + "learning_rate": 1.8892660657848532e-07, + "logits/chosen": -2.3474173545837402, + "logits/rejected": -2.300229787826538, + "logps/chosen": -163.26397705078125, + "logps/rejected": -252.78860473632812, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5911365151405334, + "rewards/margins": 1.089516520500183, + "rewards/rejected": -1.6806529760360718, + "step": 3294 + }, + { + "epoch": 0.38, + "learning_rate": 1.8889149010886104e-07, + "logits/chosen": -2.234299421310425, + "logits/rejected": -2.3857994079589844, + "logps/chosen": -404.1620788574219, + "logps/rejected": -254.6629180908203, + "loss": 0.271, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46945515275001526, + "rewards/margins": 2.307889938354492, + "rewards/rejected": -2.7773451805114746, + "step": 3295 + }, + { + "epoch": 0.38, + "learning_rate": 1.888563736392368e-07, + "logits/chosen": -2.666318655014038, + "logits/rejected": -2.679415702819824, + "logps/chosen": -395.9197998046875, + "logps/rejected": -377.96026611328125, + "loss": 0.1902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45595288276672363, + "rewards/margins": 4.540170192718506, + "rewards/rejected": -4.996123313903809, + "step": 3296 + }, + { + "epoch": 0.38, + "learning_rate": 1.8882125716961252e-07, + "logits/chosen": -2.411773681640625, + "logits/rejected": -2.3956849575042725, + "logps/chosen": -217.9118194580078, + "logps/rejected": -415.59722900390625, + "loss": 0.4535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9639575481414795, + "rewards/margins": 2.990549087524414, + "rewards/rejected": -3.9545063972473145, + "step": 3297 + }, + { + "epoch": 0.38, + "learning_rate": 1.8878614069998828e-07, + "logits/chosen": -2.357614040374756, + "logits/rejected": -2.033998489379883, + "logps/chosen": -326.07720947265625, + "logps/rejected": -343.1966247558594, + "loss": 0.4391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3340268135070801, + "rewards/margins": 1.498906135559082, + "rewards/rejected": -1.832932949066162, + "step": 3298 + }, + { + "epoch": 0.38, + "learning_rate": 1.8875102423036403e-07, + "logits/chosen": -1.973628282546997, + "logits/rejected": -2.318349838256836, + "logps/chosen": -361.08367919921875, + "logps/rejected": -275.19195556640625, + "loss": 0.3064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5454041957855225, + "rewards/margins": 2.1851046085357666, + "rewards/rejected": -2.730509042739868, + "step": 3299 + }, + { + "epoch": 0.38, + "learning_rate": 1.8871590776073976e-07, + "logits/chosen": -2.0648961067199707, + "logits/rejected": -2.0417819023132324, + "logps/chosen": -178.17298889160156, + "logps/rejected": -350.5611267089844, + "loss": 0.2546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1793678104877472, + "rewards/margins": 3.6473805904388428, + "rewards/rejected": -3.8267483711242676, + "step": 3300 + }, + { + "epoch": 0.38, + "learning_rate": 1.8868079129111554e-07, + "logits/chosen": -2.0365729331970215, + "logits/rejected": -2.2665350437164307, + "logps/chosen": -330.055908203125, + "logps/rejected": -337.71282958984375, + "loss": 0.197, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9494022130966187, + "rewards/margins": 3.377856731414795, + "rewards/rejected": -4.327258586883545, + "step": 3301 + }, + { + "epoch": 0.38, + "learning_rate": 1.886456748214913e-07, + "logits/chosen": -2.4094510078430176, + "logits/rejected": -2.6001319885253906, + "logps/chosen": -141.65960693359375, + "logps/rejected": -155.77268981933594, + "loss": 0.3789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6999171376228333, + "rewards/margins": 2.349151134490967, + "rewards/rejected": -3.0490684509277344, + "step": 3302 + }, + { + "epoch": 0.38, + "learning_rate": 1.8861055835186702e-07, + "logits/chosen": -2.3964617252349854, + "logits/rejected": -2.85002064704895, + "logps/chosen": -201.3780059814453, + "logps/rejected": -140.4272918701172, + "loss": 0.4548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6436787247657776, + "rewards/margins": 1.4359358549118042, + "rewards/rejected": -2.0796146392822266, + "step": 3303 + }, + { + "epoch": 0.38, + "learning_rate": 1.8857544188224277e-07, + "logits/chosen": -2.2283382415771484, + "logits/rejected": -2.4479928016662598, + "logps/chosen": -289.9859313964844, + "logps/rejected": -197.5041961669922, + "loss": 0.4857, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6631344556808472, + "rewards/margins": 1.83750581741333, + "rewards/rejected": -2.500640392303467, + "step": 3304 + }, + { + "epoch": 0.38, + "learning_rate": 1.885403254126185e-07, + "logits/chosen": -2.5654914379119873, + "logits/rejected": -2.61314058303833, + "logps/chosen": -211.6577911376953, + "logps/rejected": -211.29129028320312, + "loss": 0.3151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0450878143310547, + "rewards/margins": 1.8855726718902588, + "rewards/rejected": -2.9306604862213135, + "step": 3305 + }, + { + "epoch": 0.38, + "learning_rate": 1.8850520894299426e-07, + "logits/chosen": -2.5247535705566406, + "logits/rejected": -2.740659475326538, + "logps/chosen": -174.81484985351562, + "logps/rejected": -196.1655731201172, + "loss": 0.4418, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2187153100967407, + "rewards/margins": 2.199472665786743, + "rewards/rejected": -3.4181880950927734, + "step": 3306 + }, + { + "epoch": 0.38, + "learning_rate": 1.8847009247337e-07, + "logits/chosen": -2.588507652282715, + "logits/rejected": -2.7264492511749268, + "logps/chosen": -100.00163269042969, + "logps/rejected": -174.25796508789062, + "loss": 0.3765, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3343883156776428, + "rewards/margins": 2.927050828933716, + "rewards/rejected": -3.2614388465881348, + "step": 3307 + }, + { + "epoch": 0.38, + "learning_rate": 1.8843497600374574e-07, + "logits/chosen": -2.182044506072998, + "logits/rejected": -2.3169314861297607, + "logps/chosen": -257.8022155761719, + "logps/rejected": -205.86831665039062, + "loss": 0.3317, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33001047372817993, + "rewards/margins": 2.4944357872009277, + "rewards/rejected": -2.824446201324463, + "step": 3308 + }, + { + "epoch": 0.38, + "learning_rate": 1.883998595341215e-07, + "logits/chosen": -2.082434892654419, + "logits/rejected": -2.0126655101776123, + "logps/chosen": -243.00723266601562, + "logps/rejected": -347.19586181640625, + "loss": 0.2319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6383155584335327, + "rewards/margins": 3.5076913833618164, + "rewards/rejected": -4.146007061004639, + "step": 3309 + }, + { + "epoch": 0.38, + "learning_rate": 1.8836474306449724e-07, + "logits/chosen": -2.511061668395996, + "logits/rejected": -2.311044931411743, + "logps/chosen": -162.3359375, + "logps/rejected": -305.0671081542969, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2820076942443848, + "rewards/margins": 2.8445675373077393, + "rewards/rejected": -4.126575469970703, + "step": 3310 + }, + { + "epoch": 0.38, + "learning_rate": 1.8832962659487297e-07, + "logits/chosen": -2.1133780479431152, + "logits/rejected": -2.4190995693206787, + "logps/chosen": -305.86767578125, + "logps/rejected": -210.8787384033203, + "loss": 1.6133, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.001767158508301, + "rewards/margins": -0.3772916793823242, + "rewards/rejected": -1.624475359916687, + "step": 3311 + }, + { + "epoch": 0.38, + "learning_rate": 1.8829451012524875e-07, + "logits/chosen": -2.1045403480529785, + "logits/rejected": -2.2589731216430664, + "logps/chosen": -152.66075134277344, + "logps/rejected": -109.50471496582031, + "loss": 0.3157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5659847855567932, + "rewards/margins": 1.2406373023986816, + "rewards/rejected": -1.8066221475601196, + "step": 3312 + }, + { + "epoch": 0.38, + "learning_rate": 1.8825939365562445e-07, + "logits/chosen": -2.6818833351135254, + "logits/rejected": -2.5318973064422607, + "logps/chosen": -188.3158721923828, + "logps/rejected": -233.11129760742188, + "loss": 0.3904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3955239057540894, + "rewards/margins": 1.788008213043213, + "rewards/rejected": -3.1835317611694336, + "step": 3313 + }, + { + "epoch": 0.38, + "learning_rate": 1.8822427718600023e-07, + "logits/chosen": -2.3301613330841064, + "logits/rejected": -2.28682017326355, + "logps/chosen": -286.9452209472656, + "logps/rejected": -372.7063903808594, + "loss": 0.4603, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2420979738235474, + "rewards/margins": 1.8363816738128662, + "rewards/rejected": -3.078479766845703, + "step": 3314 + }, + { + "epoch": 0.38, + "learning_rate": 1.88189160716376e-07, + "logits/chosen": -2.4675116539001465, + "logits/rejected": -2.2481818199157715, + "logps/chosen": -315.9098205566406, + "logps/rejected": -464.0579528808594, + "loss": 0.5022, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3554122745990753, + "rewards/margins": 3.411602020263672, + "rewards/rejected": -3.767014503479004, + "step": 3315 + }, + { + "epoch": 0.38, + "learning_rate": 1.8815404424675171e-07, + "logits/chosen": -2.24786639213562, + "logits/rejected": -2.020974636077881, + "logps/chosen": -227.18092346191406, + "logps/rejected": -282.5074157714844, + "loss": 0.4918, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2513149976730347, + "rewards/margins": 0.7030634880065918, + "rewards/rejected": -1.9543784856796265, + "step": 3316 + }, + { + "epoch": 0.38, + "learning_rate": 1.8811892777712747e-07, + "logits/chosen": -2.5088000297546387, + "logits/rejected": -2.5912115573883057, + "logps/chosen": -287.9124450683594, + "logps/rejected": -250.03733825683594, + "loss": 0.6716, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2171072959899902, + "rewards/margins": 1.3227977752685547, + "rewards/rejected": -2.539905071258545, + "step": 3317 + }, + { + "epoch": 0.38, + "learning_rate": 1.8808381130750322e-07, + "logits/chosen": -2.2205934524536133, + "logits/rejected": -2.42151141166687, + "logps/chosen": -235.42535400390625, + "logps/rejected": -268.89752197265625, + "loss": 0.3408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8302520513534546, + "rewards/margins": 2.0264182090759277, + "rewards/rejected": -2.856670379638672, + "step": 3318 + }, + { + "epoch": 0.38, + "learning_rate": 1.8804869483787895e-07, + "logits/chosen": -2.4476089477539062, + "logits/rejected": -2.451732873916626, + "logps/chosen": -274.46063232421875, + "logps/rejected": -275.1334228515625, + "loss": 0.2536, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8233855962753296, + "rewards/margins": 2.116312265396118, + "rewards/rejected": -2.9396979808807373, + "step": 3319 + }, + { + "epoch": 0.38, + "learning_rate": 1.880135783682547e-07, + "logits/chosen": -2.1434290409088135, + "logits/rejected": -2.0780348777770996, + "logps/chosen": -210.03646850585938, + "logps/rejected": -236.0110321044922, + "loss": 0.3392, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14326444268226624, + "rewards/margins": 2.005793571472168, + "rewards/rejected": -2.1490578651428223, + "step": 3320 + }, + { + "epoch": 0.38, + "learning_rate": 1.8797846189863043e-07, + "logits/chosen": -1.7180458307266235, + "logits/rejected": -1.516061544418335, + "logps/chosen": -174.96929931640625, + "logps/rejected": -306.0562438964844, + "loss": 0.1564, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1270681619644165, + "rewards/margins": 3.298774480819702, + "rewards/rejected": -4.425842761993408, + "step": 3321 + }, + { + "epoch": 0.38, + "learning_rate": 1.8794334542900619e-07, + "logits/chosen": -2.4154462814331055, + "logits/rejected": -2.36073899269104, + "logps/chosen": -322.6865539550781, + "logps/rejected": -258.2545471191406, + "loss": 0.4831, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9380956292152405, + "rewards/margins": 1.3385062217712402, + "rewards/rejected": -2.276602029800415, + "step": 3322 + }, + { + "epoch": 0.38, + "learning_rate": 1.8790822895938197e-07, + "logits/chosen": -2.6839182376861572, + "logits/rejected": -2.304874897003174, + "logps/chosen": -151.576904296875, + "logps/rejected": -273.1495361328125, + "loss": 0.1017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8756628632545471, + "rewards/margins": 3.8313372135162354, + "rewards/rejected": -4.707000255584717, + "step": 3323 + }, + { + "epoch": 0.38, + "learning_rate": 1.8787311248975767e-07, + "logits/chosen": -2.262017011642456, + "logits/rejected": -2.4689431190490723, + "logps/chosen": -336.2411804199219, + "logps/rejected": -278.46246337890625, + "loss": 0.1862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7186832427978516, + "rewards/margins": 2.2217719554901123, + "rewards/rejected": -2.940455198287964, + "step": 3324 + }, + { + "epoch": 0.38, + "learning_rate": 1.8783799602013345e-07, + "logits/chosen": -2.779120922088623, + "logits/rejected": -2.9129834175109863, + "logps/chosen": -233.1520538330078, + "logps/rejected": -291.80096435546875, + "loss": 0.323, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04367147386074066, + "rewards/margins": 2.0642528533935547, + "rewards/rejected": -2.0205812454223633, + "step": 3325 + }, + { + "epoch": 0.38, + "learning_rate": 1.878028795505092e-07, + "logits/chosen": -2.5033178329467773, + "logits/rejected": -2.5343050956726074, + "logps/chosen": -312.64678955078125, + "logps/rejected": -175.02749633789062, + "loss": 0.2857, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5324976444244385, + "rewards/margins": 1.9687678813934326, + "rewards/rejected": -2.501265525817871, + "step": 3326 + }, + { + "epoch": 0.38, + "learning_rate": 1.8776776308088493e-07, + "logits/chosen": -2.251124382019043, + "logits/rejected": -2.091768264770508, + "logps/chosen": -224.0312957763672, + "logps/rejected": -283.90911865234375, + "loss": 0.4134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7593843936920166, + "rewards/margins": 1.1420453786849976, + "rewards/rejected": -1.9014298915863037, + "step": 3327 + }, + { + "epoch": 0.38, + "learning_rate": 1.8773264661126068e-07, + "logits/chosen": -2.1006858348846436, + "logits/rejected": -2.1024138927459717, + "logps/chosen": -203.8394317626953, + "logps/rejected": -232.57618713378906, + "loss": 0.4866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7036283612251282, + "rewards/margins": 1.0739068984985352, + "rewards/rejected": -1.777535319328308, + "step": 3328 + }, + { + "epoch": 0.38, + "learning_rate": 1.876975301416364e-07, + "logits/chosen": -2.677239418029785, + "logits/rejected": -2.7307422161102295, + "logps/chosen": -453.8072814941406, + "logps/rejected": -318.1021423339844, + "loss": 0.2544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3078330159187317, + "rewards/margins": 2.879520893096924, + "rewards/rejected": -3.1873538494110107, + "step": 3329 + }, + { + "epoch": 0.38, + "learning_rate": 1.8766241367201216e-07, + "logits/chosen": -2.193683624267578, + "logits/rejected": -2.1635754108428955, + "logps/chosen": -156.03182983398438, + "logps/rejected": -221.346923828125, + "loss": 0.5108, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0885980129241943, + "rewards/margins": 1.1994845867156982, + "rewards/rejected": -2.2880825996398926, + "step": 3330 + }, + { + "epoch": 0.38, + "learning_rate": 1.8762729720238792e-07, + "logits/chosen": -2.725551128387451, + "logits/rejected": -2.5899853706359863, + "logps/chosen": -236.7302703857422, + "logps/rejected": -268.15374755859375, + "loss": 0.6434, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.86612069606781, + "rewards/margins": 0.6412808895111084, + "rewards/rejected": -2.507401704788208, + "step": 3331 + }, + { + "epoch": 0.38, + "learning_rate": 1.8759218073276364e-07, + "logits/chosen": -2.8423006534576416, + "logits/rejected": -2.7452526092529297, + "logps/chosen": -107.78673553466797, + "logps/rejected": -136.87625122070312, + "loss": 0.4791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8605637550354004, + "rewards/margins": 1.5592998266220093, + "rewards/rejected": -2.41986346244812, + "step": 3332 + }, + { + "epoch": 0.38, + "learning_rate": 1.875570642631394e-07, + "logits/chosen": -2.4082634449005127, + "logits/rejected": -2.52132248878479, + "logps/chosen": -249.5768280029297, + "logps/rejected": -236.3294677734375, + "loss": 0.5374, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7409813404083252, + "rewards/margins": 0.9770338535308838, + "rewards/rejected": -2.718015193939209, + "step": 3333 + }, + { + "epoch": 0.38, + "learning_rate": 1.8752194779351513e-07, + "logits/chosen": -2.8345818519592285, + "logits/rejected": -2.388775587081909, + "logps/chosen": -169.98536682128906, + "logps/rejected": -260.43145751953125, + "loss": 0.4587, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2753159999847412, + "rewards/margins": 1.6120599508285522, + "rewards/rejected": -2.887375831604004, + "step": 3334 + }, + { + "epoch": 0.38, + "learning_rate": 1.874868313238909e-07, + "logits/chosen": -2.678595542907715, + "logits/rejected": -2.623809576034546, + "logps/chosen": -260.1778259277344, + "logps/rejected": -239.94627380371094, + "loss": 0.4246, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3338649272918701, + "rewards/margins": 2.153834819793701, + "rewards/rejected": -3.4876997470855713, + "step": 3335 + }, + { + "epoch": 0.38, + "learning_rate": 1.8745171485426666e-07, + "logits/chosen": -2.618518114089966, + "logits/rejected": -2.7707629203796387, + "logps/chosen": -291.8432922363281, + "logps/rejected": -339.1007080078125, + "loss": 0.648, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.058429479598999, + "rewards/margins": 1.7924273014068604, + "rewards/rejected": -3.8508567810058594, + "step": 3336 + }, + { + "epoch": 0.38, + "learning_rate": 1.874165983846424e-07, + "logits/chosen": -2.1320977210998535, + "logits/rejected": -2.172999858856201, + "logps/chosen": -294.57568359375, + "logps/rejected": -350.3451843261719, + "loss": 0.2531, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26965826749801636, + "rewards/margins": 2.070388078689575, + "rewards/rejected": -2.3400464057922363, + "step": 3337 + }, + { + "epoch": 0.38, + "learning_rate": 1.8738148191501814e-07, + "logits/chosen": -2.2595083713531494, + "logits/rejected": -2.3467769622802734, + "logps/chosen": -409.12713623046875, + "logps/rejected": -280.36883544921875, + "loss": 0.4179, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0113636255264282, + "rewards/margins": 1.94270920753479, + "rewards/rejected": -2.954072952270508, + "step": 3338 + }, + { + "epoch": 0.38, + "learning_rate": 1.873463654453939e-07, + "logits/chosen": -1.2229032516479492, + "logits/rejected": -1.5616182088851929, + "logps/chosen": -546.2887573242188, + "logps/rejected": -383.779052734375, + "loss": 0.2559, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8217969536781311, + "rewards/margins": 1.881101369857788, + "rewards/rejected": -2.7028980255126953, + "step": 3339 + }, + { + "epoch": 0.39, + "learning_rate": 1.8731124897576962e-07, + "logits/chosen": -2.8426361083984375, + "logits/rejected": -2.631932497024536, + "logps/chosen": -378.2254638671875, + "logps/rejected": -346.84271240234375, + "loss": 0.38, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8837997913360596, + "rewards/margins": 2.3412580490112305, + "rewards/rejected": -3.225057601928711, + "step": 3340 + }, + { + "epoch": 0.39, + "learning_rate": 1.8727613250614538e-07, + "logits/chosen": -2.701244592666626, + "logits/rejected": -2.8475430011749268, + "logps/chosen": -379.7395324707031, + "logps/rejected": -236.31126403808594, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024467691779136658, + "rewards/margins": 2.8822882175445557, + "rewards/rejected": -2.9067559242248535, + "step": 3341 + }, + { + "epoch": 0.39, + "learning_rate": 1.872410160365211e-07, + "logits/chosen": -2.4372143745422363, + "logits/rejected": -2.438910961151123, + "logps/chosen": -297.3605651855469, + "logps/rejected": -308.37725830078125, + "loss": 0.2182, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0018562078475952, + "rewards/margins": 2.242741584777832, + "rewards/rejected": -3.2445976734161377, + "step": 3342 + }, + { + "epoch": 0.39, + "learning_rate": 1.8720589956689686e-07, + "logits/chosen": -2.029323101043701, + "logits/rejected": -2.001913070678711, + "logps/chosen": -262.429443359375, + "logps/rejected": -287.97332763671875, + "loss": 0.4314, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6886724233627319, + "rewards/margins": 2.1810061931610107, + "rewards/rejected": -2.8696789741516113, + "step": 3343 + }, + { + "epoch": 0.39, + "learning_rate": 1.871707830972726e-07, + "logits/chosen": -2.046541213989258, + "logits/rejected": -1.9184253215789795, + "logps/chosen": -211.5499267578125, + "logps/rejected": -238.7809600830078, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8151876926422119, + "rewards/margins": 1.0046740770339966, + "rewards/rejected": -1.8198617696762085, + "step": 3344 + }, + { + "epoch": 0.39, + "learning_rate": 1.8713566662764834e-07, + "logits/chosen": -1.983125925064087, + "logits/rejected": -2.2178924083709717, + "logps/chosen": -275.78582763671875, + "logps/rejected": -249.26528930664062, + "loss": 0.3168, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36559808254241943, + "rewards/margins": 2.2521731853485107, + "rewards/rejected": -2.617771625518799, + "step": 3345 + }, + { + "epoch": 0.39, + "learning_rate": 1.8710055015802412e-07, + "logits/chosen": -2.1128437519073486, + "logits/rejected": -2.202432155609131, + "logps/chosen": -475.61944580078125, + "logps/rejected": -299.87457275390625, + "loss": 0.2499, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1764345169067383, + "rewards/margins": 2.446997880935669, + "rewards/rejected": -3.6234323978424072, + "step": 3346 + }, + { + "epoch": 0.39, + "learning_rate": 1.8706543368839987e-07, + "logits/chosen": -2.3190174102783203, + "logits/rejected": -2.282945156097412, + "logps/chosen": -387.027587890625, + "logps/rejected": -225.21337890625, + "loss": 0.1751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5409160256385803, + "rewards/margins": 2.6596460342407227, + "rewards/rejected": -3.200562000274658, + "step": 3347 + }, + { + "epoch": 0.39, + "learning_rate": 1.870303172187756e-07, + "logits/chosen": -1.6243312358856201, + "logits/rejected": -2.3969929218292236, + "logps/chosen": -299.90997314453125, + "logps/rejected": -215.3408203125, + "loss": 0.6969, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.449777364730835, + "rewards/margins": 0.9679104089736938, + "rewards/rejected": -2.4176878929138184, + "step": 3348 + }, + { + "epoch": 0.39, + "learning_rate": 1.8699520074915135e-07, + "logits/chosen": -2.13608455657959, + "logits/rejected": -1.9464552402496338, + "logps/chosen": -341.7847595214844, + "logps/rejected": -372.7637023925781, + "loss": 0.2206, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5551711320877075, + "rewards/margins": 2.8088550567626953, + "rewards/rejected": -3.3640263080596924, + "step": 3349 + }, + { + "epoch": 0.39, + "learning_rate": 1.8696008427952708e-07, + "logits/chosen": -2.457134246826172, + "logits/rejected": -2.699411153793335, + "logps/chosen": -118.1162338256836, + "logps/rejected": -126.53826904296875, + "loss": 0.6956, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4161922931671143, + "rewards/margins": 1.3692617416381836, + "rewards/rejected": -2.7854537963867188, + "step": 3350 + }, + { + "epoch": 0.39, + "learning_rate": 1.8692496780990284e-07, + "logits/chosen": -2.621933937072754, + "logits/rejected": -2.5957677364349365, + "logps/chosen": -237.19200134277344, + "logps/rejected": -253.75048828125, + "loss": 0.6668, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4279110431671143, + "rewards/margins": 1.1871042251586914, + "rewards/rejected": -2.6150155067443848, + "step": 3351 + }, + { + "epoch": 0.39, + "learning_rate": 1.868898513402786e-07, + "logits/chosen": -2.619621515274048, + "logits/rejected": -2.8504080772399902, + "logps/chosen": -343.2820739746094, + "logps/rejected": -344.83673095703125, + "loss": 0.1355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027383029460906982, + "rewards/margins": 3.134422779083252, + "rewards/rejected": -3.1618056297302246, + "step": 3352 + }, + { + "epoch": 0.39, + "learning_rate": 1.8685473487065432e-07, + "logits/chosen": -2.3608076572418213, + "logits/rejected": -2.306983709335327, + "logps/chosen": -379.5069580078125, + "logps/rejected": -401.0007019042969, + "loss": 0.2129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9360305666923523, + "rewards/margins": 2.372640609741211, + "rewards/rejected": -3.308670997619629, + "step": 3353 + }, + { + "epoch": 0.39, + "learning_rate": 1.8681961840103007e-07, + "logits/chosen": -2.589881420135498, + "logits/rejected": -2.758453845977783, + "logps/chosen": -323.2960205078125, + "logps/rejected": -200.68292236328125, + "loss": 0.4039, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2434507608413696, + "rewards/margins": 1.2660279273986816, + "rewards/rejected": -2.5094785690307617, + "step": 3354 + }, + { + "epoch": 0.39, + "learning_rate": 1.8678450193140582e-07, + "logits/chosen": -2.2019429206848145, + "logits/rejected": -1.8464664220809937, + "logps/chosen": -168.99490356445312, + "logps/rejected": -349.2186584472656, + "loss": 0.415, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.128709316253662, + "rewards/margins": 3.179288387298584, + "rewards/rejected": -4.307997703552246, + "step": 3355 + }, + { + "epoch": 0.39, + "learning_rate": 1.8674938546178155e-07, + "logits/chosen": -1.8300344944000244, + "logits/rejected": -1.841170310974121, + "logps/chosen": -466.763427734375, + "logps/rejected": -354.9850769042969, + "loss": 0.4169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5381441116333008, + "rewards/margins": 1.2950472831726074, + "rewards/rejected": -1.8331913948059082, + "step": 3356 + }, + { + "epoch": 0.39, + "learning_rate": 1.8671426899215733e-07, + "logits/chosen": -2.251718044281006, + "logits/rejected": -2.151912212371826, + "logps/chosen": -283.08978271484375, + "logps/rejected": -315.6246032714844, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6382632851600647, + "rewards/margins": 2.191474676132202, + "rewards/rejected": -2.829737663269043, + "step": 3357 + }, + { + "epoch": 0.39, + "learning_rate": 1.8667915252253303e-07, + "logits/chosen": -2.8906962871551514, + "logits/rejected": -2.9622652530670166, + "logps/chosen": -199.66702270507812, + "logps/rejected": -184.5212860107422, + "loss": 0.4002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.692035436630249, + "rewards/margins": 2.695136547088623, + "rewards/rejected": -3.387171983718872, + "step": 3358 + }, + { + "epoch": 0.39, + "learning_rate": 1.866440360529088e-07, + "logits/chosen": -2.3680360317230225, + "logits/rejected": -2.2118446826934814, + "logps/chosen": -397.70709228515625, + "logps/rejected": -367.422119140625, + "loss": 0.3387, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28914061188697815, + "rewards/margins": 2.55415940284729, + "rewards/rejected": -2.843299627304077, + "step": 3359 + }, + { + "epoch": 0.39, + "learning_rate": 1.8660891958328457e-07, + "logits/chosen": -2.399174451828003, + "logits/rejected": -2.668753147125244, + "logps/chosen": -418.052001953125, + "logps/rejected": -260.8287353515625, + "loss": 0.2332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9389574527740479, + "rewards/margins": 2.184858798980713, + "rewards/rejected": -3.1238162517547607, + "step": 3360 + }, + { + "epoch": 0.39, + "learning_rate": 1.865738031136603e-07, + "logits/chosen": -2.194044589996338, + "logits/rejected": -2.1435208320617676, + "logps/chosen": -241.66094970703125, + "logps/rejected": -317.36688232421875, + "loss": 0.3136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9685699343681335, + "rewards/margins": 2.0706164836883545, + "rewards/rejected": -3.0391862392425537, + "step": 3361 + }, + { + "epoch": 0.39, + "learning_rate": 1.8653868664403605e-07, + "logits/chosen": -2.2116916179656982, + "logits/rejected": -1.8875223398208618, + "logps/chosen": -268.89697265625, + "logps/rejected": -361.8538513183594, + "loss": 0.3648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45923325419425964, + "rewards/margins": 1.6703208684921265, + "rewards/rejected": -2.129554033279419, + "step": 3362 + }, + { + "epoch": 0.39, + "learning_rate": 1.865035701744118e-07, + "logits/chosen": -1.744698405265808, + "logits/rejected": -1.8603202104568481, + "logps/chosen": -358.98724365234375, + "logps/rejected": -429.40264892578125, + "loss": 0.1654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4685186743736267, + "rewards/margins": 2.951120376586914, + "rewards/rejected": -3.4196391105651855, + "step": 3363 + }, + { + "epoch": 0.39, + "learning_rate": 1.8646845370478753e-07, + "logits/chosen": -2.4019176959991455, + "logits/rejected": -2.658358573913574, + "logps/chosen": -271.1411437988281, + "logps/rejected": -203.48394775390625, + "loss": 0.6841, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7192385196685791, + "rewards/margins": 2.4681644439697266, + "rewards/rejected": -3.1874027252197266, + "step": 3364 + }, + { + "epoch": 0.39, + "learning_rate": 1.8643333723516328e-07, + "logits/chosen": -2.703979015350342, + "logits/rejected": -2.5614943504333496, + "logps/chosen": -253.2201385498047, + "logps/rejected": -282.93695068359375, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5284875631332397, + "rewards/margins": 1.4160255193710327, + "rewards/rejected": -1.944513201713562, + "step": 3365 + }, + { + "epoch": 0.39, + "learning_rate": 1.86398220765539e-07, + "logits/chosen": -2.5771634578704834, + "logits/rejected": -2.460827589035034, + "logps/chosen": -81.35899353027344, + "logps/rejected": -256.68951416015625, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.188012957572937, + "rewards/margins": 3.386077880859375, + "rewards/rejected": -3.5740909576416016, + "step": 3366 + }, + { + "epoch": 0.39, + "learning_rate": 1.8636310429591476e-07, + "logits/chosen": -2.968655586242676, + "logits/rejected": -2.9772071838378906, + "logps/chosen": -109.47103118896484, + "logps/rejected": -144.55810546875, + "loss": 0.5948, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4669816493988037, + "rewards/margins": 0.9986754655838013, + "rewards/rejected": -2.4656572341918945, + "step": 3367 + }, + { + "epoch": 0.39, + "learning_rate": 1.8632798782629054e-07, + "logits/chosen": -1.9455219507217407, + "logits/rejected": -1.8250346183776855, + "logps/chosen": -363.8976745605469, + "logps/rejected": -461.7969055175781, + "loss": 0.5937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8163896203041077, + "rewards/margins": 0.8989923596382141, + "rewards/rejected": -1.7153819799423218, + "step": 3368 + }, + { + "epoch": 0.39, + "learning_rate": 1.8629287135666627e-07, + "logits/chosen": -2.7903952598571777, + "logits/rejected": -2.6144700050354004, + "logps/chosen": -230.0545654296875, + "logps/rejected": -260.32171630859375, + "loss": 0.4601, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3493849039077759, + "rewards/margins": 2.23892879486084, + "rewards/rejected": -3.588313579559326, + "step": 3369 + }, + { + "epoch": 0.39, + "learning_rate": 1.8625775488704203e-07, + "logits/chosen": -2.304051637649536, + "logits/rejected": -2.131971597671509, + "logps/chosen": -315.7696228027344, + "logps/rejected": -389.2937316894531, + "loss": 0.4542, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3364452123641968, + "rewards/margins": 1.5038397312164307, + "rewards/rejected": -1.840285062789917, + "step": 3370 + }, + { + "epoch": 0.39, + "learning_rate": 1.8622263841741778e-07, + "logits/chosen": -2.037550687789917, + "logits/rejected": -2.216778039932251, + "logps/chosen": -381.77978515625, + "logps/rejected": -330.2088623046875, + "loss": 0.493, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1696697473526, + "rewards/margins": 1.4865283966064453, + "rewards/rejected": -2.656198024749756, + "step": 3371 + }, + { + "epoch": 0.39, + "learning_rate": 1.861875219477935e-07, + "logits/chosen": -2.208996057510376, + "logits/rejected": -2.189384698867798, + "logps/chosen": -292.33966064453125, + "logps/rejected": -260.5354309082031, + "loss": 0.2337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6493122577667236, + "rewards/margins": 3.3402228355407715, + "rewards/rejected": -3.989534854888916, + "step": 3372 + }, + { + "epoch": 0.39, + "learning_rate": 1.8615240547816926e-07, + "logits/chosen": -2.5186855792999268, + "logits/rejected": -2.773037910461426, + "logps/chosen": -414.7930603027344, + "logps/rejected": -405.038330078125, + "loss": 0.2197, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8507785797119141, + "rewards/margins": 2.5604043006896973, + "rewards/rejected": -3.4111826419830322, + "step": 3373 + }, + { + "epoch": 0.39, + "learning_rate": 1.86117289008545e-07, + "logits/chosen": -2.2503886222839355, + "logits/rejected": -2.303617477416992, + "logps/chosen": -289.8755798339844, + "logps/rejected": -272.7761535644531, + "loss": 0.5003, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5725683569908142, + "rewards/margins": 1.2065985202789307, + "rewards/rejected": -1.7791666984558105, + "step": 3374 + }, + { + "epoch": 0.39, + "learning_rate": 1.8608217253892074e-07, + "logits/chosen": -2.74006986618042, + "logits/rejected": -2.6067066192626953, + "logps/chosen": -345.6149597167969, + "logps/rejected": -212.00848388671875, + "loss": 0.8343, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.115536689758301, + "rewards/margins": 0.13934031128883362, + "rewards/rejected": -2.2548770904541016, + "step": 3375 + }, + { + "epoch": 0.39, + "learning_rate": 1.860470560692965e-07, + "logits/chosen": -2.2876956462860107, + "logits/rejected": -2.5081100463867188, + "logps/chosen": -445.2192077636719, + "logps/rejected": -174.2621612548828, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2051786184310913, + "rewards/margins": 1.370936393737793, + "rewards/rejected": -2.5761148929595947, + "step": 3376 + }, + { + "epoch": 0.39, + "learning_rate": 1.8601193959967222e-07, + "logits/chosen": -2.1680057048797607, + "logits/rejected": -2.3232240676879883, + "logps/chosen": -269.560546875, + "logps/rejected": -329.1969299316406, + "loss": 0.422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9305271506309509, + "rewards/margins": 2.1897599697113037, + "rewards/rejected": -3.1202869415283203, + "step": 3377 + }, + { + "epoch": 0.39, + "learning_rate": 1.8597682313004798e-07, + "logits/chosen": -2.2501184940338135, + "logits/rejected": -2.2302794456481934, + "logps/chosen": -225.0631103515625, + "logps/rejected": -251.88851928710938, + "loss": 0.4313, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35781070590019226, + "rewards/margins": 1.6385983228683472, + "rewards/rejected": -1.9964090585708618, + "step": 3378 + }, + { + "epoch": 0.39, + "learning_rate": 1.859417066604237e-07, + "logits/chosen": -2.8830668926239014, + "logits/rejected": -2.759021282196045, + "logps/chosen": -169.9639892578125, + "logps/rejected": -313.7715759277344, + "loss": 0.4947, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5572706460952759, + "rewards/margins": 1.8487565517425537, + "rewards/rejected": -2.40602707862854, + "step": 3379 + }, + { + "epoch": 0.39, + "learning_rate": 1.8590659019079949e-07, + "logits/chosen": -2.4445698261260986, + "logits/rejected": -2.7010834217071533, + "logps/chosen": -321.0478210449219, + "logps/rejected": -246.68731689453125, + "loss": 1.0877, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7644761800765991, + "rewards/margins": 0.5509693026542664, + "rewards/rejected": -2.3154456615448, + "step": 3380 + }, + { + "epoch": 0.39, + "learning_rate": 1.8587147372117524e-07, + "logits/chosen": -2.4870762825012207, + "logits/rejected": -2.51261043548584, + "logps/chosen": -411.60931396484375, + "logps/rejected": -255.3831024169922, + "loss": 0.3283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8490490317344666, + "rewards/margins": 1.4142810106277466, + "rewards/rejected": -2.2633299827575684, + "step": 3381 + }, + { + "epoch": 0.39, + "learning_rate": 1.8583635725155097e-07, + "logits/chosen": -2.649639368057251, + "logits/rejected": -2.5207180976867676, + "logps/chosen": -365.3172912597656, + "logps/rejected": -417.9657287597656, + "loss": 0.8474, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.853009819984436, + "rewards/margins": 2.339107036590576, + "rewards/rejected": -4.192116737365723, + "step": 3382 + }, + { + "epoch": 0.39, + "learning_rate": 1.8580124078192672e-07, + "logits/chosen": -2.103750705718994, + "logits/rejected": -1.9487168788909912, + "logps/chosen": -218.8616943359375, + "logps/rejected": -239.6990203857422, + "loss": 0.1781, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40333041548728943, + "rewards/margins": 2.815403461456299, + "rewards/rejected": -3.2187342643737793, + "step": 3383 + }, + { + "epoch": 0.39, + "learning_rate": 1.8576612431230247e-07, + "logits/chosen": -2.4388818740844727, + "logits/rejected": -2.3547487258911133, + "logps/chosen": -319.194580078125, + "logps/rejected": -360.4434509277344, + "loss": 0.2157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40508148074150085, + "rewards/margins": 2.6700081825256348, + "rewards/rejected": -3.075089931488037, + "step": 3384 + }, + { + "epoch": 0.39, + "learning_rate": 1.857310078426782e-07, + "logits/chosen": -2.1515731811523438, + "logits/rejected": -1.9663879871368408, + "logps/chosen": -254.949462890625, + "logps/rejected": -453.38092041015625, + "loss": 0.5964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7666144967079163, + "rewards/margins": 2.0424859523773193, + "rewards/rejected": -2.809100389480591, + "step": 3385 + }, + { + "epoch": 0.39, + "learning_rate": 1.8569589137305396e-07, + "logits/chosen": -2.032773733139038, + "logits/rejected": -2.064457416534424, + "logps/chosen": -423.35321044921875, + "logps/rejected": -386.62188720703125, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8877373337745667, + "rewards/margins": 1.7660033702850342, + "rewards/rejected": -2.653740882873535, + "step": 3386 + }, + { + "epoch": 0.39, + "learning_rate": 1.8566077490342968e-07, + "logits/chosen": -2.295409679412842, + "logits/rejected": -2.3349592685699463, + "logps/chosen": -359.2001647949219, + "logps/rejected": -304.81292724609375, + "loss": 0.5751, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5686944723129272, + "rewards/margins": 3.1644392013549805, + "rewards/rejected": -3.733133554458618, + "step": 3387 + }, + { + "epoch": 0.39, + "learning_rate": 1.8562565843380544e-07, + "logits/chosen": -1.705788493156433, + "logits/rejected": -2.1030375957489014, + "logps/chosen": -371.24267578125, + "logps/rejected": -281.3485107421875, + "loss": 0.2977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3992070257663727, + "rewards/margins": 3.0532162189483643, + "rewards/rejected": -3.452423095703125, + "step": 3388 + }, + { + "epoch": 0.39, + "learning_rate": 1.855905419641812e-07, + "logits/chosen": -2.4686198234558105, + "logits/rejected": -2.4929451942443848, + "logps/chosen": -171.825927734375, + "logps/rejected": -248.1038818359375, + "loss": 0.1653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5349862575531006, + "rewards/margins": 2.1989879608154297, + "rewards/rejected": -3.7339744567871094, + "step": 3389 + }, + { + "epoch": 0.39, + "learning_rate": 1.8555542549455692e-07, + "logits/chosen": -2.56423020362854, + "logits/rejected": -2.316068410873413, + "logps/chosen": -241.83448791503906, + "logps/rejected": -345.6561584472656, + "loss": 0.5902, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6194920539855957, + "rewards/margins": 2.1035547256469727, + "rewards/rejected": -3.7230470180511475, + "step": 3390 + }, + { + "epoch": 0.39, + "learning_rate": 1.855203090249327e-07, + "logits/chosen": -2.595362424850464, + "logits/rejected": -2.268951416015625, + "logps/chosen": -215.6515350341797, + "logps/rejected": -494.3312683105469, + "loss": 0.6374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7373983860015869, + "rewards/margins": 1.5893802642822266, + "rewards/rejected": -2.3267786502838135, + "step": 3391 + }, + { + "epoch": 0.39, + "learning_rate": 1.8548519255530845e-07, + "logits/chosen": -2.278663396835327, + "logits/rejected": -2.023400068283081, + "logps/chosen": -259.4244689941406, + "logps/rejected": -339.81298828125, + "loss": 0.745, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9847474098205566, + "rewards/margins": 1.4811843633651733, + "rewards/rejected": -2.4659316539764404, + "step": 3392 + }, + { + "epoch": 0.39, + "learning_rate": 1.8545007608568418e-07, + "logits/chosen": -2.5649702548980713, + "logits/rejected": -2.753958225250244, + "logps/chosen": -351.5419006347656, + "logps/rejected": -283.36578369140625, + "loss": 0.3662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9452238082885742, + "rewards/margins": 2.0935912132263184, + "rewards/rejected": -3.038815498352051, + "step": 3393 + }, + { + "epoch": 0.39, + "learning_rate": 1.8541495961605993e-07, + "logits/chosen": -2.4002151489257812, + "logits/rejected": -2.129815101623535, + "logps/chosen": -152.96365356445312, + "logps/rejected": -307.0616760253906, + "loss": 0.7059, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2729295492172241, + "rewards/margins": 2.132801055908203, + "rewards/rejected": -3.405730724334717, + "step": 3394 + }, + { + "epoch": 0.39, + "learning_rate": 1.8537984314643566e-07, + "logits/chosen": -2.0420026779174805, + "logits/rejected": -2.179887056350708, + "logps/chosen": -300.0724792480469, + "logps/rejected": -343.71368408203125, + "loss": 0.2869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.480202317237854, + "rewards/margins": 1.9813860654830933, + "rewards/rejected": -2.4615883827209473, + "step": 3395 + }, + { + "epoch": 0.39, + "learning_rate": 1.8534472667681141e-07, + "logits/chosen": -2.2286083698272705, + "logits/rejected": -2.3431074619293213, + "logps/chosen": -365.346923828125, + "logps/rejected": -384.70550537109375, + "loss": 0.4592, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8660470247268677, + "rewards/margins": 3.2943294048309326, + "rewards/rejected": -4.16037654876709, + "step": 3396 + }, + { + "epoch": 0.39, + "learning_rate": 1.8530961020718717e-07, + "logits/chosen": -2.3979122638702393, + "logits/rejected": -2.3407840728759766, + "logps/chosen": -228.0629425048828, + "logps/rejected": -307.26983642578125, + "loss": 0.1365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04852994531393051, + "rewards/margins": 2.803269386291504, + "rewards/rejected": -2.851799249649048, + "step": 3397 + }, + { + "epoch": 0.39, + "learning_rate": 1.852744937375629e-07, + "logits/chosen": -2.4248695373535156, + "logits/rejected": -2.3506250381469727, + "logps/chosen": -238.00733947753906, + "logps/rejected": -202.18649291992188, + "loss": 1.1, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4774333238601685, + "rewards/margins": 0.7345917820930481, + "rewards/rejected": -2.2120251655578613, + "step": 3398 + }, + { + "epoch": 0.39, + "learning_rate": 1.8523937726793865e-07, + "logits/chosen": -1.9439349174499512, + "logits/rejected": -2.2717738151550293, + "logps/chosen": -287.94793701171875, + "logps/rejected": -193.94236755371094, + "loss": 0.255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45211362838745117, + "rewards/margins": 2.188429832458496, + "rewards/rejected": -2.6405434608459473, + "step": 3399 + }, + { + "epoch": 0.39, + "learning_rate": 1.852042607983144e-07, + "logits/chosen": -2.835299015045166, + "logits/rejected": -2.5489821434020996, + "logps/chosen": -316.1351013183594, + "logps/rejected": -252.37451171875, + "loss": 0.3877, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.146697759628296, + "rewards/margins": 1.9709025621414185, + "rewards/rejected": -3.1175999641418457, + "step": 3400 + }, + { + "epoch": 0.39, + "learning_rate": 1.8516914432869013e-07, + "logits/chosen": -2.239596366882324, + "logits/rejected": -2.6059436798095703, + "logps/chosen": -142.97195434570312, + "logps/rejected": -185.1639862060547, + "loss": 0.8141, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2375797033309937, + "rewards/margins": 0.5797410011291504, + "rewards/rejected": -1.817320704460144, + "step": 3401 + }, + { + "epoch": 0.39, + "learning_rate": 1.851340278590659e-07, + "logits/chosen": -2.120612621307373, + "logits/rejected": -1.992894172668457, + "logps/chosen": -222.405029296875, + "logps/rejected": -288.6795349121094, + "loss": 0.3806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3140937089920044, + "rewards/margins": 1.6186646223068237, + "rewards/rejected": -1.9327583312988281, + "step": 3402 + }, + { + "epoch": 0.39, + "learning_rate": 1.8509891138944164e-07, + "logits/chosen": -2.527972459793091, + "logits/rejected": -2.5178041458129883, + "logps/chosen": -421.0894775390625, + "logps/rejected": -340.74884033203125, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2567298710346222, + "rewards/margins": 4.073417663574219, + "rewards/rejected": -3.816687822341919, + "step": 3403 + }, + { + "epoch": 0.39, + "learning_rate": 1.850637949198174e-07, + "logits/chosen": -2.2915356159210205, + "logits/rejected": -2.5445845127105713, + "logps/chosen": -401.07415771484375, + "logps/rejected": -193.94857788085938, + "loss": 0.2978, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35167810320854187, + "rewards/margins": 1.9997117519378662, + "rewards/rejected": -2.3513898849487305, + "step": 3404 + }, + { + "epoch": 0.39, + "learning_rate": 1.8502867845019315e-07, + "logits/chosen": -1.7336556911468506, + "logits/rejected": -2.083434820175171, + "logps/chosen": -274.19696044921875, + "logps/rejected": -320.6673889160156, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7096796035766602, + "rewards/margins": 2.2472097873687744, + "rewards/rejected": -2.9568893909454346, + "step": 3405 + }, + { + "epoch": 0.39, + "learning_rate": 1.8499356198056887e-07, + "logits/chosen": -2.3199639320373535, + "logits/rejected": -2.3964173793792725, + "logps/chosen": -265.72808837890625, + "logps/rejected": -182.88223266601562, + "loss": 0.7466, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5249269008636475, + "rewards/margins": 2.0112147331237793, + "rewards/rejected": -3.5361416339874268, + "step": 3406 + }, + { + "epoch": 0.39, + "learning_rate": 1.8495844551094463e-07, + "logits/chosen": -2.137009620666504, + "logits/rejected": -2.2146987915039062, + "logps/chosen": -317.2880859375, + "logps/rejected": -271.37677001953125, + "loss": 1.0753, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2432917356491089, + "rewards/margins": 0.9869368076324463, + "rewards/rejected": -2.2302284240722656, + "step": 3407 + }, + { + "epoch": 0.39, + "learning_rate": 1.8492332904132038e-07, + "logits/chosen": -1.9896495342254639, + "logits/rejected": -2.118257522583008, + "logps/chosen": -461.888916015625, + "logps/rejected": -315.3835144042969, + "loss": 0.4678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20744407176971436, + "rewards/margins": 1.127053141593933, + "rewards/rejected": -1.3344972133636475, + "step": 3408 + }, + { + "epoch": 0.39, + "learning_rate": 1.848882125716961e-07, + "logits/chosen": -2.2308077812194824, + "logits/rejected": -2.42933988571167, + "logps/chosen": -331.5589599609375, + "logps/rejected": -377.89117431640625, + "loss": 0.2996, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.590970516204834, + "rewards/margins": 2.6322686672210693, + "rewards/rejected": -3.2232391834259033, + "step": 3409 + }, + { + "epoch": 0.39, + "learning_rate": 1.8485309610207186e-07, + "logits/chosen": -2.140857219696045, + "logits/rejected": -1.6103229522705078, + "logps/chosen": -139.7379608154297, + "logps/rejected": -289.3840637207031, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.133813500404358, + "rewards/margins": 2.0659310817718506, + "rewards/rejected": -3.199744701385498, + "step": 3410 + }, + { + "epoch": 0.39, + "learning_rate": 1.848179796324476e-07, + "logits/chosen": -2.829601764678955, + "logits/rejected": -2.648491144180298, + "logps/chosen": -176.93214416503906, + "logps/rejected": -190.733642578125, + "loss": 0.2628, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49186548590660095, + "rewards/margins": 2.5169551372528076, + "rewards/rejected": -3.0088207721710205, + "step": 3411 + }, + { + "epoch": 0.39, + "learning_rate": 1.8478286316282334e-07, + "logits/chosen": -2.9413952827453613, + "logits/rejected": -2.7996113300323486, + "logps/chosen": -226.05661010742188, + "logps/rejected": -183.42056274414062, + "loss": 0.5689, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2002955675125122, + "rewards/margins": 0.786704421043396, + "rewards/rejected": -1.9870001077651978, + "step": 3412 + }, + { + "epoch": 0.39, + "learning_rate": 1.8474774669319912e-07, + "logits/chosen": -1.5751748085021973, + "logits/rejected": -2.168698787689209, + "logps/chosen": -546.34619140625, + "logps/rejected": -256.2280578613281, + "loss": 0.4837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5569456815719604, + "rewards/margins": 1.357166051864624, + "rewards/rejected": -1.914111614227295, + "step": 3413 + }, + { + "epoch": 0.39, + "learning_rate": 1.8471263022357485e-07, + "logits/chosen": -2.1594247817993164, + "logits/rejected": -1.854740858078003, + "logps/chosen": -180.4992218017578, + "logps/rejected": -262.63800048828125, + "loss": 0.6184, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9312217235565186, + "rewards/margins": 0.736966609954834, + "rewards/rejected": -2.6681885719299316, + "step": 3414 + }, + { + "epoch": 0.39, + "learning_rate": 1.846775137539506e-07, + "logits/chosen": -2.6055221557617188, + "logits/rejected": -2.5790107250213623, + "logps/chosen": -214.24429321289062, + "logps/rejected": -323.7753601074219, + "loss": 0.2825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6614896059036255, + "rewards/margins": 3.2226641178131104, + "rewards/rejected": -3.8841538429260254, + "step": 3415 + }, + { + "epoch": 0.39, + "learning_rate": 1.8464239728432636e-07, + "logits/chosen": -2.398256301879883, + "logits/rejected": -2.0368099212646484, + "logps/chosen": -105.05029296875, + "logps/rejected": -220.4765167236328, + "loss": 0.2331, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0012028217315674, + "rewards/margins": 2.1322977542877197, + "rewards/rejected": -3.133500576019287, + "step": 3416 + }, + { + "epoch": 0.39, + "learning_rate": 1.8460728081470209e-07, + "logits/chosen": -2.3726258277893066, + "logits/rejected": -2.5254130363464355, + "logps/chosen": -474.9898681640625, + "logps/rejected": -237.50070190429688, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9599605798721313, + "rewards/margins": 2.540956974029541, + "rewards/rejected": -3.500917434692383, + "step": 3417 + }, + { + "epoch": 0.39, + "learning_rate": 1.8457216434507784e-07, + "logits/chosen": -2.440474510192871, + "logits/rejected": -2.7279131412506104, + "logps/chosen": -608.5242919921875, + "logps/rejected": -338.7431640625, + "loss": 0.4577, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4355657696723938, + "rewards/margins": 1.4646201133728027, + "rewards/rejected": -1.9001859426498413, + "step": 3418 + }, + { + "epoch": 0.39, + "learning_rate": 1.8453704787545357e-07, + "logits/chosen": -2.6975629329681396, + "logits/rejected": -2.770758628845215, + "logps/chosen": -273.75421142578125, + "logps/rejected": -323.1429138183594, + "loss": 0.606, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6311876177787781, + "rewards/margins": 1.765822172164917, + "rewards/rejected": -2.39700984954834, + "step": 3419 + }, + { + "epoch": 0.39, + "learning_rate": 1.8450193140582932e-07, + "logits/chosen": -1.4890124797821045, + "logits/rejected": -1.6577856540679932, + "logps/chosen": -202.8198699951172, + "logps/rejected": -169.1015625, + "loss": 0.2084, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17943274974822998, + "rewards/margins": 1.8780845403671265, + "rewards/rejected": -1.6986517906188965, + "step": 3420 + }, + { + "epoch": 0.39, + "learning_rate": 1.8446681493620508e-07, + "logits/chosen": -1.7175010442733765, + "logits/rejected": -1.8674627542495728, + "logps/chosen": -442.4498291015625, + "logps/rejected": -326.35650634765625, + "loss": 0.3526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3522869944572449, + "rewards/margins": 1.6029689311981201, + "rewards/rejected": -1.9552558660507202, + "step": 3421 + }, + { + "epoch": 0.39, + "learning_rate": 1.844316984665808e-07, + "logits/chosen": -2.0480995178222656, + "logits/rejected": -2.2931413650512695, + "logps/chosen": -317.51336669921875, + "logps/rejected": -188.74441528320312, + "loss": 0.8844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3769139051437378, + "rewards/margins": 0.006922446191310883, + "rewards/rejected": -1.383836269378662, + "step": 3422 + }, + { + "epoch": 0.39, + "learning_rate": 1.8439658199695656e-07, + "logits/chosen": -2.7413713932037354, + "logits/rejected": -2.554705858230591, + "logps/chosen": -472.4212951660156, + "logps/rejected": -339.3109130859375, + "loss": 0.5113, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2434289455413818, + "rewards/margins": 1.0031095743179321, + "rewards/rejected": -2.2465386390686035, + "step": 3423 + }, + { + "epoch": 0.39, + "learning_rate": 1.8436146552733234e-07, + "logits/chosen": -2.4752612113952637, + "logits/rejected": -2.279817819595337, + "logps/chosen": -148.1783447265625, + "logps/rejected": -387.62164306640625, + "loss": 0.2077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9308451414108276, + "rewards/margins": 3.780398368835449, + "rewards/rejected": -4.711243152618408, + "step": 3424 + }, + { + "epoch": 0.39, + "learning_rate": 1.8432634905770806e-07, + "logits/chosen": -2.304647922515869, + "logits/rejected": -1.9831498861312866, + "logps/chosen": -231.42169189453125, + "logps/rejected": -319.0210876464844, + "loss": 0.5531, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.317940354347229, + "rewards/margins": 1.8683346509933472, + "rewards/rejected": -3.1862752437591553, + "step": 3425 + }, + { + "epoch": 0.39, + "learning_rate": 1.8429123258808382e-07, + "logits/chosen": -2.5977423191070557, + "logits/rejected": -2.5973618030548096, + "logps/chosen": -308.49652099609375, + "logps/rejected": -301.1061706542969, + "loss": 0.4871, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6410678625106812, + "rewards/margins": 0.7835596799850464, + "rewards/rejected": -1.424627423286438, + "step": 3426 + }, + { + "epoch": 0.4, + "learning_rate": 1.8425611611845955e-07, + "logits/chosen": -1.5811134576797485, + "logits/rejected": -2.2426202297210693, + "logps/chosen": -500.0428161621094, + "logps/rejected": -209.42295837402344, + "loss": 0.5716, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48035481572151184, + "rewards/margins": 1.8688596487045288, + "rewards/rejected": -2.3492143154144287, + "step": 3427 + }, + { + "epoch": 0.4, + "learning_rate": 1.842209996488353e-07, + "logits/chosen": -2.1000161170959473, + "logits/rejected": -2.0031778812408447, + "logps/chosen": -454.4288024902344, + "logps/rejected": -437.28643798828125, + "loss": 0.4643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7754945158958435, + "rewards/margins": 1.515150785446167, + "rewards/rejected": -2.2906453609466553, + "step": 3428 + }, + { + "epoch": 0.4, + "learning_rate": 1.8418588317921105e-07, + "logits/chosen": -2.024393320083618, + "logits/rejected": -2.055626630783081, + "logps/chosen": -321.6412658691406, + "logps/rejected": -393.0633544921875, + "loss": 0.2301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26654767990112305, + "rewards/margins": 1.9147021770477295, + "rewards/rejected": -2.1812496185302734, + "step": 3429 + }, + { + "epoch": 0.4, + "learning_rate": 1.8415076670958678e-07, + "logits/chosen": -2.2553327083587646, + "logits/rejected": -2.038856267929077, + "logps/chosen": -180.28109741210938, + "logps/rejected": -179.01329040527344, + "loss": 0.3653, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5651673674583435, + "rewards/margins": 1.3413543701171875, + "rewards/rejected": -1.9065215587615967, + "step": 3430 + }, + { + "epoch": 0.4, + "learning_rate": 1.8411565023996253e-07, + "logits/chosen": -2.3348352909088135, + "logits/rejected": -2.3064115047454834, + "logps/chosen": -176.42330932617188, + "logps/rejected": -304.560546875, + "loss": 0.1524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0036036819219589233, + "rewards/margins": 3.866057872772217, + "rewards/rejected": -3.8624541759490967, + "step": 3431 + }, + { + "epoch": 0.4, + "learning_rate": 1.8408053377033826e-07, + "logits/chosen": -2.514810562133789, + "logits/rejected": -2.1090312004089355, + "logps/chosen": -167.50149536132812, + "logps/rejected": -271.534423828125, + "loss": 0.1831, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1971008777618408, + "rewards/margins": 2.5549917221069336, + "rewards/rejected": -3.7520923614501953, + "step": 3432 + }, + { + "epoch": 0.4, + "learning_rate": 1.8404541730071402e-07, + "logits/chosen": -2.263019561767578, + "logits/rejected": -1.9263297319412231, + "logps/chosen": -158.52940368652344, + "logps/rejected": -342.97894287109375, + "loss": 0.4628, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3894798755645752, + "rewards/margins": 3.7545597553253174, + "rewards/rejected": -5.144039630889893, + "step": 3433 + }, + { + "epoch": 0.4, + "learning_rate": 1.8401030083108977e-07, + "logits/chosen": -1.4527701139450073, + "logits/rejected": -1.626400113105774, + "logps/chosen": -359.1523742675781, + "logps/rejected": -335.7385559082031, + "loss": 0.6072, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0598480701446533, + "rewards/margins": 1.1673493385314941, + "rewards/rejected": -2.2271976470947266, + "step": 3434 + }, + { + "epoch": 0.4, + "learning_rate": 1.839751843614655e-07, + "logits/chosen": -2.4720959663391113, + "logits/rejected": -2.4390199184417725, + "logps/chosen": -133.0812530517578, + "logps/rejected": -198.67361450195312, + "loss": 0.2681, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4110444784164429, + "rewards/margins": 2.685079336166382, + "rewards/rejected": -4.096123695373535, + "step": 3435 + }, + { + "epoch": 0.4, + "learning_rate": 1.8394006789184128e-07, + "logits/chosen": -2.475752830505371, + "logits/rejected": -2.40789532661438, + "logps/chosen": -280.2025146484375, + "logps/rejected": -286.03131103515625, + "loss": 0.4635, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8780519366264343, + "rewards/margins": 1.488355040550232, + "rewards/rejected": -2.3664069175720215, + "step": 3436 + }, + { + "epoch": 0.4, + "learning_rate": 1.8390495142221703e-07, + "logits/chosen": -2.2878284454345703, + "logits/rejected": -2.275428056716919, + "logps/chosen": -305.6111755371094, + "logps/rejected": -270.358642578125, + "loss": 0.4512, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8396450281143188, + "rewards/margins": 2.392852783203125, + "rewards/rejected": -3.2324979305267334, + "step": 3437 + }, + { + "epoch": 0.4, + "learning_rate": 1.8386983495259276e-07, + "logits/chosen": -2.416740894317627, + "logits/rejected": -2.323178291320801, + "logps/chosen": -151.35694885253906, + "logps/rejected": -157.68800354003906, + "loss": 0.1754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9257364273071289, + "rewards/margins": 2.7975382804870605, + "rewards/rejected": -3.7232747077941895, + "step": 3438 + }, + { + "epoch": 0.4, + "learning_rate": 1.838347184829685e-07, + "logits/chosen": -2.4304933547973633, + "logits/rejected": -2.45039963722229, + "logps/chosen": -187.50180053710938, + "logps/rejected": -261.7891845703125, + "loss": 0.2848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4344600439071655, + "rewards/margins": 2.4195942878723145, + "rewards/rejected": -2.8540544509887695, + "step": 3439 + }, + { + "epoch": 0.4, + "learning_rate": 1.8379960201334424e-07, + "logits/chosen": -2.393094539642334, + "logits/rejected": -2.6576671600341797, + "logps/chosen": -345.8524169921875, + "logps/rejected": -240.37399291992188, + "loss": 0.3092, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4843888282775879, + "rewards/margins": 1.809259057044983, + "rewards/rejected": -2.2936477661132812, + "step": 3440 + }, + { + "epoch": 0.4, + "learning_rate": 1.8376448554372e-07, + "logits/chosen": -1.971710443496704, + "logits/rejected": -2.308285713195801, + "logps/chosen": -377.4404296875, + "logps/rejected": -337.74871826171875, + "loss": 0.2937, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5322570204734802, + "rewards/margins": 2.1715986728668213, + "rewards/rejected": -2.7038557529449463, + "step": 3441 + }, + { + "epoch": 0.4, + "learning_rate": 1.8372936907409575e-07, + "logits/chosen": -2.087322235107422, + "logits/rejected": -2.3493494987487793, + "logps/chosen": -300.6502990722656, + "logps/rejected": -162.88839721679688, + "loss": 0.3971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4866185188293457, + "rewards/margins": 1.456489086151123, + "rewards/rejected": -1.9431074857711792, + "step": 3442 + }, + { + "epoch": 0.4, + "learning_rate": 1.8369425260447148e-07, + "logits/chosen": -1.9774850606918335, + "logits/rejected": -2.070903778076172, + "logps/chosen": -425.0703430175781, + "logps/rejected": -410.8796691894531, + "loss": 0.5444, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9512521028518677, + "rewards/margins": 1.366652250289917, + "rewards/rejected": -2.317904233932495, + "step": 3443 + }, + { + "epoch": 0.4, + "learning_rate": 1.8365913613484723e-07, + "logits/chosen": -2.0322935581207275, + "logits/rejected": -1.9639601707458496, + "logps/chosen": -188.43780517578125, + "logps/rejected": -228.18582153320312, + "loss": 0.1241, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.668282151222229, + "rewards/margins": 4.426602840423584, + "rewards/rejected": -5.094885349273682, + "step": 3444 + }, + { + "epoch": 0.4, + "learning_rate": 1.8362401966522298e-07, + "logits/chosen": -2.336460590362549, + "logits/rejected": -1.9889415502548218, + "logps/chosen": -259.35986328125, + "logps/rejected": -276.63702392578125, + "loss": 0.3073, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9413301944732666, + "rewards/margins": 2.235980272293091, + "rewards/rejected": -3.1773104667663574, + "step": 3445 + }, + { + "epoch": 0.4, + "learning_rate": 1.835889031955987e-07, + "logits/chosen": -2.0992698669433594, + "logits/rejected": -2.0682005882263184, + "logps/chosen": -215.6234130859375, + "logps/rejected": -260.3670654296875, + "loss": 0.3452, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5860075950622559, + "rewards/margins": 2.7101235389709473, + "rewards/rejected": -3.296131134033203, + "step": 3446 + }, + { + "epoch": 0.4, + "learning_rate": 1.835537867259745e-07, + "logits/chosen": -2.0305747985839844, + "logits/rejected": -2.350179672241211, + "logps/chosen": -419.2716064453125, + "logps/rejected": -357.0075988769531, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02379731833934784, + "rewards/margins": 4.031942844390869, + "rewards/rejected": -4.0557403564453125, + "step": 3447 + }, + { + "epoch": 0.4, + "learning_rate": 1.8351867025635022e-07, + "logits/chosen": -2.3320744037628174, + "logits/rejected": -2.176877975463867, + "logps/chosen": -270.05059814453125, + "logps/rejected": -241.45932006835938, + "loss": 0.3719, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.116614818572998, + "rewards/margins": 1.6707346439361572, + "rewards/rejected": -2.7873494625091553, + "step": 3448 + }, + { + "epoch": 0.4, + "learning_rate": 1.8348355378672597e-07, + "logits/chosen": -2.338946580886841, + "logits/rejected": -2.5132973194122314, + "logps/chosen": -336.7106628417969, + "logps/rejected": -358.3967590332031, + "loss": 0.175, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0246039628982544, + "rewards/margins": 4.471155643463135, + "rewards/rejected": -5.495759963989258, + "step": 3449 + }, + { + "epoch": 0.4, + "learning_rate": 1.8344843731710173e-07, + "logits/chosen": -2.167855739593506, + "logits/rejected": -2.0119822025299072, + "logps/chosen": -425.5822448730469, + "logps/rejected": -336.1635437011719, + "loss": 0.2078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5367789268493652, + "rewards/margins": 1.8740911483764648, + "rewards/rejected": -2.41087007522583, + "step": 3450 + }, + { + "epoch": 0.4, + "learning_rate": 1.8341332084747745e-07, + "logits/chosen": -2.3186135292053223, + "logits/rejected": -2.550985813140869, + "logps/chosen": -260.6858215332031, + "logps/rejected": -227.74618530273438, + "loss": 0.2833, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5968679189682007, + "rewards/margins": 2.5525293350219727, + "rewards/rejected": -3.1493968963623047, + "step": 3451 + }, + { + "epoch": 0.4, + "learning_rate": 1.833782043778532e-07, + "logits/chosen": -2.965827226638794, + "logits/rejected": -2.7264623641967773, + "logps/chosen": -224.6422119140625, + "logps/rejected": -155.17440795898438, + "loss": 0.2687, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7486773729324341, + "rewards/margins": 1.7954328060150146, + "rewards/rejected": -2.544110059738159, + "step": 3452 + }, + { + "epoch": 0.4, + "learning_rate": 1.8334308790822896e-07, + "logits/chosen": -2.4663000106811523, + "logits/rejected": -2.224977970123291, + "logps/chosen": -241.29946899414062, + "logps/rejected": -266.1739501953125, + "loss": 0.3147, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0313725471496582, + "rewards/margins": 2.241791009902954, + "rewards/rejected": -3.273163318634033, + "step": 3453 + }, + { + "epoch": 0.4, + "learning_rate": 1.833079714386047e-07, + "logits/chosen": -2.8049371242523193, + "logits/rejected": -2.638559579849243, + "logps/chosen": -197.1254425048828, + "logps/rejected": -221.0713653564453, + "loss": 0.5791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9578665494918823, + "rewards/margins": 1.5851118564605713, + "rewards/rejected": -2.542978525161743, + "step": 3454 + }, + { + "epoch": 0.4, + "learning_rate": 1.8327285496898044e-07, + "logits/chosen": -2.029378652572632, + "logits/rejected": -2.1550521850585938, + "logps/chosen": -413.69091796875, + "logps/rejected": -263.6728515625, + "loss": 0.221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33628231287002563, + "rewards/margins": 1.9594571590423584, + "rewards/rejected": -2.2957396507263184, + "step": 3455 + }, + { + "epoch": 0.4, + "learning_rate": 1.8323773849935617e-07, + "logits/chosen": -2.713270902633667, + "logits/rejected": -2.5906293392181396, + "logps/chosen": -146.33322143554688, + "logps/rejected": -204.20155334472656, + "loss": 0.447, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1808794736862183, + "rewards/margins": 1.455505132675171, + "rewards/rejected": -2.6363847255706787, + "step": 3456 + }, + { + "epoch": 0.4, + "learning_rate": 1.8320262202973192e-07, + "logits/chosen": -2.8466720581054688, + "logits/rejected": -2.7731359004974365, + "logps/chosen": -212.717529296875, + "logps/rejected": -274.17803955078125, + "loss": 0.2778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08783572912216187, + "rewards/margins": 2.21575665473938, + "rewards/rejected": -2.3035922050476074, + "step": 3457 + }, + { + "epoch": 0.4, + "learning_rate": 1.831675055601077e-07, + "logits/chosen": -2.564357042312622, + "logits/rejected": -2.5419318675994873, + "logps/chosen": -210.09681701660156, + "logps/rejected": -274.6750183105469, + "loss": 0.183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09146415442228317, + "rewards/margins": 2.7712087631225586, + "rewards/rejected": -2.862673044204712, + "step": 3458 + }, + { + "epoch": 0.4, + "learning_rate": 1.8313238909048343e-07, + "logits/chosen": -2.017591714859009, + "logits/rejected": -2.310572385787964, + "logps/chosen": -407.8146057128906, + "logps/rejected": -288.5074768066406, + "loss": 0.5056, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7134565114974976, + "rewards/margins": 1.3767229318618774, + "rewards/rejected": -3.090179681777954, + "step": 3459 + }, + { + "epoch": 0.4, + "learning_rate": 1.8309727262085918e-07, + "logits/chosen": -2.297020435333252, + "logits/rejected": -2.546626091003418, + "logps/chosen": -337.0481262207031, + "logps/rejected": -375.7522277832031, + "loss": 0.3214, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.040137767791748, + "rewards/margins": 3.647538185119629, + "rewards/rejected": -4.687676429748535, + "step": 3460 + }, + { + "epoch": 0.4, + "learning_rate": 1.8306215615123494e-07, + "logits/chosen": -2.4567553997039795, + "logits/rejected": -2.5193185806274414, + "logps/chosen": -388.8714599609375, + "logps/rejected": -317.71478271484375, + "loss": 0.3797, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41015443205833435, + "rewards/margins": 1.6148300170898438, + "rewards/rejected": -2.024984359741211, + "step": 3461 + }, + { + "epoch": 0.4, + "learning_rate": 1.8302703968161067e-07, + "logits/chosen": -1.6482665538787842, + "logits/rejected": -1.936436653137207, + "logps/chosen": -550.5529174804688, + "logps/rejected": -507.15960693359375, + "loss": 0.2176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4688270092010498, + "rewards/margins": 3.1308486461639404, + "rewards/rejected": -3.5996756553649902, + "step": 3462 + }, + { + "epoch": 0.4, + "learning_rate": 1.8299192321198642e-07, + "logits/chosen": -2.2730414867401123, + "logits/rejected": -2.059208869934082, + "logps/chosen": -295.0240783691406, + "logps/rejected": -336.69244384765625, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3703904151916504, + "rewards/margins": 1.987565279006958, + "rewards/rejected": -3.3579559326171875, + "step": 3463 + }, + { + "epoch": 0.4, + "learning_rate": 1.8295680674236215e-07, + "logits/chosen": -2.511198043823242, + "logits/rejected": -2.215398073196411, + "logps/chosen": -312.575439453125, + "logps/rejected": -310.3520202636719, + "loss": 0.9757, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7717621326446533, + "rewards/margins": 0.09453150629997253, + "rewards/rejected": -1.8662935495376587, + "step": 3464 + }, + { + "epoch": 0.4, + "learning_rate": 1.829216902727379e-07, + "logits/chosen": -2.603837490081787, + "logits/rejected": -2.502150774002075, + "logps/chosen": -369.6578369140625, + "logps/rejected": -251.31141662597656, + "loss": 0.3932, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.426952600479126, + "rewards/margins": 2.787383794784546, + "rewards/rejected": -4.214336395263672, + "step": 3465 + }, + { + "epoch": 0.4, + "learning_rate": 1.8288657380311365e-07, + "logits/chosen": -2.603839635848999, + "logits/rejected": -2.6082568168640137, + "logps/chosen": -351.0023193359375, + "logps/rejected": -354.2617492675781, + "loss": 0.1061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0028672218322754, + "rewards/margins": 3.0604329109191895, + "rewards/rejected": -4.063300132751465, + "step": 3466 + }, + { + "epoch": 0.4, + "learning_rate": 1.8285145733348938e-07, + "logits/chosen": -2.1342954635620117, + "logits/rejected": -2.430471897125244, + "logps/chosen": -378.33319091796875, + "logps/rejected": -276.7603759765625, + "loss": 0.2126, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9912621974945068, + "rewards/margins": 2.8690247535705566, + "rewards/rejected": -4.860286712646484, + "step": 3467 + }, + { + "epoch": 0.4, + "learning_rate": 1.8281634086386514e-07, + "logits/chosen": -1.5326722860336304, + "logits/rejected": -2.006308078765869, + "logps/chosen": -354.6554260253906, + "logps/rejected": -278.567626953125, + "loss": 0.2594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6582671403884888, + "rewards/margins": 2.517352819442749, + "rewards/rejected": -3.1756200790405273, + "step": 3468 + }, + { + "epoch": 0.4, + "learning_rate": 1.8278122439424092e-07, + "logits/chosen": -2.332807779312134, + "logits/rejected": -2.2597835063934326, + "logps/chosen": -209.81797790527344, + "logps/rejected": -250.55604553222656, + "loss": 0.5898, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1496249437332153, + "rewards/margins": 0.6590939164161682, + "rewards/rejected": -1.8087188005447388, + "step": 3469 + }, + { + "epoch": 0.4, + "learning_rate": 1.8274610792461664e-07, + "logits/chosen": -2.6707262992858887, + "logits/rejected": -2.546971321105957, + "logps/chosen": -298.6444091796875, + "logps/rejected": -239.86685180664062, + "loss": 0.3111, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5064491033554077, + "rewards/margins": 1.75004243850708, + "rewards/rejected": -3.2564916610717773, + "step": 3470 + }, + { + "epoch": 0.4, + "learning_rate": 1.827109914549924e-07, + "logits/chosen": -2.439513683319092, + "logits/rejected": -2.6705820560455322, + "logps/chosen": -144.60186767578125, + "logps/rejected": -200.79039001464844, + "loss": 1.0532, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4088215827941895, + "rewards/margins": 0.23274925351142883, + "rewards/rejected": -1.641570806503296, + "step": 3471 + }, + { + "epoch": 0.4, + "learning_rate": 1.8267587498536813e-07, + "logits/chosen": -2.376894235610962, + "logits/rejected": -2.1201529502868652, + "logps/chosen": -209.98846435546875, + "logps/rejected": -301.533203125, + "loss": 0.3452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9090030193328857, + "rewards/margins": 2.1316721439361572, + "rewards/rejected": -3.040674924850464, + "step": 3472 + }, + { + "epoch": 0.4, + "learning_rate": 1.8264075851574388e-07, + "logits/chosen": -2.5615689754486084, + "logits/rejected": -2.5048391819000244, + "logps/chosen": -220.4629669189453, + "logps/rejected": -164.64895629882812, + "loss": 1.2334, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.408418893814087, + "rewards/margins": 0.21921581029891968, + "rewards/rejected": -2.6276345252990723, + "step": 3473 + }, + { + "epoch": 0.4, + "learning_rate": 1.8260564204611963e-07, + "logits/chosen": -2.6118812561035156, + "logits/rejected": -2.558915615081787, + "logps/chosen": -185.91961669921875, + "logps/rejected": -192.83470153808594, + "loss": 0.5185, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.330305814743042, + "rewards/margins": 1.5577151775360107, + "rewards/rejected": -2.8880209922790527, + "step": 3474 + }, + { + "epoch": 0.4, + "learning_rate": 1.8257052557649536e-07, + "logits/chosen": -2.395822048187256, + "logits/rejected": -2.494145393371582, + "logps/chosen": -398.21356201171875, + "logps/rejected": -349.3404541015625, + "loss": 0.4504, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5723557472229004, + "rewards/margins": 1.530517816543579, + "rewards/rejected": -2.1028735637664795, + "step": 3475 + }, + { + "epoch": 0.4, + "learning_rate": 1.8253540910687111e-07, + "logits/chosen": -1.870450496673584, + "logits/rejected": -1.9110243320465088, + "logps/chosen": -375.6282043457031, + "logps/rejected": -298.0571594238281, + "loss": 0.1528, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6606371402740479, + "rewards/margins": 3.5058467388153076, + "rewards/rejected": -4.1664838790893555, + "step": 3476 + }, + { + "epoch": 0.4, + "learning_rate": 1.8250029263724684e-07, + "logits/chosen": -2.352674961090088, + "logits/rejected": -2.3751778602600098, + "logps/chosen": -165.71890258789062, + "logps/rejected": -218.75640869140625, + "loss": 0.2231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0008878707885742, + "rewards/margins": 2.6339316368103027, + "rewards/rejected": -3.634819507598877, + "step": 3477 + }, + { + "epoch": 0.4, + "learning_rate": 1.824651761676226e-07, + "logits/chosen": -2.0832183361053467, + "logits/rejected": -1.9252712726593018, + "logps/chosen": -413.9217529296875, + "logps/rejected": -320.1031188964844, + "loss": 0.1214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9103125333786011, + "rewards/margins": 3.2259490489959717, + "rewards/rejected": -4.136261940002441, + "step": 3478 + }, + { + "epoch": 0.4, + "learning_rate": 1.8243005969799835e-07, + "logits/chosen": -2.8459832668304443, + "logits/rejected": -2.8220834732055664, + "logps/chosen": -211.68734741210938, + "logps/rejected": -254.10166931152344, + "loss": 0.1975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7540349960327148, + "rewards/margins": 2.958256483078003, + "rewards/rejected": -3.712291717529297, + "step": 3479 + }, + { + "epoch": 0.4, + "learning_rate": 1.8239494322837408e-07, + "logits/chosen": -1.5724942684173584, + "logits/rejected": -1.8622636795043945, + "logps/chosen": -523.6053466796875, + "logps/rejected": -352.3165283203125, + "loss": 0.401, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3169026374816895, + "rewards/margins": 1.8147504329681396, + "rewards/rejected": -3.131653308868408, + "step": 3480 + }, + { + "epoch": 0.4, + "learning_rate": 1.8235982675874986e-07, + "logits/chosen": -1.9034934043884277, + "logits/rejected": -1.9893099069595337, + "logps/chosen": -261.09173583984375, + "logps/rejected": -297.14569091796875, + "loss": 0.6175, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49984410405158997, + "rewards/margins": 0.47309812903404236, + "rewards/rejected": -0.9729422330856323, + "step": 3481 + }, + { + "epoch": 0.4, + "learning_rate": 1.823247102891256e-07, + "logits/chosen": -1.970341682434082, + "logits/rejected": -2.0294580459594727, + "logps/chosen": -190.25904846191406, + "logps/rejected": -267.19207763671875, + "loss": 0.4528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8863064050674438, + "rewards/margins": 2.3008151054382324, + "rewards/rejected": -3.187121868133545, + "step": 3482 + }, + { + "epoch": 0.4, + "learning_rate": 1.8228959381950134e-07, + "logits/chosen": -2.694268226623535, + "logits/rejected": -2.4665000438690186, + "logps/chosen": -203.39605712890625, + "logps/rejected": -251.84298706054688, + "loss": 0.252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9429349899291992, + "rewards/margins": 2.590081214904785, + "rewards/rejected": -3.5330164432525635, + "step": 3483 + }, + { + "epoch": 0.4, + "learning_rate": 1.822544773498771e-07, + "logits/chosen": -2.1597907543182373, + "logits/rejected": -1.9691964387893677, + "logps/chosen": -232.21527099609375, + "logps/rejected": -397.0240783691406, + "loss": 0.1349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44980236887931824, + "rewards/margins": 4.362714767456055, + "rewards/rejected": -4.812517166137695, + "step": 3484 + }, + { + "epoch": 0.4, + "learning_rate": 1.8221936088025282e-07, + "logits/chosen": -1.9797899723052979, + "logits/rejected": -2.352370023727417, + "logps/chosen": -332.0746765136719, + "logps/rejected": -179.67543029785156, + "loss": 0.3013, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12704525887966156, + "rewards/margins": 1.639173984527588, + "rewards/rejected": -1.7662192583084106, + "step": 3485 + }, + { + "epoch": 0.4, + "learning_rate": 1.8218424441062857e-07, + "logits/chosen": -2.443716049194336, + "logits/rejected": -2.4197587966918945, + "logps/chosen": -229.15196228027344, + "logps/rejected": -252.9473419189453, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15063315629959106, + "rewards/margins": 3.334245204925537, + "rewards/rejected": -3.4848780632019043, + "step": 3486 + }, + { + "epoch": 0.4, + "learning_rate": 1.8214912794100433e-07, + "logits/chosen": -2.4324214458465576, + "logits/rejected": -2.214369773864746, + "logps/chosen": -238.77476501464844, + "logps/rejected": -248.37673950195312, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.628083348274231, + "rewards/margins": 2.943591594696045, + "rewards/rejected": -3.5716750621795654, + "step": 3487 + }, + { + "epoch": 0.4, + "learning_rate": 1.8211401147138005e-07, + "logits/chosen": -2.0959572792053223, + "logits/rejected": -2.3660786151885986, + "logps/chosen": -207.5775604248047, + "logps/rejected": -224.50357055664062, + "loss": 0.3655, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6106810569763184, + "rewards/margins": 2.803530693054199, + "rewards/rejected": -4.414211750030518, + "step": 3488 + }, + { + "epoch": 0.4, + "learning_rate": 1.820788950017558e-07, + "logits/chosen": -2.344372272491455, + "logits/rejected": -2.5180323123931885, + "logps/chosen": -198.0757598876953, + "logps/rejected": -122.56574249267578, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6030714511871338, + "rewards/margins": 1.1310802698135376, + "rewards/rejected": -1.734151840209961, + "step": 3489 + }, + { + "epoch": 0.4, + "learning_rate": 1.820437785321316e-07, + "logits/chosen": -1.9408495426177979, + "logits/rejected": -2.1052656173706055, + "logps/chosen": -405.5652160644531, + "logps/rejected": -359.1542663574219, + "loss": 0.5107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8845511674880981, + "rewards/margins": 1.273641586303711, + "rewards/rejected": -2.1581926345825195, + "step": 3490 + }, + { + "epoch": 0.4, + "learning_rate": 1.820086620625073e-07, + "logits/chosen": -2.5119404792785645, + "logits/rejected": -2.6166672706604004, + "logps/chosen": -148.66281127929688, + "logps/rejected": -163.13299560546875, + "loss": 0.3269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8956683278083801, + "rewards/margins": 1.3459904193878174, + "rewards/rejected": -2.2416586875915527, + "step": 3491 + }, + { + "epoch": 0.4, + "learning_rate": 1.8197354559288307e-07, + "logits/chosen": -2.9086947441101074, + "logits/rejected": -3.031745433807373, + "logps/chosen": -171.87115478515625, + "logps/rejected": -245.1247100830078, + "loss": 0.6029, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.466628909111023, + "rewards/margins": 2.007871627807617, + "rewards/rejected": -3.4745001792907715, + "step": 3492 + }, + { + "epoch": 0.4, + "learning_rate": 1.819384291232588e-07, + "logits/chosen": -1.8246378898620605, + "logits/rejected": -2.1105778217315674, + "logps/chosen": -271.7484130859375, + "logps/rejected": -173.80657958984375, + "loss": 0.3656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39874058961868286, + "rewards/margins": 1.7939236164093018, + "rewards/rejected": -2.19266414642334, + "step": 3493 + }, + { + "epoch": 0.4, + "learning_rate": 1.8190331265363455e-07, + "logits/chosen": -2.049201250076294, + "logits/rejected": -2.406092643737793, + "logps/chosen": -271.6443176269531, + "logps/rejected": -263.3062744140625, + "loss": 0.3144, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3186275959014893, + "rewards/margins": 2.96799635887146, + "rewards/rejected": -4.286624431610107, + "step": 3494 + }, + { + "epoch": 0.4, + "learning_rate": 1.818681961840103e-07, + "logits/chosen": -2.402522325515747, + "logits/rejected": -2.4668993949890137, + "logps/chosen": -279.8423767089844, + "logps/rejected": -232.18435668945312, + "loss": 0.3168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8611333966255188, + "rewards/margins": 1.7637372016906738, + "rewards/rejected": -2.624870777130127, + "step": 3495 + }, + { + "epoch": 0.4, + "learning_rate": 1.8183307971438603e-07, + "logits/chosen": -2.6305594444274902, + "logits/rejected": -2.6761066913604736, + "logps/chosen": -231.32362365722656, + "logps/rejected": -213.86143493652344, + "loss": 0.2979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8260130882263184, + "rewards/margins": 2.67539381980896, + "rewards/rejected": -3.5014069080352783, + "step": 3496 + }, + { + "epoch": 0.4, + "learning_rate": 1.8179796324476179e-07, + "logits/chosen": -2.200721502304077, + "logits/rejected": -2.158046007156372, + "logps/chosen": -216.85586547851562, + "logps/rejected": -293.2703857421875, + "loss": 0.9941, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8142483234405518, + "rewards/margins": 2.3168251514434814, + "rewards/rejected": -4.131073951721191, + "step": 3497 + }, + { + "epoch": 0.4, + "learning_rate": 1.8176284677513754e-07, + "logits/chosen": -2.082149028778076, + "logits/rejected": -2.2478132247924805, + "logps/chosen": -247.89682006835938, + "logps/rejected": -302.05804443359375, + "loss": 0.7179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9256081581115723, + "rewards/margins": 1.146968126296997, + "rewards/rejected": -2.0725765228271484, + "step": 3498 + }, + { + "epoch": 0.4, + "learning_rate": 1.8172773030551327e-07, + "logits/chosen": -2.678234100341797, + "logits/rejected": -2.708167314529419, + "logps/chosen": -127.80073547363281, + "logps/rejected": -183.7919158935547, + "loss": 0.4109, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1131923198699951, + "rewards/margins": 1.4539320468902588, + "rewards/rejected": -2.567124605178833, + "step": 3499 + }, + { + "epoch": 0.4, + "learning_rate": 1.8169261383588902e-07, + "logits/chosen": -1.9848487377166748, + "logits/rejected": -2.277953624725342, + "logps/chosen": -356.45068359375, + "logps/rejected": -218.3432159423828, + "loss": 0.2342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6784740090370178, + "rewards/margins": 2.343069314956665, + "rewards/rejected": -1.6645952463150024, + "step": 3500 + }, + { + "epoch": 0.4, + "learning_rate": 1.8165749736626475e-07, + "logits/chosen": -2.4036598205566406, + "logits/rejected": -2.349074125289917, + "logps/chosen": -78.99433898925781, + "logps/rejected": -117.5468521118164, + "loss": 0.4731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23772768676280975, + "rewards/margins": 1.300092339515686, + "rewards/rejected": -1.5378199815750122, + "step": 3501 + }, + { + "epoch": 0.4, + "learning_rate": 1.816223808966405e-07, + "logits/chosen": -2.523761034011841, + "logits/rejected": -2.3796656131744385, + "logps/chosen": -154.64215087890625, + "logps/rejected": -308.43353271484375, + "loss": 0.3102, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0527552366256714, + "rewards/margins": 3.937802791595459, + "rewards/rejected": -4.99055814743042, + "step": 3502 + }, + { + "epoch": 0.4, + "learning_rate": 1.8158726442701628e-07, + "logits/chosen": -2.14432954788208, + "logits/rejected": -2.5922939777374268, + "logps/chosen": -213.45399475097656, + "logps/rejected": -199.88568115234375, + "loss": 0.6471, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6453548073768616, + "rewards/margins": 1.6452018022537231, + "rewards/rejected": -2.2905569076538086, + "step": 3503 + }, + { + "epoch": 0.4, + "learning_rate": 1.81552147957392e-07, + "logits/chosen": -2.829921007156372, + "logits/rejected": -2.8131496906280518, + "logps/chosen": -248.74905395507812, + "logps/rejected": -424.4129333496094, + "loss": 0.1931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9326615333557129, + "rewards/margins": 4.363028526306152, + "rewards/rejected": -5.295689582824707, + "step": 3504 + }, + { + "epoch": 0.4, + "learning_rate": 1.8151703148776776e-07, + "logits/chosen": -2.5199778079986572, + "logits/rejected": -2.311880111694336, + "logps/chosen": -204.592041015625, + "logps/rejected": -292.4320068359375, + "loss": 0.7262, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.696111798286438, + "rewards/margins": 1.2052761316299438, + "rewards/rejected": -1.9013879299163818, + "step": 3505 + }, + { + "epoch": 0.4, + "learning_rate": 1.8148191501814352e-07, + "logits/chosen": -2.123505115509033, + "logits/rejected": -2.348897933959961, + "logps/chosen": -281.0674133300781, + "logps/rejected": -161.4801025390625, + "loss": 0.7249, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.7164030075073242, + "rewards/margins": 1.2096956968307495, + "rewards/rejected": -2.9260988235473633, + "step": 3506 + }, + { + "epoch": 0.4, + "learning_rate": 1.8144679854851925e-07, + "logits/chosen": -2.4550094604492188, + "logits/rejected": -2.303421974182129, + "logps/chosen": -244.603271484375, + "logps/rejected": -155.72555541992188, + "loss": 0.4015, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1189498901367188, + "rewards/margins": 1.8337541818618774, + "rewards/rejected": -2.9527039527893066, + "step": 3507 + }, + { + "epoch": 0.4, + "learning_rate": 1.81411682078895e-07, + "logits/chosen": -2.482395648956299, + "logits/rejected": -2.378389596939087, + "logps/chosen": -163.39181518554688, + "logps/rejected": -277.9849853515625, + "loss": 0.1627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.714758574962616, + "rewards/margins": 3.134026050567627, + "rewards/rejected": -3.8487844467163086, + "step": 3508 + }, + { + "epoch": 0.4, + "learning_rate": 1.8137656560927073e-07, + "logits/chosen": -2.700683832168579, + "logits/rejected": -2.5634560585021973, + "logps/chosen": -110.57183074951172, + "logps/rejected": -256.24603271484375, + "loss": 0.318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40474867820739746, + "rewards/margins": 1.47560453414917, + "rewards/rejected": -1.8803532123565674, + "step": 3509 + }, + { + "epoch": 0.4, + "learning_rate": 1.8134144913964648e-07, + "logits/chosen": -1.9756237268447876, + "logits/rejected": -1.7822723388671875, + "logps/chosen": -287.97772216796875, + "logps/rejected": -303.6821594238281, + "loss": 0.6613, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9959744811058044, + "rewards/margins": 1.069390058517456, + "rewards/rejected": -2.0653645992279053, + "step": 3510 + }, + { + "epoch": 0.4, + "learning_rate": 1.8130633267002223e-07, + "logits/chosen": -2.5686845779418945, + "logits/rejected": -2.6720354557037354, + "logps/chosen": -190.8817138671875, + "logps/rejected": -194.4479522705078, + "loss": 0.4627, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.818061351776123, + "rewards/margins": 2.1391119956970215, + "rewards/rejected": -2.9571731090545654, + "step": 3511 + }, + { + "epoch": 0.4, + "learning_rate": 1.8127121620039796e-07, + "logits/chosen": -2.120206832885742, + "logits/rejected": -2.2599239349365234, + "logps/chosen": -401.8031005859375, + "logps/rejected": -370.94842529296875, + "loss": 0.3897, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7917519807815552, + "rewards/margins": 1.951440453529358, + "rewards/rejected": -3.743192434310913, + "step": 3512 + }, + { + "epoch": 0.4, + "learning_rate": 1.8123609973077372e-07, + "logits/chosen": -2.5054023265838623, + "logits/rejected": -2.5914812088012695, + "logps/chosen": -376.70025634765625, + "logps/rejected": -473.21234130859375, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2261849045753479, + "rewards/margins": 4.448240280151367, + "rewards/rejected": -4.67442512512207, + "step": 3513 + }, + { + "epoch": 0.41, + "learning_rate": 1.812009832611495e-07, + "logits/chosen": -2.7243382930755615, + "logits/rejected": -2.596773624420166, + "logps/chosen": -143.03634643554688, + "logps/rejected": -224.4987030029297, + "loss": 0.1811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5863246917724609, + "rewards/margins": 2.368227958679199, + "rewards/rejected": -2.95455265045166, + "step": 3514 + }, + { + "epoch": 0.41, + "learning_rate": 1.8116586679152522e-07, + "logits/chosen": -1.887990951538086, + "logits/rejected": -2.230828285217285, + "logps/chosen": -507.597412109375, + "logps/rejected": -300.8692932128906, + "loss": 0.2924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2526611089706421, + "rewards/margins": 2.6444144248962402, + "rewards/rejected": -2.8970754146575928, + "step": 3515 + }, + { + "epoch": 0.41, + "learning_rate": 1.8113075032190098e-07, + "logits/chosen": -2.5190622806549072, + "logits/rejected": -2.307690143585205, + "logps/chosen": -173.91796875, + "logps/rejected": -176.48138427734375, + "loss": 0.4206, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2469992637634277, + "rewards/margins": 1.4083750247955322, + "rewards/rejected": -2.655374526977539, + "step": 3516 + }, + { + "epoch": 0.41, + "learning_rate": 1.810956338522767e-07, + "logits/chosen": -2.700300931930542, + "logits/rejected": -2.618936538696289, + "logps/chosen": -406.0477294921875, + "logps/rejected": -453.734619140625, + "loss": 0.4081, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5736387372016907, + "rewards/margins": 2.2074806690216064, + "rewards/rejected": -2.7811195850372314, + "step": 3517 + }, + { + "epoch": 0.41, + "learning_rate": 1.8106051738265246e-07, + "logits/chosen": -2.3746142387390137, + "logits/rejected": -2.2317140102386475, + "logps/chosen": -428.3914489746094, + "logps/rejected": -292.82867431640625, + "loss": 0.1594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07102857530117035, + "rewards/margins": 2.5031888484954834, + "rewards/rejected": -2.4321603775024414, + "step": 3518 + }, + { + "epoch": 0.41, + "learning_rate": 1.810254009130282e-07, + "logits/chosen": -1.4573743343353271, + "logits/rejected": -1.9608813524246216, + "logps/chosen": -252.6129913330078, + "logps/rejected": -188.1300811767578, + "loss": 0.643, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8454022407531738, + "rewards/margins": 0.4171545207500458, + "rewards/rejected": -1.262556791305542, + "step": 3519 + }, + { + "epoch": 0.41, + "learning_rate": 1.8099028444340394e-07, + "logits/chosen": -2.199723720550537, + "logits/rejected": -2.3663904666900635, + "logps/chosen": -336.86175537109375, + "logps/rejected": -271.65972900390625, + "loss": 0.8115, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9820663928985596, + "rewards/margins": 1.0765929222106934, + "rewards/rejected": -2.058659315109253, + "step": 3520 + }, + { + "epoch": 0.41, + "learning_rate": 1.809551679737797e-07, + "logits/chosen": -1.9367910623550415, + "logits/rejected": -1.9700353145599365, + "logps/chosen": -326.466552734375, + "logps/rejected": -292.6698913574219, + "loss": 0.4637, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5342187881469727, + "rewards/margins": 1.5735015869140625, + "rewards/rejected": -3.107719898223877, + "step": 3521 + }, + { + "epoch": 0.41, + "learning_rate": 1.8092005150415542e-07, + "logits/chosen": -1.9413214921951294, + "logits/rejected": -1.9573523998260498, + "logps/chosen": -119.5049057006836, + "logps/rejected": -163.8813018798828, + "loss": 0.4909, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5919256210327148, + "rewards/margins": 1.486891508102417, + "rewards/rejected": -2.078817367553711, + "step": 3522 + }, + { + "epoch": 0.41, + "learning_rate": 1.8088493503453117e-07, + "logits/chosen": -2.285165309906006, + "logits/rejected": -2.1379175186157227, + "logps/chosen": -181.1411895751953, + "logps/rejected": -245.69195556640625, + "loss": 0.6109, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0510584115982056, + "rewards/margins": 2.0532591342926025, + "rewards/rejected": -3.1043176651000977, + "step": 3523 + }, + { + "epoch": 0.41, + "learning_rate": 1.8084981856490696e-07, + "logits/chosen": -2.103041648864746, + "logits/rejected": -2.237171173095703, + "logps/chosen": -165.42959594726562, + "logps/rejected": -203.61346435546875, + "loss": 0.7964, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3578779697418213, + "rewards/margins": 2.194406270980835, + "rewards/rejected": -3.5522842407226562, + "step": 3524 + }, + { + "epoch": 0.41, + "learning_rate": 1.8081470209528266e-07, + "logits/chosen": -2.0228636264801025, + "logits/rejected": -1.5846054553985596, + "logps/chosen": -312.841064453125, + "logps/rejected": -349.2811584472656, + "loss": 0.2323, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7898414134979248, + "rewards/margins": 2.8439278602600098, + "rewards/rejected": -4.6337690353393555, + "step": 3525 + }, + { + "epoch": 0.41, + "learning_rate": 1.8077958562565844e-07, + "logits/chosen": -2.712280750274658, + "logits/rejected": -2.7012734413146973, + "logps/chosen": -256.63177490234375, + "logps/rejected": -302.07891845703125, + "loss": 0.0414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.708240270614624, + "rewards/margins": 5.649923801422119, + "rewards/rejected": -6.358164310455322, + "step": 3526 + }, + { + "epoch": 0.41, + "learning_rate": 1.807444691560342e-07, + "logits/chosen": -2.2203619480133057, + "logits/rejected": -2.20066499710083, + "logps/chosen": -403.80230712890625, + "logps/rejected": -392.1106872558594, + "loss": 0.2354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34246379137039185, + "rewards/margins": 2.3677854537963867, + "rewards/rejected": -2.710249423980713, + "step": 3527 + }, + { + "epoch": 0.41, + "learning_rate": 1.8070935268640992e-07, + "logits/chosen": -2.3134982585906982, + "logits/rejected": -2.500840663909912, + "logps/chosen": -264.9281311035156, + "logps/rejected": -265.72589111328125, + "loss": 0.3905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2986009418964386, + "rewards/margins": 1.9365406036376953, + "rewards/rejected": -2.2351415157318115, + "step": 3528 + }, + { + "epoch": 0.41, + "learning_rate": 1.8067423621678567e-07, + "logits/chosen": -2.4133381843566895, + "logits/rejected": -2.5758910179138184, + "logps/chosen": -275.3443908691406, + "logps/rejected": -245.77410888671875, + "loss": 0.3724, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1947863101959229, + "rewards/margins": 1.691400170326233, + "rewards/rejected": -2.886186361312866, + "step": 3529 + }, + { + "epoch": 0.41, + "learning_rate": 1.806391197471614e-07, + "logits/chosen": -2.0868630409240723, + "logits/rejected": -2.2797951698303223, + "logps/chosen": -221.63064575195312, + "logps/rejected": -226.4847412109375, + "loss": 0.4857, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0258104801177979, + "rewards/margins": 1.7544220685958862, + "rewards/rejected": -2.7802326679229736, + "step": 3530 + }, + { + "epoch": 0.41, + "learning_rate": 1.8060400327753715e-07, + "logits/chosen": -2.3341100215911865, + "logits/rejected": -2.43282151222229, + "logps/chosen": -318.9610900878906, + "logps/rejected": -301.011962890625, + "loss": 0.2503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5640027523040771, + "rewards/margins": 1.9186456203460693, + "rewards/rejected": -2.4826483726501465, + "step": 3531 + }, + { + "epoch": 0.41, + "learning_rate": 1.805688868079129e-07, + "logits/chosen": -1.8431779146194458, + "logits/rejected": -2.0076045989990234, + "logps/chosen": -493.97857666015625, + "logps/rejected": -331.23101806640625, + "loss": 0.2702, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34688541293144226, + "rewards/margins": 1.861246943473816, + "rewards/rejected": -1.5143613815307617, + "step": 3532 + }, + { + "epoch": 0.41, + "learning_rate": 1.8053377033828863e-07, + "logits/chosen": -2.214817762374878, + "logits/rejected": -2.5214715003967285, + "logps/chosen": -257.65777587890625, + "logps/rejected": -188.41793823242188, + "loss": 0.4767, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0029747486114502, + "rewards/margins": 0.9863812327384949, + "rewards/rejected": -1.9893560409545898, + "step": 3533 + }, + { + "epoch": 0.41, + "learning_rate": 1.804986538686644e-07, + "logits/chosen": -1.9649537801742554, + "logits/rejected": -2.124588966369629, + "logps/chosen": -257.9832763671875, + "logps/rejected": -284.8640441894531, + "loss": 0.2343, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24949395656585693, + "rewards/margins": 2.936070442199707, + "rewards/rejected": -3.1855645179748535, + "step": 3534 + }, + { + "epoch": 0.41, + "learning_rate": 1.8046353739904017e-07, + "logits/chosen": -2.5294628143310547, + "logits/rejected": -2.6425716876983643, + "logps/chosen": -251.93618774414062, + "logps/rejected": -270.3712463378906, + "loss": 0.8728, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1558507680892944, + "rewards/margins": 0.582554042339325, + "rewards/rejected": -1.7384048700332642, + "step": 3535 + }, + { + "epoch": 0.41, + "learning_rate": 1.8042842092941587e-07, + "logits/chosen": -2.4454221725463867, + "logits/rejected": -2.4064226150512695, + "logps/chosen": -227.56640625, + "logps/rejected": -185.18589782714844, + "loss": 0.6402, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.70539391040802, + "rewards/margins": 1.533155918121338, + "rewards/rejected": -2.2385499477386475, + "step": 3536 + }, + { + "epoch": 0.41, + "learning_rate": 1.8039330445979165e-07, + "logits/chosen": -2.6352574825286865, + "logits/rejected": -2.3738059997558594, + "logps/chosen": -165.63360595703125, + "logps/rejected": -223.46954345703125, + "loss": 0.4812, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9213011264801025, + "rewards/margins": 1.3050916194915771, + "rewards/rejected": -2.2263927459716797, + "step": 3537 + }, + { + "epoch": 0.41, + "learning_rate": 1.8035818799016738e-07, + "logits/chosen": -2.0933992862701416, + "logits/rejected": -2.092843532562256, + "logps/chosen": -272.4374084472656, + "logps/rejected": -316.7210693359375, + "loss": 0.31, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22728073596954346, + "rewards/margins": 2.024268388748169, + "rewards/rejected": -2.251549005508423, + "step": 3538 + }, + { + "epoch": 0.41, + "learning_rate": 1.8032307152054313e-07, + "logits/chosen": -2.5810601711273193, + "logits/rejected": -2.5690712928771973, + "logps/chosen": -339.3632507324219, + "logps/rejected": -331.6188049316406, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2927285432815552, + "rewards/margins": 3.250652313232422, + "rewards/rejected": -3.5433812141418457, + "step": 3539 + }, + { + "epoch": 0.41, + "learning_rate": 1.8028795505091888e-07, + "logits/chosen": -2.8289883136749268, + "logits/rejected": -2.8404440879821777, + "logps/chosen": -120.58770751953125, + "logps/rejected": -166.0035400390625, + "loss": 0.2703, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0423343181610107, + "rewards/margins": 2.4345552921295166, + "rewards/rejected": -3.4768893718719482, + "step": 3540 + }, + { + "epoch": 0.41, + "learning_rate": 1.802528385812946e-07, + "logits/chosen": -2.2782645225524902, + "logits/rejected": -2.111811637878418, + "logps/chosen": -154.4683074951172, + "logps/rejected": -248.75050354003906, + "loss": 0.304, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1426589488983154, + "rewards/margins": 1.946082592010498, + "rewards/rejected": -3.0887415409088135, + "step": 3541 + }, + { + "epoch": 0.41, + "learning_rate": 1.8021772211167037e-07, + "logits/chosen": -2.1967990398406982, + "logits/rejected": -2.125589609146118, + "logps/chosen": -295.35833740234375, + "logps/rejected": -328.77874755859375, + "loss": 0.2588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04026827961206436, + "rewards/margins": 2.4368550777435303, + "rewards/rejected": -2.477123260498047, + "step": 3542 + }, + { + "epoch": 0.41, + "learning_rate": 1.8018260564204612e-07, + "logits/chosen": -2.311434745788574, + "logits/rejected": -2.553699016571045, + "logps/chosen": -363.82098388671875, + "logps/rejected": -298.4664001464844, + "loss": 0.2119, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8383569717407227, + "rewards/margins": 3.309569835662842, + "rewards/rejected": -4.1479268074035645, + "step": 3543 + }, + { + "epoch": 0.41, + "learning_rate": 1.8014748917242185e-07, + "logits/chosen": -2.0192244052886963, + "logits/rejected": -1.9948248863220215, + "logps/chosen": -274.1799011230469, + "logps/rejected": -298.7610168457031, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9002323746681213, + "rewards/margins": 2.700639486312866, + "rewards/rejected": -3.6008718013763428, + "step": 3544 + }, + { + "epoch": 0.41, + "learning_rate": 1.801123727027976e-07, + "logits/chosen": -2.353586196899414, + "logits/rejected": -2.173893451690674, + "logps/chosen": -176.4847869873047, + "logps/rejected": -287.672607421875, + "loss": 0.1887, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6908454895019531, + "rewards/margins": 2.4313650131225586, + "rewards/rejected": -3.1222102642059326, + "step": 3545 + }, + { + "epoch": 0.41, + "learning_rate": 1.8007725623317333e-07, + "logits/chosen": -2.4961133003234863, + "logits/rejected": -2.279766798019409, + "logps/chosen": -198.29776000976562, + "logps/rejected": -208.37872314453125, + "loss": 1.5051, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.704369068145752, + "rewards/margins": -0.07373902201652527, + "rewards/rejected": -2.630629777908325, + "step": 3546 + }, + { + "epoch": 0.41, + "learning_rate": 1.8004213976354908e-07, + "logits/chosen": -2.3033432960510254, + "logits/rejected": -2.2035412788391113, + "logps/chosen": -209.1586456298828, + "logps/rejected": -222.79931640625, + "loss": 0.2432, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5979228019714355, + "rewards/margins": 2.4779374599456787, + "rewards/rejected": -3.0758602619171143, + "step": 3547 + }, + { + "epoch": 0.41, + "learning_rate": 1.8000702329392486e-07, + "logits/chosen": -1.8246805667877197, + "logits/rejected": -2.1061320304870605, + "logps/chosen": -384.0386962890625, + "logps/rejected": -251.1829833984375, + "loss": 0.21, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23892173171043396, + "rewards/margins": 2.166065216064453, + "rewards/rejected": -1.9271434545516968, + "step": 3548 + }, + { + "epoch": 0.41, + "learning_rate": 1.799719068243006e-07, + "logits/chosen": -1.9113495349884033, + "logits/rejected": -2.115168571472168, + "logps/chosen": -415.95684814453125, + "logps/rejected": -389.24627685546875, + "loss": 0.498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9832552075386047, + "rewards/margins": 1.2644743919372559, + "rewards/rejected": -2.247729778289795, + "step": 3549 + }, + { + "epoch": 0.41, + "learning_rate": 1.7993679035467634e-07, + "logits/chosen": -2.629573345184326, + "logits/rejected": -2.5921761989593506, + "logps/chosen": -204.19361877441406, + "logps/rejected": -279.05047607421875, + "loss": 0.2451, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9960924386978149, + "rewards/margins": 3.0946786403656006, + "rewards/rejected": -4.090770721435547, + "step": 3550 + }, + { + "epoch": 0.41, + "learning_rate": 1.799016738850521e-07, + "logits/chosen": -1.769075870513916, + "logits/rejected": -2.3854846954345703, + "logps/chosen": -420.5357666015625, + "logps/rejected": -351.6651611328125, + "loss": 0.8519, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2478502988815308, + "rewards/margins": 1.1691920757293701, + "rewards/rejected": -2.4170427322387695, + "step": 3551 + }, + { + "epoch": 0.41, + "learning_rate": 1.7986655741542782e-07, + "logits/chosen": -2.4734764099121094, + "logits/rejected": -2.449409008026123, + "logps/chosen": -330.63897705078125, + "logps/rejected": -250.45223999023438, + "loss": 0.4897, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9630149602890015, + "rewards/margins": 1.189121127128601, + "rewards/rejected": -2.1521360874176025, + "step": 3552 + }, + { + "epoch": 0.41, + "learning_rate": 1.7983144094580358e-07, + "logits/chosen": -2.446950674057007, + "logits/rejected": -2.224323272705078, + "logps/chosen": -333.8988342285156, + "logps/rejected": -359.7405090332031, + "loss": 0.4703, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.280540943145752, + "rewards/margins": 2.198491096496582, + "rewards/rejected": -3.479032039642334, + "step": 3553 + }, + { + "epoch": 0.41, + "learning_rate": 1.797963244761793e-07, + "logits/chosen": -2.1867122650146484, + "logits/rejected": -2.252885341644287, + "logps/chosen": -223.48004150390625, + "logps/rejected": -217.77096557617188, + "loss": 0.4056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2128932923078537, + "rewards/margins": 2.2052981853485107, + "rewards/rejected": -2.418191432952881, + "step": 3554 + }, + { + "epoch": 0.41, + "learning_rate": 1.7976120800655506e-07, + "logits/chosen": -2.456129789352417, + "logits/rejected": -2.457733154296875, + "logps/chosen": -214.98324584960938, + "logps/rejected": -186.06690979003906, + "loss": 0.2008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3174951076507568, + "rewards/margins": 3.111607551574707, + "rewards/rejected": -4.429102897644043, + "step": 3555 + }, + { + "epoch": 0.41, + "learning_rate": 1.7972609153693081e-07, + "logits/chosen": -2.764863967895508, + "logits/rejected": -2.6917011737823486, + "logps/chosen": -386.17584228515625, + "logps/rejected": -383.60400390625, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3112248480319977, + "rewards/margins": 4.957281589508057, + "rewards/rejected": -5.268506050109863, + "step": 3556 + }, + { + "epoch": 0.41, + "learning_rate": 1.7969097506730654e-07, + "logits/chosen": -2.714672088623047, + "logits/rejected": -2.599500894546509, + "logps/chosen": -260.2611083984375, + "logps/rejected": -269.3373718261719, + "loss": 0.3972, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.23215913772583, + "rewards/margins": 1.7951562404632568, + "rewards/rejected": -3.027315139770508, + "step": 3557 + }, + { + "epoch": 0.41, + "learning_rate": 1.7965585859768232e-07, + "logits/chosen": -2.149289846420288, + "logits/rejected": -2.4633679389953613, + "logps/chosen": -316.8570556640625, + "logps/rejected": -228.2236328125, + "loss": 0.4959, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36767908930778503, + "rewards/margins": 2.4726901054382324, + "rewards/rejected": -2.84036922454834, + "step": 3558 + }, + { + "epoch": 0.41, + "learning_rate": 1.7962074212805808e-07, + "logits/chosen": -2.818368911743164, + "logits/rejected": -2.635471820831299, + "logps/chosen": -221.9868621826172, + "logps/rejected": -297.7509765625, + "loss": 0.3775, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7373293042182922, + "rewards/margins": 2.822486639022827, + "rewards/rejected": -3.5598158836364746, + "step": 3559 + }, + { + "epoch": 0.41, + "learning_rate": 1.795856256584338e-07, + "logits/chosen": -2.108353614807129, + "logits/rejected": -2.1933393478393555, + "logps/chosen": -320.47894287109375, + "logps/rejected": -298.2161865234375, + "loss": 0.3193, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1243304014205933, + "rewards/margins": 3.3684582710266113, + "rewards/rejected": -4.492788791656494, + "step": 3560 + }, + { + "epoch": 0.41, + "learning_rate": 1.7955050918880956e-07, + "logits/chosen": -2.614133834838867, + "logits/rejected": -2.7652084827423096, + "logps/chosen": -150.66845703125, + "logps/rejected": -228.3188934326172, + "loss": 0.3041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5031543970108032, + "rewards/margins": 2.8708949089050293, + "rewards/rejected": -3.374049186706543, + "step": 3561 + }, + { + "epoch": 0.41, + "learning_rate": 1.7951539271918528e-07, + "logits/chosen": -2.436851978302002, + "logits/rejected": -2.525447130203247, + "logps/chosen": -326.63690185546875, + "logps/rejected": -213.33135986328125, + "loss": 0.6382, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1521110534667969, + "rewards/margins": 0.7525472640991211, + "rewards/rejected": -1.904658317565918, + "step": 3562 + }, + { + "epoch": 0.41, + "learning_rate": 1.7948027624956104e-07, + "logits/chosen": -1.8970766067504883, + "logits/rejected": -1.8907779455184937, + "logps/chosen": -346.79461669921875, + "logps/rejected": -343.54608154296875, + "loss": 0.7548, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.687272548675537, + "rewards/margins": 1.5279362201690674, + "rewards/rejected": -3.2152087688446045, + "step": 3563 + }, + { + "epoch": 0.41, + "learning_rate": 1.794451597799368e-07, + "logits/chosen": -1.9507843255996704, + "logits/rejected": -2.1951427459716797, + "logps/chosen": -345.15380859375, + "logps/rejected": -257.1893310546875, + "loss": 0.2242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5535187125205994, + "rewards/margins": 2.479174852371216, + "rewards/rejected": -3.03269362449646, + "step": 3564 + }, + { + "epoch": 0.41, + "learning_rate": 1.7941004331031252e-07, + "logits/chosen": -2.3180058002471924, + "logits/rejected": -2.600421190261841, + "logps/chosen": -451.2398986816406, + "logps/rejected": -296.0256042480469, + "loss": 1.0515, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7213994264602661, + "rewards/margins": 0.6054724454879761, + "rewards/rejected": -1.3268719911575317, + "step": 3565 + }, + { + "epoch": 0.41, + "learning_rate": 1.7937492684068827e-07, + "logits/chosen": -2.2685632705688477, + "logits/rejected": -2.403949737548828, + "logps/chosen": -233.5272979736328, + "logps/rejected": -315.4733581542969, + "loss": 0.7329, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.435143232345581, + "rewards/margins": 0.9150569438934326, + "rewards/rejected": -2.3502004146575928, + "step": 3566 + }, + { + "epoch": 0.41, + "learning_rate": 1.7933981037106403e-07, + "logits/chosen": -2.2077865600585938, + "logits/rejected": -2.310357093811035, + "logps/chosen": -305.72869873046875, + "logps/rejected": -287.8505859375, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03281751275062561, + "rewards/margins": 1.6708613634109497, + "rewards/rejected": -1.6380438804626465, + "step": 3567 + }, + { + "epoch": 0.41, + "learning_rate": 1.7930469390143975e-07, + "logits/chosen": -1.8176517486572266, + "logits/rejected": -1.6966298818588257, + "logps/chosen": -229.09942626953125, + "logps/rejected": -312.673828125, + "loss": 0.5502, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0146431922912598, + "rewards/margins": 2.93537974357605, + "rewards/rejected": -4.950023174285889, + "step": 3568 + }, + { + "epoch": 0.41, + "learning_rate": 1.7926957743181553e-07, + "logits/chosen": -2.050802707672119, + "logits/rejected": -1.9268600940704346, + "logps/chosen": -167.26031494140625, + "logps/rejected": -252.658447265625, + "loss": 0.1809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9320498108863831, + "rewards/margins": 2.5401771068573, + "rewards/rejected": -3.472226619720459, + "step": 3569 + }, + { + "epoch": 0.41, + "learning_rate": 1.7923446096219124e-07, + "logits/chosen": -2.865384578704834, + "logits/rejected": -2.901416778564453, + "logps/chosen": -328.4598693847656, + "logps/rejected": -505.0321044921875, + "loss": 0.2868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6974014639854431, + "rewards/margins": 3.0226681232452393, + "rewards/rejected": -3.720069408416748, + "step": 3570 + }, + { + "epoch": 0.41, + "learning_rate": 1.7919934449256702e-07, + "logits/chosen": -2.5705137252807617, + "logits/rejected": -2.436234474182129, + "logps/chosen": -154.15692138671875, + "logps/rejected": -207.51483154296875, + "loss": 0.3576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8524191975593567, + "rewards/margins": 1.6668959856033325, + "rewards/rejected": -2.519315004348755, + "step": 3571 + }, + { + "epoch": 0.41, + "learning_rate": 1.7916422802294277e-07, + "logits/chosen": -2.31919527053833, + "logits/rejected": -2.541149616241455, + "logps/chosen": -460.62103271484375, + "logps/rejected": -335.80535888671875, + "loss": 0.2847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7887603044509888, + "rewards/margins": 2.4247148036956787, + "rewards/rejected": -3.213474988937378, + "step": 3572 + }, + { + "epoch": 0.41, + "learning_rate": 1.791291115533185e-07, + "logits/chosen": -1.868739366531372, + "logits/rejected": -1.9357609748840332, + "logps/chosen": -394.1673583984375, + "logps/rejected": -241.77777099609375, + "loss": 0.6385, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.712261438369751, + "rewards/margins": 1.156725287437439, + "rewards/rejected": -1.86898672580719, + "step": 3573 + }, + { + "epoch": 0.41, + "learning_rate": 1.7909399508369425e-07, + "logits/chosen": -1.8626716136932373, + "logits/rejected": -2.0302481651306152, + "logps/chosen": -450.0472412109375, + "logps/rejected": -379.01953125, + "loss": 0.0808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8202368021011353, + "rewards/margins": 3.251415729522705, + "rewards/rejected": -4.071652889251709, + "step": 3574 + }, + { + "epoch": 0.41, + "learning_rate": 1.7905887861406998e-07, + "logits/chosen": -2.782750368118286, + "logits/rejected": -2.746908187866211, + "logps/chosen": -250.4927215576172, + "logps/rejected": -222.77041625976562, + "loss": 0.3736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6340603232383728, + "rewards/margins": 2.6452620029449463, + "rewards/rejected": -3.2793221473693848, + "step": 3575 + }, + { + "epoch": 0.41, + "learning_rate": 1.7902376214444573e-07, + "logits/chosen": -1.9396145343780518, + "logits/rejected": -2.083091974258423, + "logps/chosen": -262.53375244140625, + "logps/rejected": -294.4822692871094, + "loss": 0.3879, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1345806121826172, + "rewards/margins": 2.787428617477417, + "rewards/rejected": -3.9220094680786133, + "step": 3576 + }, + { + "epoch": 0.41, + "learning_rate": 1.7898864567482149e-07, + "logits/chosen": -2.799881935119629, + "logits/rejected": -2.6332058906555176, + "logps/chosen": -413.8352355957031, + "logps/rejected": -478.34246826171875, + "loss": 0.4274, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9428863525390625, + "rewards/margins": 1.6728546619415283, + "rewards/rejected": -2.61574125289917, + "step": 3577 + }, + { + "epoch": 0.41, + "learning_rate": 1.789535292051972e-07, + "logits/chosen": -2.4201302528381348, + "logits/rejected": -2.6378769874572754, + "logps/chosen": -308.7447509765625, + "logps/rejected": -302.12335205078125, + "loss": 0.2172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3541167378425598, + "rewards/margins": 2.458134174346924, + "rewards/rejected": -2.812251091003418, + "step": 3578 + }, + { + "epoch": 0.41, + "learning_rate": 1.7891841273557297e-07, + "logits/chosen": -2.734663248062134, + "logits/rejected": -2.898524761199951, + "logps/chosen": -281.3639221191406, + "logps/rejected": -291.03265380859375, + "loss": 0.564, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0631513595581055, + "rewards/margins": 2.0363030433654785, + "rewards/rejected": -3.099454402923584, + "step": 3579 + }, + { + "epoch": 0.41, + "learning_rate": 1.7888329626594875e-07, + "logits/chosen": -2.952038049697876, + "logits/rejected": -2.975628137588501, + "logps/chosen": -296.509033203125, + "logps/rejected": -327.430908203125, + "loss": 0.3722, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2457325458526611, + "rewards/margins": 2.2721643447875977, + "rewards/rejected": -3.5178966522216797, + "step": 3580 + }, + { + "epoch": 0.41, + "learning_rate": 1.7884817979632445e-07, + "logits/chosen": -1.7471321821212769, + "logits/rejected": -1.6370348930358887, + "logps/chosen": -452.522216796875, + "logps/rejected": -441.3392639160156, + "loss": 0.3032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.251227855682373, + "rewards/margins": 1.640129566192627, + "rewards/rejected": -2.891357660293579, + "step": 3581 + }, + { + "epoch": 0.41, + "learning_rate": 1.7881306332670023e-07, + "logits/chosen": -2.2972805500030518, + "logits/rejected": -2.1197564601898193, + "logps/chosen": -202.37013244628906, + "logps/rejected": -265.1471862792969, + "loss": 0.3723, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265049695968628, + "rewards/margins": 2.4808316230773926, + "rewards/rejected": -3.7458815574645996, + "step": 3582 + }, + { + "epoch": 0.41, + "learning_rate": 1.7877794685707596e-07, + "logits/chosen": -1.8970487117767334, + "logits/rejected": -2.332200288772583, + "logps/chosen": -348.5583801269531, + "logps/rejected": -308.7246398925781, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6570977568626404, + "rewards/margins": 1.9405618906021118, + "rewards/rejected": -2.5976595878601074, + "step": 3583 + }, + { + "epoch": 0.41, + "learning_rate": 1.787428303874517e-07, + "logits/chosen": -2.4070260524749756, + "logits/rejected": -2.249011754989624, + "logps/chosen": -505.2513427734375, + "logps/rejected": -484.1454772949219, + "loss": 0.395, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2765377759933472, + "rewards/margins": 3.1456143856048584, + "rewards/rejected": -4.422152042388916, + "step": 3584 + }, + { + "epoch": 0.41, + "learning_rate": 1.7870771391782746e-07, + "logits/chosen": -2.2495977878570557, + "logits/rejected": -2.280442237854004, + "logps/chosen": -222.69775390625, + "logps/rejected": -194.90594482421875, + "loss": 0.4671, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0393284559249878, + "rewards/margins": 1.1291158199310303, + "rewards/rejected": -2.1684443950653076, + "step": 3585 + }, + { + "epoch": 0.41, + "learning_rate": 1.786725974482032e-07, + "logits/chosen": -2.270045280456543, + "logits/rejected": -2.6198220252990723, + "logps/chosen": -334.16326904296875, + "logps/rejected": -408.0368957519531, + "loss": 0.2626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39184045791625977, + "rewards/margins": 2.7275137901306152, + "rewards/rejected": -3.119354486465454, + "step": 3586 + }, + { + "epoch": 0.41, + "learning_rate": 1.7863748097857894e-07, + "logits/chosen": -2.182135581970215, + "logits/rejected": -2.19295334815979, + "logps/chosen": -356.12548828125, + "logps/rejected": -371.53814697265625, + "loss": 0.6661, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3346171379089355, + "rewards/margins": 0.9031849503517151, + "rewards/rejected": -2.237802028656006, + "step": 3587 + }, + { + "epoch": 0.41, + "learning_rate": 1.786023645089547e-07, + "logits/chosen": -1.9598628282546997, + "logits/rejected": -1.6002066135406494, + "logps/chosen": -564.531494140625, + "logps/rejected": -469.30767822265625, + "loss": 0.1053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9778386354446411, + "rewards/margins": 3.5046181678771973, + "rewards/rejected": -4.482457160949707, + "step": 3588 + }, + { + "epoch": 0.41, + "learning_rate": 1.7856724803933043e-07, + "logits/chosen": -2.631894588470459, + "logits/rejected": -2.7673842906951904, + "logps/chosen": -131.91429138183594, + "logps/rejected": -197.71726989746094, + "loss": 0.3907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4093049466609955, + "rewards/margins": 2.3817131519317627, + "rewards/rejected": -2.791018009185791, + "step": 3589 + }, + { + "epoch": 0.41, + "learning_rate": 1.7853213156970618e-07, + "logits/chosen": -1.6659021377563477, + "logits/rejected": -1.8716204166412354, + "logps/chosen": -304.2913818359375, + "logps/rejected": -281.9857482910156, + "loss": 0.8605, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.76190185546875, + "rewards/margins": 1.5614510774612427, + "rewards/rejected": -3.3233530521392822, + "step": 3590 + }, + { + "epoch": 0.41, + "learning_rate": 1.784970151000819e-07, + "logits/chosen": -2.4215824604034424, + "logits/rejected": -2.3347420692443848, + "logps/chosen": -172.50958251953125, + "logps/rejected": -270.5151062011719, + "loss": 0.0769, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08019132912158966, + "rewards/margins": 3.957062244415283, + "rewards/rejected": -4.0372538566589355, + "step": 3591 + }, + { + "epoch": 0.41, + "learning_rate": 1.784618986304577e-07, + "logits/chosen": -2.181149959564209, + "logits/rejected": -2.5066113471984863, + "logps/chosen": -318.3905944824219, + "logps/rejected": -287.08148193359375, + "loss": 0.1633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14246726036071777, + "rewards/margins": 2.9784436225891113, + "rewards/rejected": -3.12091064453125, + "step": 3592 + }, + { + "epoch": 0.41, + "learning_rate": 1.7842678216083344e-07, + "logits/chosen": -2.286510467529297, + "logits/rejected": -1.908852219581604, + "logps/chosen": -255.35403442382812, + "logps/rejected": -367.91033935546875, + "loss": 0.5849, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6327435970306396, + "rewards/margins": 2.3281617164611816, + "rewards/rejected": -3.960905075073242, + "step": 3593 + }, + { + "epoch": 0.41, + "learning_rate": 1.7839166569120917e-07, + "logits/chosen": -2.176199197769165, + "logits/rejected": -2.284890651702881, + "logps/chosen": -450.2523193359375, + "logps/rejected": -537.8079223632812, + "loss": 0.3558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2821146845817566, + "rewards/margins": 2.1152074337005615, + "rewards/rejected": -2.397321939468384, + "step": 3594 + }, + { + "epoch": 0.41, + "learning_rate": 1.7835654922158492e-07, + "logits/chosen": -1.8098034858703613, + "logits/rejected": -1.8243012428283691, + "logps/chosen": -321.2758483886719, + "logps/rejected": -208.9840545654297, + "loss": 0.6126, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5752813816070557, + "rewards/margins": 0.4157286584377289, + "rewards/rejected": -0.9910101890563965, + "step": 3595 + }, + { + "epoch": 0.41, + "learning_rate": 1.7832143275196068e-07, + "logits/chosen": -2.6887691020965576, + "logits/rejected": -2.7104647159576416, + "logps/chosen": -529.2926635742188, + "logps/rejected": -319.26861572265625, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7497872710227966, + "rewards/margins": 3.3187572956085205, + "rewards/rejected": -4.068544387817383, + "step": 3596 + }, + { + "epoch": 0.41, + "learning_rate": 1.782863162823364e-07, + "logits/chosen": -1.4507358074188232, + "logits/rejected": -1.527939796447754, + "logps/chosen": -477.72161865234375, + "logps/rejected": -475.42413330078125, + "loss": 0.3532, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3792815208435059, + "rewards/margins": 1.5855128765106201, + "rewards/rejected": -2.964794635772705, + "step": 3597 + }, + { + "epoch": 0.41, + "learning_rate": 1.7825119981271216e-07, + "logits/chosen": -2.1191799640655518, + "logits/rejected": -2.3768692016601562, + "logps/chosen": -457.85955810546875, + "logps/rejected": -190.6631317138672, + "loss": 0.8809, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0846266746520996, + "rewards/margins": 0.18749195337295532, + "rewards/rejected": -2.2721188068389893, + "step": 3598 + }, + { + "epoch": 0.41, + "learning_rate": 1.7821608334308789e-07, + "logits/chosen": -1.8057094812393188, + "logits/rejected": -2.009748935699463, + "logps/chosen": -466.5412292480469, + "logps/rejected": -405.824951171875, + "loss": 0.1354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01929210126399994, + "rewards/margins": 2.5913076400756836, + "rewards/rejected": -2.6105997562408447, + "step": 3599 + }, + { + "epoch": 0.42, + "learning_rate": 1.7818096687346364e-07, + "logits/chosen": -1.446902871131897, + "logits/rejected": -1.5905039310455322, + "logps/chosen": -439.5801696777344, + "logps/rejected": -405.6097717285156, + "loss": 0.6382, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38397711515426636, + "rewards/margins": 0.5190382599830627, + "rewards/rejected": -0.9030153751373291, + "step": 3600 + }, + { + "epoch": 0.42, + "learning_rate": 1.781458504038394e-07, + "logits/chosen": -2.3738913536071777, + "logits/rejected": -2.5589938163757324, + "logps/chosen": -236.63455200195312, + "logps/rejected": -298.75286865234375, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17686188220977783, + "rewards/margins": 4.822299957275391, + "rewards/rejected": -4.645437717437744, + "step": 3601 + }, + { + "epoch": 0.42, + "learning_rate": 1.7811073393421512e-07, + "logits/chosen": -2.4539787769317627, + "logits/rejected": -2.6048765182495117, + "logps/chosen": -192.59849548339844, + "logps/rejected": -245.50906372070312, + "loss": 0.3858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9632905125617981, + "rewards/margins": 4.033865928649902, + "rewards/rejected": -4.997156620025635, + "step": 3602 + }, + { + "epoch": 0.42, + "learning_rate": 1.780756174645909e-07, + "logits/chosen": -2.2022452354431152, + "logits/rejected": -2.433781147003174, + "logps/chosen": -418.2686767578125, + "logps/rejected": -259.5260009765625, + "loss": 0.1867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.565525233745575, + "rewards/margins": 2.577277898788452, + "rewards/rejected": -3.142803192138672, + "step": 3603 + }, + { + "epoch": 0.42, + "learning_rate": 1.7804050099496665e-07, + "logits/chosen": -1.6715790033340454, + "logits/rejected": -1.5464506149291992, + "logps/chosen": -323.5335693359375, + "logps/rejected": -401.313232421875, + "loss": 0.4911, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.652055263519287, + "rewards/margins": 1.3308014869689941, + "rewards/rejected": -2.9828569889068604, + "step": 3604 + }, + { + "epoch": 0.42, + "learning_rate": 1.7800538452534238e-07, + "logits/chosen": -2.5503907203674316, + "logits/rejected": -2.590904712677002, + "logps/chosen": -230.70889282226562, + "logps/rejected": -228.47657775878906, + "loss": 1.004, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3566226959228516, + "rewards/margins": 0.7307469844818115, + "rewards/rejected": -2.087369680404663, + "step": 3605 + }, + { + "epoch": 0.42, + "learning_rate": 1.7797026805571814e-07, + "logits/chosen": -2.352842092514038, + "logits/rejected": -2.451563596725464, + "logps/chosen": -331.76165771484375, + "logps/rejected": -287.55218505859375, + "loss": 0.3883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7924458980560303, + "rewards/margins": 1.7273032665252686, + "rewards/rejected": -2.519749164581299, + "step": 3606 + }, + { + "epoch": 0.42, + "learning_rate": 1.7793515158609386e-07, + "logits/chosen": -2.6014065742492676, + "logits/rejected": -2.5391626358032227, + "logps/chosen": -258.5137634277344, + "logps/rejected": -252.25076293945312, + "loss": 0.4927, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.488029956817627, + "rewards/margins": 1.524878740310669, + "rewards/rejected": -3.012908458709717, + "step": 3607 + }, + { + "epoch": 0.42, + "learning_rate": 1.7790003511646962e-07, + "logits/chosen": -1.9628427028656006, + "logits/rejected": -2.1422693729400635, + "logps/chosen": -317.9272155761719, + "logps/rejected": -265.4373474121094, + "loss": 0.653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8377781510353088, + "rewards/margins": 0.4428597688674927, + "rewards/rejected": -1.2806379795074463, + "step": 3608 + }, + { + "epoch": 0.42, + "learning_rate": 1.7786491864684537e-07, + "logits/chosen": -2.4203014373779297, + "logits/rejected": -2.5579824447631836, + "logps/chosen": -416.33782958984375, + "logps/rejected": -366.5064697265625, + "loss": 0.4015, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1575473546981812, + "rewards/margins": 2.9823765754699707, + "rewards/rejected": -4.139924049377441, + "step": 3609 + }, + { + "epoch": 0.42, + "learning_rate": 1.778298021772211e-07, + "logits/chosen": -2.2808144092559814, + "logits/rejected": -2.0310559272766113, + "logps/chosen": -195.50535583496094, + "logps/rejected": -275.6591491699219, + "loss": 0.5324, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5728709697723389, + "rewards/margins": 1.5878257751464844, + "rewards/rejected": -2.160696506500244, + "step": 3610 + }, + { + "epoch": 0.42, + "learning_rate": 1.7779468570759685e-07, + "logits/chosen": -2.7866275310516357, + "logits/rejected": -2.561447858810425, + "logps/chosen": -294.4854431152344, + "logps/rejected": -358.77423095703125, + "loss": 0.3502, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0771172046661377, + "rewards/margins": 2.7362608909606934, + "rewards/rejected": -3.81337833404541, + "step": 3611 + }, + { + "epoch": 0.42, + "learning_rate": 1.777595692379726e-07, + "logits/chosen": -2.1296732425689697, + "logits/rejected": -2.210264205932617, + "logps/chosen": -277.14697265625, + "logps/rejected": -249.24362182617188, + "loss": 0.1592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06232161819934845, + "rewards/margins": 2.9936859607696533, + "rewards/rejected": -3.0560073852539062, + "step": 3612 + }, + { + "epoch": 0.42, + "learning_rate": 1.7772445276834833e-07, + "logits/chosen": -2.0902957916259766, + "logits/rejected": -1.7844884395599365, + "logps/chosen": -312.26104736328125, + "logps/rejected": -412.0323181152344, + "loss": 0.4301, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2339836359024048, + "rewards/margins": 2.8982884883880615, + "rewards/rejected": -4.132272243499756, + "step": 3613 + }, + { + "epoch": 0.42, + "learning_rate": 1.7768933629872411e-07, + "logits/chosen": -2.063283681869507, + "logits/rejected": -2.156099319458008, + "logps/chosen": -229.53759765625, + "logps/rejected": -253.26486206054688, + "loss": 0.4591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45934736728668213, + "rewards/margins": 2.5165657997131348, + "rewards/rejected": -2.9759130477905273, + "step": 3614 + }, + { + "epoch": 0.42, + "learning_rate": 1.7765421982909981e-07, + "logits/chosen": -2.8633880615234375, + "logits/rejected": -2.853503704071045, + "logps/chosen": -392.9278564453125, + "logps/rejected": -306.0523681640625, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24570497870445251, + "rewards/margins": 5.262941837310791, + "rewards/rejected": -5.5086469650268555, + "step": 3615 + }, + { + "epoch": 0.42, + "learning_rate": 1.776191033594756e-07, + "logits/chosen": -2.711470603942871, + "logits/rejected": -2.630847215652466, + "logps/chosen": -167.6109619140625, + "logps/rejected": -204.21937561035156, + "loss": 0.592, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6992620825767517, + "rewards/margins": 1.3853546380996704, + "rewards/rejected": -2.0846166610717773, + "step": 3616 + }, + { + "epoch": 0.42, + "learning_rate": 1.7758398688985135e-07, + "logits/chosen": -2.368584632873535, + "logits/rejected": -2.232464075088501, + "logps/chosen": -281.652099609375, + "logps/rejected": -314.90228271484375, + "loss": 0.309, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1038343906402588, + "rewards/margins": 2.049318313598633, + "rewards/rejected": -3.1531527042388916, + "step": 3617 + }, + { + "epoch": 0.42, + "learning_rate": 1.7754887042022708e-07, + "logits/chosen": -2.0270800590515137, + "logits/rejected": -2.1185011863708496, + "logps/chosen": -447.4028625488281, + "logps/rejected": -306.7783203125, + "loss": 0.4632, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2447025775909424, + "rewards/margins": 1.7608966827392578, + "rewards/rejected": -3.0055994987487793, + "step": 3618 + }, + { + "epoch": 0.42, + "learning_rate": 1.7751375395060283e-07, + "logits/chosen": -2.2253706455230713, + "logits/rejected": -2.2696311473846436, + "logps/chosen": -169.2085418701172, + "logps/rejected": -184.03564453125, + "loss": 0.2461, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.215325951576233, + "rewards/margins": 2.728586435317993, + "rewards/rejected": -3.9439122676849365, + "step": 3619 + }, + { + "epoch": 0.42, + "learning_rate": 1.7747863748097856e-07, + "logits/chosen": -1.9085273742675781, + "logits/rejected": -2.283550977706909, + "logps/chosen": -299.8195495605469, + "logps/rejected": -280.05804443359375, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49576523900032043, + "rewards/margins": 4.345239162445068, + "rewards/rejected": -4.841004371643066, + "step": 3620 + }, + { + "epoch": 0.42, + "learning_rate": 1.774435210113543e-07, + "logits/chosen": -2.340235948562622, + "logits/rejected": -2.003641366958618, + "logps/chosen": -156.5685577392578, + "logps/rejected": -281.428466796875, + "loss": 0.43, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43985018134117126, + "rewards/margins": 1.3698128461837769, + "rewards/rejected": -1.8096630573272705, + "step": 3621 + }, + { + "epoch": 0.42, + "learning_rate": 1.7740840454173007e-07, + "logits/chosen": -2.3808701038360596, + "logits/rejected": -2.2840073108673096, + "logps/chosen": -144.3193817138672, + "logps/rejected": -245.79112243652344, + "loss": 0.3205, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0688130855560303, + "rewards/margins": 2.2908029556274414, + "rewards/rejected": -3.359616279602051, + "step": 3622 + }, + { + "epoch": 0.42, + "learning_rate": 1.773732880721058e-07, + "logits/chosen": -2.2718207836151123, + "logits/rejected": -2.365293025970459, + "logps/chosen": -171.39073181152344, + "logps/rejected": -200.81015014648438, + "loss": 0.9537, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8643345832824707, + "rewards/margins": 0.3643310070037842, + "rewards/rejected": -2.228665590286255, + "step": 3623 + }, + { + "epoch": 0.42, + "learning_rate": 1.7733817160248155e-07, + "logits/chosen": -2.117051839828491, + "logits/rejected": -1.9574432373046875, + "logps/chosen": -172.67039489746094, + "logps/rejected": -357.60723876953125, + "loss": 0.2766, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7794634699821472, + "rewards/margins": 3.9008121490478516, + "rewards/rejected": -4.6802754402160645, + "step": 3624 + }, + { + "epoch": 0.42, + "learning_rate": 1.7730305513285733e-07, + "logits/chosen": -2.6771602630615234, + "logits/rejected": -2.6843767166137695, + "logps/chosen": -225.93588256835938, + "logps/rejected": -303.90771484375, + "loss": 0.3605, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8731434345245361, + "rewards/margins": 2.823723793029785, + "rewards/rejected": -4.696866989135742, + "step": 3625 + }, + { + "epoch": 0.42, + "learning_rate": 1.7726793866323305e-07, + "logits/chosen": -2.6482620239257812, + "logits/rejected": -2.690706729888916, + "logps/chosen": -325.26971435546875, + "logps/rejected": -272.7709045410156, + "loss": 0.4344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1753145456314087, + "rewards/margins": 1.3722548484802246, + "rewards/rejected": -2.547569513320923, + "step": 3626 + }, + { + "epoch": 0.42, + "learning_rate": 1.772328221936088e-07, + "logits/chosen": -2.1661362648010254, + "logits/rejected": -2.45961332321167, + "logps/chosen": -301.2500915527344, + "logps/rejected": -213.01846313476562, + "loss": 0.5508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.025341510772705, + "rewards/margins": 1.118487000465393, + "rewards/rejected": -2.1438286304473877, + "step": 3627 + }, + { + "epoch": 0.42, + "learning_rate": 1.7719770572398454e-07, + "logits/chosen": -2.10052227973938, + "logits/rejected": -2.3743131160736084, + "logps/chosen": -157.0155792236328, + "logps/rejected": -165.58270263671875, + "loss": 0.8154, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2215276956558228, + "rewards/margins": 1.129809856414795, + "rewards/rejected": -2.3513379096984863, + "step": 3628 + }, + { + "epoch": 0.42, + "learning_rate": 1.771625892543603e-07, + "logits/chosen": -2.305598497390747, + "logits/rejected": -2.595730781555176, + "logps/chosen": -273.3182678222656, + "logps/rejected": -229.67581176757812, + "loss": 0.4288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6132289171218872, + "rewards/margins": 1.4624143838882446, + "rewards/rejected": -2.075643301010132, + "step": 3629 + }, + { + "epoch": 0.42, + "learning_rate": 1.7712747278473604e-07, + "logits/chosen": -1.879894495010376, + "logits/rejected": -1.8363778591156006, + "logps/chosen": -322.9818115234375, + "logps/rejected": -303.95751953125, + "loss": 0.4502, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.338435560464859, + "rewards/margins": 1.753393530845642, + "rewards/rejected": -2.0918290615081787, + "step": 3630 + }, + { + "epoch": 0.42, + "learning_rate": 1.7709235631511177e-07, + "logits/chosen": -2.6815128326416016, + "logits/rejected": -2.499873161315918, + "logps/chosen": -314.1072082519531, + "logps/rejected": -303.625732421875, + "loss": 0.2525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7809873223304749, + "rewards/margins": 3.3359107971191406, + "rewards/rejected": -4.116898059844971, + "step": 3631 + }, + { + "epoch": 0.42, + "learning_rate": 1.7705723984548752e-07, + "logits/chosen": -1.607651948928833, + "logits/rejected": -1.904699683189392, + "logps/chosen": -549.3886108398438, + "logps/rejected": -331.3436279296875, + "loss": 0.3012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12083860486745834, + "rewards/margins": 1.8058278560638428, + "rewards/rejected": -1.6849894523620605, + "step": 3632 + }, + { + "epoch": 0.42, + "learning_rate": 1.7702212337586328e-07, + "logits/chosen": -2.1592490673065186, + "logits/rejected": -2.0161802768707275, + "logps/chosen": -259.96337890625, + "logps/rejected": -218.8419952392578, + "loss": 0.7075, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.430704116821289, + "rewards/margins": 0.44341781735420227, + "rewards/rejected": -1.874121904373169, + "step": 3633 + }, + { + "epoch": 0.42, + "learning_rate": 1.76987006906239e-07, + "logits/chosen": -2.4325833320617676, + "logits/rejected": -2.3415474891662598, + "logps/chosen": -205.13558959960938, + "logps/rejected": -218.85354614257812, + "loss": 1.5945, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1010026931762695, + "rewards/margins": -0.0357949435710907, + "rewards/rejected": -2.0652077198028564, + "step": 3634 + }, + { + "epoch": 0.42, + "learning_rate": 1.7695189043661476e-07, + "logits/chosen": -1.8448482751846313, + "logits/rejected": -2.0191938877105713, + "logps/chosen": -484.2198486328125, + "logps/rejected": -379.8644714355469, + "loss": 0.8126, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6234964728355408, + "rewards/margins": 1.0089694261550903, + "rewards/rejected": -1.6324660778045654, + "step": 3635 + }, + { + "epoch": 0.42, + "learning_rate": 1.769167739669905e-07, + "logits/chosen": -2.9228975772857666, + "logits/rejected": -2.814814567565918, + "logps/chosen": -174.52908325195312, + "logps/rejected": -246.65676879882812, + "loss": 0.2937, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2740232944488525, + "rewards/margins": 2.8999409675598145, + "rewards/rejected": -4.173964023590088, + "step": 3636 + }, + { + "epoch": 0.42, + "learning_rate": 1.7688165749736627e-07, + "logits/chosen": -2.0411810874938965, + "logits/rejected": -1.9190821647644043, + "logps/chosen": -269.06903076171875, + "logps/rejected": -374.7118225097656, + "loss": 0.4751, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2801930904388428, + "rewards/margins": 1.6159504652023315, + "rewards/rejected": -2.896143674850464, + "step": 3637 + }, + { + "epoch": 0.42, + "learning_rate": 1.7684654102774202e-07, + "logits/chosen": -2.0515635013580322, + "logits/rejected": -2.1799511909484863, + "logps/chosen": -336.0790710449219, + "logps/rejected": -285.1427001953125, + "loss": 0.2764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9653873443603516, + "rewards/margins": 2.7946248054504395, + "rewards/rejected": -3.760012149810791, + "step": 3638 + }, + { + "epoch": 0.42, + "learning_rate": 1.7681142455811775e-07, + "logits/chosen": -2.198565721511841, + "logits/rejected": -2.649580717086792, + "logps/chosen": -502.986328125, + "logps/rejected": -176.0306854248047, + "loss": 0.3152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34555351734161377, + "rewards/margins": 2.195173978805542, + "rewards/rejected": -2.5407276153564453, + "step": 3639 + }, + { + "epoch": 0.42, + "learning_rate": 1.767763080884935e-07, + "logits/chosen": -2.3618364334106445, + "logits/rejected": -2.3091769218444824, + "logps/chosen": -322.7622375488281, + "logps/rejected": -285.536865234375, + "loss": 0.3676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.989288330078125, + "rewards/margins": 1.3789080381393433, + "rewards/rejected": -2.368196487426758, + "step": 3640 + }, + { + "epoch": 0.42, + "learning_rate": 1.7674119161886926e-07, + "logits/chosen": -2.4038214683532715, + "logits/rejected": -2.43312668800354, + "logps/chosen": -265.4901428222656, + "logps/rejected": -254.34625244140625, + "loss": 0.1862, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1092675924301147, + "rewards/margins": 2.4113340377807617, + "rewards/rejected": -3.520601511001587, + "step": 3641 + }, + { + "epoch": 0.42, + "learning_rate": 1.7670607514924498e-07, + "logits/chosen": -2.257789373397827, + "logits/rejected": -2.1314404010772705, + "logps/chosen": -359.7840270996094, + "logps/rejected": -314.2688293457031, + "loss": 0.1612, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.122269868850708, + "rewards/margins": 2.2025296688079834, + "rewards/rejected": -3.3247992992401123, + "step": 3642 + }, + { + "epoch": 0.42, + "learning_rate": 1.7667095867962074e-07, + "logits/chosen": -2.3032779693603516, + "logits/rejected": -2.3171448707580566, + "logps/chosen": -290.20074462890625, + "logps/rejected": -262.654052734375, + "loss": 0.6059, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2939019203186035, + "rewards/margins": 0.4655379354953766, + "rewards/rejected": -1.7594399452209473, + "step": 3643 + }, + { + "epoch": 0.42, + "learning_rate": 1.7663584220999646e-07, + "logits/chosen": -1.9134513139724731, + "logits/rejected": -2.1486592292785645, + "logps/chosen": -163.4988250732422, + "logps/rejected": -131.4730224609375, + "loss": 1.0716, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1255030632019043, + "rewards/margins": 1.009188175201416, + "rewards/rejected": -3.1346912384033203, + "step": 3644 + }, + { + "epoch": 0.42, + "learning_rate": 1.7660072574037222e-07, + "logits/chosen": -2.5539777278900146, + "logits/rejected": -2.548720121383667, + "logps/chosen": -466.5169982910156, + "logps/rejected": -291.71710205078125, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8340349197387695, + "rewards/margins": 1.5559927225112915, + "rewards/rejected": -2.3900277614593506, + "step": 3645 + }, + { + "epoch": 0.42, + "learning_rate": 1.7656560927074797e-07, + "logits/chosen": -2.5896530151367188, + "logits/rejected": -2.549361228942871, + "logps/chosen": -350.7493896484375, + "logps/rejected": -269.4369201660156, + "loss": 0.3093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8099797368049622, + "rewards/margins": 1.8300485610961914, + "rewards/rejected": -2.640028238296509, + "step": 3646 + }, + { + "epoch": 0.42, + "learning_rate": 1.765304928011237e-07, + "logits/chosen": -2.2544307708740234, + "logits/rejected": -2.2650768756866455, + "logps/chosen": -441.5562438964844, + "logps/rejected": -378.6304016113281, + "loss": 0.8094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8754667043685913, + "rewards/margins": 1.732333779335022, + "rewards/rejected": -2.6078007221221924, + "step": 3647 + }, + { + "epoch": 0.42, + "learning_rate": 1.7649537633149948e-07, + "logits/chosen": -2.6592600345611572, + "logits/rejected": -2.542020082473755, + "logps/chosen": -179.9562530517578, + "logps/rejected": -285.7052001953125, + "loss": 0.2059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7976799607276917, + "rewards/margins": 3.8978776931762695, + "rewards/rejected": -4.695557594299316, + "step": 3648 + }, + { + "epoch": 0.42, + "learning_rate": 1.7646025986187523e-07, + "logits/chosen": -1.9725643396377563, + "logits/rejected": -1.9499183893203735, + "logps/chosen": -308.1611022949219, + "logps/rejected": -299.430419921875, + "loss": 0.3926, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.069166898727417, + "rewards/margins": 2.6431069374084473, + "rewards/rejected": -3.7122738361358643, + "step": 3649 + }, + { + "epoch": 0.42, + "learning_rate": 1.7642514339225096e-07, + "logits/chosen": -2.067564010620117, + "logits/rejected": -2.147291660308838, + "logps/chosen": -292.1568908691406, + "logps/rejected": -272.76409912109375, + "loss": 0.9473, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5166399478912354, + "rewards/margins": 1.2977895736694336, + "rewards/rejected": -2.814429521560669, + "step": 3650 + }, + { + "epoch": 0.42, + "learning_rate": 1.7639002692262672e-07, + "logits/chosen": -2.303286075592041, + "logits/rejected": -2.205859661102295, + "logps/chosen": -386.7969970703125, + "logps/rejected": -356.91748046875, + "loss": 0.5513, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.096520185470581, + "rewards/margins": 2.55541729927063, + "rewards/rejected": -3.651937484741211, + "step": 3651 + }, + { + "epoch": 0.42, + "learning_rate": 1.7635491045300244e-07, + "logits/chosen": -2.12306809425354, + "logits/rejected": -2.288560628890991, + "logps/chosen": -321.3630065917969, + "logps/rejected": -263.89599609375, + "loss": 0.8542, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5197497010231018, + "rewards/margins": 0.7525572180747986, + "rewards/rejected": -1.2723069190979004, + "step": 3652 + }, + { + "epoch": 0.42, + "learning_rate": 1.763197939833782e-07, + "logits/chosen": -2.3758232593536377, + "logits/rejected": -2.331402540206909, + "logps/chosen": -198.66868591308594, + "logps/rejected": -180.34881591796875, + "loss": 0.5681, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1114444732666016, + "rewards/margins": 1.578599214553833, + "rewards/rejected": -2.6900436878204346, + "step": 3653 + }, + { + "epoch": 0.42, + "learning_rate": 1.7628467751375395e-07, + "logits/chosen": -2.3200204372406006, + "logits/rejected": -2.3625197410583496, + "logps/chosen": -358.385498046875, + "logps/rejected": -224.7896270751953, + "loss": 0.4306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6598082780838013, + "rewards/margins": 1.0426180362701416, + "rewards/rejected": -1.7024264335632324, + "step": 3654 + }, + { + "epoch": 0.42, + "learning_rate": 1.7624956104412968e-07, + "logits/chosen": -1.9080601930618286, + "logits/rejected": -2.084385395050049, + "logps/chosen": -308.4860534667969, + "logps/rejected": -314.7080078125, + "loss": 1.0919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8213934302330017, + "rewards/margins": 0.5329241752624512, + "rewards/rejected": -1.3543176651000977, + "step": 3655 + }, + { + "epoch": 0.42, + "learning_rate": 1.7621444457450543e-07, + "logits/chosen": -1.7748441696166992, + "logits/rejected": -2.041292190551758, + "logps/chosen": -438.07135009765625, + "logps/rejected": -324.539306640625, + "loss": 1.2997, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.195584774017334, + "rewards/margins": -0.14765536785125732, + "rewards/rejected": -2.047929286956787, + "step": 3656 + }, + { + "epoch": 0.42, + "learning_rate": 1.7617932810488119e-07, + "logits/chosen": -2.3356521129608154, + "logits/rejected": -2.313835382461548, + "logps/chosen": -200.1031494140625, + "logps/rejected": -276.15325927734375, + "loss": 0.7036, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9775222539901733, + "rewards/margins": 1.3574917316436768, + "rewards/rejected": -2.3350138664245605, + "step": 3657 + }, + { + "epoch": 0.42, + "learning_rate": 1.761442116352569e-07, + "logits/chosen": -2.033510446548462, + "logits/rejected": -1.7972311973571777, + "logps/chosen": -208.91116333007812, + "logps/rejected": -294.4700927734375, + "loss": 0.4461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9395633339881897, + "rewards/margins": 4.037163734436035, + "rewards/rejected": -4.976727485656738, + "step": 3658 + }, + { + "epoch": 0.42, + "learning_rate": 1.761090951656327e-07, + "logits/chosen": -2.1680476665496826, + "logits/rejected": -2.2702083587646484, + "logps/chosen": -389.48858642578125, + "logps/rejected": -354.3136291503906, + "loss": 0.6484, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7240912318229675, + "rewards/margins": 1.5531861782073975, + "rewards/rejected": -2.2772774696350098, + "step": 3659 + }, + { + "epoch": 0.42, + "learning_rate": 1.760739786960084e-07, + "logits/chosen": -2.7910754680633545, + "logits/rejected": -2.4383137226104736, + "logps/chosen": -265.5043640136719, + "logps/rejected": -382.41302490234375, + "loss": 0.4083, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3481374979019165, + "rewards/margins": 2.4834067821502686, + "rewards/rejected": -3.8315441608428955, + "step": 3660 + }, + { + "epoch": 0.42, + "learning_rate": 1.7603886222638417e-07, + "logits/chosen": -2.870151996612549, + "logits/rejected": -2.6460392475128174, + "logps/chosen": -184.67520141601562, + "logps/rejected": -211.6183319091797, + "loss": 0.9717, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5088298320770264, + "rewards/margins": 0.8680024743080139, + "rewards/rejected": -2.3768324851989746, + "step": 3661 + }, + { + "epoch": 0.42, + "learning_rate": 1.7600374575675993e-07, + "logits/chosen": -2.8305351734161377, + "logits/rejected": -3.014303207397461, + "logps/chosen": -197.25132751464844, + "logps/rejected": -183.88424682617188, + "loss": 0.1944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6275904178619385, + "rewards/margins": 2.4364373683929443, + "rewards/rejected": -3.064027786254883, + "step": 3662 + }, + { + "epoch": 0.42, + "learning_rate": 1.7596862928713566e-07, + "logits/chosen": -2.5611345767974854, + "logits/rejected": -2.1056673526763916, + "logps/chosen": -260.60662841796875, + "logps/rejected": -318.6593017578125, + "loss": 0.3304, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8329028487205505, + "rewards/margins": 1.4056962728500366, + "rewards/rejected": -2.2385990619659424, + "step": 3663 + }, + { + "epoch": 0.42, + "learning_rate": 1.759335128175114e-07, + "logits/chosen": -1.7581077814102173, + "logits/rejected": -1.672938346862793, + "logps/chosen": -363.2204284667969, + "logps/rejected": -448.3829345703125, + "loss": 0.3909, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.534072995185852, + "rewards/margins": 2.1040761470794678, + "rewards/rejected": -2.6381490230560303, + "step": 3664 + }, + { + "epoch": 0.42, + "learning_rate": 1.7589839634788714e-07, + "logits/chosen": -2.174468755722046, + "logits/rejected": -2.126870632171631, + "logps/chosen": -214.11679077148438, + "logps/rejected": -238.8479766845703, + "loss": 0.5894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9084979891777039, + "rewards/margins": 1.9382113218307495, + "rewards/rejected": -2.8467092514038086, + "step": 3665 + }, + { + "epoch": 0.42, + "learning_rate": 1.758632798782629e-07, + "logits/chosen": -2.2827506065368652, + "logits/rejected": -2.1374759674072266, + "logps/chosen": -441.53466796875, + "logps/rejected": -376.26568603515625, + "loss": 0.185, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8497387170791626, + "rewards/margins": 2.8521041870117188, + "rewards/rejected": -3.701842784881592, + "step": 3666 + }, + { + "epoch": 0.42, + "learning_rate": 1.7582816340863864e-07, + "logits/chosen": -2.711533308029175, + "logits/rejected": -2.959078788757324, + "logps/chosen": -250.7953338623047, + "logps/rejected": -252.70916748046875, + "loss": 0.4824, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3937745094299316, + "rewards/margins": 2.3220062255859375, + "rewards/rejected": -3.715780735015869, + "step": 3667 + }, + { + "epoch": 0.42, + "learning_rate": 1.7579304693901437e-07, + "logits/chosen": -2.3006134033203125, + "logits/rejected": -2.13429594039917, + "logps/chosen": -263.20965576171875, + "logps/rejected": -150.86087036132812, + "loss": 0.5432, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8675420880317688, + "rewards/margins": 1.1286720037460327, + "rewards/rejected": -1.9962141513824463, + "step": 3668 + }, + { + "epoch": 0.42, + "learning_rate": 1.7575793046939013e-07, + "logits/chosen": -2.56234073638916, + "logits/rejected": -2.3443238735198975, + "logps/chosen": -420.63861083984375, + "logps/rejected": -456.5143127441406, + "loss": 0.2955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8189125061035156, + "rewards/margins": 2.734945774078369, + "rewards/rejected": -3.553858518600464, + "step": 3669 + }, + { + "epoch": 0.42, + "learning_rate": 1.757228139997659e-07, + "logits/chosen": -2.109347343444824, + "logits/rejected": -2.0885632038116455, + "logps/chosen": -488.25604248046875, + "logps/rejected": -465.506103515625, + "loss": 0.7693, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.325269103050232, + "rewards/margins": 0.566271960735321, + "rewards/rejected": -1.8915410041809082, + "step": 3670 + }, + { + "epoch": 0.42, + "learning_rate": 1.7568769753014163e-07, + "logits/chosen": -2.222006320953369, + "logits/rejected": -2.3379416465759277, + "logps/chosen": -173.365478515625, + "logps/rejected": -187.4268798828125, + "loss": 0.2491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7394006848335266, + "rewards/margins": 2.535966157913208, + "rewards/rejected": -3.27536678314209, + "step": 3671 + }, + { + "epoch": 0.42, + "learning_rate": 1.756525810605174e-07, + "logits/chosen": -2.426300525665283, + "logits/rejected": -2.5731425285339355, + "logps/chosen": -345.73150634765625, + "logps/rejected": -243.4466552734375, + "loss": 0.2991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8564110398292542, + "rewards/margins": 1.3411659002304077, + "rewards/rejected": -2.1975767612457275, + "step": 3672 + }, + { + "epoch": 0.42, + "learning_rate": 1.7561746459089311e-07, + "logits/chosen": -1.7410686016082764, + "logits/rejected": -2.0366008281707764, + "logps/chosen": -367.3852233886719, + "logps/rejected": -268.8139953613281, + "loss": 0.4289, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9114644527435303, + "rewards/margins": 3.0797133445739746, + "rewards/rejected": -3.991177558898926, + "step": 3673 + }, + { + "epoch": 0.42, + "learning_rate": 1.7558234812126887e-07, + "logits/chosen": -2.001483917236328, + "logits/rejected": -2.2515218257904053, + "logps/chosen": -320.1861572265625, + "logps/rejected": -283.8935241699219, + "loss": 0.1676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6220220327377319, + "rewards/margins": 3.027815818786621, + "rewards/rejected": -3.6498379707336426, + "step": 3674 + }, + { + "epoch": 0.42, + "learning_rate": 1.7554723165164462e-07, + "logits/chosen": -2.1662068367004395, + "logits/rejected": -2.2477784156799316, + "logps/chosen": -287.53607177734375, + "logps/rejected": -385.84619140625, + "loss": 0.2544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6734771728515625, + "rewards/margins": 1.9922242164611816, + "rewards/rejected": -2.665701389312744, + "step": 3675 + }, + { + "epoch": 0.42, + "learning_rate": 1.7551211518202035e-07, + "logits/chosen": -3.0342464447021484, + "logits/rejected": -3.027993679046631, + "logps/chosen": -412.24432373046875, + "logps/rejected": -299.5213317871094, + "loss": 0.3794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.79170161485672, + "rewards/margins": 1.6544764041900635, + "rewards/rejected": -2.4461779594421387, + "step": 3676 + }, + { + "epoch": 0.42, + "learning_rate": 1.754769987123961e-07, + "logits/chosen": -2.184462070465088, + "logits/rejected": -2.5559792518615723, + "logps/chosen": -458.7841796875, + "logps/rejected": -382.20672607421875, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2079081535339355, + "rewards/margins": 2.489833354949951, + "rewards/rejected": -3.6977415084838867, + "step": 3677 + }, + { + "epoch": 0.42, + "learning_rate": 1.7544188224277186e-07, + "logits/chosen": -2.3489739894866943, + "logits/rejected": -2.321608304977417, + "logps/chosen": -255.1737518310547, + "logps/rejected": -229.43338012695312, + "loss": 1.0524, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6503820419311523, + "rewards/margins": 0.6398910880088806, + "rewards/rejected": -2.2902729511260986, + "step": 3678 + }, + { + "epoch": 0.42, + "learning_rate": 1.7540676577314758e-07, + "logits/chosen": -2.368597984313965, + "logits/rejected": -2.248931884765625, + "logps/chosen": -167.56959533691406, + "logps/rejected": -218.646240234375, + "loss": 0.1764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8443945646286011, + "rewards/margins": 2.9320006370544434, + "rewards/rejected": -3.776395082473755, + "step": 3679 + }, + { + "epoch": 0.42, + "learning_rate": 1.7537164930352334e-07, + "logits/chosen": -2.1558585166931152, + "logits/rejected": -2.2635886669158936, + "logps/chosen": -360.9857177734375, + "logps/rejected": -284.484130859375, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34716567397117615, + "rewards/margins": 3.685462236404419, + "rewards/rejected": -4.032628059387207, + "step": 3680 + }, + { + "epoch": 0.42, + "learning_rate": 1.7533653283389907e-07, + "logits/chosen": -2.417902708053589, + "logits/rejected": -2.532045364379883, + "logps/chosen": -380.3587646484375, + "logps/rejected": -292.94305419921875, + "loss": 0.1178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5027264356613159, + "rewards/margins": 3.530672073364258, + "rewards/rejected": -4.033398628234863, + "step": 3681 + }, + { + "epoch": 0.42, + "learning_rate": 1.7530141636427485e-07, + "logits/chosen": -2.3182995319366455, + "logits/rejected": -2.289841413497925, + "logps/chosen": -263.27130126953125, + "logps/rejected": -304.5800476074219, + "loss": 0.7339, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.897684931755066, + "rewards/margins": 1.296128749847412, + "rewards/rejected": -3.1938135623931885, + "step": 3682 + }, + { + "epoch": 0.42, + "learning_rate": 1.752662998946506e-07, + "logits/chosen": -1.9276607036590576, + "logits/rejected": -1.990384578704834, + "logps/chosen": -445.75885009765625, + "logps/rejected": -398.8851013183594, + "loss": 0.3718, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4620780944824219, + "rewards/margins": 2.0023863315582275, + "rewards/rejected": -3.4644646644592285, + "step": 3683 + }, + { + "epoch": 0.42, + "learning_rate": 1.7523118342502633e-07, + "logits/chosen": -2.2864646911621094, + "logits/rejected": -2.42741060256958, + "logps/chosen": -218.3878173828125, + "logps/rejected": -213.6890106201172, + "loss": 0.7316, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1141983270645142, + "rewards/margins": 1.2398858070373535, + "rewards/rejected": -2.354084014892578, + "step": 3684 + }, + { + "epoch": 0.42, + "learning_rate": 1.7519606695540208e-07, + "logits/chosen": -2.211531162261963, + "logits/rejected": -2.4272961616516113, + "logps/chosen": -231.1154022216797, + "logps/rejected": -206.16632080078125, + "loss": 0.4169, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2581069469451904, + "rewards/margins": 2.691976547241211, + "rewards/rejected": -3.9500832557678223, + "step": 3685 + }, + { + "epoch": 0.42, + "learning_rate": 1.7516095048577784e-07, + "logits/chosen": -2.57967472076416, + "logits/rejected": -2.381075620651245, + "logps/chosen": -212.86439514160156, + "logps/rejected": -332.57635498046875, + "loss": 0.1923, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.580716609954834, + "rewards/margins": 3.448117256164551, + "rewards/rejected": -4.028833866119385, + "step": 3686 + }, + { + "epoch": 0.43, + "learning_rate": 1.7512583401615356e-07, + "logits/chosen": -2.5319437980651855, + "logits/rejected": -2.5662007331848145, + "logps/chosen": -324.89593505859375, + "logps/rejected": -371.5928649902344, + "loss": 0.1706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5194817781448364, + "rewards/margins": 2.3568766117095947, + "rewards/rejected": -2.8763585090637207, + "step": 3687 + }, + { + "epoch": 0.43, + "learning_rate": 1.7509071754652932e-07, + "logits/chosen": -2.0015244483947754, + "logits/rejected": -2.0550856590270996, + "logps/chosen": -654.6995239257812, + "logps/rejected": -430.50616455078125, + "loss": 0.6165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9745237827301025, + "rewards/margins": 2.3766870498657227, + "rewards/rejected": -3.3512110710144043, + "step": 3688 + }, + { + "epoch": 0.43, + "learning_rate": 1.7505560107690504e-07, + "logits/chosen": -2.0950896739959717, + "logits/rejected": -1.9493703842163086, + "logps/chosen": -159.2845458984375, + "logps/rejected": -252.755615234375, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8233861923217773, + "rewards/margins": 2.067927598953247, + "rewards/rejected": -2.8913135528564453, + "step": 3689 + }, + { + "epoch": 0.43, + "learning_rate": 1.750204846072808e-07, + "logits/chosen": -2.579516649246216, + "logits/rejected": -2.5103700160980225, + "logps/chosen": -246.68463134765625, + "logps/rejected": -297.4742431640625, + "loss": 0.4844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.591590166091919, + "rewards/margins": 1.884194254875183, + "rewards/rejected": -2.4757843017578125, + "step": 3690 + }, + { + "epoch": 0.43, + "learning_rate": 1.7498536813765655e-07, + "logits/chosen": -2.8553805351257324, + "logits/rejected": -2.823073148727417, + "logps/chosen": -214.97073364257812, + "logps/rejected": -343.7340087890625, + "loss": 0.1379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7406502962112427, + "rewards/margins": 3.641632556915283, + "rewards/rejected": -4.382282733917236, + "step": 3691 + }, + { + "epoch": 0.43, + "learning_rate": 1.7495025166803228e-07, + "logits/chosen": -2.657167911529541, + "logits/rejected": -2.8240301609039307, + "logps/chosen": -126.77743530273438, + "logps/rejected": -173.01739501953125, + "loss": 0.6685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6708940863609314, + "rewards/margins": 2.215385913848877, + "rewards/rejected": -2.886279582977295, + "step": 3692 + }, + { + "epoch": 0.43, + "learning_rate": 1.7491513519840806e-07, + "logits/chosen": -2.0280957221984863, + "logits/rejected": -2.2107205390930176, + "logps/chosen": -258.6259460449219, + "logps/rejected": -307.65380859375, + "loss": 0.4754, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9942609071731567, + "rewards/margins": 1.1709825992584229, + "rewards/rejected": -2.16524338722229, + "step": 3693 + }, + { + "epoch": 0.43, + "learning_rate": 1.7488001872878381e-07, + "logits/chosen": -2.8920414447784424, + "logits/rejected": -2.996122121810913, + "logps/chosen": -221.13589477539062, + "logps/rejected": -311.3957214355469, + "loss": 0.4358, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.150698184967041, + "rewards/margins": 2.722837209701538, + "rewards/rejected": -3.873535633087158, + "step": 3694 + }, + { + "epoch": 0.43, + "learning_rate": 1.7484490225915954e-07, + "logits/chosen": -2.532399892807007, + "logits/rejected": -2.6072640419006348, + "logps/chosen": -324.2994384765625, + "logps/rejected": -297.1379699707031, + "loss": 0.3229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5651881694793701, + "rewards/margins": 1.8962795734405518, + "rewards/rejected": -2.461467981338501, + "step": 3695 + }, + { + "epoch": 0.43, + "learning_rate": 1.748097857895353e-07, + "logits/chosen": -2.4528911113739014, + "logits/rejected": -2.387531042098999, + "logps/chosen": -129.04592895507812, + "logps/rejected": -193.1346893310547, + "loss": 0.4512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6125612258911133, + "rewards/margins": 2.3559162616729736, + "rewards/rejected": -2.968477487564087, + "step": 3696 + }, + { + "epoch": 0.43, + "learning_rate": 1.7477466931991102e-07, + "logits/chosen": -1.8423422574996948, + "logits/rejected": -2.0847723484039307, + "logps/chosen": -363.09210205078125, + "logps/rejected": -305.799072265625, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9548479914665222, + "rewards/margins": 2.4132742881774902, + "rewards/rejected": -3.3681223392486572, + "step": 3697 + }, + { + "epoch": 0.43, + "learning_rate": 1.7473955285028678e-07, + "logits/chosen": -2.58256196975708, + "logits/rejected": -2.324629068374634, + "logps/chosen": -254.54220581054688, + "logps/rejected": -249.52316284179688, + "loss": 0.4948, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.473341941833496, + "rewards/margins": 0.8760615587234497, + "rewards/rejected": -2.3494033813476562, + "step": 3698 + }, + { + "epoch": 0.43, + "learning_rate": 1.7470443638066253e-07, + "logits/chosen": -2.553278923034668, + "logits/rejected": -2.6697511672973633, + "logps/chosen": -276.24395751953125, + "logps/rejected": -236.68955993652344, + "loss": 0.214, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2644386291503906, + "rewards/margins": 2.9987261295318604, + "rewards/rejected": -4.263164520263672, + "step": 3699 + }, + { + "epoch": 0.43, + "learning_rate": 1.7466931991103826e-07, + "logits/chosen": -2.280559539794922, + "logits/rejected": -2.427980422973633, + "logps/chosen": -184.90463256835938, + "logps/rejected": -174.17129516601562, + "loss": 0.6179, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6196066737174988, + "rewards/margins": 1.9853920936584473, + "rewards/rejected": -2.604998826980591, + "step": 3700 + }, + { + "epoch": 0.43, + "learning_rate": 1.74634203441414e-07, + "logits/chosen": -1.4104197025299072, + "logits/rejected": -1.810443639755249, + "logps/chosen": -291.0824279785156, + "logps/rejected": -241.86422729492188, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9534369707107544, + "rewards/margins": 1.3945083618164062, + "rewards/rejected": -2.34794545173645, + "step": 3701 + }, + { + "epoch": 0.43, + "learning_rate": 1.7459908697178976e-07, + "logits/chosen": -2.6581976413726807, + "logits/rejected": -2.321286916732788, + "logps/chosen": -183.0926971435547, + "logps/rejected": -212.3529052734375, + "loss": 0.4664, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8244446516036987, + "rewards/margins": 1.0794305801391602, + "rewards/rejected": -2.9038751125335693, + "step": 3702 + }, + { + "epoch": 0.43, + "learning_rate": 1.745639705021655e-07, + "logits/chosen": -2.5832695960998535, + "logits/rejected": -2.37520170211792, + "logps/chosen": -274.5074157714844, + "logps/rejected": -234.78311157226562, + "loss": 0.5689, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0677810907363892, + "rewards/margins": 2.130481243133545, + "rewards/rejected": -3.1982622146606445, + "step": 3703 + }, + { + "epoch": 0.43, + "learning_rate": 1.7452885403254127e-07, + "logits/chosen": -2.2265870571136475, + "logits/rejected": -2.5363848209381104, + "logps/chosen": -253.0446014404297, + "logps/rejected": -200.46568298339844, + "loss": 0.3975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2072417438030243, + "rewards/margins": 2.1053547859191895, + "rewards/rejected": -2.312596321105957, + "step": 3704 + }, + { + "epoch": 0.43, + "learning_rate": 1.74493737562917e-07, + "logits/chosen": -2.85567307472229, + "logits/rejected": -2.617107391357422, + "logps/chosen": -119.68362426757812, + "logps/rejected": -210.76670837402344, + "loss": 0.2605, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04221966862678528, + "rewards/margins": 1.7503933906555176, + "rewards/rejected": -1.7081738710403442, + "step": 3705 + }, + { + "epoch": 0.43, + "learning_rate": 1.7445862109329275e-07, + "logits/chosen": -1.9520537853240967, + "logits/rejected": -1.9826440811157227, + "logps/chosen": -463.1183166503906, + "logps/rejected": -399.5425720214844, + "loss": 0.5888, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.728278636932373, + "rewards/margins": 1.8791182041168213, + "rewards/rejected": -2.6073968410491943, + "step": 3706 + }, + { + "epoch": 0.43, + "learning_rate": 1.744235046236685e-07, + "logits/chosen": -2.5793521404266357, + "logits/rejected": -2.4568419456481934, + "logps/chosen": -277.3601379394531, + "logps/rejected": -350.9031066894531, + "loss": 0.3421, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1911054849624634, + "rewards/margins": 2.6173782348632812, + "rewards/rejected": -3.808483839035034, + "step": 3707 + }, + { + "epoch": 0.43, + "learning_rate": 1.7438838815404423e-07, + "logits/chosen": -2.2269980907440186, + "logits/rejected": -2.1645126342773438, + "logps/chosen": -298.35296630859375, + "logps/rejected": -316.21636962890625, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5154843330383301, + "rewards/margins": 2.7716479301452637, + "rewards/rejected": -3.2871322631835938, + "step": 3708 + }, + { + "epoch": 0.43, + "learning_rate": 1.7435327168442e-07, + "logits/chosen": -2.4658203125, + "logits/rejected": -2.2872109413146973, + "logps/chosen": -265.4764099121094, + "logps/rejected": -260.7173156738281, + "loss": 0.1337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4601145386695862, + "rewards/margins": 2.73685884475708, + "rewards/rejected": -3.1969735622406006, + "step": 3709 + }, + { + "epoch": 0.43, + "learning_rate": 1.7431815521479574e-07, + "logits/chosen": -2.0720953941345215, + "logits/rejected": -2.0447187423706055, + "logps/chosen": -358.13067626953125, + "logps/rejected": -379.4870910644531, + "loss": 0.2165, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30089640617370605, + "rewards/margins": 3.372893810272217, + "rewards/rejected": -3.673790454864502, + "step": 3710 + }, + { + "epoch": 0.43, + "learning_rate": 1.7428303874517147e-07, + "logits/chosen": -2.272207498550415, + "logits/rejected": -2.4193639755249023, + "logps/chosen": -274.88287353515625, + "logps/rejected": -319.232421875, + "loss": 0.4149, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4375825524330139, + "rewards/margins": 2.486161708831787, + "rewards/rejected": -2.9237442016601562, + "step": 3711 + }, + { + "epoch": 0.43, + "learning_rate": 1.7424792227554722e-07, + "logits/chosen": -3.070673704147339, + "logits/rejected": -3.002833366394043, + "logps/chosen": -338.3130187988281, + "logps/rejected": -419.7816162109375, + "loss": 0.1883, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2734375, + "rewards/margins": 3.3670454025268555, + "rewards/rejected": -4.6404829025268555, + "step": 3712 + }, + { + "epoch": 0.43, + "learning_rate": 1.7421280580592295e-07, + "logits/chosen": -2.5786008834838867, + "logits/rejected": -2.7454357147216797, + "logps/chosen": -97.59512329101562, + "logps/rejected": -242.9466552734375, + "loss": 0.1987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26267167925834656, + "rewards/margins": 3.316704750061035, + "rewards/rejected": -3.579376459121704, + "step": 3713 + }, + { + "epoch": 0.43, + "learning_rate": 1.741776893362987e-07, + "logits/chosen": -2.2428884506225586, + "logits/rejected": -2.1107325553894043, + "logps/chosen": -182.28683471679688, + "logps/rejected": -172.25631713867188, + "loss": 0.4819, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7051963210105896, + "rewards/margins": 1.1741935014724731, + "rewards/rejected": -1.8793898820877075, + "step": 3714 + }, + { + "epoch": 0.43, + "learning_rate": 1.7414257286667449e-07, + "logits/chosen": -2.5720267295837402, + "logits/rejected": -2.5981411933898926, + "logps/chosen": -121.84402465820312, + "logps/rejected": -174.78485107421875, + "loss": 0.3352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8698282241821289, + "rewards/margins": 1.099316954612732, + "rewards/rejected": -1.9691452980041504, + "step": 3715 + }, + { + "epoch": 0.43, + "learning_rate": 1.741074563970502e-07, + "logits/chosen": -1.9485058784484863, + "logits/rejected": -2.254270076751709, + "logps/chosen": -537.8999633789062, + "logps/rejected": -346.90411376953125, + "loss": 0.4607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5234023332595825, + "rewards/margins": 0.9701448082923889, + "rewards/rejected": -1.4935472011566162, + "step": 3716 + }, + { + "epoch": 0.43, + "learning_rate": 1.7407233992742597e-07, + "logits/chosen": -2.092101812362671, + "logits/rejected": -2.0461907386779785, + "logps/chosen": -360.84844970703125, + "logps/rejected": -480.22796630859375, + "loss": 1.5368, + "rewards/accuracies": 0.375, + "rewards/chosen": -3.1318514347076416, + "rewards/margins": -0.1268114298582077, + "rewards/rejected": -3.005039930343628, + "step": 3717 + }, + { + "epoch": 0.43, + "learning_rate": 1.740372234578017e-07, + "logits/chosen": -2.5695102214813232, + "logits/rejected": -2.6146535873413086, + "logps/chosen": -257.3558044433594, + "logps/rejected": -313.78302001953125, + "loss": 0.2002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.54622483253479, + "rewards/margins": 3.7196335792541504, + "rewards/rejected": -4.2658586502075195, + "step": 3718 + }, + { + "epoch": 0.43, + "learning_rate": 1.7400210698817745e-07, + "logits/chosen": -2.664907693862915, + "logits/rejected": -2.5186703205108643, + "logps/chosen": -125.43113708496094, + "logps/rejected": -252.84213256835938, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7207648158073425, + "rewards/margins": 3.5359244346618652, + "rewards/rejected": -4.256689548492432, + "step": 3719 + }, + { + "epoch": 0.43, + "learning_rate": 1.739669905185532e-07, + "logits/chosen": -2.2444863319396973, + "logits/rejected": -2.338137149810791, + "logps/chosen": -417.6651306152344, + "logps/rejected": -276.0670166015625, + "loss": 0.4521, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.260779619216919, + "rewards/margins": 1.0798618793487549, + "rewards/rejected": -2.340641498565674, + "step": 3720 + }, + { + "epoch": 0.43, + "learning_rate": 1.7393187404892893e-07, + "logits/chosen": -2.275958299636841, + "logits/rejected": -2.315610647201538, + "logps/chosen": -269.10845947265625, + "logps/rejected": -259.11798095703125, + "loss": 0.3128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7178850173950195, + "rewards/margins": 1.3834092617034912, + "rewards/rejected": -2.10129451751709, + "step": 3721 + }, + { + "epoch": 0.43, + "learning_rate": 1.7389675757930468e-07, + "logits/chosen": -2.429239273071289, + "logits/rejected": -2.544045925140381, + "logps/chosen": -220.24879455566406, + "logps/rejected": -264.84747314453125, + "loss": 0.213, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4551880359649658, + "rewards/margins": 2.6256959438323975, + "rewards/rejected": -3.0808839797973633, + "step": 3722 + }, + { + "epoch": 0.43, + "learning_rate": 1.7386164110968044e-07, + "logits/chosen": -2.3390214443206787, + "logits/rejected": -2.509690284729004, + "logps/chosen": -299.90911865234375, + "logps/rejected": -294.17681884765625, + "loss": 0.2143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5580601692199707, + "rewards/margins": 2.7736170291900635, + "rewards/rejected": -3.331677198410034, + "step": 3723 + }, + { + "epoch": 0.43, + "learning_rate": 1.7382652464005616e-07, + "logits/chosen": -2.2829556465148926, + "logits/rejected": -2.153109312057495, + "logps/chosen": -206.41941833496094, + "logps/rejected": -284.46185302734375, + "loss": 0.256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.669244647026062, + "rewards/margins": 2.350661039352417, + "rewards/rejected": -3.0199055671691895, + "step": 3724 + }, + { + "epoch": 0.43, + "learning_rate": 1.7379140817043192e-07, + "logits/chosen": -2.2433650493621826, + "logits/rejected": -2.395188808441162, + "logps/chosen": -294.7232360839844, + "logps/rejected": -177.75827026367188, + "loss": 0.5156, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7678340673446655, + "rewards/margins": 0.7712275981903076, + "rewards/rejected": -2.5390615463256836, + "step": 3725 + }, + { + "epoch": 0.43, + "learning_rate": 1.7375629170080765e-07, + "logits/chosen": -1.771245002746582, + "logits/rejected": -1.954437017440796, + "logps/chosen": -132.24142456054688, + "logps/rejected": -202.79147338867188, + "loss": 0.3397, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3614476919174194, + "rewards/margins": 2.14040470123291, + "rewards/rejected": -3.501852512359619, + "step": 3726 + }, + { + "epoch": 0.43, + "learning_rate": 1.7372117523118343e-07, + "logits/chosen": -2.0423083305358887, + "logits/rejected": -2.0986924171447754, + "logps/chosen": -226.12403869628906, + "logps/rejected": -241.36354064941406, + "loss": 0.4448, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2168315649032593, + "rewards/margins": 1.9510226249694824, + "rewards/rejected": -3.167854070663452, + "step": 3727 + }, + { + "epoch": 0.43, + "learning_rate": 1.7368605876155918e-07, + "logits/chosen": -2.4372031688690186, + "logits/rejected": -2.6321516036987305, + "logps/chosen": -308.97833251953125, + "logps/rejected": -306.5509033203125, + "loss": 0.4481, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.657071590423584, + "rewards/margins": 2.2891716957092285, + "rewards/rejected": -4.9462432861328125, + "step": 3728 + }, + { + "epoch": 0.43, + "learning_rate": 1.736509422919349e-07, + "logits/chosen": -2.298553466796875, + "logits/rejected": -2.183950662612915, + "logps/chosen": -309.299560546875, + "logps/rejected": -251.0209197998047, + "loss": 0.2952, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0999951362609863, + "rewards/margins": 2.380807399749756, + "rewards/rejected": -3.480802297592163, + "step": 3729 + }, + { + "epoch": 0.43, + "learning_rate": 1.7361582582231066e-07, + "logits/chosen": -2.179857015609741, + "logits/rejected": -2.2975058555603027, + "logps/chosen": -268.81671142578125, + "logps/rejected": -266.7983093261719, + "loss": 0.67, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6848442554473877, + "rewards/margins": 2.341599702835083, + "rewards/rejected": -4.026443958282471, + "step": 3730 + }, + { + "epoch": 0.43, + "learning_rate": 1.7358070935268641e-07, + "logits/chosen": -2.3913064002990723, + "logits/rejected": -2.298034191131592, + "logps/chosen": -232.11569213867188, + "logps/rejected": -200.54725646972656, + "loss": 0.5281, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0843610763549805, + "rewards/margins": 0.994807243347168, + "rewards/rejected": -2.0791683197021484, + "step": 3731 + }, + { + "epoch": 0.43, + "learning_rate": 1.7354559288306214e-07, + "logits/chosen": -2.88592529296875, + "logits/rejected": -2.9066855907440186, + "logps/chosen": -149.2014617919922, + "logps/rejected": -215.04861450195312, + "loss": 0.3987, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.149814248085022, + "rewards/margins": 2.711327314376831, + "rewards/rejected": -3.8611412048339844, + "step": 3732 + }, + { + "epoch": 0.43, + "learning_rate": 1.735104764134379e-07, + "logits/chosen": -2.4193034172058105, + "logits/rejected": -2.173814296722412, + "logps/chosen": -216.6834716796875, + "logps/rejected": -351.99981689453125, + "loss": 0.2574, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.314851999282837, + "rewards/margins": 3.324230194091797, + "rewards/rejected": -4.639081954956055, + "step": 3733 + }, + { + "epoch": 0.43, + "learning_rate": 1.7347535994381362e-07, + "logits/chosen": -2.3134889602661133, + "logits/rejected": -2.669947624206543, + "logps/chosen": -559.4317626953125, + "logps/rejected": -194.19622802734375, + "loss": 0.4005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8515218496322632, + "rewards/margins": 1.8696691989898682, + "rewards/rejected": -2.721191167831421, + "step": 3734 + }, + { + "epoch": 0.43, + "learning_rate": 1.7344024347418938e-07, + "logits/chosen": -2.047316551208496, + "logits/rejected": -1.7641997337341309, + "logps/chosen": -283.4241943359375, + "logps/rejected": -431.482666015625, + "loss": 0.81, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6315019130706787, + "rewards/margins": 1.1789262294769287, + "rewards/rejected": -2.8104281425476074, + "step": 3735 + }, + { + "epoch": 0.43, + "learning_rate": 1.7340512700456513e-07, + "logits/chosen": -2.03806471824646, + "logits/rejected": -2.033353805541992, + "logps/chosen": -446.2624816894531, + "logps/rejected": -473.5070495605469, + "loss": 0.6541, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1785917282104492, + "rewards/margins": 1.0876063108444214, + "rewards/rejected": -2.266197919845581, + "step": 3736 + }, + { + "epoch": 0.43, + "learning_rate": 1.7337001053494086e-07, + "logits/chosen": -2.1377265453338623, + "logits/rejected": -2.168424606323242, + "logps/chosen": -230.99905395507812, + "logps/rejected": -257.2730712890625, + "loss": 0.717, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.331427574157715, + "rewards/margins": 1.5642644166946411, + "rewards/rejected": -3.8956921100616455, + "step": 3737 + }, + { + "epoch": 0.43, + "learning_rate": 1.7333489406531664e-07, + "logits/chosen": -2.488595962524414, + "logits/rejected": -2.4565227031707764, + "logps/chosen": -252.84913635253906, + "logps/rejected": -373.9060363769531, + "loss": 0.8172, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8588571548461914, + "rewards/margins": 0.5421344041824341, + "rewards/rejected": -2.400991439819336, + "step": 3738 + }, + { + "epoch": 0.43, + "learning_rate": 1.732997775956924e-07, + "logits/chosen": -2.1281111240386963, + "logits/rejected": -2.08808970451355, + "logps/chosen": -362.7334899902344, + "logps/rejected": -368.4527893066406, + "loss": 0.351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3204973340034485, + "rewards/margins": 2.305485248565674, + "rewards/rejected": -2.6259825229644775, + "step": 3739 + }, + { + "epoch": 0.43, + "learning_rate": 1.7326466112606812e-07, + "logits/chosen": -2.848422050476074, + "logits/rejected": -2.447169303894043, + "logps/chosen": -280.4981384277344, + "logps/rejected": -361.2188720703125, + "loss": 0.6788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7548512816429138, + "rewards/margins": 2.5470242500305176, + "rewards/rejected": -3.301875114440918, + "step": 3740 + }, + { + "epoch": 0.43, + "learning_rate": 1.7322954465644387e-07, + "logits/chosen": -2.1191139221191406, + "logits/rejected": -2.487870693206787, + "logps/chosen": -267.79052734375, + "logps/rejected": -224.6591796875, + "loss": 0.3185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9246165156364441, + "rewards/margins": 2.0574276447296143, + "rewards/rejected": -2.982044219970703, + "step": 3741 + }, + { + "epoch": 0.43, + "learning_rate": 1.731944281868196e-07, + "logits/chosen": -2.096972703933716, + "logits/rejected": -2.0649592876434326, + "logps/chosen": -297.6888732910156, + "logps/rejected": -246.94493103027344, + "loss": 0.6049, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5288341045379639, + "rewards/margins": 2.8638241291046143, + "rewards/rejected": -4.392658233642578, + "step": 3742 + }, + { + "epoch": 0.43, + "learning_rate": 1.7315931171719536e-07, + "logits/chosen": -2.450012683868408, + "logits/rejected": -2.624135732650757, + "logps/chosen": -194.8348846435547, + "logps/rejected": -207.05160522460938, + "loss": 1.0163, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.405521273612976, + "rewards/margins": 1.1399270296096802, + "rewards/rejected": -2.5454483032226562, + "step": 3743 + }, + { + "epoch": 0.43, + "learning_rate": 1.731241952475711e-07, + "logits/chosen": -2.793168783187866, + "logits/rejected": -2.7915701866149902, + "logps/chosen": -234.56307983398438, + "logps/rejected": -193.22802734375, + "loss": 0.4046, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7040572166442871, + "rewards/margins": 2.395887851715088, + "rewards/rejected": -3.099945068359375, + "step": 3744 + }, + { + "epoch": 0.43, + "learning_rate": 1.7308907877794684e-07, + "logits/chosen": -2.2650513648986816, + "logits/rejected": -2.658298969268799, + "logps/chosen": -338.1461486816406, + "logps/rejected": -279.3612365722656, + "loss": 0.6319, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3585703372955322, + "rewards/margins": 1.208149790763855, + "rewards/rejected": -2.5667200088500977, + "step": 3745 + }, + { + "epoch": 0.43, + "learning_rate": 1.730539623083226e-07, + "logits/chosen": -2.285170555114746, + "logits/rejected": -2.4940528869628906, + "logps/chosen": -334.3975830078125, + "logps/rejected": -220.14926147460938, + "loss": 0.6815, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8116886615753174, + "rewards/margins": 0.21509870886802673, + "rewards/rejected": -1.0267874002456665, + "step": 3746 + }, + { + "epoch": 0.43, + "learning_rate": 1.7301884583869837e-07, + "logits/chosen": -2.736454725265503, + "logits/rejected": -2.6109836101531982, + "logps/chosen": -361.1083984375, + "logps/rejected": -349.2987365722656, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03614532947540283, + "rewards/margins": 3.4390206336975098, + "rewards/rejected": -3.475165843963623, + "step": 3747 + }, + { + "epoch": 0.43, + "learning_rate": 1.7298372936907407e-07, + "logits/chosen": -2.384404182434082, + "logits/rejected": -2.2881665229797363, + "logps/chosen": -289.1214294433594, + "logps/rejected": -321.5381164550781, + "loss": 0.2378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5884854197502136, + "rewards/margins": 2.119734764099121, + "rewards/rejected": -2.7082200050354004, + "step": 3748 + }, + { + "epoch": 0.43, + "learning_rate": 1.7294861289944985e-07, + "logits/chosen": -2.447343349456787, + "logits/rejected": -2.267524003982544, + "logps/chosen": -269.5020751953125, + "logps/rejected": -278.80364990234375, + "loss": 0.5486, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0019279718399048, + "rewards/margins": 1.2364490032196045, + "rewards/rejected": -2.238377094268799, + "step": 3749 + }, + { + "epoch": 0.43, + "learning_rate": 1.7291349642982558e-07, + "logits/chosen": -1.705774188041687, + "logits/rejected": -1.9181822538375854, + "logps/chosen": -585.7000122070312, + "logps/rejected": -524.7742919921875, + "loss": 0.1177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1703091561794281, + "rewards/margins": 3.5378003120422363, + "rewards/rejected": -3.367490768432617, + "step": 3750 + }, + { + "epoch": 0.43, + "learning_rate": 1.7287837996020133e-07, + "logits/chosen": -2.446239948272705, + "logits/rejected": -2.530677080154419, + "logps/chosen": -263.9695129394531, + "logps/rejected": -327.2392578125, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.190446376800537, + "rewards/margins": 3.7636866569519043, + "rewards/rejected": -4.954133033752441, + "step": 3751 + }, + { + "epoch": 0.43, + "learning_rate": 1.728432634905771e-07, + "logits/chosen": -2.8968617916107178, + "logits/rejected": -2.935072183609009, + "logps/chosen": -289.0413513183594, + "logps/rejected": -192.8693084716797, + "loss": 0.5632, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.038048505783081, + "rewards/margins": 1.364652395248413, + "rewards/rejected": -2.402700901031494, + "step": 3752 + }, + { + "epoch": 0.43, + "learning_rate": 1.7280814702095281e-07, + "logits/chosen": -2.8359017372131348, + "logits/rejected": -2.8273468017578125, + "logps/chosen": -376.1318359375, + "logps/rejected": -333.7947082519531, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8713890314102173, + "rewards/margins": 1.7625257968902588, + "rewards/rejected": -2.6339147090911865, + "step": 3753 + }, + { + "epoch": 0.43, + "learning_rate": 1.7277303055132857e-07, + "logits/chosen": -2.1869349479675293, + "logits/rejected": -2.1597864627838135, + "logps/chosen": -303.8690185546875, + "logps/rejected": -278.869384765625, + "loss": 0.588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8328754901885986, + "rewards/margins": 1.302371621131897, + "rewards/rejected": -2.135247230529785, + "step": 3754 + }, + { + "epoch": 0.43, + "learning_rate": 1.7273791408170432e-07, + "logits/chosen": -1.8569352626800537, + "logits/rejected": -2.287899971008301, + "logps/chosen": -692.0770263671875, + "logps/rejected": -401.9532470703125, + "loss": 0.4495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3724208474159241, + "rewards/margins": 1.769983172416687, + "rewards/rejected": -2.142404079437256, + "step": 3755 + }, + { + "epoch": 0.43, + "learning_rate": 1.7270279761208005e-07, + "logits/chosen": -1.856876015663147, + "logits/rejected": -1.7215908765792847, + "logps/chosen": -272.4018249511719, + "logps/rejected": -407.2786865234375, + "loss": 0.5139, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.842138648033142, + "rewards/margins": 1.6670489311218262, + "rewards/rejected": -3.509187698364258, + "step": 3756 + }, + { + "epoch": 0.43, + "learning_rate": 1.726676811424558e-07, + "logits/chosen": -2.567389965057373, + "logits/rejected": -2.371199369430542, + "logps/chosen": -216.22467041015625, + "logps/rejected": -272.1217956542969, + "loss": 0.2428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7136369943618774, + "rewards/margins": 2.589496612548828, + "rewards/rejected": -3.303133487701416, + "step": 3757 + }, + { + "epoch": 0.43, + "learning_rate": 1.7263256467283153e-07, + "logits/chosen": -2.1888208389282227, + "logits/rejected": -2.2745413780212402, + "logps/chosen": -259.7445068359375, + "logps/rejected": -234.66444396972656, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6676732301712036, + "rewards/margins": 1.5060265064239502, + "rewards/rejected": -2.1736998558044434, + "step": 3758 + }, + { + "epoch": 0.43, + "learning_rate": 1.7259744820320728e-07, + "logits/chosen": -2.425238609313965, + "logits/rejected": -2.174091100692749, + "logps/chosen": -134.35360717773438, + "logps/rejected": -253.11642456054688, + "loss": 0.7594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6234185099601746, + "rewards/margins": 1.5415902137756348, + "rewards/rejected": -2.165008544921875, + "step": 3759 + }, + { + "epoch": 0.43, + "learning_rate": 1.7256233173358306e-07, + "logits/chosen": -2.737079381942749, + "logits/rejected": -2.577766180038452, + "logps/chosen": -95.79524993896484, + "logps/rejected": -175.65377807617188, + "loss": 0.4711, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0143707990646362, + "rewards/margins": 2.33659291267395, + "rewards/rejected": -3.350964069366455, + "step": 3760 + }, + { + "epoch": 0.43, + "learning_rate": 1.725272152639588e-07, + "logits/chosen": -2.4828128814697266, + "logits/rejected": -2.401500940322876, + "logps/chosen": -190.4525604248047, + "logps/rejected": -261.5472106933594, + "loss": 0.4494, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.159170150756836, + "rewards/margins": 1.3591773509979248, + "rewards/rejected": -2.5183472633361816, + "step": 3761 + }, + { + "epoch": 0.43, + "learning_rate": 1.7249209879433455e-07, + "logits/chosen": -2.216224193572998, + "logits/rejected": -2.0230352878570557, + "logps/chosen": -197.84030151367188, + "logps/rejected": -258.95904541015625, + "loss": 0.2415, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0407283306121826, + "rewards/margins": 3.2367067337036133, + "rewards/rejected": -4.277435302734375, + "step": 3762 + }, + { + "epoch": 0.43, + "learning_rate": 1.7245698232471027e-07, + "logits/chosen": -1.9306813478469849, + "logits/rejected": -2.1453537940979004, + "logps/chosen": -238.96542358398438, + "logps/rejected": -166.7171173095703, + "loss": 0.7061, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0998679399490356, + "rewards/margins": 0.8750565052032471, + "rewards/rejected": -1.9749245643615723, + "step": 3763 + }, + { + "epoch": 0.43, + "learning_rate": 1.7242186585508603e-07, + "logits/chosen": -2.5093393325805664, + "logits/rejected": -2.397428512573242, + "logps/chosen": -317.4570007324219, + "logps/rejected": -295.96160888671875, + "loss": 0.7021, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8035125732421875, + "rewards/margins": 0.45499205589294434, + "rewards/rejected": -2.258504629135132, + "step": 3764 + }, + { + "epoch": 0.43, + "learning_rate": 1.7238674938546178e-07, + "logits/chosen": -2.0446572303771973, + "logits/rejected": -1.9806305170059204, + "logps/chosen": -346.0099182128906, + "logps/rejected": -248.74034118652344, + "loss": 1.0832, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8512260913848877, + "rewards/margins": 0.07484832406044006, + "rewards/rejected": -0.9260744452476501, + "step": 3765 + }, + { + "epoch": 0.43, + "learning_rate": 1.723516329158375e-07, + "logits/chosen": -2.61851167678833, + "logits/rejected": -2.756491184234619, + "logps/chosen": -307.47509765625, + "logps/rejected": -188.28358459472656, + "loss": 0.3912, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8148785829544067, + "rewards/margins": 2.772365093231201, + "rewards/rejected": -3.5872435569763184, + "step": 3766 + }, + { + "epoch": 0.43, + "learning_rate": 1.7231651644621326e-07, + "logits/chosen": -2.1098575592041016, + "logits/rejected": -2.2678239345550537, + "logps/chosen": -348.45123291015625, + "logps/rejected": -363.0090026855469, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4281258583068848, + "rewards/margins": 2.0179333686828613, + "rewards/rejected": -3.446059226989746, + "step": 3767 + }, + { + "epoch": 0.43, + "learning_rate": 1.7228139997658902e-07, + "logits/chosen": -2.1955511569976807, + "logits/rejected": -2.263143539428711, + "logps/chosen": -227.6018524169922, + "logps/rejected": -303.9581298828125, + "loss": 0.1149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.021193422377109528, + "rewards/margins": 5.082085132598877, + "rewards/rejected": -5.103278636932373, + "step": 3768 + }, + { + "epoch": 0.43, + "learning_rate": 1.7224628350696474e-07, + "logits/chosen": -2.446040630340576, + "logits/rejected": -2.483865737915039, + "logps/chosen": -314.6625671386719, + "logps/rejected": -299.76275634765625, + "loss": 0.1704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.001455359160900116, + "rewards/margins": 2.4332869052886963, + "rewards/rejected": -2.4318315982818604, + "step": 3769 + }, + { + "epoch": 0.43, + "learning_rate": 1.722111670373405e-07, + "logits/chosen": -2.90270733833313, + "logits/rejected": -2.949770927429199, + "logps/chosen": -205.06716918945312, + "logps/rejected": -273.055908203125, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.827436089515686, + "rewards/margins": 4.2470622062683105, + "rewards/rejected": -5.074498176574707, + "step": 3770 + }, + { + "epoch": 0.43, + "learning_rate": 1.7217605056771622e-07, + "logits/chosen": -1.672374963760376, + "logits/rejected": -1.834188461303711, + "logps/chosen": -411.575927734375, + "logps/rejected": -472.8653259277344, + "loss": 0.3481, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.416628897190094, + "rewards/margins": 2.862792491912842, + "rewards/rejected": -3.279421329498291, + "step": 3771 + }, + { + "epoch": 0.43, + "learning_rate": 1.72140934098092e-07, + "logits/chosen": -2.4936068058013916, + "logits/rejected": -2.363560438156128, + "logps/chosen": -194.47557067871094, + "logps/rejected": -200.67845153808594, + "loss": 0.5737, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5417225956916809, + "rewards/margins": 0.8497003316879272, + "rewards/rejected": -1.391422986984253, + "step": 3772 + }, + { + "epoch": 0.43, + "learning_rate": 1.7210581762846776e-07, + "logits/chosen": -3.0633656978607178, + "logits/rejected": -2.9542429447174072, + "logps/chosen": -376.4439392089844, + "logps/rejected": -282.9056091308594, + "loss": 0.1892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5098645687103271, + "rewards/margins": 2.681166887283325, + "rewards/rejected": -3.1910316944122314, + "step": 3773 + }, + { + "epoch": 0.44, + "learning_rate": 1.7207070115884349e-07, + "logits/chosen": -1.7901926040649414, + "logits/rejected": -1.992153525352478, + "logps/chosen": -421.330078125, + "logps/rejected": -489.4497375488281, + "loss": 0.4818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2921766936779022, + "rewards/margins": 1.5697221755981445, + "rewards/rejected": -1.8618988990783691, + "step": 3774 + }, + { + "epoch": 0.44, + "learning_rate": 1.7203558468921924e-07, + "logits/chosen": -2.6235249042510986, + "logits/rejected": -2.356980323791504, + "logps/chosen": -159.07443237304688, + "logps/rejected": -192.19757080078125, + "loss": 0.4186, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8924105167388916, + "rewards/margins": 1.0091497898101807, + "rewards/rejected": -1.9015603065490723, + "step": 3775 + }, + { + "epoch": 0.44, + "learning_rate": 1.72000468219595e-07, + "logits/chosen": -2.1231777667999268, + "logits/rejected": -2.2860100269317627, + "logps/chosen": -210.87399291992188, + "logps/rejected": -234.9132080078125, + "loss": 0.413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5546303987503052, + "rewards/margins": 1.9671902656555176, + "rewards/rejected": -2.521820545196533, + "step": 3776 + }, + { + "epoch": 0.44, + "learning_rate": 1.7196535174997072e-07, + "logits/chosen": -1.9898755550384521, + "logits/rejected": -1.9523406028747559, + "logps/chosen": -139.03814697265625, + "logps/rejected": -254.30252075195312, + "loss": 0.4036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.597998321056366, + "rewards/margins": 1.5730060338974, + "rewards/rejected": -2.171004295349121, + "step": 3777 + }, + { + "epoch": 0.44, + "learning_rate": 1.7193023528034648e-07, + "logits/chosen": -2.202172040939331, + "logits/rejected": -1.9795033931732178, + "logps/chosen": -192.14370727539062, + "logps/rejected": -348.5393981933594, + "loss": 0.2845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5081850290298462, + "rewards/margins": 3.1580474376678467, + "rewards/rejected": -3.6662323474884033, + "step": 3778 + }, + { + "epoch": 0.44, + "learning_rate": 1.718951188107222e-07, + "logits/chosen": -2.4945600032806396, + "logits/rejected": -2.6634106636047363, + "logps/chosen": -233.42840576171875, + "logps/rejected": -215.6282958984375, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4223039746284485, + "rewards/margins": 3.4206700325012207, + "rewards/rejected": -3.8429741859436035, + "step": 3779 + }, + { + "epoch": 0.44, + "learning_rate": 1.7186000234109796e-07, + "logits/chosen": -2.303050994873047, + "logits/rejected": -2.5476770401000977, + "logps/chosen": -385.24102783203125, + "logps/rejected": -254.7397003173828, + "loss": 0.7811, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1461308002471924, + "rewards/margins": 0.7208349704742432, + "rewards/rejected": -1.8669657707214355, + "step": 3780 + }, + { + "epoch": 0.44, + "learning_rate": 1.718248858714737e-07, + "logits/chosen": -2.597083806991577, + "logits/rejected": -2.619447946548462, + "logps/chosen": -162.16958618164062, + "logps/rejected": -213.56871032714844, + "loss": 0.2655, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1001909971237183, + "rewards/margins": 2.2220077514648438, + "rewards/rejected": -3.3221983909606934, + "step": 3781 + }, + { + "epoch": 0.44, + "learning_rate": 1.7178976940184944e-07, + "logits/chosen": -1.8762435913085938, + "logits/rejected": -2.406557083129883, + "logps/chosen": -443.8714599609375, + "logps/rejected": -330.7965393066406, + "loss": 0.229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3220938742160797, + "rewards/margins": 2.051208972930908, + "rewards/rejected": -2.373302698135376, + "step": 3782 + }, + { + "epoch": 0.44, + "learning_rate": 1.7175465293222522e-07, + "logits/chosen": -2.1607906818389893, + "logits/rejected": -2.4607737064361572, + "logps/chosen": -249.90185546875, + "logps/rejected": -288.2975769042969, + "loss": 0.6738, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.541136622428894, + "rewards/margins": 0.7531237602233887, + "rewards/rejected": -2.294260263442993, + "step": 3783 + }, + { + "epoch": 0.44, + "learning_rate": 1.7171953646260097e-07, + "logits/chosen": -2.4821064472198486, + "logits/rejected": -2.5397307872772217, + "logps/chosen": -158.6207733154297, + "logps/rejected": -181.14215087890625, + "loss": 0.3383, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8720839023590088, + "rewards/margins": 1.9641588926315308, + "rewards/rejected": -2.83624267578125, + "step": 3784 + }, + { + "epoch": 0.44, + "learning_rate": 1.716844199929767e-07, + "logits/chosen": -2.286621570587158, + "logits/rejected": -2.08520245552063, + "logps/chosen": -268.77813720703125, + "logps/rejected": -372.3160095214844, + "loss": 0.5387, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2246862649917603, + "rewards/margins": 0.45782434940338135, + "rewards/rejected": -1.6825106143951416, + "step": 3785 + }, + { + "epoch": 0.44, + "learning_rate": 1.7164930352335245e-07, + "logits/chosen": -2.580629348754883, + "logits/rejected": -2.6632728576660156, + "logps/chosen": -304.691162109375, + "logps/rejected": -190.48268127441406, + "loss": 1.0886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.576490581035614, + "rewards/margins": 0.5148320198059082, + "rewards/rejected": -1.091322660446167, + "step": 3786 + }, + { + "epoch": 0.44, + "learning_rate": 1.7161418705372818e-07, + "logits/chosen": -2.4644618034362793, + "logits/rejected": -2.550147771835327, + "logps/chosen": -441.30255126953125, + "logps/rejected": -357.9097900390625, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.711467981338501, + "rewards/margins": 1.7867532968521118, + "rewards/rejected": -2.4982213973999023, + "step": 3787 + }, + { + "epoch": 0.44, + "learning_rate": 1.7157907058410393e-07, + "logits/chosen": -2.0607547760009766, + "logits/rejected": -1.9700002670288086, + "logps/chosen": -370.7115173339844, + "logps/rejected": -371.62774658203125, + "loss": 0.1464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.849545419216156, + "rewards/margins": 2.831505298614502, + "rewards/rejected": -3.6810505390167236, + "step": 3788 + }, + { + "epoch": 0.44, + "learning_rate": 1.715439541144797e-07, + "logits/chosen": -2.1055827140808105, + "logits/rejected": -1.9755529165267944, + "logps/chosen": -147.15255737304688, + "logps/rejected": -301.9457702636719, + "loss": 0.2603, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0706195831298828, + "rewards/margins": 3.4402596950531006, + "rewards/rejected": -4.5108795166015625, + "step": 3789 + }, + { + "epoch": 0.44, + "learning_rate": 1.7150883764485542e-07, + "logits/chosen": -2.388728380203247, + "logits/rejected": -2.405648946762085, + "logps/chosen": -332.3238525390625, + "logps/rejected": -348.6378173828125, + "loss": 0.6136, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4198437929153442, + "rewards/margins": 1.7976195812225342, + "rewards/rejected": -3.217463493347168, + "step": 3790 + }, + { + "epoch": 0.44, + "learning_rate": 1.7147372117523117e-07, + "logits/chosen": -1.7605946063995361, + "logits/rejected": -2.1170966625213623, + "logps/chosen": -231.79275512695312, + "logps/rejected": -209.193115234375, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4497804641723633, + "rewards/margins": 3.024508237838745, + "rewards/rejected": -4.4742889404296875, + "step": 3791 + }, + { + "epoch": 0.44, + "learning_rate": 1.7143860470560695e-07, + "logits/chosen": -1.9430607557296753, + "logits/rejected": -2.084512233734131, + "logps/chosen": -419.9729919433594, + "logps/rejected": -363.4596862792969, + "loss": 0.5413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3032764494419098, + "rewards/margins": 1.0244349241256714, + "rewards/rejected": -1.3277113437652588, + "step": 3792 + }, + { + "epoch": 0.44, + "learning_rate": 1.7140348823598265e-07, + "logits/chosen": -2.2760515213012695, + "logits/rejected": -2.2595090866088867, + "logps/chosen": -402.9557189941406, + "logps/rejected": -360.75634765625, + "loss": 0.4373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9310461282730103, + "rewards/margins": 1.210587501525879, + "rewards/rejected": -2.1416335105895996, + "step": 3793 + }, + { + "epoch": 0.44, + "learning_rate": 1.7136837176635843e-07, + "logits/chosen": -2.562532424926758, + "logits/rejected": -2.8707027435302734, + "logps/chosen": -322.9198303222656, + "logps/rejected": -196.19644165039062, + "loss": 0.2901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40986156463623047, + "rewards/margins": 1.4472839832305908, + "rewards/rejected": -1.8571455478668213, + "step": 3794 + }, + { + "epoch": 0.44, + "learning_rate": 1.7133325529673416e-07, + "logits/chosen": -2.6729564666748047, + "logits/rejected": -2.8155035972595215, + "logps/chosen": -97.91200256347656, + "logps/rejected": -90.00159454345703, + "loss": 0.6458, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.012850284576416, + "rewards/margins": 0.7848643660545349, + "rewards/rejected": -1.7977144718170166, + "step": 3795 + }, + { + "epoch": 0.44, + "learning_rate": 1.712981388271099e-07, + "logits/chosen": -2.4307174682617188, + "logits/rejected": -2.6195857524871826, + "logps/chosen": -331.498779296875, + "logps/rejected": -244.21945190429688, + "loss": 0.2532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9904413223266602, + "rewards/margins": 3.515638828277588, + "rewards/rejected": -4.506080150604248, + "step": 3796 + }, + { + "epoch": 0.44, + "learning_rate": 1.7126302235748567e-07, + "logits/chosen": -2.682821273803711, + "logits/rejected": -2.5022356510162354, + "logps/chosen": -165.94058227539062, + "logps/rejected": -272.36273193359375, + "loss": 0.257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7118930816650391, + "rewards/margins": 2.6183550357818604, + "rewards/rejected": -3.3302481174468994, + "step": 3797 + }, + { + "epoch": 0.44, + "learning_rate": 1.712279058878614e-07, + "logits/chosen": -2.704883575439453, + "logits/rejected": -2.6912293434143066, + "logps/chosen": -319.0637512207031, + "logps/rejected": -402.8757629394531, + "loss": 0.4466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2678797245025635, + "rewards/margins": 2.6337578296661377, + "rewards/rejected": -3.901637554168701, + "step": 3798 + }, + { + "epoch": 0.44, + "learning_rate": 1.7119278941823715e-07, + "logits/chosen": -1.960347056388855, + "logits/rejected": -2.324129819869995, + "logps/chosen": -236.13360595703125, + "logps/rejected": -192.1865692138672, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05039702355861664, + "rewards/margins": 1.2960059642791748, + "rewards/rejected": -1.2456088066101074, + "step": 3799 + }, + { + "epoch": 0.44, + "learning_rate": 1.711576729486129e-07, + "logits/chosen": -2.300632953643799, + "logits/rejected": -2.2930760383605957, + "logps/chosen": -236.32923889160156, + "logps/rejected": -184.28297424316406, + "loss": 1.1569, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5534391403198242, + "rewards/margins": 0.2930576503276825, + "rewards/rejected": -1.84649658203125, + "step": 3800 + }, + { + "epoch": 0.44, + "learning_rate": 1.7112255647898863e-07, + "logits/chosen": -2.242605209350586, + "logits/rejected": -1.9438470602035522, + "logps/chosen": -249.7130584716797, + "logps/rejected": -385.1592712402344, + "loss": 0.4811, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.090812087059021, + "rewards/margins": 2.252624750137329, + "rewards/rejected": -3.3434367179870605, + "step": 3801 + }, + { + "epoch": 0.44, + "learning_rate": 1.7108744000936438e-07, + "logits/chosen": -2.3173909187316895, + "logits/rejected": -2.434642791748047, + "logps/chosen": -340.2464599609375, + "logps/rejected": -332.917724609375, + "loss": 0.1123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23889899253845215, + "rewards/margins": 3.621528387069702, + "rewards/rejected": -3.8604276180267334, + "step": 3802 + }, + { + "epoch": 0.44, + "learning_rate": 1.710523235397401e-07, + "logits/chosen": -2.9138097763061523, + "logits/rejected": -2.929800510406494, + "logps/chosen": -285.6733093261719, + "logps/rejected": -236.4002685546875, + "loss": 0.1734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7650874853134155, + "rewards/margins": 2.25852108001709, + "rewards/rejected": -3.023608684539795, + "step": 3803 + }, + { + "epoch": 0.44, + "learning_rate": 1.7101720707011586e-07, + "logits/chosen": -2.8830204010009766, + "logits/rejected": -2.7694571018218994, + "logps/chosen": -118.55218505859375, + "logps/rejected": -127.65757751464844, + "loss": 0.4095, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32742854952812195, + "rewards/margins": 1.5565712451934814, + "rewards/rejected": -1.8839998245239258, + "step": 3804 + }, + { + "epoch": 0.44, + "learning_rate": 1.7098209060049164e-07, + "logits/chosen": -2.781064987182617, + "logits/rejected": -2.7538986206054688, + "logps/chosen": -243.99484252929688, + "logps/rejected": -300.0180969238281, + "loss": 0.495, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7805542945861816, + "rewards/margins": 4.934805870056152, + "rewards/rejected": -6.715359687805176, + "step": 3805 + }, + { + "epoch": 0.44, + "learning_rate": 1.7094697413086737e-07, + "logits/chosen": -2.5404701232910156, + "logits/rejected": -2.533531665802002, + "logps/chosen": -287.4344177246094, + "logps/rejected": -207.54815673828125, + "loss": 0.3233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.568620502948761, + "rewards/margins": 1.857923984527588, + "rewards/rejected": -2.426544427871704, + "step": 3806 + }, + { + "epoch": 0.44, + "learning_rate": 1.7091185766124313e-07, + "logits/chosen": -2.0172946453094482, + "logits/rejected": -1.7453944683074951, + "logps/chosen": -238.83682250976562, + "logps/rejected": -393.95635986328125, + "loss": 0.1473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37336817383766174, + "rewards/margins": 3.34295654296875, + "rewards/rejected": -3.7163245677948, + "step": 3807 + }, + { + "epoch": 0.44, + "learning_rate": 1.7087674119161885e-07, + "logits/chosen": -1.9843579530715942, + "logits/rejected": -2.101822853088379, + "logps/chosen": -273.39459228515625, + "logps/rejected": -213.950927734375, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3718414306640625, + "rewards/margins": 1.7508031129837036, + "rewards/rejected": -2.1226444244384766, + "step": 3808 + }, + { + "epoch": 0.44, + "learning_rate": 1.708416247219946e-07, + "logits/chosen": -2.6384518146514893, + "logits/rejected": -2.726228713989258, + "logps/chosen": -227.1344451904297, + "logps/rejected": -163.04983520507812, + "loss": 0.4097, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2002403736114502, + "rewards/margins": 2.1770823001861572, + "rewards/rejected": -3.3773226737976074, + "step": 3809 + }, + { + "epoch": 0.44, + "learning_rate": 1.7080650825237036e-07, + "logits/chosen": -2.9088211059570312, + "logits/rejected": -2.825148820877075, + "logps/chosen": -325.6295166015625, + "logps/rejected": -331.3367614746094, + "loss": 0.2892, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46427711844444275, + "rewards/margins": 2.9513845443725586, + "rewards/rejected": -3.4156618118286133, + "step": 3810 + }, + { + "epoch": 0.44, + "learning_rate": 1.707713917827461e-07, + "logits/chosen": -1.9880104064941406, + "logits/rejected": -2.1039419174194336, + "logps/chosen": -376.9765319824219, + "logps/rejected": -264.8349914550781, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.572688102722168, + "rewards/margins": 2.008105754852295, + "rewards/rejected": -2.580793857574463, + "step": 3811 + }, + { + "epoch": 0.44, + "learning_rate": 1.7073627531312184e-07, + "logits/chosen": -2.885791778564453, + "logits/rejected": -2.8127615451812744, + "logps/chosen": -287.10296630859375, + "logps/rejected": -276.8470458984375, + "loss": 0.492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7312270998954773, + "rewards/margins": 1.6586580276489258, + "rewards/rejected": -2.389885187149048, + "step": 3812 + }, + { + "epoch": 0.44, + "learning_rate": 1.707011588434976e-07, + "logits/chosen": -1.6136667728424072, + "logits/rejected": -1.991142749786377, + "logps/chosen": -240.0808563232422, + "logps/rejected": -237.0116729736328, + "loss": 0.2777, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1660195589065552, + "rewards/margins": 1.430239200592041, + "rewards/rejected": -2.5962586402893066, + "step": 3813 + }, + { + "epoch": 0.44, + "learning_rate": 1.7066604237387332e-07, + "logits/chosen": -2.69391131401062, + "logits/rejected": -2.8443403244018555, + "logps/chosen": -204.03778076171875, + "logps/rejected": -189.75328063964844, + "loss": 0.3394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45081597566604614, + "rewards/margins": 1.7208352088928223, + "rewards/rejected": -2.1716511249542236, + "step": 3814 + }, + { + "epoch": 0.44, + "learning_rate": 1.7063092590424908e-07, + "logits/chosen": -2.2435662746429443, + "logits/rejected": -2.31201171875, + "logps/chosen": -161.0721893310547, + "logps/rejected": -157.7292938232422, + "loss": 0.535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8389098048210144, + "rewards/margins": 1.1591651439666748, + "rewards/rejected": -1.9980747699737549, + "step": 3815 + }, + { + "epoch": 0.44, + "learning_rate": 1.705958094346248e-07, + "logits/chosen": -2.035829782485962, + "logits/rejected": -1.7138956785202026, + "logps/chosen": -300.5357360839844, + "logps/rejected": -451.09979248046875, + "loss": 0.2003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6035288572311401, + "rewards/margins": 2.507415771484375, + "rewards/rejected": -3.1109447479248047, + "step": 3816 + }, + { + "epoch": 0.44, + "learning_rate": 1.7056069296500058e-07, + "logits/chosen": -2.216228485107422, + "logits/rejected": -2.5866284370422363, + "logps/chosen": -368.2237548828125, + "logps/rejected": -245.37249755859375, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47089487314224243, + "rewards/margins": 1.6033954620361328, + "rewards/rejected": -2.0742902755737305, + "step": 3817 + }, + { + "epoch": 0.44, + "learning_rate": 1.7052557649537634e-07, + "logits/chosen": -2.2914984226226807, + "logits/rejected": -2.0757105350494385, + "logps/chosen": -306.72149658203125, + "logps/rejected": -272.123046875, + "loss": 0.3866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9581869840621948, + "rewards/margins": 1.5000040531158447, + "rewards/rejected": -2.458191156387329, + "step": 3818 + }, + { + "epoch": 0.44, + "learning_rate": 1.7049046002575207e-07, + "logits/chosen": -2.3657827377319336, + "logits/rejected": -2.164844274520874, + "logps/chosen": -151.30575561523438, + "logps/rejected": -249.42010498046875, + "loss": 0.7358, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0355045795440674, + "rewards/margins": 1.2237324714660645, + "rewards/rejected": -2.259237051010132, + "step": 3819 + }, + { + "epoch": 0.44, + "learning_rate": 1.7045534355612782e-07, + "logits/chosen": -2.286648750305176, + "logits/rejected": -2.307330369949341, + "logps/chosen": -392.8587646484375, + "logps/rejected": -308.4493103027344, + "loss": 0.1604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6557238698005676, + "rewards/margins": 2.3702359199523926, + "rewards/rejected": -3.0259599685668945, + "step": 3820 + }, + { + "epoch": 0.44, + "learning_rate": 1.7042022708650357e-07, + "logits/chosen": -2.5334651470184326, + "logits/rejected": -2.5173354148864746, + "logps/chosen": -258.91998291015625, + "logps/rejected": -247.20106506347656, + "loss": 0.2184, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3299521207809448, + "rewards/margins": 2.5490169525146484, + "rewards/rejected": -3.8789687156677246, + "step": 3821 + }, + { + "epoch": 0.44, + "learning_rate": 1.703851106168793e-07, + "logits/chosen": -2.894611358642578, + "logits/rejected": -2.8180181980133057, + "logps/chosen": -106.22805786132812, + "logps/rejected": -126.52880096435547, + "loss": 0.5494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7802702188491821, + "rewards/margins": 1.9728052616119385, + "rewards/rejected": -2.753075361251831, + "step": 3822 + }, + { + "epoch": 0.44, + "learning_rate": 1.7034999414725505e-07, + "logits/chosen": -2.396212577819824, + "logits/rejected": -2.4823262691497803, + "logps/chosen": -785.3677368164062, + "logps/rejected": -338.17596435546875, + "loss": 0.3644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.995932936668396, + "rewards/margins": 1.824735403060913, + "rewards/rejected": -2.8206682205200195, + "step": 3823 + }, + { + "epoch": 0.44, + "learning_rate": 1.7031487767763078e-07, + "logits/chosen": -2.541182041168213, + "logits/rejected": -2.3530220985412598, + "logps/chosen": -425.4764099121094, + "logps/rejected": -381.4991455078125, + "loss": 0.2921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0921481847763062, + "rewards/margins": 1.7403736114501953, + "rewards/rejected": -2.832521915435791, + "step": 3824 + }, + { + "epoch": 0.44, + "learning_rate": 1.7027976120800654e-07, + "logits/chosen": -2.562185525894165, + "logits/rejected": -2.473237991333008, + "logps/chosen": -173.1590118408203, + "logps/rejected": -259.73577880859375, + "loss": 0.537, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5765318870544434, + "rewards/margins": 0.6767390966415405, + "rewards/rejected": -2.2532708644866943, + "step": 3825 + }, + { + "epoch": 0.44, + "learning_rate": 1.7024464473838232e-07, + "logits/chosen": -2.6868183612823486, + "logits/rejected": -2.60728120803833, + "logps/chosen": -193.79299926757812, + "logps/rejected": -167.25088500976562, + "loss": 0.3131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6375921368598938, + "rewards/margins": 2.3112874031066895, + "rewards/rejected": -2.9488797187805176, + "step": 3826 + }, + { + "epoch": 0.44, + "learning_rate": 1.7020952826875802e-07, + "logits/chosen": -2.785830497741699, + "logits/rejected": -2.6693882942199707, + "logps/chosen": -210.12884521484375, + "logps/rejected": -314.5447082519531, + "loss": 0.2416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6539344191551208, + "rewards/margins": 2.9536802768707275, + "rewards/rejected": -3.607614517211914, + "step": 3827 + }, + { + "epoch": 0.44, + "learning_rate": 1.701744117991338e-07, + "logits/chosen": -2.508934497833252, + "logits/rejected": -2.5173747539520264, + "logps/chosen": -214.94412231445312, + "logps/rejected": -123.19868469238281, + "loss": 0.6218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9345152974128723, + "rewards/margins": 0.986547589302063, + "rewards/rejected": -1.92106294631958, + "step": 3828 + }, + { + "epoch": 0.44, + "learning_rate": 1.7013929532950955e-07, + "logits/chosen": -2.4063234329223633, + "logits/rejected": -2.337643623352051, + "logps/chosen": -246.22854614257812, + "logps/rejected": -232.1806182861328, + "loss": 0.1471, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.289278507232666, + "rewards/margins": 3.0897247791290283, + "rewards/rejected": -4.379003524780273, + "step": 3829 + }, + { + "epoch": 0.44, + "learning_rate": 1.7010417885988528e-07, + "logits/chosen": -2.2389886379241943, + "logits/rejected": -2.5914220809936523, + "logps/chosen": -376.1381530761719, + "logps/rejected": -268.6247253417969, + "loss": 0.9554, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2949141263961792, + "rewards/margins": 1.7892265319824219, + "rewards/rejected": -3.0841405391693115, + "step": 3830 + }, + { + "epoch": 0.44, + "learning_rate": 1.7006906239026103e-07, + "logits/chosen": -1.9994516372680664, + "logits/rejected": -1.8566360473632812, + "logps/chosen": -391.94354248046875, + "logps/rejected": -359.2002258300781, + "loss": 1.4842, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5864529609680176, + "rewards/margins": 0.5494543313980103, + "rewards/rejected": -3.1359076499938965, + "step": 3831 + }, + { + "epoch": 0.44, + "learning_rate": 1.7003394592063676e-07, + "logits/chosen": -2.055615186691284, + "logits/rejected": -1.905775547027588, + "logps/chosen": -267.9408874511719, + "logps/rejected": -233.90447998046875, + "loss": 0.7587, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3144652843475342, + "rewards/margins": 1.3423736095428467, + "rewards/rejected": -2.656838893890381, + "step": 3832 + }, + { + "epoch": 0.44, + "learning_rate": 1.6999882945101251e-07, + "logits/chosen": -1.7328236103057861, + "logits/rejected": -2.203352212905884, + "logps/chosen": -511.1704406738281, + "logps/rejected": -325.9073791503906, + "loss": 0.2155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3010881841182709, + "rewards/margins": 2.3186588287353516, + "rewards/rejected": -2.6197469234466553, + "step": 3833 + }, + { + "epoch": 0.44, + "learning_rate": 1.6996371298138827e-07, + "logits/chosen": -2.2971949577331543, + "logits/rejected": -2.304630994796753, + "logps/chosen": -183.03765869140625, + "logps/rejected": -228.65835571289062, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6490246653556824, + "rewards/margins": 2.614898204803467, + "rewards/rejected": -3.263922691345215, + "step": 3834 + }, + { + "epoch": 0.44, + "learning_rate": 1.69928596511764e-07, + "logits/chosen": -2.1177027225494385, + "logits/rejected": -2.459933042526245, + "logps/chosen": -354.3540344238281, + "logps/rejected": -200.8555908203125, + "loss": 0.2785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7284502983093262, + "rewards/margins": 2.0306739807128906, + "rewards/rejected": -2.759124279022217, + "step": 3835 + }, + { + "epoch": 0.44, + "learning_rate": 1.6989348004213975e-07, + "logits/chosen": -2.3157310485839844, + "logits/rejected": -1.8686115741729736, + "logps/chosen": -208.85894775390625, + "logps/rejected": -368.54327392578125, + "loss": 0.2539, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0550193786621094, + "rewards/margins": 3.7284369468688965, + "rewards/rejected": -4.783456325531006, + "step": 3836 + }, + { + "epoch": 0.44, + "learning_rate": 1.6985836357251553e-07, + "logits/chosen": -2.7008628845214844, + "logits/rejected": -2.6553561687469482, + "logps/chosen": -111.456298828125, + "logps/rejected": -193.4256591796875, + "loss": 0.3327, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0174766778945923, + "rewards/margins": 1.54151451587677, + "rewards/rejected": -2.5589914321899414, + "step": 3837 + }, + { + "epoch": 0.44, + "learning_rate": 1.6982324710289123e-07, + "logits/chosen": -2.6025190353393555, + "logits/rejected": -2.3951308727264404, + "logps/chosen": -162.66737365722656, + "logps/rejected": -317.6391296386719, + "loss": 0.2247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8099106550216675, + "rewards/margins": 3.853424310684204, + "rewards/rejected": -4.663334846496582, + "step": 3838 + }, + { + "epoch": 0.44, + "learning_rate": 1.69788130633267e-07, + "logits/chosen": -2.7469592094421387, + "logits/rejected": -2.515538215637207, + "logps/chosen": -145.68234252929688, + "logps/rejected": -385.319091796875, + "loss": 0.3312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8621686697006226, + "rewards/margins": 3.669740676879883, + "rewards/rejected": -4.531909942626953, + "step": 3839 + }, + { + "epoch": 0.44, + "learning_rate": 1.6975301416364274e-07, + "logits/chosen": -2.396146059036255, + "logits/rejected": -2.096831798553467, + "logps/chosen": -275.350830078125, + "logps/rejected": -338.5391845703125, + "loss": 0.3754, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4271667003631592, + "rewards/margins": 3.0996124744415283, + "rewards/rejected": -4.5267791748046875, + "step": 3840 + }, + { + "epoch": 0.44, + "learning_rate": 1.697178976940185e-07, + "logits/chosen": -2.111196517944336, + "logits/rejected": -2.0692288875579834, + "logps/chosen": -235.7439422607422, + "logps/rejected": -285.3548583984375, + "loss": 0.3301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8608933687210083, + "rewards/margins": 2.1064467430114746, + "rewards/rejected": -2.9673402309417725, + "step": 3841 + }, + { + "epoch": 0.44, + "learning_rate": 1.6968278122439425e-07, + "logits/chosen": -2.9010581970214844, + "logits/rejected": -2.7713873386383057, + "logps/chosen": -271.1654968261719, + "logps/rejected": -226.99192810058594, + "loss": 0.4974, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7628979682922363, + "rewards/margins": 0.7074510455131531, + "rewards/rejected": -1.4703489542007446, + "step": 3842 + }, + { + "epoch": 0.44, + "learning_rate": 1.6964766475476997e-07, + "logits/chosen": -2.396634340286255, + "logits/rejected": -2.704097270965576, + "logps/chosen": -238.7843017578125, + "logps/rejected": -176.95297241210938, + "loss": 0.4103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8171848654747009, + "rewards/margins": 2.7229878902435303, + "rewards/rejected": -3.540173053741455, + "step": 3843 + }, + { + "epoch": 0.44, + "learning_rate": 1.6961254828514573e-07, + "logits/chosen": -2.164621353149414, + "logits/rejected": -2.353060722351074, + "logps/chosen": -217.20126342773438, + "logps/rejected": -277.1040954589844, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5746141672134399, + "rewards/margins": 1.607774257659912, + "rewards/rejected": -2.1823887825012207, + "step": 3844 + }, + { + "epoch": 0.44, + "learning_rate": 1.6957743181552148e-07, + "logits/chosen": -1.9378867149353027, + "logits/rejected": -1.8295831680297852, + "logps/chosen": -319.3940124511719, + "logps/rejected": -335.7062072753906, + "loss": 0.3804, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.244486093521118, + "rewards/margins": 6.258852958679199, + "rewards/rejected": -9.503338813781738, + "step": 3845 + }, + { + "epoch": 0.44, + "learning_rate": 1.695423153458972e-07, + "logits/chosen": -2.6468312740325928, + "logits/rejected": -2.5438356399536133, + "logps/chosen": -274.332763671875, + "logps/rejected": -293.52288818359375, + "loss": 0.4495, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.439146637916565, + "rewards/margins": 1.709340214729309, + "rewards/rejected": -3.148486852645874, + "step": 3846 + }, + { + "epoch": 0.44, + "learning_rate": 1.6950719887627296e-07, + "logits/chosen": -2.5233469009399414, + "logits/rejected": -2.3268680572509766, + "logps/chosen": -223.75515747070312, + "logps/rejected": -287.9001159667969, + "loss": 0.3839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45235246419906616, + "rewards/margins": 2.2073588371276855, + "rewards/rejected": -2.6597113609313965, + "step": 3847 + }, + { + "epoch": 0.44, + "learning_rate": 1.694720824066487e-07, + "logits/chosen": -2.1960020065307617, + "logits/rejected": -2.27681303024292, + "logps/chosen": -244.76954650878906, + "logps/rejected": -202.34121704101562, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9657535552978516, + "rewards/margins": 1.685014247894287, + "rewards/rejected": -3.6507678031921387, + "step": 3848 + }, + { + "epoch": 0.44, + "learning_rate": 1.6943696593702444e-07, + "logits/chosen": -1.8110623359680176, + "logits/rejected": -1.6203831434249878, + "logps/chosen": -337.7230224609375, + "logps/rejected": -385.0762939453125, + "loss": 0.4027, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9598989486694336, + "rewards/margins": 1.8775713443756104, + "rewards/rejected": -2.837470293045044, + "step": 3849 + }, + { + "epoch": 0.44, + "learning_rate": 1.6940184946740022e-07, + "logits/chosen": -2.5655884742736816, + "logits/rejected": -2.4817593097686768, + "logps/chosen": -263.61346435546875, + "logps/rejected": -260.5318298339844, + "loss": 0.432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6437438726425171, + "rewards/margins": 1.8326096534729004, + "rewards/rejected": -2.476353406906128, + "step": 3850 + }, + { + "epoch": 0.44, + "learning_rate": 1.6936673299777595e-07, + "logits/chosen": -2.316171407699585, + "logits/rejected": -2.173011064529419, + "logps/chosen": -370.2538757324219, + "logps/rejected": -351.9009704589844, + "loss": 0.4972, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.32912015914917, + "rewards/margins": 1.4170351028442383, + "rewards/rejected": -2.746155261993408, + "step": 3851 + }, + { + "epoch": 0.44, + "learning_rate": 1.693316165281517e-07, + "logits/chosen": -2.662249803543091, + "logits/rejected": -2.4622302055358887, + "logps/chosen": -202.63427734375, + "logps/rejected": -288.9203796386719, + "loss": 0.8993, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1407333612442017, + "rewards/margins": 0.20629583299160004, + "rewards/rejected": -1.3470290899276733, + "step": 3852 + }, + { + "epoch": 0.44, + "learning_rate": 1.6929650005852746e-07, + "logits/chosen": -2.4718611240386963, + "logits/rejected": -2.3716940879821777, + "logps/chosen": -282.96917724609375, + "logps/rejected": -295.9461364746094, + "loss": 0.5279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.977541983127594, + "rewards/margins": 1.0777513980865479, + "rewards/rejected": -2.055293560028076, + "step": 3853 + }, + { + "epoch": 0.44, + "learning_rate": 1.6926138358890319e-07, + "logits/chosen": -2.7247376441955566, + "logits/rejected": -2.74924898147583, + "logps/chosen": -134.55679321289062, + "logps/rejected": -322.955322265625, + "loss": 0.4355, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1151797771453857, + "rewards/margins": 3.2267191410064697, + "rewards/rejected": -4.3418989181518555, + "step": 3854 + }, + { + "epoch": 0.44, + "learning_rate": 1.6922626711927894e-07, + "logits/chosen": -2.1495277881622314, + "logits/rejected": -2.3103737831115723, + "logps/chosen": -318.9541931152344, + "logps/rejected": -448.09417724609375, + "loss": 0.5535, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3611186742782593, + "rewards/margins": 2.699557304382324, + "rewards/rejected": -4.060676097869873, + "step": 3855 + }, + { + "epoch": 0.44, + "learning_rate": 1.6919115064965467e-07, + "logits/chosen": -2.490183115005493, + "logits/rejected": -2.7499067783355713, + "logps/chosen": -208.47027587890625, + "logps/rejected": -174.536376953125, + "loss": 0.1434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8582277297973633, + "rewards/margins": 3.7215752601623535, + "rewards/rejected": -4.579802989959717, + "step": 3856 + }, + { + "epoch": 0.44, + "learning_rate": 1.6915603418003042e-07, + "logits/chosen": -2.308081865310669, + "logits/rejected": -2.2577991485595703, + "logps/chosen": -312.4014892578125, + "logps/rejected": -270.7806396484375, + "loss": 0.1766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7627633213996887, + "rewards/margins": 2.1538267135620117, + "rewards/rejected": -2.9165902137756348, + "step": 3857 + }, + { + "epoch": 0.44, + "learning_rate": 1.6912091771040617e-07, + "logits/chosen": -2.535581111907959, + "logits/rejected": -2.554657459259033, + "logps/chosen": -399.87469482421875, + "logps/rejected": -323.4244079589844, + "loss": 0.2138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5259734392166138, + "rewards/margins": 2.8251101970672607, + "rewards/rejected": -3.351083517074585, + "step": 3858 + }, + { + "epoch": 0.44, + "learning_rate": 1.690858012407819e-07, + "logits/chosen": -2.40118408203125, + "logits/rejected": -2.247591018676758, + "logps/chosen": -128.9613494873047, + "logps/rejected": -232.43878173828125, + "loss": 0.3155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7167032957077026, + "rewards/margins": 2.3900492191314697, + "rewards/rejected": -3.106752634048462, + "step": 3859 + }, + { + "epoch": 0.44, + "learning_rate": 1.6905068477115768e-07, + "logits/chosen": -2.4220447540283203, + "logits/rejected": -2.523256540298462, + "logps/chosen": -241.8350372314453, + "logps/rejected": -180.77345275878906, + "loss": 0.9679, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5694873332977295, + "rewards/margins": 1.0008337497711182, + "rewards/rejected": -2.5703210830688477, + "step": 3860 + }, + { + "epoch": 0.45, + "learning_rate": 1.6901556830153338e-07, + "logits/chosen": -2.655475616455078, + "logits/rejected": -2.7332279682159424, + "logps/chosen": -328.1018371582031, + "logps/rejected": -311.28411865234375, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7047428488731384, + "rewards/margins": 1.9517483711242676, + "rewards/rejected": -2.656491279602051, + "step": 3861 + }, + { + "epoch": 0.45, + "learning_rate": 1.6898045183190916e-07, + "logits/chosen": -1.804803490638733, + "logits/rejected": -1.8044352531433105, + "logps/chosen": -283.644775390625, + "logps/rejected": -242.26553344726562, + "loss": 0.5458, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.147869348526001, + "rewards/margins": 0.9420067071914673, + "rewards/rejected": -2.0898759365081787, + "step": 3862 + }, + { + "epoch": 0.45, + "learning_rate": 1.6894533536228492e-07, + "logits/chosen": -1.9733535051345825, + "logits/rejected": -2.2197000980377197, + "logps/chosen": -318.1171875, + "logps/rejected": -281.4585266113281, + "loss": 0.483, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.198301911354065, + "rewards/margins": 2.338026762008667, + "rewards/rejected": -3.5363285541534424, + "step": 3863 + }, + { + "epoch": 0.45, + "learning_rate": 1.6891021889266065e-07, + "logits/chosen": -2.4149527549743652, + "logits/rejected": -2.308739185333252, + "logps/chosen": -227.6942138671875, + "logps/rejected": -248.65647888183594, + "loss": 0.8888, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.357154607772827, + "rewards/margins": 1.8728351593017578, + "rewards/rejected": -4.229990005493164, + "step": 3864 + }, + { + "epoch": 0.45, + "learning_rate": 1.688751024230364e-07, + "logits/chosen": -2.178861141204834, + "logits/rejected": -2.3841474056243896, + "logps/chosen": -164.81985473632812, + "logps/rejected": -194.00381469726562, + "loss": 0.5021, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8989822864532471, + "rewards/margins": 2.089539051055908, + "rewards/rejected": -2.9885213375091553, + "step": 3865 + }, + { + "epoch": 0.45, + "learning_rate": 1.6883998595341215e-07, + "logits/chosen": -2.70060658454895, + "logits/rejected": -2.750944137573242, + "logps/chosen": -253.7908935546875, + "logps/rejected": -250.34744262695312, + "loss": 0.5523, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.514293909072876, + "rewards/margins": 1.8248642683029175, + "rewards/rejected": -3.339158058166504, + "step": 3866 + }, + { + "epoch": 0.45, + "learning_rate": 1.6880486948378788e-07, + "logits/chosen": -2.345555543899536, + "logits/rejected": -2.3327012062072754, + "logps/chosen": -423.1318664550781, + "logps/rejected": -284.002197265625, + "loss": 0.3466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2346450090408325, + "rewards/margins": 1.78971266746521, + "rewards/rejected": -3.024357795715332, + "step": 3867 + }, + { + "epoch": 0.45, + "learning_rate": 1.6876975301416363e-07, + "logits/chosen": -2.5621397495269775, + "logits/rejected": -2.7196919918060303, + "logps/chosen": -229.7518310546875, + "logps/rejected": -157.27088928222656, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7419559359550476, + "rewards/margins": 1.2828272581100464, + "rewards/rejected": -2.024783134460449, + "step": 3868 + }, + { + "epoch": 0.45, + "learning_rate": 1.6873463654453936e-07, + "logits/chosen": -2.0808334350585938, + "logits/rejected": -2.517530918121338, + "logps/chosen": -433.5489196777344, + "logps/rejected": -255.83746337890625, + "loss": 0.3705, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9852501153945923, + "rewards/margins": 1.6402897834777832, + "rewards/rejected": -2.625540018081665, + "step": 3869 + }, + { + "epoch": 0.45, + "learning_rate": 1.6869952007491512e-07, + "logits/chosen": -1.624910593032837, + "logits/rejected": -2.077883243560791, + "logps/chosen": -374.1701965332031, + "logps/rejected": -176.0764923095703, + "loss": 0.5449, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8722538948059082, + "rewards/margins": 1.3390769958496094, + "rewards/rejected": -2.2113308906555176, + "step": 3870 + }, + { + "epoch": 0.45, + "learning_rate": 1.686644036052909e-07, + "logits/chosen": -2.648261547088623, + "logits/rejected": -2.7847461700439453, + "logps/chosen": -270.55975341796875, + "logps/rejected": -263.1404113769531, + "loss": 0.9637, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5968892574310303, + "rewards/margins": 2.340385913848877, + "rewards/rejected": -3.9372751712799072, + "step": 3871 + }, + { + "epoch": 0.45, + "learning_rate": 1.686292871356666e-07, + "logits/chosen": -2.6994223594665527, + "logits/rejected": -2.7388439178466797, + "logps/chosen": -72.91661834716797, + "logps/rejected": -252.03713989257812, + "loss": 0.2991, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1615432500839233, + "rewards/margins": 3.395587921142578, + "rewards/rejected": -4.557131290435791, + "step": 3872 + }, + { + "epoch": 0.45, + "learning_rate": 1.6859417066604238e-07, + "logits/chosen": -2.001394748687744, + "logits/rejected": -2.2318286895751953, + "logps/chosen": -342.262939453125, + "logps/rejected": -383.23492431640625, + "loss": 0.3047, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8302783966064453, + "rewards/margins": 2.6285574436187744, + "rewards/rejected": -3.4588356018066406, + "step": 3873 + }, + { + "epoch": 0.45, + "learning_rate": 1.6855905419641813e-07, + "logits/chosen": -1.9785590171813965, + "logits/rejected": -2.401143789291382, + "logps/chosen": -317.91473388671875, + "logps/rejected": -309.1957702636719, + "loss": 0.2273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5809940099716187, + "rewards/margins": 2.453404664993286, + "rewards/rejected": -3.0343987941741943, + "step": 3874 + }, + { + "epoch": 0.45, + "learning_rate": 1.6852393772679386e-07, + "logits/chosen": -2.2806308269500732, + "logits/rejected": -2.26350474357605, + "logps/chosen": -263.7696533203125, + "logps/rejected": -230.71302795410156, + "loss": 0.3086, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0054008960723877, + "rewards/margins": 2.321216583251953, + "rewards/rejected": -3.326617479324341, + "step": 3875 + }, + { + "epoch": 0.45, + "learning_rate": 1.684888212571696e-07, + "logits/chosen": -2.0216710567474365, + "logits/rejected": -2.1257104873657227, + "logps/chosen": -522.439453125, + "logps/rejected": -406.8453674316406, + "loss": 0.2601, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7269277572631836, + "rewards/margins": 3.2683422565460205, + "rewards/rejected": -3.995269775390625, + "step": 3876 + }, + { + "epoch": 0.45, + "learning_rate": 1.6845370478754534e-07, + "logits/chosen": -2.5984489917755127, + "logits/rejected": -2.693237066268921, + "logps/chosen": -287.7395935058594, + "logps/rejected": -293.3466491699219, + "loss": 0.3993, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6271222233772278, + "rewards/margins": 2.3340156078338623, + "rewards/rejected": -2.9611380100250244, + "step": 3877 + }, + { + "epoch": 0.45, + "learning_rate": 1.684185883179211e-07, + "logits/chosen": -2.274710178375244, + "logits/rejected": -2.747851848602295, + "logps/chosen": -486.3345642089844, + "logps/rejected": -331.7602233886719, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.048789918422698975, + "rewards/margins": 3.446607828140259, + "rewards/rejected": -3.3978183269500732, + "step": 3878 + }, + { + "epoch": 0.45, + "learning_rate": 1.6838347184829685e-07, + "logits/chosen": -2.8639280796051025, + "logits/rejected": -2.947147846221924, + "logps/chosen": -264.0348205566406, + "logps/rejected": -234.38348388671875, + "loss": 0.2961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26648184657096863, + "rewards/margins": 2.187169313430786, + "rewards/rejected": -2.453651189804077, + "step": 3879 + }, + { + "epoch": 0.45, + "learning_rate": 1.6834835537867257e-07, + "logits/chosen": -2.4144134521484375, + "logits/rejected": -2.1904680728912354, + "logps/chosen": -223.54660034179688, + "logps/rejected": -230.32998657226562, + "loss": 0.7741, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8224616050720215, + "rewards/margins": 0.3560320734977722, + "rewards/rejected": -2.1784937381744385, + "step": 3880 + }, + { + "epoch": 0.45, + "learning_rate": 1.6831323890904833e-07, + "logits/chosen": -2.4615020751953125, + "logits/rejected": -2.628901958465576, + "logps/chosen": -448.62762451171875, + "logps/rejected": -363.7452392578125, + "loss": 0.2678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8846416473388672, + "rewards/margins": 2.6028952598571777, + "rewards/rejected": -3.487536907196045, + "step": 3881 + }, + { + "epoch": 0.45, + "learning_rate": 1.682781224394241e-07, + "logits/chosen": -2.321704626083374, + "logits/rejected": -2.2132792472839355, + "logps/chosen": -318.7002868652344, + "logps/rejected": -307.8858337402344, + "loss": 0.204, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0417630672454834, + "rewards/margins": 2.782345771789551, + "rewards/rejected": -3.824108600616455, + "step": 3882 + }, + { + "epoch": 0.45, + "learning_rate": 1.682430059697998e-07, + "logits/chosen": -2.138943910598755, + "logits/rejected": -2.139854907989502, + "logps/chosen": -333.4380798339844, + "logps/rejected": -366.00701904296875, + "loss": 0.2909, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37894147634506226, + "rewards/margins": 2.0328257083892822, + "rewards/rejected": -2.4117672443389893, + "step": 3883 + }, + { + "epoch": 0.45, + "learning_rate": 1.682078895001756e-07, + "logits/chosen": -2.1793739795684814, + "logits/rejected": -2.1470141410827637, + "logps/chosen": -228.85585021972656, + "logps/rejected": -300.19921875, + "loss": 1.0744, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.100628614425659, + "rewards/margins": 1.4144564867019653, + "rewards/rejected": -3.515085220336914, + "step": 3884 + }, + { + "epoch": 0.45, + "learning_rate": 1.6817277303055132e-07, + "logits/chosen": -1.8313441276550293, + "logits/rejected": -1.8767473697662354, + "logps/chosen": -534.5008544921875, + "logps/rejected": -396.6796569824219, + "loss": 0.8144, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6197255253791809, + "rewards/margins": 0.45437806844711304, + "rewards/rejected": -1.074103593826294, + "step": 3885 + }, + { + "epoch": 0.45, + "learning_rate": 1.6813765656092707e-07, + "logits/chosen": -2.402069091796875, + "logits/rejected": -2.4686436653137207, + "logps/chosen": -425.7051086425781, + "logps/rejected": -367.4908142089844, + "loss": 0.298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6556885242462158, + "rewards/margins": 2.06441068649292, + "rewards/rejected": -2.7200992107391357, + "step": 3886 + }, + { + "epoch": 0.45, + "learning_rate": 1.6810254009130283e-07, + "logits/chosen": -1.9283297061920166, + "logits/rejected": -2.2286739349365234, + "logps/chosen": -544.8074951171875, + "logps/rejected": -255.74310302734375, + "loss": 0.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0233490467071533, + "rewards/margins": 3.8382935523986816, + "rewards/rejected": -4.861642360687256, + "step": 3887 + }, + { + "epoch": 0.45, + "learning_rate": 1.6806742362167855e-07, + "logits/chosen": -2.693471908569336, + "logits/rejected": -2.842181444168091, + "logps/chosen": -403.2344665527344, + "logps/rejected": -265.2064208984375, + "loss": 0.1485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40123385190963745, + "rewards/margins": 4.256871700286865, + "rewards/rejected": -4.658105373382568, + "step": 3888 + }, + { + "epoch": 0.45, + "learning_rate": 1.680323071520543e-07, + "logits/chosen": -2.5842819213867188, + "logits/rejected": -2.0970418453216553, + "logps/chosen": -340.6988830566406, + "logps/rejected": -297.6927490234375, + "loss": 0.392, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0274913311004639, + "rewards/margins": 2.1879653930664062, + "rewards/rejected": -3.215456962585449, + "step": 3889 + }, + { + "epoch": 0.45, + "learning_rate": 1.6799719068243006e-07, + "logits/chosen": -2.356572151184082, + "logits/rejected": -2.275865316390991, + "logps/chosen": -238.2802276611328, + "logps/rejected": -272.9028015136719, + "loss": 0.4095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6386374831199646, + "rewards/margins": 1.8817533254623413, + "rewards/rejected": -2.520390748977661, + "step": 3890 + }, + { + "epoch": 0.45, + "learning_rate": 1.679620742128058e-07, + "logits/chosen": -2.2658157348632812, + "logits/rejected": -2.1632251739501953, + "logps/chosen": -207.111083984375, + "logps/rejected": -211.40928649902344, + "loss": 0.8313, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.389183759689331, + "rewards/margins": 0.17532210052013397, + "rewards/rejected": -1.564505696296692, + "step": 3891 + }, + { + "epoch": 0.45, + "learning_rate": 1.6792695774318154e-07, + "logits/chosen": -2.0859017372131348, + "logits/rejected": -2.0710954666137695, + "logps/chosen": -257.31341552734375, + "logps/rejected": -337.41168212890625, + "loss": 0.3005, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.394957184791565, + "rewards/margins": 3.253206968307495, + "rewards/rejected": -4.64816427230835, + "step": 3892 + }, + { + "epoch": 0.45, + "learning_rate": 1.6789184127355727e-07, + "logits/chosen": -2.6692657470703125, + "logits/rejected": -2.351573944091797, + "logps/chosen": -164.9207763671875, + "logps/rejected": -348.96258544921875, + "loss": 0.2088, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4579508900642395, + "rewards/margins": 3.951700210571289, + "rewards/rejected": -4.409650802612305, + "step": 3893 + }, + { + "epoch": 0.45, + "learning_rate": 1.6785672480393305e-07, + "logits/chosen": -2.20528244972229, + "logits/rejected": -2.3277711868286133, + "logps/chosen": -338.5162658691406, + "logps/rejected": -343.5836181640625, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5111141204833984, + "rewards/margins": 3.5628273487091064, + "rewards/rejected": -4.073941707611084, + "step": 3894 + }, + { + "epoch": 0.45, + "learning_rate": 1.678216083343088e-07, + "logits/chosen": -1.9699783325195312, + "logits/rejected": -2.0974040031433105, + "logps/chosen": -353.06085205078125, + "logps/rejected": -174.5726318359375, + "loss": 0.3087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33197033405303955, + "rewards/margins": 1.834250569343567, + "rewards/rejected": -2.1662209033966064, + "step": 3895 + }, + { + "epoch": 0.45, + "learning_rate": 1.6778649186468453e-07, + "logits/chosen": -1.8004130125045776, + "logits/rejected": -2.063185930252075, + "logps/chosen": -404.6912841796875, + "logps/rejected": -305.9167785644531, + "loss": 0.3461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24612057209014893, + "rewards/margins": 2.588975429534912, + "rewards/rejected": -2.8350958824157715, + "step": 3896 + }, + { + "epoch": 0.45, + "learning_rate": 1.6775137539506028e-07, + "logits/chosen": -2.5322301387786865, + "logits/rejected": -2.7294468879699707, + "logps/chosen": -405.017578125, + "logps/rejected": -132.91314697265625, + "loss": 0.4637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8275690078735352, + "rewards/margins": 1.4745792150497437, + "rewards/rejected": -2.3021481037139893, + "step": 3897 + }, + { + "epoch": 0.45, + "learning_rate": 1.6771625892543604e-07, + "logits/chosen": -2.350473403930664, + "logits/rejected": -2.4486780166625977, + "logps/chosen": -365.1645202636719, + "logps/rejected": -407.29205322265625, + "loss": 0.529, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1760094165802002, + "rewards/margins": 3.1276440620422363, + "rewards/rejected": -4.303653240203857, + "step": 3898 + }, + { + "epoch": 0.45, + "learning_rate": 1.6768114245581177e-07, + "logits/chosen": -1.8065412044525146, + "logits/rejected": -2.2734174728393555, + "logps/chosen": -328.6831359863281, + "logps/rejected": -174.785888671875, + "loss": 0.8697, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0327833890914917, + "rewards/margins": 0.6389006972312927, + "rewards/rejected": -1.6716841459274292, + "step": 3899 + }, + { + "epoch": 0.45, + "learning_rate": 1.6764602598618752e-07, + "logits/chosen": -2.6763739585876465, + "logits/rejected": -2.692064046859741, + "logps/chosen": -232.57421875, + "logps/rejected": -187.75515747070312, + "loss": 0.1516, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4679398238658905, + "rewards/margins": 3.25698184967041, + "rewards/rejected": -3.724921703338623, + "step": 3900 + }, + { + "epoch": 0.45, + "learning_rate": 1.6761090951656325e-07, + "logits/chosen": -1.6599013805389404, + "logits/rejected": -1.6679706573486328, + "logps/chosen": -312.2943115234375, + "logps/rejected": -310.36578369140625, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8676731586456299, + "rewards/margins": 2.1755664348602295, + "rewards/rejected": -3.0432395935058594, + "step": 3901 + }, + { + "epoch": 0.45, + "learning_rate": 1.67575793046939e-07, + "logits/chosen": -2.193258762359619, + "logits/rejected": -2.130298137664795, + "logps/chosen": -216.81927490234375, + "logps/rejected": -185.86717224121094, + "loss": 0.4216, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7464454174041748, + "rewards/margins": 0.8883932828903198, + "rewards/rejected": -1.6348387002944946, + "step": 3902 + }, + { + "epoch": 0.45, + "learning_rate": 1.6754067657731475e-07, + "logits/chosen": -2.121302843093872, + "logits/rejected": -2.396516799926758, + "logps/chosen": -498.60797119140625, + "logps/rejected": -284.13330078125, + "loss": 0.2633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03645532578229904, + "rewards/margins": 2.3395495414733887, + "rewards/rejected": -2.376005172729492, + "step": 3903 + }, + { + "epoch": 0.45, + "learning_rate": 1.6750556010769048e-07, + "logits/chosen": -2.926079750061035, + "logits/rejected": -2.8638927936553955, + "logps/chosen": -313.7756652832031, + "logps/rejected": -287.32196044921875, + "loss": 0.2763, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40858057141304016, + "rewards/margins": 2.4522182941436768, + "rewards/rejected": -2.8607988357543945, + "step": 3904 + }, + { + "epoch": 0.45, + "learning_rate": 1.6747044363806626e-07, + "logits/chosen": -1.9952495098114014, + "logits/rejected": -2.144815683364868, + "logps/chosen": -275.61859130859375, + "logps/rejected": -256.12347412109375, + "loss": 0.3207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6907277703285217, + "rewards/margins": 2.9106369018554688, + "rewards/rejected": -3.6013646125793457, + "step": 3905 + }, + { + "epoch": 0.45, + "learning_rate": 1.6743532716844196e-07, + "logits/chosen": -2.424299716949463, + "logits/rejected": -2.7231082916259766, + "logps/chosen": -264.89532470703125, + "logps/rejected": -213.49911499023438, + "loss": 1.247, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.630220413208008, + "rewards/margins": 1.0574792623519897, + "rewards/rejected": -3.687699556350708, + "step": 3906 + }, + { + "epoch": 0.45, + "learning_rate": 1.6740021069881774e-07, + "logits/chosen": -1.890264630317688, + "logits/rejected": -1.6884386539459229, + "logps/chosen": -350.762451171875, + "logps/rejected": -281.709716796875, + "loss": 0.7313, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.463233470916748, + "rewards/margins": 0.8777304887771606, + "rewards/rejected": -2.340963840484619, + "step": 3907 + }, + { + "epoch": 0.45, + "learning_rate": 1.673650942291935e-07, + "logits/chosen": -1.5632680654525757, + "logits/rejected": -1.330673098564148, + "logps/chosen": -291.8047180175781, + "logps/rejected": -354.3717346191406, + "loss": 0.4819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3324914276599884, + "rewards/margins": 1.5976530313491821, + "rewards/rejected": -1.9301445484161377, + "step": 3908 + }, + { + "epoch": 0.45, + "learning_rate": 1.6732997775956922e-07, + "logits/chosen": -2.0959086418151855, + "logits/rejected": -1.8949732780456543, + "logps/chosen": -319.6287536621094, + "logps/rejected": -313.082763671875, + "loss": 0.3246, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6839169263839722, + "rewards/margins": 2.0255908966064453, + "rewards/rejected": -2.709507703781128, + "step": 3909 + }, + { + "epoch": 0.45, + "learning_rate": 1.6729486128994498e-07, + "logits/chosen": -2.650439739227295, + "logits/rejected": -2.5849990844726562, + "logps/chosen": -299.5374755859375, + "logps/rejected": -254.1101837158203, + "loss": 0.5772, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6744197607040405, + "rewards/margins": 1.9315459728240967, + "rewards/rejected": -3.6059656143188477, + "step": 3910 + }, + { + "epoch": 0.45, + "learning_rate": 1.6725974482032073e-07, + "logits/chosen": -1.942318320274353, + "logits/rejected": -2.1666386127471924, + "logps/chosen": -535.3443603515625, + "logps/rejected": -416.123779296875, + "loss": 0.2348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6520534753799438, + "rewards/margins": 1.576141357421875, + "rewards/rejected": -2.2281949520111084, + "step": 3911 + }, + { + "epoch": 0.45, + "learning_rate": 1.6722462835069646e-07, + "logits/chosen": -2.0441551208496094, + "logits/rejected": -2.6030685901641846, + "logps/chosen": -264.31353759765625, + "logps/rejected": -231.41915893554688, + "loss": 0.8932, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8533906936645508, + "rewards/margins": 0.7616713047027588, + "rewards/rejected": -2.6150619983673096, + "step": 3912 + }, + { + "epoch": 0.45, + "learning_rate": 1.6718951188107221e-07, + "logits/chosen": -2.037839889526367, + "logits/rejected": -1.8110264539718628, + "logps/chosen": -342.4415283203125, + "logps/rejected": -427.11480712890625, + "loss": 0.3969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38029903173446655, + "rewards/margins": 2.952770948410034, + "rewards/rejected": -3.3330698013305664, + "step": 3913 + }, + { + "epoch": 0.45, + "learning_rate": 1.6715439541144794e-07, + "logits/chosen": -2.286022901535034, + "logits/rejected": -2.0370707511901855, + "logps/chosen": -245.46554565429688, + "logps/rejected": -245.516357421875, + "loss": 1.0647, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.733682632446289, + "rewards/margins": 1.0124547481536865, + "rewards/rejected": -3.7461373805999756, + "step": 3914 + }, + { + "epoch": 0.45, + "learning_rate": 1.671192789418237e-07, + "logits/chosen": -2.7137162685394287, + "logits/rejected": -2.7353289127349854, + "logps/chosen": -126.3277816772461, + "logps/rejected": -300.56842041015625, + "loss": 0.1197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5386061072349548, + "rewards/margins": 4.3127923011779785, + "rewards/rejected": -4.851398468017578, + "step": 3915 + }, + { + "epoch": 0.45, + "learning_rate": 1.6708416247219948e-07, + "logits/chosen": -2.2575223445892334, + "logits/rejected": -2.522204875946045, + "logps/chosen": -220.23289489746094, + "logps/rejected": -234.27442932128906, + "loss": 0.5727, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2082593441009521, + "rewards/margins": 2.5440258979797363, + "rewards/rejected": -3.7522847652435303, + "step": 3916 + }, + { + "epoch": 0.45, + "learning_rate": 1.6704904600257518e-07, + "logits/chosen": -2.628430128097534, + "logits/rejected": -2.541682481765747, + "logps/chosen": -285.2110290527344, + "logps/rejected": -408.9667053222656, + "loss": 1.1263, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9432406425476074, + "rewards/margins": 2.0418970584869385, + "rewards/rejected": -3.985137462615967, + "step": 3917 + }, + { + "epoch": 0.45, + "learning_rate": 1.6701392953295096e-07, + "logits/chosen": -1.5311832427978516, + "logits/rejected": -2.185250759124756, + "logps/chosen": -519.02783203125, + "logps/rejected": -212.0704345703125, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0743904113769531, + "rewards/margins": 0.8793898820877075, + "rewards/rejected": -1.9537802934646606, + "step": 3918 + }, + { + "epoch": 0.45, + "learning_rate": 1.669788130633267e-07, + "logits/chosen": -2.838197708129883, + "logits/rejected": -2.806173801422119, + "logps/chosen": -218.6263427734375, + "logps/rejected": -318.71075439453125, + "loss": 0.4803, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.377539873123169, + "rewards/margins": 1.75160813331604, + "rewards/rejected": -3.129147529602051, + "step": 3919 + }, + { + "epoch": 0.45, + "learning_rate": 1.6694369659370244e-07, + "logits/chosen": -2.5555882453918457, + "logits/rejected": -2.4554057121276855, + "logps/chosen": -306.2464599609375, + "logps/rejected": -235.08746337890625, + "loss": 0.3947, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6224749088287354, + "rewards/margins": 1.7190605401992798, + "rewards/rejected": -3.3415355682373047, + "step": 3920 + }, + { + "epoch": 0.45, + "learning_rate": 1.669085801240782e-07, + "logits/chosen": -2.4954752922058105, + "logits/rejected": -2.515474557876587, + "logps/chosen": -129.97909545898438, + "logps/rejected": -162.74227905273438, + "loss": 0.7567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9453173875808716, + "rewards/margins": 2.132716178894043, + "rewards/rejected": -3.078033685684204, + "step": 3921 + }, + { + "epoch": 0.45, + "learning_rate": 1.6687346365445392e-07, + "logits/chosen": -2.2682225704193115, + "logits/rejected": -2.729537010192871, + "logps/chosen": -320.0890197753906, + "logps/rejected": -192.741455078125, + "loss": 0.3181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.504666268825531, + "rewards/margins": 1.9281543493270874, + "rewards/rejected": -2.4328205585479736, + "step": 3922 + }, + { + "epoch": 0.45, + "learning_rate": 1.6683834718482967e-07, + "logits/chosen": -1.88706374168396, + "logits/rejected": -1.8779611587524414, + "logps/chosen": -238.85421752929688, + "logps/rejected": -308.3990478515625, + "loss": 0.5424, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6576306819915771, + "rewards/margins": 0.909279465675354, + "rewards/rejected": -2.5669102668762207, + "step": 3923 + }, + { + "epoch": 0.45, + "learning_rate": 1.6680323071520543e-07, + "logits/chosen": -2.499898672103882, + "logits/rejected": -2.465571403503418, + "logps/chosen": -272.7051696777344, + "logps/rejected": -279.75482177734375, + "loss": 0.4031, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5417623519897461, + "rewards/margins": 2.053502082824707, + "rewards/rejected": -2.595264434814453, + "step": 3924 + }, + { + "epoch": 0.45, + "learning_rate": 1.6676811424558115e-07, + "logits/chosen": -2.214298963546753, + "logits/rejected": -1.8112174272537231, + "logps/chosen": -170.46487426757812, + "logps/rejected": -389.0030822753906, + "loss": 0.1448, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2087295055389404, + "rewards/margins": 4.986207008361816, + "rewards/rejected": -6.194936275482178, + "step": 3925 + }, + { + "epoch": 0.45, + "learning_rate": 1.667329977759569e-07, + "logits/chosen": -2.3171114921569824, + "logits/rejected": -2.476522445678711, + "logps/chosen": -326.0372314453125, + "logps/rejected": -236.75759887695312, + "loss": 0.3208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8708253502845764, + "rewards/margins": 2.633080244064331, + "rewards/rejected": -3.503905773162842, + "step": 3926 + }, + { + "epoch": 0.45, + "learning_rate": 1.666978813063327e-07, + "logits/chosen": -2.0250213146209717, + "logits/rejected": -1.7411898374557495, + "logps/chosen": -257.1036376953125, + "logps/rejected": -353.54632568359375, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.7167325019836426, + "rewards/margins": 1.2268048524856567, + "rewards/rejected": -3.9435372352600098, + "step": 3927 + }, + { + "epoch": 0.45, + "learning_rate": 1.6666276483670842e-07, + "logits/chosen": -2.383078098297119, + "logits/rejected": -2.35518217086792, + "logps/chosen": -256.1633605957031, + "logps/rejected": -274.41424560546875, + "loss": 0.2912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5418941378593445, + "rewards/margins": 3.3503599166870117, + "rewards/rejected": -3.892253875732422, + "step": 3928 + }, + { + "epoch": 0.45, + "learning_rate": 1.6662764836708417e-07, + "logits/chosen": -2.4280190467834473, + "logits/rejected": -2.5179803371429443, + "logps/chosen": -342.563232421875, + "logps/rejected": -372.8247985839844, + "loss": 0.3632, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7568670511245728, + "rewards/margins": 3.386565685272217, + "rewards/rejected": -4.1434326171875, + "step": 3929 + }, + { + "epoch": 0.45, + "learning_rate": 1.665925318974599e-07, + "logits/chosen": -2.181550979614258, + "logits/rejected": -2.5746874809265137, + "logps/chosen": -694.3225708007812, + "logps/rejected": -389.36187744140625, + "loss": 0.4647, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.728143334388733, + "rewards/margins": 2.058760643005371, + "rewards/rejected": -3.7869040966033936, + "step": 3930 + }, + { + "epoch": 0.45, + "learning_rate": 1.6655741542783565e-07, + "logits/chosen": -2.1777334213256836, + "logits/rejected": -2.395951271057129, + "logps/chosen": -343.3200378417969, + "logps/rejected": -261.7682800292969, + "loss": 0.2302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6470081806182861, + "rewards/margins": 1.9666869640350342, + "rewards/rejected": -2.6136951446533203, + "step": 3931 + }, + { + "epoch": 0.45, + "learning_rate": 1.665222989582114e-07, + "logits/chosen": -2.5381760597229004, + "logits/rejected": -2.2878317832946777, + "logps/chosen": -214.30079650878906, + "logps/rejected": -279.273681640625, + "loss": 0.2097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6855375170707703, + "rewards/margins": 2.946728229522705, + "rewards/rejected": -3.632266044616699, + "step": 3932 + }, + { + "epoch": 0.45, + "learning_rate": 1.6648718248858713e-07, + "logits/chosen": -1.6251009702682495, + "logits/rejected": -2.0462851524353027, + "logps/chosen": -644.3443603515625, + "logps/rejected": -398.1548767089844, + "loss": 0.4684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6214601993560791, + "rewards/margins": 1.7575912475585938, + "rewards/rejected": -2.3790512084960938, + "step": 3933 + }, + { + "epoch": 0.45, + "learning_rate": 1.6645206601896289e-07, + "logits/chosen": -2.0428545475006104, + "logits/rejected": -2.2873682975769043, + "logps/chosen": -494.8413391113281, + "logps/rejected": -510.33416748046875, + "loss": 1.0242, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0636136531829834, + "rewards/margins": 0.7178366184234619, + "rewards/rejected": -1.7814502716064453, + "step": 3934 + }, + { + "epoch": 0.45, + "learning_rate": 1.6641694954933864e-07, + "logits/chosen": -2.271472692489624, + "logits/rejected": -2.3088722229003906, + "logps/chosen": -340.972900390625, + "logps/rejected": -236.01235961914062, + "loss": 0.3957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4806813895702362, + "rewards/margins": 2.081348180770874, + "rewards/rejected": -2.5620296001434326, + "step": 3935 + }, + { + "epoch": 0.45, + "learning_rate": 1.6638183307971437e-07, + "logits/chosen": -1.952361822128296, + "logits/rejected": -2.256971836090088, + "logps/chosen": -297.3333740234375, + "logps/rejected": -321.8904724121094, + "loss": 0.2517, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1755805015563965, + "rewards/margins": 1.8957786560058594, + "rewards/rejected": -3.071359157562256, + "step": 3936 + }, + { + "epoch": 0.45, + "learning_rate": 1.6634671661009012e-07, + "logits/chosen": -2.5555591583251953, + "logits/rejected": -2.578249454498291, + "logps/chosen": -368.2088623046875, + "logps/rejected": -223.736083984375, + "loss": 0.3435, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6703633069992065, + "rewards/margins": 2.1879239082336426, + "rewards/rejected": -2.8582870960235596, + "step": 3937 + }, + { + "epoch": 0.45, + "learning_rate": 1.6631160014046585e-07, + "logits/chosen": -2.4725899696350098, + "logits/rejected": -2.69741153717041, + "logps/chosen": -370.44989013671875, + "logps/rejected": -307.7783203125, + "loss": 0.2227, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1864956617355347, + "rewards/margins": 3.2115941047668457, + "rewards/rejected": -4.398089408874512, + "step": 3938 + }, + { + "epoch": 0.45, + "learning_rate": 1.6627648367084163e-07, + "logits/chosen": -2.5873496532440186, + "logits/rejected": -2.4030771255493164, + "logps/chosen": -145.5439910888672, + "logps/rejected": -211.23928833007812, + "loss": 0.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7679060101509094, + "rewards/margins": 1.8773808479309082, + "rewards/rejected": -2.645287036895752, + "step": 3939 + }, + { + "epoch": 0.45, + "learning_rate": 1.6624136720121738e-07, + "logits/chosen": -2.8566598892211914, + "logits/rejected": -2.618821144104004, + "logps/chosen": -115.03944396972656, + "logps/rejected": -331.5699157714844, + "loss": 0.3835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4191625714302063, + "rewards/margins": 3.1037139892578125, + "rewards/rejected": -3.522876739501953, + "step": 3940 + }, + { + "epoch": 0.45, + "learning_rate": 1.662062507315931e-07, + "logits/chosen": -2.320673704147339, + "logits/rejected": -1.9668104648590088, + "logps/chosen": -170.67172241210938, + "logps/rejected": -340.9875183105469, + "loss": 0.7381, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1298489570617676, + "rewards/margins": 0.43310147523880005, + "rewards/rejected": -2.562950611114502, + "step": 3941 + }, + { + "epoch": 0.45, + "learning_rate": 1.6617113426196886e-07, + "logits/chosen": -2.63631534576416, + "logits/rejected": -2.430406332015991, + "logps/chosen": -362.1504211425781, + "logps/rejected": -291.4591064453125, + "loss": 0.5603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9715687036514282, + "rewards/margins": 2.3884692192077637, + "rewards/rejected": -3.3600382804870605, + "step": 3942 + }, + { + "epoch": 0.45, + "learning_rate": 1.6613601779234462e-07, + "logits/chosen": -2.381194591522217, + "logits/rejected": -2.418365955352783, + "logps/chosen": -234.16270446777344, + "logps/rejected": -255.93064880371094, + "loss": 0.7103, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8517752885818481, + "rewards/margins": 0.9633183479309082, + "rewards/rejected": -2.815093517303467, + "step": 3943 + }, + { + "epoch": 0.45, + "learning_rate": 1.6610090132272034e-07, + "logits/chosen": -2.9178428649902344, + "logits/rejected": -2.9088361263275146, + "logps/chosen": -270.9840087890625, + "logps/rejected": -181.11781311035156, + "loss": 0.3115, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8070390224456787, + "rewards/margins": 1.9301090240478516, + "rewards/rejected": -2.7371480464935303, + "step": 3944 + }, + { + "epoch": 0.45, + "learning_rate": 1.660657848530961e-07, + "logits/chosen": -2.3470888137817383, + "logits/rejected": -2.2479617595672607, + "logps/chosen": -224.77117919921875, + "logps/rejected": -515.1286010742188, + "loss": 0.3391, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.428180456161499, + "rewards/margins": 1.8124445676803589, + "rewards/rejected": -3.2406249046325684, + "step": 3945 + }, + { + "epoch": 0.45, + "learning_rate": 1.6603066838347183e-07, + "logits/chosen": -1.857641577720642, + "logits/rejected": -2.2756471633911133, + "logps/chosen": -194.7093048095703, + "logps/rejected": -167.49014282226562, + "loss": 2.0258, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.265126943588257, + "rewards/margins": -0.10940897464752197, + "rewards/rejected": -2.1557178497314453, + "step": 3946 + }, + { + "epoch": 0.46, + "learning_rate": 1.6599555191384758e-07, + "logits/chosen": -2.4904651641845703, + "logits/rejected": -2.544860363006592, + "logps/chosen": -291.6348876953125, + "logps/rejected": -177.51475524902344, + "loss": 0.4136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0843847319483757, + "rewards/margins": 0.9495710730552673, + "rewards/rejected": -1.0339558124542236, + "step": 3947 + }, + { + "epoch": 0.46, + "learning_rate": 1.6596043544422333e-07, + "logits/chosen": -3.0518269538879395, + "logits/rejected": -3.0472588539123535, + "logps/chosen": -195.567138671875, + "logps/rejected": -305.7359313964844, + "loss": 0.1728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5385348796844482, + "rewards/margins": 2.8349485397338867, + "rewards/rejected": -3.373483419418335, + "step": 3948 + }, + { + "epoch": 0.46, + "learning_rate": 1.6592531897459906e-07, + "logits/chosen": -2.662065029144287, + "logits/rejected": -2.723919153213501, + "logps/chosen": -272.8433837890625, + "logps/rejected": -294.48541259765625, + "loss": 0.3584, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2933790683746338, + "rewards/margins": 1.820314884185791, + "rewards/rejected": -3.113693952560425, + "step": 3949 + }, + { + "epoch": 0.46, + "learning_rate": 1.6589020250497484e-07, + "logits/chosen": -1.4793457984924316, + "logits/rejected": -1.5228264331817627, + "logps/chosen": -446.78106689453125, + "logps/rejected": -389.153564453125, + "loss": 0.6305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5767887830734253, + "rewards/margins": 1.1029632091522217, + "rewards/rejected": -1.6797521114349365, + "step": 3950 + }, + { + "epoch": 0.46, + "learning_rate": 1.6585508603535054e-07, + "logits/chosen": -2.7055647373199463, + "logits/rejected": -2.9117846488952637, + "logps/chosen": -226.1087188720703, + "logps/rejected": -181.13758850097656, + "loss": 0.8313, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4220025539398193, + "rewards/margins": 1.3542284965515137, + "rewards/rejected": -2.776231050491333, + "step": 3951 + }, + { + "epoch": 0.46, + "learning_rate": 1.6581996956572632e-07, + "logits/chosen": -2.621865749359131, + "logits/rejected": -2.686500072479248, + "logps/chosen": -263.4086608886719, + "logps/rejected": -283.2052917480469, + "loss": 0.1549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42093777656555176, + "rewards/margins": 2.7193713188171387, + "rewards/rejected": -3.1403090953826904, + "step": 3952 + }, + { + "epoch": 0.46, + "learning_rate": 1.6578485309610208e-07, + "logits/chosen": -2.3619301319122314, + "logits/rejected": -2.2805395126342773, + "logps/chosen": -204.8262481689453, + "logps/rejected": -255.6668701171875, + "loss": 0.0995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8075869083404541, + "rewards/margins": 3.1947381496429443, + "rewards/rejected": -4.002325057983398, + "step": 3953 + }, + { + "epoch": 0.46, + "learning_rate": 1.657497366264778e-07, + "logits/chosen": -2.453810453414917, + "logits/rejected": -2.2820851802825928, + "logps/chosen": -203.37246704101562, + "logps/rejected": -275.1080627441406, + "loss": 0.3294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7469391226768494, + "rewards/margins": 2.6281557083129883, + "rewards/rejected": -3.3750948905944824, + "step": 3954 + }, + { + "epoch": 0.46, + "learning_rate": 1.6571462015685356e-07, + "logits/chosen": -2.3608360290527344, + "logits/rejected": -2.11242413520813, + "logps/chosen": -156.99000549316406, + "logps/rejected": -215.59613037109375, + "loss": 0.4998, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3257813453674316, + "rewards/margins": 2.1301074028015137, + "rewards/rejected": -3.4558887481689453, + "step": 3955 + }, + { + "epoch": 0.46, + "learning_rate": 1.656795036872293e-07, + "logits/chosen": -1.5275425910949707, + "logits/rejected": -1.7134920358657837, + "logps/chosen": -413.39056396484375, + "logps/rejected": -367.1795654296875, + "loss": 0.4053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3554207682609558, + "rewards/margins": 1.8353240489959717, + "rewards/rejected": -2.190744638442993, + "step": 3956 + }, + { + "epoch": 0.46, + "learning_rate": 1.6564438721760504e-07, + "logits/chosen": -1.920100450515747, + "logits/rejected": -2.2568626403808594, + "logps/chosen": -471.03289794921875, + "logps/rejected": -385.8522644042969, + "loss": 0.4287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24719564616680145, + "rewards/margins": 1.3242781162261963, + "rewards/rejected": -1.5714735984802246, + "step": 3957 + }, + { + "epoch": 0.46, + "learning_rate": 1.656092707479808e-07, + "logits/chosen": -2.285789728164673, + "logits/rejected": -2.1485588550567627, + "logps/chosen": -273.6502990722656, + "logps/rejected": -365.7036437988281, + "loss": 0.1308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7364071607589722, + "rewards/margins": 2.481714963912964, + "rewards/rejected": -3.2181220054626465, + "step": 3958 + }, + { + "epoch": 0.46, + "learning_rate": 1.6557415427835652e-07, + "logits/chosen": -2.490696907043457, + "logits/rejected": -2.544343948364258, + "logps/chosen": -222.9994354248047, + "logps/rejected": -347.53125, + "loss": 0.2928, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0966757535934448, + "rewards/margins": 2.20084285736084, + "rewards/rejected": -3.297518730163574, + "step": 3959 + }, + { + "epoch": 0.46, + "learning_rate": 1.6553903780873227e-07, + "logits/chosen": -2.6071481704711914, + "logits/rejected": -2.635166645050049, + "logps/chosen": -318.06884765625, + "logps/rejected": -221.91355895996094, + "loss": 0.6558, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5995728969573975, + "rewards/margins": 0.9942283630371094, + "rewards/rejected": -2.593801259994507, + "step": 3960 + }, + { + "epoch": 0.46, + "learning_rate": 1.6550392133910805e-07, + "logits/chosen": -2.3839263916015625, + "logits/rejected": -2.2859113216400146, + "logps/chosen": -434.76611328125, + "logps/rejected": -174.27853393554688, + "loss": 0.7374, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.221775770187378, + "rewards/margins": 0.6669775247573853, + "rewards/rejected": -1.8887534141540527, + "step": 3961 + }, + { + "epoch": 0.46, + "learning_rate": 1.6546880486948378e-07, + "logits/chosen": -2.7075023651123047, + "logits/rejected": -2.726113796234131, + "logps/chosen": -335.8284912109375, + "logps/rejected": -236.14495849609375, + "loss": 0.3244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.980148434638977, + "rewards/margins": 1.7941340208053589, + "rewards/rejected": -2.774282455444336, + "step": 3962 + }, + { + "epoch": 0.46, + "learning_rate": 1.6543368839985954e-07, + "logits/chosen": -1.9909179210662842, + "logits/rejected": -1.8945446014404297, + "logps/chosen": -352.8260498046875, + "logps/rejected": -272.01171875, + "loss": 0.2568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48445016145706177, + "rewards/margins": 1.6254019737243652, + "rewards/rejected": -2.1098520755767822, + "step": 3963 + }, + { + "epoch": 0.46, + "learning_rate": 1.653985719302353e-07, + "logits/chosen": -1.920406699180603, + "logits/rejected": -2.1384642124176025, + "logps/chosen": -355.1025390625, + "logps/rejected": -155.16897583007812, + "loss": 0.6027, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2081551551818848, + "rewards/margins": 0.4938991367816925, + "rewards/rejected": -1.7020542621612549, + "step": 3964 + }, + { + "epoch": 0.46, + "learning_rate": 1.6536345546061102e-07, + "logits/chosen": -1.8515630960464478, + "logits/rejected": -2.1688573360443115, + "logps/chosen": -328.0232238769531, + "logps/rejected": -278.513916015625, + "loss": 0.3879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6715490818023682, + "rewards/margins": 1.9082884788513184, + "rewards/rejected": -2.5798375606536865, + "step": 3965 + }, + { + "epoch": 0.46, + "learning_rate": 1.6532833899098677e-07, + "logits/chosen": -2.49076509475708, + "logits/rejected": -2.469717264175415, + "logps/chosen": -292.07098388671875, + "logps/rejected": -277.397705078125, + "loss": 0.4025, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8635863065719604, + "rewards/margins": 2.2224411964416504, + "rewards/rejected": -4.0860276222229, + "step": 3966 + }, + { + "epoch": 0.46, + "learning_rate": 1.652932225213625e-07, + "logits/chosen": -2.157738447189331, + "logits/rejected": -2.0515315532684326, + "logps/chosen": -158.40684509277344, + "logps/rejected": -317.7582702636719, + "loss": 0.5513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6152805685997009, + "rewards/margins": 1.5458593368530273, + "rewards/rejected": -2.161139965057373, + "step": 3967 + }, + { + "epoch": 0.46, + "learning_rate": 1.6525810605173825e-07, + "logits/chosen": -2.2959365844726562, + "logits/rejected": -2.145045280456543, + "logps/chosen": -271.5811462402344, + "logps/rejected": -347.3125305175781, + "loss": 0.1813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3181222379207611, + "rewards/margins": 2.09908390045166, + "rewards/rejected": -2.417206048965454, + "step": 3968 + }, + { + "epoch": 0.46, + "learning_rate": 1.65222989582114e-07, + "logits/chosen": -2.640347719192505, + "logits/rejected": -2.611970901489258, + "logps/chosen": -246.8706817626953, + "logps/rejected": -163.92349243164062, + "loss": 0.3082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3047681748867035, + "rewards/margins": 1.8350186347961426, + "rewards/rejected": -2.139786720275879, + "step": 3969 + }, + { + "epoch": 0.46, + "learning_rate": 1.6518787311248973e-07, + "logits/chosen": -2.4815099239349365, + "logits/rejected": -2.504767894744873, + "logps/chosen": -152.26107788085938, + "logps/rejected": -178.3741455078125, + "loss": 0.6333, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.216573476791382, + "rewards/margins": 0.6190066933631897, + "rewards/rejected": -2.835580348968506, + "step": 3970 + }, + { + "epoch": 0.46, + "learning_rate": 1.651527566428655e-07, + "logits/chosen": -1.7115159034729004, + "logits/rejected": -1.675868034362793, + "logps/chosen": -306.2984313964844, + "logps/rejected": -389.5063171386719, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8961662650108337, + "rewards/margins": 2.6892871856689453, + "rewards/rejected": -3.585453510284424, + "step": 3971 + }, + { + "epoch": 0.46, + "learning_rate": 1.6511764017324127e-07, + "logits/chosen": -2.2723569869995117, + "logits/rejected": -2.5387988090515137, + "logps/chosen": -404.935302734375, + "logps/rejected": -289.78106689453125, + "loss": 0.3368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9461743831634521, + "rewards/margins": 2.586674213409424, + "rewards/rejected": -3.532848834991455, + "step": 3972 + }, + { + "epoch": 0.46, + "learning_rate": 1.65082523703617e-07, + "logits/chosen": -1.9847819805145264, + "logits/rejected": -1.6356492042541504, + "logps/chosen": -111.67167663574219, + "logps/rejected": -211.3742218017578, + "loss": 0.5842, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.120133399963379, + "rewards/margins": 1.3074347972869873, + "rewards/rejected": -2.427567958831787, + "step": 3973 + }, + { + "epoch": 0.46, + "learning_rate": 1.6504740723399275e-07, + "logits/chosen": -1.7681605815887451, + "logits/rejected": -1.966635823249817, + "logps/chosen": -340.85260009765625, + "logps/rejected": -258.3349609375, + "loss": 0.3376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46602603793144226, + "rewards/margins": 1.8903886079788208, + "rewards/rejected": -2.356414556503296, + "step": 3974 + }, + { + "epoch": 0.46, + "learning_rate": 1.6501229076436848e-07, + "logits/chosen": -1.9306427240371704, + "logits/rejected": -2.0681583881378174, + "logps/chosen": -306.22259521484375, + "logps/rejected": -225.3975372314453, + "loss": 0.5526, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0327662229537964, + "rewards/margins": 1.4898056983947754, + "rewards/rejected": -2.5225720405578613, + "step": 3975 + }, + { + "epoch": 0.46, + "learning_rate": 1.6497717429474423e-07, + "logits/chosen": -2.476217746734619, + "logits/rejected": -2.740955114364624, + "logps/chosen": -345.8601989746094, + "logps/rejected": -263.3467712402344, + "loss": 0.3982, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2626163959503174, + "rewards/margins": 2.5964791774749756, + "rewards/rejected": -3.8590950965881348, + "step": 3976 + }, + { + "epoch": 0.46, + "learning_rate": 1.6494205782511998e-07, + "logits/chosen": -2.4719059467315674, + "logits/rejected": -2.7000694274902344, + "logps/chosen": -271.3600769042969, + "logps/rejected": -261.5184020996094, + "loss": 0.5051, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8645800948143005, + "rewards/margins": 1.6993107795715332, + "rewards/rejected": -2.5638909339904785, + "step": 3977 + }, + { + "epoch": 0.46, + "learning_rate": 1.649069413554957e-07, + "logits/chosen": -2.1811156272888184, + "logits/rejected": -2.3860950469970703, + "logps/chosen": -263.3382263183594, + "logps/rejected": -269.15875244140625, + "loss": 0.3239, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7934960722923279, + "rewards/margins": 2.479903221130371, + "rewards/rejected": -3.2733993530273438, + "step": 3978 + }, + { + "epoch": 0.46, + "learning_rate": 1.6487182488587146e-07, + "logits/chosen": -2.2783470153808594, + "logits/rejected": -2.0147321224212646, + "logps/chosen": -237.6190185546875, + "logps/rejected": -319.0047302246094, + "loss": 0.1984, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2046138048171997, + "rewards/margins": 3.5039288997650146, + "rewards/rejected": -4.708542823791504, + "step": 3979 + }, + { + "epoch": 0.46, + "learning_rate": 1.6483670841624722e-07, + "logits/chosen": -2.405667781829834, + "logits/rejected": -2.4021084308624268, + "logps/chosen": -321.840087890625, + "logps/rejected": -214.93728637695312, + "loss": 0.7737, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1758075952529907, + "rewards/margins": 0.4863715171813965, + "rewards/rejected": -1.6621792316436768, + "step": 3980 + }, + { + "epoch": 0.46, + "learning_rate": 1.6480159194662295e-07, + "logits/chosen": -2.305312156677246, + "logits/rejected": -2.2429521083831787, + "logps/chosen": -222.42799377441406, + "logps/rejected": -304.96783447265625, + "loss": 0.2628, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2714213132858276, + "rewards/margins": 1.9709985256195068, + "rewards/rejected": -3.242420196533203, + "step": 3981 + }, + { + "epoch": 0.46, + "learning_rate": 1.647664754769987e-07, + "logits/chosen": -1.9992047548294067, + "logits/rejected": -2.198514461517334, + "logps/chosen": -176.98175048828125, + "logps/rejected": -140.49310302734375, + "loss": 0.3796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6888483762741089, + "rewards/margins": 1.2772512435913086, + "rewards/rejected": -1.9660996198654175, + "step": 3982 + }, + { + "epoch": 0.46, + "learning_rate": 1.6473135900737443e-07, + "logits/chosen": -2.248339891433716, + "logits/rejected": -2.5883965492248535, + "logps/chosen": -409.0836486816406, + "logps/rejected": -355.9622802734375, + "loss": 0.3508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7342567443847656, + "rewards/margins": 1.7251851558685303, + "rewards/rejected": -2.459441900253296, + "step": 3983 + }, + { + "epoch": 0.46, + "learning_rate": 1.646962425377502e-07, + "logits/chosen": -2.417553186416626, + "logits/rejected": -2.4071333408355713, + "logps/chosen": -388.63775634765625, + "logps/rejected": -443.41558837890625, + "loss": 0.3164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6593343615531921, + "rewards/margins": 2.181065082550049, + "rewards/rejected": -2.8403995037078857, + "step": 3984 + }, + { + "epoch": 0.46, + "learning_rate": 1.6466112606812596e-07, + "logits/chosen": -2.1830644607543945, + "logits/rejected": -2.272608518600464, + "logps/chosen": -518.4270629882812, + "logps/rejected": -323.9378356933594, + "loss": 0.4096, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0912213325500488, + "rewards/margins": 1.7127394676208496, + "rewards/rejected": -2.8039608001708984, + "step": 3985 + }, + { + "epoch": 0.46, + "learning_rate": 1.646260095985017e-07, + "logits/chosen": -1.3992468118667603, + "logits/rejected": -1.5587022304534912, + "logps/chosen": -267.55242919921875, + "logps/rejected": -261.25836181640625, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.5089914798736572, + "rewards/margins": 1.607092261314392, + "rewards/rejected": -5.116084098815918, + "step": 3986 + }, + { + "epoch": 0.46, + "learning_rate": 1.6459089312887744e-07, + "logits/chosen": -2.0625126361846924, + "logits/rejected": -1.880122184753418, + "logps/chosen": -196.79672241210938, + "logps/rejected": -356.5837707519531, + "loss": 0.1169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037307143211364746, + "rewards/margins": 3.1493310928344727, + "rewards/rejected": -3.153061866760254, + "step": 3987 + }, + { + "epoch": 0.46, + "learning_rate": 1.645557766592532e-07, + "logits/chosen": -2.548787832260132, + "logits/rejected": -2.6045851707458496, + "logps/chosen": -177.04293823242188, + "logps/rejected": -142.90869140625, + "loss": 0.4368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6691881418228149, + "rewards/margins": 0.768409252166748, + "rewards/rejected": -1.4375972747802734, + "step": 3988 + }, + { + "epoch": 0.46, + "learning_rate": 1.6452066018962892e-07, + "logits/chosen": -2.1883468627929688, + "logits/rejected": -2.2534282207489014, + "logps/chosen": -286.280517578125, + "logps/rejected": -220.52853393554688, + "loss": 0.5835, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6376440525054932, + "rewards/margins": 0.534375786781311, + "rewards/rejected": -2.1720199584960938, + "step": 3989 + }, + { + "epoch": 0.46, + "learning_rate": 1.6448554372000468e-07, + "logits/chosen": -2.039689779281616, + "logits/rejected": -2.2530269622802734, + "logps/chosen": -328.8376770019531, + "logps/rejected": -219.34991455078125, + "loss": 0.4806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35721930861473083, + "rewards/margins": 1.4121689796447754, + "rewards/rejected": -1.769388198852539, + "step": 3990 + }, + { + "epoch": 0.46, + "learning_rate": 1.644504272503804e-07, + "logits/chosen": -2.7240428924560547, + "logits/rejected": -2.227140188217163, + "logps/chosen": -160.98130798339844, + "logps/rejected": -287.746337890625, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6622711420059204, + "rewards/margins": 2.187952995300293, + "rewards/rejected": -2.850224256515503, + "step": 3991 + }, + { + "epoch": 0.46, + "learning_rate": 1.6441531078075616e-07, + "logits/chosen": -2.2220072746276855, + "logits/rejected": -2.0055885314941406, + "logps/chosen": -349.71856689453125, + "logps/rejected": -456.82501220703125, + "loss": 0.4756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9551288485527039, + "rewards/margins": 1.1065499782562256, + "rewards/rejected": -2.061678886413574, + "step": 3992 + }, + { + "epoch": 0.46, + "learning_rate": 1.643801943111319e-07, + "logits/chosen": -2.9456701278686523, + "logits/rejected": -2.885826826095581, + "logps/chosen": -399.7174377441406, + "logps/rejected": -294.1024475097656, + "loss": 0.5681, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6934443712234497, + "rewards/margins": 1.976876974105835, + "rewards/rejected": -3.670321464538574, + "step": 3993 + }, + { + "epoch": 0.46, + "learning_rate": 1.6434507784150764e-07, + "logits/chosen": -1.8609986305236816, + "logits/rejected": -2.154402256011963, + "logps/chosen": -394.7362976074219, + "logps/rejected": -294.015869140625, + "loss": 0.4633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40044716000556946, + "rewards/margins": 1.7274279594421387, + "rewards/rejected": -2.1278750896453857, + "step": 3994 + }, + { + "epoch": 0.46, + "learning_rate": 1.6430996137188342e-07, + "logits/chosen": -1.9237029552459717, + "logits/rejected": -1.9349737167358398, + "logps/chosen": -382.0674133300781, + "logps/rejected": -422.9450988769531, + "loss": 0.3861, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1382359266281128, + "rewards/margins": 4.156069755554199, + "rewards/rejected": -5.294305801391602, + "step": 3995 + }, + { + "epoch": 0.46, + "learning_rate": 1.6427484490225917e-07, + "logits/chosen": -1.9934742450714111, + "logits/rejected": -1.91737961769104, + "logps/chosen": -258.1655578613281, + "logps/rejected": -359.38543701171875, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4655938148498535, + "rewards/margins": 1.5649473667144775, + "rewards/rejected": -3.030540943145752, + "step": 3996 + }, + { + "epoch": 0.46, + "learning_rate": 1.642397284326349e-07, + "logits/chosen": -2.920297622680664, + "logits/rejected": -3.0242695808410645, + "logps/chosen": -212.04559326171875, + "logps/rejected": -336.4468078613281, + "loss": 0.5603, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4966762065887451, + "rewards/margins": 1.5715868473052979, + "rewards/rejected": -3.068263053894043, + "step": 3997 + }, + { + "epoch": 0.46, + "learning_rate": 1.6420461196301066e-07, + "logits/chosen": -2.563337564468384, + "logits/rejected": -2.378255605697632, + "logps/chosen": -192.62937927246094, + "logps/rejected": -287.104248046875, + "loss": 0.4608, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.148889422416687, + "rewards/margins": 1.17509126663208, + "rewards/rejected": -2.3239808082580566, + "step": 3998 + }, + { + "epoch": 0.46, + "learning_rate": 1.6416949549338638e-07, + "logits/chosen": -2.5155513286590576, + "logits/rejected": -2.413499355316162, + "logps/chosen": -197.51437377929688, + "logps/rejected": -201.7019500732422, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2021584510803223, + "rewards/margins": 2.633225202560425, + "rewards/rejected": -3.835383415222168, + "step": 3999 + }, + { + "epoch": 0.46, + "learning_rate": 1.6413437902376214e-07, + "logits/chosen": -2.5257534980773926, + "logits/rejected": -2.5963544845581055, + "logps/chosen": -322.2962951660156, + "logps/rejected": -321.1884765625, + "loss": 0.4898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4658392667770386, + "rewards/margins": 2.794701099395752, + "rewards/rejected": -4.26054048538208, + "step": 4000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -1.6406599283218384, + "eval_logits/rejected": -1.516654372215271, + "eval_logps/chosen": -300.2294006347656, + "eval_logps/rejected": -273.6029052734375, + "eval_loss": 0.34393760561943054, + "eval_rewards/accuracies": 0.8285714387893677, + "eval_rewards/chosen": -0.7549214959144592, + "eval_rewards/margins": 2.0866284370422363, + "eval_rewards/rejected": -2.841550350189209, + "eval_runtime": 24.1574, + "eval_samples_per_second": 2.898, + "eval_steps_per_second": 1.449, + "step": 4000 + }, + { + "epoch": 0.46, + "learning_rate": 1.640992625541379e-07, + "logits/chosen": -1.9737932682037354, + "logits/rejected": -2.1594245433807373, + "logps/chosen": -378.03045654296875, + "logps/rejected": -307.0249328613281, + "loss": 0.495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17286673188209534, + "rewards/margins": 1.2201707363128662, + "rewards/rejected": -1.3930375576019287, + "step": 4001 + }, + { + "epoch": 0.46, + "learning_rate": 1.6406414608451362e-07, + "logits/chosen": -1.9115948677062988, + "logits/rejected": -2.2294225692749023, + "logps/chosen": -314.03045654296875, + "logps/rejected": -290.288818359375, + "loss": 0.2841, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2201824188232422, + "rewards/margins": 1.6503905057907104, + "rewards/rejected": -2.870573043823242, + "step": 4002 + }, + { + "epoch": 0.46, + "learning_rate": 1.6402902961488937e-07, + "logits/chosen": -2.434197425842285, + "logits/rejected": -2.197728157043457, + "logps/chosen": -303.4789733886719, + "logps/rejected": -312.72210693359375, + "loss": 0.2406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4126322269439697, + "rewards/margins": 2.3031277656555176, + "rewards/rejected": -3.7157602310180664, + "step": 4003 + }, + { + "epoch": 0.46, + "learning_rate": 1.639939131452651e-07, + "logits/chosen": -2.496333599090576, + "logits/rejected": -2.6102869510650635, + "logps/chosen": -222.3234100341797, + "logps/rejected": -214.27760314941406, + "loss": 0.3807, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.193448543548584, + "rewards/margins": 1.8931740522384644, + "rewards/rejected": -3.086622476577759, + "step": 4004 + }, + { + "epoch": 0.46, + "learning_rate": 1.6395879667564085e-07, + "logits/chosen": -1.5690149068832397, + "logits/rejected": -1.5673105716705322, + "logps/chosen": -366.7588195800781, + "logps/rejected": -388.8407897949219, + "loss": 0.1367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5902736783027649, + "rewards/margins": 3.4379827976226807, + "rewards/rejected": -4.028256416320801, + "step": 4005 + }, + { + "epoch": 0.46, + "learning_rate": 1.6392368020601663e-07, + "logits/chosen": -2.1477468013763428, + "logits/rejected": -1.9727084636688232, + "logps/chosen": -472.6855163574219, + "logps/rejected": -520.8834228515625, + "loss": 0.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0002237558364868, + "rewards/margins": 2.468726634979248, + "rewards/rejected": -3.4689502716064453, + "step": 4006 + }, + { + "epoch": 0.46, + "learning_rate": 1.6388856373639236e-07, + "logits/chosen": -2.2257931232452393, + "logits/rejected": -2.277761697769165, + "logps/chosen": -234.40370178222656, + "logps/rejected": -212.2503662109375, + "loss": 0.1825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8696199655532837, + "rewards/margins": 2.5098624229431152, + "rewards/rejected": -3.3794825077056885, + "step": 4007 + }, + { + "epoch": 0.46, + "learning_rate": 1.6385344726676812e-07, + "logits/chosen": -2.734919786453247, + "logits/rejected": -2.5691893100738525, + "logps/chosen": -353.8924865722656, + "logps/rejected": -329.0403137207031, + "loss": 0.4206, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6839042901992798, + "rewards/margins": 2.1135075092315674, + "rewards/rejected": -2.7974116802215576, + "step": 4008 + }, + { + "epoch": 0.46, + "learning_rate": 1.6381833079714387e-07, + "logits/chosen": -2.347255229949951, + "logits/rejected": -2.420576572418213, + "logps/chosen": -175.29127502441406, + "logps/rejected": -226.76107788085938, + "loss": 0.6541, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9401611089706421, + "rewards/margins": 0.4648658335208893, + "rewards/rejected": -1.405026912689209, + "step": 4009 + }, + { + "epoch": 0.46, + "learning_rate": 1.637832143275196e-07, + "logits/chosen": -2.5278632640838623, + "logits/rejected": -2.7428863048553467, + "logps/chosen": -225.1766815185547, + "logps/rejected": -164.04177856445312, + "loss": 0.431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8347035646438599, + "rewards/margins": 1.9003095626831055, + "rewards/rejected": -2.735013246536255, + "step": 4010 + }, + { + "epoch": 0.46, + "learning_rate": 1.6374809785789535e-07, + "logits/chosen": -1.9577105045318604, + "logits/rejected": -1.7969433069229126, + "logps/chosen": -187.33914184570312, + "logps/rejected": -276.6960144042969, + "loss": 0.5914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5263363718986511, + "rewards/margins": 1.5533578395843506, + "rewards/rejected": -2.0796942710876465, + "step": 4011 + }, + { + "epoch": 0.46, + "learning_rate": 1.6371298138827108e-07, + "logits/chosen": -2.576362371444702, + "logits/rejected": -2.612570285797119, + "logps/chosen": -370.394775390625, + "logps/rejected": -261.2861022949219, + "loss": 0.5457, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0411648750305176, + "rewards/margins": 1.2489190101623535, + "rewards/rejected": -2.290083885192871, + "step": 4012 + }, + { + "epoch": 0.46, + "learning_rate": 1.6367786491864683e-07, + "logits/chosen": -2.543894052505493, + "logits/rejected": -2.831585645675659, + "logps/chosen": -272.3045654296875, + "logps/rejected": -360.85546875, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29102692008018494, + "rewards/margins": 3.9704341888427734, + "rewards/rejected": -4.26146125793457, + "step": 4013 + }, + { + "epoch": 0.46, + "learning_rate": 1.6364274844902259e-07, + "logits/chosen": -2.20039701461792, + "logits/rejected": -2.2407407760620117, + "logps/chosen": -279.8970031738281, + "logps/rejected": -317.2011413574219, + "loss": 0.1327, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3067723512649536, + "rewards/margins": 3.0698184967041016, + "rewards/rejected": -4.376590728759766, + "step": 4014 + }, + { + "epoch": 0.46, + "learning_rate": 1.636076319793983e-07, + "logits/chosen": -2.17452073097229, + "logits/rejected": -1.902077317237854, + "logps/chosen": -248.82540893554688, + "logps/rejected": -267.5853271484375, + "loss": 0.3714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8798702955245972, + "rewards/margins": 1.3410749435424805, + "rewards/rejected": -2.220945119857788, + "step": 4015 + }, + { + "epoch": 0.46, + "learning_rate": 1.6357251550977407e-07, + "logits/chosen": -2.028080701828003, + "logits/rejected": -1.844179630279541, + "logps/chosen": -225.27183532714844, + "logps/rejected": -323.3546142578125, + "loss": 0.6821, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1519298553466797, + "rewards/margins": 1.6776703596115112, + "rewards/rejected": -2.8296003341674805, + "step": 4016 + }, + { + "epoch": 0.46, + "learning_rate": 1.6353739904014985e-07, + "logits/chosen": -2.3378663063049316, + "logits/rejected": -2.3105719089508057, + "logps/chosen": -317.1321716308594, + "logps/rejected": -272.726318359375, + "loss": 0.3471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8373597860336304, + "rewards/margins": 1.7462267875671387, + "rewards/rejected": -2.5835866928100586, + "step": 4017 + }, + { + "epoch": 0.46, + "learning_rate": 1.6350228257052557e-07, + "logits/chosen": -2.419584274291992, + "logits/rejected": -2.455306053161621, + "logps/chosen": -509.07147216796875, + "logps/rejected": -276.5890808105469, + "loss": 0.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08608086407184601, + "rewards/margins": 3.313584804534912, + "rewards/rejected": -3.3996658325195312, + "step": 4018 + }, + { + "epoch": 0.46, + "learning_rate": 1.6346716610090133e-07, + "logits/chosen": -2.308361053466797, + "logits/rejected": -2.3610026836395264, + "logps/chosen": -289.8578796386719, + "logps/rejected": -256.7793884277344, + "loss": 0.6578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7580670118331909, + "rewards/margins": 0.5690996050834656, + "rewards/rejected": -1.3271665573120117, + "step": 4019 + }, + { + "epoch": 0.46, + "learning_rate": 1.6343204963127706e-07, + "logits/chosen": -2.328831911087036, + "logits/rejected": -2.1705408096313477, + "logps/chosen": -267.61279296875, + "logps/rejected": -312.26416015625, + "loss": 0.2279, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1006901264190674, + "rewards/margins": 2.472214460372925, + "rewards/rejected": -3.572904586791992, + "step": 4020 + }, + { + "epoch": 0.46, + "learning_rate": 1.633969331616528e-07, + "logits/chosen": -2.2039291858673096, + "logits/rejected": -2.4467098712921143, + "logps/chosen": -302.89361572265625, + "logps/rejected": -221.4822540283203, + "loss": 0.1841, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37533020973205566, + "rewards/margins": 3.0112674236297607, + "rewards/rejected": -3.3865976333618164, + "step": 4021 + }, + { + "epoch": 0.46, + "learning_rate": 1.6336181669202856e-07, + "logits/chosen": -2.1511659622192383, + "logits/rejected": -2.017822742462158, + "logps/chosen": -170.95811462402344, + "logps/rejected": -258.7314147949219, + "loss": 0.4982, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6336207389831543, + "rewards/margins": 2.319531202316284, + "rewards/rejected": -2.9531521797180176, + "step": 4022 + }, + { + "epoch": 0.46, + "learning_rate": 1.633267002224043e-07, + "logits/chosen": -1.7761726379394531, + "logits/rejected": -1.703749179840088, + "logps/chosen": -238.5573272705078, + "logps/rejected": -285.65118408203125, + "loss": 0.4924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.826569676399231, + "rewards/margins": 1.4002752304077148, + "rewards/rejected": -2.2268447875976562, + "step": 4023 + }, + { + "epoch": 0.46, + "learning_rate": 1.6329158375278004e-07, + "logits/chosen": -2.352194309234619, + "logits/rejected": -2.331758737564087, + "logps/chosen": -292.820556640625, + "logps/rejected": -330.211669921875, + "loss": 0.7059, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0570456981658936, + "rewards/margins": 1.334035873413086, + "rewards/rejected": -2.3910815715789795, + "step": 4024 + }, + { + "epoch": 0.46, + "learning_rate": 1.632564672831558e-07, + "logits/chosen": -2.1270134449005127, + "logits/rejected": -2.3481836318969727, + "logps/chosen": -285.0329284667969, + "logps/rejected": -281.0369873046875, + "loss": 0.4183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7298942804336548, + "rewards/margins": 1.498193383216858, + "rewards/rejected": -2.228087902069092, + "step": 4025 + }, + { + "epoch": 0.46, + "learning_rate": 1.6322135081353153e-07, + "logits/chosen": -2.3938937187194824, + "logits/rejected": -2.4457688331604004, + "logps/chosen": -358.98553466796875, + "logps/rejected": -256.6583557128906, + "loss": 0.2912, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5956088304519653, + "rewards/margins": 1.8842949867248535, + "rewards/rejected": -2.4799036979675293, + "step": 4026 + }, + { + "epoch": 0.46, + "learning_rate": 1.6318623434390728e-07, + "logits/chosen": -2.1938018798828125, + "logits/rejected": -2.5274465084075928, + "logps/chosen": -309.59503173828125, + "logps/rejected": -215.21646118164062, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3408128023147583, + "rewards/margins": 0.6913925409317017, + "rewards/rejected": -2.03220534324646, + "step": 4027 + }, + { + "epoch": 0.46, + "learning_rate": 1.63151117874283e-07, + "logits/chosen": -1.9886225461959839, + "logits/rejected": -1.736000895500183, + "logps/chosen": -386.8586730957031, + "logps/rejected": -349.27880859375, + "loss": 0.5224, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8012497425079346, + "rewards/margins": 1.6673078536987305, + "rewards/rejected": -2.468557357788086, + "step": 4028 + }, + { + "epoch": 0.46, + "learning_rate": 1.631160014046588e-07, + "logits/chosen": -2.0466904640197754, + "logits/rejected": -2.6541409492492676, + "logps/chosen": -332.0159912109375, + "logps/rejected": -197.1807861328125, + "loss": 0.1719, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32187163829803467, + "rewards/margins": 2.8410449028015137, + "rewards/rejected": -3.162916421890259, + "step": 4029 + }, + { + "epoch": 0.46, + "learning_rate": 1.6308088493503454e-07, + "logits/chosen": -2.076911449432373, + "logits/rejected": -2.4620726108551025, + "logps/chosen": -331.4619140625, + "logps/rejected": -194.01626586914062, + "loss": 0.3776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44083601236343384, + "rewards/margins": 2.5447702407836914, + "rewards/rejected": -2.9856064319610596, + "step": 4030 + }, + { + "epoch": 0.46, + "learning_rate": 1.6304576846541027e-07, + "logits/chosen": -2.480882167816162, + "logits/rejected": -2.3910670280456543, + "logps/chosen": -412.6787109375, + "logps/rejected": -328.1665344238281, + "loss": 0.3322, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38068997859954834, + "rewards/margins": 1.8965702056884766, + "rewards/rejected": -2.2772600650787354, + "step": 4031 + }, + { + "epoch": 0.46, + "learning_rate": 1.6301065199578602e-07, + "logits/chosen": -1.793660283088684, + "logits/rejected": -1.7220077514648438, + "logps/chosen": -342.0166320800781, + "logps/rejected": -376.2327880859375, + "loss": 0.2964, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0039567947387695, + "rewards/margins": 2.724238157272339, + "rewards/rejected": -3.7281947135925293, + "step": 4032 + }, + { + "epoch": 0.46, + "learning_rate": 1.6297553552616178e-07, + "logits/chosen": -1.989438533782959, + "logits/rejected": -1.6577047109603882, + "logps/chosen": -220.60386657714844, + "logps/rejected": -480.64581298828125, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.036285161972046, + "rewards/margins": 3.9679784774780273, + "rewards/rejected": -5.004263877868652, + "step": 4033 + }, + { + "epoch": 0.47, + "learning_rate": 1.629404190565375e-07, + "logits/chosen": -1.928796410560608, + "logits/rejected": -1.9286950826644897, + "logps/chosen": -360.5697021484375, + "logps/rejected": -368.8806457519531, + "loss": 0.628, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3992191553115845, + "rewards/margins": 1.0741748809814453, + "rewards/rejected": -2.4733939170837402, + "step": 4034 + }, + { + "epoch": 0.47, + "learning_rate": 1.6290530258691326e-07, + "logits/chosen": -2.419039249420166, + "logits/rejected": -2.320478916168213, + "logps/chosen": -391.736572265625, + "logps/rejected": -401.0994873046875, + "loss": 0.4001, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28467148542404175, + "rewards/margins": 1.3797107934951782, + "rewards/rejected": -1.6643822193145752, + "step": 4035 + }, + { + "epoch": 0.47, + "learning_rate": 1.6287018611728898e-07, + "logits/chosen": -2.3755273818969727, + "logits/rejected": -2.5313172340393066, + "logps/chosen": -283.62225341796875, + "logps/rejected": -321.06982421875, + "loss": 0.3342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3404895067214966, + "rewards/margins": 2.2640767097473145, + "rewards/rejected": -2.6045660972595215, + "step": 4036 + }, + { + "epoch": 0.47, + "learning_rate": 1.6283506964766474e-07, + "logits/chosen": -2.3364241123199463, + "logits/rejected": -2.0453004837036133, + "logps/chosen": -346.0035400390625, + "logps/rejected": -349.22393798828125, + "loss": 0.3999, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4002597332000732, + "rewards/margins": 1.7728736400604248, + "rewards/rejected": -3.173133373260498, + "step": 4037 + }, + { + "epoch": 0.47, + "learning_rate": 1.627999531780405e-07, + "logits/chosen": -2.2943716049194336, + "logits/rejected": -2.308107376098633, + "logps/chosen": -237.6920623779297, + "logps/rejected": -234.4242706298828, + "loss": 0.6862, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9960588216781616, + "rewards/margins": 1.2447395324707031, + "rewards/rejected": -2.2407984733581543, + "step": 4038 + }, + { + "epoch": 0.47, + "learning_rate": 1.6276483670841622e-07, + "logits/chosen": -2.594271183013916, + "logits/rejected": -2.4110493659973145, + "logps/chosen": -148.71499633789062, + "logps/rejected": -285.4412536621094, + "loss": 0.2925, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.933343768119812, + "rewards/margins": 3.254876136779785, + "rewards/rejected": -4.188220024108887, + "step": 4039 + }, + { + "epoch": 0.47, + "learning_rate": 1.62729720238792e-07, + "logits/chosen": -2.149049997329712, + "logits/rejected": -2.138410806655884, + "logps/chosen": -235.89976501464844, + "logps/rejected": -282.5596618652344, + "loss": 0.4359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2306978404521942, + "rewards/margins": 1.654325008392334, + "rewards/rejected": -1.8850228786468506, + "step": 4040 + }, + { + "epoch": 0.47, + "learning_rate": 1.6269460376916775e-07, + "logits/chosen": -1.9820678234100342, + "logits/rejected": -2.0328519344329834, + "logps/chosen": -308.9214782714844, + "logps/rejected": -196.23728942871094, + "loss": 0.5638, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0083820819854736, + "rewards/margins": 0.8284822702407837, + "rewards/rejected": -1.8368644714355469, + "step": 4041 + }, + { + "epoch": 0.47, + "learning_rate": 1.6265948729954348e-07, + "logits/chosen": -2.948190450668335, + "logits/rejected": -3.087261438369751, + "logps/chosen": -338.85833740234375, + "logps/rejected": -255.10757446289062, + "loss": 0.309, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9801228046417236, + "rewards/margins": 2.7502174377441406, + "rewards/rejected": -3.7303402423858643, + "step": 4042 + }, + { + "epoch": 0.47, + "learning_rate": 1.6262437082991924e-07, + "logits/chosen": -2.110568046569824, + "logits/rejected": -2.298178195953369, + "logps/chosen": -254.2781524658203, + "logps/rejected": -244.98866271972656, + "loss": 0.4643, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7209246158599854, + "rewards/margins": 2.0298478603363037, + "rewards/rejected": -2.750772476196289, + "step": 4043 + }, + { + "epoch": 0.47, + "learning_rate": 1.6258925436029496e-07, + "logits/chosen": -2.7490499019622803, + "logits/rejected": -2.6624438762664795, + "logps/chosen": -109.7442398071289, + "logps/rejected": -193.0697021484375, + "loss": 0.3369, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.768500804901123, + "rewards/margins": 2.787912607192993, + "rewards/rejected": -3.5564136505126953, + "step": 4044 + }, + { + "epoch": 0.47, + "learning_rate": 1.6255413789067072e-07, + "logits/chosen": -1.8580667972564697, + "logits/rejected": -1.4099977016448975, + "logps/chosen": -254.25064086914062, + "logps/rejected": -349.07958984375, + "loss": 0.1669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4009668231010437, + "rewards/margins": 2.857931137084961, + "rewards/rejected": -3.2588982582092285, + "step": 4045 + }, + { + "epoch": 0.47, + "learning_rate": 1.6251902142104647e-07, + "logits/chosen": -1.960321068763733, + "logits/rejected": -2.127728223800659, + "logps/chosen": -208.63978576660156, + "logps/rejected": -241.87167358398438, + "loss": 0.3005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01934012770652771, + "rewards/margins": 3.0410847663879395, + "rewards/rejected": -3.060425281524658, + "step": 4046 + }, + { + "epoch": 0.47, + "learning_rate": 1.624839049514222e-07, + "logits/chosen": -2.767021894454956, + "logits/rejected": -2.808140516281128, + "logps/chosen": -214.4694366455078, + "logps/rejected": -330.8081970214844, + "loss": 0.3129, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2862359285354614, + "rewards/margins": 2.0493364334106445, + "rewards/rejected": -3.3355722427368164, + "step": 4047 + }, + { + "epoch": 0.47, + "learning_rate": 1.6244878848179795e-07, + "logits/chosen": -2.6775121688842773, + "logits/rejected": -2.6026504039764404, + "logps/chosen": -216.8753662109375, + "logps/rejected": -165.9452362060547, + "loss": 0.3709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3911198377609253, + "rewards/margins": 1.6134048700332642, + "rewards/rejected": -2.0045247077941895, + "step": 4048 + }, + { + "epoch": 0.47, + "learning_rate": 1.6241367201217368e-07, + "logits/chosen": -2.2761685848236084, + "logits/rejected": -2.5279908180236816, + "logps/chosen": -269.69158935546875, + "logps/rejected": -192.84405517578125, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2176463603973389, + "rewards/margins": 0.14784258604049683, + "rewards/rejected": -1.365488886833191, + "step": 4049 + }, + { + "epoch": 0.47, + "learning_rate": 1.6237855554254943e-07, + "logits/chosen": -2.5797462463378906, + "logits/rejected": -2.6476755142211914, + "logps/chosen": -139.33309936523438, + "logps/rejected": -209.50555419921875, + "loss": 0.4691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6821964979171753, + "rewards/margins": 3.2496185302734375, + "rewards/rejected": -3.931814670562744, + "step": 4050 + }, + { + "epoch": 0.47, + "learning_rate": 1.623434390729252e-07, + "logits/chosen": -2.3446452617645264, + "logits/rejected": -2.4429280757904053, + "logps/chosen": -207.12942504882812, + "logps/rejected": -339.14801025390625, + "loss": 0.3014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12317030131816864, + "rewards/margins": 1.6994234323501587, + "rewards/rejected": -1.8225938081741333, + "step": 4051 + }, + { + "epoch": 0.47, + "learning_rate": 1.6230832260330094e-07, + "logits/chosen": -2.3298895359039307, + "logits/rejected": -2.2756783962249756, + "logps/chosen": -508.71075439453125, + "logps/rejected": -353.876953125, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2821435928344727, + "rewards/margins": 2.785862684249878, + "rewards/rejected": -4.0680060386657715, + "step": 4052 + }, + { + "epoch": 0.47, + "learning_rate": 1.622732061336767e-07, + "logits/chosen": -2.117284059524536, + "logits/rejected": -2.451022148132324, + "logps/chosen": -625.6993408203125, + "logps/rejected": -248.60914611816406, + "loss": 0.2646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9787143468856812, + "rewards/margins": 3.2384748458862305, + "rewards/rejected": -4.217188835144043, + "step": 4053 + }, + { + "epoch": 0.47, + "learning_rate": 1.6223808966405245e-07, + "logits/chosen": -2.4161019325256348, + "logits/rejected": -2.5010557174682617, + "logps/chosen": -252.11019897460938, + "logps/rejected": -278.6802062988281, + "loss": 0.4472, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0515730381011963, + "rewards/margins": 1.013864278793335, + "rewards/rejected": -2.0654373168945312, + "step": 4054 + }, + { + "epoch": 0.47, + "learning_rate": 1.6220297319442818e-07, + "logits/chosen": -2.715533494949341, + "logits/rejected": -2.5754430294036865, + "logps/chosen": -315.202392578125, + "logps/rejected": -259.4331359863281, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.518768548965454, + "rewards/margins": 2.345834255218506, + "rewards/rejected": -3.864602565765381, + "step": 4055 + }, + { + "epoch": 0.47, + "learning_rate": 1.6216785672480393e-07, + "logits/chosen": -2.218189001083374, + "logits/rejected": -2.405888319015503, + "logps/chosen": -238.34605407714844, + "logps/rejected": -263.1693420410156, + "loss": 0.3858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8614720106124878, + "rewards/margins": 1.8320231437683105, + "rewards/rejected": -2.693495273590088, + "step": 4056 + }, + { + "epoch": 0.47, + "learning_rate": 1.6213274025517966e-07, + "logits/chosen": -2.677570104598999, + "logits/rejected": -2.491865396499634, + "logps/chosen": -256.23651123046875, + "logps/rejected": -248.8232421875, + "loss": 0.5481, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1375097036361694, + "rewards/margins": 2.5649070739746094, + "rewards/rejected": -3.7024168968200684, + "step": 4057 + }, + { + "epoch": 0.47, + "learning_rate": 1.620976237855554e-07, + "logits/chosen": -2.6474015712738037, + "logits/rejected": -2.8745522499084473, + "logps/chosen": -313.1148681640625, + "logps/rejected": -264.4124450683594, + "loss": 0.5451, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8316915035247803, + "rewards/margins": 3.1819584369659424, + "rewards/rejected": -5.013649940490723, + "step": 4058 + }, + { + "epoch": 0.47, + "learning_rate": 1.6206250731593116e-07, + "logits/chosen": -1.8532270193099976, + "logits/rejected": -1.7716631889343262, + "logps/chosen": -337.7492980957031, + "logps/rejected": -444.0914001464844, + "loss": 0.3031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6079754829406738, + "rewards/margins": 1.3479406833648682, + "rewards/rejected": -2.955916166305542, + "step": 4059 + }, + { + "epoch": 0.47, + "learning_rate": 1.620273908463069e-07, + "logits/chosen": -2.221064567565918, + "logits/rejected": -2.257002115249634, + "logps/chosen": -107.21549987792969, + "logps/rejected": -156.38128662109375, + "loss": 1.3644, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.0631346702575684, + "rewards/margins": -0.1180376410484314, + "rewards/rejected": -1.9450969696044922, + "step": 4060 + }, + { + "epoch": 0.47, + "learning_rate": 1.6199227437668265e-07, + "logits/chosen": -2.108454465866089, + "logits/rejected": -2.049570083618164, + "logps/chosen": -214.90841674804688, + "logps/rejected": -282.4902648925781, + "loss": 0.2212, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1927778720855713, + "rewards/margins": 3.8820884227752686, + "rewards/rejected": -5.07486629486084, + "step": 4061 + }, + { + "epoch": 0.47, + "learning_rate": 1.6195715790705843e-07, + "logits/chosen": -1.6633293628692627, + "logits/rejected": -1.751953363418579, + "logps/chosen": -446.5719909667969, + "logps/rejected": -353.74822998046875, + "loss": 0.4001, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13027912378311157, + "rewards/margins": 1.1607311964035034, + "rewards/rejected": -1.030452013015747, + "step": 4062 + }, + { + "epoch": 0.47, + "learning_rate": 1.6192204143743415e-07, + "logits/chosen": -2.3236188888549805, + "logits/rejected": -2.2074761390686035, + "logps/chosen": -169.67779541015625, + "logps/rejected": -219.15797424316406, + "loss": 0.4446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6069523692131042, + "rewards/margins": 1.762739896774292, + "rewards/rejected": -2.369692325592041, + "step": 4063 + }, + { + "epoch": 0.47, + "learning_rate": 1.618869249678099e-07, + "logits/chosen": -2.192363739013672, + "logits/rejected": -2.408557415008545, + "logps/chosen": -409.59320068359375, + "logps/rejected": -251.49278259277344, + "loss": 0.2327, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3333477973937988, + "rewards/margins": 2.100590229034424, + "rewards/rejected": -3.4339377880096436, + "step": 4064 + }, + { + "epoch": 0.47, + "learning_rate": 1.6185180849818563e-07, + "logits/chosen": -2.1193106174468994, + "logits/rejected": -2.128331422805786, + "logps/chosen": -361.907470703125, + "logps/rejected": -316.2882080078125, + "loss": 0.5671, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17311938107013702, + "rewards/margins": 1.3972141742706299, + "rewards/rejected": -1.570333480834961, + "step": 4065 + }, + { + "epoch": 0.47, + "learning_rate": 1.618166920285614e-07, + "logits/chosen": -1.86654531955719, + "logits/rejected": -1.8267643451690674, + "logps/chosen": -326.3938293457031, + "logps/rejected": -303.8757629394531, + "loss": 0.5087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32471030950546265, + "rewards/margins": 0.9782297611236572, + "rewards/rejected": -1.3029401302337646, + "step": 4066 + }, + { + "epoch": 0.47, + "learning_rate": 1.6178157555893714e-07, + "logits/chosen": -1.6621572971343994, + "logits/rejected": -2.074618339538574, + "logps/chosen": -409.33526611328125, + "logps/rejected": -375.12371826171875, + "loss": 0.3677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6769407391548157, + "rewards/margins": 1.8410389423370361, + "rewards/rejected": -2.517979621887207, + "step": 4067 + }, + { + "epoch": 0.47, + "learning_rate": 1.6174645908931287e-07, + "logits/chosen": -2.338637351989746, + "logits/rejected": -2.726153612136841, + "logps/chosen": -333.11505126953125, + "logps/rejected": -221.6416473388672, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6674741506576538, + "rewards/margins": 2.3774428367614746, + "rewards/rejected": -3.044917106628418, + "step": 4068 + }, + { + "epoch": 0.47, + "learning_rate": 1.6171134261968862e-07, + "logits/chosen": -2.2835540771484375, + "logits/rejected": -2.4323861598968506, + "logps/chosen": -268.17279052734375, + "logps/rejected": -231.20849609375, + "loss": 0.3291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9685583710670471, + "rewards/margins": 2.2987751960754395, + "rewards/rejected": -3.267333507537842, + "step": 4069 + }, + { + "epoch": 0.47, + "learning_rate": 1.6167622615006438e-07, + "logits/chosen": -1.5911083221435547, + "logits/rejected": -2.106315851211548, + "logps/chosen": -521.0924072265625, + "logps/rejected": -252.1336669921875, + "loss": 0.5582, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.468075156211853, + "rewards/margins": 1.2240400314331055, + "rewards/rejected": -2.692115306854248, + "step": 4070 + }, + { + "epoch": 0.47, + "learning_rate": 1.616411096804401e-07, + "logits/chosen": -2.276231050491333, + "logits/rejected": -2.4847769737243652, + "logps/chosen": -336.46234130859375, + "logps/rejected": -282.5848388671875, + "loss": 0.2207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5508806109428406, + "rewards/margins": 3.4326560497283936, + "rewards/rejected": -3.983536720275879, + "step": 4071 + }, + { + "epoch": 0.47, + "learning_rate": 1.6160599321081586e-07, + "logits/chosen": -1.8272122144699097, + "logits/rejected": -2.0673155784606934, + "logps/chosen": -161.41845703125, + "logps/rejected": -155.43746948242188, + "loss": 0.9475, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.798898935317993, + "rewards/margins": 0.601508617401123, + "rewards/rejected": -3.400407552719116, + "step": 4072 + }, + { + "epoch": 0.47, + "learning_rate": 1.6157087674119159e-07, + "logits/chosen": -2.309274196624756, + "logits/rejected": -2.237903594970703, + "logps/chosen": -154.31570434570312, + "logps/rejected": -283.03009033203125, + "loss": 0.354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5326815843582153, + "rewards/margins": 1.6765415668487549, + "rewards/rejected": -2.2092232704162598, + "step": 4073 + }, + { + "epoch": 0.47, + "learning_rate": 1.6153576027156737e-07, + "logits/chosen": -2.1584837436676025, + "logits/rejected": -1.9372174739837646, + "logps/chosen": -286.4189453125, + "logps/rejected": -240.47821044921875, + "loss": 0.4053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8063260912895203, + "rewards/margins": 1.6139931678771973, + "rewards/rejected": -2.420319080352783, + "step": 4074 + }, + { + "epoch": 0.47, + "learning_rate": 1.6150064380194312e-07, + "logits/chosen": -2.039973258972168, + "logits/rejected": -2.3737142086029053, + "logps/chosen": -275.25146484375, + "logps/rejected": -304.2017822265625, + "loss": 1.088, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4900238513946533, + "rewards/margins": 0.6990743279457092, + "rewards/rejected": -2.1890981197357178, + "step": 4075 + }, + { + "epoch": 0.47, + "learning_rate": 1.6146552733231885e-07, + "logits/chosen": -1.8903064727783203, + "logits/rejected": -2.093606948852539, + "logps/chosen": -364.7601623535156, + "logps/rejected": -324.583984375, + "loss": 0.441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8773008584976196, + "rewards/margins": 2.534863233566284, + "rewards/rejected": -3.4121642112731934, + "step": 4076 + }, + { + "epoch": 0.47, + "learning_rate": 1.614304108626946e-07, + "logits/chosen": -2.404609441757202, + "logits/rejected": -2.2806148529052734, + "logps/chosen": -377.9659423828125, + "logps/rejected": -291.92388916015625, + "loss": 0.7987, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.54258131980896, + "rewards/margins": 1.073941946029663, + "rewards/rejected": -2.616523265838623, + "step": 4077 + }, + { + "epoch": 0.47, + "learning_rate": 1.6139529439307036e-07, + "logits/chosen": -1.5557137727737427, + "logits/rejected": -2.043692111968994, + "logps/chosen": -297.6125793457031, + "logps/rejected": -208.07089233398438, + "loss": 0.6556, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.975675106048584, + "rewards/margins": 1.0040442943572998, + "rewards/rejected": -1.9797195196151733, + "step": 4078 + }, + { + "epoch": 0.47, + "learning_rate": 1.6136017792344608e-07, + "logits/chosen": -2.453470468521118, + "logits/rejected": -2.3809871673583984, + "logps/chosen": -305.5262756347656, + "logps/rejected": -286.71893310546875, + "loss": 0.7037, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4926397800445557, + "rewards/margins": 2.2399086952209473, + "rewards/rejected": -3.732548236846924, + "step": 4079 + }, + { + "epoch": 0.47, + "learning_rate": 1.6132506145382184e-07, + "logits/chosen": -2.4578707218170166, + "logits/rejected": -2.7159910202026367, + "logps/chosen": -410.5912780761719, + "logps/rejected": -170.71095275878906, + "loss": 1.0336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8815587759017944, + "rewards/margins": 0.847082257270813, + "rewards/rejected": -2.7286410331726074, + "step": 4080 + }, + { + "epoch": 0.47, + "learning_rate": 1.6128994498419756e-07, + "logits/chosen": -2.7529072761535645, + "logits/rejected": -2.6786465644836426, + "logps/chosen": -210.61346435546875, + "logps/rejected": -248.85443115234375, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8383162617683411, + "rewards/margins": 2.838683605194092, + "rewards/rejected": -3.676999807357788, + "step": 4081 + }, + { + "epoch": 0.47, + "learning_rate": 1.6125482851457332e-07, + "logits/chosen": -1.5214779376983643, + "logits/rejected": -1.8131195306777954, + "logps/chosen": -395.867919921875, + "logps/rejected": -303.326904296875, + "loss": 0.9868, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6654785871505737, + "rewards/margins": 0.39382725954055786, + "rewards/rejected": -2.0593059062957764, + "step": 4082 + }, + { + "epoch": 0.47, + "learning_rate": 1.612197120449491e-07, + "logits/chosen": -2.367452383041382, + "logits/rejected": -2.205029010772705, + "logps/chosen": -127.2955093383789, + "logps/rejected": -312.6853942871094, + "loss": 0.2424, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0490080118179321, + "rewards/margins": 2.8275585174560547, + "rewards/rejected": -3.8765666484832764, + "step": 4083 + }, + { + "epoch": 0.47, + "learning_rate": 1.611845955753248e-07, + "logits/chosen": -2.1669540405273438, + "logits/rejected": -2.167100191116333, + "logps/chosen": -299.8885803222656, + "logps/rejected": -361.86865234375, + "loss": 0.2353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49543526768684387, + "rewards/margins": 2.217604398727417, + "rewards/rejected": -2.7130398750305176, + "step": 4084 + }, + { + "epoch": 0.47, + "learning_rate": 1.6114947910570058e-07, + "logits/chosen": -2.251620292663574, + "logits/rejected": -2.3804209232330322, + "logps/chosen": -252.16836547851562, + "logps/rejected": -165.93450927734375, + "loss": 0.3903, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.105189323425293, + "rewards/margins": 0.9789416193962097, + "rewards/rejected": -2.0841310024261475, + "step": 4085 + }, + { + "epoch": 0.47, + "learning_rate": 1.6111436263607633e-07, + "logits/chosen": -2.4273006916046143, + "logits/rejected": -2.4350085258483887, + "logps/chosen": -295.17156982421875, + "logps/rejected": -289.3701171875, + "loss": 0.4538, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6910216212272644, + "rewards/margins": 1.5434467792510986, + "rewards/rejected": -2.234468460083008, + "step": 4086 + }, + { + "epoch": 0.47, + "learning_rate": 1.6107924616645206e-07, + "logits/chosen": -2.2668564319610596, + "logits/rejected": -2.223090171813965, + "logps/chosen": -308.89654541015625, + "logps/rejected": -323.40106201171875, + "loss": 0.9993, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.814495325088501, + "rewards/margins": 2.513817310333252, + "rewards/rejected": -4.328312873840332, + "step": 4087 + }, + { + "epoch": 0.47, + "learning_rate": 1.6104412969682781e-07, + "logits/chosen": -1.9297471046447754, + "logits/rejected": -2.590965986251831, + "logps/chosen": -432.3721923828125, + "logps/rejected": -207.759521484375, + "loss": 0.3593, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8051774501800537, + "rewards/margins": 2.1277620792388916, + "rewards/rejected": -2.9329395294189453, + "step": 4088 + }, + { + "epoch": 0.47, + "learning_rate": 1.6100901322720354e-07, + "logits/chosen": -2.3019065856933594, + "logits/rejected": -2.228194236755371, + "logps/chosen": -211.03785705566406, + "logps/rejected": -317.89361572265625, + "loss": 0.1646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3568490743637085, + "rewards/margins": 2.9417128562927246, + "rewards/rejected": -3.2985618114471436, + "step": 4089 + }, + { + "epoch": 0.47, + "learning_rate": 1.609738967575793e-07, + "logits/chosen": -2.5416154861450195, + "logits/rejected": -2.3691728115081787, + "logps/chosen": -305.0376281738281, + "logps/rejected": -453.8106384277344, + "loss": 0.1124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029879868030548096, + "rewards/margins": 3.7580971717834473, + "rewards/rejected": -3.7879767417907715, + "step": 4090 + }, + { + "epoch": 0.47, + "learning_rate": 1.6093878028795505e-07, + "logits/chosen": -2.6116135120391846, + "logits/rejected": -2.766451597213745, + "logps/chosen": -243.2606658935547, + "logps/rejected": -305.8946228027344, + "loss": 0.2433, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035979799926280975, + "rewards/margins": 2.8015971183776855, + "rewards/rejected": -2.7656173706054688, + "step": 4091 + }, + { + "epoch": 0.47, + "learning_rate": 1.6090366381833078e-07, + "logits/chosen": -2.620851993560791, + "logits/rejected": -2.635206460952759, + "logps/chosen": -245.22756958007812, + "logps/rejected": -260.5716857910156, + "loss": 0.3012, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1723254919052124, + "rewards/margins": 2.6529905796051025, + "rewards/rejected": -3.8253161907196045, + "step": 4092 + }, + { + "epoch": 0.47, + "learning_rate": 1.6086854734870653e-07, + "logits/chosen": -1.8812451362609863, + "logits/rejected": -2.27131986618042, + "logps/chosen": -286.4471130371094, + "logps/rejected": -174.0018310546875, + "loss": 0.5735, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06690216064453125, + "rewards/margins": 0.860946774482727, + "rewards/rejected": -0.7940446138381958, + "step": 4093 + }, + { + "epoch": 0.47, + "learning_rate": 1.6083343087908226e-07, + "logits/chosen": -2.3856921195983887, + "logits/rejected": -2.2913973331451416, + "logps/chosen": -299.47979736328125, + "logps/rejected": -304.3179016113281, + "loss": 0.5325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48592814803123474, + "rewards/margins": 1.6610703468322754, + "rewards/rejected": -2.146998405456543, + "step": 4094 + }, + { + "epoch": 0.47, + "learning_rate": 1.60798314409458e-07, + "logits/chosen": -1.8686554431915283, + "logits/rejected": -2.008744955062866, + "logps/chosen": -333.26544189453125, + "logps/rejected": -262.12164306640625, + "loss": 0.2578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7544898986816406, + "rewards/margins": 1.648327350616455, + "rewards/rejected": -2.4028172492980957, + "step": 4095 + }, + { + "epoch": 0.47, + "learning_rate": 1.607631979398338e-07, + "logits/chosen": -2.6755104064941406, + "logits/rejected": -2.4837756156921387, + "logps/chosen": -430.01116943359375, + "logps/rejected": -269.9899597167969, + "loss": 0.2356, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1211445331573486, + "rewards/margins": 2.1278300285339355, + "rewards/rejected": -3.248974561691284, + "step": 4096 + }, + { + "epoch": 0.47, + "learning_rate": 1.6072808147020952e-07, + "logits/chosen": -2.052738666534424, + "logits/rejected": -2.225803852081299, + "logps/chosen": -379.5043640136719, + "logps/rejected": -310.9878845214844, + "loss": 0.3543, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.080625057220459, + "rewards/margins": 1.6158733367919922, + "rewards/rejected": -3.696498394012451, + "step": 4097 + }, + { + "epoch": 0.47, + "learning_rate": 1.6069296500058527e-07, + "logits/chosen": -2.389155626296997, + "logits/rejected": -2.16074800491333, + "logps/chosen": -313.771728515625, + "logps/rejected": -308.56829833984375, + "loss": 0.3571, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5434824228286743, + "rewards/margins": 1.8958373069763184, + "rewards/rejected": -2.439319610595703, + "step": 4098 + }, + { + "epoch": 0.47, + "learning_rate": 1.6065784853096103e-07, + "logits/chosen": -2.680276870727539, + "logits/rejected": -2.397618055343628, + "logps/chosen": -320.61236572265625, + "logps/rejected": -335.6766357421875, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25358107686042786, + "rewards/margins": 3.506592273712158, + "rewards/rejected": -3.7601735591888428, + "step": 4099 + }, + { + "epoch": 0.47, + "learning_rate": 1.6062273206133675e-07, + "logits/chosen": -2.1493752002716064, + "logits/rejected": -2.094426155090332, + "logps/chosen": -181.9271697998047, + "logps/rejected": -245.17166137695312, + "loss": 0.7622, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1206516027450562, + "rewards/margins": 0.16471268236637115, + "rewards/rejected": -1.2853642702102661, + "step": 4100 + }, + { + "epoch": 0.47, + "learning_rate": 1.605876155917125e-07, + "logits/chosen": -2.214700937271118, + "logits/rejected": -2.1615262031555176, + "logps/chosen": -461.1577453613281, + "logps/rejected": -356.3816833496094, + "loss": 0.1527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5510581731796265, + "rewards/margins": 3.4639813899993896, + "rewards/rejected": -4.015039443969727, + "step": 4101 + }, + { + "epoch": 0.47, + "learning_rate": 1.6055249912208824e-07, + "logits/chosen": -1.7918611764907837, + "logits/rejected": -1.999833345413208, + "logps/chosen": -415.18255615234375, + "logps/rejected": -322.076904296875, + "loss": 0.3256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6571744084358215, + "rewards/margins": 1.4385273456573486, + "rewards/rejected": -2.0957016944885254, + "step": 4102 + }, + { + "epoch": 0.47, + "learning_rate": 1.60517382652464e-07, + "logits/chosen": -2.049405574798584, + "logits/rejected": -2.3245208263397217, + "logps/chosen": -453.4102478027344, + "logps/rejected": -290.8628845214844, + "loss": 0.3477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.896887481212616, + "rewards/margins": 2.0989582538604736, + "rewards/rejected": -2.9958457946777344, + "step": 4103 + }, + { + "epoch": 0.47, + "learning_rate": 1.6048226618283974e-07, + "logits/chosen": -2.676017999649048, + "logits/rejected": -2.6510469913482666, + "logps/chosen": -244.7408447265625, + "logps/rejected": -215.36703491210938, + "loss": 0.4083, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7918804287910461, + "rewards/margins": 1.309813380241394, + "rewards/rejected": -2.101693630218506, + "step": 4104 + }, + { + "epoch": 0.47, + "learning_rate": 1.6044714971321547e-07, + "logits/chosen": -1.9865262508392334, + "logits/rejected": -1.8012027740478516, + "logps/chosen": -340.88836669921875, + "logps/rejected": -303.94061279296875, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8469325304031372, + "rewards/margins": 1.9833999872207642, + "rewards/rejected": -2.8303327560424805, + "step": 4105 + }, + { + "epoch": 0.47, + "learning_rate": 1.6041203324359123e-07, + "logits/chosen": -2.21055006980896, + "logits/rejected": -2.335566997528076, + "logps/chosen": -224.20643615722656, + "logps/rejected": -343.764892578125, + "loss": 0.3434, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6984950304031372, + "rewards/margins": 3.2691938877105713, + "rewards/rejected": -3.967689037322998, + "step": 4106 + }, + { + "epoch": 0.47, + "learning_rate": 1.60376916773967e-07, + "logits/chosen": -2.493095874786377, + "logits/rejected": -2.4995431900024414, + "logps/chosen": -355.13201904296875, + "logps/rejected": -223.3712158203125, + "loss": 0.2454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44106796383857727, + "rewards/margins": 1.6789355278015137, + "rewards/rejected": -2.1200034618377686, + "step": 4107 + }, + { + "epoch": 0.47, + "learning_rate": 1.6034180030434273e-07, + "logits/chosen": -2.2122509479522705, + "logits/rejected": -2.2859811782836914, + "logps/chosen": -350.52020263671875, + "logps/rejected": -323.9977111816406, + "loss": 0.3997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.519598126411438, + "rewards/margins": 1.9371310472488403, + "rewards/rejected": -2.4567294120788574, + "step": 4108 + }, + { + "epoch": 0.47, + "learning_rate": 1.6030668383471849e-07, + "logits/chosen": -2.5853497982025146, + "logits/rejected": -2.7037994861602783, + "logps/chosen": -220.35464477539062, + "logps/rejected": -206.154296875, + "loss": 0.2321, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.039069652557373, + "rewards/margins": 2.387359380722046, + "rewards/rejected": -3.426429033279419, + "step": 4109 + }, + { + "epoch": 0.47, + "learning_rate": 1.6027156736509421e-07, + "logits/chosen": -2.253620147705078, + "logits/rejected": -2.4505395889282227, + "logps/chosen": -366.33551025390625, + "logps/rejected": -323.8794860839844, + "loss": 0.2904, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8987255096435547, + "rewards/margins": 3.008237838745117, + "rewards/rejected": -3.9069631099700928, + "step": 4110 + }, + { + "epoch": 0.47, + "learning_rate": 1.6023645089546997e-07, + "logits/chosen": -2.1815428733825684, + "logits/rejected": -2.144838809967041, + "logps/chosen": -350.3526916503906, + "logps/rejected": -494.51776123046875, + "loss": 0.4087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6864048838615417, + "rewards/margins": 3.230854034423828, + "rewards/rejected": -3.9172587394714355, + "step": 4111 + }, + { + "epoch": 0.47, + "learning_rate": 1.6020133442584572e-07, + "logits/chosen": -2.9528610706329346, + "logits/rejected": -2.9794788360595703, + "logps/chosen": -103.31685638427734, + "logps/rejected": -240.0576171875, + "loss": 0.1566, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4556208848953247, + "rewards/margins": 4.5543413162231445, + "rewards/rejected": -6.009962558746338, + "step": 4112 + }, + { + "epoch": 0.47, + "learning_rate": 1.6016621795622145e-07, + "logits/chosen": -2.2761178016662598, + "logits/rejected": -2.3433048725128174, + "logps/chosen": -240.533447265625, + "logps/rejected": -187.36013793945312, + "loss": 0.471, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7103487253189087, + "rewards/margins": 1.0403754711151123, + "rewards/rejected": -2.7507243156433105, + "step": 4113 + }, + { + "epoch": 0.47, + "learning_rate": 1.601311014865972e-07, + "logits/chosen": -2.4477880001068115, + "logits/rejected": -2.4188578128814697, + "logps/chosen": -215.58682250976562, + "logps/rejected": -273.8023986816406, + "loss": 0.6308, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0395787954330444, + "rewards/margins": 0.6056231260299683, + "rewards/rejected": -1.6452019214630127, + "step": 4114 + }, + { + "epoch": 0.47, + "learning_rate": 1.6009598501697296e-07, + "logits/chosen": -2.7016384601593018, + "logits/rejected": -2.6957509517669678, + "logps/chosen": -97.50545501708984, + "logps/rejected": -182.89883422851562, + "loss": 0.3766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4085582494735718, + "rewards/margins": 1.6369932889938354, + "rewards/rejected": -3.0455517768859863, + "step": 4115 + }, + { + "epoch": 0.47, + "learning_rate": 1.6006086854734868e-07, + "logits/chosen": -2.3534984588623047, + "logits/rejected": -2.1655547618865967, + "logps/chosen": -166.0370330810547, + "logps/rejected": -179.06539916992188, + "loss": 0.7838, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.8781228065490723, + "rewards/margins": 1.3954730033874512, + "rewards/rejected": -4.273595809936523, + "step": 4116 + }, + { + "epoch": 0.47, + "learning_rate": 1.6002575207772446e-07, + "logits/chosen": -2.188701868057251, + "logits/rejected": -2.651878833770752, + "logps/chosen": -182.2012481689453, + "logps/rejected": -261.45599365234375, + "loss": 1.2838, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.291762113571167, + "rewards/margins": 1.006394863128662, + "rewards/rejected": -3.298156976699829, + "step": 4117 + }, + { + "epoch": 0.47, + "learning_rate": 1.5999063560810017e-07, + "logits/chosen": -2.981233835220337, + "logits/rejected": -3.0437474250793457, + "logps/chosen": -167.22793579101562, + "logps/rejected": -201.64398193359375, + "loss": 0.3534, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7238649129867554, + "rewards/margins": 2.5359601974487305, + "rewards/rejected": -3.2598252296447754, + "step": 4118 + }, + { + "epoch": 0.47, + "learning_rate": 1.5995551913847595e-07, + "logits/chosen": -2.3007283210754395, + "logits/rejected": -2.3206734657287598, + "logps/chosen": -272.58050537109375, + "logps/rejected": -234.76773071289062, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13335701823234558, + "rewards/margins": 4.070335388183594, + "rewards/rejected": -3.9369781017303467, + "step": 4119 + }, + { + "epoch": 0.47, + "learning_rate": 1.599204026688517e-07, + "logits/chosen": -2.6119110584259033, + "logits/rejected": -2.921652317047119, + "logps/chosen": -226.768798828125, + "logps/rejected": -213.60366821289062, + "loss": 0.0664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2561192214488983, + "rewards/margins": 3.7467589378356934, + "rewards/rejected": -4.002878665924072, + "step": 4120 + }, + { + "epoch": 0.48, + "learning_rate": 1.5988528619922743e-07, + "logits/chosen": -2.770692825317383, + "logits/rejected": -2.5862627029418945, + "logps/chosen": -159.471435546875, + "logps/rejected": -329.1957702636719, + "loss": 0.1969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41988566517829895, + "rewards/margins": 2.384624719619751, + "rewards/rejected": -2.8045105934143066, + "step": 4121 + }, + { + "epoch": 0.48, + "learning_rate": 1.5985016972960318e-07, + "logits/chosen": -2.224642038345337, + "logits/rejected": -2.1729817390441895, + "logps/chosen": -184.18324279785156, + "logps/rejected": -236.4143524169922, + "loss": 0.4155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.858309268951416, + "rewards/margins": 2.4318552017211914, + "rewards/rejected": -3.2901644706726074, + "step": 4122 + }, + { + "epoch": 0.48, + "learning_rate": 1.5981505325997893e-07, + "logits/chosen": -2.388137102127075, + "logits/rejected": -2.3586230278015137, + "logps/chosen": -278.42144775390625, + "logps/rejected": -354.7926330566406, + "loss": 0.3377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9251165390014648, + "rewards/margins": 2.572152853012085, + "rewards/rejected": -3.4972691535949707, + "step": 4123 + }, + { + "epoch": 0.48, + "learning_rate": 1.5977993679035466e-07, + "logits/chosen": -2.114525556564331, + "logits/rejected": -2.218350410461426, + "logps/chosen": -245.8483123779297, + "logps/rejected": -268.71630859375, + "loss": 0.2734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40699297189712524, + "rewards/margins": 1.8667852878570557, + "rewards/rejected": -2.2737784385681152, + "step": 4124 + }, + { + "epoch": 0.48, + "learning_rate": 1.5974482032073042e-07, + "logits/chosen": -2.5183024406433105, + "logits/rejected": -2.4166555404663086, + "logps/chosen": -133.14590454101562, + "logps/rejected": -278.999755859375, + "loss": 0.2361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4440816044807434, + "rewards/margins": 2.914409875869751, + "rewards/rejected": -3.3584914207458496, + "step": 4125 + }, + { + "epoch": 0.48, + "learning_rate": 1.5970970385110614e-07, + "logits/chosen": -2.4100446701049805, + "logits/rejected": -2.400463342666626, + "logps/chosen": -178.9478759765625, + "logps/rejected": -268.37274169921875, + "loss": 0.4697, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.336674451828003, + "rewards/margins": 1.102486252784729, + "rewards/rejected": -2.4391608238220215, + "step": 4126 + }, + { + "epoch": 0.48, + "learning_rate": 1.596745873814819e-07, + "logits/chosen": -2.4325828552246094, + "logits/rejected": -2.271296501159668, + "logps/chosen": -231.3978271484375, + "logps/rejected": -432.86761474609375, + "loss": 0.2601, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.334966242313385, + "rewards/margins": 3.3541409969329834, + "rewards/rejected": -3.6891074180603027, + "step": 4127 + }, + { + "epoch": 0.48, + "learning_rate": 1.5963947091185768e-07, + "logits/chosen": -2.626314878463745, + "logits/rejected": -2.395801305770874, + "logps/chosen": -244.17604064941406, + "logps/rejected": -217.0062713623047, + "loss": 0.1329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7965819239616394, + "rewards/margins": 2.6526520252227783, + "rewards/rejected": -3.4492340087890625, + "step": 4128 + }, + { + "epoch": 0.48, + "learning_rate": 1.5960435444223338e-07, + "logits/chosen": -1.959313154220581, + "logits/rejected": -2.193256139755249, + "logps/chosen": -338.25189208984375, + "logps/rejected": -211.05941772460938, + "loss": 0.34, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7278993129730225, + "rewards/margins": 2.6708884239196777, + "rewards/rejected": -3.3987877368927, + "step": 4129 + }, + { + "epoch": 0.48, + "learning_rate": 1.5956923797260916e-07, + "logits/chosen": -2.5440022945404053, + "logits/rejected": -2.381474494934082, + "logps/chosen": -316.13983154296875, + "logps/rejected": -326.9100646972656, + "loss": 0.583, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6632647514343262, + "rewards/margins": 1.9797664880752563, + "rewards/rejected": -3.643031597137451, + "step": 4130 + }, + { + "epoch": 0.48, + "learning_rate": 1.595341215029849e-07, + "logits/chosen": -1.3594532012939453, + "logits/rejected": -1.851884365081787, + "logps/chosen": -270.5323181152344, + "logps/rejected": -226.07847595214844, + "loss": 0.3894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6485265493392944, + "rewards/margins": 1.5158462524414062, + "rewards/rejected": -2.1643729209899902, + "step": 4131 + }, + { + "epoch": 0.48, + "learning_rate": 1.5949900503336064e-07, + "logits/chosen": -2.2382326126098633, + "logits/rejected": -2.122321367263794, + "logps/chosen": -267.213134765625, + "logps/rejected": -359.9280090332031, + "loss": 0.304, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0484862327575684, + "rewards/margins": 2.135158061981201, + "rewards/rejected": -3.1836442947387695, + "step": 4132 + }, + { + "epoch": 0.48, + "learning_rate": 1.594638885637364e-07, + "logits/chosen": -2.554311513900757, + "logits/rejected": -2.787567138671875, + "logps/chosen": -261.00152587890625, + "logps/rejected": -143.1907196044922, + "loss": 0.5019, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6739442944526672, + "rewards/margins": 0.5986015200614929, + "rewards/rejected": -1.2725458145141602, + "step": 4133 + }, + { + "epoch": 0.48, + "learning_rate": 1.5942877209411212e-07, + "logits/chosen": -2.754499673843384, + "logits/rejected": -2.6411068439483643, + "logps/chosen": -314.885986328125, + "logps/rejected": -285.037841796875, + "loss": 0.2602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.967390239238739, + "rewards/margins": 3.0562803745269775, + "rewards/rejected": -4.023670196533203, + "step": 4134 + }, + { + "epoch": 0.48, + "learning_rate": 1.5939365562448788e-07, + "logits/chosen": -2.5069053173065186, + "logits/rejected": -2.3995325565338135, + "logps/chosen": -319.34393310546875, + "logps/rejected": -208.58578491210938, + "loss": 0.4326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9902583360671997, + "rewards/margins": 1.0978554487228394, + "rewards/rejected": -2.088113784790039, + "step": 4135 + }, + { + "epoch": 0.48, + "learning_rate": 1.5935853915486363e-07, + "logits/chosen": -1.5497431755065918, + "logits/rejected": -1.5971484184265137, + "logps/chosen": -179.87103271484375, + "logps/rejected": -167.39955139160156, + "loss": 0.8135, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7674784660339355, + "rewards/margins": 0.5948567986488342, + "rewards/rejected": -2.362335205078125, + "step": 4136 + }, + { + "epoch": 0.48, + "learning_rate": 1.5932342268523936e-07, + "logits/chosen": -2.4997830390930176, + "logits/rejected": -2.5477869510650635, + "logps/chosen": -305.07769775390625, + "logps/rejected": -314.6468505859375, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1144816875457764, + "rewards/margins": 4.791143894195557, + "rewards/rejected": -5.905625820159912, + "step": 4137 + }, + { + "epoch": 0.48, + "learning_rate": 1.592883062156151e-07, + "logits/chosen": -2.466730833053589, + "logits/rejected": -2.7269937992095947, + "logps/chosen": -462.5062255859375, + "logps/rejected": -271.11029052734375, + "loss": 0.1835, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6222299337387085, + "rewards/margins": 3.1108789443969727, + "rewards/rejected": -3.7331089973449707, + "step": 4138 + }, + { + "epoch": 0.48, + "learning_rate": 1.592531897459909e-07, + "logits/chosen": -2.6731209754943848, + "logits/rejected": -2.6647419929504395, + "logps/chosen": -317.65283203125, + "logps/rejected": -206.77825927734375, + "loss": 0.4653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8506927490234375, + "rewards/margins": 1.246875524520874, + "rewards/rejected": -2.0975685119628906, + "step": 4139 + }, + { + "epoch": 0.48, + "learning_rate": 1.592180732763666e-07, + "logits/chosen": -2.284512758255005, + "logits/rejected": -2.5882740020751953, + "logps/chosen": -159.36477661132812, + "logps/rejected": -157.8679962158203, + "loss": 0.3991, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7417755126953125, + "rewards/margins": 2.8457632064819336, + "rewards/rejected": -3.587538719177246, + "step": 4140 + }, + { + "epoch": 0.48, + "learning_rate": 1.5918295680674237e-07, + "logits/chosen": -2.4596683979034424, + "logits/rejected": -2.229177713394165, + "logps/chosen": -278.3698425292969, + "logps/rejected": -291.2955322265625, + "loss": 0.1621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7678849697113037, + "rewards/margins": 1.8762940168380737, + "rewards/rejected": -2.644179105758667, + "step": 4141 + }, + { + "epoch": 0.48, + "learning_rate": 1.591478403371181e-07, + "logits/chosen": -1.9782418012619019, + "logits/rejected": -2.076907157897949, + "logps/chosen": -427.3097229003906, + "logps/rejected": -463.1794128417969, + "loss": 0.572, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5051898956298828, + "rewards/margins": 2.133700370788574, + "rewards/rejected": -3.638890027999878, + "step": 4142 + }, + { + "epoch": 0.48, + "learning_rate": 1.5911272386749385e-07, + "logits/chosen": -2.626478672027588, + "logits/rejected": -2.707528591156006, + "logps/chosen": -132.080810546875, + "logps/rejected": -211.56423950195312, + "loss": 0.3714, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.249619960784912, + "rewards/margins": 2.0457375049591064, + "rewards/rejected": -3.2953577041625977, + "step": 4143 + }, + { + "epoch": 0.48, + "learning_rate": 1.590776073978696e-07, + "logits/chosen": -2.3217086791992188, + "logits/rejected": -2.5177359580993652, + "logps/chosen": -302.87451171875, + "logps/rejected": -294.3072509765625, + "loss": 0.3241, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0505828857421875, + "rewards/margins": 2.727006673812866, + "rewards/rejected": -3.7775895595550537, + "step": 4144 + }, + { + "epoch": 0.48, + "learning_rate": 1.5904249092824533e-07, + "logits/chosen": -2.4793570041656494, + "logits/rejected": -2.3330917358398438, + "logps/chosen": -182.84347534179688, + "logps/rejected": -374.9985656738281, + "loss": 0.1176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31762275099754333, + "rewards/margins": 3.307727336883545, + "rewards/rejected": -3.625349998474121, + "step": 4145 + }, + { + "epoch": 0.48, + "learning_rate": 1.590073744586211e-07, + "logits/chosen": -2.0509090423583984, + "logits/rejected": -2.029766798019409, + "logps/chosen": -313.476318359375, + "logps/rejected": -359.3482666015625, + "loss": 0.1483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1034265011548996, + "rewards/margins": 3.0615177154541016, + "rewards/rejected": -3.1649441719055176, + "step": 4146 + }, + { + "epoch": 0.48, + "learning_rate": 1.5897225798899682e-07, + "logits/chosen": -2.433865547180176, + "logits/rejected": -2.6803841590881348, + "logps/chosen": -190.62301635742188, + "logps/rejected": -269.4442138671875, + "loss": 0.3203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6896458864212036, + "rewards/margins": 2.8892924785614014, + "rewards/rejected": -3.5789382457733154, + "step": 4147 + }, + { + "epoch": 0.48, + "learning_rate": 1.5893714151937257e-07, + "logits/chosen": -2.2779541015625, + "logits/rejected": -2.132328510284424, + "logps/chosen": -364.697265625, + "logps/rejected": -228.86489868164062, + "loss": 0.6318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8573930263519287, + "rewards/margins": 1.3059866428375244, + "rewards/rejected": -2.163379669189453, + "step": 4148 + }, + { + "epoch": 0.48, + "learning_rate": 1.5890202504974832e-07, + "logits/chosen": -1.8976415395736694, + "logits/rejected": -1.9833409786224365, + "logps/chosen": -279.84814453125, + "logps/rejected": -338.28277587890625, + "loss": 0.2234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5996190309524536, + "rewards/margins": 3.241558313369751, + "rewards/rejected": -3.841177463531494, + "step": 4149 + }, + { + "epoch": 0.48, + "learning_rate": 1.5886690858012405e-07, + "logits/chosen": -2.2912840843200684, + "logits/rejected": -2.252549171447754, + "logps/chosen": -335.2020263671875, + "logps/rejected": -289.32025146484375, + "loss": 0.7073, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3236135244369507, + "rewards/margins": 1.5012534856796265, + "rewards/rejected": -2.824867010116577, + "step": 4150 + }, + { + "epoch": 0.48, + "learning_rate": 1.5883179211049983e-07, + "logits/chosen": -2.8527450561523438, + "logits/rejected": -2.701615810394287, + "logps/chosen": -267.1980285644531, + "logps/rejected": -264.514892578125, + "loss": 0.618, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2040785551071167, + "rewards/margins": 3.286198854446411, + "rewards/rejected": -4.490277290344238, + "step": 4151 + }, + { + "epoch": 0.48, + "learning_rate": 1.5879667564087558e-07, + "logits/chosen": -2.224844217300415, + "logits/rejected": -2.327770233154297, + "logps/chosen": -292.5950927734375, + "logps/rejected": -339.3252258300781, + "loss": 0.3582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8177800178527832, + "rewards/margins": 2.0582456588745117, + "rewards/rejected": -2.876025676727295, + "step": 4152 + }, + { + "epoch": 0.48, + "learning_rate": 1.587615591712513e-07, + "logits/chosen": -2.4829635620117188, + "logits/rejected": -2.152556896209717, + "logps/chosen": -152.16592407226562, + "logps/rejected": -384.2834167480469, + "loss": 0.4363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7179359197616577, + "rewards/margins": 2.090024471282959, + "rewards/rejected": -2.8079605102539062, + "step": 4153 + }, + { + "epoch": 0.48, + "learning_rate": 1.5872644270162707e-07, + "logits/chosen": -2.223649024963379, + "logits/rejected": -2.201105833053589, + "logps/chosen": -403.2941589355469, + "logps/rejected": -383.1272277832031, + "loss": 0.2936, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8864720463752747, + "rewards/margins": 2.2607617378234863, + "rewards/rejected": -3.1472339630126953, + "step": 4154 + }, + { + "epoch": 0.48, + "learning_rate": 1.586913262320028e-07, + "logits/chosen": -2.2886528968811035, + "logits/rejected": -2.300713062286377, + "logps/chosen": -157.64334106445312, + "logps/rejected": -191.28407287597656, + "loss": 0.6412, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1714521646499634, + "rewards/margins": 1.5574887990951538, + "rewards/rejected": -2.728940725326538, + "step": 4155 + }, + { + "epoch": 0.48, + "learning_rate": 1.5865620976237855e-07, + "logits/chosen": -2.5842690467834473, + "logits/rejected": -2.4732303619384766, + "logps/chosen": -252.60772705078125, + "logps/rejected": -283.82012939453125, + "loss": 0.1572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.934379518032074, + "rewards/margins": 2.853848457336426, + "rewards/rejected": -3.7882277965545654, + "step": 4156 + }, + { + "epoch": 0.48, + "learning_rate": 1.586210932927543e-07, + "logits/chosen": -2.318422317504883, + "logits/rejected": -2.062633752822876, + "logps/chosen": -238.79763793945312, + "logps/rejected": -165.0416259765625, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4599366784095764, + "rewards/margins": 1.8255484104156494, + "rewards/rejected": -2.28548526763916, + "step": 4157 + }, + { + "epoch": 0.48, + "learning_rate": 1.5858597682313003e-07, + "logits/chosen": -2.2407712936401367, + "logits/rejected": -2.3128018379211426, + "logps/chosen": -412.84918212890625, + "logps/rejected": -332.1477966308594, + "loss": 0.5167, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0311179161071777, + "rewards/margins": 1.393491506576538, + "rewards/rejected": -2.424609422683716, + "step": 4158 + }, + { + "epoch": 0.48, + "learning_rate": 1.5855086035350578e-07, + "logits/chosen": -2.1998698711395264, + "logits/rejected": -2.351351499557495, + "logps/chosen": -376.69232177734375, + "logps/rejected": -308.1649169921875, + "loss": 0.1676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9276732802391052, + "rewards/margins": 2.298452138900757, + "rewards/rejected": -3.226125478744507, + "step": 4159 + }, + { + "epoch": 0.48, + "learning_rate": 1.5851574388388154e-07, + "logits/chosen": -2.1270430088043213, + "logits/rejected": -2.299438953399658, + "logps/chosen": -188.701171875, + "logps/rejected": -197.82965087890625, + "loss": 0.2089, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24678878486156464, + "rewards/margins": 3.18898344039917, + "rewards/rejected": -3.435772180557251, + "step": 4160 + }, + { + "epoch": 0.48, + "learning_rate": 1.5848062741425726e-07, + "logits/chosen": -2.4522087574005127, + "logits/rejected": -2.4590823650360107, + "logps/chosen": -186.88641357421875, + "logps/rejected": -263.5754089355469, + "loss": 0.5095, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6921917200088501, + "rewards/margins": 2.1810922622680664, + "rewards/rejected": -2.873283863067627, + "step": 4161 + }, + { + "epoch": 0.48, + "learning_rate": 1.5844551094463304e-07, + "logits/chosen": -2.127992868423462, + "logits/rejected": -1.7686344385147095, + "logps/chosen": -178.13864135742188, + "logps/rejected": -456.8495178222656, + "loss": 1.0401, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5752566456794739, + "rewards/margins": 0.5597215294837952, + "rewards/rejected": -1.134978175163269, + "step": 4162 + }, + { + "epoch": 0.48, + "learning_rate": 1.5841039447500874e-07, + "logits/chosen": -2.133579730987549, + "logits/rejected": -2.1404294967651367, + "logps/chosen": -195.21795654296875, + "logps/rejected": -180.37869262695312, + "loss": 0.4196, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6183388233184814, + "rewards/margins": 1.4149963855743408, + "rewards/rejected": -2.0333352088928223, + "step": 4163 + }, + { + "epoch": 0.48, + "learning_rate": 1.5837527800538453e-07, + "logits/chosen": -2.5462987422943115, + "logits/rejected": -2.621774911880493, + "logps/chosen": -220.75314331054688, + "logps/rejected": -210.02359008789062, + "loss": 0.5567, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5683330297470093, + "rewards/margins": 0.9298300743103027, + "rewards/rejected": -1.498163104057312, + "step": 4164 + }, + { + "epoch": 0.48, + "learning_rate": 1.5834016153576028e-07, + "logits/chosen": -2.941664695739746, + "logits/rejected": -3.0032052993774414, + "logps/chosen": -297.5798034667969, + "logps/rejected": -314.6065979003906, + "loss": 0.9053, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.723264217376709, + "rewards/margins": -0.08413748443126678, + "rewards/rejected": -1.6391267776489258, + "step": 4165 + }, + { + "epoch": 0.48, + "learning_rate": 1.58305045066136e-07, + "logits/chosen": -2.731733798980713, + "logits/rejected": -2.851516008377075, + "logps/chosen": -174.22412109375, + "logps/rejected": -172.7543487548828, + "loss": 0.8541, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7943819761276245, + "rewards/margins": 0.7109643220901489, + "rewards/rejected": -2.5053465366363525, + "step": 4166 + }, + { + "epoch": 0.48, + "learning_rate": 1.5826992859651176e-07, + "logits/chosen": -2.062631130218506, + "logits/rejected": -1.977992296218872, + "logps/chosen": -273.81805419921875, + "logps/rejected": -220.37762451171875, + "loss": 0.3834, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3273848295211792, + "rewards/margins": 2.1761600971221924, + "rewards/rejected": -3.5035452842712402, + "step": 4167 + }, + { + "epoch": 0.48, + "learning_rate": 1.5823481212688751e-07, + "logits/chosen": -2.019779920578003, + "logits/rejected": -2.2156224250793457, + "logps/chosen": -356.23077392578125, + "logps/rejected": -252.92385864257812, + "loss": 0.238, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2603971064090729, + "rewards/margins": 2.2756242752075195, + "rewards/rejected": -2.5360212326049805, + "step": 4168 + }, + { + "epoch": 0.48, + "learning_rate": 1.5819969565726324e-07, + "logits/chosen": -2.761948823928833, + "logits/rejected": -2.783050298690796, + "logps/chosen": -239.85260009765625, + "logps/rejected": -346.6163330078125, + "loss": 0.4679, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9648603796958923, + "rewards/margins": 2.18851900100708, + "rewards/rejected": -3.153379440307617, + "step": 4169 + }, + { + "epoch": 0.48, + "learning_rate": 1.58164579187639e-07, + "logits/chosen": -2.252058982849121, + "logits/rejected": -2.3212218284606934, + "logps/chosen": -261.46795654296875, + "logps/rejected": -364.33001708984375, + "loss": 0.5221, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5259119272232056, + "rewards/margins": 2.578752040863037, + "rewards/rejected": -3.104663848876953, + "step": 4170 + }, + { + "epoch": 0.48, + "learning_rate": 1.5812946271801472e-07, + "logits/chosen": -1.9707880020141602, + "logits/rejected": -2.2384183406829834, + "logps/chosen": -217.36849975585938, + "logps/rejected": -178.5031280517578, + "loss": 0.5912, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8278499841690063, + "rewards/margins": 1.1976183652877808, + "rewards/rejected": -2.025468349456787, + "step": 4171 + }, + { + "epoch": 0.48, + "learning_rate": 1.5809434624839048e-07, + "logits/chosen": -2.232827663421631, + "logits/rejected": -1.9887951612472534, + "logps/chosen": -265.96466064453125, + "logps/rejected": -301.56036376953125, + "loss": 0.606, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7419966459274292, + "rewards/margins": 1.1751800775527954, + "rewards/rejected": -2.9171767234802246, + "step": 4172 + }, + { + "epoch": 0.48, + "learning_rate": 1.5805922977876626e-07, + "logits/chosen": -2.4598379135131836, + "logits/rejected": -2.5044162273406982, + "logps/chosen": -167.71823120117188, + "logps/rejected": -268.15142822265625, + "loss": 0.2006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24355149269104004, + "rewards/margins": 3.159034252166748, + "rewards/rejected": -3.402585744857788, + "step": 4173 + }, + { + "epoch": 0.48, + "learning_rate": 1.5802411330914196e-07, + "logits/chosen": -2.1993398666381836, + "logits/rejected": -2.119417667388916, + "logps/chosen": -451.8354187011719, + "logps/rejected": -338.43505859375, + "loss": 0.3433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9143760800361633, + "rewards/margins": 1.9470711946487427, + "rewards/rejected": -2.861447334289551, + "step": 4174 + }, + { + "epoch": 0.48, + "learning_rate": 1.5798899683951774e-07, + "logits/chosen": -1.9177865982055664, + "logits/rejected": -1.9559944868087769, + "logps/chosen": -242.1678924560547, + "logps/rejected": -285.38140869140625, + "loss": 0.6621, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.251367449760437, + "rewards/margins": 0.7926424145698547, + "rewards/rejected": -2.0440099239349365, + "step": 4175 + }, + { + "epoch": 0.48, + "learning_rate": 1.579538803698935e-07, + "logits/chosen": -2.2098188400268555, + "logits/rejected": -2.164543867111206, + "logps/chosen": -360.8795471191406, + "logps/rejected": -368.86676025390625, + "loss": 0.6281, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.601966381072998, + "rewards/margins": 2.2391862869262695, + "rewards/rejected": -3.8411524295806885, + "step": 4176 + }, + { + "epoch": 0.48, + "learning_rate": 1.5791876390026922e-07, + "logits/chosen": -1.74921452999115, + "logits/rejected": -2.2628374099731445, + "logps/chosen": -444.75189208984375, + "logps/rejected": -221.17611694335938, + "loss": 0.6255, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5487383604049683, + "rewards/margins": 0.6359565258026123, + "rewards/rejected": -1.1846948862075806, + "step": 4177 + }, + { + "epoch": 0.48, + "learning_rate": 1.5788364743064497e-07, + "logits/chosen": -2.0128982067108154, + "logits/rejected": -2.072018623352051, + "logps/chosen": -349.3851318359375, + "logps/rejected": -352.5478820800781, + "loss": 0.3764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.780817985534668, + "rewards/margins": 2.2114312648773193, + "rewards/rejected": -2.992249011993408, + "step": 4178 + }, + { + "epoch": 0.48, + "learning_rate": 1.578485309610207e-07, + "logits/chosen": -2.5062062740325928, + "logits/rejected": -2.452104330062866, + "logps/chosen": -356.3409423828125, + "logps/rejected": -408.43927001953125, + "loss": 0.1523, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6049448251724243, + "rewards/margins": 3.560317277908325, + "rewards/rejected": -4.165262222290039, + "step": 4179 + }, + { + "epoch": 0.48, + "learning_rate": 1.5781341449139645e-07, + "logits/chosen": -1.3511056900024414, + "logits/rejected": -1.2999587059020996, + "logps/chosen": -328.5299072265625, + "logps/rejected": -413.5652770996094, + "loss": 0.4736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8717795610427856, + "rewards/margins": 1.4121413230895996, + "rewards/rejected": -2.283921003341675, + "step": 4180 + }, + { + "epoch": 0.48, + "learning_rate": 1.577782980217722e-07, + "logits/chosen": -2.6420369148254395, + "logits/rejected": -2.5662994384765625, + "logps/chosen": -328.170166015625, + "logps/rejected": -209.39559936523438, + "loss": 0.5274, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32616037130355835, + "rewards/margins": 1.5811705589294434, + "rewards/rejected": -1.9073309898376465, + "step": 4181 + }, + { + "epoch": 0.48, + "learning_rate": 1.5774318155214794e-07, + "logits/chosen": -2.2510910034179688, + "logits/rejected": -2.51155686378479, + "logps/chosen": -409.47528076171875, + "logps/rejected": -408.4935302734375, + "loss": 0.114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3239731788635254, + "rewards/margins": 3.3085262775421143, + "rewards/rejected": -4.632499694824219, + "step": 4182 + }, + { + "epoch": 0.48, + "learning_rate": 1.577080650825237e-07, + "logits/chosen": -2.4774065017700195, + "logits/rejected": -2.731654167175293, + "logps/chosen": -210.55316162109375, + "logps/rejected": -223.44586181640625, + "loss": 0.2132, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0843593031167984, + "rewards/margins": 2.6510140895843506, + "rewards/rejected": -2.7353734970092773, + "step": 4183 + }, + { + "epoch": 0.48, + "learning_rate": 1.5767294861289947e-07, + "logits/chosen": -2.60205078125, + "logits/rejected": -2.595266819000244, + "logps/chosen": -246.28326416015625, + "logps/rejected": -297.328125, + "loss": 0.2773, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6477826833724976, + "rewards/margins": 2.172811985015869, + "rewards/rejected": -2.8205947875976562, + "step": 4184 + }, + { + "epoch": 0.48, + "learning_rate": 1.576378321432752e-07, + "logits/chosen": -2.345221996307373, + "logits/rejected": -2.2019970417022705, + "logps/chosen": -173.53306579589844, + "logps/rejected": -173.51824951171875, + "loss": 0.9343, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8028907179832458, + "rewards/margins": -0.13566173613071442, + "rewards/rejected": -0.6672289371490479, + "step": 4185 + }, + { + "epoch": 0.48, + "learning_rate": 1.5760271567365095e-07, + "logits/chosen": -2.483412981033325, + "logits/rejected": -2.317060708999634, + "logps/chosen": -157.41552734375, + "logps/rejected": -197.85037231445312, + "loss": 1.9932, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.8626821041107178, + "rewards/margins": -0.3348003625869751, + "rewards/rejected": -2.527881622314453, + "step": 4186 + }, + { + "epoch": 0.48, + "learning_rate": 1.5756759920402668e-07, + "logits/chosen": -2.3113174438476562, + "logits/rejected": -2.416959762573242, + "logps/chosen": -273.7010803222656, + "logps/rejected": -183.18905639648438, + "loss": 0.482, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7906876802444458, + "rewards/margins": 1.261271357536316, + "rewards/rejected": -2.0519590377807617, + "step": 4187 + }, + { + "epoch": 0.48, + "learning_rate": 1.5753248273440243e-07, + "logits/chosen": -2.9394290447235107, + "logits/rejected": -3.05859375, + "logps/chosen": -229.81533813476562, + "logps/rejected": -323.6961669921875, + "loss": 0.114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5726364254951477, + "rewards/margins": 5.526298999786377, + "rewards/rejected": -6.098935604095459, + "step": 4188 + }, + { + "epoch": 0.48, + "learning_rate": 1.5749736626477819e-07, + "logits/chosen": -2.3116559982299805, + "logits/rejected": -2.5464742183685303, + "logps/chosen": -357.37060546875, + "logps/rejected": -343.5750427246094, + "loss": 0.6665, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2206343412399292, + "rewards/margins": 2.9676380157470703, + "rewards/rejected": -4.188271999359131, + "step": 4189 + }, + { + "epoch": 0.48, + "learning_rate": 1.5746224979515391e-07, + "logits/chosen": -2.4653797149658203, + "logits/rejected": -2.3089988231658936, + "logps/chosen": -177.77157592773438, + "logps/rejected": -239.54403686523438, + "loss": 0.1654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12394649535417557, + "rewards/margins": 1.9799761772155762, + "rewards/rejected": -2.1039228439331055, + "step": 4190 + }, + { + "epoch": 0.48, + "learning_rate": 1.5742713332552967e-07, + "logits/chosen": -2.3129212856292725, + "logits/rejected": -2.4944605827331543, + "logps/chosen": -371.2263488769531, + "logps/rejected": -311.212646484375, + "loss": 0.3672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5533415675163269, + "rewards/margins": 2.0058326721191406, + "rewards/rejected": -2.5591742992401123, + "step": 4191 + }, + { + "epoch": 0.48, + "learning_rate": 1.573920168559054e-07, + "logits/chosen": -2.424862861633301, + "logits/rejected": -1.990969181060791, + "logps/chosen": -209.43690490722656, + "logps/rejected": -350.44989013671875, + "loss": 0.4467, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3192787170410156, + "rewards/margins": 1.2928987741470337, + "rewards/rejected": -2.6121773719787598, + "step": 4192 + }, + { + "epoch": 0.48, + "learning_rate": 1.5735690038628115e-07, + "logits/chosen": -2.269038200378418, + "logits/rejected": -2.1604180335998535, + "logps/chosen": -210.59222412109375, + "logps/rejected": -306.13525390625, + "loss": 0.521, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7968363761901855, + "rewards/margins": 1.1645770072937012, + "rewards/rejected": -2.9614133834838867, + "step": 4193 + }, + { + "epoch": 0.48, + "learning_rate": 1.573217839166569e-07, + "logits/chosen": -2.3623123168945312, + "logits/rejected": -2.3036844730377197, + "logps/chosen": -204.74940490722656, + "logps/rejected": -182.43307495117188, + "loss": 0.3184, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4180070161819458, + "rewards/margins": 1.582663655281067, + "rewards/rejected": -3.0006706714630127, + "step": 4194 + }, + { + "epoch": 0.48, + "learning_rate": 1.5728666744703263e-07, + "logits/chosen": -1.8767311573028564, + "logits/rejected": -2.178133964538574, + "logps/chosen": -514.9400634765625, + "logps/rejected": -380.70928955078125, + "loss": 0.3315, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46475064754486084, + "rewards/margins": 2.0715198516845703, + "rewards/rejected": -2.5362706184387207, + "step": 4195 + }, + { + "epoch": 0.48, + "learning_rate": 1.572515509774084e-07, + "logits/chosen": -2.28539776802063, + "logits/rejected": -2.297421455383301, + "logps/chosen": -238.46913146972656, + "logps/rejected": -204.91891479492188, + "loss": 0.4692, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3552684783935547, + "rewards/margins": 0.8254589438438416, + "rewards/rejected": -2.180727243423462, + "step": 4196 + }, + { + "epoch": 0.48, + "learning_rate": 1.5721643450778416e-07, + "logits/chosen": -1.8936126232147217, + "logits/rejected": -1.9324592351913452, + "logps/chosen": -225.9920654296875, + "logps/rejected": -231.76600646972656, + "loss": 0.1998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5858715772628784, + "rewards/margins": 2.5559332370758057, + "rewards/rejected": -3.1418046951293945, + "step": 4197 + }, + { + "epoch": 0.48, + "learning_rate": 1.571813180381599e-07, + "logits/chosen": -2.200374126434326, + "logits/rejected": -2.1222965717315674, + "logps/chosen": -217.35279846191406, + "logps/rejected": -385.4371032714844, + "loss": 0.2089, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.80869460105896, + "rewards/margins": 3.3640785217285156, + "rewards/rejected": -4.172772407531738, + "step": 4198 + }, + { + "epoch": 0.48, + "learning_rate": 1.5714620156853565e-07, + "logits/chosen": -2.739654541015625, + "logits/rejected": -2.7420730590820312, + "logps/chosen": -264.2947692871094, + "logps/rejected": -214.26914978027344, + "loss": 0.5638, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1674028635025024, + "rewards/margins": 1.1353559494018555, + "rewards/rejected": -2.3027586936950684, + "step": 4199 + }, + { + "epoch": 0.48, + "learning_rate": 1.5711108509891137e-07, + "logits/chosen": -1.946958065032959, + "logits/rejected": -1.7098833322525024, + "logps/chosen": -242.86993408203125, + "logps/rejected": -260.48492431640625, + "loss": 0.8773, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7237279415130615, + "rewards/margins": 1.0430195331573486, + "rewards/rejected": -2.7667477130889893, + "step": 4200 + }, + { + "epoch": 0.48, + "learning_rate": 1.5707596862928713e-07, + "logits/chosen": -2.4192817211151123, + "logits/rejected": -2.2875537872314453, + "logps/chosen": -326.4383544921875, + "logps/rejected": -308.92059326171875, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.507658839225769, + "rewards/margins": 2.681333541870117, + "rewards/rejected": -3.188992500305176, + "step": 4201 + }, + { + "epoch": 0.48, + "learning_rate": 1.5704085215966288e-07, + "logits/chosen": -1.7196431159973145, + "logits/rejected": -1.706832766532898, + "logps/chosen": -219.0457305908203, + "logps/rejected": -303.8927307128906, + "loss": 0.4184, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.920111060142517, + "rewards/margins": 1.8622703552246094, + "rewards/rejected": -3.782381534576416, + "step": 4202 + }, + { + "epoch": 0.48, + "learning_rate": 1.570057356900386e-07, + "logits/chosen": -2.6158127784729004, + "logits/rejected": -2.498476266860962, + "logps/chosen": -239.23683166503906, + "logps/rejected": -167.2834014892578, + "loss": 0.2733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9910975694656372, + "rewards/margins": 2.490464687347412, + "rewards/rejected": -3.481562614440918, + "step": 4203 + }, + { + "epoch": 0.48, + "learning_rate": 1.5697061922041436e-07, + "logits/chosen": -2.074303388595581, + "logits/rejected": -2.407045602798462, + "logps/chosen": -542.2539672851562, + "logps/rejected": -277.42816162109375, + "loss": 0.1246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.389218807220459, + "rewards/margins": 3.053637981414795, + "rewards/rejected": -3.442857027053833, + "step": 4204 + }, + { + "epoch": 0.48, + "learning_rate": 1.5693550275079012e-07, + "logits/chosen": -1.6481379270553589, + "logits/rejected": -1.9850831031799316, + "logps/chosen": -390.9188537597656, + "logps/rejected": -235.2791290283203, + "loss": 0.4081, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5213683843612671, + "rewards/margins": 1.0052801370620728, + "rewards/rejected": -1.5266485214233398, + "step": 4205 + }, + { + "epoch": 0.48, + "learning_rate": 1.5690038628116584e-07, + "logits/chosen": -1.9604040384292603, + "logits/rejected": -2.08304500579834, + "logps/chosen": -421.4015197753906, + "logps/rejected": -264.9510498046875, + "loss": 0.1553, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48581093549728394, + "rewards/margins": 2.1436281204223633, + "rewards/rejected": -2.629439115524292, + "step": 4206 + }, + { + "epoch": 0.48, + "learning_rate": 1.5686526981154162e-07, + "logits/chosen": -2.7540838718414307, + "logits/rejected": -2.337743043899536, + "logps/chosen": -154.6901092529297, + "logps/rejected": -361.4557800292969, + "loss": 0.1697, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8161468505859375, + "rewards/margins": 2.8114285469055176, + "rewards/rejected": -3.627575397491455, + "step": 4207 + }, + { + "epoch": 0.49, + "learning_rate": 1.5683015334191732e-07, + "logits/chosen": -2.4632253646850586, + "logits/rejected": -2.4135210514068604, + "logps/chosen": -199.2728271484375, + "logps/rejected": -376.6460266113281, + "loss": 0.1031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7856599688529968, + "rewards/margins": 4.295265197753906, + "rewards/rejected": -5.080924987792969, + "step": 4208 + }, + { + "epoch": 0.49, + "learning_rate": 1.567950368722931e-07, + "logits/chosen": -2.288208484649658, + "logits/rejected": -2.1480376720428467, + "logps/chosen": -151.76197814941406, + "logps/rejected": -315.80859375, + "loss": 0.3081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3790728449821472, + "rewards/margins": 3.012195110321045, + "rewards/rejected": -3.391268253326416, + "step": 4209 + }, + { + "epoch": 0.49, + "learning_rate": 1.5675992040266886e-07, + "logits/chosen": -2.2387120723724365, + "logits/rejected": -2.3506903648376465, + "logps/chosen": -227.7056884765625, + "logps/rejected": -197.45974731445312, + "loss": 0.2333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2768878638744354, + "rewards/margins": 2.151421546936035, + "rewards/rejected": -2.428309440612793, + "step": 4210 + }, + { + "epoch": 0.49, + "learning_rate": 1.5672480393304459e-07, + "logits/chosen": -1.9342507123947144, + "logits/rejected": -2.372058391571045, + "logps/chosen": -443.28118896484375, + "logps/rejected": -205.30812072753906, + "loss": 0.7062, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3923707008361816, + "rewards/margins": 1.1304491758346558, + "rewards/rejected": -2.5228195190429688, + "step": 4211 + }, + { + "epoch": 0.49, + "learning_rate": 1.5668968746342034e-07, + "logits/chosen": -1.9214320182800293, + "logits/rejected": -1.9177241325378418, + "logps/chosen": -411.8109130859375, + "logps/rejected": -382.410888671875, + "loss": 0.4336, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.032616376876831, + "rewards/margins": 3.555063247680664, + "rewards/rejected": -4.587679386138916, + "step": 4212 + }, + { + "epoch": 0.49, + "learning_rate": 1.566545709937961e-07, + "logits/chosen": -2.2192251682281494, + "logits/rejected": -2.3691320419311523, + "logps/chosen": -154.17030334472656, + "logps/rejected": -107.11441040039062, + "loss": 0.7085, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.075068473815918, + "rewards/margins": 0.1275767982006073, + "rewards/rejected": -1.202645182609558, + "step": 4213 + }, + { + "epoch": 0.49, + "learning_rate": 1.5661945452417182e-07, + "logits/chosen": -1.5318021774291992, + "logits/rejected": -2.014939546585083, + "logps/chosen": -366.27874755859375, + "logps/rejected": -230.613037109375, + "loss": 1.3466, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.657299041748047, + "rewards/margins": -0.2076714038848877, + "rewards/rejected": -2.449627637863159, + "step": 4214 + }, + { + "epoch": 0.49, + "learning_rate": 1.5658433805454757e-07, + "logits/chosen": -1.9234416484832764, + "logits/rejected": -2.024278163909912, + "logps/chosen": -339.386474609375, + "logps/rejected": -245.58555603027344, + "loss": 1.0358, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9219328165054321, + "rewards/margins": 0.962805449962616, + "rewards/rejected": -2.8847384452819824, + "step": 4215 + }, + { + "epoch": 0.49, + "learning_rate": 1.565492215849233e-07, + "logits/chosen": -2.6283464431762695, + "logits/rejected": -2.524519920349121, + "logps/chosen": -319.7002868652344, + "logps/rejected": -233.74159240722656, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06817638874053955, + "rewards/margins": 2.446258544921875, + "rewards/rejected": -2.514434814453125, + "step": 4216 + }, + { + "epoch": 0.49, + "learning_rate": 1.5651410511529906e-07, + "logits/chosen": -1.58869206905365, + "logits/rejected": -1.3796463012695312, + "logps/chosen": -557.551513671875, + "logps/rejected": -608.98095703125, + "loss": 0.5179, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8880701065063477, + "rewards/margins": 1.660168170928955, + "rewards/rejected": -2.5482382774353027, + "step": 4217 + }, + { + "epoch": 0.49, + "learning_rate": 1.5647898864567484e-07, + "logits/chosen": -2.537769317626953, + "logits/rejected": -2.3741021156311035, + "logps/chosen": -151.4017333984375, + "logps/rejected": -206.39736938476562, + "loss": 0.6089, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4860379695892334, + "rewards/margins": 1.1182188987731934, + "rewards/rejected": -2.6042568683624268, + "step": 4218 + }, + { + "epoch": 0.49, + "learning_rate": 1.5644387217605056e-07, + "logits/chosen": -2.376673698425293, + "logits/rejected": -2.457350492477417, + "logps/chosen": -501.3039855957031, + "logps/rejected": -415.0723876953125, + "loss": 0.4074, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8841135501861572, + "rewards/margins": 1.3272442817687988, + "rewards/rejected": -2.211357831954956, + "step": 4219 + }, + { + "epoch": 0.49, + "learning_rate": 1.5640875570642632e-07, + "logits/chosen": -2.1842198371887207, + "logits/rejected": -2.314274787902832, + "logps/chosen": -243.8307342529297, + "logps/rejected": -341.51654052734375, + "loss": 0.6523, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.093815565109253, + "rewards/margins": 0.6193354725837708, + "rewards/rejected": -1.713151216506958, + "step": 4220 + }, + { + "epoch": 0.49, + "learning_rate": 1.5637363923680207e-07, + "logits/chosen": -1.7110918760299683, + "logits/rejected": -2.019883871078491, + "logps/chosen": -257.1541442871094, + "logps/rejected": -164.76272583007812, + "loss": 0.8456, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6117973923683167, + "rewards/margins": 0.05935007333755493, + "rewards/rejected": -0.6711475253105164, + "step": 4221 + }, + { + "epoch": 0.49, + "learning_rate": 1.563385227671778e-07, + "logits/chosen": -1.933599591255188, + "logits/rejected": -1.7246671915054321, + "logps/chosen": -214.33465576171875, + "logps/rejected": -347.0713195800781, + "loss": 0.334, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1849020719528198, + "rewards/margins": 2.547098159790039, + "rewards/rejected": -3.7320001125335693, + "step": 4222 + }, + { + "epoch": 0.49, + "learning_rate": 1.5630340629755355e-07, + "logits/chosen": -1.8650894165039062, + "logits/rejected": -1.6824091672897339, + "logps/chosen": -389.2283935546875, + "logps/rejected": -467.9124755859375, + "loss": 0.2089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7794179916381836, + "rewards/margins": 2.1459813117980957, + "rewards/rejected": -2.9253993034362793, + "step": 4223 + }, + { + "epoch": 0.49, + "learning_rate": 1.5626828982792928e-07, + "logits/chosen": -2.554144859313965, + "logits/rejected": -2.6370253562927246, + "logps/chosen": -166.36795043945312, + "logps/rejected": -231.07501220703125, + "loss": 0.4022, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9278030395507812, + "rewards/margins": 1.913756251335144, + "rewards/rejected": -2.841559410095215, + "step": 4224 + }, + { + "epoch": 0.49, + "learning_rate": 1.5623317335830503e-07, + "logits/chosen": -2.3181581497192383, + "logits/rejected": -2.149461269378662, + "logps/chosen": -271.2208251953125, + "logps/rejected": -333.38446044921875, + "loss": 0.3859, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6689639687538147, + "rewards/margins": 2.2070417404174805, + "rewards/rejected": -2.8760056495666504, + "step": 4225 + }, + { + "epoch": 0.49, + "learning_rate": 1.561980568886808e-07, + "logits/chosen": -2.4435768127441406, + "logits/rejected": -2.252858877182007, + "logps/chosen": -158.73960876464844, + "logps/rejected": -185.555419921875, + "loss": 0.5964, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8023294806480408, + "rewards/margins": 0.5609773397445679, + "rewards/rejected": -1.3633067607879639, + "step": 4226 + }, + { + "epoch": 0.49, + "learning_rate": 1.5616294041905652e-07, + "logits/chosen": -1.799572229385376, + "logits/rejected": -1.9681755304336548, + "logps/chosen": -535.9168701171875, + "logps/rejected": -389.0416259765625, + "loss": 0.4963, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0325911045074463, + "rewards/margins": 1.206040382385254, + "rewards/rejected": -2.2386314868927, + "step": 4227 + }, + { + "epoch": 0.49, + "learning_rate": 1.5612782394943227e-07, + "logits/chosen": -2.440631866455078, + "logits/rejected": -2.4419097900390625, + "logps/chosen": -352.6609191894531, + "logps/rejected": -311.79840087890625, + "loss": 0.1918, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0172407627105713, + "rewards/margins": 2.668212413787842, + "rewards/rejected": -3.685453414916992, + "step": 4228 + }, + { + "epoch": 0.49, + "learning_rate": 1.5609270747980805e-07, + "logits/chosen": -1.8838800191879272, + "logits/rejected": -2.3699252605438232, + "logps/chosen": -553.7008056640625, + "logps/rejected": -312.9853210449219, + "loss": 0.5613, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2767571210861206, + "rewards/margins": 0.7286349534988403, + "rewards/rejected": -2.005392074584961, + "step": 4229 + }, + { + "epoch": 0.49, + "learning_rate": 1.5605759101018378e-07, + "logits/chosen": -2.475058078765869, + "logits/rejected": -2.4222071170806885, + "logps/chosen": -252.60687255859375, + "logps/rejected": -120.23443603515625, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0405287742614746, + "rewards/margins": 1.429612636566162, + "rewards/rejected": -2.4701414108276367, + "step": 4230 + }, + { + "epoch": 0.49, + "learning_rate": 1.5602247454055953e-07, + "logits/chosen": -2.042475938796997, + "logits/rejected": -2.341280937194824, + "logps/chosen": -345.87969970703125, + "logps/rejected": -297.0770263671875, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9427078366279602, + "rewards/margins": 2.954148769378662, + "rewards/rejected": -3.8968563079833984, + "step": 4231 + }, + { + "epoch": 0.49, + "learning_rate": 1.5598735807093526e-07, + "logits/chosen": -1.756178617477417, + "logits/rejected": -1.45875883102417, + "logps/chosen": -351.5302734375, + "logps/rejected": -444.7590026855469, + "loss": 0.4466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.503971815109253, + "rewards/margins": 1.3786628246307373, + "rewards/rejected": -2.8826346397399902, + "step": 4232 + }, + { + "epoch": 0.49, + "learning_rate": 1.55952241601311e-07, + "logits/chosen": -2.3353185653686523, + "logits/rejected": -2.3814005851745605, + "logps/chosen": -184.96778869628906, + "logps/rejected": -203.60604858398438, + "loss": 0.15, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7649977207183838, + "rewards/margins": 2.986907482147217, + "rewards/rejected": -3.7519052028656006, + "step": 4233 + }, + { + "epoch": 0.49, + "learning_rate": 1.5591712513168677e-07, + "logits/chosen": -2.725445032119751, + "logits/rejected": -2.6790878772735596, + "logps/chosen": -334.31842041015625, + "logps/rejected": -328.44683837890625, + "loss": 0.4031, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.331268548965454, + "rewards/margins": 2.597797393798828, + "rewards/rejected": -3.929065704345703, + "step": 4234 + }, + { + "epoch": 0.49, + "learning_rate": 1.558820086620625e-07, + "logits/chosen": -2.507725238800049, + "logits/rejected": -2.349255084991455, + "logps/chosen": -269.88720703125, + "logps/rejected": -327.62921142578125, + "loss": 0.2543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5970638990402222, + "rewards/margins": 3.2785391807556152, + "rewards/rejected": -4.875603199005127, + "step": 4235 + }, + { + "epoch": 0.49, + "learning_rate": 1.5584689219243825e-07, + "logits/chosen": -2.3292927742004395, + "logits/rejected": -2.51723051071167, + "logps/chosen": -273.0090637207031, + "logps/rejected": -231.2939910888672, + "loss": 0.3502, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6509242057800293, + "rewards/margins": 2.4255166053771973, + "rewards/rejected": -3.0764408111572266, + "step": 4236 + }, + { + "epoch": 0.49, + "learning_rate": 1.5581177572281397e-07, + "logits/chosen": -2.3939456939697266, + "logits/rejected": -2.5557727813720703, + "logps/chosen": -488.9620361328125, + "logps/rejected": -322.2242431640625, + "loss": 0.3473, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6750190258026123, + "rewards/margins": 1.5352051258087158, + "rewards/rejected": -2.210224151611328, + "step": 4237 + }, + { + "epoch": 0.49, + "learning_rate": 1.5577665925318973e-07, + "logits/chosen": -2.170531988143921, + "logits/rejected": -2.0335636138916016, + "logps/chosen": -469.61444091796875, + "logps/rejected": -439.6282958984375, + "loss": 0.4705, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9007927775382996, + "rewards/margins": 3.247128486633301, + "rewards/rejected": -4.147921085357666, + "step": 4238 + }, + { + "epoch": 0.49, + "learning_rate": 1.5574154278356548e-07, + "logits/chosen": -2.360187530517578, + "logits/rejected": -2.163048028945923, + "logps/chosen": -190.42486572265625, + "logps/rejected": -294.83551025390625, + "loss": 0.4725, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8619292974472046, + "rewards/margins": 2.5993878841400146, + "rewards/rejected": -4.46131706237793, + "step": 4239 + }, + { + "epoch": 0.49, + "learning_rate": 1.557064263139412e-07, + "logits/chosen": -2.988489866256714, + "logits/rejected": -2.9627525806427, + "logps/chosen": -192.27639770507812, + "logps/rejected": -234.4753875732422, + "loss": 0.3462, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9407197833061218, + "rewards/margins": 1.8915727138519287, + "rewards/rejected": -2.8322925567626953, + "step": 4240 + }, + { + "epoch": 0.49, + "learning_rate": 1.55671309844317e-07, + "logits/chosen": -2.8456549644470215, + "logits/rejected": -2.881100654602051, + "logps/chosen": -214.72283935546875, + "logps/rejected": -192.97784423828125, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1685038805007935, + "rewards/margins": 2.1458241939544678, + "rewards/rejected": -3.314328193664551, + "step": 4241 + }, + { + "epoch": 0.49, + "learning_rate": 1.5563619337469274e-07, + "logits/chosen": -2.4756393432617188, + "logits/rejected": -2.3446526527404785, + "logps/chosen": -229.24241638183594, + "logps/rejected": -187.88070678710938, + "loss": 0.3242, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7082065343856812, + "rewards/margins": 1.68669593334198, + "rewards/rejected": -3.394902467727661, + "step": 4242 + }, + { + "epoch": 0.49, + "learning_rate": 1.5560107690506847e-07, + "logits/chosen": -2.040996551513672, + "logits/rejected": -2.238670825958252, + "logps/chosen": -320.0558166503906, + "logps/rejected": -230.02914428710938, + "loss": 0.4187, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2552129626274109, + "rewards/margins": 1.6077282428741455, + "rewards/rejected": -1.8629411458969116, + "step": 4243 + }, + { + "epoch": 0.49, + "learning_rate": 1.5556596043544422e-07, + "logits/chosen": -2.1973085403442383, + "logits/rejected": -2.440067768096924, + "logps/chosen": -371.19158935546875, + "logps/rejected": -262.6546325683594, + "loss": 0.2586, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0818113088607788, + "rewards/margins": 2.345247983932495, + "rewards/rejected": -3.4270594120025635, + "step": 4244 + }, + { + "epoch": 0.49, + "learning_rate": 1.5553084396581995e-07, + "logits/chosen": -1.8182568550109863, + "logits/rejected": -2.2151236534118652, + "logps/chosen": -460.04193115234375, + "logps/rejected": -216.50186157226562, + "loss": 0.4559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8646661639213562, + "rewards/margins": 1.2656238079071045, + "rewards/rejected": -2.1302900314331055, + "step": 4245 + }, + { + "epoch": 0.49, + "learning_rate": 1.554957274961957e-07, + "logits/chosen": -2.9842586517333984, + "logits/rejected": -2.9784061908721924, + "logps/chosen": -169.8008270263672, + "logps/rejected": -139.46304321289062, + "loss": 0.6465, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.037259578704834, + "rewards/margins": 1.7148027420043945, + "rewards/rejected": -2.7520623207092285, + "step": 4246 + }, + { + "epoch": 0.49, + "learning_rate": 1.5546061102657146e-07, + "logits/chosen": -2.475853681564331, + "logits/rejected": -2.5905258655548096, + "logps/chosen": -371.2351379394531, + "logps/rejected": -271.23516845703125, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0013205483555793762, + "rewards/margins": 4.771080017089844, + "rewards/rejected": -4.769759654998779, + "step": 4247 + }, + { + "epoch": 0.49, + "learning_rate": 1.554254945569472e-07, + "logits/chosen": -2.9420957565307617, + "logits/rejected": -2.8888607025146484, + "logps/chosen": -375.4805603027344, + "logps/rejected": -260.15576171875, + "loss": 0.616, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4776105880737305, + "rewards/margins": 1.2346986532211304, + "rewards/rejected": -2.7123091220855713, + "step": 4248 + }, + { + "epoch": 0.49, + "learning_rate": 1.5539037808732294e-07, + "logits/chosen": -1.9463872909545898, + "logits/rejected": -2.176630735397339, + "logps/chosen": -177.64300537109375, + "logps/rejected": -205.77838134765625, + "loss": 1.2444, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2751761674880981, + "rewards/margins": 0.1317521035671234, + "rewards/rejected": -1.4069281816482544, + "step": 4249 + }, + { + "epoch": 0.49, + "learning_rate": 1.553552616176987e-07, + "logits/chosen": -2.520395040512085, + "logits/rejected": -2.3939549922943115, + "logps/chosen": -123.86805725097656, + "logps/rejected": -215.66989135742188, + "loss": 0.4585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7717288732528687, + "rewards/margins": 1.6406065225601196, + "rewards/rejected": -2.4123356342315674, + "step": 4250 + }, + { + "epoch": 0.49, + "learning_rate": 1.5532014514807442e-07, + "logits/chosen": -2.3144233226776123, + "logits/rejected": -2.357525110244751, + "logps/chosen": -439.8486022949219, + "logps/rejected": -299.69317626953125, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4679812788963318, + "rewards/margins": 2.9849154949188232, + "rewards/rejected": -3.452897071838379, + "step": 4251 + }, + { + "epoch": 0.49, + "learning_rate": 1.552850286784502e-07, + "logits/chosen": -2.5192975997924805, + "logits/rejected": -2.4328341484069824, + "logps/chosen": -321.7256774902344, + "logps/rejected": -241.8342742919922, + "loss": 0.4293, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1163926124572754, + "rewards/margins": 1.7498886585235596, + "rewards/rejected": -2.866281270980835, + "step": 4252 + }, + { + "epoch": 0.49, + "learning_rate": 1.5524991220882593e-07, + "logits/chosen": -2.044063091278076, + "logits/rejected": -1.7443102598190308, + "logps/chosen": -232.4300994873047, + "logps/rejected": -298.78546142578125, + "loss": 0.3789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6181704998016357, + "rewards/margins": 1.7348469495773315, + "rewards/rejected": -2.3530173301696777, + "step": 4253 + }, + { + "epoch": 0.49, + "learning_rate": 1.5521479573920168e-07, + "logits/chosen": -2.181408166885376, + "logits/rejected": -2.2803502082824707, + "logps/chosen": -343.2082824707031, + "logps/rejected": -268.43927001953125, + "loss": 0.4651, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5537881851196289, + "rewards/margins": 2.0025858879089355, + "rewards/rejected": -2.5563740730285645, + "step": 4254 + }, + { + "epoch": 0.49, + "learning_rate": 1.5517967926957744e-07, + "logits/chosen": -2.2671139240264893, + "logits/rejected": -2.1122946739196777, + "logps/chosen": -115.03800964355469, + "logps/rejected": -193.48519897460938, + "loss": 0.4936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26367321610450745, + "rewards/margins": 1.8377550840377808, + "rewards/rejected": -2.101428270339966, + "step": 4255 + }, + { + "epoch": 0.49, + "learning_rate": 1.5514456279995317e-07, + "logits/chosen": -2.108509063720703, + "logits/rejected": -1.8708314895629883, + "logps/chosen": -294.5366516113281, + "logps/rejected": -391.1912841796875, + "loss": 0.3347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03650176525115967, + "rewards/margins": 2.8043265342712402, + "rewards/rejected": -2.8408284187316895, + "step": 4256 + }, + { + "epoch": 0.49, + "learning_rate": 1.5510944633032892e-07, + "logits/chosen": -2.473858594894409, + "logits/rejected": -2.408494234085083, + "logps/chosen": -327.098876953125, + "logps/rejected": -243.08282470703125, + "loss": 0.8019, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.395446538925171, + "rewards/margins": 1.6696780920028687, + "rewards/rejected": -3.06512451171875, + "step": 4257 + }, + { + "epoch": 0.49, + "learning_rate": 1.5507432986070467e-07, + "logits/chosen": -2.187927722930908, + "logits/rejected": -2.1198348999023438, + "logps/chosen": -193.48281860351562, + "logps/rejected": -281.009765625, + "loss": 0.2679, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2022854089736938, + "rewards/margins": 2.0952975749969482, + "rewards/rejected": -3.2975833415985107, + "step": 4258 + }, + { + "epoch": 0.49, + "learning_rate": 1.550392133910804e-07, + "logits/chosen": -2.289928913116455, + "logits/rejected": -2.0563106536865234, + "logps/chosen": -299.15277099609375, + "logps/rejected": -269.60345458984375, + "loss": 0.3683, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0087814331054688, + "rewards/margins": 1.8157625198364258, + "rewards/rejected": -2.8245439529418945, + "step": 4259 + }, + { + "epoch": 0.49, + "learning_rate": 1.5500409692145615e-07, + "logits/chosen": -2.3239567279815674, + "logits/rejected": -2.6717872619628906, + "logps/chosen": -369.39434814453125, + "logps/rejected": -158.19261169433594, + "loss": 0.2546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3790713846683502, + "rewards/margins": 2.631450653076172, + "rewards/rejected": -3.01052188873291, + "step": 4260 + }, + { + "epoch": 0.49, + "learning_rate": 1.5496898045183188e-07, + "logits/chosen": -2.4935684204101562, + "logits/rejected": -2.239011287689209, + "logps/chosen": -211.85134887695312, + "logps/rejected": -302.459228515625, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5904861688613892, + "rewards/margins": 3.2169995307922363, + "rewards/rejected": -3.807485580444336, + "step": 4261 + }, + { + "epoch": 0.49, + "learning_rate": 1.5493386398220764e-07, + "logits/chosen": -2.254493474960327, + "logits/rejected": -2.221564531326294, + "logps/chosen": -146.3356475830078, + "logps/rejected": -276.55322265625, + "loss": 0.4235, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7958736419677734, + "rewards/margins": 2.3676350116729736, + "rewards/rejected": -3.163508892059326, + "step": 4262 + }, + { + "epoch": 0.49, + "learning_rate": 1.5489874751258342e-07, + "logits/chosen": -2.303438425064087, + "logits/rejected": -2.3122572898864746, + "logps/chosen": -164.5259552001953, + "logps/rejected": -176.25729370117188, + "loss": 0.8168, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8190931081771851, + "rewards/margins": 0.37149015069007874, + "rewards/rejected": -1.1905832290649414, + "step": 4263 + }, + { + "epoch": 0.49, + "learning_rate": 1.5486363104295914e-07, + "logits/chosen": -2.427278995513916, + "logits/rejected": -2.5308127403259277, + "logps/chosen": -340.91717529296875, + "logps/rejected": -382.58465576171875, + "loss": 0.2212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4118955135345459, + "rewards/margins": 1.8430241346359253, + "rewards/rejected": -2.2549197673797607, + "step": 4264 + }, + { + "epoch": 0.49, + "learning_rate": 1.548285145733349e-07, + "logits/chosen": -2.513460636138916, + "logits/rejected": -2.3701629638671875, + "logps/chosen": -265.6355285644531, + "logps/rejected": -373.8954772949219, + "loss": 0.1442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17697158455848694, + "rewards/margins": 2.6956541538238525, + "rewards/rejected": -2.8726258277893066, + "step": 4265 + }, + { + "epoch": 0.49, + "learning_rate": 1.5479339810371065e-07, + "logits/chosen": -2.7880380153656006, + "logits/rejected": -2.7543177604675293, + "logps/chosen": -240.76620483398438, + "logps/rejected": -229.79840087890625, + "loss": 0.2744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9904605150222778, + "rewards/margins": 1.4844697713851929, + "rewards/rejected": -2.4749302864074707, + "step": 4266 + }, + { + "epoch": 0.49, + "learning_rate": 1.5475828163408638e-07, + "logits/chosen": -1.8808529376983643, + "logits/rejected": -2.153048038482666, + "logps/chosen": -379.8167724609375, + "logps/rejected": -212.78453063964844, + "loss": 0.4236, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05268029868602753, + "rewards/margins": 1.3393747806549072, + "rewards/rejected": -1.3920550346374512, + "step": 4267 + }, + { + "epoch": 0.49, + "learning_rate": 1.5472316516446213e-07, + "logits/chosen": -2.1965713500976562, + "logits/rejected": -1.9893226623535156, + "logps/chosen": -359.0370788574219, + "logps/rejected": -370.95721435546875, + "loss": 0.7239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6667221784591675, + "rewards/margins": 1.3991587162017822, + "rewards/rejected": -2.0658810138702393, + "step": 4268 + }, + { + "epoch": 0.49, + "learning_rate": 1.5468804869483786e-07, + "logits/chosen": -2.769692897796631, + "logits/rejected": -2.602405548095703, + "logps/chosen": -249.71055603027344, + "logps/rejected": -301.119140625, + "loss": 0.0818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9906722903251648, + "rewards/margins": 4.311026573181152, + "rewards/rejected": -5.301698684692383, + "step": 4269 + }, + { + "epoch": 0.49, + "learning_rate": 1.5465293222521361e-07, + "logits/chosen": -1.4038257598876953, + "logits/rejected": -1.4845749139785767, + "logps/chosen": -520.3656005859375, + "logps/rejected": -531.2212524414062, + "loss": 0.57, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5798701047897339, + "rewards/margins": 1.9991319179534912, + "rewards/rejected": -2.5790021419525146, + "step": 4270 + }, + { + "epoch": 0.49, + "learning_rate": 1.5461781575558937e-07, + "logits/chosen": -2.1206676959991455, + "logits/rejected": -2.3399088382720947, + "logps/chosen": -468.61541748046875, + "logps/rejected": -321.4236755371094, + "loss": 0.4947, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8195886611938477, + "rewards/margins": 2.03159499168396, + "rewards/rejected": -2.8511838912963867, + "step": 4271 + }, + { + "epoch": 0.49, + "learning_rate": 1.545826992859651e-07, + "logits/chosen": -1.9805819988250732, + "logits/rejected": -2.2693679332733154, + "logps/chosen": -362.7086181640625, + "logps/rejected": -341.34246826171875, + "loss": 0.1259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31070271134376526, + "rewards/margins": 3.9784321784973145, + "rewards/rejected": -4.289134979248047, + "step": 4272 + }, + { + "epoch": 0.49, + "learning_rate": 1.5454758281634085e-07, + "logits/chosen": -1.9131240844726562, + "logits/rejected": -1.7583279609680176, + "logps/chosen": -266.7744445800781, + "logps/rejected": -358.7016296386719, + "loss": 1.2344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5560312271118164, + "rewards/margins": 1.9684780836105347, + "rewards/rejected": -3.5245094299316406, + "step": 4273 + }, + { + "epoch": 0.49, + "learning_rate": 1.5451246634671663e-07, + "logits/chosen": -2.3180012702941895, + "logits/rejected": -2.077631950378418, + "logps/chosen": -110.39442443847656, + "logps/rejected": -310.1199951171875, + "loss": 0.654, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.419795960187912, + "rewards/margins": 0.7310262322425842, + "rewards/rejected": -1.1508221626281738, + "step": 4274 + }, + { + "epoch": 0.49, + "learning_rate": 1.5447734987709236e-07, + "logits/chosen": -2.5282089710235596, + "logits/rejected": -2.6892106533050537, + "logps/chosen": -305.4280090332031, + "logps/rejected": -229.91802978515625, + "loss": 0.2026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09807588905096054, + "rewards/margins": 2.4915075302124023, + "rewards/rejected": -2.589583396911621, + "step": 4275 + }, + { + "epoch": 0.49, + "learning_rate": 1.544422334074681e-07, + "logits/chosen": -2.1611204147338867, + "logits/rejected": -2.2637133598327637, + "logps/chosen": -347.20697021484375, + "logps/rejected": -348.5988464355469, + "loss": 0.7462, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.933088779449463, + "rewards/margins": 0.3000446557998657, + "rewards/rejected": -2.233133316040039, + "step": 4276 + }, + { + "epoch": 0.49, + "learning_rate": 1.5440711693784384e-07, + "logits/chosen": -2.003969430923462, + "logits/rejected": -2.214430332183838, + "logps/chosen": -577.824462890625, + "logps/rejected": -473.27362060546875, + "loss": 0.8412, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6551856994628906, + "rewards/margins": 1.7298266887664795, + "rewards/rejected": -2.38501238822937, + "step": 4277 + }, + { + "epoch": 0.49, + "learning_rate": 1.543720004682196e-07, + "logits/chosen": -2.1088316440582275, + "logits/rejected": -2.275646209716797, + "logps/chosen": -266.5814514160156, + "logps/rejected": -299.11773681640625, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4599483907222748, + "rewards/margins": 2.5643105506896973, + "rewards/rejected": -3.024259090423584, + "step": 4278 + }, + { + "epoch": 0.49, + "learning_rate": 1.5433688399859535e-07, + "logits/chosen": -2.3180603981018066, + "logits/rejected": -2.5398435592651367, + "logps/chosen": -176.55055236816406, + "logps/rejected": -209.92359924316406, + "loss": 1.0448, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3775149583816528, + "rewards/margins": 0.9907365441322327, + "rewards/rejected": -2.3682515621185303, + "step": 4279 + }, + { + "epoch": 0.49, + "learning_rate": 1.5430176752897107e-07, + "logits/chosen": -2.2349624633789062, + "logits/rejected": -2.0839998722076416, + "logps/chosen": -154.71080017089844, + "logps/rejected": -231.34701538085938, + "loss": 0.1403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22009767591953278, + "rewards/margins": 2.387263536453247, + "rewards/rejected": -2.607361316680908, + "step": 4280 + }, + { + "epoch": 0.49, + "learning_rate": 1.5426665105934683e-07, + "logits/chosen": -1.8689556121826172, + "logits/rejected": -2.121680736541748, + "logps/chosen": -354.4046936035156, + "logps/rejected": -204.32595825195312, + "loss": 0.2761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21342843770980835, + "rewards/margins": 1.9990403652191162, + "rewards/rejected": -2.2124686241149902, + "step": 4281 + }, + { + "epoch": 0.49, + "learning_rate": 1.5423153458972258e-07, + "logits/chosen": -2.362455368041992, + "logits/rejected": -2.4734601974487305, + "logps/chosen": -426.8388977050781, + "logps/rejected": -241.4331817626953, + "loss": 0.602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7875474691390991, + "rewards/margins": 2.771512269973755, + "rewards/rejected": -3.5590598583221436, + "step": 4282 + }, + { + "epoch": 0.49, + "learning_rate": 1.541964181200983e-07, + "logits/chosen": -2.085263967514038, + "logits/rejected": -2.318796157836914, + "logps/chosen": -363.91046142578125, + "logps/rejected": -278.63995361328125, + "loss": 0.2567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7406743764877319, + "rewards/margins": 2.155169725418091, + "rewards/rejected": -2.8958442211151123, + "step": 4283 + }, + { + "epoch": 0.49, + "learning_rate": 1.5416130165047406e-07, + "logits/chosen": -2.177468776702881, + "logits/rejected": -2.183647394180298, + "logps/chosen": -215.9737548828125, + "logps/rejected": -226.5096435546875, + "loss": 0.3667, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2672269344329834, + "rewards/margins": 1.8949215412139893, + "rewards/rejected": -3.1621484756469727, + "step": 4284 + }, + { + "epoch": 0.49, + "learning_rate": 1.541261851808498e-07, + "logits/chosen": -2.378016471862793, + "logits/rejected": -2.5368144512176514, + "logps/chosen": -260.99664306640625, + "logps/rejected": -193.74472045898438, + "loss": 0.6745, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2910186052322388, + "rewards/margins": 0.18938593566417694, + "rewards/rejected": -1.4804046154022217, + "step": 4285 + }, + { + "epoch": 0.49, + "learning_rate": 1.5409106871122557e-07, + "logits/chosen": -2.542969226837158, + "logits/rejected": -2.4059622287750244, + "logps/chosen": -161.3694305419922, + "logps/rejected": -227.6242218017578, + "loss": 0.4646, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6140630841255188, + "rewards/margins": 0.9214239120483398, + "rewards/rejected": -1.535487174987793, + "step": 4286 + }, + { + "epoch": 0.49, + "learning_rate": 1.5405595224160132e-07, + "logits/chosen": -2.4938089847564697, + "logits/rejected": -2.3988959789276123, + "logps/chosen": -105.3293228149414, + "logps/rejected": -219.8614959716797, + "loss": 0.2621, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3997706472873688, + "rewards/margins": 3.2370290756225586, + "rewards/rejected": -3.6367995738983154, + "step": 4287 + }, + { + "epoch": 0.49, + "learning_rate": 1.5402083577197705e-07, + "logits/chosen": -2.322420597076416, + "logits/rejected": -2.5521206855773926, + "logps/chosen": -413.3334655761719, + "logps/rejected": -206.80029296875, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.010786533355713, + "rewards/margins": 1.769822359085083, + "rewards/rejected": -2.780608892440796, + "step": 4288 + }, + { + "epoch": 0.49, + "learning_rate": 1.539857193023528e-07, + "logits/chosen": -2.4591994285583496, + "logits/rejected": -2.5283126831054688, + "logps/chosen": -154.4801483154297, + "logps/rejected": -237.9758758544922, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6971160173416138, + "rewards/margins": 2.9053690433502197, + "rewards/rejected": -3.602485179901123, + "step": 4289 + }, + { + "epoch": 0.49, + "learning_rate": 1.5395060283272853e-07, + "logits/chosen": -2.573784828186035, + "logits/rejected": -2.679215431213379, + "logps/chosen": -355.9954833984375, + "logps/rejected": -320.64654541015625, + "loss": 0.3531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7135434150695801, + "rewards/margins": 2.7123589515686035, + "rewards/rejected": -3.4259023666381836, + "step": 4290 + }, + { + "epoch": 0.49, + "learning_rate": 1.5391548636310429e-07, + "logits/chosen": -2.518500566482544, + "logits/rejected": -2.599621534347534, + "logps/chosen": -443.5031433105469, + "logps/rejected": -376.0960693359375, + "loss": 0.5509, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.057063102722168, + "rewards/margins": 1.2400835752487183, + "rewards/rejected": -2.297146797180176, + "step": 4291 + }, + { + "epoch": 0.49, + "learning_rate": 1.5388036989348004e-07, + "logits/chosen": -2.271730422973633, + "logits/rejected": -2.2961277961730957, + "logps/chosen": -186.56387329101562, + "logps/rejected": -228.823974609375, + "loss": 0.9183, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5697458982467651, + "rewards/margins": 1.255620002746582, + "rewards/rejected": -1.8253659009933472, + "step": 4292 + }, + { + "epoch": 0.49, + "learning_rate": 1.5384525342385577e-07, + "logits/chosen": -1.888594150543213, + "logits/rejected": -1.971625804901123, + "logps/chosen": -324.4944152832031, + "logps/rejected": -291.10052490234375, + "loss": 0.2251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9309240579605103, + "rewards/margins": 3.4251794815063477, + "rewards/rejected": -4.356103420257568, + "step": 4293 + }, + { + "epoch": 0.5, + "learning_rate": 1.5381013695423152e-07, + "logits/chosen": -2.881653070449829, + "logits/rejected": -2.9616971015930176, + "logps/chosen": -193.80081176757812, + "logps/rejected": -265.7061767578125, + "loss": 0.1766, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.773298978805542, + "rewards/margins": 3.386645793914795, + "rewards/rejected": -5.159944534301758, + "step": 4294 + }, + { + "epoch": 0.5, + "learning_rate": 1.5377502048460727e-07, + "logits/chosen": -1.8452510833740234, + "logits/rejected": -1.7379156351089478, + "logps/chosen": -322.31121826171875, + "logps/rejected": -255.63601684570312, + "loss": 0.4655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8334104418754578, + "rewards/margins": 1.7243090867996216, + "rewards/rejected": -2.5577194690704346, + "step": 4295 + }, + { + "epoch": 0.5, + "learning_rate": 1.53739904014983e-07, + "logits/chosen": -2.8646812438964844, + "logits/rejected": -2.7948720455169678, + "logps/chosen": -322.8624267578125, + "logps/rejected": -296.56256103515625, + "loss": 0.5008, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4538086950778961, + "rewards/margins": 1.4384329319000244, + "rewards/rejected": -1.8922414779663086, + "step": 4296 + }, + { + "epoch": 0.5, + "learning_rate": 1.5370478754535878e-07, + "logits/chosen": -2.198946952819824, + "logits/rejected": -1.9791362285614014, + "logps/chosen": -107.15098571777344, + "logps/rejected": -245.49681091308594, + "loss": 0.4339, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2334626913070679, + "rewards/margins": 3.8491861820220947, + "rewards/rejected": -5.082648754119873, + "step": 4297 + }, + { + "epoch": 0.5, + "learning_rate": 1.536696710757345e-07, + "logits/chosen": -1.9819684028625488, + "logits/rejected": -2.0445168018341064, + "logps/chosen": -295.87255859375, + "logps/rejected": -255.82183837890625, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8567246794700623, + "rewards/margins": 1.0551856756210327, + "rewards/rejected": -1.9119104146957397, + "step": 4298 + }, + { + "epoch": 0.5, + "learning_rate": 1.5363455460611026e-07, + "logits/chosen": -2.1212921142578125, + "logits/rejected": -1.8659741878509521, + "logps/chosen": -181.22706604003906, + "logps/rejected": -484.7723693847656, + "loss": 0.206, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5725463628768921, + "rewards/margins": 2.2283623218536377, + "rewards/rejected": -2.8009085655212402, + "step": 4299 + }, + { + "epoch": 0.5, + "learning_rate": 1.5359943813648602e-07, + "logits/chosen": -2.0063860416412354, + "logits/rejected": -2.274444103240967, + "logps/chosen": -695.8934326171875, + "logps/rejected": -554.8896484375, + "loss": 0.2834, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1924570798873901, + "rewards/margins": 1.6998540163040161, + "rewards/rejected": -2.8923110961914062, + "step": 4300 + }, + { + "epoch": 0.5, + "learning_rate": 1.5356432166686174e-07, + "logits/chosen": -2.287292003631592, + "logits/rejected": -2.4113829135894775, + "logps/chosen": -294.4921875, + "logps/rejected": -205.52630615234375, + "loss": 0.5548, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4144182503223419, + "rewards/margins": 0.7742726802825928, + "rewards/rejected": -1.1886909008026123, + "step": 4301 + }, + { + "epoch": 0.5, + "learning_rate": 1.535292051972375e-07, + "logits/chosen": -2.183380603790283, + "logits/rejected": -2.3036162853240967, + "logps/chosen": -306.99066162109375, + "logps/rejected": -350.8436279296875, + "loss": 0.3319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3936305046081543, + "rewards/margins": 2.114546298980713, + "rewards/rejected": -2.508176803588867, + "step": 4302 + }, + { + "epoch": 0.5, + "learning_rate": 1.5349408872761325e-07, + "logits/chosen": -2.2182209491729736, + "logits/rejected": -2.0840659141540527, + "logps/chosen": -191.8278350830078, + "logps/rejected": -279.5853576660156, + "loss": 0.1772, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.836500346660614, + "rewards/margins": 2.730959415435791, + "rewards/rejected": -3.56745982170105, + "step": 4303 + }, + { + "epoch": 0.5, + "learning_rate": 1.5345897225798898e-07, + "logits/chosen": -1.6423344612121582, + "logits/rejected": -1.6338632106781006, + "logps/chosen": -428.5531005859375, + "logps/rejected": -341.99444580078125, + "loss": 0.4141, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4586206078529358, + "rewards/margins": 1.417841911315918, + "rewards/rejected": -1.876462459564209, + "step": 4304 + }, + { + "epoch": 0.5, + "learning_rate": 1.5342385578836473e-07, + "logits/chosen": -1.7893850803375244, + "logits/rejected": -1.618138313293457, + "logps/chosen": -233.3052978515625, + "logps/rejected": -407.2275695800781, + "loss": 0.173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43723371624946594, + "rewards/margins": 2.66158390045166, + "rewards/rejected": -3.098817825317383, + "step": 4305 + }, + { + "epoch": 0.5, + "learning_rate": 1.5338873931874046e-07, + "logits/chosen": -2.8695363998413086, + "logits/rejected": -2.7712583541870117, + "logps/chosen": -308.44708251953125, + "logps/rejected": -238.23876953125, + "loss": 0.3199, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14069689810276031, + "rewards/margins": 1.8759742975234985, + "rewards/rejected": -1.7352774143218994, + "step": 4306 + }, + { + "epoch": 0.5, + "learning_rate": 1.5335362284911621e-07, + "logits/chosen": -2.615168333053589, + "logits/rejected": -2.5864005088806152, + "logps/chosen": -217.43418884277344, + "logps/rejected": -223.51553344726562, + "loss": 0.3903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6451807022094727, + "rewards/margins": 1.759591817855835, + "rewards/rejected": -2.4047722816467285, + "step": 4307 + }, + { + "epoch": 0.5, + "learning_rate": 1.53318506379492e-07, + "logits/chosen": -1.6863794326782227, + "logits/rejected": -1.9210878610610962, + "logps/chosen": -360.3377685546875, + "logps/rejected": -352.1590881347656, + "loss": 0.2467, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6658530235290527, + "rewards/margins": 2.151217222213745, + "rewards/rejected": -3.8170700073242188, + "step": 4308 + }, + { + "epoch": 0.5, + "learning_rate": 1.5328338990986772e-07, + "logits/chosen": -1.9024269580841064, + "logits/rejected": -2.0559675693511963, + "logps/chosen": -290.43402099609375, + "logps/rejected": -295.75140380859375, + "loss": 0.5586, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5223976373672485, + "rewards/margins": 2.1531903743743896, + "rewards/rejected": -3.6755876541137695, + "step": 4309 + }, + { + "epoch": 0.5, + "learning_rate": 1.5324827344024348e-07, + "logits/chosen": -2.1764845848083496, + "logits/rejected": -1.789870023727417, + "logps/chosen": -174.2303924560547, + "logps/rejected": -319.3096923828125, + "loss": 0.2974, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19888398051261902, + "rewards/margins": 3.0395047664642334, + "rewards/rejected": -3.238388776779175, + "step": 4310 + }, + { + "epoch": 0.5, + "learning_rate": 1.5321315697061923e-07, + "logits/chosen": -2.044743061065674, + "logits/rejected": -2.2647552490234375, + "logps/chosen": -399.2802734375, + "logps/rejected": -252.93788146972656, + "loss": 1.4036, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.243159532546997, + "rewards/margins": -0.27246642112731934, + "rewards/rejected": -1.9706929922103882, + "step": 4311 + }, + { + "epoch": 0.5, + "learning_rate": 1.5317804050099496e-07, + "logits/chosen": -2.457404136657715, + "logits/rejected": -2.2841334342956543, + "logps/chosen": -193.14208984375, + "logps/rejected": -256.52105712890625, + "loss": 0.4779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4510992467403412, + "rewards/margins": 1.779733419418335, + "rewards/rejected": -2.230832576751709, + "step": 4312 + }, + { + "epoch": 0.5, + "learning_rate": 1.531429240313707e-07, + "logits/chosen": -2.950223922729492, + "logits/rejected": -2.909891366958618, + "logps/chosen": -174.7480926513672, + "logps/rejected": -279.058837890625, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4743388295173645, + "rewards/margins": 2.860653877258301, + "rewards/rejected": -3.3349926471710205, + "step": 4313 + }, + { + "epoch": 0.5, + "learning_rate": 1.5310780756174644e-07, + "logits/chosen": -2.4287192821502686, + "logits/rejected": -2.4836556911468506, + "logps/chosen": -273.7009582519531, + "logps/rejected": -305.3197021484375, + "loss": 0.2658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22321298718452454, + "rewards/margins": 2.177469491958618, + "rewards/rejected": -2.4006826877593994, + "step": 4314 + }, + { + "epoch": 0.5, + "learning_rate": 1.530726910921222e-07, + "logits/chosen": -2.598573684692383, + "logits/rejected": -2.6426174640655518, + "logps/chosen": -431.36175537109375, + "logps/rejected": -272.64434814453125, + "loss": 0.1468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3940432071685791, + "rewards/margins": 3.1115803718566895, + "rewards/rejected": -3.5056238174438477, + "step": 4315 + }, + { + "epoch": 0.5, + "learning_rate": 1.5303757462249795e-07, + "logits/chosen": -2.783262252807617, + "logits/rejected": -2.672635793685913, + "logps/chosen": -287.39794921875, + "logps/rejected": -151.38783264160156, + "loss": 0.2335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2949923276901245, + "rewards/margins": 2.020247459411621, + "rewards/rejected": -2.315239906311035, + "step": 4316 + }, + { + "epoch": 0.5, + "learning_rate": 1.5300245815287367e-07, + "logits/chosen": -2.3593790531158447, + "logits/rejected": -2.255155563354492, + "logps/chosen": -404.89642333984375, + "logps/rejected": -543.6422119140625, + "loss": 0.2287, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1468039751052856, + "rewards/margins": 3.4828710556030273, + "rewards/rejected": -4.629674911499023, + "step": 4317 + }, + { + "epoch": 0.5, + "learning_rate": 1.5296734168324943e-07, + "logits/chosen": -1.7802616357803345, + "logits/rejected": -1.8309589624404907, + "logps/chosen": -395.2065124511719, + "logps/rejected": -415.15179443359375, + "loss": 1.0041, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3629530668258667, + "rewards/margins": 2.295816659927368, + "rewards/rejected": -3.6587696075439453, + "step": 4318 + }, + { + "epoch": 0.5, + "learning_rate": 1.529322252136252e-07, + "logits/chosen": -2.396449089050293, + "logits/rejected": -2.1824371814727783, + "logps/chosen": -202.1991729736328, + "logps/rejected": -213.61146545410156, + "loss": 0.7534, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2469289302825928, + "rewards/margins": 2.2883236408233643, + "rewards/rejected": -3.535252332687378, + "step": 4319 + }, + { + "epoch": 0.5, + "learning_rate": 1.5289710874400094e-07, + "logits/chosen": -2.3206467628479004, + "logits/rejected": -2.3060760498046875, + "logps/chosen": -123.92013549804688, + "logps/rejected": -222.09791564941406, + "loss": 0.205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7023661136627197, + "rewards/margins": 3.7448511123657227, + "rewards/rejected": -4.4472174644470215, + "step": 4320 + }, + { + "epoch": 0.5, + "learning_rate": 1.528619922743767e-07, + "logits/chosen": -2.493866443634033, + "logits/rejected": -2.359467029571533, + "logps/chosen": -181.82057189941406, + "logps/rejected": -337.14227294921875, + "loss": 0.556, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9105620384216309, + "rewards/margins": 2.808825731277466, + "rewards/rejected": -4.719387531280518, + "step": 4321 + }, + { + "epoch": 0.5, + "learning_rate": 1.5282687580475242e-07, + "logits/chosen": -2.439154624938965, + "logits/rejected": -2.3384876251220703, + "logps/chosen": -184.47821044921875, + "logps/rejected": -182.48687744140625, + "loss": 0.7673, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6055569648742676, + "rewards/margins": 1.2565808296203613, + "rewards/rejected": -2.862137794494629, + "step": 4322 + }, + { + "epoch": 0.5, + "learning_rate": 1.5279175933512817e-07, + "logits/chosen": -2.587132453918457, + "logits/rejected": -2.5888845920562744, + "logps/chosen": -311.2697448730469, + "logps/rejected": -285.2121887207031, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3987598717212677, + "rewards/margins": 2.9261083602905273, + "rewards/rejected": -3.3248684406280518, + "step": 4323 + }, + { + "epoch": 0.5, + "learning_rate": 1.5275664286550392e-07, + "logits/chosen": -2.3790884017944336, + "logits/rejected": -2.5427820682525635, + "logps/chosen": -179.91357421875, + "logps/rejected": -114.00350952148438, + "loss": 0.4547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5563474297523499, + "rewards/margins": 0.827017068862915, + "rewards/rejected": -1.3833644390106201, + "step": 4324 + }, + { + "epoch": 0.5, + "learning_rate": 1.5272152639587965e-07, + "logits/chosen": -1.580289602279663, + "logits/rejected": -1.673201322555542, + "logps/chosen": -375.2184753417969, + "logps/rejected": -416.60919189453125, + "loss": 0.3011, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26968634128570557, + "rewards/margins": 2.956698417663574, + "rewards/rejected": -3.2263846397399902, + "step": 4325 + }, + { + "epoch": 0.5, + "learning_rate": 1.526864099262554e-07, + "logits/chosen": -2.1359872817993164, + "logits/rejected": -2.262991189956665, + "logps/chosen": -213.19654846191406, + "logps/rejected": -166.50462341308594, + "loss": 0.4295, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7884884476661682, + "rewards/margins": 1.5487818717956543, + "rewards/rejected": -2.3372702598571777, + "step": 4326 + }, + { + "epoch": 0.5, + "learning_rate": 1.5265129345663116e-07, + "logits/chosen": -2.392335891723633, + "logits/rejected": -2.6265454292297363, + "logps/chosen": -414.0235595703125, + "logps/rejected": -375.43939208984375, + "loss": 0.3623, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.29935622215271, + "rewards/margins": 3.0375404357910156, + "rewards/rejected": -4.336896896362305, + "step": 4327 + }, + { + "epoch": 0.5, + "learning_rate": 1.526161769870069e-07, + "logits/chosen": -2.0861806869506836, + "logits/rejected": -2.379035234451294, + "logps/chosen": -405.7969055175781, + "logps/rejected": -270.9029235839844, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2685599327087402, + "rewards/margins": 1.3647575378417969, + "rewards/rejected": -2.633317470550537, + "step": 4328 + }, + { + "epoch": 0.5, + "learning_rate": 1.5258106051738264e-07, + "logits/chosen": -2.6948282718658447, + "logits/rejected": -2.550394058227539, + "logps/chosen": -125.80111694335938, + "logps/rejected": -164.3424072265625, + "loss": 0.2964, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39239394664764404, + "rewards/margins": 1.9819588661193848, + "rewards/rejected": -2.3743526935577393, + "step": 4329 + }, + { + "epoch": 0.5, + "learning_rate": 1.5254594404775837e-07, + "logits/chosen": -1.9518802165985107, + "logits/rejected": -2.1497230529785156, + "logps/chosen": -372.79132080078125, + "logps/rejected": -320.16595458984375, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36200833320617676, + "rewards/margins": 2.8494386672973633, + "rewards/rejected": -3.211447238922119, + "step": 4330 + }, + { + "epoch": 0.5, + "learning_rate": 1.5251082757813415e-07, + "logits/chosen": -1.9330527782440186, + "logits/rejected": -1.8927773237228394, + "logps/chosen": -356.3074951171875, + "logps/rejected": -202.90939331054688, + "loss": 0.5374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6634069085121155, + "rewards/margins": 0.9234924912452698, + "rewards/rejected": -1.5868993997573853, + "step": 4331 + }, + { + "epoch": 0.5, + "learning_rate": 1.524757111085099e-07, + "logits/chosen": -2.49900221824646, + "logits/rejected": -2.596681833267212, + "logps/chosen": -158.65530395507812, + "logps/rejected": -159.76303100585938, + "loss": 0.6638, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1863950490951538, + "rewards/margins": 0.363518625497818, + "rewards/rejected": -1.549913763999939, + "step": 4332 + }, + { + "epoch": 0.5, + "learning_rate": 1.5244059463888563e-07, + "logits/chosen": -2.2659926414489746, + "logits/rejected": -2.225026845932007, + "logps/chosen": -391.0314025878906, + "logps/rejected": -428.966552734375, + "loss": 0.574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19110506772994995, + "rewards/margins": 2.0458500385284424, + "rewards/rejected": -2.236955165863037, + "step": 4333 + }, + { + "epoch": 0.5, + "learning_rate": 1.5240547816926138e-07, + "logits/chosen": -2.3413729667663574, + "logits/rejected": -2.402949333190918, + "logps/chosen": -250.56849670410156, + "logps/rejected": -247.46795654296875, + "loss": 0.285, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44153648614883423, + "rewards/margins": 2.7585291862487793, + "rewards/rejected": -3.2000653743743896, + "step": 4334 + }, + { + "epoch": 0.5, + "learning_rate": 1.523703616996371e-07, + "logits/chosen": -2.4825568199157715, + "logits/rejected": -2.616671562194824, + "logps/chosen": -115.60231018066406, + "logps/rejected": -209.6991729736328, + "loss": 0.3687, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35815513134002686, + "rewards/margins": 2.549964666366577, + "rewards/rejected": -2.9081196784973145, + "step": 4335 + }, + { + "epoch": 0.5, + "learning_rate": 1.5233524523001286e-07, + "logits/chosen": -2.5754446983337402, + "logits/rejected": -2.704883098602295, + "logps/chosen": -357.12579345703125, + "logps/rejected": -282.332275390625, + "loss": 0.2857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.406065434217453, + "rewards/margins": 2.045396089553833, + "rewards/rejected": -2.4514613151550293, + "step": 4336 + }, + { + "epoch": 0.5, + "learning_rate": 1.5230012876038862e-07, + "logits/chosen": -2.2094321250915527, + "logits/rejected": -2.1698384284973145, + "logps/chosen": -325.32501220703125, + "logps/rejected": -394.0058898925781, + "loss": 0.8541, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2081868648529053, + "rewards/margins": 0.8442801833152771, + "rewards/rejected": -2.052467107772827, + "step": 4337 + }, + { + "epoch": 0.5, + "learning_rate": 1.5226501229076435e-07, + "logits/chosen": -2.515423059463501, + "logits/rejected": -2.576887607574463, + "logps/chosen": -305.4349670410156, + "logps/rejected": -199.07553100585938, + "loss": 0.2757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14723876118659973, + "rewards/margins": 2.2017946243286133, + "rewards/rejected": -2.3490333557128906, + "step": 4338 + }, + { + "epoch": 0.5, + "learning_rate": 1.522298958211401e-07, + "logits/chosen": -2.261758327484131, + "logits/rejected": -2.15348744392395, + "logps/chosen": -311.1342468261719, + "logps/rejected": -352.9677429199219, + "loss": 0.4278, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6771916151046753, + "rewards/margins": 2.6053032875061035, + "rewards/rejected": -3.28249454498291, + "step": 4339 + }, + { + "epoch": 0.5, + "learning_rate": 1.5219477935151588e-07, + "logits/chosen": -2.1492514610290527, + "logits/rejected": -2.2651143074035645, + "logps/chosen": -393.92242431640625, + "logps/rejected": -388.6973876953125, + "loss": 0.427, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.361854910850525, + "rewards/margins": 1.4765223264694214, + "rewards/rejected": -2.8383772373199463, + "step": 4340 + }, + { + "epoch": 0.5, + "learning_rate": 1.5215966288189158e-07, + "logits/chosen": -1.753844141960144, + "logits/rejected": -2.056380033493042, + "logps/chosen": -299.06024169921875, + "logps/rejected": -204.52523803710938, + "loss": 0.4432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3105154037475586, + "rewards/margins": 1.8945083618164062, + "rewards/rejected": -2.205023765563965, + "step": 4341 + }, + { + "epoch": 0.5, + "learning_rate": 1.5212454641226736e-07, + "logits/chosen": -2.83795166015625, + "logits/rejected": -2.760812997817993, + "logps/chosen": -99.76710510253906, + "logps/rejected": -200.63299560546875, + "loss": 0.2076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9839363694190979, + "rewards/margins": 3.1411330699920654, + "rewards/rejected": -4.1250691413879395, + "step": 4342 + }, + { + "epoch": 0.5, + "learning_rate": 1.520894299426431e-07, + "logits/chosen": -2.476494550704956, + "logits/rejected": -2.5994420051574707, + "logps/chosen": -214.76454162597656, + "logps/rejected": -168.64840698242188, + "loss": 0.4124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7600891590118408, + "rewards/margins": 1.3648632764816284, + "rewards/rejected": -2.1249523162841797, + "step": 4343 + }, + { + "epoch": 0.5, + "learning_rate": 1.5205431347301884e-07, + "logits/chosen": -1.753782033920288, + "logits/rejected": -2.1255691051483154, + "logps/chosen": -394.8559265136719, + "logps/rejected": -346.88665771484375, + "loss": 0.1516, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7666352391242981, + "rewards/margins": 3.173478841781616, + "rewards/rejected": -3.9401137828826904, + "step": 4344 + }, + { + "epoch": 0.5, + "learning_rate": 1.520191970033946e-07, + "logits/chosen": -1.5593167543411255, + "logits/rejected": -2.0305352210998535, + "logps/chosen": -408.1869201660156, + "logps/rejected": -273.4218444824219, + "loss": 0.7832, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6305750608444214, + "rewards/margins": -0.01882648468017578, + "rewards/rejected": -1.6117486953735352, + "step": 4345 + }, + { + "epoch": 0.5, + "learning_rate": 1.5198408053377032e-07, + "logits/chosen": -2.2084319591522217, + "logits/rejected": -2.094452142715454, + "logps/chosen": -265.1305847167969, + "logps/rejected": -282.9176330566406, + "loss": 0.2688, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7833235263824463, + "rewards/margins": 2.1280391216278076, + "rewards/rejected": -2.911362648010254, + "step": 4346 + }, + { + "epoch": 0.5, + "learning_rate": 1.5194896406414608e-07, + "logits/chosen": -2.745919704437256, + "logits/rejected": -2.60921311378479, + "logps/chosen": -188.6512451171875, + "logps/rejected": -257.35308837890625, + "loss": 0.1647, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5929628610610962, + "rewards/margins": 4.18134880065918, + "rewards/rejected": -4.774311542510986, + "step": 4347 + }, + { + "epoch": 0.5, + "learning_rate": 1.5191384759452183e-07, + "logits/chosen": -2.428264617919922, + "logits/rejected": -2.4970498085021973, + "logps/chosen": -555.7412719726562, + "logps/rejected": -423.0617980957031, + "loss": 0.4018, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6933441162109375, + "rewards/margins": 1.6512850522994995, + "rewards/rejected": -2.3446292877197266, + "step": 4348 + }, + { + "epoch": 0.5, + "learning_rate": 1.5187873112489756e-07, + "logits/chosen": -2.5520989894866943, + "logits/rejected": -2.530271053314209, + "logps/chosen": -262.00958251953125, + "logps/rejected": -239.4275360107422, + "loss": 0.7311, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1371983289718628, + "rewards/margins": 0.5645054578781128, + "rewards/rejected": -1.7017037868499756, + "step": 4349 + }, + { + "epoch": 0.5, + "learning_rate": 1.518436146552733e-07, + "logits/chosen": -2.241729497909546, + "logits/rejected": -2.1989104747772217, + "logps/chosen": -225.4781494140625, + "logps/rejected": -145.5193328857422, + "loss": 1.1934, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.5282340049743652, + "rewards/margins": -0.36834800243377686, + "rewards/rejected": -2.159885883331299, + "step": 4350 + }, + { + "epoch": 0.5, + "learning_rate": 1.5180849818564904e-07, + "logits/chosen": -2.0278053283691406, + "logits/rejected": -1.9340789318084717, + "logps/chosen": -309.734619140625, + "logps/rejected": -351.6844787597656, + "loss": 0.4716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1518019288778305, + "rewards/margins": 1.747276782989502, + "rewards/rejected": -1.899078607559204, + "step": 4351 + }, + { + "epoch": 0.5, + "learning_rate": 1.517733817160248e-07, + "logits/chosen": -1.9947609901428223, + "logits/rejected": -2.015561580657959, + "logps/chosen": -333.84967041015625, + "logps/rejected": -318.787109375, + "loss": 0.3298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5263568162918091, + "rewards/margins": 2.441798448562622, + "rewards/rejected": -2.9681553840637207, + "step": 4352 + }, + { + "epoch": 0.5, + "learning_rate": 1.5173826524640057e-07, + "logits/chosen": -2.2853055000305176, + "logits/rejected": -2.0838160514831543, + "logps/chosen": -352.29541015625, + "logps/rejected": -422.3295593261719, + "loss": 0.145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5344489216804504, + "rewards/margins": 2.4653730392456055, + "rewards/rejected": -2.999821662902832, + "step": 4353 + }, + { + "epoch": 0.5, + "learning_rate": 1.517031487767763e-07, + "logits/chosen": -2.124065399169922, + "logits/rejected": -2.2336924076080322, + "logps/chosen": -278.87310791015625, + "logps/rejected": -283.648193359375, + "loss": 0.3043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08465489745140076, + "rewards/margins": 2.403207302093506, + "rewards/rejected": -2.4878623485565186, + "step": 4354 + }, + { + "epoch": 0.5, + "learning_rate": 1.5166803230715206e-07, + "logits/chosen": -2.624272108078003, + "logits/rejected": -2.5899477005004883, + "logps/chosen": -309.5693359375, + "logps/rejected": -351.1058349609375, + "loss": 0.2633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3232845962047577, + "rewards/margins": 2.6997790336608887, + "rewards/rejected": -3.0230634212493896, + "step": 4355 + }, + { + "epoch": 0.5, + "learning_rate": 1.516329158375278e-07, + "logits/chosen": -2.4367570877075195, + "logits/rejected": -2.040590286254883, + "logps/chosen": -234.13653564453125, + "logps/rejected": -276.04461669921875, + "loss": 0.7097, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6444412469863892, + "rewards/margins": 0.7962694764137268, + "rewards/rejected": -1.4407105445861816, + "step": 4356 + }, + { + "epoch": 0.5, + "learning_rate": 1.5159779936790354e-07, + "logits/chosen": -2.6306405067443848, + "logits/rejected": -2.522454261779785, + "logps/chosen": -229.3409881591797, + "logps/rejected": -283.4871826171875, + "loss": 0.2152, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0128830671310425, + "rewards/margins": 2.610119342803955, + "rewards/rejected": -3.623002290725708, + "step": 4357 + }, + { + "epoch": 0.5, + "learning_rate": 1.515626828982793e-07, + "logits/chosen": -2.096242904663086, + "logits/rejected": -2.462519407272339, + "logps/chosen": -283.6170349121094, + "logps/rejected": -208.66058349609375, + "loss": 1.7539, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.9134440422058105, + "rewards/margins": -0.55561363697052, + "rewards/rejected": -2.357830286026001, + "step": 4358 + }, + { + "epoch": 0.5, + "learning_rate": 1.5152756642865502e-07, + "logits/chosen": -2.412675619125366, + "logits/rejected": -2.250065803527832, + "logps/chosen": -119.65841674804688, + "logps/rejected": -192.32675170898438, + "loss": 0.5457, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02435983717441559, + "rewards/margins": 1.9040076732635498, + "rewards/rejected": -1.9283673763275146, + "step": 4359 + }, + { + "epoch": 0.5, + "learning_rate": 1.5149244995903077e-07, + "logits/chosen": -2.5316202640533447, + "logits/rejected": -2.2899749279022217, + "logps/chosen": -145.36570739746094, + "logps/rejected": -201.33908081054688, + "loss": 0.752, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8583285808563232, + "rewards/margins": 0.6379820108413696, + "rewards/rejected": -1.4963104724884033, + "step": 4360 + }, + { + "epoch": 0.5, + "learning_rate": 1.5145733348940653e-07, + "logits/chosen": -2.0810015201568604, + "logits/rejected": -2.3720543384552, + "logps/chosen": -395.47821044921875, + "logps/rejected": -236.40057373046875, + "loss": 0.2152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5027607679367065, + "rewards/margins": 2.019789695739746, + "rewards/rejected": -2.522550344467163, + "step": 4361 + }, + { + "epoch": 0.5, + "learning_rate": 1.5142221701978225e-07, + "logits/chosen": -2.3006467819213867, + "logits/rejected": -2.4186794757843018, + "logps/chosen": -346.2559509277344, + "logps/rejected": -324.8556823730469, + "loss": 0.5731, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6021296381950378, + "rewards/margins": 2.3951940536499023, + "rewards/rejected": -2.997323513031006, + "step": 4362 + }, + { + "epoch": 0.5, + "learning_rate": 1.51387100550158e-07, + "logits/chosen": -2.0057461261749268, + "logits/rejected": -2.2200210094451904, + "logps/chosen": -325.6377868652344, + "logps/rejected": -251.28680419921875, + "loss": 0.4718, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0579943656921387, + "rewards/margins": 1.9904261827468872, + "rewards/rejected": -4.048420429229736, + "step": 4363 + }, + { + "epoch": 0.5, + "learning_rate": 1.513519840805338e-07, + "logits/chosen": -2.7271687984466553, + "logits/rejected": -2.682072162628174, + "logps/chosen": -265.58197021484375, + "logps/rejected": -239.61268615722656, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06917864829301834, + "rewards/margins": 4.66085147857666, + "rewards/rejected": -4.591672897338867, + "step": 4364 + }, + { + "epoch": 0.5, + "learning_rate": 1.5131686761090951e-07, + "logits/chosen": -2.3863234519958496, + "logits/rejected": -2.356414318084717, + "logps/chosen": -195.43128967285156, + "logps/rejected": -351.60211181640625, + "loss": 0.1381, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0999583005905151, + "rewards/margins": 2.8566150665283203, + "rewards/rejected": -3.956573247909546, + "step": 4365 + }, + { + "epoch": 0.5, + "learning_rate": 1.5128175114128527e-07, + "logits/chosen": -2.371140480041504, + "logits/rejected": -2.271988868713379, + "logps/chosen": -275.004150390625, + "logps/rejected": -328.6239013671875, + "loss": 0.327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4369615614414215, + "rewards/margins": 1.5265777111053467, + "rewards/rejected": -1.9635393619537354, + "step": 4366 + }, + { + "epoch": 0.5, + "learning_rate": 1.51246634671661e-07, + "logits/chosen": -2.4800961017608643, + "logits/rejected": -2.3517136573791504, + "logps/chosen": -216.90931701660156, + "logps/rejected": -240.36053466796875, + "loss": 0.2287, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3518517017364502, + "rewards/margins": 2.7707786560058594, + "rewards/rejected": -4.1226301193237305, + "step": 4367 + }, + { + "epoch": 0.5, + "learning_rate": 1.5121151820203675e-07, + "logits/chosen": -2.405310869216919, + "logits/rejected": -2.1392886638641357, + "logps/chosen": -191.130859375, + "logps/rejected": -204.92535400390625, + "loss": 0.4139, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8011865615844727, + "rewards/margins": 1.7305693626403809, + "rewards/rejected": -3.5317559242248535, + "step": 4368 + }, + { + "epoch": 0.5, + "learning_rate": 1.511764017324125e-07, + "logits/chosen": -2.520211935043335, + "logits/rejected": -2.2597498893737793, + "logps/chosen": -280.86187744140625, + "logps/rejected": -263.29681396484375, + "loss": 0.392, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6213918328285217, + "rewards/margins": 1.8007457256317139, + "rewards/rejected": -2.422137498855591, + "step": 4369 + }, + { + "epoch": 0.5, + "learning_rate": 1.5114128526278823e-07, + "logits/chosen": -2.1095519065856934, + "logits/rejected": -2.2464611530303955, + "logps/chosen": -314.1414489746094, + "logps/rejected": -310.2584533691406, + "loss": 0.5669, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0673679113388062, + "rewards/margins": 1.0426828861236572, + "rewards/rejected": -2.110050678253174, + "step": 4370 + }, + { + "epoch": 0.5, + "learning_rate": 1.5110616879316399e-07, + "logits/chosen": -2.1587045192718506, + "logits/rejected": -2.349966049194336, + "logps/chosen": -331.2708740234375, + "logps/rejected": -276.43157958984375, + "loss": 0.3051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2070777863264084, + "rewards/margins": 2.34511661529541, + "rewards/rejected": -2.552194595336914, + "step": 4371 + }, + { + "epoch": 0.5, + "learning_rate": 1.5107105232353974e-07, + "logits/chosen": -2.3063480854034424, + "logits/rejected": -2.269470453262329, + "logps/chosen": -261.884033203125, + "logps/rejected": -194.11415100097656, + "loss": 0.7804, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.979956865310669, + "rewards/margins": 0.5863962769508362, + "rewards/rejected": -1.5663530826568604, + "step": 4372 + }, + { + "epoch": 0.5, + "learning_rate": 1.5103593585391547e-07, + "logits/chosen": -2.4827206134796143, + "logits/rejected": -2.7533583641052246, + "logps/chosen": -304.3182678222656, + "logps/rejected": -431.67791748046875, + "loss": 0.2537, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7976561188697815, + "rewards/margins": 2.51890230178833, + "rewards/rejected": -3.316558361053467, + "step": 4373 + }, + { + "epoch": 0.5, + "learning_rate": 1.5100081938429125e-07, + "logits/chosen": -2.472055435180664, + "logits/rejected": -2.444894790649414, + "logps/chosen": -247.21701049804688, + "logps/rejected": -258.25152587890625, + "loss": 0.2137, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5261552333831787, + "rewards/margins": 3.3543570041656494, + "rewards/rejected": -4.880512714385986, + "step": 4374 + }, + { + "epoch": 0.5, + "learning_rate": 1.5096570291466695e-07, + "logits/chosen": -2.4274654388427734, + "logits/rejected": -2.376140832901001, + "logps/chosen": -192.57339477539062, + "logps/rejected": -381.4255676269531, + "loss": 0.2072, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20453178882598877, + "rewards/margins": 4.075020790100098, + "rewards/rejected": -4.279552459716797, + "step": 4375 + }, + { + "epoch": 0.5, + "learning_rate": 1.5093058644504273e-07, + "logits/chosen": -1.8028538227081299, + "logits/rejected": -1.7472572326660156, + "logps/chosen": -329.85064697265625, + "logps/rejected": -327.7064208984375, + "loss": 0.4388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.932438313961029, + "rewards/margins": 1.5219918489456177, + "rewards/rejected": -2.454430103302002, + "step": 4376 + }, + { + "epoch": 0.5, + "learning_rate": 1.5089546997541848e-07, + "logits/chosen": -2.302943468093872, + "logits/rejected": -2.515083074569702, + "logps/chosen": -417.78271484375, + "logps/rejected": -295.15069580078125, + "loss": 0.182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15340164303779602, + "rewards/margins": 3.6352202892303467, + "rewards/rejected": -3.7886219024658203, + "step": 4377 + }, + { + "epoch": 0.5, + "learning_rate": 1.508603535057942e-07, + "logits/chosen": -2.6407907009124756, + "logits/rejected": -2.4640002250671387, + "logps/chosen": -208.97030639648438, + "logps/rejected": -117.19340515136719, + "loss": 0.4278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33656299114227295, + "rewards/margins": 1.1085375547409058, + "rewards/rejected": -1.4451006650924683, + "step": 4378 + }, + { + "epoch": 0.5, + "learning_rate": 1.5082523703616996e-07, + "logits/chosen": -2.8768882751464844, + "logits/rejected": -2.823596954345703, + "logps/chosen": -158.3702850341797, + "logps/rejected": -159.46237182617188, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6361446976661682, + "rewards/margins": 2.6109018325805664, + "rewards/rejected": -3.247046709060669, + "step": 4379 + }, + { + "epoch": 0.5, + "learning_rate": 1.507901205665457e-07, + "logits/chosen": -2.063689947128296, + "logits/rejected": -2.2216808795928955, + "logps/chosen": -443.4649658203125, + "logps/rejected": -314.2442932128906, + "loss": 0.3286, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5456040501594543, + "rewards/margins": 2.1370718479156494, + "rewards/rejected": -2.682675838470459, + "step": 4380 + }, + { + "epoch": 0.51, + "learning_rate": 1.5075500409692144e-07, + "logits/chosen": -2.3592004776000977, + "logits/rejected": -2.346092700958252, + "logps/chosen": -305.7198791503906, + "logps/rejected": -351.56707763671875, + "loss": 0.3102, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3213200569152832, + "rewards/margins": 3.087709426879883, + "rewards/rejected": -4.409029006958008, + "step": 4381 + }, + { + "epoch": 0.51, + "learning_rate": 1.507198876272972e-07, + "logits/chosen": -2.673633337020874, + "logits/rejected": -2.6375908851623535, + "logps/chosen": -301.1861572265625, + "logps/rejected": -370.1943054199219, + "loss": 0.1059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2653285264968872, + "rewards/margins": 4.704236030578613, + "rewards/rejected": -4.969564437866211, + "step": 4382 + }, + { + "epoch": 0.51, + "learning_rate": 1.5068477115767293e-07, + "logits/chosen": -2.8077619075775146, + "logits/rejected": -2.6411609649658203, + "logps/chosen": -317.1466369628906, + "logps/rejected": -299.0841369628906, + "loss": 0.4118, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6246018409729004, + "rewards/margins": 2.3409018516540527, + "rewards/rejected": -2.965503692626953, + "step": 4383 + }, + { + "epoch": 0.51, + "learning_rate": 1.5064965468804868e-07, + "logits/chosen": -2.699977397918701, + "logits/rejected": -2.7444984912872314, + "logps/chosen": -314.93756103515625, + "logps/rejected": -245.94676208496094, + "loss": 0.3228, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3127593994140625, + "rewards/margins": 1.3034842014312744, + "rewards/rejected": -2.616243362426758, + "step": 4384 + }, + { + "epoch": 0.51, + "learning_rate": 1.5061453821842446e-07, + "logits/chosen": -2.2618606090545654, + "logits/rejected": -2.074547290802002, + "logps/chosen": -263.4847106933594, + "logps/rejected": -381.5111389160156, + "loss": 0.1338, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0537950992584229, + "rewards/margins": 3.2266221046447754, + "rewards/rejected": -4.280417442321777, + "step": 4385 + }, + { + "epoch": 0.51, + "learning_rate": 1.5057942174880016e-07, + "logits/chosen": -2.1806716918945312, + "logits/rejected": -2.3825039863586426, + "logps/chosen": -230.19798278808594, + "logps/rejected": -307.5100402832031, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0741606950759888, + "rewards/margins": 2.772597074508667, + "rewards/rejected": -3.846757650375366, + "step": 4386 + }, + { + "epoch": 0.51, + "learning_rate": 1.5054430527917594e-07, + "logits/chosen": -2.447087287902832, + "logits/rejected": -2.5266637802124023, + "logps/chosen": -312.893310546875, + "logps/rejected": -298.1171569824219, + "loss": 0.2992, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2853829264640808, + "rewards/margins": 2.131840705871582, + "rewards/rejected": -2.4172236919403076, + "step": 4387 + }, + { + "epoch": 0.51, + "learning_rate": 1.5050918880955167e-07, + "logits/chosen": -2.4125993251800537, + "logits/rejected": -2.625394582748413, + "logps/chosen": -288.15423583984375, + "logps/rejected": -325.3455810546875, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12602505087852478, + "rewards/margins": 4.061829566955566, + "rewards/rejected": -3.935804843902588, + "step": 4388 + }, + { + "epoch": 0.51, + "learning_rate": 1.5047407233992742e-07, + "logits/chosen": -2.072207450866699, + "logits/rejected": -2.1361076831817627, + "logps/chosen": -312.5203857421875, + "logps/rejected": -291.5352783203125, + "loss": 0.6769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7314062118530273, + "rewards/margins": 1.025363802909851, + "rewards/rejected": -1.7567698955535889, + "step": 4389 + }, + { + "epoch": 0.51, + "learning_rate": 1.5043895587030318e-07, + "logits/chosen": -2.5744454860687256, + "logits/rejected": -2.4911625385284424, + "logps/chosen": -260.2105407714844, + "logps/rejected": -259.86566162109375, + "loss": 0.5847, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3351061344146729, + "rewards/margins": 2.835507392883301, + "rewards/rejected": -4.170613765716553, + "step": 4390 + }, + { + "epoch": 0.51, + "learning_rate": 1.504038394006789e-07, + "logits/chosen": -1.952239751815796, + "logits/rejected": -2.023588180541992, + "logps/chosen": -602.3780517578125, + "logps/rejected": -427.93402099609375, + "loss": 0.4363, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1037931442260742, + "rewards/margins": 1.1439746618270874, + "rewards/rejected": -2.247767925262451, + "step": 4391 + }, + { + "epoch": 0.51, + "learning_rate": 1.5036872293105466e-07, + "logits/chosen": -2.3288216590881348, + "logits/rejected": -1.907006859779358, + "logps/chosen": -272.3577880859375, + "logps/rejected": -327.19384765625, + "loss": 0.6953, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5647234916687012, + "rewards/margins": 1.2099288702011108, + "rewards/rejected": -2.7746524810791016, + "step": 4392 + }, + { + "epoch": 0.51, + "learning_rate": 1.503336064614304e-07, + "logits/chosen": -2.1875154972076416, + "logits/rejected": -2.0780205726623535, + "logps/chosen": -388.0709533691406, + "logps/rejected": -380.6969909667969, + "loss": 0.2018, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4459192156791687, + "rewards/margins": 2.4699127674102783, + "rewards/rejected": -2.915832042694092, + "step": 4393 + }, + { + "epoch": 0.51, + "learning_rate": 1.5029848999180614e-07, + "logits/chosen": -2.4332163333892822, + "logits/rejected": -2.6035282611846924, + "logps/chosen": -330.9805908203125, + "logps/rejected": -216.57662963867188, + "loss": 0.351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6150068044662476, + "rewards/margins": 1.2610290050506592, + "rewards/rejected": -1.8760356903076172, + "step": 4394 + }, + { + "epoch": 0.51, + "learning_rate": 1.502633735221819e-07, + "logits/chosen": -2.201645851135254, + "logits/rejected": -2.114513397216797, + "logps/chosen": -383.89337158203125, + "logps/rejected": -274.9875793457031, + "loss": 0.1913, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25858446955680847, + "rewards/margins": 2.0914573669433594, + "rewards/rejected": -2.350041627883911, + "step": 4395 + }, + { + "epoch": 0.51, + "learning_rate": 1.5022825705255762e-07, + "logits/chosen": -1.7925989627838135, + "logits/rejected": -2.12522292137146, + "logps/chosen": -410.1684265136719, + "logps/rejected": -417.26641845703125, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7250699996948242, + "rewards/margins": 1.4670586585998535, + "rewards/rejected": -2.1921286582946777, + "step": 4396 + }, + { + "epoch": 0.51, + "learning_rate": 1.5019314058293337e-07, + "logits/chosen": -1.9110552072525024, + "logits/rejected": -2.170795440673828, + "logps/chosen": -265.6136474609375, + "logps/rejected": -196.88189697265625, + "loss": 0.5616, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0698390007019043, + "rewards/margins": 1.4921034574508667, + "rewards/rejected": -2.5619425773620605, + "step": 4397 + }, + { + "epoch": 0.51, + "learning_rate": 1.5015802411330915e-07, + "logits/chosen": -2.422846794128418, + "logits/rejected": -2.0756094455718994, + "logps/chosen": -309.96124267578125, + "logps/rejected": -332.19036865234375, + "loss": 0.2263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7912783622741699, + "rewards/margins": 3.0662407875061035, + "rewards/rejected": -3.8575186729431152, + "step": 4398 + }, + { + "epoch": 0.51, + "learning_rate": 1.5012290764368488e-07, + "logits/chosen": -2.912393093109131, + "logits/rejected": -2.7098567485809326, + "logps/chosen": -364.96697998046875, + "logps/rejected": -509.4939270019531, + "loss": 0.1418, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0433471202850342, + "rewards/margins": 3.2299647331237793, + "rewards/rejected": -4.273312091827393, + "step": 4399 + }, + { + "epoch": 0.51, + "learning_rate": 1.5008779117406064e-07, + "logits/chosen": -2.6066956520080566, + "logits/rejected": -2.3237361907958984, + "logps/chosen": -272.13214111328125, + "logps/rejected": -343.02667236328125, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2318124771118164, + "rewards/margins": 2.705648899078369, + "rewards/rejected": -3.9374611377716064, + "step": 4400 + }, + { + "epoch": 0.51, + "learning_rate": 1.500526747044364e-07, + "logits/chosen": -2.3824379444122314, + "logits/rejected": -2.422938346862793, + "logps/chosen": -292.3214416503906, + "logps/rejected": -280.3091735839844, + "loss": 0.6436, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.349771499633789, + "rewards/margins": 2.393847942352295, + "rewards/rejected": -3.743619203567505, + "step": 4401 + }, + { + "epoch": 0.51, + "learning_rate": 1.5001755823481212e-07, + "logits/chosen": -2.0760693550109863, + "logits/rejected": -2.0046839714050293, + "logps/chosen": -178.71063232421875, + "logps/rejected": -180.11529541015625, + "loss": 0.6182, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2829033136367798, + "rewards/margins": 1.2887002229690552, + "rewards/rejected": -2.571603775024414, + "step": 4402 + }, + { + "epoch": 0.51, + "learning_rate": 1.4998244176518787e-07, + "logits/chosen": -2.3126449584960938, + "logits/rejected": -2.4501757621765137, + "logps/chosen": -252.39505004882812, + "logps/rejected": -307.0054931640625, + "loss": 0.4311, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2209432125091553, + "rewards/margins": 1.5447494983673096, + "rewards/rejected": -2.765692710876465, + "step": 4403 + }, + { + "epoch": 0.51, + "learning_rate": 1.4994732529556362e-07, + "logits/chosen": -2.1554017066955566, + "logits/rejected": -2.417985200881958, + "logps/chosen": -212.8480224609375, + "logps/rejected": -186.67843627929688, + "loss": 0.2725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.616976797580719, + "rewards/margins": 2.057219982147217, + "rewards/rejected": -2.67419695854187, + "step": 4404 + }, + { + "epoch": 0.51, + "learning_rate": 1.4991220882593935e-07, + "logits/chosen": -2.9489285945892334, + "logits/rejected": -2.9667630195617676, + "logps/chosen": -307.12939453125, + "logps/rejected": -355.1203308105469, + "loss": 0.26, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40373724699020386, + "rewards/margins": 1.9364659786224365, + "rewards/rejected": -2.340203046798706, + "step": 4405 + }, + { + "epoch": 0.51, + "learning_rate": 1.498770923563151e-07, + "logits/chosen": -2.933067798614502, + "logits/rejected": -2.70407772064209, + "logps/chosen": -257.69085693359375, + "logps/rejected": -284.7534484863281, + "loss": 0.4872, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4108145236968994, + "rewards/margins": 1.4406341314315796, + "rewards/rejected": -2.8514485359191895, + "step": 4406 + }, + { + "epoch": 0.51, + "learning_rate": 1.4984197588669086e-07, + "logits/chosen": -2.5111753940582275, + "logits/rejected": -2.4342236518859863, + "logps/chosen": -256.10638427734375, + "logps/rejected": -268.6832275390625, + "loss": 0.4335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48706066608428955, + "rewards/margins": 1.4919461011886597, + "rewards/rejected": -1.9790066480636597, + "step": 4407 + }, + { + "epoch": 0.51, + "learning_rate": 1.498068594170666e-07, + "logits/chosen": -2.2526638507843018, + "logits/rejected": -2.0670289993286133, + "logps/chosen": -120.99366760253906, + "logps/rejected": -227.89749145507812, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7330907583236694, + "rewards/margins": 0.5728581547737122, + "rewards/rejected": -1.3059489727020264, + "step": 4408 + }, + { + "epoch": 0.51, + "learning_rate": 1.4977174294744234e-07, + "logits/chosen": -1.8169662952423096, + "logits/rejected": -2.197728395462036, + "logps/chosen": -304.7825927734375, + "logps/rejected": -224.0482940673828, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30774232745170593, + "rewards/margins": 3.248953104019165, + "rewards/rejected": -2.9412105083465576, + "step": 4409 + }, + { + "epoch": 0.51, + "learning_rate": 1.497366264778181e-07, + "logits/chosen": -2.468247652053833, + "logits/rejected": -2.4505889415740967, + "logps/chosen": -270.42901611328125, + "logps/rejected": -207.1949920654297, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11853466928005219, + "rewards/margins": 1.5847539901733398, + "rewards/rejected": -1.7032885551452637, + "step": 4410 + }, + { + "epoch": 0.51, + "learning_rate": 1.4970151000819385e-07, + "logits/chosen": -2.040699005126953, + "logits/rejected": -2.196739673614502, + "logps/chosen": -387.1164855957031, + "logps/rejected": -362.784423828125, + "loss": 0.342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08198221027851105, + "rewards/margins": 1.3541185855865479, + "rewards/rejected": -1.436100721359253, + "step": 4411 + }, + { + "epoch": 0.51, + "learning_rate": 1.4966639353856958e-07, + "logits/chosen": -1.9526455402374268, + "logits/rejected": -2.1613383293151855, + "logps/chosen": -316.136474609375, + "logps/rejected": -263.1310119628906, + "loss": 0.6665, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2918567657470703, + "rewards/margins": 0.6934288740158081, + "rewards/rejected": -1.985285758972168, + "step": 4412 + }, + { + "epoch": 0.51, + "learning_rate": 1.4963127706894533e-07, + "logits/chosen": -2.4861161708831787, + "logits/rejected": -2.5290064811706543, + "logps/chosen": -340.93896484375, + "logps/rejected": -302.6599426269531, + "loss": 0.7318, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9059358239173889, + "rewards/margins": 3.2106738090515137, + "rewards/rejected": -4.116609573364258, + "step": 4413 + }, + { + "epoch": 0.51, + "learning_rate": 1.4959616059932106e-07, + "logits/chosen": -2.292069435119629, + "logits/rejected": -2.416987419128418, + "logps/chosen": -334.265869140625, + "logps/rejected": -217.0530548095703, + "loss": 0.2097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5611006021499634, + "rewards/margins": 2.1938726902008057, + "rewards/rejected": -2.7549731731414795, + "step": 4414 + }, + { + "epoch": 0.51, + "learning_rate": 1.4956104412969684e-07, + "logits/chosen": -2.4665753841400146, + "logits/rejected": -2.1229043006896973, + "logps/chosen": -175.5201416015625, + "logps/rejected": -237.0979766845703, + "loss": 0.2978, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2828904688358307, + "rewards/margins": 3.38396954536438, + "rewards/rejected": -3.6668601036071777, + "step": 4415 + }, + { + "epoch": 0.51, + "learning_rate": 1.4952592766007256e-07, + "logits/chosen": -2.1916344165802, + "logits/rejected": -1.9670439958572388, + "logps/chosen": -406.981689453125, + "logps/rejected": -292.5906066894531, + "loss": 0.3533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8950545191764832, + "rewards/margins": 1.7992054224014282, + "rewards/rejected": -2.6942601203918457, + "step": 4416 + }, + { + "epoch": 0.51, + "learning_rate": 1.4949081119044832e-07, + "logits/chosen": -2.2927234172821045, + "logits/rejected": -2.1787009239196777, + "logps/chosen": -372.8576965332031, + "logps/rejected": -334.71502685546875, + "loss": 1.2229, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1873761415481567, + "rewards/margins": 0.28177520632743835, + "rewards/rejected": -1.469151496887207, + "step": 4417 + }, + { + "epoch": 0.51, + "learning_rate": 1.4945569472082405e-07, + "logits/chosen": -2.366879463195801, + "logits/rejected": -2.375758647918701, + "logps/chosen": -342.30450439453125, + "logps/rejected": -321.07012939453125, + "loss": 0.9259, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3251516819000244, + "rewards/margins": 0.5398135185241699, + "rewards/rejected": -1.8649652004241943, + "step": 4418 + }, + { + "epoch": 0.51, + "learning_rate": 1.4942057825119983e-07, + "logits/chosen": -2.637155055999756, + "logits/rejected": -2.252246856689453, + "logps/chosen": -238.4218292236328, + "logps/rejected": -290.2286071777344, + "loss": 0.9387, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2670173645019531, + "rewards/margins": 2.040536403656006, + "rewards/rejected": -3.307553768157959, + "step": 4419 + }, + { + "epoch": 0.51, + "learning_rate": 1.4938546178157555e-07, + "logits/chosen": -2.3534018993377686, + "logits/rejected": -2.296941041946411, + "logps/chosen": -400.8485107421875, + "logps/rejected": -344.9678649902344, + "loss": 0.6656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46096497774124146, + "rewards/margins": 1.2721960544586182, + "rewards/rejected": -1.7331610918045044, + "step": 4420 + }, + { + "epoch": 0.51, + "learning_rate": 1.493503453119513e-07, + "logits/chosen": -2.573690891265869, + "logits/rejected": -2.6878561973571777, + "logps/chosen": -98.803466796875, + "logps/rejected": -146.39852905273438, + "loss": 0.3071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35607901215553284, + "rewards/margins": 1.4006158113479614, + "rewards/rejected": -1.7566946744918823, + "step": 4421 + }, + { + "epoch": 0.51, + "learning_rate": 1.4931522884232703e-07, + "logits/chosen": -2.2322568893432617, + "logits/rejected": -2.3680336475372314, + "logps/chosen": -254.06207275390625, + "logps/rejected": -340.3240051269531, + "loss": 0.6415, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7404909133911133, + "rewards/margins": 1.4835747480392456, + "rewards/rejected": -3.2240660190582275, + "step": 4422 + }, + { + "epoch": 0.51, + "learning_rate": 1.492801123727028e-07, + "logits/chosen": -2.3182945251464844, + "logits/rejected": -2.397336721420288, + "logps/chosen": -386.33026123046875, + "logps/rejected": -312.2720947265625, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6281994581222534, + "rewards/margins": 2.3859877586364746, + "rewards/rejected": -3.0141870975494385, + "step": 4423 + }, + { + "epoch": 0.51, + "learning_rate": 1.4924499590307854e-07, + "logits/chosen": -2.209137439727783, + "logits/rejected": -2.1487185955047607, + "logps/chosen": -380.54449462890625, + "logps/rejected": -418.9285583496094, + "loss": 0.106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8597612380981445, + "rewards/margins": 3.528768301010132, + "rewards/rejected": -4.388529300689697, + "step": 4424 + }, + { + "epoch": 0.51, + "learning_rate": 1.492098794334543e-07, + "logits/chosen": -2.170372247695923, + "logits/rejected": -2.226295232772827, + "logps/chosen": -347.756103515625, + "logps/rejected": -310.4136962890625, + "loss": 0.5343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.731185257434845, + "rewards/margins": 0.9660189747810364, + "rewards/rejected": -1.6972042322158813, + "step": 4425 + }, + { + "epoch": 0.51, + "learning_rate": 1.4917476296383002e-07, + "logits/chosen": -2.5578620433807373, + "logits/rejected": -2.6364550590515137, + "logps/chosen": -148.75418090820312, + "logps/rejected": -170.55633544921875, + "loss": 0.309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5669814944267273, + "rewards/margins": 1.5504794120788574, + "rewards/rejected": -2.1174609661102295, + "step": 4426 + }, + { + "epoch": 0.51, + "learning_rate": 1.4913964649420578e-07, + "logits/chosen": -2.749372720718384, + "logits/rejected": -2.6505022048950195, + "logps/chosen": -184.64102172851562, + "logps/rejected": -349.32415771484375, + "loss": 0.4692, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.104060173034668, + "rewards/margins": 5.216724395751953, + "rewards/rejected": -6.320784568786621, + "step": 4427 + }, + { + "epoch": 0.51, + "learning_rate": 1.4910453002458153e-07, + "logits/chosen": -2.970167875289917, + "logits/rejected": -2.8637990951538086, + "logps/chosen": -267.8927001953125, + "logps/rejected": -200.97976684570312, + "loss": 0.1619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7481886744499207, + "rewards/margins": 2.932701826095581, + "rewards/rejected": -3.6808903217315674, + "step": 4428 + }, + { + "epoch": 0.51, + "learning_rate": 1.4906941355495726e-07, + "logits/chosen": -2.703808546066284, + "logits/rejected": -2.7500805854797363, + "logps/chosen": -451.87493896484375, + "logps/rejected": -315.385498046875, + "loss": 0.4274, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1642968654632568, + "rewards/margins": 2.0509400367736816, + "rewards/rejected": -3.2152366638183594, + "step": 4429 + }, + { + "epoch": 0.51, + "learning_rate": 1.49034297085333e-07, + "logits/chosen": -2.4807991981506348, + "logits/rejected": -2.424607992172241, + "logps/chosen": -238.6458740234375, + "logps/rejected": -170.21987915039062, + "loss": 0.2661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4758965075016022, + "rewards/margins": 1.6259621381759644, + "rewards/rejected": -2.101858615875244, + "step": 4430 + }, + { + "epoch": 0.51, + "learning_rate": 1.4899918061570874e-07, + "logits/chosen": -2.9088430404663086, + "logits/rejected": -2.6264114379882812, + "logps/chosen": -370.944580078125, + "logps/rejected": -317.2470397949219, + "loss": 0.3567, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5854076147079468, + "rewards/margins": 2.1208033561706543, + "rewards/rejected": -3.7062110900878906, + "step": 4431 + }, + { + "epoch": 0.51, + "learning_rate": 1.4896406414608452e-07, + "logits/chosen": -2.196833372116089, + "logits/rejected": -2.181107521057129, + "logps/chosen": -173.17138671875, + "logps/rejected": -282.369873046875, + "loss": 0.5384, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4056010246276855, + "rewards/margins": 2.0913822650909424, + "rewards/rejected": -3.496983289718628, + "step": 4432 + }, + { + "epoch": 0.51, + "learning_rate": 1.4892894767646025e-07, + "logits/chosen": -2.63366961479187, + "logits/rejected": -2.856104612350464, + "logps/chosen": -322.3616943359375, + "logps/rejected": -221.66249084472656, + "loss": 0.3982, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.646748960018158, + "rewards/margins": 1.228312373161316, + "rewards/rejected": -1.875061273574829, + "step": 4433 + }, + { + "epoch": 0.51, + "learning_rate": 1.48893831206836e-07, + "logits/chosen": -2.498450517654419, + "logits/rejected": -2.821737051010132, + "logps/chosen": -427.7338562011719, + "logps/rejected": -370.6763916015625, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.434314489364624, + "rewards/margins": 4.167354106903076, + "rewards/rejected": -5.601668357849121, + "step": 4434 + }, + { + "epoch": 0.51, + "learning_rate": 1.4885871473721173e-07, + "logits/chosen": -2.118738889694214, + "logits/rejected": -2.201975107192993, + "logps/chosen": -329.59039306640625, + "logps/rejected": -250.85366821289062, + "loss": 0.2641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9101407527923584, + "rewards/margins": 2.1612565517425537, + "rewards/rejected": -3.071397304534912, + "step": 4435 + }, + { + "epoch": 0.51, + "learning_rate": 1.488235982675875e-07, + "logits/chosen": -2.48526930809021, + "logits/rejected": -2.5022196769714355, + "logps/chosen": -281.46514892578125, + "logps/rejected": -242.17408752441406, + "loss": 0.4103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4273054897785187, + "rewards/margins": 1.3273950815200806, + "rewards/rejected": -1.7547005414962769, + "step": 4436 + }, + { + "epoch": 0.51, + "learning_rate": 1.4878848179796324e-07, + "logits/chosen": -2.3768343925476074, + "logits/rejected": -2.6962690353393555, + "logps/chosen": -263.70660400390625, + "logps/rejected": -160.17886352539062, + "loss": 0.9392, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2840794324874878, + "rewards/margins": 0.9773442149162292, + "rewards/rejected": -2.2614235877990723, + "step": 4437 + }, + { + "epoch": 0.51, + "learning_rate": 1.48753365328339e-07, + "logits/chosen": -1.6836979389190674, + "logits/rejected": -1.825805425643921, + "logps/chosen": -435.743408203125, + "logps/rejected": -316.8683776855469, + "loss": 0.5712, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8490722179412842, + "rewards/margins": 0.8610494136810303, + "rewards/rejected": -2.7101216316223145, + "step": 4438 + }, + { + "epoch": 0.51, + "learning_rate": 1.4871824885871472e-07, + "logits/chosen": -2.474410057067871, + "logits/rejected": -2.3516759872436523, + "logps/chosen": -349.55914306640625, + "logps/rejected": -288.46807861328125, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9640256762504578, + "rewards/margins": 1.6027610301971436, + "rewards/rejected": -2.566786766052246, + "step": 4439 + }, + { + "epoch": 0.51, + "learning_rate": 1.4868313238909047e-07, + "logits/chosen": -2.4645185470581055, + "logits/rejected": -2.6381964683532715, + "logps/chosen": -198.48594665527344, + "logps/rejected": -330.4175109863281, + "loss": 0.5171, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.212921380996704, + "rewards/margins": 1.6229928731918335, + "rewards/rejected": -2.835914134979248, + "step": 4440 + }, + { + "epoch": 0.51, + "learning_rate": 1.4864801591946623e-07, + "logits/chosen": -1.4849861860275269, + "logits/rejected": -1.858289361000061, + "logps/chosen": -314.8577880859375, + "logps/rejected": -269.4383850097656, + "loss": 0.9898, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7682399153709412, + "rewards/margins": 0.7922964692115784, + "rewards/rejected": -1.5605363845825195, + "step": 4441 + }, + { + "epoch": 0.51, + "learning_rate": 1.4861289944984198e-07, + "logits/chosen": -2.094756603240967, + "logits/rejected": -2.302967071533203, + "logps/chosen": -240.009033203125, + "logps/rejected": -213.2340087890625, + "loss": 0.4126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5783910751342773, + "rewards/margins": 1.0901132822036743, + "rewards/rejected": -1.668504238128662, + "step": 4442 + }, + { + "epoch": 0.51, + "learning_rate": 1.485777829802177e-07, + "logits/chosen": -2.3783814907073975, + "logits/rejected": -2.2720000743865967, + "logps/chosen": -388.21527099609375, + "logps/rejected": -367.39617919921875, + "loss": 1.1496, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5559320449829102, + "rewards/margins": 0.011534154415130615, + "rewards/rejected": -1.567466139793396, + "step": 4443 + }, + { + "epoch": 0.51, + "learning_rate": 1.4854266651059346e-07, + "logits/chosen": -2.6178956031799316, + "logits/rejected": -2.6693968772888184, + "logps/chosen": -188.4164581298828, + "logps/rejected": -241.90357971191406, + "loss": 0.4354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.798380970954895, + "rewards/margins": 2.095221519470215, + "rewards/rejected": -2.8936023712158203, + "step": 4444 + }, + { + "epoch": 0.51, + "learning_rate": 1.4850755004096921e-07, + "logits/chosen": -2.2821362018585205, + "logits/rejected": -2.242098808288574, + "logps/chosen": -254.4058837890625, + "logps/rejected": -295.2167053222656, + "loss": 0.3165, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1763734817504883, + "rewards/margins": 2.642749786376953, + "rewards/rejected": -3.8191232681274414, + "step": 4445 + }, + { + "epoch": 0.51, + "learning_rate": 1.4847243357134494e-07, + "logits/chosen": -2.4180166721343994, + "logits/rejected": -2.201314926147461, + "logps/chosen": -268.99957275390625, + "logps/rejected": -347.82696533203125, + "loss": 0.5291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5936076641082764, + "rewards/margins": 1.7978612184524536, + "rewards/rejected": -2.3914690017700195, + "step": 4446 + }, + { + "epoch": 0.51, + "learning_rate": 1.484373171017207e-07, + "logits/chosen": -2.3547325134277344, + "logits/rejected": -2.1295547485351562, + "logps/chosen": -298.8110656738281, + "logps/rejected": -257.8313903808594, + "loss": 0.3195, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5578607320785522, + "rewards/margins": 1.7177152633666992, + "rewards/rejected": -2.275575876235962, + "step": 4447 + }, + { + "epoch": 0.51, + "learning_rate": 1.4840220063209645e-07, + "logits/chosen": -2.3519792556762695, + "logits/rejected": -2.032566547393799, + "logps/chosen": -210.90963745117188, + "logps/rejected": -294.7355651855469, + "loss": 0.2913, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.832373857498169, + "rewards/margins": 2.7226650714874268, + "rewards/rejected": -3.5550389289855957, + "step": 4448 + }, + { + "epoch": 0.51, + "learning_rate": 1.483670841624722e-07, + "logits/chosen": -2.748250961303711, + "logits/rejected": -2.6873486042022705, + "logps/chosen": -241.35409545898438, + "logps/rejected": -247.94622802734375, + "loss": 0.621, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0967540740966797, + "rewards/margins": 1.5927733182907104, + "rewards/rejected": -2.6895275115966797, + "step": 4449 + }, + { + "epoch": 0.51, + "learning_rate": 1.4833196769284793e-07, + "logits/chosen": -1.9362890720367432, + "logits/rejected": -2.081124782562256, + "logps/chosen": -336.4212951660156, + "logps/rejected": -253.56787109375, + "loss": 0.4349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2617247700691223, + "rewards/margins": 2.0517971515655518, + "rewards/rejected": -2.3135221004486084, + "step": 4450 + }, + { + "epoch": 0.51, + "learning_rate": 1.4829685122322368e-07, + "logits/chosen": -2.0927517414093018, + "logits/rejected": -2.36649489402771, + "logps/chosen": -439.09820556640625, + "logps/rejected": -306.078125, + "loss": 0.3764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7421138286590576, + "rewards/margins": 1.686855673789978, + "rewards/rejected": -2.428969621658325, + "step": 4451 + }, + { + "epoch": 0.51, + "learning_rate": 1.4826173475359944e-07, + "logits/chosen": -2.870213270187378, + "logits/rejected": -2.939051389694214, + "logps/chosen": -226.2096405029297, + "logps/rejected": -266.0019226074219, + "loss": 0.1233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2871474325656891, + "rewards/margins": 3.787716865539551, + "rewards/rejected": -3.5005698204040527, + "step": 4452 + }, + { + "epoch": 0.51, + "learning_rate": 1.482266182839752e-07, + "logits/chosen": -2.2764995098114014, + "logits/rejected": -2.602806568145752, + "logps/chosen": -393.76068115234375, + "logps/rejected": -259.2311096191406, + "loss": 0.4364, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.053565263748169, + "rewards/margins": 1.7632946968078613, + "rewards/rejected": -2.8168599605560303, + "step": 4453 + }, + { + "epoch": 0.51, + "learning_rate": 1.4819150181435092e-07, + "logits/chosen": -1.9903584718704224, + "logits/rejected": -2.2098495960235596, + "logps/chosen": -322.26336669921875, + "logps/rejected": -207.9785614013672, + "loss": 1.2734, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7098801136016846, + "rewards/margins": 0.09250184893608093, + "rewards/rejected": -1.802381992340088, + "step": 4454 + }, + { + "epoch": 0.51, + "learning_rate": 1.4815638534472667e-07, + "logits/chosen": -2.146064519882202, + "logits/rejected": -2.3438355922698975, + "logps/chosen": -303.592041015625, + "logps/rejected": -282.63238525390625, + "loss": 0.525, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7876701951026917, + "rewards/margins": 1.7923455238342285, + "rewards/rejected": -2.5800156593322754, + "step": 4455 + }, + { + "epoch": 0.51, + "learning_rate": 1.4812126887510243e-07, + "logits/chosen": -2.0809593200683594, + "logits/rejected": -2.1487183570861816, + "logps/chosen": -196.32142639160156, + "logps/rejected": -170.9724578857422, + "loss": 0.608, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0847036838531494, + "rewards/margins": 2.7758662700653076, + "rewards/rejected": -3.860569953918457, + "step": 4456 + }, + { + "epoch": 0.51, + "learning_rate": 1.4808615240547815e-07, + "logits/chosen": -1.7208313941955566, + "logits/rejected": -2.0417115688323975, + "logps/chosen": -662.3818969726562, + "logps/rejected": -473.0665283203125, + "loss": 0.279, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6615954637527466, + "rewards/margins": 1.976548194885254, + "rewards/rejected": -2.638143539428711, + "step": 4457 + }, + { + "epoch": 0.51, + "learning_rate": 1.480510359358539e-07, + "logits/chosen": -2.49947452545166, + "logits/rejected": -2.480313777923584, + "logps/chosen": -299.54742431640625, + "logps/rejected": -246.77261352539062, + "loss": 0.4221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04095262661576271, + "rewards/margins": 1.2501648664474487, + "rewards/rejected": -1.2911174297332764, + "step": 4458 + }, + { + "epoch": 0.51, + "learning_rate": 1.4801591946622966e-07, + "logits/chosen": -1.4888523817062378, + "logits/rejected": -2.0771484375, + "logps/chosen": -628.9827880859375, + "logps/rejected": -441.0490417480469, + "loss": 0.3968, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7869725823402405, + "rewards/margins": 2.0100557804107666, + "rewards/rejected": -2.7970285415649414, + "step": 4459 + }, + { + "epoch": 0.51, + "learning_rate": 1.4798080299660542e-07, + "logits/chosen": -2.2413089275360107, + "logits/rejected": -2.422260284423828, + "logps/chosen": -352.6315002441406, + "logps/rejected": -309.6556701660156, + "loss": 0.3575, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.943009614944458, + "rewards/margins": 2.0596299171447754, + "rewards/rejected": -3.0026395320892334, + "step": 4460 + }, + { + "epoch": 0.51, + "learning_rate": 1.4794568652698114e-07, + "logits/chosen": -1.9695837497711182, + "logits/rejected": -2.5007100105285645, + "logps/chosen": -486.53485107421875, + "logps/rejected": -246.1357421875, + "loss": 0.5134, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1362903118133545, + "rewards/margins": 1.7888391017913818, + "rewards/rejected": -2.9251291751861572, + "step": 4461 + }, + { + "epoch": 0.51, + "learning_rate": 1.479105700573569e-07, + "logits/chosen": -2.4370436668395996, + "logits/rejected": -2.8392114639282227, + "logps/chosen": -278.5213623046875, + "logps/rejected": -267.1737976074219, + "loss": 0.4285, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8668200969696045, + "rewards/margins": 1.079703450202942, + "rewards/rejected": -2.9465231895446777, + "step": 4462 + }, + { + "epoch": 0.51, + "learning_rate": 1.4787545358773262e-07, + "logits/chosen": -2.282349109649658, + "logits/rejected": -2.0761044025421143, + "logps/chosen": -252.81321716308594, + "logps/rejected": -227.95835876464844, + "loss": 0.3308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21922454237937927, + "rewards/margins": 2.4188859462738037, + "rewards/rejected": -2.638110399246216, + "step": 4463 + }, + { + "epoch": 0.51, + "learning_rate": 1.478403371181084e-07, + "logits/chosen": -2.6826605796813965, + "logits/rejected": -2.5897433757781982, + "logps/chosen": -342.3301696777344, + "logps/rejected": -331.8843078613281, + "loss": 0.4278, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3521950244903564, + "rewards/margins": 2.4890451431274414, + "rewards/rejected": -3.841240406036377, + "step": 4464 + }, + { + "epoch": 0.51, + "learning_rate": 1.4780522064848413e-07, + "logits/chosen": -1.622671127319336, + "logits/rejected": -1.996875286102295, + "logps/chosen": -538.4609375, + "logps/rejected": -334.892822265625, + "loss": 0.3346, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4674144983291626, + "rewards/margins": 1.9139785766601562, + "rewards/rejected": -2.3813929557800293, + "step": 4465 + }, + { + "epoch": 0.51, + "learning_rate": 1.4777010417885989e-07, + "logits/chosen": -2.434345006942749, + "logits/rejected": -2.4568402767181396, + "logps/chosen": -134.27325439453125, + "logps/rejected": -128.87518310546875, + "loss": 0.5619, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6218387484550476, + "rewards/margins": 1.1340956687927246, + "rewards/rejected": -1.755934476852417, + "step": 4466 + }, + { + "epoch": 0.51, + "learning_rate": 1.4773498770923561e-07, + "logits/chosen": -2.5174996852874756, + "logits/rejected": -2.594666004180908, + "logps/chosen": -247.3837127685547, + "logps/rejected": -188.14495849609375, + "loss": 0.3211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4741041958332062, + "rewards/margins": 1.3423324823379517, + "rewards/rejected": -1.816436767578125, + "step": 4467 + }, + { + "epoch": 0.52, + "learning_rate": 1.4769987123961137e-07, + "logits/chosen": -2.215494394302368, + "logits/rejected": -2.5402910709381104, + "logps/chosen": -264.1390075683594, + "logps/rejected": -200.92201232910156, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6050701141357422, + "rewards/margins": 2.0514638423919678, + "rewards/rejected": -2.656533718109131, + "step": 4468 + }, + { + "epoch": 0.52, + "learning_rate": 1.4766475476998712e-07, + "logits/chosen": -1.6187388896942139, + "logits/rejected": -1.8504639863967896, + "logps/chosen": -524.217041015625, + "logps/rejected": -456.13421630859375, + "loss": 0.3871, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2714044451713562, + "rewards/margins": 1.3004857301712036, + "rewards/rejected": -1.029081106185913, + "step": 4469 + }, + { + "epoch": 0.52, + "learning_rate": 1.4762963830036288e-07, + "logits/chosen": -2.186417579650879, + "logits/rejected": -2.1987416744232178, + "logps/chosen": -470.5823059082031, + "logps/rejected": -348.0182800292969, + "loss": 0.1717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9588849544525146, + "rewards/margins": 2.437175750732422, + "rewards/rejected": -3.3960604667663574, + "step": 4470 + }, + { + "epoch": 0.52, + "learning_rate": 1.475945218307386e-07, + "logits/chosen": -2.3560433387756348, + "logits/rejected": -2.578497886657715, + "logps/chosen": -326.55908203125, + "logps/rejected": -131.5485076904297, + "loss": 0.5741, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5496110916137695, + "rewards/margins": 0.7248479723930359, + "rewards/rejected": -2.274458885192871, + "step": 4471 + }, + { + "epoch": 0.52, + "learning_rate": 1.4755940536111436e-07, + "logits/chosen": -2.6924567222595215, + "logits/rejected": -2.5458173751831055, + "logps/chosen": -185.73345947265625, + "logps/rejected": -280.0475769042969, + "loss": 0.2526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6125017404556274, + "rewards/margins": 2.432652235031128, + "rewards/rejected": -3.045154094696045, + "step": 4472 + }, + { + "epoch": 0.52, + "learning_rate": 1.475242888914901e-07, + "logits/chosen": -2.6343376636505127, + "logits/rejected": -2.54472017288208, + "logps/chosen": -280.26416015625, + "logps/rejected": -268.6033020019531, + "loss": 0.1843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6646018028259277, + "rewards/margins": 3.119860887527466, + "rewards/rejected": -3.7844629287719727, + "step": 4473 + }, + { + "epoch": 0.52, + "learning_rate": 1.4748917242186584e-07, + "logits/chosen": -2.707669734954834, + "logits/rejected": -2.664632558822632, + "logps/chosen": -236.0364532470703, + "logps/rejected": -161.78228759765625, + "loss": 0.832, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2722084522247314, + "rewards/margins": 0.49522536993026733, + "rewards/rejected": -1.7674338817596436, + "step": 4474 + }, + { + "epoch": 0.52, + "learning_rate": 1.474540559522416e-07, + "logits/chosen": -2.927828311920166, + "logits/rejected": -2.8409221172332764, + "logps/chosen": -134.15802001953125, + "logps/rejected": -246.38671875, + "loss": 0.519, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0799561738967896, + "rewards/margins": 0.9935874938964844, + "rewards/rejected": -2.0735435485839844, + "step": 4475 + }, + { + "epoch": 0.52, + "learning_rate": 1.4741893948261735e-07, + "logits/chosen": -2.4954991340637207, + "logits/rejected": -2.3231098651885986, + "logps/chosen": -275.16961669921875, + "logps/rejected": -410.2815246582031, + "loss": 0.5648, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.469271183013916, + "rewards/margins": 2.2491796016693115, + "rewards/rejected": -3.7184507846832275, + "step": 4476 + }, + { + "epoch": 0.52, + "learning_rate": 1.473838230129931e-07, + "logits/chosen": -2.785229206085205, + "logits/rejected": -2.737241744995117, + "logps/chosen": -307.60626220703125, + "logps/rejected": -185.40164184570312, + "loss": 0.2865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.660555362701416, + "rewards/margins": 2.2433254718780518, + "rewards/rejected": -2.9038808345794678, + "step": 4477 + }, + { + "epoch": 0.52, + "learning_rate": 1.4734870654336883e-07, + "logits/chosen": -2.731182336807251, + "logits/rejected": -2.6958627700805664, + "logps/chosen": -77.90621185302734, + "logps/rejected": -197.57177734375, + "loss": 0.138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47344452142715454, + "rewards/margins": 3.2935848236083984, + "rewards/rejected": -3.767029285430908, + "step": 4478 + }, + { + "epoch": 0.52, + "learning_rate": 1.4731359007374458e-07, + "logits/chosen": -2.5410842895507812, + "logits/rejected": -2.3460164070129395, + "logps/chosen": -339.125, + "logps/rejected": -376.1325988769531, + "loss": 0.5458, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2220261096954346, + "rewards/margins": 3.107224464416504, + "rewards/rejected": -4.329250335693359, + "step": 4479 + }, + { + "epoch": 0.52, + "learning_rate": 1.472784736041203e-07, + "logits/chosen": -2.5423057079315186, + "logits/rejected": -2.6901211738586426, + "logps/chosen": -198.87762451171875, + "logps/rejected": -142.54005432128906, + "loss": 0.3485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6240918040275574, + "rewards/margins": 1.5880794525146484, + "rewards/rejected": -2.2121710777282715, + "step": 4480 + }, + { + "epoch": 0.52, + "learning_rate": 1.472433571344961e-07, + "logits/chosen": -2.179201126098633, + "logits/rejected": -2.4199557304382324, + "logps/chosen": -316.27996826171875, + "logps/rejected": -209.21133422851562, + "loss": 0.2346, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.054795265197754, + "rewards/margins": 2.011411190032959, + "rewards/rejected": -3.066206455230713, + "step": 4481 + }, + { + "epoch": 0.52, + "learning_rate": 1.4720824066487182e-07, + "logits/chosen": -2.6213133335113525, + "logits/rejected": -2.7548229694366455, + "logps/chosen": -339.12921142578125, + "logps/rejected": -173.25657653808594, + "loss": 0.1428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7930293679237366, + "rewards/margins": 2.6279726028442383, + "rewards/rejected": -3.42100191116333, + "step": 4482 + }, + { + "epoch": 0.52, + "learning_rate": 1.4717312419524757e-07, + "logits/chosen": -2.3083338737487793, + "logits/rejected": -2.510371208190918, + "logps/chosen": -352.69171142578125, + "logps/rejected": -330.3204345703125, + "loss": 0.293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8534500598907471, + "rewards/margins": 3.122562885284424, + "rewards/rejected": -3.976012706756592, + "step": 4483 + }, + { + "epoch": 0.52, + "learning_rate": 1.471380077256233e-07, + "logits/chosen": -2.0656721591949463, + "logits/rejected": -1.7116670608520508, + "logps/chosen": -316.55865478515625, + "logps/rejected": -411.9486999511719, + "loss": 0.54, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1584051847457886, + "rewards/margins": 1.3888499736785889, + "rewards/rejected": -2.547255039215088, + "step": 4484 + }, + { + "epoch": 0.52, + "learning_rate": 1.4710289125599905e-07, + "logits/chosen": -2.4220194816589355, + "logits/rejected": -2.2707133293151855, + "logps/chosen": -359.6371765136719, + "logps/rejected": -376.0588684082031, + "loss": 0.1581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6351569294929504, + "rewards/margins": 2.8409934043884277, + "rewards/rejected": -3.4761502742767334, + "step": 4485 + }, + { + "epoch": 0.52, + "learning_rate": 1.470677747863748e-07, + "logits/chosen": -1.7333203554153442, + "logits/rejected": -1.972923755645752, + "logps/chosen": -399.52703857421875, + "logps/rejected": -323.70843505859375, + "loss": 0.8897, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0897634029388428, + "rewards/margins": 0.013392195105552673, + "rewards/rejected": -1.1031556129455566, + "step": 4486 + }, + { + "epoch": 0.52, + "learning_rate": 1.4703265831675056e-07, + "logits/chosen": -2.18430757522583, + "logits/rejected": -2.2442080974578857, + "logps/chosen": -259.8802490234375, + "logps/rejected": -202.96051025390625, + "loss": 0.346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7015755772590637, + "rewards/margins": 1.8932397365570068, + "rewards/rejected": -2.594815254211426, + "step": 4487 + }, + { + "epoch": 0.52, + "learning_rate": 1.4699754184712629e-07, + "logits/chosen": -2.389779806137085, + "logits/rejected": -2.4325294494628906, + "logps/chosen": -134.78282165527344, + "logps/rejected": -219.6618194580078, + "loss": 0.4086, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.729432225227356, + "rewards/margins": 1.88468599319458, + "rewards/rejected": -2.6141183376312256, + "step": 4488 + }, + { + "epoch": 0.52, + "learning_rate": 1.4696242537750204e-07, + "logits/chosen": -2.8179969787597656, + "logits/rejected": -2.741070508956909, + "logps/chosen": -233.77828979492188, + "logps/rejected": -258.36773681640625, + "loss": 0.197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7849101424217224, + "rewards/margins": 3.712263345718384, + "rewards/rejected": -4.497173309326172, + "step": 4489 + }, + { + "epoch": 0.52, + "learning_rate": 1.469273089078778e-07, + "logits/chosen": -1.5593934059143066, + "logits/rejected": -1.6899141073226929, + "logps/chosen": -202.8016357421875, + "logps/rejected": -123.70382690429688, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.838225245475769, + "rewards/margins": 0.5016802549362183, + "rewards/rejected": -1.3399055004119873, + "step": 4490 + }, + { + "epoch": 0.52, + "learning_rate": 1.4689219243825352e-07, + "logits/chosen": -2.343294620513916, + "logits/rejected": -2.1513919830322266, + "logps/chosen": -211.0234832763672, + "logps/rejected": -244.654296875, + "loss": 0.3159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6863055229187012, + "rewards/margins": 1.7290761470794678, + "rewards/rejected": -2.415381669998169, + "step": 4491 + }, + { + "epoch": 0.52, + "learning_rate": 1.4685707596862928e-07, + "logits/chosen": -2.176378011703491, + "logits/rejected": -2.2722883224487305, + "logps/chosen": -346.59869384765625, + "logps/rejected": -224.1952667236328, + "loss": 0.3758, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49532365798950195, + "rewards/margins": 1.610236644744873, + "rewards/rejected": -2.105560302734375, + "step": 4492 + }, + { + "epoch": 0.52, + "learning_rate": 1.4682195949900503e-07, + "logits/chosen": -2.387479782104492, + "logits/rejected": -2.5482609272003174, + "logps/chosen": -319.1257019042969, + "logps/rejected": -290.103759765625, + "loss": 0.2748, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05510914325714111, + "rewards/margins": 2.382779359817505, + "rewards/rejected": -2.3276703357696533, + "step": 4493 + }, + { + "epoch": 0.52, + "learning_rate": 1.4678684302938078e-07, + "logits/chosen": -2.267709732055664, + "logits/rejected": -2.3186185359954834, + "logps/chosen": -221.2650146484375, + "logps/rejected": -246.26805114746094, + "loss": 0.1793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6225433945655823, + "rewards/margins": 3.758993625640869, + "rewards/rejected": -4.381536960601807, + "step": 4494 + }, + { + "epoch": 0.52, + "learning_rate": 1.467517265597565e-07, + "logits/chosen": -2.0684540271759033, + "logits/rejected": -2.0069634914398193, + "logps/chosen": -483.1722412109375, + "logps/rejected": -357.3843994140625, + "loss": 0.1406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17213335633277893, + "rewards/margins": 3.1648478507995605, + "rewards/rejected": -3.3369812965393066, + "step": 4495 + }, + { + "epoch": 0.52, + "learning_rate": 1.4671661009013226e-07, + "logits/chosen": -2.0557589530944824, + "logits/rejected": -2.1323211193084717, + "logps/chosen": -276.6868591308594, + "logps/rejected": -198.9653778076172, + "loss": 0.4162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20570075511932373, + "rewards/margins": 1.5732455253601074, + "rewards/rejected": -1.7789462804794312, + "step": 4496 + }, + { + "epoch": 0.52, + "learning_rate": 1.4668149362050802e-07, + "logits/chosen": -2.6851868629455566, + "logits/rejected": -2.810778856277466, + "logps/chosen": -334.06329345703125, + "logps/rejected": -210.68740844726562, + "loss": 0.3198, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5022500157356262, + "rewards/margins": 1.760735034942627, + "rewards/rejected": -2.2629849910736084, + "step": 4497 + }, + { + "epoch": 0.52, + "learning_rate": 1.4664637715088377e-07, + "logits/chosen": -1.851689100265503, + "logits/rejected": -1.7481874227523804, + "logps/chosen": -216.99822998046875, + "logps/rejected": -260.7594299316406, + "loss": 0.7177, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3970950841903687, + "rewards/margins": 1.1934711933135986, + "rewards/rejected": -2.590566635131836, + "step": 4498 + }, + { + "epoch": 0.52, + "learning_rate": 1.466112606812595e-07, + "logits/chosen": -2.0405709743499756, + "logits/rejected": -2.303290605545044, + "logps/chosen": -340.3137512207031, + "logps/rejected": -289.8396911621094, + "loss": 0.4222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5893044471740723, + "rewards/margins": 0.9442291855812073, + "rewards/rejected": -1.5335336923599243, + "step": 4499 + }, + { + "epoch": 0.52, + "learning_rate": 1.4657614421163525e-07, + "logits/chosen": -2.13757061958313, + "logits/rejected": -2.0688674449920654, + "logps/chosen": -229.76617431640625, + "logps/rejected": -294.1022033691406, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2823072075843811, + "rewards/margins": 3.8348727226257324, + "rewards/rejected": -4.117180347442627, + "step": 4500 + }, + { + "epoch": 0.52, + "learning_rate": 1.46541027742011e-07, + "logits/chosen": -2.217947244644165, + "logits/rejected": -2.3102259635925293, + "logps/chosen": -520.8761596679688, + "logps/rejected": -394.6619873046875, + "loss": 0.2926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7544752359390259, + "rewards/margins": 1.644221305847168, + "rewards/rejected": -2.3986964225769043, + "step": 4501 + }, + { + "epoch": 0.52, + "learning_rate": 1.4650591127238673e-07, + "logits/chosen": -2.0808098316192627, + "logits/rejected": -2.2791736125946045, + "logps/chosen": -360.8982238769531, + "logps/rejected": -243.47378540039062, + "loss": 0.3994, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8330113887786865, + "rewards/margins": 1.5642836093902588, + "rewards/rejected": -3.3972949981689453, + "step": 4502 + }, + { + "epoch": 0.52, + "learning_rate": 1.464707948027625e-07, + "logits/chosen": -2.473252773284912, + "logits/rejected": -2.291365146636963, + "logps/chosen": -278.7706298828125, + "logps/rejected": -311.88800048828125, + "loss": 0.4678, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.322400838136673, + "rewards/margins": 1.9875047206878662, + "rewards/rejected": -2.309905529022217, + "step": 4503 + }, + { + "epoch": 0.52, + "learning_rate": 1.4643567833313824e-07, + "logits/chosen": -2.599180221557617, + "logits/rejected": -2.3911890983581543, + "logps/chosen": -181.87277221679688, + "logps/rejected": -318.3017883300781, + "loss": 0.7116, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5440202951431274, + "rewards/margins": 0.972307026386261, + "rewards/rejected": -2.516327381134033, + "step": 4504 + }, + { + "epoch": 0.52, + "learning_rate": 1.46400561863514e-07, + "logits/chosen": -2.3758411407470703, + "logits/rejected": -2.3114404678344727, + "logps/chosen": -179.9510955810547, + "logps/rejected": -283.39666748046875, + "loss": 0.263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8852573037147522, + "rewards/margins": 2.490983009338379, + "rewards/rejected": -3.3762402534484863, + "step": 4505 + }, + { + "epoch": 0.52, + "learning_rate": 1.4636544539388972e-07, + "logits/chosen": -2.607492446899414, + "logits/rejected": -2.6718966960906982, + "logps/chosen": -279.0472106933594, + "logps/rejected": -221.83511352539062, + "loss": 0.5404, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7733289003372192, + "rewards/margins": 1.2744537591934204, + "rewards/rejected": -2.0477826595306396, + "step": 4506 + }, + { + "epoch": 0.52, + "learning_rate": 1.4633032892426548e-07, + "logits/chosen": -1.837442398071289, + "logits/rejected": -2.192164421081543, + "logps/chosen": -248.79234313964844, + "logps/rejected": -244.24630737304688, + "loss": 0.3767, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.423673152923584, + "rewards/margins": 1.4238229990005493, + "rewards/rejected": -2.847496271133423, + "step": 4507 + }, + { + "epoch": 0.52, + "learning_rate": 1.462952124546412e-07, + "logits/chosen": -2.4733073711395264, + "logits/rejected": -2.504044771194458, + "logps/chosen": -283.50311279296875, + "logps/rejected": -194.76959228515625, + "loss": 0.3322, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9167194962501526, + "rewards/margins": 1.6620031595230103, + "rewards/rejected": -2.5787227153778076, + "step": 4508 + }, + { + "epoch": 0.52, + "learning_rate": 1.4626009598501698e-07, + "logits/chosen": -2.1990692615509033, + "logits/rejected": -2.0533385276794434, + "logps/chosen": -269.2598571777344, + "logps/rejected": -286.0405578613281, + "loss": 0.1852, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24839991331100464, + "rewards/margins": 3.5662620067596436, + "rewards/rejected": -3.814661979675293, + "step": 4509 + }, + { + "epoch": 0.52, + "learning_rate": 1.462249795153927e-07, + "logits/chosen": -2.620501756668091, + "logits/rejected": -2.7877979278564453, + "logps/chosen": -324.80810546875, + "logps/rejected": -255.01504516601562, + "loss": 0.447, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9009279012680054, + "rewards/margins": 1.5780556201934814, + "rewards/rejected": -2.4789836406707764, + "step": 4510 + }, + { + "epoch": 0.52, + "learning_rate": 1.4618986304576847e-07, + "logits/chosen": -2.619136095046997, + "logits/rejected": -2.7170040607452393, + "logps/chosen": -400.19036865234375, + "logps/rejected": -348.3314208984375, + "loss": 0.6747, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.899999737739563, + "rewards/margins": 0.6333312392234802, + "rewards/rejected": -2.5333309173583984, + "step": 4511 + }, + { + "epoch": 0.52, + "learning_rate": 1.461547465761442e-07, + "logits/chosen": -2.0552425384521484, + "logits/rejected": -2.458050489425659, + "logps/chosen": -408.7151794433594, + "logps/rejected": -273.6378173828125, + "loss": 0.2871, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0602450966835022, + "rewards/margins": 3.8381357192993164, + "rewards/rejected": -3.77789044380188, + "step": 4512 + }, + { + "epoch": 0.52, + "learning_rate": 1.4611963010651995e-07, + "logits/chosen": -2.076575994491577, + "logits/rejected": -2.1355667114257812, + "logps/chosen": -187.89028930664062, + "logps/rejected": -189.1169891357422, + "loss": 0.8855, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6498175859451294, + "rewards/margins": 0.46200644969940186, + "rewards/rejected": -2.1118240356445312, + "step": 4513 + }, + { + "epoch": 0.52, + "learning_rate": 1.460845136368957e-07, + "logits/chosen": -2.65397047996521, + "logits/rejected": -2.8207035064697266, + "logps/chosen": -336.3890380859375, + "logps/rejected": -206.71533203125, + "loss": 0.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023110896348953247, + "rewards/margins": 2.6709327697753906, + "rewards/rejected": -2.6940436363220215, + "step": 4514 + }, + { + "epoch": 0.52, + "learning_rate": 1.4604939716727145e-07, + "logits/chosen": -2.8722126483917236, + "logits/rejected": -2.889099597930908, + "logps/chosen": -201.3444366455078, + "logps/rejected": -202.95852661132812, + "loss": 0.4802, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6602544784545898, + "rewards/margins": 2.854356050491333, + "rewards/rejected": -3.514610528945923, + "step": 4515 + }, + { + "epoch": 0.52, + "learning_rate": 1.4601428069764718e-07, + "logits/chosen": -1.9264907836914062, + "logits/rejected": -1.8268933296203613, + "logps/chosen": -374.7296142578125, + "logps/rejected": -334.5618896484375, + "loss": 0.2962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45489370822906494, + "rewards/margins": 1.5305794477462769, + "rewards/rejected": -1.9854731559753418, + "step": 4516 + }, + { + "epoch": 0.52, + "learning_rate": 1.4597916422802294e-07, + "logits/chosen": -2.623107433319092, + "logits/rejected": -2.457263469696045, + "logps/chosen": -244.21896362304688, + "logps/rejected": -296.065673828125, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.511769711971283, + "rewards/margins": 3.6109976768493652, + "rewards/rejected": -4.122767448425293, + "step": 4517 + }, + { + "epoch": 0.52, + "learning_rate": 1.459440477583987e-07, + "logits/chosen": -2.035048484802246, + "logits/rejected": -2.0463597774505615, + "logps/chosen": -250.5462188720703, + "logps/rejected": -208.28948974609375, + "loss": 0.4118, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5646042823791504, + "rewards/margins": 0.9370508790016174, + "rewards/rejected": -1.5016552209854126, + "step": 4518 + }, + { + "epoch": 0.52, + "learning_rate": 1.4590893128877442e-07, + "logits/chosen": -2.7378928661346436, + "logits/rejected": -2.563575267791748, + "logps/chosen": -185.91830444335938, + "logps/rejected": -203.30459594726562, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.296256184577942, + "rewards/margins": 0.8501039743423462, + "rewards/rejected": -2.146360397338867, + "step": 4519 + }, + { + "epoch": 0.52, + "learning_rate": 1.4587381481915017e-07, + "logits/chosen": -2.4146804809570312, + "logits/rejected": -2.5262649059295654, + "logps/chosen": -269.25787353515625, + "logps/rejected": -319.20318603515625, + "loss": 0.4422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9839269518852234, + "rewards/margins": 2.2015788555145264, + "rewards/rejected": -3.1855058670043945, + "step": 4520 + }, + { + "epoch": 0.52, + "learning_rate": 1.4583869834952593e-07, + "logits/chosen": -2.457547187805176, + "logits/rejected": -2.4766883850097656, + "logps/chosen": -319.3867492675781, + "logps/rejected": -250.71726989746094, + "loss": 0.2798, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2736703157424927, + "rewards/margins": 1.7027270793914795, + "rewards/rejected": -2.9763975143432617, + "step": 4521 + }, + { + "epoch": 0.52, + "learning_rate": 1.4580358187990168e-07, + "logits/chosen": -2.667721748352051, + "logits/rejected": -2.4685215950012207, + "logps/chosen": -216.03231811523438, + "logps/rejected": -272.1253662109375, + "loss": 0.1038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24979811906814575, + "rewards/margins": 3.6365599632263184, + "rewards/rejected": -3.3867616653442383, + "step": 4522 + }, + { + "epoch": 0.52, + "learning_rate": 1.457684654102774e-07, + "logits/chosen": -2.212775945663452, + "logits/rejected": -2.1277527809143066, + "logps/chosen": -291.72509765625, + "logps/rejected": -236.2608184814453, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6684090495109558, + "rewards/margins": 1.0045890808105469, + "rewards/rejected": -1.6729981899261475, + "step": 4523 + }, + { + "epoch": 0.52, + "learning_rate": 1.4573334894065316e-07, + "logits/chosen": -2.711832284927368, + "logits/rejected": -2.7116010189056396, + "logps/chosen": -235.4091033935547, + "logps/rejected": -181.9007110595703, + "loss": 0.2589, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4192194938659668, + "rewards/margins": 2.897202730178833, + "rewards/rejected": -4.316421985626221, + "step": 4524 + }, + { + "epoch": 0.52, + "learning_rate": 1.456982324710289e-07, + "logits/chosen": -2.357492208480835, + "logits/rejected": -2.464512586593628, + "logps/chosen": -261.04168701171875, + "logps/rejected": -233.74676513671875, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20815163850784302, + "rewards/margins": 1.1046596765518188, + "rewards/rejected": -1.3128113746643066, + "step": 4525 + }, + { + "epoch": 0.52, + "learning_rate": 1.4566311600140467e-07, + "logits/chosen": -2.386981964111328, + "logits/rejected": -2.3751535415649414, + "logps/chosen": -304.88043212890625, + "logps/rejected": -288.79888916015625, + "loss": 0.6159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4941464960575104, + "rewards/margins": 1.4007129669189453, + "rewards/rejected": -1.8948595523834229, + "step": 4526 + }, + { + "epoch": 0.52, + "learning_rate": 1.456279995317804e-07, + "logits/chosen": -2.1958937644958496, + "logits/rejected": -2.1279287338256836, + "logps/chosen": -319.6920166015625, + "logps/rejected": -235.872314453125, + "loss": 0.2126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8889787197113037, + "rewards/margins": 2.1936886310577393, + "rewards/rejected": -3.082667350769043, + "step": 4527 + }, + { + "epoch": 0.52, + "learning_rate": 1.4559288306215615e-07, + "logits/chosen": -2.127626895904541, + "logits/rejected": -2.4312028884887695, + "logps/chosen": -241.82115173339844, + "logps/rejected": -229.3202362060547, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.219681054353714, + "rewards/margins": 3.644676685333252, + "rewards/rejected": -3.8643579483032227, + "step": 4528 + }, + { + "epoch": 0.52, + "learning_rate": 1.4555776659253188e-07, + "logits/chosen": -2.3684096336364746, + "logits/rejected": -2.217658519744873, + "logps/chosen": -219.30990600585938, + "logps/rejected": -369.405517578125, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3347826600074768, + "rewards/margins": 3.9390292167663574, + "rewards/rejected": -4.2738118171691895, + "step": 4529 + }, + { + "epoch": 0.52, + "learning_rate": 1.4552265012290763e-07, + "logits/chosen": -2.6446051597595215, + "logits/rejected": -2.64642333984375, + "logps/chosen": -179.2228546142578, + "logps/rejected": -226.17523193359375, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9037794470787048, + "rewards/margins": 1.8199188709259033, + "rewards/rejected": -2.723698139190674, + "step": 4530 + }, + { + "epoch": 0.52, + "learning_rate": 1.4548753365328338e-07, + "logits/chosen": -2.1045069694519043, + "logits/rejected": -1.622282862663269, + "logps/chosen": -176.22164916992188, + "logps/rejected": -341.1671447753906, + "loss": 0.3415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4811893701553345, + "rewards/margins": 2.698808193206787, + "rewards/rejected": -3.1799979209899902, + "step": 4531 + }, + { + "epoch": 0.52, + "learning_rate": 1.4545241718365914e-07, + "logits/chosen": -2.0769760608673096, + "logits/rejected": -2.1854190826416016, + "logps/chosen": -369.2169189453125, + "logps/rejected": -301.868408203125, + "loss": 0.5415, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33364319801330566, + "rewards/margins": 2.000231981277466, + "rewards/rejected": -2.3338751792907715, + "step": 4532 + }, + { + "epoch": 0.52, + "learning_rate": 1.4541730071403487e-07, + "logits/chosen": -1.5454044342041016, + "logits/rejected": -1.9492348432540894, + "logps/chosen": -394.3978271484375, + "logps/rejected": -295.6485595703125, + "loss": 0.3212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7711766958236694, + "rewards/margins": 1.5964583158493042, + "rewards/rejected": -2.3676350116729736, + "step": 4533 + }, + { + "epoch": 0.52, + "learning_rate": 1.4538218424441062e-07, + "logits/chosen": -2.0621755123138428, + "logits/rejected": -2.227390766143799, + "logps/chosen": -290.6309814453125, + "logps/rejected": -219.99954223632812, + "loss": 0.2569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8912074565887451, + "rewards/margins": 2.2343780994415283, + "rewards/rejected": -3.1255855560302734, + "step": 4534 + }, + { + "epoch": 0.52, + "learning_rate": 1.4534706777478637e-07, + "logits/chosen": -2.222928762435913, + "logits/rejected": -2.295349359512329, + "logps/chosen": -295.1422424316406, + "logps/rejected": -304.0709228515625, + "loss": 0.3015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27662888169288635, + "rewards/margins": 1.443116307258606, + "rewards/rejected": -1.719745397567749, + "step": 4535 + }, + { + "epoch": 0.52, + "learning_rate": 1.453119513051621e-07, + "logits/chosen": -2.4860026836395264, + "logits/rejected": -2.6006019115448, + "logps/chosen": -288.14581298828125, + "logps/rejected": -322.174560546875, + "loss": 0.6812, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4048688411712646, + "rewards/margins": 2.15006685256958, + "rewards/rejected": -3.5549354553222656, + "step": 4536 + }, + { + "epoch": 0.52, + "learning_rate": 1.4527683483553785e-07, + "logits/chosen": -2.6885905265808105, + "logits/rejected": -2.5981597900390625, + "logps/chosen": -257.739990234375, + "logps/rejected": -356.71014404296875, + "loss": 0.6933, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0495240688323975, + "rewards/margins": 1.4429810047149658, + "rewards/rejected": -3.492504835128784, + "step": 4537 + }, + { + "epoch": 0.52, + "learning_rate": 1.452417183659136e-07, + "logits/chosen": -2.512481689453125, + "logits/rejected": -2.6049656867980957, + "logps/chosen": -307.4999084472656, + "logps/rejected": -402.33441162109375, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4955354928970337, + "rewards/margins": 1.3185391426086426, + "rewards/rejected": -2.814074754714966, + "step": 4538 + }, + { + "epoch": 0.52, + "learning_rate": 1.4520660189628936e-07, + "logits/chosen": -2.4677252769470215, + "logits/rejected": -2.360508918762207, + "logps/chosen": -167.59201049804688, + "logps/rejected": -238.17230224609375, + "loss": 0.3155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7561871409416199, + "rewards/margins": 1.4938316345214844, + "rewards/rejected": -2.25001859664917, + "step": 4539 + }, + { + "epoch": 0.52, + "learning_rate": 1.451714854266651e-07, + "logits/chosen": -2.765207290649414, + "logits/rejected": -2.686491012573242, + "logps/chosen": -174.75668334960938, + "logps/rejected": -165.57965087890625, + "loss": 0.2371, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0505471229553223, + "rewards/margins": 1.713667869567871, + "rewards/rejected": -2.7642149925231934, + "step": 4540 + }, + { + "epoch": 0.52, + "learning_rate": 1.4513636895704084e-07, + "logits/chosen": -2.583272695541382, + "logits/rejected": -2.6060993671417236, + "logps/chosen": -365.6835021972656, + "logps/rejected": -232.13597106933594, + "loss": 0.3833, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.821218729019165, + "rewards/margins": 1.0679517984390259, + "rewards/rejected": -1.889170527458191, + "step": 4541 + }, + { + "epoch": 0.52, + "learning_rate": 1.451012524874166e-07, + "logits/chosen": -2.6172547340393066, + "logits/rejected": -2.251911163330078, + "logps/chosen": -388.6472473144531, + "logps/rejected": -236.81382751464844, + "loss": 0.9632, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.016212224960327, + "rewards/margins": 1.2029485702514648, + "rewards/rejected": -3.219160795211792, + "step": 4542 + }, + { + "epoch": 0.52, + "learning_rate": 1.4506613601779235e-07, + "logits/chosen": -1.7525293827056885, + "logits/rejected": -1.8957237005233765, + "logps/chosen": -529.2800903320312, + "logps/rejected": -375.1937255859375, + "loss": 0.3415, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5878798961639404, + "rewards/margins": 1.2141460180282593, + "rewards/rejected": -1.8020259141921997, + "step": 4543 + }, + { + "epoch": 0.52, + "learning_rate": 1.4503101954816808e-07, + "logits/chosen": -2.939790725708008, + "logits/rejected": -2.987595319747925, + "logps/chosen": -300.43841552734375, + "logps/rejected": -370.06048583984375, + "loss": 0.5156, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.481014609336853, + "rewards/margins": 2.4618425369262695, + "rewards/rejected": -3.942857027053833, + "step": 4544 + }, + { + "epoch": 0.52, + "learning_rate": 1.4499590307854383e-07, + "logits/chosen": -2.358790397644043, + "logits/rejected": -2.4423372745513916, + "logps/chosen": -308.5971984863281, + "logps/rejected": -232.84751892089844, + "loss": 0.4418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8780672550201416, + "rewards/margins": 2.0466504096984863, + "rewards/rejected": -2.924717664718628, + "step": 4545 + }, + { + "epoch": 0.52, + "learning_rate": 1.4496078660891959e-07, + "logits/chosen": -2.7529854774475098, + "logits/rejected": -2.6225531101226807, + "logps/chosen": -230.5198974609375, + "logps/rejected": -387.079833984375, + "loss": 0.4556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9963108897209167, + "rewards/margins": 1.4808505773544312, + "rewards/rejected": -2.477161407470703, + "step": 4546 + }, + { + "epoch": 0.52, + "learning_rate": 1.4492567013929531e-07, + "logits/chosen": -2.3123764991760254, + "logits/rejected": -2.2730441093444824, + "logps/chosen": -254.59088134765625, + "logps/rejected": -248.85446166992188, + "loss": 0.594, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1485434770584106, + "rewards/margins": 2.6672003269195557, + "rewards/rejected": -3.8157436847686768, + "step": 4547 + }, + { + "epoch": 0.52, + "learning_rate": 1.4489055366967107e-07, + "logits/chosen": -2.332878351211548, + "logits/rejected": -2.3772964477539062, + "logps/chosen": -272.9319763183594, + "logps/rejected": -234.49761962890625, + "loss": 0.2977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.924558162689209, + "rewards/margins": 2.340205669403076, + "rewards/rejected": -3.2647640705108643, + "step": 4548 + }, + { + "epoch": 0.52, + "learning_rate": 1.4485543720004682e-07, + "logits/chosen": -2.6078896522521973, + "logits/rejected": -2.3420515060424805, + "logps/chosen": -302.6552734375, + "logps/rejected": -392.7342529296875, + "loss": 0.4345, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6965674161911011, + "rewards/margins": 2.1302907466888428, + "rewards/rejected": -2.8268580436706543, + "step": 4549 + }, + { + "epoch": 0.52, + "learning_rate": 1.4482032073042258e-07, + "logits/chosen": -2.4462320804595947, + "logits/rejected": -2.327145576477051, + "logps/chosen": -192.89620971679688, + "logps/rejected": -277.7629699707031, + "loss": 0.2659, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6639328598976135, + "rewards/margins": 1.7588845491409302, + "rewards/rejected": -2.4228174686431885, + "step": 4550 + }, + { + "epoch": 0.52, + "learning_rate": 1.447852042607983e-07, + "logits/chosen": -2.4056010246276855, + "logits/rejected": -2.369642734527588, + "logps/chosen": -214.6212921142578, + "logps/rejected": -263.7915954589844, + "loss": 0.2421, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0544322729110718, + "rewards/margins": 4.112396717071533, + "rewards/rejected": -5.1668291091918945, + "step": 4551 + }, + { + "epoch": 0.52, + "learning_rate": 1.4475008779117406e-07, + "logits/chosen": -2.4304733276367188, + "logits/rejected": -2.3149189949035645, + "logps/chosen": -107.45736694335938, + "logps/rejected": -258.86865234375, + "loss": 0.4423, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.530328094959259, + "rewards/margins": 2.039463520050049, + "rewards/rejected": -2.569791555404663, + "step": 4552 + }, + { + "epoch": 0.52, + "learning_rate": 1.4471497132154978e-07, + "logits/chosen": -2.330380916595459, + "logits/rejected": -2.3565564155578613, + "logps/chosen": -258.62603759765625, + "logps/rejected": -206.23785400390625, + "loss": 0.3513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8629001975059509, + "rewards/margins": 1.7094569206237793, + "rewards/rejected": -2.572356939315796, + "step": 4553 + }, + { + "epoch": 0.52, + "learning_rate": 1.4467985485192556e-07, + "logits/chosen": -2.76668381690979, + "logits/rejected": -2.582077741622925, + "logps/chosen": -206.60269165039062, + "logps/rejected": -293.556640625, + "loss": 0.591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9687018394470215, + "rewards/margins": 1.5951282978057861, + "rewards/rejected": -2.5638303756713867, + "step": 4554 + }, + { + "epoch": 0.53, + "learning_rate": 1.446447383823013e-07, + "logits/chosen": -2.311917304992676, + "logits/rejected": -2.2380197048187256, + "logps/chosen": -247.46334838867188, + "logps/rejected": -229.82000732421875, + "loss": 0.6592, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.719559907913208, + "rewards/margins": 0.3724597096443176, + "rewards/rejected": -2.092019557952881, + "step": 4555 + }, + { + "epoch": 0.53, + "learning_rate": 1.4460962191267705e-07, + "logits/chosen": -2.817744731903076, + "logits/rejected": -2.5963943004608154, + "logps/chosen": -228.9605712890625, + "logps/rejected": -227.8700714111328, + "loss": 0.2296, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.969566285610199, + "rewards/margins": 2.9015655517578125, + "rewards/rejected": -3.8711318969726562, + "step": 4556 + }, + { + "epoch": 0.53, + "learning_rate": 1.4457450544305277e-07, + "logits/chosen": -1.9933815002441406, + "logits/rejected": -2.054795742034912, + "logps/chosen": -239.19744873046875, + "logps/rejected": -262.40087890625, + "loss": 0.271, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7485382556915283, + "rewards/margins": 2.6070187091827393, + "rewards/rejected": -3.3555567264556885, + "step": 4557 + }, + { + "epoch": 0.53, + "learning_rate": 1.4453938897342853e-07, + "logits/chosen": -2.1606037616729736, + "logits/rejected": -2.287993907928467, + "logps/chosen": -256.58917236328125, + "logps/rejected": -260.804931640625, + "loss": 0.153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12219977378845215, + "rewards/margins": 2.9028663635253906, + "rewards/rejected": -3.025066375732422, + "step": 4558 + }, + { + "epoch": 0.53, + "learning_rate": 1.4450427250380428e-07, + "logits/chosen": -2.6295571327209473, + "logits/rejected": -2.668783187866211, + "logps/chosen": -346.01336669921875, + "logps/rejected": -301.2611083984375, + "loss": 0.2045, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11264856159687042, + "rewards/margins": 2.9467766284942627, + "rewards/rejected": -2.834127902984619, + "step": 4559 + }, + { + "epoch": 0.53, + "learning_rate": 1.4446915603418003e-07, + "logits/chosen": -2.4343323707580566, + "logits/rejected": -2.1214981079101562, + "logps/chosen": -329.46087646484375, + "logps/rejected": -367.57403564453125, + "loss": 0.534, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5207464694976807, + "rewards/margins": 2.5660512447357178, + "rewards/rejected": -4.086797714233398, + "step": 4560 + }, + { + "epoch": 0.53, + "learning_rate": 1.4443403956455576e-07, + "logits/chosen": -2.538548469543457, + "logits/rejected": -2.535491466522217, + "logps/chosen": -235.93182373046875, + "logps/rejected": -274.64892578125, + "loss": 0.292, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1219230890274048, + "rewards/margins": 2.1142008304595947, + "rewards/rejected": -3.23612380027771, + "step": 4561 + }, + { + "epoch": 0.53, + "learning_rate": 1.4439892309493152e-07, + "logits/chosen": -2.053621530532837, + "logits/rejected": -2.0241589546203613, + "logps/chosen": -282.7286376953125, + "logps/rejected": -303.6070556640625, + "loss": 0.1599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1143946647644043, + "rewards/margins": 2.6780097484588623, + "rewards/rejected": -2.7924044132232666, + "step": 4562 + }, + { + "epoch": 0.53, + "learning_rate": 1.4436380662530727e-07, + "logits/chosen": -1.9307011365890503, + "logits/rejected": -1.8964018821716309, + "logps/chosen": -207.48825073242188, + "logps/rejected": -197.64443969726562, + "loss": 0.3459, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.039193868637085, + "rewards/margins": 2.160154342651367, + "rewards/rejected": -3.199348211288452, + "step": 4563 + }, + { + "epoch": 0.53, + "learning_rate": 1.44328690155683e-07, + "logits/chosen": -2.8740015029907227, + "logits/rejected": -2.9059975147247314, + "logps/chosen": -252.6270751953125, + "logps/rejected": -356.424072265625, + "loss": 0.2968, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0368748903274536, + "rewards/margins": 1.9070649147033691, + "rewards/rejected": -2.9439399242401123, + "step": 4564 + }, + { + "epoch": 0.53, + "learning_rate": 1.4429357368605875e-07, + "logits/chosen": -2.590886116027832, + "logits/rejected": -2.582327365875244, + "logps/chosen": -160.74832153320312, + "logps/rejected": -259.5081481933594, + "loss": 0.2304, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0359907150268555, + "rewards/margins": 3.2744603157043457, + "rewards/rejected": -4.310451030731201, + "step": 4565 + }, + { + "epoch": 0.53, + "learning_rate": 1.442584572164345e-07, + "logits/chosen": -2.5566468238830566, + "logits/rejected": -2.4221441745758057, + "logps/chosen": -194.26902770996094, + "logps/rejected": -342.238037109375, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6830250024795532, + "rewards/margins": 3.4026575088500977, + "rewards/rejected": -4.0856828689575195, + "step": 4566 + }, + { + "epoch": 0.53, + "learning_rate": 1.4422334074681026e-07, + "logits/chosen": -2.5517823696136475, + "logits/rejected": -2.4701876640319824, + "logps/chosen": -192.43966674804688, + "logps/rejected": -275.24591064453125, + "loss": 0.3233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5579808950424194, + "rewards/margins": 2.3789451122283936, + "rewards/rejected": -2.9369256496429443, + "step": 4567 + }, + { + "epoch": 0.53, + "learning_rate": 1.4418822427718599e-07, + "logits/chosen": -2.2482638359069824, + "logits/rejected": -2.322329044342041, + "logps/chosen": -355.6593322753906, + "logps/rejected": -280.4884338378906, + "loss": 0.3022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7525438070297241, + "rewards/margins": 1.226581335067749, + "rewards/rejected": -1.979124903678894, + "step": 4568 + }, + { + "epoch": 0.53, + "learning_rate": 1.4415310780756174e-07, + "logits/chosen": -2.8085410594940186, + "logits/rejected": -2.6485037803649902, + "logps/chosen": -222.356201171875, + "logps/rejected": -235.42703247070312, + "loss": 0.1564, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6165108680725098, + "rewards/margins": 2.331669330596924, + "rewards/rejected": -3.9481801986694336, + "step": 4569 + }, + { + "epoch": 0.53, + "learning_rate": 1.4411799133793747e-07, + "logits/chosen": -2.4353675842285156, + "logits/rejected": -2.7428290843963623, + "logps/chosen": -467.74310302734375, + "logps/rejected": -302.21331787109375, + "loss": 0.1675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21997879445552826, + "rewards/margins": 3.408745050430298, + "rewards/rejected": -3.1887660026550293, + "step": 4570 + }, + { + "epoch": 0.53, + "learning_rate": 1.4408287486831325e-07, + "logits/chosen": -2.330303907394409, + "logits/rejected": -2.2829604148864746, + "logps/chosen": -164.86007690429688, + "logps/rejected": -297.74224853515625, + "loss": 0.1936, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10114577412605286, + "rewards/margins": 3.190108060836792, + "rewards/rejected": -3.2912540435791016, + "step": 4571 + }, + { + "epoch": 0.53, + "learning_rate": 1.4404775839868897e-07, + "logits/chosen": -2.473066806793213, + "logits/rejected": -2.4981024265289307, + "logps/chosen": -131.32992553710938, + "logps/rejected": -174.81285095214844, + "loss": 0.5296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4619072377681732, + "rewards/margins": 1.6689090728759766, + "rewards/rejected": -2.1308164596557617, + "step": 4572 + }, + { + "epoch": 0.53, + "learning_rate": 1.4401264192906473e-07, + "logits/chosen": -2.722196578979492, + "logits/rejected": -2.619603157043457, + "logps/chosen": -90.546875, + "logps/rejected": -281.33294677734375, + "loss": 0.3805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7404730319976807, + "rewards/margins": 2.866098403930664, + "rewards/rejected": -3.6065714359283447, + "step": 4573 + }, + { + "epoch": 0.53, + "learning_rate": 1.4397752545944046e-07, + "logits/chosen": -1.8391966819763184, + "logits/rejected": -1.680837869644165, + "logps/chosen": -262.1964111328125, + "logps/rejected": -354.0543518066406, + "loss": 0.0875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23432758450508118, + "rewards/margins": 3.7246642112731934, + "rewards/rejected": -3.958991765975952, + "step": 4574 + }, + { + "epoch": 0.53, + "learning_rate": 1.439424089898162e-07, + "logits/chosen": -2.477588176727295, + "logits/rejected": -2.5735981464385986, + "logps/chosen": -260.5495300292969, + "logps/rejected": -305.5096435546875, + "loss": 0.3368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8073217868804932, + "rewards/margins": 3.774658441543579, + "rewards/rejected": -4.581980228424072, + "step": 4575 + }, + { + "epoch": 0.53, + "learning_rate": 1.4390729252019196e-07, + "logits/chosen": -3.052760362625122, + "logits/rejected": -2.965384006500244, + "logps/chosen": -277.59149169921875, + "logps/rejected": -276.5054931640625, + "loss": 0.2636, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0439876317977905, + "rewards/margins": 2.411623954772949, + "rewards/rejected": -3.4556117057800293, + "step": 4576 + }, + { + "epoch": 0.53, + "learning_rate": 1.4387217605056772e-07, + "logits/chosen": -2.2190656661987305, + "logits/rejected": -1.9061282873153687, + "logps/chosen": -194.31576538085938, + "logps/rejected": -330.54156494140625, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7349058985710144, + "rewards/margins": 2.2551987171173096, + "rewards/rejected": -2.990104913711548, + "step": 4577 + }, + { + "epoch": 0.53, + "learning_rate": 1.4383705958094344e-07, + "logits/chosen": -2.1246793270111084, + "logits/rejected": -2.5736420154571533, + "logps/chosen": -345.0081787109375, + "logps/rejected": -252.4187774658203, + "loss": 0.2985, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04843759536743164, + "rewards/margins": 1.6714986562728882, + "rewards/rejected": -1.623061180114746, + "step": 4578 + }, + { + "epoch": 0.53, + "learning_rate": 1.438019431113192e-07, + "logits/chosen": -2.2705912590026855, + "logits/rejected": -2.2259349822998047, + "logps/chosen": -161.60995483398438, + "logps/rejected": -171.20603942871094, + "loss": 0.5753, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7988162040710449, + "rewards/margins": 1.6207988262176514, + "rewards/rejected": -2.419614791870117, + "step": 4579 + }, + { + "epoch": 0.53, + "learning_rate": 1.4376682664169495e-07, + "logits/chosen": -2.1002919673919678, + "logits/rejected": -2.0759167671203613, + "logps/chosen": -250.43453979492188, + "logps/rejected": -324.5311279296875, + "loss": 0.4296, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0993223190307617, + "rewards/margins": 2.7102243900299072, + "rewards/rejected": -3.809546947479248, + "step": 4580 + }, + { + "epoch": 0.53, + "learning_rate": 1.4373171017207068e-07, + "logits/chosen": -2.0387001037597656, + "logits/rejected": -2.005847692489624, + "logps/chosen": -193.26235961914062, + "logps/rejected": -259.39447021484375, + "loss": 0.4108, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5177592635154724, + "rewards/margins": 2.402437686920166, + "rewards/rejected": -2.920197010040283, + "step": 4581 + }, + { + "epoch": 0.53, + "learning_rate": 1.4369659370244643e-07, + "logits/chosen": -2.833115577697754, + "logits/rejected": -2.7133641242980957, + "logps/chosen": -233.3578338623047, + "logps/rejected": -196.70147705078125, + "loss": 0.2925, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0522327423095703, + "rewards/margins": 1.980318307876587, + "rewards/rejected": -3.0325512886047363, + "step": 4582 + }, + { + "epoch": 0.53, + "learning_rate": 1.436614772328222e-07, + "logits/chosen": -2.515141725540161, + "logits/rejected": -2.7668771743774414, + "logps/chosen": -158.49575805664062, + "logps/rejected": -120.59972381591797, + "loss": 0.2946, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12461253255605698, + "rewards/margins": 2.282203435897827, + "rewards/rejected": -2.1575911045074463, + "step": 4583 + }, + { + "epoch": 0.53, + "learning_rate": 1.4362636076319794e-07, + "logits/chosen": -1.8815624713897705, + "logits/rejected": -2.080681800842285, + "logps/chosen": -542.9639282226562, + "logps/rejected": -354.3415832519531, + "loss": 0.2414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024025727063417435, + "rewards/margins": 1.6544365882873535, + "rewards/rejected": -1.6784625053405762, + "step": 4584 + }, + { + "epoch": 0.53, + "learning_rate": 1.4359124429357367e-07, + "logits/chosen": -1.947667121887207, + "logits/rejected": -2.2242491245269775, + "logps/chosen": -367.0390625, + "logps/rejected": -341.833984375, + "loss": 0.2846, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17941857874393463, + "rewards/margins": 2.5293679237365723, + "rewards/rejected": -2.349949359893799, + "step": 4585 + }, + { + "epoch": 0.53, + "learning_rate": 1.4355612782394942e-07, + "logits/chosen": -2.609102964401245, + "logits/rejected": -2.6400885581970215, + "logps/chosen": -180.1774444580078, + "logps/rejected": -256.92730712890625, + "loss": 0.5548, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.250630497932434, + "rewards/margins": 2.0996170043945312, + "rewards/rejected": -3.350247621536255, + "step": 4586 + }, + { + "epoch": 0.53, + "learning_rate": 1.4352101135432518e-07, + "logits/chosen": -1.8252507448196411, + "logits/rejected": -1.896169900894165, + "logps/chosen": -257.9202575683594, + "logps/rejected": -289.1318359375, + "loss": 0.4405, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.01981782913208, + "rewards/margins": 1.4105861186981201, + "rewards/rejected": -2.4304039478302, + "step": 4587 + }, + { + "epoch": 0.53, + "learning_rate": 1.4348589488470093e-07, + "logits/chosen": -2.7225940227508545, + "logits/rejected": -2.698352813720703, + "logps/chosen": -179.4780731201172, + "logps/rejected": -240.74317932128906, + "loss": 0.2928, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.585757315158844, + "rewards/margins": 1.8937067985534668, + "rewards/rejected": -2.479464054107666, + "step": 4588 + }, + { + "epoch": 0.53, + "learning_rate": 1.4345077841507666e-07, + "logits/chosen": -2.3621914386749268, + "logits/rejected": -2.1924259662628174, + "logps/chosen": -306.8312072753906, + "logps/rejected": -447.6097106933594, + "loss": 0.3414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7346904277801514, + "rewards/margins": 1.078365683555603, + "rewards/rejected": -1.8130561113357544, + "step": 4589 + }, + { + "epoch": 0.53, + "learning_rate": 1.434156619454524e-07, + "logits/chosen": -2.2063379287719727, + "logits/rejected": -1.9732872247695923, + "logps/chosen": -314.0137023925781, + "logps/rejected": -514.1206665039062, + "loss": 0.5433, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0759308338165283, + "rewards/margins": 3.740086078643799, + "rewards/rejected": -4.816016674041748, + "step": 4590 + }, + { + "epoch": 0.53, + "learning_rate": 1.4338054547582817e-07, + "logits/chosen": -1.8753286600112915, + "logits/rejected": -2.1451776027679443, + "logps/chosen": -314.037109375, + "logps/rejected": -254.39154052734375, + "loss": 0.4814, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0634267330169678, + "rewards/margins": 0.9384667277336121, + "rewards/rejected": -2.0018935203552246, + "step": 4591 + }, + { + "epoch": 0.53, + "learning_rate": 1.433454290062039e-07, + "logits/chosen": -1.8339343070983887, + "logits/rejected": -1.9297635555267334, + "logps/chosen": -243.42889404296875, + "logps/rejected": -444.48388671875, + "loss": 0.1174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6558206677436829, + "rewards/margins": 2.7840561866760254, + "rewards/rejected": -3.4398767948150635, + "step": 4592 + }, + { + "epoch": 0.53, + "learning_rate": 1.4331031253657965e-07, + "logits/chosen": -2.461843490600586, + "logits/rejected": -2.3123064041137695, + "logps/chosen": -214.88629150390625, + "logps/rejected": -248.7891845703125, + "loss": 0.5518, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8227477073669434, + "rewards/margins": 2.7412822246551514, + "rewards/rejected": -4.564030170440674, + "step": 4593 + }, + { + "epoch": 0.53, + "learning_rate": 1.432751960669554e-07, + "logits/chosen": -2.2541255950927734, + "logits/rejected": -2.267212152481079, + "logps/chosen": -380.0458679199219, + "logps/rejected": -424.092041015625, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19241324067115784, + "rewards/margins": 2.1735470294952393, + "rewards/rejected": -2.3659603595733643, + "step": 4594 + }, + { + "epoch": 0.53, + "learning_rate": 1.4324007959733115e-07, + "logits/chosen": -2.5179429054260254, + "logits/rejected": -2.605703830718994, + "logps/chosen": -196.30201721191406, + "logps/rejected": -175.32077026367188, + "loss": 0.5225, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7461369633674622, + "rewards/margins": 1.3271019458770752, + "rewards/rejected": -2.0732388496398926, + "step": 4595 + }, + { + "epoch": 0.53, + "learning_rate": 1.4320496312770688e-07, + "logits/chosen": -1.8223865032196045, + "logits/rejected": -2.055166721343994, + "logps/chosen": -479.3006591796875, + "logps/rejected": -351.36676025390625, + "loss": 0.4069, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1614024639129639, + "rewards/margins": 1.6959375143051147, + "rewards/rejected": -2.857339859008789, + "step": 4596 + }, + { + "epoch": 0.53, + "learning_rate": 1.4316984665808264e-07, + "logits/chosen": -2.8809776306152344, + "logits/rejected": -2.940140724182129, + "logps/chosen": -256.28131103515625, + "logps/rejected": -219.46182250976562, + "loss": 0.346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5735272169113159, + "rewards/margins": 2.0049004554748535, + "rewards/rejected": -2.578427791595459, + "step": 4597 + }, + { + "epoch": 0.53, + "learning_rate": 1.4313473018845836e-07, + "logits/chosen": -1.686134696006775, + "logits/rejected": -1.5523524284362793, + "logps/chosen": -397.56781005859375, + "logps/rejected": -412.857177734375, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6580328941345215, + "rewards/margins": 4.231986999511719, + "rewards/rejected": -4.89001989364624, + "step": 4598 + }, + { + "epoch": 0.53, + "learning_rate": 1.4309961371883414e-07, + "logits/chosen": -2.3337056636810303, + "logits/rejected": -2.574530601501465, + "logps/chosen": -239.39735412597656, + "logps/rejected": -185.36993408203125, + "loss": 0.1249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6487009525299072, + "rewards/margins": 2.0897998809814453, + "rewards/rejected": -2.7385010719299316, + "step": 4599 + }, + { + "epoch": 0.53, + "learning_rate": 1.4306449724920987e-07, + "logits/chosen": -2.36238169670105, + "logits/rejected": -2.5112509727478027, + "logps/chosen": -118.36769104003906, + "logps/rejected": -145.601318359375, + "loss": 0.2783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46497446298599243, + "rewards/margins": 2.1612164974212646, + "rewards/rejected": -2.6261911392211914, + "step": 4600 + }, + { + "epoch": 0.53, + "learning_rate": 1.4302938077958562e-07, + "logits/chosen": -2.0387558937072754, + "logits/rejected": -2.029269218444824, + "logps/chosen": -235.95689392089844, + "logps/rejected": -140.71426391601562, + "loss": 0.6743, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6749277710914612, + "rewards/margins": 0.20364168286323547, + "rewards/rejected": -0.878569483757019, + "step": 4601 + }, + { + "epoch": 0.53, + "learning_rate": 1.4299426430996135e-07, + "logits/chosen": -2.348583936691284, + "logits/rejected": -2.5725841522216797, + "logps/chosen": -292.9964599609375, + "logps/rejected": -330.40191650390625, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9776831269264221, + "rewards/margins": 2.423729658126831, + "rewards/rejected": -3.4014129638671875, + "step": 4602 + }, + { + "epoch": 0.53, + "learning_rate": 1.429591478403371e-07, + "logits/chosen": -2.8026206493377686, + "logits/rejected": -2.819138765335083, + "logps/chosen": -107.45406341552734, + "logps/rejected": -125.17955017089844, + "loss": 0.3596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6459028720855713, + "rewards/margins": 1.6148343086242676, + "rewards/rejected": -2.260737180709839, + "step": 4603 + }, + { + "epoch": 0.53, + "learning_rate": 1.4292403137071286e-07, + "logits/chosen": -2.0270028114318848, + "logits/rejected": -2.2796638011932373, + "logps/chosen": -363.6365051269531, + "logps/rejected": -276.23834228515625, + "loss": 0.3945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2678281366825104, + "rewards/margins": 1.7474799156188965, + "rewards/rejected": -2.015307903289795, + "step": 4604 + }, + { + "epoch": 0.53, + "learning_rate": 1.4288891490108861e-07, + "logits/chosen": -2.5706114768981934, + "logits/rejected": -2.614495038986206, + "logps/chosen": -345.5575256347656, + "logps/rejected": -228.66566467285156, + "loss": 0.3159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6123668551445007, + "rewards/margins": 1.9546180963516235, + "rewards/rejected": -2.5669851303100586, + "step": 4605 + }, + { + "epoch": 0.53, + "learning_rate": 1.4285379843146434e-07, + "logits/chosen": -1.6005020141601562, + "logits/rejected": -1.4504098892211914, + "logps/chosen": -231.88983154296875, + "logps/rejected": -300.3085632324219, + "loss": 0.3745, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6458588242530823, + "rewards/margins": 1.890517234802246, + "rewards/rejected": -2.5363762378692627, + "step": 4606 + }, + { + "epoch": 0.53, + "learning_rate": 1.428186819618401e-07, + "logits/chosen": -2.2568509578704834, + "logits/rejected": -2.322401523590088, + "logps/chosen": -153.36688232421875, + "logps/rejected": -210.109375, + "loss": 0.9572, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9893398284912109, + "rewards/margins": 1.280990481376648, + "rewards/rejected": -2.2703301906585693, + "step": 4607 + }, + { + "epoch": 0.53, + "learning_rate": 1.4278356549221585e-07, + "logits/chosen": -2.843266248703003, + "logits/rejected": -2.6353728771209717, + "logps/chosen": -191.77850341796875, + "logps/rejected": -270.4466857910156, + "loss": 0.1692, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0822160243988037, + "rewards/margins": 3.0511484146118164, + "rewards/rejected": -4.133364200592041, + "step": 4608 + }, + { + "epoch": 0.53, + "learning_rate": 1.4274844902259158e-07, + "logits/chosen": -2.736754894256592, + "logits/rejected": -2.6859121322631836, + "logps/chosen": -315.0912780761719, + "logps/rejected": -295.4405212402344, + "loss": 0.1815, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13823935389518738, + "rewards/margins": 2.872288942337036, + "rewards/rejected": -2.7340493202209473, + "step": 4609 + }, + { + "epoch": 0.53, + "learning_rate": 1.4271333255296733e-07, + "logits/chosen": -2.486114978790283, + "logits/rejected": -2.377687931060791, + "logps/chosen": -151.2863311767578, + "logps/rejected": -219.3614501953125, + "loss": 0.4659, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2950562238693237, + "rewards/margins": 1.1981405019760132, + "rewards/rejected": -2.493196725845337, + "step": 4610 + }, + { + "epoch": 0.53, + "learning_rate": 1.4267821608334308e-07, + "logits/chosen": -1.6982041597366333, + "logits/rejected": -1.7389976978302002, + "logps/chosen": -276.00946044921875, + "logps/rejected": -226.65670776367188, + "loss": 1.0931, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1796220541000366, + "rewards/margins": 1.430648684501648, + "rewards/rejected": -2.6102709770202637, + "step": 4611 + }, + { + "epoch": 0.53, + "learning_rate": 1.4264309961371884e-07, + "logits/chosen": -2.313991069793701, + "logits/rejected": -2.1858901977539062, + "logps/chosen": -345.53436279296875, + "logps/rejected": -279.45361328125, + "loss": 0.7249, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2607606649398804, + "rewards/margins": 0.48193278908729553, + "rewards/rejected": -1.742693543434143, + "step": 4612 + }, + { + "epoch": 0.53, + "learning_rate": 1.4260798314409457e-07, + "logits/chosen": -2.632594108581543, + "logits/rejected": -2.8208162784576416, + "logps/chosen": -304.6552429199219, + "logps/rejected": -304.58282470703125, + "loss": 1.0015, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.075716972351074, + "rewards/margins": 1.6552672386169434, + "rewards/rejected": -3.7309842109680176, + "step": 4613 + }, + { + "epoch": 0.53, + "learning_rate": 1.4257286667447032e-07, + "logits/chosen": -2.332561731338501, + "logits/rejected": -2.3226685523986816, + "logps/chosen": -145.31723022460938, + "logps/rejected": -217.63687133789062, + "loss": 0.2754, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8028401136398315, + "rewards/margins": 1.8610163927078247, + "rewards/rejected": -2.6638565063476562, + "step": 4614 + }, + { + "epoch": 0.53, + "learning_rate": 1.4253775020484607e-07, + "logits/chosen": -2.151327610015869, + "logits/rejected": -2.416529655456543, + "logps/chosen": -320.60675048828125, + "logps/rejected": -224.8748779296875, + "loss": 0.4328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8015488386154175, + "rewards/margins": 1.0819454193115234, + "rewards/rejected": -1.883494257926941, + "step": 4615 + }, + { + "epoch": 0.53, + "learning_rate": 1.4250263373522183e-07, + "logits/chosen": -2.3003361225128174, + "logits/rejected": -2.2606329917907715, + "logps/chosen": -293.77777099609375, + "logps/rejected": -218.0069580078125, + "loss": 0.7025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9736653566360474, + "rewards/margins": 1.4816677570343018, + "rewards/rejected": -2.4553329944610596, + "step": 4616 + }, + { + "epoch": 0.53, + "learning_rate": 1.4246751726559755e-07, + "logits/chosen": -2.231025457382202, + "logits/rejected": -2.1849915981292725, + "logps/chosen": -104.97953033447266, + "logps/rejected": -204.4333038330078, + "loss": 0.4758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.661916971206665, + "rewards/margins": 0.9486449956893921, + "rewards/rejected": -1.6105618476867676, + "step": 4617 + }, + { + "epoch": 0.53, + "learning_rate": 1.424324007959733e-07, + "logits/chosen": -2.2166247367858887, + "logits/rejected": -2.5195136070251465, + "logps/chosen": -292.54107666015625, + "logps/rejected": -176.4634246826172, + "loss": 0.1449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2787834703922272, + "rewards/margins": 3.221130847930908, + "rewards/rejected": -3.4999141693115234, + "step": 4618 + }, + { + "epoch": 0.53, + "learning_rate": 1.4239728432634904e-07, + "logits/chosen": -2.6266207695007324, + "logits/rejected": -2.3387482166290283, + "logps/chosen": -236.53729248046875, + "logps/rejected": -271.96343994140625, + "loss": 0.6255, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2348763942718506, + "rewards/margins": 1.6875708103179932, + "rewards/rejected": -2.9224472045898438, + "step": 4619 + }, + { + "epoch": 0.53, + "learning_rate": 1.423621678567248e-07, + "logits/chosen": -2.3827428817749023, + "logits/rejected": -2.3754889965057373, + "logps/chosen": -400.2405090332031, + "logps/rejected": -379.7411193847656, + "loss": 0.154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14648090302944183, + "rewards/margins": 2.921746253967285, + "rewards/rejected": -3.0682272911071777, + "step": 4620 + }, + { + "epoch": 0.53, + "learning_rate": 1.4232705138710054e-07, + "logits/chosen": -1.919366717338562, + "logits/rejected": -2.182487726211548, + "logps/chosen": -225.93316650390625, + "logps/rejected": -199.05728149414062, + "loss": 0.2214, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11357923597097397, + "rewards/margins": 2.90191650390625, + "rewards/rejected": -3.015495777130127, + "step": 4621 + }, + { + "epoch": 0.53, + "learning_rate": 1.422919349174763e-07, + "logits/chosen": -2.4231672286987305, + "logits/rejected": -2.355377674102783, + "logps/chosen": -235.91305541992188, + "logps/rejected": -256.6956787109375, + "loss": 0.4492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0482604503631592, + "rewards/margins": 1.4282832145690918, + "rewards/rejected": -2.476543426513672, + "step": 4622 + }, + { + "epoch": 0.53, + "learning_rate": 1.4225681844785202e-07, + "logits/chosen": -2.6149520874023438, + "logits/rejected": -2.7423439025878906, + "logps/chosen": -338.7408142089844, + "logps/rejected": -205.2649688720703, + "loss": 0.3943, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36600902676582336, + "rewards/margins": 1.591437816619873, + "rewards/rejected": -1.9574466943740845, + "step": 4623 + }, + { + "epoch": 0.53, + "learning_rate": 1.4222170197822778e-07, + "logits/chosen": -2.616077184677124, + "logits/rejected": -2.5845401287078857, + "logps/chosen": -198.79412841796875, + "logps/rejected": -260.2273254394531, + "loss": 0.6366, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5236846208572388, + "rewards/margins": 2.355585813522339, + "rewards/rejected": -3.879270315170288, + "step": 4624 + }, + { + "epoch": 0.53, + "learning_rate": 1.4218658550860353e-07, + "logits/chosen": -2.340505599975586, + "logits/rejected": -2.2895798683166504, + "logps/chosen": -490.2544250488281, + "logps/rejected": -314.0755615234375, + "loss": 0.2396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6652348041534424, + "rewards/margins": 2.54158353805542, + "rewards/rejected": -3.2068185806274414, + "step": 4625 + }, + { + "epoch": 0.53, + "learning_rate": 1.4215146903897926e-07, + "logits/chosen": -2.206463575363159, + "logits/rejected": -2.340831995010376, + "logps/chosen": -199.95936584472656, + "logps/rejected": -242.68484497070312, + "loss": 0.3761, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2778679132461548, + "rewards/margins": 1.9334096908569336, + "rewards/rejected": -3.211277961730957, + "step": 4626 + }, + { + "epoch": 0.53, + "learning_rate": 1.42116352569355e-07, + "logits/chosen": -2.0949931144714355, + "logits/rejected": -1.9380743503570557, + "logps/chosen": -393.79583740234375, + "logps/rejected": -399.3138427734375, + "loss": 0.3143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2518292963504791, + "rewards/margins": 2.458630323410034, + "rewards/rejected": -2.7104597091674805, + "step": 4627 + }, + { + "epoch": 0.53, + "learning_rate": 1.4208123609973077e-07, + "logits/chosen": -2.1250131130218506, + "logits/rejected": -1.971647024154663, + "logps/chosen": -305.37060546875, + "logps/rejected": -273.12451171875, + "loss": 0.3699, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7786741852760315, + "rewards/margins": 2.7615389823913574, + "rewards/rejected": -3.5402133464813232, + "step": 4628 + }, + { + "epoch": 0.53, + "learning_rate": 1.4204611963010652e-07, + "logits/chosen": -2.0247068405151367, + "logits/rejected": -2.133150815963745, + "logps/chosen": -325.9356994628906, + "logps/rejected": -290.1265869140625, + "loss": 0.5813, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.15826416015625, + "rewards/margins": 0.9519795775413513, + "rewards/rejected": -2.110243558883667, + "step": 4629 + }, + { + "epoch": 0.53, + "learning_rate": 1.4201100316048225e-07, + "logits/chosen": -2.2412290573120117, + "logits/rejected": -2.087247371673584, + "logps/chosen": -114.68527221679688, + "logps/rejected": -200.5529022216797, + "loss": 0.7607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6288119554519653, + "rewards/margins": 0.5194925665855408, + "rewards/rejected": -1.1483044624328613, + "step": 4630 + }, + { + "epoch": 0.53, + "learning_rate": 1.41975886690858e-07, + "logits/chosen": -2.0679101943969727, + "logits/rejected": -2.2169647216796875, + "logps/chosen": -150.6313018798828, + "logps/rejected": -254.6761474609375, + "loss": 0.4614, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4705958068370819, + "rewards/margins": 1.0494763851165771, + "rewards/rejected": -1.5200722217559814, + "step": 4631 + }, + { + "epoch": 0.53, + "learning_rate": 1.4194077022123376e-07, + "logits/chosen": -2.3067572116851807, + "logits/rejected": -2.2572741508483887, + "logps/chosen": -268.1955261230469, + "logps/rejected": -292.45245361328125, + "loss": 0.3869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6702145934104919, + "rewards/margins": 2.254425525665283, + "rewards/rejected": -2.924639940261841, + "step": 4632 + }, + { + "epoch": 0.53, + "learning_rate": 1.419056537516095e-07, + "logits/chosen": -2.3768210411071777, + "logits/rejected": -2.3827567100524902, + "logps/chosen": -304.66973876953125, + "logps/rejected": -325.31396484375, + "loss": 0.8305, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4121992588043213, + "rewards/margins": 0.2532758116722107, + "rewards/rejected": -1.6654748916625977, + "step": 4633 + }, + { + "epoch": 0.53, + "learning_rate": 1.4187053728198524e-07, + "logits/chosen": -2.4744327068328857, + "logits/rejected": -2.562441825866699, + "logps/chosen": -380.1806640625, + "logps/rejected": -299.8268127441406, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19840645790100098, + "rewards/margins": 3.252570390701294, + "rewards/rejected": -3.450976848602295, + "step": 4634 + }, + { + "epoch": 0.53, + "learning_rate": 1.41835420812361e-07, + "logits/chosen": -2.781620502471924, + "logits/rejected": -2.6621415615081787, + "logps/chosen": -239.29605102539062, + "logps/rejected": -309.66851806640625, + "loss": 0.7819, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.547460913658142, + "rewards/margins": 1.5588335990905762, + "rewards/rejected": -3.106294631958008, + "step": 4635 + }, + { + "epoch": 0.53, + "learning_rate": 1.4180030434273674e-07, + "logits/chosen": -2.420527935028076, + "logits/rejected": -2.6254723072052, + "logps/chosen": -538.030517578125, + "logps/rejected": -497.3599853515625, + "loss": 0.3108, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7477834820747375, + "rewards/margins": 3.6738131046295166, + "rewards/rejected": -4.421597003936768, + "step": 4636 + }, + { + "epoch": 0.53, + "learning_rate": 1.4176518787311247e-07, + "logits/chosen": -2.023754358291626, + "logits/rejected": -2.2841925621032715, + "logps/chosen": -316.21826171875, + "logps/rejected": -297.3305358886719, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7116836309432983, + "rewards/margins": 2.188523292541504, + "rewards/rejected": -2.900207042694092, + "step": 4637 + }, + { + "epoch": 0.53, + "learning_rate": 1.4173007140348823e-07, + "logits/chosen": -2.6867549419403076, + "logits/rejected": -2.284679889678955, + "logps/chosen": -166.104736328125, + "logps/rejected": -283.725341796875, + "loss": 0.3143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8700273633003235, + "rewards/margins": 2.672178030014038, + "rewards/rejected": -3.542205572128296, + "step": 4638 + }, + { + "epoch": 0.53, + "learning_rate": 1.4169495493386398e-07, + "logits/chosen": -1.9029207229614258, + "logits/rejected": -1.7540408372879028, + "logps/chosen": -459.02593994140625, + "logps/rejected": -362.7139892578125, + "loss": 1.5646, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7291927337646484, + "rewards/margins": 0.6431882381439209, + "rewards/rejected": -2.3723812103271484, + "step": 4639 + }, + { + "epoch": 0.53, + "learning_rate": 1.4165983846423973e-07, + "logits/chosen": -1.998497724533081, + "logits/rejected": -2.02341365814209, + "logps/chosen": -229.5802764892578, + "logps/rejected": -277.61663818359375, + "loss": 0.2639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8975295424461365, + "rewards/margins": 2.549210548400879, + "rewards/rejected": -3.446739912033081, + "step": 4640 + }, + { + "epoch": 0.54, + "learning_rate": 1.4162472199461546e-07, + "logits/chosen": -2.0238773822784424, + "logits/rejected": -2.333080291748047, + "logps/chosen": -210.41836547851562, + "logps/rejected": -204.94384765625, + "loss": 0.7445, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4015752077102661, + "rewards/margins": 1.8160459995269775, + "rewards/rejected": -3.217621088027954, + "step": 4641 + }, + { + "epoch": 0.54, + "learning_rate": 1.4158960552499122e-07, + "logits/chosen": -2.258866310119629, + "logits/rejected": -2.280961513519287, + "logps/chosen": -167.58424377441406, + "logps/rejected": -200.31031799316406, + "loss": 0.7508, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2002747058868408, + "rewards/margins": 1.0884193181991577, + "rewards/rejected": -2.288693904876709, + "step": 4642 + }, + { + "epoch": 0.54, + "learning_rate": 1.4155448905536694e-07, + "logits/chosen": -2.183828115463257, + "logits/rejected": -2.5251893997192383, + "logps/chosen": -353.86376953125, + "logps/rejected": -233.88162231445312, + "loss": 0.4515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7240090370178223, + "rewards/margins": 1.6487457752227783, + "rewards/rejected": -2.3727550506591797, + "step": 4643 + }, + { + "epoch": 0.54, + "learning_rate": 1.4151937258574272e-07, + "logits/chosen": -2.1528563499450684, + "logits/rejected": -2.134943962097168, + "logps/chosen": -258.4873962402344, + "logps/rejected": -288.4265441894531, + "loss": 0.5472, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6411805152893066, + "rewards/margins": 0.9495141506195068, + "rewards/rejected": -1.5906946659088135, + "step": 4644 + }, + { + "epoch": 0.54, + "learning_rate": 1.4148425611611845e-07, + "logits/chosen": -2.051163673400879, + "logits/rejected": -1.983434796333313, + "logps/chosen": -269.3109130859375, + "logps/rejected": -252.8933563232422, + "loss": 0.701, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6390416026115417, + "rewards/margins": 1.0228296518325806, + "rewards/rejected": -1.6618711948394775, + "step": 4645 + }, + { + "epoch": 0.54, + "learning_rate": 1.414491396464942e-07, + "logits/chosen": -1.7055411338806152, + "logits/rejected": -1.8979241847991943, + "logps/chosen": -371.72900390625, + "logps/rejected": -290.3659362792969, + "loss": 0.6585, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4717903137207031, + "rewards/margins": 1.8804752826690674, + "rewards/rejected": -3.3522655963897705, + "step": 4646 + }, + { + "epoch": 0.54, + "learning_rate": 1.4141402317686993e-07, + "logits/chosen": -2.0892295837402344, + "logits/rejected": -2.3366692066192627, + "logps/chosen": -292.5573425292969, + "logps/rejected": -250.74351501464844, + "loss": 0.3297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5080727338790894, + "rewards/margins": 2.438805341720581, + "rewards/rejected": -2.9468774795532227, + "step": 4647 + }, + { + "epoch": 0.54, + "learning_rate": 1.413789067072457e-07, + "logits/chosen": -2.043790578842163, + "logits/rejected": -2.3886799812316895, + "logps/chosen": -384.8489685058594, + "logps/rejected": -255.05230712890625, + "loss": 1.0734, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0918359756469727, + "rewards/margins": 0.05817282199859619, + "rewards/rejected": -2.1500089168548584, + "step": 4648 + }, + { + "epoch": 0.54, + "learning_rate": 1.4134379023762144e-07, + "logits/chosen": -2.7577762603759766, + "logits/rejected": -2.3326945304870605, + "logps/chosen": -168.68948364257812, + "logps/rejected": -215.06640625, + "loss": 0.1271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2713942229747772, + "rewards/margins": 2.5090737342834473, + "rewards/rejected": -2.2376794815063477, + "step": 4649 + }, + { + "epoch": 0.54, + "learning_rate": 1.413086737679972e-07, + "logits/chosen": -2.1501624584198, + "logits/rejected": -2.3833768367767334, + "logps/chosen": -273.18121337890625, + "logps/rejected": -249.9713592529297, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7539253234863281, + "rewards/margins": 3.0028345584869385, + "rewards/rejected": -3.7567596435546875, + "step": 4650 + }, + { + "epoch": 0.54, + "learning_rate": 1.4127355729837292e-07, + "logits/chosen": -2.4564385414123535, + "logits/rejected": -2.6173055171966553, + "logps/chosen": -160.2106170654297, + "logps/rejected": -192.384033203125, + "loss": 0.1364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15168824791908264, + "rewards/margins": 3.5509257316589355, + "rewards/rejected": -3.7026140689849854, + "step": 4651 + }, + { + "epoch": 0.54, + "learning_rate": 1.4123844082874867e-07, + "logits/chosen": -2.5734355449676514, + "logits/rejected": -2.514514446258545, + "logps/chosen": -275.5256042480469, + "logps/rejected": -162.89678955078125, + "loss": 0.4472, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8914965391159058, + "rewards/margins": 1.8044543266296387, + "rewards/rejected": -2.695950984954834, + "step": 4652 + }, + { + "epoch": 0.54, + "learning_rate": 1.4120332435912443e-07, + "logits/chosen": -2.168384552001953, + "logits/rejected": -2.2394824028015137, + "logps/chosen": -346.85382080078125, + "logps/rejected": -312.446533203125, + "loss": 0.4248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5879580974578857, + "rewards/margins": 1.5062851905822754, + "rewards/rejected": -2.094243288040161, + "step": 4653 + }, + { + "epoch": 0.54, + "learning_rate": 1.4116820788950016e-07, + "logits/chosen": -2.625995397567749, + "logits/rejected": -2.507774591445923, + "logps/chosen": -138.39932250976562, + "logps/rejected": -179.94119262695312, + "loss": 0.4596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.988871693611145, + "rewards/margins": 1.6082265377044678, + "rewards/rejected": -2.5970983505249023, + "step": 4654 + }, + { + "epoch": 0.54, + "learning_rate": 1.411330914198759e-07, + "logits/chosen": -2.5371673107147217, + "logits/rejected": -2.542106866836548, + "logps/chosen": -131.2529296875, + "logps/rejected": -146.44485473632812, + "loss": 0.6514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6444640159606934, + "rewards/margins": 1.3582355976104736, + "rewards/rejected": -2.002699375152588, + "step": 4655 + }, + { + "epoch": 0.54, + "learning_rate": 1.4109797495025166e-07, + "logits/chosen": -2.09181547164917, + "logits/rejected": -2.280521869659424, + "logps/chosen": -416.0189208984375, + "logps/rejected": -292.43072509765625, + "loss": 0.2696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.768784761428833, + "rewards/margins": 2.1109566688537598, + "rewards/rejected": -2.8797414302825928, + "step": 4656 + }, + { + "epoch": 0.54, + "learning_rate": 1.4106285848062742e-07, + "logits/chosen": -2.0021657943725586, + "logits/rejected": -2.0529439449310303, + "logps/chosen": -364.73284912109375, + "logps/rejected": -343.7151794433594, + "loss": 0.3987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8037823438644409, + "rewards/margins": 1.8060404062271118, + "rewards/rejected": -2.6098225116729736, + "step": 4657 + }, + { + "epoch": 0.54, + "learning_rate": 1.4102774201100314e-07, + "logits/chosen": -2.54687237739563, + "logits/rejected": -2.518455982208252, + "logps/chosen": -265.74493408203125, + "logps/rejected": -208.13348388671875, + "loss": 0.5423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6512266993522644, + "rewards/margins": 0.571692943572998, + "rewards/rejected": -1.2229197025299072, + "step": 4658 + }, + { + "epoch": 0.54, + "learning_rate": 1.409926255413789e-07, + "logits/chosen": -2.3628296852111816, + "logits/rejected": -2.430018901824951, + "logps/chosen": -334.469970703125, + "logps/rejected": -222.35733032226562, + "loss": 0.2572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40968960523605347, + "rewards/margins": 2.279755115509033, + "rewards/rejected": -2.6894445419311523, + "step": 4659 + }, + { + "epoch": 0.54, + "learning_rate": 1.4095750907175465e-07, + "logits/chosen": -2.320133686065674, + "logits/rejected": -2.42474627494812, + "logps/chosen": -208.34115600585938, + "logps/rejected": -213.03445434570312, + "loss": 0.6529, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4473494291305542, + "rewards/margins": 1.9888712167739868, + "rewards/rejected": -3.436220645904541, + "step": 4660 + }, + { + "epoch": 0.54, + "learning_rate": 1.409223926021304e-07, + "logits/chosen": -2.277052640914917, + "logits/rejected": -2.3101935386657715, + "logps/chosen": -418.98486328125, + "logps/rejected": -317.79449462890625, + "loss": 0.4526, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.229347586631775, + "rewards/margins": 3.0325002670288086, + "rewards/rejected": -4.261847972869873, + "step": 4661 + }, + { + "epoch": 0.54, + "learning_rate": 1.4088727613250613e-07, + "logits/chosen": -1.9387941360473633, + "logits/rejected": -1.9681607484817505, + "logps/chosen": -421.2774658203125, + "logps/rejected": -419.663818359375, + "loss": 0.4181, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2642734050750732, + "rewards/margins": 2.3274357318878174, + "rewards/rejected": -3.5917091369628906, + "step": 4662 + }, + { + "epoch": 0.54, + "learning_rate": 1.408521596628819e-07, + "logits/chosen": -2.504359483718872, + "logits/rejected": -2.4909257888793945, + "logps/chosen": -254.63600158691406, + "logps/rejected": -293.94952392578125, + "loss": 0.2831, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6383199691772461, + "rewards/margins": 1.8861948251724243, + "rewards/rejected": -2.52451491355896, + "step": 4663 + }, + { + "epoch": 0.54, + "learning_rate": 1.4081704319325764e-07, + "logits/chosen": -2.5996193885803223, + "logits/rejected": -2.9139552116394043, + "logps/chosen": -303.9418029785156, + "logps/rejected": -224.72036743164062, + "loss": 0.3938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.710679292678833, + "rewards/margins": 1.910790205001831, + "rewards/rejected": -2.621469497680664, + "step": 4664 + }, + { + "epoch": 0.54, + "learning_rate": 1.407819267236334e-07, + "logits/chosen": -2.4083845615386963, + "logits/rejected": -2.5214874744415283, + "logps/chosen": -103.58615112304688, + "logps/rejected": -147.50546264648438, + "loss": 0.362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7585685849189758, + "rewards/margins": 2.666978120803833, + "rewards/rejected": -3.425546884536743, + "step": 4665 + }, + { + "epoch": 0.54, + "learning_rate": 1.4074681025400912e-07, + "logits/chosen": -2.7168805599212646, + "logits/rejected": -2.711575508117676, + "logps/chosen": -234.82147216796875, + "logps/rejected": -227.42999267578125, + "loss": 0.1052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.443274587392807, + "rewards/margins": 3.310800313949585, + "rewards/rejected": -3.7540745735168457, + "step": 4666 + }, + { + "epoch": 0.54, + "learning_rate": 1.4071169378438488e-07, + "logits/chosen": -2.175266742706299, + "logits/rejected": -2.4092955589294434, + "logps/chosen": -250.97071838378906, + "logps/rejected": -193.34832763671875, + "loss": 0.4304, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4962504506111145, + "rewards/margins": 1.5004216432571411, + "rewards/rejected": -1.9966720342636108, + "step": 4667 + }, + { + "epoch": 0.54, + "learning_rate": 1.406765773147606e-07, + "logits/chosen": -2.530606985092163, + "logits/rejected": -2.7816145420074463, + "logps/chosen": -181.92572021484375, + "logps/rejected": -218.8867950439453, + "loss": 0.1211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08366921544075012, + "rewards/margins": 3.867915630340576, + "rewards/rejected": -3.951584815979004, + "step": 4668 + }, + { + "epoch": 0.54, + "learning_rate": 1.4064146084513636e-07, + "logits/chosen": -2.481567859649658, + "logits/rejected": -2.4614057540893555, + "logps/chosen": -168.74270629882812, + "logps/rejected": -142.98587036132812, + "loss": 0.5229, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003484085202217102, + "rewards/margins": 0.96924889087677, + "rewards/rejected": -0.9657648801803589, + "step": 4669 + }, + { + "epoch": 0.54, + "learning_rate": 1.406063443755121e-07, + "logits/chosen": -2.0912704467773438, + "logits/rejected": -2.083850145339966, + "logps/chosen": -262.7591552734375, + "logps/rejected": -312.1327819824219, + "loss": 0.4964, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0970603227615356, + "rewards/margins": 1.3592599630355835, + "rewards/rejected": -2.456320285797119, + "step": 4670 + }, + { + "epoch": 0.54, + "learning_rate": 1.4057122790588784e-07, + "logits/chosen": -2.39577317237854, + "logits/rejected": -2.2764134407043457, + "logps/chosen": -154.080810546875, + "logps/rejected": -235.59164428710938, + "loss": 0.1846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44170570373535156, + "rewards/margins": 2.4032528400421143, + "rewards/rejected": -2.844958543777466, + "step": 4671 + }, + { + "epoch": 0.54, + "learning_rate": 1.405361114362636e-07, + "logits/chosen": -2.6163299083709717, + "logits/rejected": -2.780449151992798, + "logps/chosen": -375.00189208984375, + "logps/rejected": -326.9349060058594, + "loss": 0.3618, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.377819538116455, + "rewards/margins": 1.8372230529785156, + "rewards/rejected": -3.2150425910949707, + "step": 4672 + }, + { + "epoch": 0.54, + "learning_rate": 1.4050099496663935e-07, + "logits/chosen": -1.9704219102859497, + "logits/rejected": -1.7115920782089233, + "logps/chosen": -213.24005126953125, + "logps/rejected": -325.32476806640625, + "loss": 0.6931, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2003371715545654, + "rewards/margins": 0.9453713893890381, + "rewards/rejected": -2.1457085609436035, + "step": 4673 + }, + { + "epoch": 0.54, + "learning_rate": 1.404658784970151e-07, + "logits/chosen": -1.9797896146774292, + "logits/rejected": -1.9964138269424438, + "logps/chosen": -358.2486877441406, + "logps/rejected": -380.0574951171875, + "loss": 0.3368, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4871090650558472, + "rewards/margins": 2.1576919555664062, + "rewards/rejected": -3.644801139831543, + "step": 4674 + }, + { + "epoch": 0.54, + "learning_rate": 1.4043076202739083e-07, + "logits/chosen": -2.7674803733825684, + "logits/rejected": -2.5872504711151123, + "logps/chosen": -148.0118865966797, + "logps/rejected": -205.54852294921875, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2788622379302979, + "rewards/margins": 2.3751120567321777, + "rewards/rejected": -3.6539742946624756, + "step": 4675 + }, + { + "epoch": 0.54, + "learning_rate": 1.4039564555776658e-07, + "logits/chosen": -2.2726452350616455, + "logits/rejected": -2.200684070587158, + "logps/chosen": -365.9416809082031, + "logps/rejected": -407.482421875, + "loss": 0.2328, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2717553973197937, + "rewards/margins": 2.5379748344421387, + "rewards/rejected": -2.8097305297851562, + "step": 4676 + }, + { + "epoch": 0.54, + "learning_rate": 1.4036052908814234e-07, + "logits/chosen": -2.557997226715088, + "logits/rejected": -2.5381550788879395, + "logps/chosen": -185.05699157714844, + "logps/rejected": -187.614013671875, + "loss": 0.4696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7951751947402954, + "rewards/margins": 0.8024019598960876, + "rewards/rejected": -1.5975770950317383, + "step": 4677 + }, + { + "epoch": 0.54, + "learning_rate": 1.403254126185181e-07, + "logits/chosen": -2.2878546714782715, + "logits/rejected": -1.9094903469085693, + "logps/chosen": -221.55564880371094, + "logps/rejected": -218.16665649414062, + "loss": 0.5272, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2672688961029053, + "rewards/margins": 1.6307610273361206, + "rewards/rejected": -2.8980302810668945, + "step": 4678 + }, + { + "epoch": 0.54, + "learning_rate": 1.4029029614889382e-07, + "logits/chosen": -2.1756081581115723, + "logits/rejected": -2.2022628784179688, + "logps/chosen": -144.9686737060547, + "logps/rejected": -185.51976013183594, + "loss": 0.1367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9344289302825928, + "rewards/margins": 2.7599000930786133, + "rewards/rejected": -3.694328784942627, + "step": 4679 + }, + { + "epoch": 0.54, + "learning_rate": 1.4025517967926957e-07, + "logits/chosen": -2.2368783950805664, + "logits/rejected": -2.3495144844055176, + "logps/chosen": -267.6463623046875, + "logps/rejected": -294.70281982421875, + "loss": 0.6854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9744809865951538, + "rewards/margins": 2.904121160507202, + "rewards/rejected": -3.8786022663116455, + "step": 4680 + }, + { + "epoch": 0.54, + "learning_rate": 1.4022006320964532e-07, + "logits/chosen": -2.711851119995117, + "logits/rejected": -2.7045774459838867, + "logps/chosen": -163.43588256835938, + "logps/rejected": -241.65023803710938, + "loss": 0.5131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9806638956069946, + "rewards/margins": 1.3934212923049927, + "rewards/rejected": -2.3740851879119873, + "step": 4681 + }, + { + "epoch": 0.54, + "learning_rate": 1.4018494674002108e-07, + "logits/chosen": -2.3472847938537598, + "logits/rejected": -2.0076892375946045, + "logps/chosen": -141.6386260986328, + "logps/rejected": -312.96356201171875, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0452224016189575, + "rewards/margins": 4.06363582611084, + "rewards/rejected": -5.108858108520508, + "step": 4682 + }, + { + "epoch": 0.54, + "learning_rate": 1.401498302703968e-07, + "logits/chosen": -2.3788318634033203, + "logits/rejected": -2.372943162918091, + "logps/chosen": -170.36248779296875, + "logps/rejected": -95.65077209472656, + "loss": 0.668, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6385971307754517, + "rewards/margins": 0.6698853969573975, + "rewards/rejected": -1.3084825277328491, + "step": 4683 + }, + { + "epoch": 0.54, + "learning_rate": 1.4011471380077256e-07, + "logits/chosen": -2.389979839324951, + "logits/rejected": -2.348573923110962, + "logps/chosen": -394.12518310546875, + "logps/rejected": -410.42181396484375, + "loss": 0.4765, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5365633964538574, + "rewards/margins": 1.4483273029327393, + "rewards/rejected": -2.9848906993865967, + "step": 4684 + }, + { + "epoch": 0.54, + "learning_rate": 1.400795973311483e-07, + "logits/chosen": -2.190666675567627, + "logits/rejected": -2.2718749046325684, + "logps/chosen": -270.59710693359375, + "logps/rejected": -219.507568359375, + "loss": 0.3618, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1541028022766113, + "rewards/margins": 1.7487716674804688, + "rewards/rejected": -2.90287446975708, + "step": 4685 + }, + { + "epoch": 0.54, + "learning_rate": 1.4004448086152404e-07, + "logits/chosen": -2.2684268951416016, + "logits/rejected": -2.350329637527466, + "logps/chosen": -195.17843627929688, + "logps/rejected": -173.8470458984375, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6160518527030945, + "rewards/margins": 2.3059160709381104, + "rewards/rejected": -2.9219679832458496, + "step": 4686 + }, + { + "epoch": 0.54, + "learning_rate": 1.400093643918998e-07, + "logits/chosen": -1.9388840198516846, + "logits/rejected": -2.2120749950408936, + "logps/chosen": -308.64117431640625, + "logps/rejected": -280.40692138671875, + "loss": 0.47, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4830264151096344, + "rewards/margins": 2.8086345195770264, + "rewards/rejected": -3.291661024093628, + "step": 4687 + }, + { + "epoch": 0.54, + "learning_rate": 1.3997424792227552e-07, + "logits/chosen": -1.500317096710205, + "logits/rejected": -1.830981731414795, + "logps/chosen": -309.22064208984375, + "logps/rejected": -242.42138671875, + "loss": 0.6468, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2874082326889038, + "rewards/margins": 0.5085300803184509, + "rewards/rejected": -1.79593825340271, + "step": 4688 + }, + { + "epoch": 0.54, + "learning_rate": 1.399391314526513e-07, + "logits/chosen": -2.012239694595337, + "logits/rejected": -2.18068265914917, + "logps/chosen": -392.067626953125, + "logps/rejected": -307.6922607421875, + "loss": 0.4172, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.052905797958374, + "rewards/margins": 2.0228705406188965, + "rewards/rejected": -3.0757761001586914, + "step": 4689 + }, + { + "epoch": 0.54, + "learning_rate": 1.3990401498302703e-07, + "logits/chosen": -2.00974702835083, + "logits/rejected": -2.2071533203125, + "logps/chosen": -327.2082214355469, + "logps/rejected": -233.9256591796875, + "loss": 0.5806, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5443958044052124, + "rewards/margins": 0.929771900177002, + "rewards/rejected": -1.4741675853729248, + "step": 4690 + }, + { + "epoch": 0.54, + "learning_rate": 1.3986889851340278e-07, + "logits/chosen": -2.332869052886963, + "logits/rejected": -2.2040963172912598, + "logps/chosen": -212.24374389648438, + "logps/rejected": -200.29257202148438, + "loss": 0.6119, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8698627948760986, + "rewards/margins": 0.4375637173652649, + "rewards/rejected": -2.3074264526367188, + "step": 4691 + }, + { + "epoch": 0.54, + "learning_rate": 1.398337820437785e-07, + "logits/chosen": -2.5010533332824707, + "logits/rejected": -2.709749460220337, + "logps/chosen": -190.3934783935547, + "logps/rejected": -175.0540313720703, + "loss": 0.6209, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8562482595443726, + "rewards/margins": 1.165457010269165, + "rewards/rejected": -2.021705389022827, + "step": 4692 + }, + { + "epoch": 0.54, + "learning_rate": 1.397986655741543e-07, + "logits/chosen": -2.3154544830322266, + "logits/rejected": -2.159454107284546, + "logps/chosen": -282.90325927734375, + "logps/rejected": -167.31561279296875, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2151880264282227, + "rewards/margins": 1.1792744398117065, + "rewards/rejected": -2.3944625854492188, + "step": 4693 + }, + { + "epoch": 0.54, + "learning_rate": 1.3976354910453002e-07, + "logits/chosen": -2.0586183071136475, + "logits/rejected": -2.1203408241271973, + "logps/chosen": -232.1318359375, + "logps/rejected": -372.66552734375, + "loss": 0.2071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4424954652786255, + "rewards/margins": 3.9450302124023438, + "rewards/rejected": -4.38752555847168, + "step": 4694 + }, + { + "epoch": 0.54, + "learning_rate": 1.3972843263490577e-07, + "logits/chosen": -2.8769216537475586, + "logits/rejected": -2.9214417934417725, + "logps/chosen": -223.89352416992188, + "logps/rejected": -297.27679443359375, + "loss": 0.5888, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1953033208847046, + "rewards/margins": 2.9548375606536865, + "rewards/rejected": -4.150140762329102, + "step": 4695 + }, + { + "epoch": 0.54, + "learning_rate": 1.396933161652815e-07, + "logits/chosen": -2.5091664791107178, + "logits/rejected": -2.6034834384918213, + "logps/chosen": -226.7485809326172, + "logps/rejected": -280.0003356933594, + "loss": 0.4135, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.008162260055542, + "rewards/margins": 2.415501594543457, + "rewards/rejected": -3.423663854598999, + "step": 4696 + }, + { + "epoch": 0.54, + "learning_rate": 1.3965819969565725e-07, + "logits/chosen": -3.0132131576538086, + "logits/rejected": -3.0019419193267822, + "logps/chosen": -249.25436401367188, + "logps/rejected": -280.9468078613281, + "loss": 0.4284, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2727713584899902, + "rewards/margins": 3.1816794872283936, + "rewards/rejected": -4.454451084136963, + "step": 4697 + }, + { + "epoch": 0.54, + "learning_rate": 1.39623083226033e-07, + "logits/chosen": -2.190458059310913, + "logits/rejected": -2.323636770248413, + "logps/chosen": -402.11590576171875, + "logps/rejected": -350.18902587890625, + "loss": 0.2369, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5730869770050049, + "rewards/margins": 2.167562484741211, + "rewards/rejected": -2.740649461746216, + "step": 4698 + }, + { + "epoch": 0.54, + "learning_rate": 1.3958796675640876e-07, + "logits/chosen": -2.2422385215759277, + "logits/rejected": -2.2789218425750732, + "logps/chosen": -180.10690307617188, + "logps/rejected": -128.9542694091797, + "loss": 0.6008, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5851205587387085, + "rewards/margins": 0.6389923095703125, + "rewards/rejected": -1.224112868309021, + "step": 4699 + }, + { + "epoch": 0.54, + "learning_rate": 1.395528502867845e-07, + "logits/chosen": -2.3580684661865234, + "logits/rejected": -2.0002684593200684, + "logps/chosen": -265.0491027832031, + "logps/rejected": -323.1110534667969, + "loss": 0.4516, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7465962171554565, + "rewards/margins": 2.4336633682250977, + "rewards/rejected": -4.180259704589844, + "step": 4700 + }, + { + "epoch": 0.54, + "learning_rate": 1.3951773381716024e-07, + "logits/chosen": -2.1523702144622803, + "logits/rejected": -2.5484118461608887, + "logps/chosen": -331.32452392578125, + "logps/rejected": -311.4661865234375, + "loss": 0.4227, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5000629425048828, + "rewards/margins": 1.0739203691482544, + "rewards/rejected": -2.5739834308624268, + "step": 4701 + }, + { + "epoch": 0.54, + "learning_rate": 1.39482617347536e-07, + "logits/chosen": -2.3154938220977783, + "logits/rejected": -2.11558198928833, + "logps/chosen": -102.41669464111328, + "logps/rejected": -290.67657470703125, + "loss": 0.3018, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.502341628074646, + "rewards/margins": 1.850561499595642, + "rewards/rejected": -2.352903127670288, + "step": 4702 + }, + { + "epoch": 0.54, + "learning_rate": 1.3944750087791172e-07, + "logits/chosen": -2.285971164703369, + "logits/rejected": -2.511857509613037, + "logps/chosen": -467.41748046875, + "logps/rejected": -531.48876953125, + "loss": 0.1688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6795567870140076, + "rewards/margins": 2.7301695346832275, + "rewards/rejected": -3.40972638130188, + "step": 4703 + }, + { + "epoch": 0.54, + "learning_rate": 1.3941238440828748e-07, + "logits/chosen": -2.6573500633239746, + "logits/rejected": -2.830669403076172, + "logps/chosen": -175.58900451660156, + "logps/rejected": -214.15811157226562, + "loss": 0.4495, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4772425889968872, + "rewards/margins": 2.0253067016601562, + "rewards/rejected": -3.502549171447754, + "step": 4704 + }, + { + "epoch": 0.54, + "learning_rate": 1.3937726793866323e-07, + "logits/chosen": -2.955780029296875, + "logits/rejected": -3.022489070892334, + "logps/chosen": -105.45494079589844, + "logps/rejected": -151.55349731445312, + "loss": 0.5287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5134501457214355, + "rewards/margins": 2.5656046867370605, + "rewards/rejected": -3.079055070877075, + "step": 4705 + }, + { + "epoch": 0.54, + "learning_rate": 1.3934215146903899e-07, + "logits/chosen": -1.8189364671707153, + "logits/rejected": -1.8308746814727783, + "logps/chosen": -287.84710693359375, + "logps/rejected": -275.99383544921875, + "loss": 0.6573, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0156919956207275, + "rewards/margins": 1.7904435396194458, + "rewards/rejected": -2.806135416030884, + "step": 4706 + }, + { + "epoch": 0.54, + "learning_rate": 1.393070349994147e-07, + "logits/chosen": -1.9576683044433594, + "logits/rejected": -1.9999182224273682, + "logps/chosen": -249.91062927246094, + "logps/rejected": -349.9608154296875, + "loss": 0.1512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44058817625045776, + "rewards/margins": 3.077335834503174, + "rewards/rejected": -3.5179243087768555, + "step": 4707 + }, + { + "epoch": 0.54, + "learning_rate": 1.3927191852979047e-07, + "logits/chosen": -2.1674416065216064, + "logits/rejected": -2.1598639488220215, + "logps/chosen": -210.71630859375, + "logps/rejected": -244.8988037109375, + "loss": 0.2008, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1472655534744263, + "rewards/margins": 3.1560630798339844, + "rewards/rejected": -4.303328990936279, + "step": 4708 + }, + { + "epoch": 0.54, + "learning_rate": 1.3923680206016622e-07, + "logits/chosen": -1.992042064666748, + "logits/rejected": -2.3617734909057617, + "logps/chosen": -427.2333984375, + "logps/rejected": -183.26971435546875, + "loss": 0.2865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8133429884910583, + "rewards/margins": 2.251328468322754, + "rewards/rejected": -3.064671516418457, + "step": 4709 + }, + { + "epoch": 0.54, + "learning_rate": 1.3920168559054197e-07, + "logits/chosen": -2.7310452461242676, + "logits/rejected": -2.788832187652588, + "logps/chosen": -203.2854461669922, + "logps/rejected": -297.7886047363281, + "loss": 0.2866, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2891008853912354, + "rewards/margins": 1.9397262334823608, + "rewards/rejected": -3.2288269996643066, + "step": 4710 + }, + { + "epoch": 0.54, + "learning_rate": 1.391665691209177e-07, + "logits/chosen": -2.8365068435668945, + "logits/rejected": -2.8110384941101074, + "logps/chosen": -142.14959716796875, + "logps/rejected": -170.0719757080078, + "loss": 0.5284, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3555775880813599, + "rewards/margins": 1.7340314388275146, + "rewards/rejected": -3.089608907699585, + "step": 4711 + }, + { + "epoch": 0.54, + "learning_rate": 1.3913145265129346e-07, + "logits/chosen": -2.319941997528076, + "logits/rejected": -2.0225541591644287, + "logps/chosen": -228.066650390625, + "logps/rejected": -242.80133056640625, + "loss": 0.5094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9940363168716431, + "rewards/margins": 1.2486869096755981, + "rewards/rejected": -2.2427234649658203, + "step": 4712 + }, + { + "epoch": 0.54, + "learning_rate": 1.3909633618166918e-07, + "logits/chosen": -2.335282325744629, + "logits/rejected": -2.3811440467834473, + "logps/chosen": -326.50439453125, + "logps/rejected": -319.68670654296875, + "loss": 0.4133, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1066526174545288, + "rewards/margins": 1.1758520603179932, + "rewards/rejected": -2.2825045585632324, + "step": 4713 + }, + { + "epoch": 0.54, + "learning_rate": 1.3906121971204494e-07, + "logits/chosen": -1.6868810653686523, + "logits/rejected": -1.884109377861023, + "logps/chosen": -439.12725830078125, + "logps/rejected": -356.8514404296875, + "loss": 0.3103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7980567216873169, + "rewards/margins": 3.790369987487793, + "rewards/rejected": -4.58842658996582, + "step": 4714 + }, + { + "epoch": 0.54, + "learning_rate": 1.390261032424207e-07, + "logits/chosen": -1.987750768661499, + "logits/rejected": -2.1238927841186523, + "logps/chosen": -443.538818359375, + "logps/rejected": -289.130615234375, + "loss": 0.2, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9622253179550171, + "rewards/margins": 2.990023374557495, + "rewards/rejected": -3.9522485733032227, + "step": 4715 + }, + { + "epoch": 0.54, + "learning_rate": 1.3899098677279644e-07, + "logits/chosen": -1.945294976234436, + "logits/rejected": -1.861660361289978, + "logps/chosen": -342.4302673339844, + "logps/rejected": -421.9557800292969, + "loss": 0.3664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8736848831176758, + "rewards/margins": 2.3491904735565186, + "rewards/rejected": -3.2228753566741943, + "step": 4716 + }, + { + "epoch": 0.54, + "learning_rate": 1.3895587030317217e-07, + "logits/chosen": -2.2172863483428955, + "logits/rejected": -2.0145180225372314, + "logps/chosen": -187.9818878173828, + "logps/rejected": -273.32318115234375, + "loss": 0.4408, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1588635444641113, + "rewards/margins": 2.267930030822754, + "rewards/rejected": -3.426793336868286, + "step": 4717 + }, + { + "epoch": 0.54, + "learning_rate": 1.3892075383354793e-07, + "logits/chosen": -1.9662224054336548, + "logits/rejected": -2.059295892715454, + "logps/chosen": -346.1848449707031, + "logps/rejected": -322.033203125, + "loss": 0.4831, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7879611849784851, + "rewards/margins": 2.354789972305298, + "rewards/rejected": -3.142751455307007, + "step": 4718 + }, + { + "epoch": 0.54, + "learning_rate": 1.3888563736392368e-07, + "logits/chosen": -2.1098532676696777, + "logits/rejected": -1.9116101264953613, + "logps/chosen": -196.99359130859375, + "logps/rejected": -319.0745849609375, + "loss": 0.2447, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2893590927124023, + "rewards/margins": 3.9336767196655273, + "rewards/rejected": -5.22303581237793, + "step": 4719 + }, + { + "epoch": 0.54, + "learning_rate": 1.388505208942994e-07, + "logits/chosen": -2.6691536903381348, + "logits/rejected": -2.5468223094940186, + "logps/chosen": -264.0138244628906, + "logps/rejected": -227.09144592285156, + "loss": 0.5873, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5941627025604248, + "rewards/margins": 2.3890233039855957, + "rewards/rejected": -2.9831862449645996, + "step": 4720 + }, + { + "epoch": 0.54, + "learning_rate": 1.3881540442467516e-07, + "logits/chosen": -1.4958100318908691, + "logits/rejected": -2.0363171100616455, + "logps/chosen": -678.2284545898438, + "logps/rejected": -287.08355712890625, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8254482746124268, + "rewards/margins": 1.081298589706421, + "rewards/rejected": -1.9067468643188477, + "step": 4721 + }, + { + "epoch": 0.54, + "learning_rate": 1.3878028795505091e-07, + "logits/chosen": -1.630255103111267, + "logits/rejected": -1.9814032316207886, + "logps/chosen": -481.20343017578125, + "logps/rejected": -250.36123657226562, + "loss": 0.4152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07325619459152222, + "rewards/margins": 2.35868239402771, + "rewards/rejected": -2.431938648223877, + "step": 4722 + }, + { + "epoch": 0.54, + "learning_rate": 1.3874517148542667e-07, + "logits/chosen": -2.0228962898254395, + "logits/rejected": -2.201138496398926, + "logps/chosen": -409.6976318359375, + "logps/rejected": -278.41357421875, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8229672312736511, + "rewards/margins": 1.8486857414245605, + "rewards/rejected": -2.6716532707214355, + "step": 4723 + }, + { + "epoch": 0.54, + "learning_rate": 1.387100550158024e-07, + "logits/chosen": -1.9444761276245117, + "logits/rejected": -2.27274751663208, + "logps/chosen": -443.4758605957031, + "logps/rejected": -341.0003662109375, + "loss": 0.3935, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3288061618804932, + "rewards/margins": 1.3373677730560303, + "rewards/rejected": -2.6661739349365234, + "step": 4724 + }, + { + "epoch": 0.54, + "learning_rate": 1.3867493854617815e-07, + "logits/chosen": -2.357743501663208, + "logits/rejected": -2.680394411087036, + "logps/chosen": -330.6519775390625, + "logps/rejected": -305.22930908203125, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3584246039390564, + "rewards/margins": 3.3319783210754395, + "rewards/rejected": -3.6904029846191406, + "step": 4725 + }, + { + "epoch": 0.54, + "learning_rate": 1.386398220765539e-07, + "logits/chosen": -2.9177803993225098, + "logits/rejected": -2.779207706451416, + "logps/chosen": -233.69186401367188, + "logps/rejected": -249.88778686523438, + "loss": 0.4127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47669005393981934, + "rewards/margins": 2.287050247192383, + "rewards/rejected": -2.763740301132202, + "step": 4726 + }, + { + "epoch": 0.54, + "learning_rate": 1.3860470560692966e-07, + "logits/chosen": -2.421966075897217, + "logits/rejected": -2.310546636581421, + "logps/chosen": -284.5638122558594, + "logps/rejected": -289.5490417480469, + "loss": 0.2766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0791082382202148, + "rewards/margins": 2.1884939670562744, + "rewards/rejected": -3.26760196685791, + "step": 4727 + }, + { + "epoch": 0.55, + "learning_rate": 1.3856958913730538e-07, + "logits/chosen": -1.6182091236114502, + "logits/rejected": -1.8690624237060547, + "logps/chosen": -522.0306396484375, + "logps/rejected": -410.906494140625, + "loss": 0.4467, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6807183027267456, + "rewards/margins": 2.899718761444092, + "rewards/rejected": -3.580437183380127, + "step": 4728 + }, + { + "epoch": 0.55, + "learning_rate": 1.3853447266768114e-07, + "logits/chosen": -1.5200164318084717, + "logits/rejected": -1.9884917736053467, + "logps/chosen": -420.39501953125, + "logps/rejected": -238.79006958007812, + "loss": 0.3602, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.044442757964134216, + "rewards/margins": 1.2495366334915161, + "rewards/rejected": -1.2939794063568115, + "step": 4729 + }, + { + "epoch": 0.55, + "learning_rate": 1.384993561980569e-07, + "logits/chosen": -2.2000977993011475, + "logits/rejected": -2.2399721145629883, + "logps/chosen": -247.6924285888672, + "logps/rejected": -185.50491333007812, + "loss": 0.8003, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0101313591003418, + "rewards/margins": 1.469778060913086, + "rewards/rejected": -2.4799094200134277, + "step": 4730 + }, + { + "epoch": 0.55, + "learning_rate": 1.3846423972843262e-07, + "logits/chosen": -2.1212735176086426, + "logits/rejected": -1.9572203159332275, + "logps/chosen": -300.9416198730469, + "logps/rejected": -345.2101135253906, + "loss": 0.5378, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8472627997398376, + "rewards/margins": 3.418429136276245, + "rewards/rejected": -4.265691757202148, + "step": 4731 + }, + { + "epoch": 0.55, + "learning_rate": 1.3842912325880837e-07, + "logits/chosen": -2.3349266052246094, + "logits/rejected": -2.3630528450012207, + "logps/chosen": -432.38104248046875, + "logps/rejected": -265.9549560546875, + "loss": 0.4034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9798500537872314, + "rewards/margins": 1.1477165222167969, + "rewards/rejected": -2.1275665760040283, + "step": 4732 + }, + { + "epoch": 0.55, + "learning_rate": 1.3839400678918413e-07, + "logits/chosen": -2.3222362995147705, + "logits/rejected": -2.4657464027404785, + "logps/chosen": -385.6435241699219, + "logps/rejected": -324.62469482421875, + "loss": 0.4351, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3970074653625488, + "rewards/margins": 1.5906869173049927, + "rewards/rejected": -2.987694263458252, + "step": 4733 + }, + { + "epoch": 0.55, + "learning_rate": 1.3835889031955988e-07, + "logits/chosen": -2.831829071044922, + "logits/rejected": -2.653765916824341, + "logps/chosen": -89.31192016601562, + "logps/rejected": -164.8464813232422, + "loss": 0.5068, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.217795968055725, + "rewards/margins": 0.9930859804153442, + "rewards/rejected": -2.2108819484710693, + "step": 4734 + }, + { + "epoch": 0.55, + "learning_rate": 1.383237738499356e-07, + "logits/chosen": -2.9135842323303223, + "logits/rejected": -2.9181575775146484, + "logps/chosen": -267.48651123046875, + "logps/rejected": -201.47125244140625, + "loss": 0.4687, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0337520837783813, + "rewards/margins": 0.6998263001441956, + "rewards/rejected": -1.7335783243179321, + "step": 4735 + }, + { + "epoch": 0.55, + "learning_rate": 1.3828865738031136e-07, + "logits/chosen": -2.3993301391601562, + "logits/rejected": -2.3742387294769287, + "logps/chosen": -330.52484130859375, + "logps/rejected": -325.9254150390625, + "loss": 0.2256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5322688221931458, + "rewards/margins": 2.382641315460205, + "rewards/rejected": -2.914910316467285, + "step": 4736 + }, + { + "epoch": 0.55, + "learning_rate": 1.382535409106871e-07, + "logits/chosen": -2.271033525466919, + "logits/rejected": -1.7932071685791016, + "logps/chosen": -162.03109741210938, + "logps/rejected": -321.21343994140625, + "loss": 0.4714, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.267367959022522, + "rewards/margins": 1.1430965662002563, + "rewards/rejected": -2.4104645252227783, + "step": 4737 + }, + { + "epoch": 0.55, + "learning_rate": 1.3821842444106287e-07, + "logits/chosen": -1.9814594984054565, + "logits/rejected": -2.2131528854370117, + "logps/chosen": -262.1648254394531, + "logps/rejected": -237.5935821533203, + "loss": 1.0884, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9253559112548828, + "rewards/margins": 1.3172682523727417, + "rewards/rejected": -3.242624282836914, + "step": 4738 + }, + { + "epoch": 0.55, + "learning_rate": 1.381833079714386e-07, + "logits/chosen": -2.531790018081665, + "logits/rejected": -2.737968921661377, + "logps/chosen": -314.2982177734375, + "logps/rejected": -205.62466430664062, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8355978727340698, + "rewards/margins": 2.4905667304992676, + "rewards/rejected": -3.326164722442627, + "step": 4739 + }, + { + "epoch": 0.55, + "learning_rate": 1.3814819150181435e-07, + "logits/chosen": -1.98294198513031, + "logits/rejected": -2.001476764678955, + "logps/chosen": -279.9991149902344, + "logps/rejected": -236.0902099609375, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.398165225982666, + "rewards/margins": 2.107870578765869, + "rewards/rejected": -2.506035804748535, + "step": 4740 + }, + { + "epoch": 0.55, + "learning_rate": 1.3811307503219008e-07, + "logits/chosen": -2.3133461475372314, + "logits/rejected": -2.407539129257202, + "logps/chosen": -192.08090209960938, + "logps/rejected": -103.14540100097656, + "loss": 0.3225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.158979594707489, + "rewards/margins": 1.8635939359664917, + "rewards/rejected": -2.022573471069336, + "step": 4741 + }, + { + "epoch": 0.55, + "learning_rate": 1.3807795856256583e-07, + "logits/chosen": -2.418886423110962, + "logits/rejected": -2.32076358795166, + "logps/chosen": -285.77960205078125, + "logps/rejected": -310.8924255371094, + "loss": 0.3518, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1429007053375244, + "rewards/margins": 2.249504804611206, + "rewards/rejected": -3.3924055099487305, + "step": 4742 + }, + { + "epoch": 0.55, + "learning_rate": 1.3804284209294159e-07, + "logits/chosen": -2.4591867923736572, + "logits/rejected": -2.3911304473876953, + "logps/chosen": -428.8582458496094, + "logps/rejected": -231.12188720703125, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8194006681442261, + "rewards/margins": 2.2435948848724365, + "rewards/rejected": -3.062995433807373, + "step": 4743 + }, + { + "epoch": 0.55, + "learning_rate": 1.3800772562331734e-07, + "logits/chosen": -1.7301013469696045, + "logits/rejected": -1.7783524990081787, + "logps/chosen": -233.1541748046875, + "logps/rejected": -236.17291259765625, + "loss": 0.5672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.98287433385849, + "rewards/margins": 0.7939358949661255, + "rewards/rejected": -1.7768101692199707, + "step": 4744 + }, + { + "epoch": 0.55, + "learning_rate": 1.3797260915369307e-07, + "logits/chosen": -2.493656635284424, + "logits/rejected": -2.4422149658203125, + "logps/chosen": -257.6549072265625, + "logps/rejected": -250.7899169921875, + "loss": 0.3974, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04035656154155731, + "rewards/margins": 1.526023030281067, + "rewards/rejected": -1.5663797855377197, + "step": 4745 + }, + { + "epoch": 0.55, + "learning_rate": 1.3793749268406882e-07, + "logits/chosen": -2.289764881134033, + "logits/rejected": -2.6204702854156494, + "logps/chosen": -351.79583740234375, + "logps/rejected": -232.0137176513672, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12381874024868011, + "rewards/margins": 2.627811908721924, + "rewards/rejected": -2.7516305446624756, + "step": 4746 + }, + { + "epoch": 0.55, + "learning_rate": 1.3790237621444458e-07, + "logits/chosen": -2.555459499359131, + "logits/rejected": -2.579123020172119, + "logps/chosen": -232.11526489257812, + "logps/rejected": -215.72952270507812, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6169214248657227, + "rewards/margins": 2.504429340362549, + "rewards/rejected": -3.1213510036468506, + "step": 4747 + }, + { + "epoch": 0.55, + "learning_rate": 1.378672597448203e-07, + "logits/chosen": -2.614344596862793, + "logits/rejected": -2.5509755611419678, + "logps/chosen": -285.46771240234375, + "logps/rejected": -379.96624755859375, + "loss": 0.6072, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23174463212490082, + "rewards/margins": 4.007699489593506, + "rewards/rejected": -4.239443778991699, + "step": 4748 + }, + { + "epoch": 0.55, + "learning_rate": 1.3783214327519606e-07, + "logits/chosen": -2.640320301055908, + "logits/rejected": -2.589484453201294, + "logps/chosen": -253.18858337402344, + "logps/rejected": -227.9335174560547, + "loss": 0.4127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3381566107273102, + "rewards/margins": 1.9014720916748047, + "rewards/rejected": -2.239628791809082, + "step": 4749 + }, + { + "epoch": 0.55, + "learning_rate": 1.377970268055718e-07, + "logits/chosen": -2.293022871017456, + "logits/rejected": -2.477102518081665, + "logps/chosen": -362.9917297363281, + "logps/rejected": -365.1850891113281, + "loss": 0.2634, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38321372866630554, + "rewards/margins": 2.3871514797210693, + "rewards/rejected": -2.770364999771118, + "step": 4750 + }, + { + "epoch": 0.55, + "learning_rate": 1.3776191033594756e-07, + "logits/chosen": -2.2565743923187256, + "logits/rejected": -2.1474053859710693, + "logps/chosen": -138.2124786376953, + "logps/rejected": -214.98289489746094, + "loss": 0.2607, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0021278858184814, + "rewards/margins": 3.2887721061706543, + "rewards/rejected": -4.290899753570557, + "step": 4751 + }, + { + "epoch": 0.55, + "learning_rate": 1.377267938663233e-07, + "logits/chosen": -1.3788636922836304, + "logits/rejected": -1.578892707824707, + "logps/chosen": -266.51776123046875, + "logps/rejected": -342.57904052734375, + "loss": 0.6128, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7796976566314697, + "rewards/margins": 1.1142522096633911, + "rewards/rejected": -2.8939499855041504, + "step": 4752 + }, + { + "epoch": 0.55, + "learning_rate": 1.3769167739669905e-07, + "logits/chosen": -1.912524938583374, + "logits/rejected": -2.194164276123047, + "logps/chosen": -306.6999206542969, + "logps/rejected": -279.23651123046875, + "loss": 0.2019, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16616767644882202, + "rewards/margins": 2.6835765838623047, + "rewards/rejected": -2.8497443199157715, + "step": 4753 + }, + { + "epoch": 0.55, + "learning_rate": 1.376565609270748e-07, + "logits/chosen": -2.8601436614990234, + "logits/rejected": -2.7384276390075684, + "logps/chosen": -221.17140197753906, + "logps/rejected": -206.8769073486328, + "loss": 0.3914, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4838695526123047, + "rewards/margins": 1.3911497592926025, + "rewards/rejected": -2.875019073486328, + "step": 4754 + }, + { + "epoch": 0.55, + "learning_rate": 1.3762144445745055e-07, + "logits/chosen": -2.7166593074798584, + "logits/rejected": -2.545074462890625, + "logps/chosen": -265.1591491699219, + "logps/rejected": -331.2013244628906, + "loss": 0.2324, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5660688877105713, + "rewards/margins": 1.7172932624816895, + "rewards/rejected": -3.28336238861084, + "step": 4755 + }, + { + "epoch": 0.55, + "learning_rate": 1.3758632798782628e-07, + "logits/chosen": -1.776551604270935, + "logits/rejected": -1.703042984008789, + "logps/chosen": -368.6415100097656, + "logps/rejected": -349.08636474609375, + "loss": 0.5443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7200173735618591, + "rewards/margins": 1.4073207378387451, + "rewards/rejected": -2.12733793258667, + "step": 4756 + }, + { + "epoch": 0.55, + "learning_rate": 1.3755121151820203e-07, + "logits/chosen": -2.4505724906921387, + "logits/rejected": -2.36240291595459, + "logps/chosen": -257.8924865722656, + "logps/rejected": -312.1694030761719, + "loss": 0.2785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2204870730638504, + "rewards/margins": 2.192443609237671, + "rewards/rejected": -2.412930727005005, + "step": 4757 + }, + { + "epoch": 0.55, + "learning_rate": 1.375160950485778e-07, + "logits/chosen": -2.5045735836029053, + "logits/rejected": -2.2926502227783203, + "logps/chosen": -358.1525573730469, + "logps/rejected": -349.735107421875, + "loss": 0.2883, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0578676462173462, + "rewards/margins": 1.4204809665679932, + "rewards/rejected": -2.478348731994629, + "step": 4758 + }, + { + "epoch": 0.55, + "learning_rate": 1.3748097857895352e-07, + "logits/chosen": -2.2345762252807617, + "logits/rejected": -2.209785223007202, + "logps/chosen": -277.5191650390625, + "logps/rejected": -212.08331298828125, + "loss": 0.4419, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06819094717502594, + "rewards/margins": 1.801814317703247, + "rewards/rejected": -1.7336235046386719, + "step": 4759 + }, + { + "epoch": 0.55, + "learning_rate": 1.3744586210932927e-07, + "logits/chosen": -2.1709465980529785, + "logits/rejected": -2.6257269382476807, + "logps/chosen": -214.5848388671875, + "logps/rejected": -152.8439178466797, + "loss": 0.657, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3375577926635742, + "rewards/margins": 1.8206528425216675, + "rewards/rejected": -3.1582107543945312, + "step": 4760 + }, + { + "epoch": 0.55, + "learning_rate": 1.3741074563970502e-07, + "logits/chosen": -2.4657111167907715, + "logits/rejected": -2.6337666511535645, + "logps/chosen": -284.5972900390625, + "logps/rejected": -243.11898803710938, + "loss": 1.2277, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6396567821502686, + "rewards/margins": -0.39647507667541504, + "rewards/rejected": -1.2431817054748535, + "step": 4761 + }, + { + "epoch": 0.55, + "learning_rate": 1.3737562917008075e-07, + "logits/chosen": -1.6151541471481323, + "logits/rejected": -1.7302676439285278, + "logps/chosen": -539.0516967773438, + "logps/rejected": -487.12567138671875, + "loss": 0.2861, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0289115905761719, + "rewards/margins": 2.364896297454834, + "rewards/rejected": -3.3938076496124268, + "step": 4762 + }, + { + "epoch": 0.55, + "learning_rate": 1.373405127004565e-07, + "logits/chosen": -2.4983551502227783, + "logits/rejected": -2.425229549407959, + "logps/chosen": -113.59066772460938, + "logps/rejected": -227.5660400390625, + "loss": 0.503, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.627752661705017, + "rewards/margins": 1.3025739192962646, + "rewards/rejected": -2.930326461791992, + "step": 4763 + }, + { + "epoch": 0.55, + "learning_rate": 1.3730539623083226e-07, + "logits/chosen": -2.1206703186035156, + "logits/rejected": -2.0879838466644287, + "logps/chosen": -167.17584228515625, + "logps/rejected": -230.98171997070312, + "loss": 0.6537, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7349832057952881, + "rewards/margins": 0.7892998456954956, + "rewards/rejected": -1.5242830514907837, + "step": 4764 + }, + { + "epoch": 0.55, + "learning_rate": 1.3727027976120799e-07, + "logits/chosen": -1.8827470541000366, + "logits/rejected": -2.069988965988159, + "logps/chosen": -268.3455810546875, + "logps/rejected": -213.40249633789062, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2255181074142456, + "rewards/margins": 2.159513473510742, + "rewards/rejected": -3.3850314617156982, + "step": 4765 + }, + { + "epoch": 0.55, + "learning_rate": 1.3723516329158374e-07, + "logits/chosen": -2.3392183780670166, + "logits/rejected": -2.4722962379455566, + "logps/chosen": -328.1591796875, + "logps/rejected": -281.052978515625, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0158942937850952, + "rewards/margins": 3.8411223888397217, + "rewards/rejected": -4.857016563415527, + "step": 4766 + }, + { + "epoch": 0.55, + "learning_rate": 1.372000468219595e-07, + "logits/chosen": -2.9582560062408447, + "logits/rejected": -3.031214952468872, + "logps/chosen": -255.09774780273438, + "logps/rejected": -355.59393310546875, + "loss": 0.5094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5924451351165771, + "rewards/margins": 1.1977930068969727, + "rewards/rejected": -2.790238380432129, + "step": 4767 + }, + { + "epoch": 0.55, + "learning_rate": 1.3716493035233525e-07, + "logits/chosen": -2.4220705032348633, + "logits/rejected": -2.529585838317871, + "logps/chosen": -285.5322265625, + "logps/rejected": -270.87872314453125, + "loss": 0.2045, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19355684518814087, + "rewards/margins": 3.595775604248047, + "rewards/rejected": -3.789332389831543, + "step": 4768 + }, + { + "epoch": 0.55, + "learning_rate": 1.3712981388271098e-07, + "logits/chosen": -2.069401264190674, + "logits/rejected": -2.147836208343506, + "logps/chosen": -387.49176025390625, + "logps/rejected": -350.857177734375, + "loss": 0.3382, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7505100965499878, + "rewards/margins": 2.9087257385253906, + "rewards/rejected": -4.659235954284668, + "step": 4769 + }, + { + "epoch": 0.55, + "learning_rate": 1.3709469741308673e-07, + "logits/chosen": -2.6859631538391113, + "logits/rejected": -2.6411752700805664, + "logps/chosen": -243.14617919921875, + "logps/rejected": -301.5563049316406, + "loss": 0.4484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6823689937591553, + "rewards/margins": 1.1419978141784668, + "rewards/rejected": -1.824366807937622, + "step": 4770 + }, + { + "epoch": 0.55, + "learning_rate": 1.3705958094346248e-07, + "logits/chosen": -1.7534584999084473, + "logits/rejected": -1.9895716905593872, + "logps/chosen": -453.5174865722656, + "logps/rejected": -264.55950927734375, + "loss": 0.3336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6417570114135742, + "rewards/margins": 2.3244338035583496, + "rewards/rejected": -2.966190814971924, + "step": 4771 + }, + { + "epoch": 0.55, + "learning_rate": 1.3702446447383824e-07, + "logits/chosen": -2.77999210357666, + "logits/rejected": -2.6756339073181152, + "logps/chosen": -271.7590637207031, + "logps/rejected": -261.004150390625, + "loss": 0.3362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5112205743789673, + "rewards/margins": 1.649004340171814, + "rewards/rejected": -2.1602249145507812, + "step": 4772 + }, + { + "epoch": 0.55, + "learning_rate": 1.3698934800421396e-07, + "logits/chosen": -2.1617588996887207, + "logits/rejected": -2.2363767623901367, + "logps/chosen": -310.5322570800781, + "logps/rejected": -278.7085876464844, + "loss": 0.7632, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0253801345825195, + "rewards/margins": 1.2944202423095703, + "rewards/rejected": -2.31980037689209, + "step": 4773 + }, + { + "epoch": 0.55, + "learning_rate": 1.3695423153458972e-07, + "logits/chosen": -2.7142298221588135, + "logits/rejected": -2.6769914627075195, + "logps/chosen": -151.8915252685547, + "logps/rejected": -205.11924743652344, + "loss": 0.5564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7489889860153198, + "rewards/margins": 1.6369432210922241, + "rewards/rejected": -2.385932207107544, + "step": 4774 + }, + { + "epoch": 0.55, + "learning_rate": 1.3691911506496547e-07, + "logits/chosen": -2.286928653717041, + "logits/rejected": -1.987725019454956, + "logps/chosen": -419.28717041015625, + "logps/rejected": -419.3485107421875, + "loss": 0.1849, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6812472343444824, + "rewards/margins": 3.0018272399902344, + "rewards/rejected": -4.683074474334717, + "step": 4775 + }, + { + "epoch": 0.55, + "learning_rate": 1.368839985953412e-07, + "logits/chosen": -2.3417251110076904, + "logits/rejected": -2.072990894317627, + "logps/chosen": -198.09068298339844, + "logps/rejected": -215.40963745117188, + "loss": 0.5646, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4087131023406982, + "rewards/margins": 0.9551293849945068, + "rewards/rejected": -2.363842487335205, + "step": 4776 + }, + { + "epoch": 0.55, + "learning_rate": 1.3684888212571695e-07, + "logits/chosen": -2.503267526626587, + "logits/rejected": -2.6462178230285645, + "logps/chosen": -156.57449340820312, + "logps/rejected": -167.30345153808594, + "loss": 0.268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6747316718101501, + "rewards/margins": 2.9542675018310547, + "rewards/rejected": -3.6289992332458496, + "step": 4777 + }, + { + "epoch": 0.55, + "learning_rate": 1.368137656560927e-07, + "logits/chosen": -2.392011880874634, + "logits/rejected": -2.3408303260803223, + "logps/chosen": -475.121337890625, + "logps/rejected": -416.6802978515625, + "loss": 1.2933, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7007205486297607, + "rewards/margins": 0.539596676826477, + "rewards/rejected": -2.2403175830841064, + "step": 4778 + }, + { + "epoch": 0.55, + "learning_rate": 1.3677864918646846e-07, + "logits/chosen": -2.100353479385376, + "logits/rejected": -2.2841649055480957, + "logps/chosen": -270.538330078125, + "logps/rejected": -450.7897644042969, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.519186794757843, + "rewards/margins": 4.522678375244141, + "rewards/rejected": -5.041865348815918, + "step": 4779 + }, + { + "epoch": 0.55, + "learning_rate": 1.367435327168442e-07, + "logits/chosen": -2.715243101119995, + "logits/rejected": -2.683239459991455, + "logps/chosen": -241.65045166015625, + "logps/rejected": -318.4117431640625, + "loss": 0.3902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.907753586769104, + "rewards/margins": 2.2502450942993164, + "rewards/rejected": -3.157998561859131, + "step": 4780 + }, + { + "epoch": 0.55, + "learning_rate": 1.3670841624721994e-07, + "logits/chosen": -2.458416223526001, + "logits/rejected": -2.488785743713379, + "logps/chosen": -197.3876953125, + "logps/rejected": -268.0456848144531, + "loss": 0.1483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005362644791603088, + "rewards/margins": 3.227083206176758, + "rewards/rejected": -3.2217206954956055, + "step": 4781 + }, + { + "epoch": 0.55, + "learning_rate": 1.3667329977759567e-07, + "logits/chosen": -2.976411819458008, + "logits/rejected": -2.8608415126800537, + "logps/chosen": -154.41287231445312, + "logps/rejected": -331.93487548828125, + "loss": 0.2117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12858828902244568, + "rewards/margins": 3.002018451690674, + "rewards/rejected": -3.1306064128875732, + "step": 4782 + }, + { + "epoch": 0.55, + "learning_rate": 1.3663818330797145e-07, + "logits/chosen": -2.6122477054595947, + "logits/rejected": -2.533196210861206, + "logps/chosen": -445.033447265625, + "logps/rejected": -282.23193359375, + "loss": 0.1716, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.00445556640625, + "rewards/margins": 2.116024971008301, + "rewards/rejected": -3.120480537414551, + "step": 4783 + }, + { + "epoch": 0.55, + "learning_rate": 1.3660306683834718e-07, + "logits/chosen": -2.586733818054199, + "logits/rejected": -2.8027830123901367, + "logps/chosen": -386.2651062011719, + "logps/rejected": -359.5486145019531, + "loss": 0.31, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7972564101219177, + "rewards/margins": 2.956162691116333, + "rewards/rejected": -3.7534193992614746, + "step": 4784 + }, + { + "epoch": 0.55, + "learning_rate": 1.3656795036872293e-07, + "logits/chosen": -2.6999216079711914, + "logits/rejected": -2.313302993774414, + "logps/chosen": -312.5986022949219, + "logps/rejected": -350.8476867675781, + "loss": 0.461, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9161595106124878, + "rewards/margins": 3.5308969020843506, + "rewards/rejected": -4.447056293487549, + "step": 4785 + }, + { + "epoch": 0.55, + "learning_rate": 1.3653283389909866e-07, + "logits/chosen": -2.1755943298339844, + "logits/rejected": -2.091987371444702, + "logps/chosen": -235.96914672851562, + "logps/rejected": -350.08544921875, + "loss": 0.5065, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8777138590812683, + "rewards/margins": 1.8180248737335205, + "rewards/rejected": -2.6957387924194336, + "step": 4786 + }, + { + "epoch": 0.55, + "learning_rate": 1.364977174294744e-07, + "logits/chosen": -2.0291223526000977, + "logits/rejected": -2.0250911712646484, + "logps/chosen": -445.70648193359375, + "logps/rejected": -367.69708251953125, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7428807020187378, + "rewards/margins": 2.0099658966064453, + "rewards/rejected": -2.7528467178344727, + "step": 4787 + }, + { + "epoch": 0.55, + "learning_rate": 1.3646260095985017e-07, + "logits/chosen": -2.4536986351013184, + "logits/rejected": -2.251768112182617, + "logps/chosen": -186.5784912109375, + "logps/rejected": -354.7635803222656, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0253521129488945, + "rewards/margins": 3.2515366077423096, + "rewards/rejected": -3.276888847351074, + "step": 4788 + }, + { + "epoch": 0.55, + "learning_rate": 1.3642748449022592e-07, + "logits/chosen": -1.6608468294143677, + "logits/rejected": -2.0442655086517334, + "logps/chosen": -261.11444091796875, + "logps/rejected": -181.04330444335938, + "loss": 0.7146, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1283258199691772, + "rewards/margins": 0.4800355136394501, + "rewards/rejected": -1.6083612442016602, + "step": 4789 + }, + { + "epoch": 0.55, + "learning_rate": 1.3639236802060165e-07, + "logits/chosen": -2.7576088905334473, + "logits/rejected": -2.4380035400390625, + "logps/chosen": -288.6534423828125, + "logps/rejected": -341.25927734375, + "loss": 0.123, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.165339469909668, + "rewards/margins": 3.8507604598999023, + "rewards/rejected": -5.01609992980957, + "step": 4790 + }, + { + "epoch": 0.55, + "learning_rate": 1.363572515509774e-07, + "logits/chosen": -2.045961380004883, + "logits/rejected": -2.43314790725708, + "logps/chosen": -254.54905700683594, + "logps/rejected": -185.78567504882812, + "loss": 0.3294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8904281258583069, + "rewards/margins": 2.193471908569336, + "rewards/rejected": -3.083899974822998, + "step": 4791 + }, + { + "epoch": 0.55, + "learning_rate": 1.3632213508135316e-07, + "logits/chosen": -2.3624446392059326, + "logits/rejected": -2.1967461109161377, + "logps/chosen": -178.79469299316406, + "logps/rejected": -120.94507598876953, + "loss": 0.46, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8968144655227661, + "rewards/margins": 1.0211989879608154, + "rewards/rejected": -1.918013334274292, + "step": 4792 + }, + { + "epoch": 0.55, + "learning_rate": 1.3628701861172888e-07, + "logits/chosen": -2.170593738555908, + "logits/rejected": -2.2826461791992188, + "logps/chosen": -188.57675170898438, + "logps/rejected": -181.47650146484375, + "loss": 0.3781, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.293220043182373, + "rewards/margins": 1.3952864408493042, + "rewards/rejected": -2.6885063648223877, + "step": 4793 + }, + { + "epoch": 0.55, + "learning_rate": 1.3625190214210464e-07, + "logits/chosen": -2.2324304580688477, + "logits/rejected": -2.079941749572754, + "logps/chosen": -242.65200805664062, + "logps/rejected": -218.8368377685547, + "loss": 0.406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5207877159118652, + "rewards/margins": 1.840127944946289, + "rewards/rejected": -2.3609158992767334, + "step": 4794 + }, + { + "epoch": 0.55, + "learning_rate": 1.362167856724804e-07, + "logits/chosen": -2.3446483612060547, + "logits/rejected": -2.057300329208374, + "logps/chosen": -189.5357666015625, + "logps/rejected": -305.0023193359375, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6400583982467651, + "rewards/margins": 3.8151559829711914, + "rewards/rejected": -4.455214500427246, + "step": 4795 + }, + { + "epoch": 0.55, + "learning_rate": 1.3618166920285614e-07, + "logits/chosen": -1.839255452156067, + "logits/rejected": -1.9131402969360352, + "logps/chosen": -392.2235107421875, + "logps/rejected": -220.7092742919922, + "loss": 0.3594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7565640807151794, + "rewards/margins": 1.3498895168304443, + "rewards/rejected": -2.1064536571502686, + "step": 4796 + }, + { + "epoch": 0.55, + "learning_rate": 1.3614655273323187e-07, + "logits/chosen": -2.3139684200286865, + "logits/rejected": -2.360741138458252, + "logps/chosen": -141.66317749023438, + "logps/rejected": -164.82884216308594, + "loss": 0.6641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09536132216453552, + "rewards/margins": 1.671263575553894, + "rewards/rejected": -1.766624927520752, + "step": 4797 + }, + { + "epoch": 0.55, + "learning_rate": 1.3611143626360763e-07, + "logits/chosen": -1.9260863065719604, + "logits/rejected": -1.7395930290222168, + "logps/chosen": -264.0608215332031, + "logps/rejected": -325.56488037109375, + "loss": 0.2607, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47322437167167664, + "rewards/margins": 2.606813430786133, + "rewards/rejected": -3.080037832260132, + "step": 4798 + }, + { + "epoch": 0.55, + "learning_rate": 1.3607631979398338e-07, + "logits/chosen": -2.4778008460998535, + "logits/rejected": -2.5940232276916504, + "logps/chosen": -198.496337890625, + "logps/rejected": -299.6545104980469, + "loss": 0.382, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6993316411972046, + "rewards/margins": 2.1806511878967285, + "rewards/rejected": -3.8799829483032227, + "step": 4799 + }, + { + "epoch": 0.55, + "learning_rate": 1.3604120332435913e-07, + "logits/chosen": -2.0307388305664062, + "logits/rejected": -2.359820604324341, + "logps/chosen": -276.98974609375, + "logps/rejected": -174.22190856933594, + "loss": 0.2642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5065451264381409, + "rewards/margins": 1.6279524564743042, + "rewards/rejected": -2.13449764251709, + "step": 4800 + }, + { + "epoch": 0.55, + "learning_rate": 1.3600608685473486e-07, + "logits/chosen": -2.4220056533813477, + "logits/rejected": -2.7199504375457764, + "logps/chosen": -398.2153015136719, + "logps/rejected": -326.6436462402344, + "loss": 0.1402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.70918208360672, + "rewards/margins": 4.1859588623046875, + "rewards/rejected": -4.895140647888184, + "step": 4801 + }, + { + "epoch": 0.55, + "learning_rate": 1.3597097038511061e-07, + "logits/chosen": -2.1157498359680176, + "logits/rejected": -2.4377331733703613, + "logps/chosen": -318.9798583984375, + "logps/rejected": -332.884765625, + "loss": 0.3713, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2917389869689941, + "rewards/margins": 2.1105809211730957, + "rewards/rejected": -3.40231990814209, + "step": 4802 + }, + { + "epoch": 0.55, + "learning_rate": 1.3593585391548637e-07, + "logits/chosen": -2.2657692432403564, + "logits/rejected": -2.1352479457855225, + "logps/chosen": -336.0774230957031, + "logps/rejected": -516.7429809570312, + "loss": 0.4536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8948569893836975, + "rewards/margins": 2.272714614868164, + "rewards/rejected": -3.167571544647217, + "step": 4803 + }, + { + "epoch": 0.55, + "learning_rate": 1.359007374458621e-07, + "logits/chosen": -2.506450891494751, + "logits/rejected": -2.6095123291015625, + "logps/chosen": -123.25494384765625, + "logps/rejected": -241.06991577148438, + "loss": 0.4618, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5231615304946899, + "rewards/margins": 1.519148349761963, + "rewards/rejected": -2.0423097610473633, + "step": 4804 + }, + { + "epoch": 0.55, + "learning_rate": 1.3586562097623785e-07, + "logits/chosen": -2.068239688873291, + "logits/rejected": -2.2230732440948486, + "logps/chosen": -535.3038940429688, + "logps/rejected": -383.5375061035156, + "loss": 0.1722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7486248016357422, + "rewards/margins": 1.8365799188613892, + "rewards/rejected": -2.585204839706421, + "step": 4805 + }, + { + "epoch": 0.55, + "learning_rate": 1.358305045066136e-07, + "logits/chosen": -2.256552219390869, + "logits/rejected": -2.555325984954834, + "logps/chosen": -440.5546875, + "logps/rejected": -301.87066650390625, + "loss": 0.2214, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5031406879425049, + "rewards/margins": 3.1463491916656494, + "rewards/rejected": -3.6494898796081543, + "step": 4806 + }, + { + "epoch": 0.55, + "learning_rate": 1.3579538803698936e-07, + "logits/chosen": -1.8375190496444702, + "logits/rejected": -1.6884878873825073, + "logps/chosen": -254.41717529296875, + "logps/rejected": -224.22348022460938, + "loss": 0.8478, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0451855659484863, + "rewards/margins": 0.6035709381103516, + "rewards/rejected": -2.648756742477417, + "step": 4807 + }, + { + "epoch": 0.55, + "learning_rate": 1.3576027156736508e-07, + "logits/chosen": -1.9568248987197876, + "logits/rejected": -2.383301258087158, + "logps/chosen": -264.3798828125, + "logps/rejected": -250.001708984375, + "loss": 0.7469, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0234601497650146, + "rewards/margins": 0.11978283524513245, + "rewards/rejected": -1.1432429552078247, + "step": 4808 + }, + { + "epoch": 0.55, + "learning_rate": 1.3572515509774084e-07, + "logits/chosen": -2.2977161407470703, + "logits/rejected": -2.070328950881958, + "logps/chosen": -117.67855072021484, + "logps/rejected": -244.7462615966797, + "loss": 0.231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8273755311965942, + "rewards/margins": 2.027289390563965, + "rewards/rejected": -2.8546648025512695, + "step": 4809 + }, + { + "epoch": 0.55, + "learning_rate": 1.3569003862811657e-07, + "logits/chosen": -2.1347815990448, + "logits/rejected": -2.139158248901367, + "logps/chosen": -185.92825317382812, + "logps/rejected": -212.6764373779297, + "loss": 0.5108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9044349789619446, + "rewards/margins": 1.4701344966888428, + "rewards/rejected": -2.3745694160461426, + "step": 4810 + }, + { + "epoch": 0.55, + "learning_rate": 1.3565492215849232e-07, + "logits/chosen": -2.1587963104248047, + "logits/rejected": -2.1053762435913086, + "logps/chosen": -380.6593017578125, + "logps/rejected": -290.98687744140625, + "loss": 0.2346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5797098278999329, + "rewards/margins": 2.074212074279785, + "rewards/rejected": -2.6539218425750732, + "step": 4811 + }, + { + "epoch": 0.55, + "learning_rate": 1.3561980568886807e-07, + "logits/chosen": -2.3420183658599854, + "logits/rejected": -2.324657440185547, + "logps/chosen": -191.88584899902344, + "logps/rejected": -233.16622924804688, + "loss": 0.3128, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.590413510799408, + "rewards/margins": 2.039586305618286, + "rewards/rejected": -2.6299996376037598, + "step": 4812 + }, + { + "epoch": 0.55, + "learning_rate": 1.3558468921924383e-07, + "logits/chosen": -2.4375438690185547, + "logits/rejected": -2.233778953552246, + "logps/chosen": -281.6270751953125, + "logps/rejected": -272.61663818359375, + "loss": 0.223, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24826905131340027, + "rewards/margins": 2.497628688812256, + "rewards/rejected": -2.7458977699279785, + "step": 4813 + }, + { + "epoch": 0.55, + "learning_rate": 1.3554957274961955e-07, + "logits/chosen": -2.533656120300293, + "logits/rejected": -2.7519125938415527, + "logps/chosen": -220.59877014160156, + "logps/rejected": -223.5734405517578, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35747456550598145, + "rewards/margins": 2.7447876930236816, + "rewards/rejected": -3.102262258529663, + "step": 4814 + }, + { + "epoch": 0.56, + "learning_rate": 1.355144562799953e-07, + "logits/chosen": -2.2267942428588867, + "logits/rejected": -2.084888219833374, + "logps/chosen": -107.00630950927734, + "logps/rejected": -182.64024353027344, + "loss": 0.3319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4779113531112671, + "rewards/margins": 3.7533888816833496, + "rewards/rejected": -4.231300354003906, + "step": 4815 + }, + { + "epoch": 0.56, + "learning_rate": 1.3547933981037106e-07, + "logits/chosen": -1.9460666179656982, + "logits/rejected": -2.004321575164795, + "logps/chosen": -294.30133056640625, + "logps/rejected": -222.36190795898438, + "loss": 0.4868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8507446050643921, + "rewards/margins": 1.5995063781738281, + "rewards/rejected": -2.4502511024475098, + "step": 4816 + }, + { + "epoch": 0.56, + "learning_rate": 1.3544422334074682e-07, + "logits/chosen": -2.4957170486450195, + "logits/rejected": -2.3248448371887207, + "logps/chosen": -319.8683166503906, + "logps/rejected": -276.8327331542969, + "loss": 0.4679, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6352083086967468, + "rewards/margins": 1.5930490493774414, + "rewards/rejected": -2.228257417678833, + "step": 4817 + }, + { + "epoch": 0.56, + "learning_rate": 1.3540910687112254e-07, + "logits/chosen": -2.0885839462280273, + "logits/rejected": -2.123796224594116, + "logps/chosen": -374.7470703125, + "logps/rejected": -240.86192321777344, + "loss": 0.5412, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6879711151123047, + "rewards/margins": 0.9923352003097534, + "rewards/rejected": -1.680306315422058, + "step": 4818 + }, + { + "epoch": 0.56, + "learning_rate": 1.353739904014983e-07, + "logits/chosen": -1.9728333950042725, + "logits/rejected": -2.0664501190185547, + "logps/chosen": -225.91021728515625, + "logps/rejected": -201.63926696777344, + "loss": 0.2333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.806307315826416, + "rewards/margins": 2.022404670715332, + "rewards/rejected": -2.828712224960327, + "step": 4819 + }, + { + "epoch": 0.56, + "learning_rate": 1.3533887393187405e-07, + "logits/chosen": -2.327371835708618, + "logits/rejected": -2.417142868041992, + "logps/chosen": -211.29425048828125, + "logps/rejected": -150.62969970703125, + "loss": 0.5849, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5488197207450867, + "rewards/margins": 1.041601538658142, + "rewards/rejected": -1.590421199798584, + "step": 4820 + }, + { + "epoch": 0.56, + "learning_rate": 1.3530375746224978e-07, + "logits/chosen": -2.5507125854492188, + "logits/rejected": -2.3261005878448486, + "logps/chosen": -191.20339965820312, + "logps/rejected": -188.9502410888672, + "loss": 0.7273, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3175170421600342, + "rewards/margins": 0.43799489736557007, + "rewards/rejected": -1.755511999130249, + "step": 4821 + }, + { + "epoch": 0.56, + "learning_rate": 1.3526864099262553e-07, + "logits/chosen": -2.252774238586426, + "logits/rejected": -2.410229444503784, + "logps/chosen": -296.0897521972656, + "logps/rejected": -358.5444030761719, + "loss": 0.1807, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2970284223556519, + "rewards/margins": 3.3883872032165527, + "rewards/rejected": -4.685415744781494, + "step": 4822 + }, + { + "epoch": 0.56, + "learning_rate": 1.3523352452300129e-07, + "logits/chosen": -2.7393980026245117, + "logits/rejected": -2.745945930480957, + "logps/chosen": -321.1591796875, + "logps/rejected": -306.70758056640625, + "loss": 0.4002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9225557446479797, + "rewards/margins": 1.3042807579040527, + "rewards/rejected": -2.2268364429473877, + "step": 4823 + }, + { + "epoch": 0.56, + "learning_rate": 1.3519840805337704e-07, + "logits/chosen": -3.031780481338501, + "logits/rejected": -2.953822135925293, + "logps/chosen": -309.4754638671875, + "logps/rejected": -312.379638671875, + "loss": 0.2553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33312317728996277, + "rewards/margins": 5.235001564025879, + "rewards/rejected": -5.568124771118164, + "step": 4824 + }, + { + "epoch": 0.56, + "learning_rate": 1.3516329158375277e-07, + "logits/chosen": -2.0574851036071777, + "logits/rejected": -2.297445297241211, + "logps/chosen": -256.41107177734375, + "logps/rejected": -182.76539611816406, + "loss": 1.1285, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2841029167175293, + "rewards/margins": 0.7850856184959412, + "rewards/rejected": -2.069188356399536, + "step": 4825 + }, + { + "epoch": 0.56, + "learning_rate": 1.3512817511412852e-07, + "logits/chosen": -2.257132053375244, + "logits/rejected": -2.0890016555786133, + "logps/chosen": -126.13517761230469, + "logps/rejected": -227.54783630371094, + "loss": 0.2067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36975976824760437, + "rewards/margins": 2.257255792617798, + "rewards/rejected": -2.6270153522491455, + "step": 4826 + }, + { + "epoch": 0.56, + "learning_rate": 1.3509305864450425e-07, + "logits/chosen": -2.087251663208008, + "logits/rejected": -2.214742422103882, + "logps/chosen": -351.663330078125, + "logps/rejected": -254.70335388183594, + "loss": 0.1638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8526857495307922, + "rewards/margins": 2.52079439163208, + "rewards/rejected": -3.3734803199768066, + "step": 4827 + }, + { + "epoch": 0.56, + "learning_rate": 1.3505794217488003e-07, + "logits/chosen": -2.2456822395324707, + "logits/rejected": -2.0735559463500977, + "logps/chosen": -279.6858215332031, + "logps/rejected": -380.7767333984375, + "loss": 0.1934, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44873905181884766, + "rewards/margins": 3.7548086643218994, + "rewards/rejected": -4.203547954559326, + "step": 4828 + }, + { + "epoch": 0.56, + "learning_rate": 1.3502282570525576e-07, + "logits/chosen": -1.7246952056884766, + "logits/rejected": -1.9301035404205322, + "logps/chosen": -375.13037109375, + "logps/rejected": -346.02349853515625, + "loss": 0.5227, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30883336067199707, + "rewards/margins": 1.3645919561386108, + "rewards/rejected": -1.673425316810608, + "step": 4829 + }, + { + "epoch": 0.56, + "learning_rate": 1.349877092356315e-07, + "logits/chosen": -2.571249008178711, + "logits/rejected": -2.717165946960449, + "logps/chosen": -188.72837829589844, + "logps/rejected": -84.94696807861328, + "loss": 0.5758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9457273483276367, + "rewards/margins": 0.5647879838943481, + "rewards/rejected": -1.5105152130126953, + "step": 4830 + }, + { + "epoch": 0.56, + "learning_rate": 1.3495259276600724e-07, + "logits/chosen": -2.0800204277038574, + "logits/rejected": -1.8072903156280518, + "logps/chosen": -196.95315551757812, + "logps/rejected": -263.1865539550781, + "loss": 0.754, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.25701642036438, + "rewards/margins": 0.687847375869751, + "rewards/rejected": -2.9448635578155518, + "step": 4831 + }, + { + "epoch": 0.56, + "learning_rate": 1.34917476296383e-07, + "logits/chosen": -1.7588632106781006, + "logits/rejected": -1.5654352903366089, + "logps/chosen": -345.01422119140625, + "logps/rejected": -368.0037841796875, + "loss": 0.3633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4038272500038147, + "rewards/margins": 2.946402072906494, + "rewards/rejected": -3.350229501724243, + "step": 4832 + }, + { + "epoch": 0.56, + "learning_rate": 1.3488235982675875e-07, + "logits/chosen": -2.6276731491088867, + "logits/rejected": -2.6331562995910645, + "logps/chosen": -504.70135498046875, + "logps/rejected": -577.5037231445312, + "loss": 0.4343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8926060199737549, + "rewards/margins": 2.2553889751434326, + "rewards/rejected": -3.1479952335357666, + "step": 4833 + }, + { + "epoch": 0.56, + "learning_rate": 1.348472433571345e-07, + "logits/chosen": -2.372121810913086, + "logits/rejected": -2.513054132461548, + "logps/chosen": -241.9613494873047, + "logps/rejected": -216.2640380859375, + "loss": 0.4359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9028833508491516, + "rewards/margins": 1.2450761795043945, + "rewards/rejected": -2.1479592323303223, + "step": 4834 + }, + { + "epoch": 0.56, + "learning_rate": 1.3481212688751023e-07, + "logits/chosen": -1.922997236251831, + "logits/rejected": -2.310943603515625, + "logps/chosen": -259.9210205078125, + "logps/rejected": -188.63006591796875, + "loss": 0.7593, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0768673419952393, + "rewards/margins": 1.5365599393844604, + "rewards/rejected": -2.61342716217041, + "step": 4835 + }, + { + "epoch": 0.56, + "learning_rate": 1.3477701041788598e-07, + "logits/chosen": -2.5063581466674805, + "logits/rejected": -2.5061893463134766, + "logps/chosen": -373.9140625, + "logps/rejected": -442.78515625, + "loss": 0.6889, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2061558961868286, + "rewards/margins": 3.67524790763855, + "rewards/rejected": -4.88140344619751, + "step": 4836 + }, + { + "epoch": 0.56, + "learning_rate": 1.3474189394826173e-07, + "logits/chosen": -2.3067116737365723, + "logits/rejected": -2.3953933715820312, + "logps/chosen": -131.4929656982422, + "logps/rejected": -130.47064208984375, + "loss": 0.2872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7610421180725098, + "rewards/margins": 1.9901611804962158, + "rewards/rejected": -2.7512032985687256, + "step": 4837 + }, + { + "epoch": 0.56, + "learning_rate": 1.3470677747863746e-07, + "logits/chosen": -2.536116123199463, + "logits/rejected": -2.5846052169799805, + "logps/chosen": -258.59527587890625, + "logps/rejected": -251.0260009765625, + "loss": 0.176, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0905274152755737, + "rewards/margins": 3.2770614624023438, + "rewards/rejected": -4.367588996887207, + "step": 4838 + }, + { + "epoch": 0.56, + "learning_rate": 1.3467166100901322e-07, + "logits/chosen": -1.9920032024383545, + "logits/rejected": -2.0701844692230225, + "logps/chosen": -322.38397216796875, + "logps/rejected": -239.53475952148438, + "loss": 0.3264, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2915087938308716, + "rewards/margins": 1.6064908504486084, + "rewards/rejected": -2.8979995250701904, + "step": 4839 + }, + { + "epoch": 0.56, + "learning_rate": 1.3463654453938897e-07, + "logits/chosen": -2.624277353286743, + "logits/rejected": -2.4167513847351074, + "logps/chosen": -187.3295135498047, + "logps/rejected": -362.908447265625, + "loss": 0.4445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8609055280685425, + "rewards/margins": 3.4215216636657715, + "rewards/rejected": -4.282426834106445, + "step": 4840 + }, + { + "epoch": 0.56, + "learning_rate": 1.3460142806976472e-07, + "logits/chosen": -2.7280983924865723, + "logits/rejected": -2.7718465328216553, + "logps/chosen": -267.72210693359375, + "logps/rejected": -340.72906494140625, + "loss": 0.4071, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3434876203536987, + "rewards/margins": 2.530824661254883, + "rewards/rejected": -3.874312400817871, + "step": 4841 + }, + { + "epoch": 0.56, + "learning_rate": 1.3456631160014045e-07, + "logits/chosen": -2.5638651847839355, + "logits/rejected": -2.6412174701690674, + "logps/chosen": -323.2056884765625, + "logps/rejected": -276.073486328125, + "loss": 0.466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5717781782150269, + "rewards/margins": 2.4859366416931152, + "rewards/rejected": -4.057714939117432, + "step": 4842 + }, + { + "epoch": 0.56, + "learning_rate": 1.345311951305162e-07, + "logits/chosen": -2.4625024795532227, + "logits/rejected": -2.4665110111236572, + "logps/chosen": -256.5439453125, + "logps/rejected": -302.19915771484375, + "loss": 0.3815, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8005316257476807, + "rewards/margins": 1.9637274742126465, + "rewards/rejected": -3.7642593383789062, + "step": 4843 + }, + { + "epoch": 0.56, + "learning_rate": 1.3449607866089196e-07, + "logits/chosen": -2.1729252338409424, + "logits/rejected": -2.467323064804077, + "logps/chosen": -228.91676330566406, + "logps/rejected": -292.58416748046875, + "loss": 0.2498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9546171426773071, + "rewards/margins": 3.4531683921813965, + "rewards/rejected": -4.407785415649414, + "step": 4844 + }, + { + "epoch": 0.56, + "learning_rate": 1.344609621912677e-07, + "logits/chosen": -2.5118207931518555, + "logits/rejected": -2.470383644104004, + "logps/chosen": -157.86395263671875, + "logps/rejected": -218.78436279296875, + "loss": 0.3888, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4210820198059082, + "rewards/margins": 1.7025401592254639, + "rewards/rejected": -3.123622417449951, + "step": 4845 + }, + { + "epoch": 0.56, + "learning_rate": 1.3442584572164344e-07, + "logits/chosen": -2.7200698852539062, + "logits/rejected": -2.6364686489105225, + "logps/chosen": -245.41226196289062, + "logps/rejected": -203.2798614501953, + "loss": 0.3673, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9191640615463257, + "rewards/margins": 1.8838987350463867, + "rewards/rejected": -2.803062677383423, + "step": 4846 + }, + { + "epoch": 0.56, + "learning_rate": 1.343907292520192e-07, + "logits/chosen": -2.2804501056671143, + "logits/rejected": -2.5459983348846436, + "logps/chosen": -265.4000244140625, + "logps/rejected": -281.5617370605469, + "loss": 0.4665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7049946188926697, + "rewards/margins": 2.0978622436523438, + "rewards/rejected": -2.802856922149658, + "step": 4847 + }, + { + "epoch": 0.56, + "learning_rate": 1.3435561278239495e-07, + "logits/chosen": -1.9443614482879639, + "logits/rejected": -1.908457636833191, + "logps/chosen": -330.4315185546875, + "logps/rejected": -308.06365966796875, + "loss": 0.3043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31668466329574585, + "rewards/margins": 1.8887419700622559, + "rewards/rejected": -2.2054264545440674, + "step": 4848 + }, + { + "epoch": 0.56, + "learning_rate": 1.3432049631277067e-07, + "logits/chosen": -2.6277711391448975, + "logits/rejected": -2.5453288555145264, + "logps/chosen": -210.15863037109375, + "logps/rejected": -258.4697570800781, + "loss": 0.3344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0430558919906616, + "rewards/margins": 1.869638442993164, + "rewards/rejected": -2.9126944541931152, + "step": 4849 + }, + { + "epoch": 0.56, + "learning_rate": 1.3428537984314643e-07, + "logits/chosen": -2.3769638538360596, + "logits/rejected": -2.371399402618408, + "logps/chosen": -397.6316223144531, + "logps/rejected": -380.7309265136719, + "loss": 0.2282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48701274394989014, + "rewards/margins": 2.596240520477295, + "rewards/rejected": -3.0832531452178955, + "step": 4850 + }, + { + "epoch": 0.56, + "learning_rate": 1.3425026337352218e-07, + "logits/chosen": -2.338921546936035, + "logits/rejected": -2.8079111576080322, + "logps/chosen": -237.60903930664062, + "logps/rejected": -360.9835205078125, + "loss": 0.2773, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19019365310668945, + "rewards/margins": 2.7151682376861572, + "rewards/rejected": -2.9053618907928467, + "step": 4851 + }, + { + "epoch": 0.56, + "learning_rate": 1.3421514690389794e-07, + "logits/chosen": -2.4430923461914062, + "logits/rejected": -2.6832995414733887, + "logps/chosen": -199.77210998535156, + "logps/rejected": -265.7386779785156, + "loss": 0.318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9154201745986938, + "rewards/margins": 2.340665340423584, + "rewards/rejected": -3.2560853958129883, + "step": 4852 + }, + { + "epoch": 0.56, + "learning_rate": 1.3418003043427366e-07, + "logits/chosen": -1.8539464473724365, + "logits/rejected": -2.04975962638855, + "logps/chosen": -334.9526062011719, + "logps/rejected": -264.18212890625, + "loss": 1.5834, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9827474355697632, + "rewards/margins": 1.6189630031585693, + "rewards/rejected": -3.601710081100464, + "step": 4853 + }, + { + "epoch": 0.56, + "learning_rate": 1.3414491396464942e-07, + "logits/chosen": -2.172433376312256, + "logits/rejected": -2.372342109680176, + "logps/chosen": -204.04666137695312, + "logps/rejected": -272.0843200683594, + "loss": 0.1738, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02740781381726265, + "rewards/margins": 2.635265588760376, + "rewards/rejected": -2.6078577041625977, + "step": 4854 + }, + { + "epoch": 0.56, + "learning_rate": 1.3410979749502515e-07, + "logits/chosen": -2.136354446411133, + "logits/rejected": -2.1507012844085693, + "logps/chosen": -364.2919921875, + "logps/rejected": -418.81439208984375, + "loss": 0.5475, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.025556802749634, + "rewards/margins": 1.7135978937149048, + "rewards/rejected": -3.739154577255249, + "step": 4855 + }, + { + "epoch": 0.56, + "learning_rate": 1.340746810254009e-07, + "logits/chosen": -2.3979945182800293, + "logits/rejected": -2.3786230087280273, + "logps/chosen": -184.09535217285156, + "logps/rejected": -300.2059020996094, + "loss": 0.2292, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0752995014190674, + "rewards/margins": 2.7183713912963867, + "rewards/rejected": -3.793670892715454, + "step": 4856 + }, + { + "epoch": 0.56, + "learning_rate": 1.3403956455577665e-07, + "logits/chosen": -2.5794882774353027, + "logits/rejected": -2.645292043685913, + "logps/chosen": -96.72673034667969, + "logps/rejected": -190.18887329101562, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5728324055671692, + "rewards/margins": 2.095517635345459, + "rewards/rejected": -2.6683502197265625, + "step": 4857 + }, + { + "epoch": 0.56, + "learning_rate": 1.340044480861524e-07, + "logits/chosen": -2.393421173095703, + "logits/rejected": -2.504747152328491, + "logps/chosen": -489.54180908203125, + "logps/rejected": -426.1352844238281, + "loss": 0.1709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6246083378791809, + "rewards/margins": 2.6739797592163086, + "rewards/rejected": -3.2985877990722656, + "step": 4858 + }, + { + "epoch": 0.56, + "learning_rate": 1.3396933161652813e-07, + "logits/chosen": -2.683087110519409, + "logits/rejected": -2.7984678745269775, + "logps/chosen": -262.3620300292969, + "logps/rejected": -275.670166015625, + "loss": 0.2071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1108541488647461, + "rewards/margins": 2.598660469055176, + "rewards/rejected": -2.7095141410827637, + "step": 4859 + }, + { + "epoch": 0.56, + "learning_rate": 1.339342151469039e-07, + "logits/chosen": -2.6220412254333496, + "logits/rejected": -2.425110340118408, + "logps/chosen": -195.59677124023438, + "logps/rejected": -215.57737731933594, + "loss": 0.3645, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6897997856140137, + "rewards/margins": 2.8012523651123047, + "rewards/rejected": -3.4910523891448975, + "step": 4860 + }, + { + "epoch": 0.56, + "learning_rate": 1.3389909867727964e-07, + "logits/chosen": -2.8573083877563477, + "logits/rejected": -2.7498979568481445, + "logps/chosen": -230.29397583007812, + "logps/rejected": -203.1072998046875, + "loss": 0.485, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2168370485305786, + "rewards/margins": 1.8078985214233398, + "rewards/rejected": -3.024735450744629, + "step": 4861 + }, + { + "epoch": 0.56, + "learning_rate": 1.338639822076554e-07, + "logits/chosen": -2.267040729522705, + "logits/rejected": -2.436944007873535, + "logps/chosen": -217.9496307373047, + "logps/rejected": -261.4281921386719, + "loss": 0.3446, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8850774765014648, + "rewards/margins": 2.7242798805236816, + "rewards/rejected": -4.6093573570251465, + "step": 4862 + }, + { + "epoch": 0.56, + "learning_rate": 1.3382886573803112e-07, + "logits/chosen": -2.5311198234558105, + "logits/rejected": -2.5330653190612793, + "logps/chosen": -324.1934509277344, + "logps/rejected": -246.23150634765625, + "loss": 0.5209, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2181252241134644, + "rewards/margins": 1.3532183170318604, + "rewards/rejected": -2.5713436603546143, + "step": 4863 + }, + { + "epoch": 0.56, + "learning_rate": 1.3379374926840688e-07, + "logits/chosen": -2.861323833465576, + "logits/rejected": -2.7684688568115234, + "logps/chosen": -199.6689453125, + "logps/rejected": -275.5961608886719, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.930914580821991, + "rewards/margins": 4.7317214012146, + "rewards/rejected": -5.662635803222656, + "step": 4864 + }, + { + "epoch": 0.56, + "learning_rate": 1.3375863279878263e-07, + "logits/chosen": -2.485644817352295, + "logits/rejected": -2.8809287548065186, + "logps/chosen": -344.8486328125, + "logps/rejected": -270.2098083496094, + "loss": 1.5619, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5356500148773193, + "rewards/margins": 0.7212586402893066, + "rewards/rejected": -3.256908416748047, + "step": 4865 + }, + { + "epoch": 0.56, + "learning_rate": 1.3372351632915836e-07, + "logits/chosen": -1.6640803813934326, + "logits/rejected": -2.2577099800109863, + "logps/chosen": -677.593994140625, + "logps/rejected": -321.55316162109375, + "loss": 0.4341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28615647554397583, + "rewards/margins": 1.894326090812683, + "rewards/rejected": -2.1804826259613037, + "step": 4866 + }, + { + "epoch": 0.56, + "learning_rate": 1.336883998595341e-07, + "logits/chosen": -2.66129994392395, + "logits/rejected": -2.6242775917053223, + "logps/chosen": -273.50213623046875, + "logps/rejected": -276.7961120605469, + "loss": 0.5449, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3447388410568237, + "rewards/margins": 1.6269534826278687, + "rewards/rejected": -2.9716923236846924, + "step": 4867 + }, + { + "epoch": 0.56, + "learning_rate": 1.3365328338990987e-07, + "logits/chosen": -2.616253137588501, + "logits/rejected": -2.5446574687957764, + "logps/chosen": -105.6913833618164, + "logps/rejected": -225.27224731445312, + "loss": 0.388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5982733368873596, + "rewards/margins": 2.865211009979248, + "rewards/rejected": -3.463484287261963, + "step": 4868 + }, + { + "epoch": 0.56, + "learning_rate": 1.3361816692028562e-07, + "logits/chosen": -2.509903907775879, + "logits/rejected": -2.4813475608825684, + "logps/chosen": -253.03271484375, + "logps/rejected": -516.9243774414062, + "loss": 0.21, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0142229795455933, + "rewards/margins": 2.6311516761779785, + "rewards/rejected": -3.6453747749328613, + "step": 4869 + }, + { + "epoch": 0.56, + "learning_rate": 1.3358305045066135e-07, + "logits/chosen": -2.184079885482788, + "logits/rejected": -2.4028985500335693, + "logps/chosen": -371.2828063964844, + "logps/rejected": -247.85145568847656, + "loss": 0.2969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5419096350669861, + "rewards/margins": 2.0485851764678955, + "rewards/rejected": -2.5904946327209473, + "step": 4870 + }, + { + "epoch": 0.56, + "learning_rate": 1.335479339810371e-07, + "logits/chosen": -2.051506757736206, + "logits/rejected": -2.36519718170166, + "logps/chosen": -415.91705322265625, + "logps/rejected": -331.8366394042969, + "loss": 0.2349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20464876294136047, + "rewards/margins": 1.8811111450195312, + "rewards/rejected": -2.0857601165771484, + "step": 4871 + }, + { + "epoch": 0.56, + "learning_rate": 1.3351281751141283e-07, + "logits/chosen": -2.8378233909606934, + "logits/rejected": -2.931849479675293, + "logps/chosen": -94.53404998779297, + "logps/rejected": -277.8387756347656, + "loss": 0.1923, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2528058886528015, + "rewards/margins": 2.255629777908325, + "rewards/rejected": -2.5084357261657715, + "step": 4872 + }, + { + "epoch": 0.56, + "learning_rate": 1.334777010417886e-07, + "logits/chosen": -2.9770915508270264, + "logits/rejected": -3.0821099281311035, + "logps/chosen": -182.44586181640625, + "logps/rejected": -215.1354522705078, + "loss": 0.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0514318943023682, + "rewards/margins": 2.1980552673339844, + "rewards/rejected": -3.2494866847991943, + "step": 4873 + }, + { + "epoch": 0.56, + "learning_rate": 1.3344258457216434e-07, + "logits/chosen": -2.480250835418701, + "logits/rejected": -2.58096981048584, + "logps/chosen": -258.8610534667969, + "logps/rejected": -174.78298950195312, + "loss": 0.7234, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.061375617980957, + "rewards/margins": 1.5865850448608398, + "rewards/rejected": -2.647960662841797, + "step": 4874 + }, + { + "epoch": 0.56, + "learning_rate": 1.334074681025401e-07, + "logits/chosen": -2.0937840938568115, + "logits/rejected": -2.189938545227051, + "logps/chosen": -412.83282470703125, + "logps/rejected": -316.1210021972656, + "loss": 0.2816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6416245698928833, + "rewards/margins": 2.645981788635254, + "rewards/rejected": -3.2876064777374268, + "step": 4875 + }, + { + "epoch": 0.56, + "learning_rate": 1.3337235163291582e-07, + "logits/chosen": -2.3487026691436768, + "logits/rejected": -2.541339874267578, + "logps/chosen": -286.98651123046875, + "logps/rejected": -143.7939453125, + "loss": 0.4233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44583427906036377, + "rewards/margins": 1.5509321689605713, + "rewards/rejected": -1.996766448020935, + "step": 4876 + }, + { + "epoch": 0.56, + "learning_rate": 1.3333723516329157e-07, + "logits/chosen": -1.7707841396331787, + "logits/rejected": -1.7658036947250366, + "logps/chosen": -496.3251647949219, + "logps/rejected": -386.42083740234375, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41707882285118103, + "rewards/margins": 2.1200664043426514, + "rewards/rejected": -2.5371453762054443, + "step": 4877 + }, + { + "epoch": 0.56, + "learning_rate": 1.3330211869366732e-07, + "logits/chosen": -2.0844571590423584, + "logits/rejected": -2.27370548248291, + "logps/chosen": -366.13909912109375, + "logps/rejected": -200.0029754638672, + "loss": 0.3365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3490062355995178, + "rewards/margins": 1.9496464729309082, + "rewards/rejected": -2.2986526489257812, + "step": 4878 + }, + { + "epoch": 0.56, + "learning_rate": 1.3326700222404308e-07, + "logits/chosen": -2.0419750213623047, + "logits/rejected": -1.9760794639587402, + "logps/chosen": -261.3287658691406, + "logps/rejected": -254.01632690429688, + "loss": 0.5622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5645545125007629, + "rewards/margins": 0.9660141468048096, + "rewards/rejected": -1.5305685997009277, + "step": 4879 + }, + { + "epoch": 0.56, + "learning_rate": 1.332318857544188e-07, + "logits/chosen": -2.56803560256958, + "logits/rejected": -2.4463374614715576, + "logps/chosen": -285.47772216796875, + "logps/rejected": -178.00868225097656, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21979819238185883, + "rewards/margins": 2.7027268409729004, + "rewards/rejected": -2.922525405883789, + "step": 4880 + }, + { + "epoch": 0.56, + "learning_rate": 1.3319676928479456e-07, + "logits/chosen": -2.6402347087860107, + "logits/rejected": -2.8146204948425293, + "logps/chosen": -415.8497009277344, + "logps/rejected": -299.5093078613281, + "loss": 0.4684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5105080604553223, + "rewards/margins": 1.6497355699539185, + "rewards/rejected": -2.1602437496185303, + "step": 4881 + }, + { + "epoch": 0.56, + "learning_rate": 1.3316165281517031e-07, + "logits/chosen": -2.1198160648345947, + "logits/rejected": -2.1782941818237305, + "logps/chosen": -349.337158203125, + "logps/rejected": -420.1539306640625, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5370103716850281, + "rewards/margins": 4.289882183074951, + "rewards/rejected": -4.826892375946045, + "step": 4882 + }, + { + "epoch": 0.56, + "learning_rate": 1.3312653634554604e-07, + "logits/chosen": -2.3810648918151855, + "logits/rejected": -2.398820161819458, + "logps/chosen": -308.3310546875, + "logps/rejected": -341.695556640625, + "loss": 0.8683, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7016468048095703, + "rewards/margins": 1.1110955476760864, + "rewards/rejected": -2.812742233276367, + "step": 4883 + }, + { + "epoch": 0.56, + "learning_rate": 1.330914198759218e-07, + "logits/chosen": -2.7815041542053223, + "logits/rejected": -2.948491096496582, + "logps/chosen": -287.08404541015625, + "logps/rejected": -268.71514892578125, + "loss": 0.134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7090652585029602, + "rewards/margins": 3.119553804397583, + "rewards/rejected": -3.8286192417144775, + "step": 4884 + }, + { + "epoch": 0.56, + "learning_rate": 1.3305630340629755e-07, + "logits/chosen": -2.1845216751098633, + "logits/rejected": -2.0145034790039062, + "logps/chosen": -218.43057250976562, + "logps/rejected": -245.12393188476562, + "loss": 0.2851, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5132563710212708, + "rewards/margins": 2.072495937347412, + "rewards/rejected": -2.585752248764038, + "step": 4885 + }, + { + "epoch": 0.56, + "learning_rate": 1.330211869366733e-07, + "logits/chosen": -2.1972246170043945, + "logits/rejected": -2.2028143405914307, + "logps/chosen": -275.3067321777344, + "logps/rejected": -263.8856201171875, + "loss": 0.258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9632389545440674, + "rewards/margins": 2.9616153240203857, + "rewards/rejected": -3.924854278564453, + "step": 4886 + }, + { + "epoch": 0.56, + "learning_rate": 1.3298607046704903e-07, + "logits/chosen": -2.0361857414245605, + "logits/rejected": -1.9877792596817017, + "logps/chosen": -364.4018859863281, + "logps/rejected": -404.2934875488281, + "loss": 0.3854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9521200656890869, + "rewards/margins": 1.4119327068328857, + "rewards/rejected": -2.3640527725219727, + "step": 4887 + }, + { + "epoch": 0.56, + "learning_rate": 1.3295095399742478e-07, + "logits/chosen": -2.1477670669555664, + "logits/rejected": -2.4686686992645264, + "logps/chosen": -153.5251007080078, + "logps/rejected": -165.31692504882812, + "loss": 0.6101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1105231046676636, + "rewards/margins": 0.7975252866744995, + "rewards/rejected": -1.908048391342163, + "step": 4888 + }, + { + "epoch": 0.56, + "learning_rate": 1.3291583752780054e-07, + "logits/chosen": -1.9802534580230713, + "logits/rejected": -1.994807243347168, + "logps/chosen": -193.39923095703125, + "logps/rejected": -201.96075439453125, + "loss": 0.4506, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4748305082321167, + "rewards/margins": 1.7114644050598145, + "rewards/rejected": -3.1862950325012207, + "step": 4889 + }, + { + "epoch": 0.56, + "learning_rate": 1.328807210581763e-07, + "logits/chosen": -2.0650546550750732, + "logits/rejected": -2.0636281967163086, + "logps/chosen": -299.1107482910156, + "logps/rejected": -323.3382263183594, + "loss": 0.6005, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9518091678619385, + "rewards/margins": 0.7706610560417175, + "rewards/rejected": -1.7224702835083008, + "step": 4890 + }, + { + "epoch": 0.56, + "learning_rate": 1.3284560458855202e-07, + "logits/chosen": -2.2551205158233643, + "logits/rejected": -2.3492791652679443, + "logps/chosen": -257.0169372558594, + "logps/rejected": -252.74786376953125, + "loss": 0.2854, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.810402512550354, + "rewards/margins": 1.6896336078643799, + "rewards/rejected": -2.5000362396240234, + "step": 4891 + }, + { + "epoch": 0.56, + "learning_rate": 1.3281048811892777e-07, + "logits/chosen": -2.2585737705230713, + "logits/rejected": -2.4116885662078857, + "logps/chosen": -284.1536865234375, + "logps/rejected": -269.0283203125, + "loss": 0.3496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4991920590400696, + "rewards/margins": 2.2899136543273926, + "rewards/rejected": -2.7891058921813965, + "step": 4892 + }, + { + "epoch": 0.56, + "learning_rate": 1.3277537164930353e-07, + "logits/chosen": -2.075917959213257, + "logits/rejected": -2.2678921222686768, + "logps/chosen": -311.3052978515625, + "logps/rejected": -166.1230010986328, + "loss": 0.6045, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8009999990463257, + "rewards/margins": 1.1986970901489258, + "rewards/rejected": -1.9996970891952515, + "step": 4893 + }, + { + "epoch": 0.56, + "learning_rate": 1.3274025517967925e-07, + "logits/chosen": -2.290196418762207, + "logits/rejected": -2.6348323822021484, + "logps/chosen": -333.5155029296875, + "logps/rejected": -290.2843017578125, + "loss": 0.4314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8095315098762512, + "rewards/margins": 1.9155594110488892, + "rewards/rejected": -2.725090980529785, + "step": 4894 + }, + { + "epoch": 0.56, + "learning_rate": 1.32705138710055e-07, + "logits/chosen": -1.9054462909698486, + "logits/rejected": -1.8116576671600342, + "logps/chosen": -278.64801025390625, + "logps/rejected": -395.562744140625, + "loss": 0.2695, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.939507007598877, + "rewards/margins": 1.9442362785339355, + "rewards/rejected": -2.8837432861328125, + "step": 4895 + }, + { + "epoch": 0.56, + "learning_rate": 1.3267002224043076e-07, + "logits/chosen": -2.3986339569091797, + "logits/rejected": -2.700502872467041, + "logps/chosen": -316.1549377441406, + "logps/rejected": -203.852783203125, + "loss": 0.212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19164659082889557, + "rewards/margins": 3.097360134124756, + "rewards/rejected": -3.2890067100524902, + "step": 4896 + }, + { + "epoch": 0.56, + "learning_rate": 1.3263490577080652e-07, + "logits/chosen": -2.887369394302368, + "logits/rejected": -2.7327470779418945, + "logps/chosen": -248.5577392578125, + "logps/rejected": -368.7901611328125, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20138603448867798, + "rewards/margins": 4.127945423126221, + "rewards/rejected": -4.329331398010254, + "step": 4897 + }, + { + "epoch": 0.56, + "learning_rate": 1.3259978930118224e-07, + "logits/chosen": -2.2471699714660645, + "logits/rejected": -2.1502156257629395, + "logps/chosen": -105.9600830078125, + "logps/rejected": -193.2117919921875, + "loss": 0.2329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4929523468017578, + "rewards/margins": 1.8557683229446411, + "rewards/rejected": -2.3487205505371094, + "step": 4898 + }, + { + "epoch": 0.56, + "learning_rate": 1.32564672831558e-07, + "logits/chosen": -2.41963791847229, + "logits/rejected": -2.4592437744140625, + "logps/chosen": -238.8378448486328, + "logps/rejected": -277.3547058105469, + "loss": 0.273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8469871282577515, + "rewards/margins": 2.334455966949463, + "rewards/rejected": -3.181443214416504, + "step": 4899 + }, + { + "epoch": 0.56, + "learning_rate": 1.3252955636193372e-07, + "logits/chosen": -2.0159759521484375, + "logits/rejected": -1.8176376819610596, + "logps/chosen": -142.6376953125, + "logps/rejected": -270.36688232421875, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26878821849823, + "rewards/margins": 2.6155173778533936, + "rewards/rejected": -2.884305715560913, + "step": 4900 + }, + { + "epoch": 0.56, + "learning_rate": 1.324944398923095e-07, + "logits/chosen": -2.653899908065796, + "logits/rejected": -2.8335793018341064, + "logps/chosen": -403.3122863769531, + "logps/rejected": -230.8260498046875, + "loss": 0.4197, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6081969141960144, + "rewards/margins": 1.8964173793792725, + "rewards/rejected": -2.5046138763427734, + "step": 4901 + }, + { + "epoch": 0.57, + "learning_rate": 1.3245932342268523e-07, + "logits/chosen": -2.6589102745056152, + "logits/rejected": -2.587904453277588, + "logps/chosen": -183.63839721679688, + "logps/rejected": -312.6148376464844, + "loss": 0.1557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8272925019264221, + "rewards/margins": 3.5662648677825928, + "rewards/rejected": -4.393557548522949, + "step": 4902 + }, + { + "epoch": 0.57, + "learning_rate": 1.3242420695306099e-07, + "logits/chosen": -2.4285788536071777, + "logits/rejected": -2.6321544647216797, + "logps/chosen": -235.64056396484375, + "logps/rejected": -153.7830352783203, + "loss": 0.3173, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.074108600616455, + "rewards/margins": 2.3299098014831543, + "rewards/rejected": -3.404017925262451, + "step": 4903 + }, + { + "epoch": 0.57, + "learning_rate": 1.3238909048343671e-07, + "logits/chosen": -2.1438848972320557, + "logits/rejected": -2.2887725830078125, + "logps/chosen": -339.2846374511719, + "logps/rejected": -282.32757568359375, + "loss": 0.5758, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5095176696777344, + "rewards/margins": 1.195206642150879, + "rewards/rejected": -2.7047243118286133, + "step": 4904 + }, + { + "epoch": 0.57, + "learning_rate": 1.3235397401381247e-07, + "logits/chosen": -2.3192076683044434, + "logits/rejected": -2.2683887481689453, + "logps/chosen": -158.2983856201172, + "logps/rejected": -283.3976745605469, + "loss": 0.3205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32909464836120605, + "rewards/margins": 2.2235145568847656, + "rewards/rejected": -2.5526092052459717, + "step": 4905 + }, + { + "epoch": 0.57, + "learning_rate": 1.3231885754418822e-07, + "logits/chosen": -2.3799655437469482, + "logits/rejected": -2.319121837615967, + "logps/chosen": -171.10008239746094, + "logps/rejected": -376.3849792480469, + "loss": 0.3809, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0918364524841309, + "rewards/margins": 3.961059808731079, + "rewards/rejected": -5.052896499633789, + "step": 4906 + }, + { + "epoch": 0.57, + "learning_rate": 1.3228374107456397e-07, + "logits/chosen": -2.8883235454559326, + "logits/rejected": -2.9141693115234375, + "logps/chosen": -268.4407043457031, + "logps/rejected": -287.9931945800781, + "loss": 0.8072, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.767824411392212, + "rewards/margins": 1.1291923522949219, + "rewards/rejected": -2.897017002105713, + "step": 4907 + }, + { + "epoch": 0.57, + "learning_rate": 1.322486246049397e-07, + "logits/chosen": -1.8616796731948853, + "logits/rejected": -2.2521250247955322, + "logps/chosen": -447.75762939453125, + "logps/rejected": -330.79327392578125, + "loss": 0.2876, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5888311862945557, + "rewards/margins": 2.2025411128997803, + "rewards/rejected": -2.791372060775757, + "step": 4908 + }, + { + "epoch": 0.57, + "learning_rate": 1.3221350813531546e-07, + "logits/chosen": -2.818662166595459, + "logits/rejected": -2.7974445819854736, + "logps/chosen": -175.2223358154297, + "logps/rejected": -222.26678466796875, + "loss": 0.1474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6919925808906555, + "rewards/margins": 3.3382630348205566, + "rewards/rejected": -4.0302557945251465, + "step": 4909 + }, + { + "epoch": 0.57, + "learning_rate": 1.321783916656912e-07, + "logits/chosen": -2.6377956867218018, + "logits/rejected": -2.694563865661621, + "logps/chosen": -371.99102783203125, + "logps/rejected": -335.70928955078125, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08139069378376007, + "rewards/margins": 3.9953668117523193, + "rewards/rejected": -4.076757431030273, + "step": 4910 + }, + { + "epoch": 0.57, + "learning_rate": 1.3214327519606694e-07, + "logits/chosen": -2.274350166320801, + "logits/rejected": -2.4257049560546875, + "logps/chosen": -279.24365234375, + "logps/rejected": -412.3231506347656, + "loss": 0.1902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40254026651382446, + "rewards/margins": 3.117238998413086, + "rewards/rejected": -3.5197792053222656, + "step": 4911 + }, + { + "epoch": 0.57, + "learning_rate": 1.321081587264427e-07, + "logits/chosen": -2.3260581493377686, + "logits/rejected": -2.6553397178649902, + "logps/chosen": -243.2716064453125, + "logps/rejected": -201.74989318847656, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19658958911895752, + "rewards/margins": 2.2711923122406006, + "rewards/rejected": -2.4677817821502686, + "step": 4912 + }, + { + "epoch": 0.57, + "learning_rate": 1.3207304225681845e-07, + "logits/chosen": -2.677807331085205, + "logits/rejected": -2.558824062347412, + "logps/chosen": -344.48590087890625, + "logps/rejected": -276.453369140625, + "loss": 0.1708, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2016855478286743, + "rewards/margins": 4.0328803062438965, + "rewards/rejected": -5.234565734863281, + "step": 4913 + }, + { + "epoch": 0.57, + "learning_rate": 1.320379257871942e-07, + "logits/chosen": -2.1118264198303223, + "logits/rejected": -2.274671792984009, + "logps/chosen": -357.5302429199219, + "logps/rejected": -297.5094909667969, + "loss": 0.3741, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1550980806350708, + "rewards/margins": 1.9864097833633423, + "rewards/rejected": -3.141507863998413, + "step": 4914 + }, + { + "epoch": 0.57, + "learning_rate": 1.3200280931756993e-07, + "logits/chosen": -2.373037338256836, + "logits/rejected": -2.5359790325164795, + "logps/chosen": -298.82049560546875, + "logps/rejected": -365.3153991699219, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8842138051986694, + "rewards/margins": 4.537955284118652, + "rewards/rejected": -5.422168731689453, + "step": 4915 + }, + { + "epoch": 0.57, + "learning_rate": 1.3196769284794568e-07, + "logits/chosen": -1.8772687911987305, + "logits/rejected": -2.013113021850586, + "logps/chosen": -305.74041748046875, + "logps/rejected": -297.4919128417969, + "loss": 0.1822, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5533450841903687, + "rewards/margins": 2.711503744125366, + "rewards/rejected": -3.264848470687866, + "step": 4916 + }, + { + "epoch": 0.57, + "learning_rate": 1.319325763783214e-07, + "logits/chosen": -2.481353759765625, + "logits/rejected": -2.5455925464630127, + "logps/chosen": -216.64785766601562, + "logps/rejected": -231.75914001464844, + "loss": 1.0835, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4502907991409302, + "rewards/margins": 0.9701148271560669, + "rewards/rejected": -2.420405387878418, + "step": 4917 + }, + { + "epoch": 0.57, + "learning_rate": 1.318974599086972e-07, + "logits/chosen": -2.8154525756835938, + "logits/rejected": -2.96614146232605, + "logps/chosen": -313.10345458984375, + "logps/rejected": -210.1710205078125, + "loss": 0.7356, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3010969161987305, + "rewards/margins": 1.4232802391052246, + "rewards/rejected": -2.724377155303955, + "step": 4918 + }, + { + "epoch": 0.57, + "learning_rate": 1.3186234343907292e-07, + "logits/chosen": -2.375699043273926, + "logits/rejected": -2.6008760929107666, + "logps/chosen": -276.1436462402344, + "logps/rejected": -191.60357666015625, + "loss": 0.2457, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8208218812942505, + "rewards/margins": 2.273308038711548, + "rewards/rejected": -3.094130039215088, + "step": 4919 + }, + { + "epoch": 0.57, + "learning_rate": 1.3182722696944867e-07, + "logits/chosen": -2.3546714782714844, + "logits/rejected": -2.2238211631774902, + "logps/chosen": -225.77517700195312, + "logps/rejected": -172.37107849121094, + "loss": 0.4937, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5722332000732422, + "rewards/margins": 0.7770497798919678, + "rewards/rejected": -1.34928297996521, + "step": 4920 + }, + { + "epoch": 0.57, + "learning_rate": 1.317921104998244e-07, + "logits/chosen": -1.9742484092712402, + "logits/rejected": -2.073474168777466, + "logps/chosen": -242.1903076171875, + "logps/rejected": -301.60845947265625, + "loss": 0.1983, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.256063461303711, + "rewards/margins": 3.7122936248779297, + "rewards/rejected": -4.968356609344482, + "step": 4921 + }, + { + "epoch": 0.57, + "learning_rate": 1.3175699403020018e-07, + "logits/chosen": -2.4520456790924072, + "logits/rejected": -2.6353652477264404, + "logps/chosen": -376.4193115234375, + "logps/rejected": -300.8839111328125, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6501861214637756, + "rewards/margins": 2.1136209964752197, + "rewards/rejected": -2.7638072967529297, + "step": 4922 + }, + { + "epoch": 0.57, + "learning_rate": 1.317218775605759e-07, + "logits/chosen": -2.427774429321289, + "logits/rejected": -2.222588539123535, + "logps/chosen": -267.46356201171875, + "logps/rejected": -293.9261169433594, + "loss": 0.2521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5029407739639282, + "rewards/margins": 2.9726858139038086, + "rewards/rejected": -3.4756264686584473, + "step": 4923 + }, + { + "epoch": 0.57, + "learning_rate": 1.3168676109095166e-07, + "logits/chosen": -2.1094205379486084, + "logits/rejected": -2.2795956134796143, + "logps/chosen": -440.67962646484375, + "logps/rejected": -363.379638671875, + "loss": 0.5541, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3528809547424316, + "rewards/margins": 1.2090764045715332, + "rewards/rejected": -2.561957359313965, + "step": 4924 + }, + { + "epoch": 0.57, + "learning_rate": 1.3165164462132739e-07, + "logits/chosen": -1.8627640008926392, + "logits/rejected": -1.9523131847381592, + "logps/chosen": -326.13409423828125, + "logps/rejected": -350.7335205078125, + "loss": 0.1855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6690378785133362, + "rewards/margins": 3.0797791481018066, + "rewards/rejected": -3.7488174438476562, + "step": 4925 + }, + { + "epoch": 0.57, + "learning_rate": 1.3161652815170314e-07, + "logits/chosen": -2.483569383621216, + "logits/rejected": -2.59348726272583, + "logps/chosen": -389.14154052734375, + "logps/rejected": -493.5574035644531, + "loss": 0.2045, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7518338561058044, + "rewards/margins": 3.7920424938201904, + "rewards/rejected": -4.5438761711120605, + "step": 4926 + }, + { + "epoch": 0.57, + "learning_rate": 1.315814116820789e-07, + "logits/chosen": -2.621422529220581, + "logits/rejected": -2.7102015018463135, + "logps/chosen": -405.11163330078125, + "logps/rejected": -297.82965087890625, + "loss": 0.1859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9426926374435425, + "rewards/margins": 2.9220945835113525, + "rewards/rejected": -3.8647871017456055, + "step": 4927 + }, + { + "epoch": 0.57, + "learning_rate": 1.3154629521245462e-07, + "logits/chosen": -1.9677495956420898, + "logits/rejected": -2.219359874725342, + "logps/chosen": -169.95498657226562, + "logps/rejected": -314.316650390625, + "loss": 0.243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45990556478500366, + "rewards/margins": 2.8611879348754883, + "rewards/rejected": -3.3210933208465576, + "step": 4928 + }, + { + "epoch": 0.57, + "learning_rate": 1.3151117874283037e-07, + "logits/chosen": -2.5456550121307373, + "logits/rejected": -2.5304484367370605, + "logps/chosen": -387.448974609375, + "logps/rejected": -420.647216796875, + "loss": 1.0107, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.44386625289917, + "rewards/margins": 0.5982174873352051, + "rewards/rejected": -2.042083740234375, + "step": 4929 + }, + { + "epoch": 0.57, + "learning_rate": 1.3147606227320613e-07, + "logits/chosen": -2.2984542846679688, + "logits/rejected": -2.4556210041046143, + "logps/chosen": -423.19873046875, + "logps/rejected": -260.0513610839844, + "loss": 0.5473, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6094096899032593, + "rewards/margins": 1.0297597646713257, + "rewards/rejected": -1.639169454574585, + "step": 4930 + }, + { + "epoch": 0.57, + "learning_rate": 1.3144094580358188e-07, + "logits/chosen": -2.783670663833618, + "logits/rejected": -2.9129700660705566, + "logps/chosen": -444.2365417480469, + "logps/rejected": -259.48388671875, + "loss": 0.4249, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2412186861038208, + "rewards/margins": 1.927201509475708, + "rewards/rejected": -3.1684203147888184, + "step": 4931 + }, + { + "epoch": 0.57, + "learning_rate": 1.314058293339576e-07, + "logits/chosen": -2.6254358291625977, + "logits/rejected": -2.634164333343506, + "logps/chosen": -262.00860595703125, + "logps/rejected": -258.9496765136719, + "loss": 0.1716, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.248589038848877, + "rewards/margins": 3.3267505168914795, + "rewards/rejected": -4.5753397941589355, + "step": 4932 + }, + { + "epoch": 0.57, + "learning_rate": 1.3137071286433336e-07, + "logits/chosen": -1.6720386743545532, + "logits/rejected": -1.8128905296325684, + "logps/chosen": -473.55987548828125, + "logps/rejected": -315.2620849609375, + "loss": 0.6225, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.001169845461845398, + "rewards/margins": 1.756920337677002, + "rewards/rejected": -1.7557506561279297, + "step": 4933 + }, + { + "epoch": 0.57, + "learning_rate": 1.3133559639470912e-07, + "logits/chosen": -1.7799681425094604, + "logits/rejected": -2.1340255737304688, + "logps/chosen": -448.7660217285156, + "logps/rejected": -293.6231689453125, + "loss": 0.3965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.979763388633728, + "rewards/margins": 1.7808516025543213, + "rewards/rejected": -2.760615110397339, + "step": 4934 + }, + { + "epoch": 0.57, + "learning_rate": 1.3130047992508487e-07, + "logits/chosen": -2.2213945388793945, + "logits/rejected": -2.5729293823242188, + "logps/chosen": -228.65338134765625, + "logps/rejected": -158.1922607421875, + "loss": 0.3232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44222742319107056, + "rewards/margins": 3.15628981590271, + "rewards/rejected": -3.5985169410705566, + "step": 4935 + }, + { + "epoch": 0.57, + "learning_rate": 1.312653634554606e-07, + "logits/chosen": -1.9063379764556885, + "logits/rejected": -1.7633132934570312, + "logps/chosen": -374.16094970703125, + "logps/rejected": -374.3166809082031, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7727116346359253, + "rewards/margins": 1.3985438346862793, + "rewards/rejected": -2.171255588531494, + "step": 4936 + }, + { + "epoch": 0.57, + "learning_rate": 1.3123024698583635e-07, + "logits/chosen": -2.4322237968444824, + "logits/rejected": -2.563180685043335, + "logps/chosen": -330.1837158203125, + "logps/rejected": -401.15936279296875, + "loss": 0.3123, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0015383958816528, + "rewards/margins": 1.8841229677200317, + "rewards/rejected": -2.8856613636016846, + "step": 4937 + }, + { + "epoch": 0.57, + "learning_rate": 1.311951305162121e-07, + "logits/chosen": -2.7731575965881348, + "logits/rejected": -2.7649712562561035, + "logps/chosen": -225.20938110351562, + "logps/rejected": -218.7582550048828, + "loss": 0.3877, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4868117868900299, + "rewards/margins": 2.0265188217163086, + "rewards/rejected": -2.5133306980133057, + "step": 4938 + }, + { + "epoch": 0.57, + "learning_rate": 1.3116001404658786e-07, + "logits/chosen": -2.2432868480682373, + "logits/rejected": -2.0481297969818115, + "logps/chosen": -174.1964569091797, + "logps/rejected": -231.24594116210938, + "loss": 0.3987, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7197843790054321, + "rewards/margins": 2.0620577335357666, + "rewards/rejected": -2.7818422317504883, + "step": 4939 + }, + { + "epoch": 0.57, + "learning_rate": 1.311248975769636e-07, + "logits/chosen": -2.4012486934661865, + "logits/rejected": -2.5323081016540527, + "logps/chosen": -196.36228942871094, + "logps/rejected": -309.07763671875, + "loss": 0.3309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4091556966304779, + "rewards/margins": 1.6459619998931885, + "rewards/rejected": -2.055117607116699, + "step": 4940 + }, + { + "epoch": 0.57, + "learning_rate": 1.3108978110733934e-07, + "logits/chosen": -1.8889644145965576, + "logits/rejected": -2.2098281383514404, + "logps/chosen": -654.1444702148438, + "logps/rejected": -216.382080078125, + "loss": 0.2957, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5755864381790161, + "rewards/margins": 2.541396379470825, + "rewards/rejected": -3.116982936859131, + "step": 4941 + }, + { + "epoch": 0.57, + "learning_rate": 1.310546646377151e-07, + "logits/chosen": -2.2095205783843994, + "logits/rejected": -2.4297633171081543, + "logps/chosen": -202.03158569335938, + "logps/rejected": -224.55548095703125, + "loss": 0.228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5369410514831543, + "rewards/margins": 3.1642303466796875, + "rewards/rejected": -3.701171398162842, + "step": 4942 + }, + { + "epoch": 0.57, + "learning_rate": 1.3101954816809082e-07, + "logits/chosen": -2.0444042682647705, + "logits/rejected": -2.3424477577209473, + "logps/chosen": -373.8653869628906, + "logps/rejected": -210.93321228027344, + "loss": 0.4941, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9878661036491394, + "rewards/margins": 0.8606253862380981, + "rewards/rejected": -1.8484914302825928, + "step": 4943 + }, + { + "epoch": 0.57, + "learning_rate": 1.3098443169846658e-07, + "logits/chosen": -2.4073262214660645, + "logits/rejected": -2.370610237121582, + "logps/chosen": -200.65521240234375, + "logps/rejected": -284.4913330078125, + "loss": 0.4543, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5446166396141052, + "rewards/margins": 1.318954586982727, + "rewards/rejected": -1.8635711669921875, + "step": 4944 + }, + { + "epoch": 0.57, + "learning_rate": 1.309493152288423e-07, + "logits/chosen": -2.6039745807647705, + "logits/rejected": -2.678394317626953, + "logps/chosen": -235.00189208984375, + "logps/rejected": -190.2185821533203, + "loss": 0.5477, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.770499587059021, + "rewards/margins": 0.5442188382148743, + "rewards/rejected": -2.314718246459961, + "step": 4945 + }, + { + "epoch": 0.57, + "learning_rate": 1.3091419875921808e-07, + "logits/chosen": -2.365027904510498, + "logits/rejected": -2.4211413860321045, + "logps/chosen": -321.1572570800781, + "logps/rejected": -381.37030029296875, + "loss": 0.2674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8822674751281738, + "rewards/margins": 2.713016986846924, + "rewards/rejected": -3.5952839851379395, + "step": 4946 + }, + { + "epoch": 0.57, + "learning_rate": 1.308790822895938e-07, + "logits/chosen": -2.1830599308013916, + "logits/rejected": -2.183880567550659, + "logps/chosen": -276.5796203613281, + "logps/rejected": -320.34893798828125, + "loss": 0.2365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09476801753044128, + "rewards/margins": 3.085629940032959, + "rewards/rejected": -3.1803979873657227, + "step": 4947 + }, + { + "epoch": 0.57, + "learning_rate": 1.3084396581996957e-07, + "logits/chosen": -2.3008933067321777, + "logits/rejected": -1.9476096630096436, + "logps/chosen": -221.21597290039062, + "logps/rejected": -343.92694091796875, + "loss": 0.1722, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.343802273273468, + "rewards/margins": 2.8497488498687744, + "rewards/rejected": -3.1935510635375977, + "step": 4948 + }, + { + "epoch": 0.57, + "learning_rate": 1.308088493503453e-07, + "logits/chosen": -2.6501505374908447, + "logits/rejected": -2.425954818725586, + "logps/chosen": -232.00982666015625, + "logps/rejected": -380.9869384765625, + "loss": 0.7708, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1668171882629395, + "rewards/margins": 2.216963291168213, + "rewards/rejected": -3.3837804794311523, + "step": 4949 + }, + { + "epoch": 0.57, + "learning_rate": 1.3077373288072107e-07, + "logits/chosen": -2.409316301345825, + "logits/rejected": -2.5971574783325195, + "logps/chosen": -573.486328125, + "logps/rejected": -312.17156982421875, + "loss": 0.2532, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4286647439002991, + "rewards/margins": 2.8532235622406006, + "rewards/rejected": -3.281888484954834, + "step": 4950 + }, + { + "epoch": 0.57, + "learning_rate": 1.307386164110968e-07, + "logits/chosen": -2.092588424682617, + "logits/rejected": -2.1445822715759277, + "logps/chosen": -303.7797546386719, + "logps/rejected": -303.78204345703125, + "loss": 0.1682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08476351201534271, + "rewards/margins": 2.2654495239257812, + "rewards/rejected": -2.350213050842285, + "step": 4951 + }, + { + "epoch": 0.57, + "learning_rate": 1.3070349994147255e-07, + "logits/chosen": -2.9308080673217773, + "logits/rejected": -2.9993529319763184, + "logps/chosen": -323.9215087890625, + "logps/rejected": -341.4236755371094, + "loss": 0.1141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6077297329902649, + "rewards/margins": 4.332386016845703, + "rewards/rejected": -4.940115451812744, + "step": 4952 + }, + { + "epoch": 0.57, + "learning_rate": 1.3066838347184828e-07, + "logits/chosen": -2.1163506507873535, + "logits/rejected": -2.230529308319092, + "logps/chosen": -252.60472106933594, + "logps/rejected": -292.88623046875, + "loss": 0.2656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5943484902381897, + "rewards/margins": 3.7134745121002197, + "rewards/rejected": -4.3078227043151855, + "step": 4953 + }, + { + "epoch": 0.57, + "learning_rate": 1.3063326700222404e-07, + "logits/chosen": -1.757569432258606, + "logits/rejected": -2.0206143856048584, + "logps/chosen": -475.650634765625, + "logps/rejected": -446.57183837890625, + "loss": 0.4288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.918567419052124, + "rewards/margins": 3.019176721572876, + "rewards/rejected": -3.937743902206421, + "step": 4954 + }, + { + "epoch": 0.57, + "learning_rate": 1.305981505325998e-07, + "logits/chosen": -2.473985433578491, + "logits/rejected": -2.705693244934082, + "logps/chosen": -404.5072326660156, + "logps/rejected": -234.97830200195312, + "loss": 0.3825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39714059233665466, + "rewards/margins": 1.5270957946777344, + "rewards/rejected": -1.9242362976074219, + "step": 4955 + }, + { + "epoch": 0.57, + "learning_rate": 1.3056303406297554e-07, + "logits/chosen": -2.5826103687286377, + "logits/rejected": -2.561210870742798, + "logps/chosen": -76.8671875, + "logps/rejected": -253.25830078125, + "loss": 0.207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4428935647010803, + "rewards/margins": 3.3407387733459473, + "rewards/rejected": -3.783632278442383, + "step": 4956 + }, + { + "epoch": 0.57, + "learning_rate": 1.3052791759335127e-07, + "logits/chosen": -1.4654099941253662, + "logits/rejected": -2.1218576431274414, + "logps/chosen": -487.81048583984375, + "logps/rejected": -208.18505859375, + "loss": 0.9376, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5756986141204834, + "rewards/margins": 0.3171069324016571, + "rewards/rejected": -1.892805576324463, + "step": 4957 + }, + { + "epoch": 0.57, + "learning_rate": 1.3049280112372702e-07, + "logits/chosen": -2.0763356685638428, + "logits/rejected": -2.4369561672210693, + "logps/chosen": -326.87493896484375, + "logps/rejected": -242.17388916015625, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07477041333913803, + "rewards/margins": 3.643855094909668, + "rewards/rejected": -3.718625545501709, + "step": 4958 + }, + { + "epoch": 0.57, + "learning_rate": 1.3045768465410278e-07, + "logits/chosen": -2.199310779571533, + "logits/rejected": -2.4802017211914062, + "logps/chosen": -354.2674255371094, + "logps/rejected": -200.43377685546875, + "loss": 0.4403, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6666443347930908, + "rewards/margins": 1.0948801040649414, + "rewards/rejected": -1.7615244388580322, + "step": 4959 + }, + { + "epoch": 0.57, + "learning_rate": 1.304225681844785e-07, + "logits/chosen": -2.330787420272827, + "logits/rejected": -2.3518247604370117, + "logps/chosen": -305.3346862792969, + "logps/rejected": -372.82977294921875, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6460664868354797, + "rewards/margins": 3.371020793914795, + "rewards/rejected": -4.017087459564209, + "step": 4960 + }, + { + "epoch": 0.57, + "learning_rate": 1.3038745171485426e-07, + "logits/chosen": -1.9983198642730713, + "logits/rejected": -1.92411470413208, + "logps/chosen": -204.01036071777344, + "logps/rejected": -230.19317626953125, + "loss": 0.2446, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0224130153656006, + "rewards/margins": 1.8781814575195312, + "rewards/rejected": -2.900594472885132, + "step": 4961 + }, + { + "epoch": 0.57, + "learning_rate": 1.3035233524523e-07, + "logits/chosen": -1.7293412685394287, + "logits/rejected": -1.8162940740585327, + "logps/chosen": -178.19393920898438, + "logps/rejected": -201.5857696533203, + "loss": 0.596, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1177574396133423, + "rewards/margins": 1.8792630434036255, + "rewards/rejected": -2.9970202445983887, + "step": 4962 + }, + { + "epoch": 0.57, + "learning_rate": 1.3031721877560577e-07, + "logits/chosen": -2.3477940559387207, + "logits/rejected": -2.464836359024048, + "logps/chosen": -415.2171325683594, + "logps/rejected": -278.45361328125, + "loss": 0.8806, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0394821166992188, + "rewards/margins": 0.098252072930336, + "rewards/rejected": -2.1377344131469727, + "step": 4963 + }, + { + "epoch": 0.57, + "learning_rate": 1.302821023059815e-07, + "logits/chosen": -2.0854272842407227, + "logits/rejected": -2.484714984893799, + "logps/chosen": -451.10595703125, + "logps/rejected": -409.7674560546875, + "loss": 0.2746, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2923407554626465, + "rewards/margins": 2.621755838394165, + "rewards/rejected": -3.9140963554382324, + "step": 4964 + }, + { + "epoch": 0.57, + "learning_rate": 1.3024698583635725e-07, + "logits/chosen": -2.176591157913208, + "logits/rejected": -2.5443058013916016, + "logps/chosen": -421.8949890136719, + "logps/rejected": -236.9947967529297, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6791664361953735, + "rewards/margins": 1.9723434448242188, + "rewards/rejected": -2.651510000228882, + "step": 4965 + }, + { + "epoch": 0.57, + "learning_rate": 1.3021186936673298e-07, + "logits/chosen": -2.165306568145752, + "logits/rejected": -2.196570873260498, + "logps/chosen": -199.87063598632812, + "logps/rejected": -249.77383422851562, + "loss": 0.7969, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4315459728240967, + "rewards/margins": 1.153743863105774, + "rewards/rejected": -2.58528995513916, + "step": 4966 + }, + { + "epoch": 0.57, + "learning_rate": 1.3017675289710876e-07, + "logits/chosen": -2.5039565563201904, + "logits/rejected": -2.4499058723449707, + "logps/chosen": -204.31301879882812, + "logps/rejected": -195.4644317626953, + "loss": 0.4712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9108769297599792, + "rewards/margins": 1.2190327644348145, + "rewards/rejected": -2.1299095153808594, + "step": 4967 + }, + { + "epoch": 0.57, + "learning_rate": 1.3014163642748448e-07, + "logits/chosen": -2.6646833419799805, + "logits/rejected": -2.621793270111084, + "logps/chosen": -226.47959899902344, + "logps/rejected": -183.34869384765625, + "loss": 0.5623, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0937062501907349, + "rewards/margins": 1.914860725402832, + "rewards/rejected": -3.0085670948028564, + "step": 4968 + }, + { + "epoch": 0.57, + "learning_rate": 1.3010651995786024e-07, + "logits/chosen": -2.404055595397949, + "logits/rejected": -2.445021867752075, + "logps/chosen": -238.04525756835938, + "logps/rejected": -215.8717041015625, + "loss": 0.8109, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7902642488479614, + "rewards/margins": 0.903297483921051, + "rewards/rejected": -1.6935616731643677, + "step": 4969 + }, + { + "epoch": 0.57, + "learning_rate": 1.3007140348823596e-07, + "logits/chosen": -2.255039691925049, + "logits/rejected": -2.2134146690368652, + "logps/chosen": -91.01200866699219, + "logps/rejected": -165.61056518554688, + "loss": 0.2685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2228623926639557, + "rewards/margins": 1.7194154262542725, + "rewards/rejected": -1.9422777891159058, + "step": 4970 + }, + { + "epoch": 0.57, + "learning_rate": 1.3003628701861172e-07, + "logits/chosen": -2.0920119285583496, + "logits/rejected": -2.1529197692871094, + "logps/chosen": -224.239013671875, + "logps/rejected": -236.8587646484375, + "loss": 0.3675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7966267466545105, + "rewards/margins": 1.2718514204025269, + "rewards/rejected": -2.0684783458709717, + "step": 4971 + }, + { + "epoch": 0.57, + "learning_rate": 1.3000117054898747e-07, + "logits/chosen": -2.5506327152252197, + "logits/rejected": -2.6028101444244385, + "logps/chosen": -343.2318420410156, + "logps/rejected": -315.8487243652344, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9004282355308533, + "rewards/margins": 1.8605990409851074, + "rewards/rejected": -2.7610273361206055, + "step": 4972 + }, + { + "epoch": 0.57, + "learning_rate": 1.2996605407936323e-07, + "logits/chosen": -1.720169186592102, + "logits/rejected": -1.8091861009597778, + "logps/chosen": -258.21258544921875, + "logps/rejected": -286.2275695800781, + "loss": 0.1891, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3397691249847412, + "rewards/margins": 2.874859094619751, + "rewards/rejected": -3.2146284580230713, + "step": 4973 + }, + { + "epoch": 0.57, + "learning_rate": 1.2993093760973895e-07, + "logits/chosen": -1.7802822589874268, + "logits/rejected": -1.768507480621338, + "logps/chosen": -211.12112426757812, + "logps/rejected": -296.1044616699219, + "loss": 0.4095, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12705036997795105, + "rewards/margins": 3.0361664295196533, + "rewards/rejected": -2.909116268157959, + "step": 4974 + }, + { + "epoch": 0.57, + "learning_rate": 1.298958211401147e-07, + "logits/chosen": -2.1594533920288086, + "logits/rejected": -1.4338594675064087, + "logps/chosen": -342.9795837402344, + "logps/rejected": -419.05291748046875, + "loss": 0.5343, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5071011781692505, + "rewards/margins": 1.8172569274902344, + "rewards/rejected": -2.3243582248687744, + "step": 4975 + }, + { + "epoch": 0.57, + "learning_rate": 1.2986070467049046e-07, + "logits/chosen": -2.2263410091400146, + "logits/rejected": -2.0445961952209473, + "logps/chosen": -295.297607421875, + "logps/rejected": -253.42996215820312, + "loss": 0.5374, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0712497234344482, + "rewards/margins": 1.6306676864624023, + "rewards/rejected": -2.7019171714782715, + "step": 4976 + }, + { + "epoch": 0.57, + "learning_rate": 1.298255882008662e-07, + "logits/chosen": -2.1512985229492188, + "logits/rejected": -2.0543320178985596, + "logps/chosen": -211.0521240234375, + "logps/rejected": -294.75750732421875, + "loss": 0.1508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6269112229347229, + "rewards/margins": 3.1089630126953125, + "rewards/rejected": -3.7358739376068115, + "step": 4977 + }, + { + "epoch": 0.57, + "learning_rate": 1.2979047173124194e-07, + "logits/chosen": -2.753211736679077, + "logits/rejected": -2.513202667236328, + "logps/chosen": -201.29617309570312, + "logps/rejected": -255.02133178710938, + "loss": 0.2534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.348100870847702, + "rewards/margins": 3.7123937606811523, + "rewards/rejected": -4.060494899749756, + "step": 4978 + }, + { + "epoch": 0.57, + "learning_rate": 1.297553552616177e-07, + "logits/chosen": -2.131979465484619, + "logits/rejected": -2.0949037075042725, + "logps/chosen": -209.75547790527344, + "logps/rejected": -323.833251953125, + "loss": 1.6299, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8242297172546387, + "rewards/margins": 0.9693474769592285, + "rewards/rejected": -2.793577194213867, + "step": 4979 + }, + { + "epoch": 0.57, + "learning_rate": 1.2972023879199345e-07, + "logits/chosen": -1.9289671182632446, + "logits/rejected": -2.355367422103882, + "logps/chosen": -248.10105895996094, + "logps/rejected": -220.64727783203125, + "loss": 1.821, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6537801027297974, + "rewards/margins": 0.8345724940299988, + "rewards/rejected": -2.4883527755737305, + "step": 4980 + }, + { + "epoch": 0.57, + "learning_rate": 1.2968512232236918e-07, + "logits/chosen": -2.5387182235717773, + "logits/rejected": -2.5112271308898926, + "logps/chosen": -289.54541015625, + "logps/rejected": -261.0879821777344, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3371601104736328, + "rewards/margins": 2.6230390071868896, + "rewards/rejected": -3.9601993560791016, + "step": 4981 + }, + { + "epoch": 0.57, + "learning_rate": 1.2965000585274493e-07, + "logits/chosen": -1.9998304843902588, + "logits/rejected": -2.0980522632598877, + "logps/chosen": -583.228515625, + "logps/rejected": -393.6907958984375, + "loss": 0.9739, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5263352394104004, + "rewards/margins": 1.7097901105880737, + "rewards/rejected": -4.2361249923706055, + "step": 4982 + }, + { + "epoch": 0.57, + "learning_rate": 1.2961488938312069e-07, + "logits/chosen": -2.2853267192840576, + "logits/rejected": -2.482034921646118, + "logps/chosen": -210.123046875, + "logps/rejected": -191.92233276367188, + "loss": 0.3299, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6810033321380615, + "rewards/margins": 2.047039747238159, + "rewards/rejected": -2.7280428409576416, + "step": 4983 + }, + { + "epoch": 0.57, + "learning_rate": 1.2957977291349644e-07, + "logits/chosen": -2.3732168674468994, + "logits/rejected": -2.3817195892333984, + "logps/chosen": -183.41458129882812, + "logps/rejected": -223.32949829101562, + "loss": 0.3718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28181251883506775, + "rewards/margins": 2.695798635482788, + "rewards/rejected": -2.9776110649108887, + "step": 4984 + }, + { + "epoch": 0.57, + "learning_rate": 1.2954465644387217e-07, + "logits/chosen": -2.751269817352295, + "logits/rejected": -2.6732301712036133, + "logps/chosen": -256.7925109863281, + "logps/rejected": -206.84832763671875, + "loss": 0.2528, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3382070064544678, + "rewards/margins": 2.511270523071289, + "rewards/rejected": -3.8494772911071777, + "step": 4985 + }, + { + "epoch": 0.57, + "learning_rate": 1.2950953997424792e-07, + "logits/chosen": -1.839719295501709, + "logits/rejected": -1.9004896879196167, + "logps/chosen": -409.68914794921875, + "logps/rejected": -313.2829284667969, + "loss": 0.3298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2870822250843048, + "rewards/margins": 1.978903889656067, + "rewards/rejected": -2.265986204147339, + "step": 4986 + }, + { + "epoch": 0.57, + "learning_rate": 1.2947442350462367e-07, + "logits/chosen": -2.3588104248046875, + "logits/rejected": -2.1805338859558105, + "logps/chosen": -382.3459777832031, + "logps/rejected": -306.7064208984375, + "loss": 0.7637, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4513781070709229, + "rewards/margins": 1.9753649234771729, + "rewards/rejected": -3.426743507385254, + "step": 4987 + }, + { + "epoch": 0.58, + "learning_rate": 1.294393070349994e-07, + "logits/chosen": -2.429192543029785, + "logits/rejected": -2.155595541000366, + "logps/chosen": -233.47787475585938, + "logps/rejected": -418.1794738769531, + "loss": 0.3711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27329128980636597, + "rewards/margins": 2.515294313430786, + "rewards/rejected": -2.788585901260376, + "step": 4988 + }, + { + "epoch": 0.58, + "learning_rate": 1.2940419056537516e-07, + "logits/chosen": -2.0400829315185547, + "logits/rejected": -2.152846336364746, + "logps/chosen": -298.80230712890625, + "logps/rejected": -344.6511535644531, + "loss": 0.4259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5628759860992432, + "rewards/margins": 3.1281819343566895, + "rewards/rejected": -4.691058158874512, + "step": 4989 + }, + { + "epoch": 0.58, + "learning_rate": 1.293690740957509e-07, + "logits/chosen": -2.420487403869629, + "logits/rejected": -2.3049960136413574, + "logps/chosen": -175.26873779296875, + "logps/rejected": -293.8970947265625, + "loss": 0.1324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7529714107513428, + "rewards/margins": 3.0990772247314453, + "rewards/rejected": -3.852048397064209, + "step": 4990 + }, + { + "epoch": 0.58, + "learning_rate": 1.2933395762612666e-07, + "logits/chosen": -2.5303921699523926, + "logits/rejected": -2.324674606323242, + "logps/chosen": -284.05950927734375, + "logps/rejected": -374.6523742675781, + "loss": 0.3698, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3007451295852661, + "rewards/margins": 3.5364699363708496, + "rewards/rejected": -4.837215423583984, + "step": 4991 + }, + { + "epoch": 0.58, + "learning_rate": 1.292988411565024e-07, + "logits/chosen": -1.986405849456787, + "logits/rejected": -2.298877000808716, + "logps/chosen": -308.21075439453125, + "logps/rejected": -276.8362731933594, + "loss": 0.5587, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7212930917739868, + "rewards/margins": 1.1525185108184814, + "rewards/rejected": -2.873811721801758, + "step": 4992 + }, + { + "epoch": 0.58, + "learning_rate": 1.2926372468687814e-07, + "logits/chosen": -2.51228666305542, + "logits/rejected": -2.2569479942321777, + "logps/chosen": -260.5950927734375, + "logps/rejected": -420.63409423828125, + "loss": 0.2095, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5169780254364014, + "rewards/margins": 3.9433224201202393, + "rewards/rejected": -4.460300445556641, + "step": 4993 + }, + { + "epoch": 0.58, + "learning_rate": 1.2922860821725387e-07, + "logits/chosen": -2.521697521209717, + "logits/rejected": -2.3103950023651123, + "logps/chosen": -196.10247802734375, + "logps/rejected": -278.809326171875, + "loss": 0.1896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.577490508556366, + "rewards/margins": 3.6044087409973145, + "rewards/rejected": -4.181899070739746, + "step": 4994 + }, + { + "epoch": 0.58, + "learning_rate": 1.2919349174762965e-07, + "logits/chosen": -2.5861871242523193, + "logits/rejected": -2.8050689697265625, + "logps/chosen": -211.26876831054688, + "logps/rejected": -294.4273376464844, + "loss": 0.1766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.932086706161499, + "rewards/margins": 2.5673131942749023, + "rewards/rejected": -3.4993999004364014, + "step": 4995 + }, + { + "epoch": 0.58, + "learning_rate": 1.2915837527800538e-07, + "logits/chosen": -2.313034772872925, + "logits/rejected": -2.5758073329925537, + "logps/chosen": -219.93411254882812, + "logps/rejected": -208.6776123046875, + "loss": 0.3211, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1626802682876587, + "rewards/margins": 2.156897783279419, + "rewards/rejected": -3.319578170776367, + "step": 4996 + }, + { + "epoch": 0.58, + "learning_rate": 1.2912325880838113e-07, + "logits/chosen": -1.8924659490585327, + "logits/rejected": -2.146787166595459, + "logps/chosen": -345.7962341308594, + "logps/rejected": -282.2001037597656, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4105026721954346, + "rewards/margins": 1.4474527835845947, + "rewards/rejected": -2.8579554557800293, + "step": 4997 + }, + { + "epoch": 0.58, + "learning_rate": 1.2908814233875686e-07, + "logits/chosen": -1.8891162872314453, + "logits/rejected": -2.032261848449707, + "logps/chosen": -262.0421447753906, + "logps/rejected": -236.5675048828125, + "loss": 0.2129, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1836141049861908, + "rewards/margins": 2.8868348598480225, + "rewards/rejected": -3.070449113845825, + "step": 4998 + }, + { + "epoch": 0.58, + "learning_rate": 1.2905302586913261e-07, + "logits/chosen": -2.954892158508301, + "logits/rejected": -2.944843292236328, + "logps/chosen": -148.19268798828125, + "logps/rejected": -228.46603393554688, + "loss": 0.3051, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1756935119628906, + "rewards/margins": 3.0561654567718506, + "rewards/rejected": -4.23185920715332, + "step": 4999 + }, + { + "epoch": 0.58, + "learning_rate": 1.2901790939950837e-07, + "logits/chosen": -1.7928707599639893, + "logits/rejected": -1.8182134628295898, + "logps/chosen": -299.06103515625, + "logps/rejected": -270.42901611328125, + "loss": 1.2588, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9918713569641113, + "rewards/margins": 0.026932954788208008, + "rewards/rejected": -2.0188043117523193, + "step": 5000 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -1.63687002658844, + "eval_logits/rejected": -1.5131334066390991, + "eval_logps/chosen": -300.43621826171875, + "eval_logps/rejected": -274.9338684082031, + "eval_loss": 0.3327547311782837, + "eval_rewards/accuracies": 0.8571428656578064, + "eval_rewards/chosen": -0.7755981683731079, + "eval_rewards/margins": 2.1990485191345215, + "eval_rewards/rejected": -2.97464656829834, + "eval_runtime": 24.1792, + "eval_samples_per_second": 2.895, + "eval_steps_per_second": 1.448, + "step": 5000 + }, + { + "epoch": 0.58, + "learning_rate": 1.2898279292988412e-07, + "logits/chosen": -2.4076485633850098, + "logits/rejected": -2.4449315071105957, + "logps/chosen": -337.1163330078125, + "logps/rejected": -357.0736389160156, + "loss": 0.212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6611875295639038, + "rewards/margins": 2.468590259552002, + "rewards/rejected": -3.1297779083251953, + "step": 5001 + }, + { + "epoch": 0.58, + "learning_rate": 1.2894767646025985e-07, + "logits/chosen": -2.8706960678100586, + "logits/rejected": -2.8815762996673584, + "logps/chosen": -281.5738830566406, + "logps/rejected": -254.9048309326172, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2842018604278564, + "rewards/margins": 2.0845425128936768, + "rewards/rejected": -3.368744373321533, + "step": 5002 + }, + { + "epoch": 0.58, + "learning_rate": 1.289125599906356e-07, + "logits/chosen": -2.192267656326294, + "logits/rejected": -2.637416124343872, + "logps/chosen": -413.06201171875, + "logps/rejected": -298.1824951171875, + "loss": 0.4881, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0090850591659546, + "rewards/margins": 1.3129266500473022, + "rewards/rejected": -2.3220114707946777, + "step": 5003 + }, + { + "epoch": 0.58, + "learning_rate": 1.2887744352101136e-07, + "logits/chosen": -2.7943835258483887, + "logits/rejected": -2.7478909492492676, + "logps/chosen": -330.3172607421875, + "logps/rejected": -254.84951782226562, + "loss": 0.4984, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2476780414581299, + "rewards/margins": 1.702249526977539, + "rewards/rejected": -2.949927568435669, + "step": 5004 + }, + { + "epoch": 0.58, + "learning_rate": 1.2884232705138709e-07, + "logits/chosen": -2.4472620487213135, + "logits/rejected": -2.3408894538879395, + "logps/chosen": -218.91954040527344, + "logps/rejected": -485.9613037109375, + "loss": 0.5334, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.463925838470459, + "rewards/margins": 1.8608181476593018, + "rewards/rejected": -2.3247439861297607, + "step": 5005 + }, + { + "epoch": 0.58, + "learning_rate": 1.2880721058176284e-07, + "logits/chosen": -2.0819387435913086, + "logits/rejected": -2.1286795139312744, + "logps/chosen": -359.514892578125, + "logps/rejected": -451.45220947265625, + "loss": 0.2378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4945705533027649, + "rewards/margins": 1.9593989849090576, + "rewards/rejected": -2.453969717025757, + "step": 5006 + }, + { + "epoch": 0.58, + "learning_rate": 1.287720941121386e-07, + "logits/chosen": -1.9727075099945068, + "logits/rejected": -2.1041393280029297, + "logps/chosen": -470.741455078125, + "logps/rejected": -457.4071960449219, + "loss": 0.2488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2429812103509903, + "rewards/margins": 1.6341183185577393, + "rewards/rejected": -1.877099633216858, + "step": 5007 + }, + { + "epoch": 0.58, + "learning_rate": 1.2873697764251435e-07, + "logits/chosen": -1.7859251499176025, + "logits/rejected": -1.9045416116714478, + "logps/chosen": -291.3529968261719, + "logps/rejected": -382.48590087890625, + "loss": 0.1792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18136322498321533, + "rewards/margins": 5.065678119659424, + "rewards/rejected": -5.24704122543335, + "step": 5008 + }, + { + "epoch": 0.58, + "learning_rate": 1.2870186117289007e-07, + "logits/chosen": -2.0391693115234375, + "logits/rejected": -2.1607677936553955, + "logps/chosen": -224.28762817382812, + "logps/rejected": -242.11376953125, + "loss": 0.7647, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.408144474029541, + "rewards/margins": 0.7363585233688354, + "rewards/rejected": -3.144502878189087, + "step": 5009 + }, + { + "epoch": 0.58, + "learning_rate": 1.2866674470326583e-07, + "logits/chosen": -2.2416794300079346, + "logits/rejected": -2.525074005126953, + "logps/chosen": -326.70989990234375, + "logps/rejected": -265.4552917480469, + "loss": 0.4636, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.138641357421875, + "rewards/margins": 1.005171298980713, + "rewards/rejected": -2.143812656402588, + "step": 5010 + }, + { + "epoch": 0.58, + "learning_rate": 1.2863162823364156e-07, + "logits/chosen": -2.2987060546875, + "logits/rejected": -2.323371171951294, + "logps/chosen": -293.3017578125, + "logps/rejected": -209.7847137451172, + "loss": 0.2814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9907200336456299, + "rewards/margins": 2.5270581245422363, + "rewards/rejected": -3.5177783966064453, + "step": 5011 + }, + { + "epoch": 0.58, + "learning_rate": 1.2859651176401734e-07, + "logits/chosen": -2.3462071418762207, + "logits/rejected": -2.185269832611084, + "logps/chosen": -292.3138427734375, + "logps/rejected": -286.3416748046875, + "loss": 0.3627, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0536147356033325, + "rewards/margins": 2.3617687225341797, + "rewards/rejected": -3.4153833389282227, + "step": 5012 + }, + { + "epoch": 0.58, + "learning_rate": 1.2856139529439306e-07, + "logits/chosen": -2.2347841262817383, + "logits/rejected": -2.1503489017486572, + "logps/chosen": -299.64007568359375, + "logps/rejected": -299.333740234375, + "loss": 0.2221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.026203647255897522, + "rewards/margins": 1.653945803642273, + "rewards/rejected": -1.6277421712875366, + "step": 5013 + }, + { + "epoch": 0.58, + "learning_rate": 1.2852627882476882e-07, + "logits/chosen": -2.18139386177063, + "logits/rejected": -2.293647050857544, + "logps/chosen": -286.03070068359375, + "logps/rejected": -223.3279266357422, + "loss": 0.222, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.160438060760498, + "rewards/margins": 2.144879102706909, + "rewards/rejected": -3.3053171634674072, + "step": 5014 + }, + { + "epoch": 0.58, + "learning_rate": 1.2849116235514454e-07, + "logits/chosen": -2.267956256866455, + "logits/rejected": -2.192429780960083, + "logps/chosen": -309.98480224609375, + "logps/rejected": -349.702392578125, + "loss": 0.5371, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.377549409866333, + "rewards/margins": 1.80447256565094, + "rewards/rejected": -3.1820220947265625, + "step": 5015 + }, + { + "epoch": 0.58, + "learning_rate": 1.284560458855203e-07, + "logits/chosen": -2.7197585105895996, + "logits/rejected": -2.349114418029785, + "logps/chosen": -203.05230712890625, + "logps/rejected": -245.33090209960938, + "loss": 0.7618, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1202330589294434, + "rewards/margins": 0.722697913646698, + "rewards/rejected": -2.842931032180786, + "step": 5016 + }, + { + "epoch": 0.58, + "learning_rate": 1.2842092941589605e-07, + "logits/chosen": -2.365102767944336, + "logits/rejected": -2.423994779586792, + "logps/chosen": -324.855712890625, + "logps/rejected": -410.25079345703125, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3297371864318848, + "rewards/margins": 1.8178378343582153, + "rewards/rejected": -3.1475749015808105, + "step": 5017 + }, + { + "epoch": 0.58, + "learning_rate": 1.283858129462718e-07, + "logits/chosen": -2.176645517349243, + "logits/rejected": -2.500675678253174, + "logps/chosen": -324.6589660644531, + "logps/rejected": -284.7924499511719, + "loss": 0.0751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37615153193473816, + "rewards/margins": 3.871318817138672, + "rewards/rejected": -4.247470855712891, + "step": 5018 + }, + { + "epoch": 0.58, + "learning_rate": 1.2835069647664753e-07, + "logits/chosen": -2.4549496173858643, + "logits/rejected": -2.5215306282043457, + "logps/chosen": -302.38775634765625, + "logps/rejected": -397.5508117675781, + "loss": 0.1022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1787259578704834, + "rewards/margins": 2.911837577819824, + "rewards/rejected": -4.090563774108887, + "step": 5019 + }, + { + "epoch": 0.58, + "learning_rate": 1.283155800070233e-07, + "logits/chosen": -2.054065704345703, + "logits/rejected": -2.173407554626465, + "logps/chosen": -416.95098876953125, + "logps/rejected": -407.1552429199219, + "loss": 0.4367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.65580153465271, + "rewards/margins": 2.9614505767822266, + "rewards/rejected": -3.6172523498535156, + "step": 5020 + }, + { + "epoch": 0.58, + "learning_rate": 1.2828046353739904e-07, + "logits/chosen": -2.8493223190307617, + "logits/rejected": -2.82961368560791, + "logps/chosen": -241.50799560546875, + "logps/rejected": -220.1080322265625, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5870179533958435, + "rewards/margins": 2.188026189804077, + "rewards/rejected": -2.7750442028045654, + "step": 5021 + }, + { + "epoch": 0.58, + "learning_rate": 1.2824534706777477e-07, + "logits/chosen": -2.6321358680725098, + "logits/rejected": -2.3325555324554443, + "logps/chosen": -260.91064453125, + "logps/rejected": -291.1783752441406, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5013160705566406, + "rewards/margins": 3.019606113433838, + "rewards/rejected": -3.5209221839904785, + "step": 5022 + }, + { + "epoch": 0.58, + "learning_rate": 1.2821023059815052e-07, + "logits/chosen": -2.5253396034240723, + "logits/rejected": -2.479766845703125, + "logps/chosen": -186.63316345214844, + "logps/rejected": -271.31097412109375, + "loss": 0.1676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2860509753227234, + "rewards/margins": 3.7141716480255127, + "rewards/rejected": -4.000222682952881, + "step": 5023 + }, + { + "epoch": 0.58, + "learning_rate": 1.2817511412852628e-07, + "logits/chosen": -2.2729132175445557, + "logits/rejected": -2.5572478771209717, + "logps/chosen": -245.7117919921875, + "logps/rejected": -328.1680908203125, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6307003498077393, + "rewards/margins": 4.052063465118408, + "rewards/rejected": -4.682763576507568, + "step": 5024 + }, + { + "epoch": 0.58, + "learning_rate": 1.2813999765890203e-07, + "logits/chosen": -2.6392104625701904, + "logits/rejected": -2.407484531402588, + "logps/chosen": -237.41207885742188, + "logps/rejected": -319.3418884277344, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0828823447227478, + "rewards/margins": 4.8410868644714355, + "rewards/rejected": -4.92396879196167, + "step": 5025 + }, + { + "epoch": 0.58, + "learning_rate": 1.2810488118927776e-07, + "logits/chosen": -2.840656042098999, + "logits/rejected": -2.6287548542022705, + "logps/chosen": -162.61911010742188, + "logps/rejected": -253.52359008789062, + "loss": 0.2734, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1176066398620605, + "rewards/margins": 2.0049662590026855, + "rewards/rejected": -3.122573137283325, + "step": 5026 + }, + { + "epoch": 0.58, + "learning_rate": 1.280697647196535e-07, + "logits/chosen": -2.8098268508911133, + "logits/rejected": -2.8940579891204834, + "logps/chosen": -191.6160430908203, + "logps/rejected": -230.43353271484375, + "loss": 0.3141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6975064873695374, + "rewards/margins": 3.246891498565674, + "rewards/rejected": -3.9443979263305664, + "step": 5027 + }, + { + "epoch": 0.58, + "learning_rate": 1.2803464825002926e-07, + "logits/chosen": -2.117746353149414, + "logits/rejected": -2.181405544281006, + "logps/chosen": -183.33734130859375, + "logps/rejected": -260.71978759765625, + "loss": 0.201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7698904871940613, + "rewards/margins": 3.576709270477295, + "rewards/rejected": -4.346599578857422, + "step": 5028 + }, + { + "epoch": 0.58, + "learning_rate": 1.2799953178040502e-07, + "logits/chosen": -1.665814995765686, + "logits/rejected": -1.9141709804534912, + "logps/chosen": -865.5, + "logps/rejected": -562.3677978515625, + "loss": 0.8419, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3862024545669556, + "rewards/margins": 1.1881500482559204, + "rewards/rejected": -2.574352502822876, + "step": 5029 + }, + { + "epoch": 0.58, + "learning_rate": 1.2796441531078075e-07, + "logits/chosen": -1.9600485563278198, + "logits/rejected": -2.2438011169433594, + "logps/chosen": -302.33709716796875, + "logps/rejected": -180.8563995361328, + "loss": 0.3874, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2488863468170166, + "rewards/margins": 0.9275308847427368, + "rewards/rejected": -2.176417112350464, + "step": 5030 + }, + { + "epoch": 0.58, + "learning_rate": 1.279292988411565e-07, + "logits/chosen": -1.979225993156433, + "logits/rejected": -2.248785972595215, + "logps/chosen": -228.8957977294922, + "logps/rejected": -232.2417449951172, + "loss": 0.3389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6042598485946655, + "rewards/margins": 1.4223374128341675, + "rewards/rejected": -2.026597023010254, + "step": 5031 + }, + { + "epoch": 0.58, + "learning_rate": 1.2789418237153225e-07, + "logits/chosen": -2.4069244861602783, + "logits/rejected": -2.18690824508667, + "logps/chosen": -285.9814147949219, + "logps/rejected": -300.9298400878906, + "loss": 0.2144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2732670307159424, + "rewards/margins": 2.05186128616333, + "rewards/rejected": -3.3251285552978516, + "step": 5032 + }, + { + "epoch": 0.58, + "learning_rate": 1.2785906590190798e-07, + "logits/chosen": -1.9048688411712646, + "logits/rejected": -2.382507085800171, + "logps/chosen": -429.2525329589844, + "logps/rejected": -286.2577819824219, + "loss": 0.1844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9972552061080933, + "rewards/margins": 3.381117105484009, + "rewards/rejected": -4.3783721923828125, + "step": 5033 + }, + { + "epoch": 0.58, + "learning_rate": 1.2782394943228374e-07, + "logits/chosen": -2.1681363582611084, + "logits/rejected": -2.227217674255371, + "logps/chosen": -251.83831787109375, + "logps/rejected": -138.78158569335938, + "loss": 0.6194, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2829848527908325, + "rewards/margins": 1.0172673463821411, + "rewards/rejected": -2.3002524375915527, + "step": 5034 + }, + { + "epoch": 0.58, + "learning_rate": 1.277888329626595e-07, + "logits/chosen": -2.323901414871216, + "logits/rejected": -2.1547722816467285, + "logps/chosen": -255.73597717285156, + "logps/rejected": -265.20855712890625, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48921334743499756, + "rewards/margins": 2.7786965370178223, + "rewards/rejected": -3.267909526824951, + "step": 5035 + }, + { + "epoch": 0.58, + "learning_rate": 1.2775371649303524e-07, + "logits/chosen": -2.890601873397827, + "logits/rejected": -2.8663530349731445, + "logps/chosen": -161.80615234375, + "logps/rejected": -201.89578247070312, + "loss": 0.2065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9202232956886292, + "rewards/margins": 2.5317277908325195, + "rewards/rejected": -3.451951265335083, + "step": 5036 + }, + { + "epoch": 0.58, + "learning_rate": 1.2771860002341097e-07, + "logits/chosen": -2.4729628562927246, + "logits/rejected": -2.603700876235962, + "logps/chosen": -335.9371337890625, + "logps/rejected": -370.0303955078125, + "loss": 0.4506, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.35883629322052, + "rewards/margins": 1.8521144390106201, + "rewards/rejected": -3.2109503746032715, + "step": 5037 + }, + { + "epoch": 0.58, + "learning_rate": 1.2768348355378672e-07, + "logits/chosen": -1.9944603443145752, + "logits/rejected": -2.28242826461792, + "logps/chosen": -355.54052734375, + "logps/rejected": -272.4052429199219, + "loss": 0.4632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6001549959182739, + "rewards/margins": 2.533909559249878, + "rewards/rejected": -3.134064197540283, + "step": 5038 + }, + { + "epoch": 0.58, + "learning_rate": 1.2764836708416245e-07, + "logits/chosen": -2.022078514099121, + "logits/rejected": -2.241811752319336, + "logps/chosen": -310.8746032714844, + "logps/rejected": -314.1921691894531, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19839076697826385, + "rewards/margins": 4.1146721839904785, + "rewards/rejected": -3.916281223297119, + "step": 5039 + }, + { + "epoch": 0.58, + "learning_rate": 1.2761325061453823e-07, + "logits/chosen": -2.260190725326538, + "logits/rejected": -2.3458359241485596, + "logps/chosen": -206.53814697265625, + "logps/rejected": -217.11434936523438, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7429813742637634, + "rewards/margins": 1.9188262224197388, + "rewards/rejected": -2.6618075370788574, + "step": 5040 + }, + { + "epoch": 0.58, + "learning_rate": 1.2757813414491396e-07, + "logits/chosen": -3.0092506408691406, + "logits/rejected": -3.041604518890381, + "logps/chosen": -263.6708068847656, + "logps/rejected": -231.69615173339844, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.032274842262268, + "rewards/margins": 2.373068332672119, + "rewards/rejected": -3.4053432941436768, + "step": 5041 + }, + { + "epoch": 0.58, + "learning_rate": 1.275430176752897e-07, + "logits/chosen": -2.1208717823028564, + "logits/rejected": -2.3468637466430664, + "logps/chosen": -284.6929931640625, + "logps/rejected": -260.0819091796875, + "loss": 0.5904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5089204907417297, + "rewards/margins": 0.9214388728141785, + "rewards/rejected": -1.4303593635559082, + "step": 5042 + }, + { + "epoch": 0.58, + "learning_rate": 1.2750790120566544e-07, + "logits/chosen": -1.6643028259277344, + "logits/rejected": -2.008107900619507, + "logps/chosen": -426.4961242675781, + "logps/rejected": -332.9363708496094, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4206574261188507, + "rewards/margins": 2.3074376583099365, + "rewards/rejected": -2.728095054626465, + "step": 5043 + }, + { + "epoch": 0.58, + "learning_rate": 1.274727847360412e-07, + "logits/chosen": -1.887069582939148, + "logits/rejected": -1.971490740776062, + "logps/chosen": -383.5101318359375, + "logps/rejected": -305.2457275390625, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35070955753326416, + "rewards/margins": 2.6433348655700684, + "rewards/rejected": -2.994044303894043, + "step": 5044 + }, + { + "epoch": 0.58, + "learning_rate": 1.2743766826641695e-07, + "logits/chosen": -1.990142583847046, + "logits/rejected": -2.5214085578918457, + "logps/chosen": -527.8177490234375, + "logps/rejected": -212.49575805664062, + "loss": 0.4495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7968008518218994, + "rewards/margins": 1.1862503290176392, + "rewards/rejected": -1.9830513000488281, + "step": 5045 + }, + { + "epoch": 0.58, + "learning_rate": 1.274025517967927e-07, + "logits/chosen": -2.2882819175720215, + "logits/rejected": -2.1438064575195312, + "logps/chosen": -124.88897705078125, + "logps/rejected": -294.85589599609375, + "loss": 0.1744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6614838242530823, + "rewards/margins": 3.205111503601074, + "rewards/rejected": -3.86659574508667, + "step": 5046 + }, + { + "epoch": 0.58, + "learning_rate": 1.2736743532716843e-07, + "logits/chosen": -1.835686445236206, + "logits/rejected": -1.855983853340149, + "logps/chosen": -192.3330078125, + "logps/rejected": -222.82847595214844, + "loss": 2.8206, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1290446519851685, + "rewards/margins": -0.9082087874412537, + "rewards/rejected": -0.2208358645439148, + "step": 5047 + }, + { + "epoch": 0.58, + "learning_rate": 1.2733231885754418e-07, + "logits/chosen": -2.075911521911621, + "logits/rejected": -2.0398900508880615, + "logps/chosen": -251.89309692382812, + "logps/rejected": -286.7648010253906, + "loss": 0.4952, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3935294151306152, + "rewards/margins": 2.668461799621582, + "rewards/rejected": -4.0619916915893555, + "step": 5048 + }, + { + "epoch": 0.58, + "learning_rate": 1.2729720238791994e-07, + "logits/chosen": -2.2914156913757324, + "logits/rejected": -2.6216626167297363, + "logps/chosen": -181.61209106445312, + "logps/rejected": -220.3240966796875, + "loss": 0.2257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.048251837491989136, + "rewards/margins": 3.1415557861328125, + "rewards/rejected": -3.189807653427124, + "step": 5049 + }, + { + "epoch": 0.58, + "learning_rate": 1.2726208591829566e-07, + "logits/chosen": -2.3378453254699707, + "logits/rejected": -2.261439085006714, + "logps/chosen": -136.73204040527344, + "logps/rejected": -216.67440795898438, + "loss": 0.3345, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1998155117034912, + "rewards/margins": 3.4610021114349365, + "rewards/rejected": -4.660817623138428, + "step": 5050 + }, + { + "epoch": 0.58, + "learning_rate": 1.2722696944867142e-07, + "logits/chosen": -2.684576988220215, + "logits/rejected": -2.64430570602417, + "logps/chosen": -137.19503784179688, + "logps/rejected": -155.1311492919922, + "loss": 0.3437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6661319732666016, + "rewards/margins": 2.0860862731933594, + "rewards/rejected": -2.752218008041382, + "step": 5051 + }, + { + "epoch": 0.58, + "learning_rate": 1.2719185297904717e-07, + "logits/chosen": -2.4700934886932373, + "logits/rejected": -2.4373645782470703, + "logps/chosen": -174.0889434814453, + "logps/rejected": -175.77569580078125, + "loss": 0.5831, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2181007862091064, + "rewards/margins": 1.3194304704666138, + "rewards/rejected": -2.5375313758850098, + "step": 5052 + }, + { + "epoch": 0.58, + "learning_rate": 1.2715673650942293e-07, + "logits/chosen": -1.8825582265853882, + "logits/rejected": -2.2003297805786133, + "logps/chosen": -372.29437255859375, + "logps/rejected": -195.87098693847656, + "loss": 1.0882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8482832312583923, + "rewards/margins": -0.30489686131477356, + "rewards/rejected": -0.5433863401412964, + "step": 5053 + }, + { + "epoch": 0.58, + "learning_rate": 1.2712162003979865e-07, + "logits/chosen": -2.5039048194885254, + "logits/rejected": -2.54756236076355, + "logps/chosen": -432.0526123046875, + "logps/rejected": -218.17762756347656, + "loss": 0.4464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9688570499420166, + "rewards/margins": 2.0218260288238525, + "rewards/rejected": -2.990683078765869, + "step": 5054 + }, + { + "epoch": 0.58, + "learning_rate": 1.270865035701744e-07, + "logits/chosen": -1.9252967834472656, + "logits/rejected": -2.3924596309661865, + "logps/chosen": -496.6745300292969, + "logps/rejected": -323.6076354980469, + "loss": 0.2036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5418845415115356, + "rewards/margins": 3.4875681400299072, + "rewards/rejected": -4.029452800750732, + "step": 5055 + }, + { + "epoch": 0.58, + "learning_rate": 1.2705138710055013e-07, + "logits/chosen": -2.7076754570007324, + "logits/rejected": -2.9656119346618652, + "logps/chosen": -324.3088073730469, + "logps/rejected": -237.21261596679688, + "loss": 0.1928, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5267218351364136, + "rewards/margins": 3.291525363922119, + "rewards/rejected": -3.8182475566864014, + "step": 5056 + }, + { + "epoch": 0.58, + "learning_rate": 1.2701627063092591e-07, + "logits/chosen": -2.3085784912109375, + "logits/rejected": -2.4814071655273438, + "logps/chosen": -252.50241088867188, + "logps/rejected": -206.5613250732422, + "loss": 0.7483, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1969736814498901, + "rewards/margins": 0.4417479932308197, + "rewards/rejected": -1.6387215852737427, + "step": 5057 + }, + { + "epoch": 0.58, + "learning_rate": 1.2698115416130164e-07, + "logits/chosen": -2.131786823272705, + "logits/rejected": -2.189197063446045, + "logps/chosen": -296.80487060546875, + "logps/rejected": -288.62158203125, + "loss": 0.2181, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8883216381072998, + "rewards/margins": 2.3667802810668945, + "rewards/rejected": -3.2551021575927734, + "step": 5058 + }, + { + "epoch": 0.58, + "learning_rate": 1.269460376916774e-07, + "logits/chosen": -2.7051031589508057, + "logits/rejected": -2.7123122215270996, + "logps/chosen": -188.4088592529297, + "logps/rejected": -231.04769897460938, + "loss": 0.3319, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5919320583343506, + "rewards/margins": 1.797271966934204, + "rewards/rejected": -2.3892037868499756, + "step": 5059 + }, + { + "epoch": 0.58, + "learning_rate": 1.2691092122205312e-07, + "logits/chosen": -2.846792697906494, + "logits/rejected": -2.855929374694824, + "logps/chosen": -207.32437133789062, + "logps/rejected": -231.1500244140625, + "loss": 0.6213, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1076115369796753, + "rewards/margins": 1.1558605432510376, + "rewards/rejected": -2.263472080230713, + "step": 5060 + }, + { + "epoch": 0.58, + "learning_rate": 1.2687580475242888e-07, + "logits/chosen": -2.2140889167785645, + "logits/rejected": -2.605468273162842, + "logps/chosen": -581.0647583007812, + "logps/rejected": -227.8559112548828, + "loss": 0.2202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5590634346008301, + "rewards/margins": 2.406604290008545, + "rewards/rejected": -2.965667724609375, + "step": 5061 + }, + { + "epoch": 0.58, + "learning_rate": 1.2684068828280463e-07, + "logits/chosen": -2.5682592391967773, + "logits/rejected": -2.6244640350341797, + "logps/chosen": -123.965576171875, + "logps/rejected": -188.02133178710938, + "loss": 0.1279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6144627332687378, + "rewards/margins": 3.2820396423339844, + "rewards/rejected": -3.8965024948120117, + "step": 5062 + }, + { + "epoch": 0.58, + "learning_rate": 1.2680557181318039e-07, + "logits/chosen": -2.806334972381592, + "logits/rejected": -2.621147394180298, + "logps/chosen": -140.13937377929688, + "logps/rejected": -207.71502685546875, + "loss": 0.5174, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.555352807044983, + "rewards/margins": 3.252868413925171, + "rewards/rejected": -4.808221340179443, + "step": 5063 + }, + { + "epoch": 0.58, + "learning_rate": 1.267704553435561e-07, + "logits/chosen": -2.16051983833313, + "logits/rejected": -2.2117700576782227, + "logps/chosen": -197.08615112304688, + "logps/rejected": -244.10218811035156, + "loss": 0.2634, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0034606456756592, + "rewards/margins": 2.5765395164489746, + "rewards/rejected": -3.580000400543213, + "step": 5064 + }, + { + "epoch": 0.58, + "learning_rate": 1.2673533887393187e-07, + "logits/chosen": -2.3875205516815186, + "logits/rejected": -2.253617286682129, + "logps/chosen": -318.93536376953125, + "logps/rejected": -340.3249206542969, + "loss": 0.7391, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4396380186080933, + "rewards/margins": 1.4713258743286133, + "rewards/rejected": -2.910963773727417, + "step": 5065 + }, + { + "epoch": 0.58, + "learning_rate": 1.2670022240430762e-07, + "logits/chosen": -2.0733275413513184, + "logits/rejected": -2.380232334136963, + "logps/chosen": -192.41314697265625, + "logps/rejected": -205.40347290039062, + "loss": 0.4676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46319419145584106, + "rewards/margins": 1.809627890586853, + "rewards/rejected": -2.272822141647339, + "step": 5066 + }, + { + "epoch": 0.58, + "learning_rate": 1.2666510593468335e-07, + "logits/chosen": -2.1583757400512695, + "logits/rejected": -2.424407720565796, + "logps/chosen": -342.62481689453125, + "logps/rejected": -227.39764404296875, + "loss": 0.7933, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5291019678115845, + "rewards/margins": 1.3966976404190063, + "rewards/rejected": -2.925799608230591, + "step": 5067 + }, + { + "epoch": 0.58, + "learning_rate": 1.266299894650591e-07, + "logits/chosen": -1.9729417562484741, + "logits/rejected": -1.9621820449829102, + "logps/chosen": -273.27130126953125, + "logps/rejected": -380.0724182128906, + "loss": 0.6669, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2672489881515503, + "rewards/margins": 1.8715717792510986, + "rewards/rejected": -3.1388206481933594, + "step": 5068 + }, + { + "epoch": 0.58, + "learning_rate": 1.2659487299543486e-07, + "logits/chosen": -2.26566219329834, + "logits/rejected": -1.9600310325622559, + "logps/chosen": -269.0824279785156, + "logps/rejected": -313.7449035644531, + "loss": 0.5122, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3884556293487549, + "rewards/margins": 1.3114596605300903, + "rewards/rejected": -2.6999154090881348, + "step": 5069 + }, + { + "epoch": 0.58, + "learning_rate": 1.265597565258106e-07, + "logits/chosen": -1.8845174312591553, + "logits/rejected": -1.7632193565368652, + "logps/chosen": -375.9578857421875, + "logps/rejected": -268.7389831542969, + "loss": 0.7819, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3386400938034058, + "rewards/margins": 0.48294711112976074, + "rewards/rejected": -1.8215872049331665, + "step": 5070 + }, + { + "epoch": 0.58, + "learning_rate": 1.2652464005618634e-07, + "logits/chosen": -2.661144733428955, + "logits/rejected": -2.564034938812256, + "logps/chosen": -286.64208984375, + "logps/rejected": -147.29364013671875, + "loss": 0.2806, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7808257937431335, + "rewards/margins": 1.4752066135406494, + "rewards/rejected": -2.2560324668884277, + "step": 5071 + }, + { + "epoch": 0.58, + "learning_rate": 1.264895235865621e-07, + "logits/chosen": -2.128065586090088, + "logits/rejected": -2.285029411315918, + "logps/chosen": -326.6854248046875, + "logps/rejected": -345.26416015625, + "loss": 0.3419, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7510473132133484, + "rewards/margins": 1.9952001571655273, + "rewards/rejected": -2.7462475299835205, + "step": 5072 + }, + { + "epoch": 0.58, + "learning_rate": 1.2645440711693784e-07, + "logits/chosen": -2.4838271141052246, + "logits/rejected": -2.608769178390503, + "logps/chosen": -213.87489318847656, + "logps/rejected": -318.71221923828125, + "loss": 0.331, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4952781200408936, + "rewards/margins": 1.4365956783294678, + "rewards/rejected": -2.9318740367889404, + "step": 5073 + }, + { + "epoch": 0.58, + "learning_rate": 1.264192906473136e-07, + "logits/chosen": -2.464938163757324, + "logits/rejected": -2.474642753601074, + "logps/chosen": -226.44967651367188, + "logps/rejected": -169.55538940429688, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.709566056728363, + "rewards/margins": 2.4050216674804688, + "rewards/rejected": -3.1145877838134766, + "step": 5074 + }, + { + "epoch": 0.59, + "learning_rate": 1.2638417417768933e-07, + "logits/chosen": -2.380741596221924, + "logits/rejected": -2.355562686920166, + "logps/chosen": -240.8399658203125, + "logps/rejected": -250.0812530517578, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6632764935493469, + "rewards/margins": 2.384852647781372, + "rewards/rejected": -3.0481295585632324, + "step": 5075 + }, + { + "epoch": 0.59, + "learning_rate": 1.2634905770806508e-07, + "logits/chosen": -2.536655902862549, + "logits/rejected": -2.6977336406707764, + "logps/chosen": -194.3640899658203, + "logps/rejected": -226.76251220703125, + "loss": 0.2971, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30679023265838623, + "rewards/margins": 2.935277223587036, + "rewards/rejected": -2.6284868717193604, + "step": 5076 + }, + { + "epoch": 0.59, + "learning_rate": 1.2631394123844083e-07, + "logits/chosen": -2.593773365020752, + "logits/rejected": -2.3008790016174316, + "logps/chosen": -128.22369384765625, + "logps/rejected": -234.87203979492188, + "loss": 0.7661, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2129331827163696, + "rewards/margins": 1.0434644222259521, + "rewards/rejected": -2.2563977241516113, + "step": 5077 + }, + { + "epoch": 0.59, + "learning_rate": 1.2627882476881656e-07, + "logits/chosen": -2.3685622215270996, + "logits/rejected": -2.205643653869629, + "logps/chosen": -141.40342712402344, + "logps/rejected": -306.3501281738281, + "loss": 0.6592, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7393061518669128, + "rewards/margins": 3.2111310958862305, + "rewards/rejected": -3.950437068939209, + "step": 5078 + }, + { + "epoch": 0.59, + "learning_rate": 1.2624370829919231e-07, + "logits/chosen": -2.289254665374756, + "logits/rejected": -2.189211368560791, + "logps/chosen": -391.47802734375, + "logps/rejected": -384.79522705078125, + "loss": 0.8568, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5065752267837524, + "rewards/margins": 0.9028227925300598, + "rewards/rejected": -2.409397840499878, + "step": 5079 + }, + { + "epoch": 0.59, + "learning_rate": 1.2620859182956807e-07, + "logits/chosen": -2.1210083961486816, + "logits/rejected": -2.169567823410034, + "logps/chosen": -225.7271728515625, + "logps/rejected": -226.22463989257812, + "loss": 0.5648, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0566740036010742, + "rewards/margins": 1.1218775510787964, + "rewards/rejected": -2.17855167388916, + "step": 5080 + }, + { + "epoch": 0.59, + "learning_rate": 1.2617347535994382e-07, + "logits/chosen": -2.504490852355957, + "logits/rejected": -2.5382583141326904, + "logps/chosen": -347.25689697265625, + "logps/rejected": -269.54736328125, + "loss": 0.7747, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8077311515808105, + "rewards/margins": 2.3478448390960693, + "rewards/rejected": -4.155576229095459, + "step": 5081 + }, + { + "epoch": 0.59, + "learning_rate": 1.2613835889031955e-07, + "logits/chosen": -2.1135404109954834, + "logits/rejected": -2.4004013538360596, + "logps/chosen": -443.9617919921875, + "logps/rejected": -280.2760925292969, + "loss": 0.4518, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.75721275806427, + "rewards/margins": 2.269991874694824, + "rewards/rejected": -3.027204751968384, + "step": 5082 + }, + { + "epoch": 0.59, + "learning_rate": 1.261032424206953e-07, + "logits/chosen": -2.2303059101104736, + "logits/rejected": -2.3372082710266113, + "logps/chosen": -348.769287109375, + "logps/rejected": -368.5367126464844, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07605767250061035, + "rewards/margins": 2.034766674041748, + "rewards/rejected": -2.1108241081237793, + "step": 5083 + }, + { + "epoch": 0.59, + "learning_rate": 1.2606812595107103e-07, + "logits/chosen": -2.2314414978027344, + "logits/rejected": -2.06868839263916, + "logps/chosen": -139.56167602539062, + "logps/rejected": -315.3842468261719, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.390287309885025, + "rewards/margins": 2.17912220954895, + "rewards/rejected": -2.5694093704223633, + "step": 5084 + }, + { + "epoch": 0.59, + "learning_rate": 1.260330094814468e-07, + "logits/chosen": -2.599625587463379, + "logits/rejected": -2.6318705081939697, + "logps/chosen": -228.3226776123047, + "logps/rejected": -361.3092346191406, + "loss": 0.0641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2334621548652649, + "rewards/margins": 3.6188857555389404, + "rewards/rejected": -3.8523478507995605, + "step": 5085 + }, + { + "epoch": 0.59, + "learning_rate": 1.2599789301182254e-07, + "logits/chosen": -2.6092522144317627, + "logits/rejected": -2.6396450996398926, + "logps/chosen": -176.0206756591797, + "logps/rejected": -185.42141723632812, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1994330883026123, + "rewards/margins": 2.9568676948547363, + "rewards/rejected": -4.1563005447387695, + "step": 5086 + }, + { + "epoch": 0.59, + "learning_rate": 1.259627765421983e-07, + "logits/chosen": -1.9629589319229126, + "logits/rejected": -2.209578037261963, + "logps/chosen": -347.16790771484375, + "logps/rejected": -313.66412353515625, + "loss": 0.5299, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6266745328903198, + "rewards/margins": 1.4187874794006348, + "rewards/rejected": -3.045462131500244, + "step": 5087 + }, + { + "epoch": 0.59, + "learning_rate": 1.2592766007257402e-07, + "logits/chosen": -2.568140745162964, + "logits/rejected": -2.223623037338257, + "logps/chosen": -154.1318359375, + "logps/rejected": -262.112060546875, + "loss": 0.1826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27371537685394287, + "rewards/margins": 3.0762939453125, + "rewards/rejected": -3.350008964538574, + "step": 5088 + }, + { + "epoch": 0.59, + "learning_rate": 1.2589254360294977e-07, + "logits/chosen": -1.9918463230133057, + "logits/rejected": -2.239043951034546, + "logps/chosen": -414.6201171875, + "logps/rejected": -362.7919921875, + "loss": 0.1618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29634106159210205, + "rewards/margins": 2.972273111343384, + "rewards/rejected": -3.2686142921447754, + "step": 5089 + }, + { + "epoch": 0.59, + "learning_rate": 1.2585742713332553e-07, + "logits/chosen": -2.3785037994384766, + "logits/rejected": -2.2108047008514404, + "logps/chosen": -292.87518310546875, + "logps/rejected": -355.3883056640625, + "loss": 0.1076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18147294223308563, + "rewards/margins": 2.8600356578826904, + "rewards/rejected": -3.041508674621582, + "step": 5090 + }, + { + "epoch": 0.59, + "learning_rate": 1.2582231066370128e-07, + "logits/chosen": -2.1179165840148926, + "logits/rejected": -2.1081314086914062, + "logps/chosen": -306.46490478515625, + "logps/rejected": -263.7262268066406, + "loss": 0.7245, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9130589962005615, + "rewards/margins": 1.6839227676391602, + "rewards/rejected": -2.5969817638397217, + "step": 5091 + }, + { + "epoch": 0.59, + "learning_rate": 1.25787194194077e-07, + "logits/chosen": -2.381584644317627, + "logits/rejected": -2.5365352630615234, + "logps/chosen": -266.56298828125, + "logps/rejected": -221.2057342529297, + "loss": 0.2582, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5059379935264587, + "rewards/margins": 2.2187819480895996, + "rewards/rejected": -2.7247202396392822, + "step": 5092 + }, + { + "epoch": 0.59, + "learning_rate": 1.2575207772445276e-07, + "logits/chosen": -2.2322442531585693, + "logits/rejected": -2.0656065940856934, + "logps/chosen": -230.8194122314453, + "logps/rejected": -258.99639892578125, + "loss": 0.188, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33227407932281494, + "rewards/margins": 3.1116228103637695, + "rewards/rejected": -3.443897008895874, + "step": 5093 + }, + { + "epoch": 0.59, + "learning_rate": 1.2571696125482852e-07, + "logits/chosen": -1.6840378046035767, + "logits/rejected": -1.96396005153656, + "logps/chosen": -416.8160705566406, + "logps/rejected": -314.3489685058594, + "loss": 1.4884, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.5298218727111816, + "rewards/margins": 0.17053627967834473, + "rewards/rejected": -2.7003581523895264, + "step": 5094 + }, + { + "epoch": 0.59, + "learning_rate": 1.2568184478520424e-07, + "logits/chosen": -1.8399778604507446, + "logits/rejected": -1.928375482559204, + "logps/chosen": -245.2256622314453, + "logps/rejected": -234.26995849609375, + "loss": 0.4228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6059497594833374, + "rewards/margins": 1.67415452003479, + "rewards/rejected": -2.280104160308838, + "step": 5095 + }, + { + "epoch": 0.59, + "learning_rate": 1.2564672831558e-07, + "logits/chosen": -2.122770071029663, + "logits/rejected": -2.063204765319824, + "logps/chosen": -257.4309997558594, + "logps/rejected": -373.02239990234375, + "loss": 0.2186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8313709497451782, + "rewards/margins": 4.601076126098633, + "rewards/rejected": -5.4324469566345215, + "step": 5096 + }, + { + "epoch": 0.59, + "learning_rate": 1.2561161184595575e-07, + "logits/chosen": -2.2388358116149902, + "logits/rejected": -2.319079637527466, + "logps/chosen": -298.3353576660156, + "logps/rejected": -173.31744384765625, + "loss": 0.7967, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.081230878829956, + "rewards/margins": 0.18108341097831726, + "rewards/rejected": -2.2623143196105957, + "step": 5097 + }, + { + "epoch": 0.59, + "learning_rate": 1.255764953763315e-07, + "logits/chosen": -1.6508699655532837, + "logits/rejected": -1.972090244293213, + "logps/chosen": -367.8564147949219, + "logps/rejected": -261.95245361328125, + "loss": 0.8508, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3083837032318115, + "rewards/margins": 1.811011791229248, + "rewards/rejected": -3.1193954944610596, + "step": 5098 + }, + { + "epoch": 0.59, + "learning_rate": 1.2554137890670723e-07, + "logits/chosen": -2.576368808746338, + "logits/rejected": -2.0998339653015137, + "logps/chosen": -155.0290985107422, + "logps/rejected": -264.36676025390625, + "loss": 0.863, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2665307521820068, + "rewards/margins": 0.3187963664531708, + "rewards/rejected": -1.5853271484375, + "step": 5099 + }, + { + "epoch": 0.59, + "learning_rate": 1.2550626243708299e-07, + "logits/chosen": -2.281020164489746, + "logits/rejected": -2.391296863555908, + "logps/chosen": -319.10736083984375, + "logps/rejected": -255.31796264648438, + "loss": 0.0654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4087907671928406, + "rewards/margins": 4.2350029945373535, + "rewards/rejected": -4.64379358291626, + "step": 5100 + }, + { + "epoch": 0.59, + "learning_rate": 1.2547114596745871e-07, + "logits/chosen": -2.5836868286132812, + "logits/rejected": -2.5330593585968018, + "logps/chosen": -255.86541748046875, + "logps/rejected": -329.908203125, + "loss": 0.2826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7361129522323608, + "rewards/margins": 2.3149807453155518, + "rewards/rejected": -3.051093816757202, + "step": 5101 + }, + { + "epoch": 0.59, + "learning_rate": 1.254360294978345e-07, + "logits/chosen": -1.9066969156265259, + "logits/rejected": -2.1648287773132324, + "logps/chosen": -301.91717529296875, + "logps/rejected": -232.513916015625, + "loss": 0.3872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8339162468910217, + "rewards/margins": 1.22408127784729, + "rewards/rejected": -2.057997465133667, + "step": 5102 + }, + { + "epoch": 0.59, + "learning_rate": 1.2540091302821022e-07, + "logits/chosen": -2.575786590576172, + "logits/rejected": -2.683594226837158, + "logps/chosen": -247.0155029296875, + "logps/rejected": -215.35635375976562, + "loss": 0.393, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21366804838180542, + "rewards/margins": 2.455566883087158, + "rewards/rejected": -2.6692347526550293, + "step": 5103 + }, + { + "epoch": 0.59, + "learning_rate": 1.2536579655858598e-07, + "logits/chosen": -2.544250965118408, + "logits/rejected": -2.6434249877929688, + "logps/chosen": -272.71771240234375, + "logps/rejected": -214.34210205078125, + "loss": 0.2076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4978916645050049, + "rewards/margins": 1.8489341735839844, + "rewards/rejected": -3.3468258380889893, + "step": 5104 + }, + { + "epoch": 0.59, + "learning_rate": 1.253306800889617e-07, + "logits/chosen": -2.0629189014434814, + "logits/rejected": -1.9936943054199219, + "logps/chosen": -168.13763427734375, + "logps/rejected": -234.62326049804688, + "loss": 0.6358, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7327468395233154, + "rewards/margins": 2.7733047008514404, + "rewards/rejected": -4.506051540374756, + "step": 5105 + }, + { + "epoch": 0.59, + "learning_rate": 1.2529556361933746e-07, + "logits/chosen": -2.26607084274292, + "logits/rejected": -2.158322811126709, + "logps/chosen": -277.881591796875, + "logps/rejected": -388.8077392578125, + "loss": 0.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1712939739227295, + "rewards/margins": 0.8913500905036926, + "rewards/rejected": -2.0626440048217773, + "step": 5106 + }, + { + "epoch": 0.59, + "learning_rate": 1.252604471497132e-07, + "logits/chosen": -2.570070266723633, + "logits/rejected": -2.290224075317383, + "logps/chosen": -266.9934387207031, + "logps/rejected": -267.666748046875, + "loss": 0.2165, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5238717794418335, + "rewards/margins": 2.20827054977417, + "rewards/rejected": -3.732142448425293, + "step": 5107 + }, + { + "epoch": 0.59, + "learning_rate": 1.2522533068008896e-07, + "logits/chosen": -2.287205457687378, + "logits/rejected": -2.1625118255615234, + "logps/chosen": -201.7726287841797, + "logps/rejected": -264.6920471191406, + "loss": 0.3156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2933990955352783, + "rewards/margins": 2.3625447750091553, + "rewards/rejected": -2.6559438705444336, + "step": 5108 + }, + { + "epoch": 0.59, + "learning_rate": 1.251902142104647e-07, + "logits/chosen": -2.498176336288452, + "logits/rejected": -2.445925235748291, + "logps/chosen": -121.64617156982422, + "logps/rejected": -292.8716735839844, + "loss": 0.4466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5316720008850098, + "rewards/margins": 1.805518627166748, + "rewards/rejected": -3.337190866470337, + "step": 5109 + }, + { + "epoch": 0.59, + "learning_rate": 1.2515509774084045e-07, + "logits/chosen": -2.7694084644317627, + "logits/rejected": -2.656118392944336, + "logps/chosen": -367.4310302734375, + "logps/rejected": -357.189453125, + "loss": 0.3835, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2272090911865234, + "rewards/margins": 1.9408060312271118, + "rewards/rejected": -3.1680150032043457, + "step": 5110 + }, + { + "epoch": 0.59, + "learning_rate": 1.251199812712162e-07, + "logits/chosen": -2.5199742317199707, + "logits/rejected": -2.340176582336426, + "logps/chosen": -237.571044921875, + "logps/rejected": -330.8365173339844, + "loss": 0.3497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8290730714797974, + "rewards/margins": 2.0290069580078125, + "rewards/rejected": -2.8580799102783203, + "step": 5111 + }, + { + "epoch": 0.59, + "learning_rate": 1.2508486480159193e-07, + "logits/chosen": -1.6971237659454346, + "logits/rejected": -1.8755112886428833, + "logps/chosen": -393.8367614746094, + "logps/rejected": -366.91339111328125, + "loss": 0.3401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8662568926811218, + "rewards/margins": 2.700901985168457, + "rewards/rejected": -3.5671586990356445, + "step": 5112 + }, + { + "epoch": 0.59, + "learning_rate": 1.2504974833196768e-07, + "logits/chosen": -2.6519956588745117, + "logits/rejected": -2.508150100708008, + "logps/chosen": -235.95579528808594, + "logps/rejected": -217.68994140625, + "loss": 0.8676, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0011205673217773, + "rewards/margins": 0.8426473140716553, + "rewards/rejected": -2.8437676429748535, + "step": 5113 + }, + { + "epoch": 0.59, + "learning_rate": 1.2501463186234343e-07, + "logits/chosen": -2.4267780780792236, + "logits/rejected": -2.599292039871216, + "logps/chosen": -202.19647216796875, + "logps/rejected": -201.33480834960938, + "loss": 0.6692, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0194144248962402, + "rewards/margins": 2.0392906665802, + "rewards/rejected": -3.0587046146392822, + "step": 5114 + }, + { + "epoch": 0.59, + "learning_rate": 1.249795153927192e-07, + "logits/chosen": -2.1608548164367676, + "logits/rejected": -2.451021671295166, + "logps/chosen": -310.6056823730469, + "logps/rejected": -215.01710510253906, + "loss": 0.3722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27757102251052856, + "rewards/margins": 1.4894509315490723, + "rewards/rejected": -1.7670220136642456, + "step": 5115 + }, + { + "epoch": 0.59, + "learning_rate": 1.2494439892309492e-07, + "logits/chosen": -2.4422943592071533, + "logits/rejected": -2.5796115398406982, + "logps/chosen": -275.41241455078125, + "logps/rejected": -273.96429443359375, + "loss": 0.2756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.428862988948822, + "rewards/margins": 2.15159273147583, + "rewards/rejected": -2.580455780029297, + "step": 5116 + }, + { + "epoch": 0.59, + "learning_rate": 1.2490928245347067e-07, + "logits/chosen": -2.0001943111419678, + "logits/rejected": -2.4234514236450195, + "logps/chosen": -679.0671997070312, + "logps/rejected": -250.99758911132812, + "loss": 1.5448, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.860027551651001, + "rewards/margins": 1.0410823822021484, + "rewards/rejected": -3.9011101722717285, + "step": 5117 + }, + { + "epoch": 0.59, + "learning_rate": 1.2487416598384642e-07, + "logits/chosen": -2.2416152954101562, + "logits/rejected": -2.2878003120422363, + "logps/chosen": -244.51254272460938, + "logps/rejected": -242.57618713378906, + "loss": 0.4729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9085116982460022, + "rewards/margins": 1.314034104347229, + "rewards/rejected": -2.222545862197876, + "step": 5118 + }, + { + "epoch": 0.59, + "learning_rate": 1.2483904951422218e-07, + "logits/chosen": -1.773925542831421, + "logits/rejected": -2.039198160171509, + "logps/chosen": -259.01544189453125, + "logps/rejected": -247.53732299804688, + "loss": 0.4873, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5460593700408936, + "rewards/margins": 1.4388184547424316, + "rewards/rejected": -2.984877824783325, + "step": 5119 + }, + { + "epoch": 0.59, + "learning_rate": 1.248039330445979e-07, + "logits/chosen": -2.5598559379577637, + "logits/rejected": -2.486781358718872, + "logps/chosen": -158.851806640625, + "logps/rejected": -187.6783447265625, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.498767614364624, + "rewards/margins": 1.0076274871826172, + "rewards/rejected": -2.506394863128662, + "step": 5120 + }, + { + "epoch": 0.59, + "learning_rate": 1.2476881657497366e-07, + "logits/chosen": -2.646883487701416, + "logits/rejected": -2.4826090335845947, + "logps/chosen": -134.88490295410156, + "logps/rejected": -174.4677276611328, + "loss": 0.2914, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0960397720336914, + "rewards/margins": 1.8664010763168335, + "rewards/rejected": -1.9624407291412354, + "step": 5121 + }, + { + "epoch": 0.59, + "learning_rate": 1.247337001053494e-07, + "logits/chosen": -2.358844041824341, + "logits/rejected": -2.3851547241210938, + "logps/chosen": -195.44708251953125, + "logps/rejected": -213.739013671875, + "loss": 0.5198, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7309314012527466, + "rewards/margins": 2.08389949798584, + "rewards/rejected": -2.814830780029297, + "step": 5122 + }, + { + "epoch": 0.59, + "learning_rate": 1.2469858363572514e-07, + "logits/chosen": -1.9349322319030762, + "logits/rejected": -2.1297874450683594, + "logps/chosen": -378.629638671875, + "logps/rejected": -327.7613830566406, + "loss": 0.4054, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1674015522003174, + "rewards/margins": 2.1007089614868164, + "rewards/rejected": -3.268110752105713, + "step": 5123 + }, + { + "epoch": 0.59, + "learning_rate": 1.246634671661009e-07, + "logits/chosen": -2.259451150894165, + "logits/rejected": -1.97844660282135, + "logps/chosen": -331.73175048828125, + "logps/rejected": -394.85400390625, + "loss": 0.2117, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8568210005760193, + "rewards/margins": 2.771575450897217, + "rewards/rejected": -3.6283962726593018, + "step": 5124 + }, + { + "epoch": 0.59, + "learning_rate": 1.2462835069647665e-07, + "logits/chosen": -2.0468897819519043, + "logits/rejected": -2.3621959686279297, + "logps/chosen": -553.7262573242188, + "logps/rejected": -355.6436767578125, + "loss": 0.2515, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0575456619262695, + "rewards/margins": 3.238020896911621, + "rewards/rejected": -4.295566558837891, + "step": 5125 + }, + { + "epoch": 0.59, + "learning_rate": 1.245932342268524e-07, + "logits/chosen": -2.8029794692993164, + "logits/rejected": -2.505204916000366, + "logps/chosen": -355.16070556640625, + "logps/rejected": -567.8734741210938, + "loss": 0.3225, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6701639890670776, + "rewards/margins": 2.128429889678955, + "rewards/rejected": -3.798593521118164, + "step": 5126 + }, + { + "epoch": 0.59, + "learning_rate": 1.2455811775722813e-07, + "logits/chosen": -2.688938617706299, + "logits/rejected": -2.7361462116241455, + "logps/chosen": -84.66516876220703, + "logps/rejected": -299.2529296875, + "loss": 0.3996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4213804304599762, + "rewards/margins": 1.9512711763381958, + "rewards/rejected": -2.3726515769958496, + "step": 5127 + }, + { + "epoch": 0.59, + "learning_rate": 1.2452300128760388e-07, + "logits/chosen": -2.7237815856933594, + "logits/rejected": -2.7929162979125977, + "logps/chosen": -176.1667938232422, + "logps/rejected": -211.03182983398438, + "loss": 0.2126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5953259468078613, + "rewards/margins": 3.0400352478027344, + "rewards/rejected": -3.6353611946105957, + "step": 5128 + }, + { + "epoch": 0.59, + "learning_rate": 1.244878848179796e-07, + "logits/chosen": -2.4520952701568604, + "logits/rejected": -2.1246397495269775, + "logps/chosen": -236.92221069335938, + "logps/rejected": -358.44403076171875, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6141819953918457, + "rewards/margins": 2.568542242050171, + "rewards/rejected": -3.1827242374420166, + "step": 5129 + }, + { + "epoch": 0.59, + "learning_rate": 1.244527683483554e-07, + "logits/chosen": -2.3462929725646973, + "logits/rejected": -2.1987171173095703, + "logps/chosen": -233.3837432861328, + "logps/rejected": -277.2842102050781, + "loss": 0.4588, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2968024015426636, + "rewards/margins": 1.8795828819274902, + "rewards/rejected": -3.1763854026794434, + "step": 5130 + }, + { + "epoch": 0.59, + "learning_rate": 1.2441765187873112e-07, + "logits/chosen": -2.2696621417999268, + "logits/rejected": -2.202528476715088, + "logps/chosen": -288.19866943359375, + "logps/rejected": -270.3172912597656, + "loss": 0.4292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6939470171928406, + "rewards/margins": 1.1564185619354248, + "rewards/rejected": -1.8503656387329102, + "step": 5131 + }, + { + "epoch": 0.59, + "learning_rate": 1.2438253540910687e-07, + "logits/chosen": -2.615220069885254, + "logits/rejected": -2.4847590923309326, + "logps/chosen": -176.73583984375, + "logps/rejected": -286.228515625, + "loss": 0.2606, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8624539375305176, + "rewards/margins": 2.9930126667022705, + "rewards/rejected": -3.855466604232788, + "step": 5132 + }, + { + "epoch": 0.59, + "learning_rate": 1.243474189394826e-07, + "logits/chosen": -2.0282747745513916, + "logits/rejected": -2.140918731689453, + "logps/chosen": -326.58111572265625, + "logps/rejected": -304.451171875, + "loss": 0.342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.707578182220459, + "rewards/margins": 2.0057473182678223, + "rewards/rejected": -2.7133255004882812, + "step": 5133 + }, + { + "epoch": 0.59, + "learning_rate": 1.2431230246985835e-07, + "logits/chosen": -1.950026512145996, + "logits/rejected": -2.11653733253479, + "logps/chosen": -240.01109313964844, + "logps/rejected": -167.6190643310547, + "loss": 0.8385, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2191449403762817, + "rewards/margins": 1.3772432804107666, + "rewards/rejected": -2.596388101577759, + "step": 5134 + }, + { + "epoch": 0.59, + "learning_rate": 1.242771860002341e-07, + "logits/chosen": -2.5056300163269043, + "logits/rejected": -2.5870871543884277, + "logps/chosen": -237.485107421875, + "logps/rejected": -334.32122802734375, + "loss": 0.1299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.854702353477478, + "rewards/margins": 4.4082465171813965, + "rewards/rejected": -5.262948989868164, + "step": 5135 + }, + { + "epoch": 0.59, + "learning_rate": 1.2424206953060986e-07, + "logits/chosen": -1.874045968055725, + "logits/rejected": -2.147028923034668, + "logps/chosen": -285.1470947265625, + "logps/rejected": -253.89935302734375, + "loss": 0.5032, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2744590640068054, + "rewards/margins": 1.6870108842849731, + "rewards/rejected": -1.9614697694778442, + "step": 5136 + }, + { + "epoch": 0.59, + "learning_rate": 1.242069530609856e-07, + "logits/chosen": -2.091704845428467, + "logits/rejected": -2.375, + "logps/chosen": -411.34765625, + "logps/rejected": -252.51246643066406, + "loss": 0.2052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24212665855884552, + "rewards/margins": 3.350468635559082, + "rewards/rejected": -3.592595100402832, + "step": 5137 + }, + { + "epoch": 0.59, + "learning_rate": 1.2417183659136134e-07, + "logits/chosen": -2.5846571922302246, + "logits/rejected": -2.6718215942382812, + "logps/chosen": -92.38253784179688, + "logps/rejected": -188.41522216796875, + "loss": 0.2714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28671765327453613, + "rewards/margins": 3.2121188640594482, + "rewards/rejected": -3.4988365173339844, + "step": 5138 + }, + { + "epoch": 0.59, + "learning_rate": 1.241367201217371e-07, + "logits/chosen": -2.3982646465301514, + "logits/rejected": -2.3336033821105957, + "logps/chosen": -296.39642333984375, + "logps/rejected": -203.48306274414062, + "loss": 0.4053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8845977783203125, + "rewards/margins": 1.2847282886505127, + "rewards/rejected": -2.169325828552246, + "step": 5139 + }, + { + "epoch": 0.59, + "learning_rate": 1.2410160365211282e-07, + "logits/chosen": -2.5075368881225586, + "logits/rejected": -2.707899570465088, + "logps/chosen": -310.703857421875, + "logps/rejected": -258.4132080078125, + "loss": 0.2911, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.032081961631775, + "rewards/margins": 4.060009002685547, + "rewards/rejected": -5.0920915603637695, + "step": 5140 + }, + { + "epoch": 0.59, + "learning_rate": 1.2406648718248858e-07, + "logits/chosen": -2.090358018875122, + "logits/rejected": -2.453794240951538, + "logps/chosen": -437.66473388671875, + "logps/rejected": -260.0801086425781, + "loss": 1.1455, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.089050531387329, + "rewards/margins": 0.8948805332183838, + "rewards/rejected": -2.983931064605713, + "step": 5141 + }, + { + "epoch": 0.59, + "learning_rate": 1.2403137071286433e-07, + "logits/chosen": -2.696605682373047, + "logits/rejected": -2.6777095794677734, + "logps/chosen": -136.05850219726562, + "logps/rejected": -201.76461791992188, + "loss": 0.3899, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41822171211242676, + "rewards/margins": 1.857213020324707, + "rewards/rejected": -2.275434732437134, + "step": 5142 + }, + { + "epoch": 0.59, + "learning_rate": 1.2399625424324008e-07, + "logits/chosen": -1.780404806137085, + "logits/rejected": -1.938934087753296, + "logps/chosen": -592.4853515625, + "logps/rejected": -346.6983642578125, + "loss": 0.8287, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2128369808197021, + "rewards/margins": 0.5134589672088623, + "rewards/rejected": -1.7262959480285645, + "step": 5143 + }, + { + "epoch": 0.59, + "learning_rate": 1.239611377736158e-07, + "logits/chosen": -1.9078617095947266, + "logits/rejected": -2.3567190170288086, + "logps/chosen": -218.1095428466797, + "logps/rejected": -193.9951171875, + "loss": 0.6562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2673548460006714, + "rewards/margins": 1.3581857681274414, + "rewards/rejected": -2.6255407333374023, + "step": 5144 + }, + { + "epoch": 0.59, + "learning_rate": 1.2392602130399157e-07, + "logits/chosen": -1.9368637800216675, + "logits/rejected": -1.7784193754196167, + "logps/chosen": -178.66281127929688, + "logps/rejected": -309.4040222167969, + "loss": 0.1827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8921229839324951, + "rewards/margins": 3.462345600128174, + "rewards/rejected": -4.354468822479248, + "step": 5145 + }, + { + "epoch": 0.59, + "learning_rate": 1.238909048343673e-07, + "logits/chosen": -1.7279480695724487, + "logits/rejected": -1.8221501111984253, + "logps/chosen": -216.55160522460938, + "logps/rejected": -214.96778869628906, + "loss": 0.5919, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2229304313659668, + "rewards/margins": 1.4487677812576294, + "rewards/rejected": -2.6716980934143066, + "step": 5146 + }, + { + "epoch": 0.59, + "learning_rate": 1.2385578836474307e-07, + "logits/chosen": -2.6048340797424316, + "logits/rejected": -2.421171188354492, + "logps/chosen": -300.5141906738281, + "logps/rejected": -361.6895751953125, + "loss": 0.2731, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7769687175750732, + "rewards/margins": 2.897383213043213, + "rewards/rejected": -4.674351692199707, + "step": 5147 + }, + { + "epoch": 0.59, + "learning_rate": 1.238206718951188e-07, + "logits/chosen": -2.227670907974243, + "logits/rejected": -2.1775946617126465, + "logps/chosen": -199.71180725097656, + "logps/rejected": -241.11990356445312, + "loss": 1.2781, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7715758085250854, + "rewards/margins": 0.2586686611175537, + "rewards/rejected": -2.0302443504333496, + "step": 5148 + }, + { + "epoch": 0.59, + "learning_rate": 1.2378555542549455e-07, + "logits/chosen": -1.6177904605865479, + "logits/rejected": -2.0534095764160156, + "logps/chosen": -462.7349853515625, + "logps/rejected": -239.46905517578125, + "loss": 0.83, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15387044847011566, + "rewards/margins": 0.5341845154762268, + "rewards/rejected": -0.6880549788475037, + "step": 5149 + }, + { + "epoch": 0.59, + "learning_rate": 1.2375043895587028e-07, + "logits/chosen": -1.9017333984375, + "logits/rejected": -2.2422773838043213, + "logps/chosen": -399.6514892578125, + "logps/rejected": -318.65545654296875, + "loss": 0.5962, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2382564544677734, + "rewards/margins": 1.4600154161453247, + "rewards/rejected": -2.6982717514038086, + "step": 5150 + }, + { + "epoch": 0.59, + "learning_rate": 1.2371532248624604e-07, + "logits/chosen": -2.9365084171295166, + "logits/rejected": -2.8425605297088623, + "logps/chosen": -295.3229064941406, + "logps/rejected": -514.5519409179688, + "loss": 0.3159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8633571863174438, + "rewards/margins": 2.5588884353637695, + "rewards/rejected": -3.422245502471924, + "step": 5151 + }, + { + "epoch": 0.59, + "learning_rate": 1.236802060166218e-07, + "logits/chosen": -2.24055552482605, + "logits/rejected": -2.347752094268799, + "logps/chosen": -358.7254333496094, + "logps/rejected": -337.4771423339844, + "loss": 0.3895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3265365958213806, + "rewards/margins": 2.159018039703369, + "rewards/rejected": -2.4855544567108154, + "step": 5152 + }, + { + "epoch": 0.59, + "learning_rate": 1.2364508954699754e-07, + "logits/chosen": -1.7178153991699219, + "logits/rejected": -2.3025779724121094, + "logps/chosen": -452.73321533203125, + "logps/rejected": -209.96380615234375, + "loss": 0.1983, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8129291534423828, + "rewards/margins": 2.5913026332855225, + "rewards/rejected": -3.4042320251464844, + "step": 5153 + }, + { + "epoch": 0.59, + "learning_rate": 1.2360997307737327e-07, + "logits/chosen": -2.749083995819092, + "logits/rejected": -2.675489902496338, + "logps/chosen": -275.922607421875, + "logps/rejected": -301.83856201171875, + "loss": 0.503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8006986379623413, + "rewards/margins": 3.487687110900879, + "rewards/rejected": -4.28838586807251, + "step": 5154 + }, + { + "epoch": 0.59, + "learning_rate": 1.2357485660774903e-07, + "logits/chosen": -2.589409351348877, + "logits/rejected": -2.64516544342041, + "logps/chosen": -167.62681579589844, + "logps/rejected": -191.62887573242188, + "loss": 0.3434, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5575075745582581, + "rewards/margins": 2.086587429046631, + "rewards/rejected": -2.644094944000244, + "step": 5155 + }, + { + "epoch": 0.59, + "learning_rate": 1.2353974013812478e-07, + "logits/chosen": -2.5678393840789795, + "logits/rejected": -2.4236602783203125, + "logps/chosen": -210.91412353515625, + "logps/rejected": -298.2120666503906, + "loss": 0.1375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31455278396606445, + "rewards/margins": 3.19486665725708, + "rewards/rejected": -3.5094194412231445, + "step": 5156 + }, + { + "epoch": 0.59, + "learning_rate": 1.235046236685005e-07, + "logits/chosen": -1.9158374071121216, + "logits/rejected": -2.00555682182312, + "logps/chosen": -326.3720703125, + "logps/rejected": -377.086181640625, + "loss": 0.5825, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4106953144073486, + "rewards/margins": 1.3564478158950806, + "rewards/rejected": -2.7671430110931396, + "step": 5157 + }, + { + "epoch": 0.59, + "learning_rate": 1.2346950719887626e-07, + "logits/chosen": -2.068728446960449, + "logits/rejected": -2.019965648651123, + "logps/chosen": -207.36607360839844, + "logps/rejected": -273.4039001464844, + "loss": 0.6072, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0175827741622925, + "rewards/margins": 1.2615989446640015, + "rewards/rejected": -2.279181957244873, + "step": 5158 + }, + { + "epoch": 0.59, + "learning_rate": 1.2343439072925201e-07, + "logits/chosen": -2.4359214305877686, + "logits/rejected": -2.4668350219726562, + "logps/chosen": -239.3494873046875, + "logps/rejected": -357.60382080078125, + "loss": 0.1814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2558833956718445, + "rewards/margins": 3.1426303386688232, + "rewards/rejected": -3.3985137939453125, + "step": 5159 + }, + { + "epoch": 0.59, + "learning_rate": 1.2339927425962777e-07, + "logits/chosen": -2.681244134902954, + "logits/rejected": -2.7137205600738525, + "logps/chosen": -292.6919860839844, + "logps/rejected": -198.5551300048828, + "loss": 0.2009, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6245408654212952, + "rewards/margins": 2.9560766220092773, + "rewards/rejected": -3.5806174278259277, + "step": 5160 + }, + { + "epoch": 0.59, + "learning_rate": 1.233641577900035e-07, + "logits/chosen": -2.39142107963562, + "logits/rejected": -2.281585454940796, + "logps/chosen": -418.9163818359375, + "logps/rejected": -379.4627380371094, + "loss": 0.2682, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9188500642776489, + "rewards/margins": 3.262805223464966, + "rewards/rejected": -4.181654930114746, + "step": 5161 + }, + { + "epoch": 0.6, + "learning_rate": 1.2332904132037925e-07, + "logits/chosen": -2.2296249866485596, + "logits/rejected": -2.135780096054077, + "logps/chosen": -271.004638671875, + "logps/rejected": -357.5638732910156, + "loss": 0.7506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7952907085418701, + "rewards/margins": 2.8891568183898926, + "rewards/rejected": -3.684447765350342, + "step": 5162 + }, + { + "epoch": 0.6, + "learning_rate": 1.23293924850755e-07, + "logits/chosen": -2.2653346061706543, + "logits/rejected": -2.1266725063323975, + "logps/chosen": -228.70687866210938, + "logps/rejected": -249.35092163085938, + "loss": 0.5613, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6733371615409851, + "rewards/margins": 1.4429676532745361, + "rewards/rejected": -2.116304874420166, + "step": 5163 + }, + { + "epoch": 0.6, + "learning_rate": 1.2325880838113076e-07, + "logits/chosen": -2.1807103157043457, + "logits/rejected": -2.1609017848968506, + "logps/chosen": -275.4418029785156, + "logps/rejected": -302.79730224609375, + "loss": 0.228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8006897568702698, + "rewards/margins": 3.915034532546997, + "rewards/rejected": -4.715724945068359, + "step": 5164 + }, + { + "epoch": 0.6, + "learning_rate": 1.2322369191150648e-07, + "logits/chosen": -2.904268741607666, + "logits/rejected": -2.7383384704589844, + "logps/chosen": -315.7228088378906, + "logps/rejected": -256.6669921875, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7686352729797363, + "rewards/margins": 2.451629638671875, + "rewards/rejected": -3.2202649116516113, + "step": 5165 + }, + { + "epoch": 0.6, + "learning_rate": 1.2318857544188224e-07, + "logits/chosen": -2.5746445655822754, + "logits/rejected": -2.490981340408325, + "logps/chosen": -183.89483642578125, + "logps/rejected": -234.3798065185547, + "loss": 0.1322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3067685067653656, + "rewards/margins": 2.9540605545043945, + "rewards/rejected": -3.260828971862793, + "step": 5166 + }, + { + "epoch": 0.6, + "learning_rate": 1.23153458972258e-07, + "logits/chosen": -2.1206610202789307, + "logits/rejected": -2.2295591831207275, + "logps/chosen": -252.8957977294922, + "logps/rejected": -213.17340087890625, + "loss": 0.2513, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4557983875274658, + "rewards/margins": 1.780531644821167, + "rewards/rejected": -3.236330032348633, + "step": 5167 + }, + { + "epoch": 0.6, + "learning_rate": 1.2311834250263372e-07, + "logits/chosen": -2.0202877521514893, + "logits/rejected": -2.106490135192871, + "logps/chosen": -282.8031921386719, + "logps/rejected": -285.5933837890625, + "loss": 0.2205, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3999204635620117, + "rewards/margins": 3.770336151123047, + "rewards/rejected": -5.170256614685059, + "step": 5168 + }, + { + "epoch": 0.6, + "learning_rate": 1.2308322603300947e-07, + "logits/chosen": -2.854705810546875, + "logits/rejected": -2.846405506134033, + "logps/chosen": -345.3142395019531, + "logps/rejected": -287.8104553222656, + "loss": 0.5454, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4856873750686646, + "rewards/margins": 1.7996091842651367, + "rewards/rejected": -3.285296678543091, + "step": 5169 + }, + { + "epoch": 0.6, + "learning_rate": 1.2304810956338523e-07, + "logits/chosen": -1.9483678340911865, + "logits/rejected": -1.9771075248718262, + "logps/chosen": -328.80731201171875, + "logps/rejected": -442.8706970214844, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1436113715171814, + "rewards/margins": 2.6787381172180176, + "rewards/rejected": -2.8223495483398438, + "step": 5170 + }, + { + "epoch": 0.6, + "learning_rate": 1.2301299309376098e-07, + "logits/chosen": -2.859274387359619, + "logits/rejected": -2.8702940940856934, + "logps/chosen": -185.3823699951172, + "logps/rejected": -204.55532836914062, + "loss": 0.1822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4873325824737549, + "rewards/margins": 3.286703586578369, + "rewards/rejected": -3.774036407470703, + "step": 5171 + }, + { + "epoch": 0.6, + "learning_rate": 1.229778766241367e-07, + "logits/chosen": -2.300887107849121, + "logits/rejected": -2.677082061767578, + "logps/chosen": -360.12322998046875, + "logps/rejected": -363.46844482421875, + "loss": 0.461, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6072813272476196, + "rewards/margins": 1.514148473739624, + "rewards/rejected": -3.121429920196533, + "step": 5172 + }, + { + "epoch": 0.6, + "learning_rate": 1.2294276015451246e-07, + "logits/chosen": -2.3325791358947754, + "logits/rejected": -2.2212486267089844, + "logps/chosen": -303.351318359375, + "logps/rejected": -351.7943115234375, + "loss": 0.1362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3304781913757324, + "rewards/margins": 3.6936726570129395, + "rewards/rejected": -4.024150848388672, + "step": 5173 + }, + { + "epoch": 0.6, + "learning_rate": 1.229076436848882e-07, + "logits/chosen": -2.644279718399048, + "logits/rejected": -2.6395020484924316, + "logps/chosen": -164.55520629882812, + "logps/rejected": -219.93975830078125, + "loss": 0.2599, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7759906053543091, + "rewards/margins": 2.6402740478515625, + "rewards/rejected": -3.4162650108337402, + "step": 5174 + }, + { + "epoch": 0.6, + "learning_rate": 1.2287252721526397e-07, + "logits/chosen": -1.9369186162948608, + "logits/rejected": -2.313844680786133, + "logps/chosen": -328.238525390625, + "logps/rejected": -221.26602172851562, + "loss": 0.4079, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5136460065841675, + "rewards/margins": 1.3272674083709717, + "rewards/rejected": -2.8409132957458496, + "step": 5175 + }, + { + "epoch": 0.6, + "learning_rate": 1.228374107456397e-07, + "logits/chosen": -2.6449296474456787, + "logits/rejected": -2.696863889694214, + "logps/chosen": -167.51806640625, + "logps/rejected": -331.23687744140625, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9290230870246887, + "rewards/margins": 7.056583404541016, + "rewards/rejected": -7.985606670379639, + "step": 5176 + }, + { + "epoch": 0.6, + "learning_rate": 1.2280229427601545e-07, + "logits/chosen": -2.216654062271118, + "logits/rejected": -2.4370265007019043, + "logps/chosen": -292.9771728515625, + "logps/rejected": -275.7171630859375, + "loss": 0.274, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1639567613601685, + "rewards/margins": 1.6648913621902466, + "rewards/rejected": -2.828847885131836, + "step": 5177 + }, + { + "epoch": 0.6, + "learning_rate": 1.2276717780639118e-07, + "logits/chosen": -1.835982084274292, + "logits/rejected": -2.0004873275756836, + "logps/chosen": -318.2358703613281, + "logps/rejected": -308.0141906738281, + "loss": 0.1174, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2168457508087158, + "rewards/margins": 3.809075355529785, + "rewards/rejected": -5.025920867919922, + "step": 5178 + }, + { + "epoch": 0.6, + "learning_rate": 1.2273206133676693e-07, + "logits/chosen": -2.6088693141937256, + "logits/rejected": -2.6211931705474854, + "logps/chosen": -215.50091552734375, + "logps/rejected": -261.0126953125, + "loss": 0.2399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4203895032405853, + "rewards/margins": 3.450970411300659, + "rewards/rejected": -3.8713598251342773, + "step": 5179 + }, + { + "epoch": 0.6, + "learning_rate": 1.2269694486714269e-07, + "logits/chosen": -2.468967914581299, + "logits/rejected": -2.6494998931884766, + "logps/chosen": -343.83868408203125, + "logps/rejected": -346.67059326171875, + "loss": 0.1628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5499376058578491, + "rewards/margins": 2.6903936862945557, + "rewards/rejected": -3.2403314113616943, + "step": 5180 + }, + { + "epoch": 0.6, + "learning_rate": 1.2266182839751844e-07, + "logits/chosen": -2.492772102355957, + "logits/rejected": -2.3085713386535645, + "logps/chosen": -339.8787841796875, + "logps/rejected": -276.60784912109375, + "loss": 0.33, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2374653816223145, + "rewards/margins": 1.8412747383117676, + "rewards/rejected": -3.078739881515503, + "step": 5181 + }, + { + "epoch": 0.6, + "learning_rate": 1.2262671192789417e-07, + "logits/chosen": -2.4545230865478516, + "logits/rejected": -2.429392099380493, + "logps/chosen": -246.10389709472656, + "logps/rejected": -200.56289672851562, + "loss": 0.204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9083758592605591, + "rewards/margins": 3.0442256927490234, + "rewards/rejected": -3.952601432800293, + "step": 5182 + }, + { + "epoch": 0.6, + "learning_rate": 1.2259159545826992e-07, + "logits/chosen": -2.176595687866211, + "logits/rejected": -2.1990065574645996, + "logps/chosen": -375.9437255859375, + "logps/rejected": -337.3277893066406, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03217964246869087, + "rewards/margins": 3.3091862201690674, + "rewards/rejected": -3.3413660526275635, + "step": 5183 + }, + { + "epoch": 0.6, + "learning_rate": 1.2255647898864568e-07, + "logits/chosen": -2.648756742477417, + "logits/rejected": -2.4714226722717285, + "logps/chosen": -239.91748046875, + "logps/rejected": -268.8445739746094, + "loss": 0.5261, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1906429529190063, + "rewards/margins": 1.2129254341125488, + "rewards/rejected": -2.4035682678222656, + "step": 5184 + }, + { + "epoch": 0.6, + "learning_rate": 1.225213625190214e-07, + "logits/chosen": -2.0781736373901367, + "logits/rejected": -2.103604793548584, + "logps/chosen": -160.45773315429688, + "logps/rejected": -342.7168884277344, + "loss": 0.2023, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7629266977310181, + "rewards/margins": 4.385422706604004, + "rewards/rejected": -5.148349285125732, + "step": 5185 + }, + { + "epoch": 0.6, + "learning_rate": 1.2248624604939716e-07, + "logits/chosen": -2.483414649963379, + "logits/rejected": -2.8965883255004883, + "logps/chosen": -317.0429382324219, + "logps/rejected": -286.65643310546875, + "loss": 0.7117, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0839459896087646, + "rewards/margins": 1.3677282333374023, + "rewards/rejected": -2.451674222946167, + "step": 5186 + }, + { + "epoch": 0.6, + "learning_rate": 1.224511295797729e-07, + "logits/chosen": -2.184540271759033, + "logits/rejected": -1.9244441986083984, + "logps/chosen": -252.08596801757812, + "logps/rejected": -367.0909118652344, + "loss": 0.4021, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4876502752304077, + "rewards/margins": 1.8485150337219238, + "rewards/rejected": -3.336165428161621, + "step": 5187 + }, + { + "epoch": 0.6, + "learning_rate": 1.2241601311014866e-07, + "logits/chosen": -2.7744386196136475, + "logits/rejected": -2.6728076934814453, + "logps/chosen": -111.94705963134766, + "logps/rejected": -199.50732421875, + "loss": 0.2158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02448268234729767, + "rewards/margins": 2.044090986251831, + "rewards/rejected": -2.01960825920105, + "step": 5188 + }, + { + "epoch": 0.6, + "learning_rate": 1.223808966405244e-07, + "logits/chosen": -1.8531006574630737, + "logits/rejected": -1.7040563821792603, + "logps/chosen": -303.5675354003906, + "logps/rejected": -288.20361328125, + "loss": 0.3867, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2759757041931152, + "rewards/margins": 1.7941768169403076, + "rewards/rejected": -3.0701522827148438, + "step": 5189 + }, + { + "epoch": 0.6, + "learning_rate": 1.2234578017090015e-07, + "logits/chosen": -2.7240662574768066, + "logits/rejected": -2.730273723602295, + "logps/chosen": -370.37628173828125, + "logps/rejected": -298.47247314453125, + "loss": 0.3036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5516933798789978, + "rewards/margins": 2.0259647369384766, + "rewards/rejected": -2.5776584148406982, + "step": 5190 + }, + { + "epoch": 0.6, + "learning_rate": 1.2231066370127587e-07, + "logits/chosen": -2.5696680545806885, + "logits/rejected": -2.415837526321411, + "logps/chosen": -255.31192016601562, + "logps/rejected": -268.72418212890625, + "loss": 0.2529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5342772006988525, + "rewards/margins": 2.3319931030273438, + "rewards/rejected": -2.866270065307617, + "step": 5191 + }, + { + "epoch": 0.6, + "learning_rate": 1.2227554723165165e-07, + "logits/chosen": -2.09395432472229, + "logits/rejected": -2.0059680938720703, + "logps/chosen": -337.22314453125, + "logps/rejected": -288.612060546875, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5751586556434631, + "rewards/margins": 1.2227940559387207, + "rewards/rejected": -1.797952651977539, + "step": 5192 + }, + { + "epoch": 0.6, + "learning_rate": 1.2224043076202738e-07, + "logits/chosen": -1.9058895111083984, + "logits/rejected": -1.9271063804626465, + "logps/chosen": -374.3997802734375, + "logps/rejected": -322.4251403808594, + "loss": 0.2823, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6594944000244141, + "rewards/margins": 2.1691057682037354, + "rewards/rejected": -2.8286001682281494, + "step": 5193 + }, + { + "epoch": 0.6, + "learning_rate": 1.2220531429240313e-07, + "logits/chosen": -2.940429210662842, + "logits/rejected": -2.9379642009735107, + "logps/chosen": -233.6118927001953, + "logps/rejected": -247.5408477783203, + "loss": 0.2344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5185675621032715, + "rewards/margins": 3.2841978073120117, + "rewards/rejected": -3.802765369415283, + "step": 5194 + }, + { + "epoch": 0.6, + "learning_rate": 1.2217019782277886e-07, + "logits/chosen": -2.194718360900879, + "logits/rejected": -2.583632230758667, + "logps/chosen": -288.70416259765625, + "logps/rejected": -200.98724365234375, + "loss": 0.2998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47163793444633484, + "rewards/margins": 1.8908522129058838, + "rewards/rejected": -2.362490177154541, + "step": 5195 + }, + { + "epoch": 0.6, + "learning_rate": 1.2213508135315462e-07, + "logits/chosen": -2.6345930099487305, + "logits/rejected": -2.604592800140381, + "logps/chosen": -340.34075927734375, + "logps/rejected": -259.3560485839844, + "loss": 0.2015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.378653883934021, + "rewards/margins": 2.1100988388061523, + "rewards/rejected": -3.488752841949463, + "step": 5196 + }, + { + "epoch": 0.6, + "learning_rate": 1.2209996488353037e-07, + "logits/chosen": -2.4384965896606445, + "logits/rejected": -2.4726877212524414, + "logps/chosen": -183.4750518798828, + "logps/rejected": -310.20068359375, + "loss": 1.0173, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1092658042907715, + "rewards/margins": 1.067136287689209, + "rewards/rejected": -4.1764020919799805, + "step": 5197 + }, + { + "epoch": 0.6, + "learning_rate": 1.2206484841390612e-07, + "logits/chosen": -2.069547653198242, + "logits/rejected": -1.9256432056427002, + "logps/chosen": -275.7503356933594, + "logps/rejected": -323.43914794921875, + "loss": 0.2637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9489201307296753, + "rewards/margins": 1.5064735412597656, + "rewards/rejected": -2.4553935527801514, + "step": 5198 + }, + { + "epoch": 0.6, + "learning_rate": 1.2202973194428185e-07, + "logits/chosen": -2.1734700202941895, + "logits/rejected": -2.3355813026428223, + "logps/chosen": -306.5966796875, + "logps/rejected": -323.00335693359375, + "loss": 0.1703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3053332567214966, + "rewards/margins": 3.5915701389312744, + "rewards/rejected": -3.8969032764434814, + "step": 5199 + }, + { + "epoch": 0.6, + "learning_rate": 1.219946154746576e-07, + "logits/chosen": -1.9309444427490234, + "logits/rejected": -2.0777089595794678, + "logps/chosen": -276.77947998046875, + "logps/rejected": -330.1509094238281, + "loss": 0.6118, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0376700162887573, + "rewards/margins": 1.575904369354248, + "rewards/rejected": -2.613574504852295, + "step": 5200 + }, + { + "epoch": 0.6, + "learning_rate": 1.2195949900503336e-07, + "logits/chosen": -2.0091054439544678, + "logits/rejected": -1.6635370254516602, + "logps/chosen": -347.8858642578125, + "logps/rejected": -327.39544677734375, + "loss": 0.4665, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8849701881408691, + "rewards/margins": 1.077707052230835, + "rewards/rejected": -1.9626773595809937, + "step": 5201 + }, + { + "epoch": 0.6, + "learning_rate": 1.2192438253540909e-07, + "logits/chosen": -2.2411611080169678, + "logits/rejected": -2.3237147331237793, + "logps/chosen": -211.63427734375, + "logps/rejected": -175.0221405029297, + "loss": 0.3662, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9538352489471436, + "rewards/margins": 2.460172653198242, + "rewards/rejected": -3.4140076637268066, + "step": 5202 + }, + { + "epoch": 0.6, + "learning_rate": 1.2188926606578484e-07, + "logits/chosen": -2.222052574157715, + "logits/rejected": -2.600879669189453, + "logps/chosen": -366.32928466796875, + "logps/rejected": -309.24188232421875, + "loss": 0.7955, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5853102207183838, + "rewards/margins": 0.8275713920593262, + "rewards/rejected": -2.412881851196289, + "step": 5203 + }, + { + "epoch": 0.6, + "learning_rate": 1.218541495961606e-07, + "logits/chosen": -2.833378314971924, + "logits/rejected": -2.8512301445007324, + "logps/chosen": -133.1283416748047, + "logps/rejected": -214.99160766601562, + "loss": 0.3871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7257417440414429, + "rewards/margins": 2.144832134246826, + "rewards/rejected": -2.8705739974975586, + "step": 5204 + }, + { + "epoch": 0.6, + "learning_rate": 1.2181903312653635e-07, + "logits/chosen": -2.237027406692505, + "logits/rejected": -1.9865663051605225, + "logps/chosen": -206.0262451171875, + "logps/rejected": -302.5347900390625, + "loss": 0.3078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7057797908782959, + "rewards/margins": 2.525073528289795, + "rewards/rejected": -3.230853319168091, + "step": 5205 + }, + { + "epoch": 0.6, + "learning_rate": 1.2178391665691207e-07, + "logits/chosen": -2.3175699710845947, + "logits/rejected": -2.1307525634765625, + "logps/chosen": -268.21722412109375, + "logps/rejected": -328.606689453125, + "loss": 0.2133, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5757997035980225, + "rewards/margins": 2.3369712829589844, + "rewards/rejected": -2.912771224975586, + "step": 5206 + }, + { + "epoch": 0.6, + "learning_rate": 1.2174880018728783e-07, + "logits/chosen": -1.7185391187667847, + "logits/rejected": -2.2735557556152344, + "logps/chosen": -486.3223571777344, + "logps/rejected": -303.5440673828125, + "loss": 0.1725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.048888809978961945, + "rewards/margins": 2.3446462154388428, + "rewards/rejected": -2.3935348987579346, + "step": 5207 + }, + { + "epoch": 0.6, + "learning_rate": 1.2171368371766358e-07, + "logits/chosen": -1.9850454330444336, + "logits/rejected": -2.2834439277648926, + "logps/chosen": -332.5591125488281, + "logps/rejected": -344.57440185546875, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1043325662612915, + "rewards/margins": 2.127436876296997, + "rewards/rejected": -3.231769561767578, + "step": 5208 + }, + { + "epoch": 0.6, + "learning_rate": 1.2167856724803934e-07, + "logits/chosen": -2.621694326400757, + "logits/rejected": -2.5818424224853516, + "logps/chosen": -270.099853515625, + "logps/rejected": -290.132568359375, + "loss": 0.2198, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09936569631099701, + "rewards/margins": 2.1043758392333984, + "rewards/rejected": -2.2037413120269775, + "step": 5209 + }, + { + "epoch": 0.6, + "learning_rate": 1.2164345077841506e-07, + "logits/chosen": -2.0916266441345215, + "logits/rejected": -2.0750885009765625, + "logps/chosen": -445.91949462890625, + "logps/rejected": -427.98870849609375, + "loss": 0.2455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5585242509841919, + "rewards/margins": 2.693469524383545, + "rewards/rejected": -3.2519941329956055, + "step": 5210 + }, + { + "epoch": 0.6, + "learning_rate": 1.2160833430879082e-07, + "logits/chosen": -2.310852527618408, + "logits/rejected": -2.3260505199432373, + "logps/chosen": -262.6798400878906, + "logps/rejected": -266.74462890625, + "loss": 0.3916, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1966320276260376, + "rewards/margins": 1.6201815605163574, + "rewards/rejected": -2.8168134689331055, + "step": 5211 + }, + { + "epoch": 0.6, + "learning_rate": 1.2157321783916657e-07, + "logits/chosen": -2.2948107719421387, + "logits/rejected": -2.2282631397247314, + "logps/chosen": -146.59469604492188, + "logps/rejected": -253.87789916992188, + "loss": 0.5886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7316873073577881, + "rewards/margins": 1.3745359182357788, + "rewards/rejected": -2.1062231063842773, + "step": 5212 + }, + { + "epoch": 0.6, + "learning_rate": 1.215381013695423e-07, + "logits/chosen": -2.0087313652038574, + "logits/rejected": -2.4511146545410156, + "logps/chosen": -277.9221496582031, + "logps/rejected": -175.3979034423828, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4101524353027344, + "rewards/margins": 2.133254051208496, + "rewards/rejected": -2.5434064865112305, + "step": 5213 + }, + { + "epoch": 0.6, + "learning_rate": 1.2150298489991805e-07, + "logits/chosen": -2.190732002258301, + "logits/rejected": -2.372589111328125, + "logps/chosen": -404.156494140625, + "logps/rejected": -371.25531005859375, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19079196453094482, + "rewards/margins": 3.5477795600891113, + "rewards/rejected": -3.7385711669921875, + "step": 5214 + }, + { + "epoch": 0.6, + "learning_rate": 1.214678684302938e-07, + "logits/chosen": -2.4201388359069824, + "logits/rejected": -2.5551795959472656, + "logps/chosen": -163.76724243164062, + "logps/rejected": -132.94094848632812, + "loss": 0.2122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21360602974891663, + "rewards/margins": 2.1918013095855713, + "rewards/rejected": -2.405407428741455, + "step": 5215 + }, + { + "epoch": 0.6, + "learning_rate": 1.2143275196066956e-07, + "logits/chosen": -2.098097801208496, + "logits/rejected": -2.048813819885254, + "logps/chosen": -298.58880615234375, + "logps/rejected": -266.958251953125, + "loss": 0.3656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5496091842651367, + "rewards/margins": 1.262948751449585, + "rewards/rejected": -1.8125580549240112, + "step": 5216 + }, + { + "epoch": 0.6, + "learning_rate": 1.213976354910453e-07, + "logits/chosen": -2.4774527549743652, + "logits/rejected": -2.6650390625, + "logps/chosen": -371.76116943359375, + "logps/rejected": -241.71221923828125, + "loss": 0.402, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.471440315246582, + "rewards/margins": 1.0454483032226562, + "rewards/rejected": -2.5168886184692383, + "step": 5217 + }, + { + "epoch": 0.6, + "learning_rate": 1.2136251902142104e-07, + "logits/chosen": -1.7471857070922852, + "logits/rejected": -2.0874688625335693, + "logps/chosen": -394.54595947265625, + "logps/rejected": -266.8530578613281, + "loss": 0.342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09939515590667725, + "rewards/margins": 1.860026478767395, + "rewards/rejected": -1.7606312036514282, + "step": 5218 + }, + { + "epoch": 0.6, + "learning_rate": 1.2132740255179677e-07, + "logits/chosen": -2.0862958431243896, + "logits/rejected": -2.0407118797302246, + "logps/chosen": -460.28485107421875, + "logps/rejected": -406.255615234375, + "loss": 0.2869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40683358907699585, + "rewards/margins": 2.5523383617401123, + "rewards/rejected": -2.959171772003174, + "step": 5219 + }, + { + "epoch": 0.6, + "learning_rate": 1.2129228608217255e-07, + "logits/chosen": -2.5354504585266113, + "logits/rejected": -2.459247589111328, + "logps/chosen": -236.55386352539062, + "logps/rejected": -338.2970275878906, + "loss": 0.9466, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8753477931022644, + "rewards/margins": 1.0987513065338135, + "rewards/rejected": -1.9740991592407227, + "step": 5220 + }, + { + "epoch": 0.6, + "learning_rate": 1.2125716961254828e-07, + "logits/chosen": -2.462865114212036, + "logits/rejected": -2.2552382946014404, + "logps/chosen": -373.8764343261719, + "logps/rejected": -342.0340576171875, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0627155303955078, + "rewards/margins": 1.5084869861602783, + "rewards/rejected": -2.571202516555786, + "step": 5221 + }, + { + "epoch": 0.6, + "learning_rate": 1.2122205314292403e-07, + "logits/chosen": -2.248605728149414, + "logits/rejected": -2.387946844100952, + "logps/chosen": -327.03851318359375, + "logps/rejected": -302.9496154785156, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9471202492713928, + "rewards/margins": 2.39349365234375, + "rewards/rejected": -3.340614080429077, + "step": 5222 + }, + { + "epoch": 0.6, + "learning_rate": 1.2118693667329976e-07, + "logits/chosen": -2.604902505874634, + "logits/rejected": -2.313676357269287, + "logps/chosen": -136.00608825683594, + "logps/rejected": -217.42364501953125, + "loss": 0.2095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.339271903038025, + "rewards/margins": 2.565232038497925, + "rewards/rejected": -3.9045040607452393, + "step": 5223 + }, + { + "epoch": 0.6, + "learning_rate": 1.2115182020367554e-07, + "logits/chosen": -2.632218360900879, + "logits/rejected": -2.9011216163635254, + "logps/chosen": -291.9913330078125, + "logps/rejected": -225.13433837890625, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5422577261924744, + "rewards/margins": 3.308408498764038, + "rewards/rejected": -3.850666046142578, + "step": 5224 + }, + { + "epoch": 0.6, + "learning_rate": 1.2111670373405127e-07, + "logits/chosen": -2.5846445560455322, + "logits/rejected": -2.6328372955322266, + "logps/chosen": -247.58912658691406, + "logps/rejected": -230.64662170410156, + "loss": 0.2246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6849974393844604, + "rewards/margins": 2.3557181358337402, + "rewards/rejected": -3.040715456008911, + "step": 5225 + }, + { + "epoch": 0.6, + "learning_rate": 1.2108158726442702e-07, + "logits/chosen": -2.2171642780303955, + "logits/rejected": -2.171893835067749, + "logps/chosen": -267.9458312988281, + "logps/rejected": -222.8854522705078, + "loss": 1.0598, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.490023136138916, + "rewards/margins": 0.26355260610580444, + "rewards/rejected": -1.7535758018493652, + "step": 5226 + }, + { + "epoch": 0.6, + "learning_rate": 1.2104647079480275e-07, + "logits/chosen": -2.59053635597229, + "logits/rejected": -2.460888385772705, + "logps/chosen": -252.00631713867188, + "logps/rejected": -235.90408325195312, + "loss": 0.2294, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.438579797744751, + "rewards/margins": 2.057889938354492, + "rewards/rejected": -3.496469736099243, + "step": 5227 + }, + { + "epoch": 0.6, + "learning_rate": 1.210113543251785e-07, + "logits/chosen": -2.504337787628174, + "logits/rejected": -2.483069658279419, + "logps/chosen": -189.23611450195312, + "logps/rejected": -159.81802368164062, + "loss": 0.2532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23612791299819946, + "rewards/margins": 3.401705741882324, + "rewards/rejected": -3.637834072113037, + "step": 5228 + }, + { + "epoch": 0.6, + "learning_rate": 1.2097623785555425e-07, + "logits/chosen": -2.3946306705474854, + "logits/rejected": -2.1267242431640625, + "logps/chosen": -285.0230712890625, + "logps/rejected": -321.12103271484375, + "loss": 0.4804, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4746553301811218, + "rewards/margins": 1.622304916381836, + "rewards/rejected": -2.0969600677490234, + "step": 5229 + }, + { + "epoch": 0.6, + "learning_rate": 1.2094112138592998e-07, + "logits/chosen": -2.481485366821289, + "logits/rejected": -2.4764370918273926, + "logps/chosen": -235.08609008789062, + "logps/rejected": -277.79022216796875, + "loss": 0.5963, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2714769840240479, + "rewards/margins": 2.1293880939483643, + "rewards/rejected": -3.400865077972412, + "step": 5230 + }, + { + "epoch": 0.6, + "learning_rate": 1.2090600491630574e-07, + "logits/chosen": -2.5949058532714844, + "logits/rejected": -2.7248079776763916, + "logps/chosen": -238.82315063476562, + "logps/rejected": -204.5538330078125, + "loss": 0.3016, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19929730892181396, + "rewards/margins": 1.3251992464065552, + "rewards/rejected": -1.1259019374847412, + "step": 5231 + }, + { + "epoch": 0.6, + "learning_rate": 1.208708884466815e-07, + "logits/chosen": -2.416616916656494, + "logits/rejected": -2.345914602279663, + "logps/chosen": -386.8367004394531, + "logps/rejected": -257.5465087890625, + "loss": 0.235, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3894566297531128, + "rewards/margins": 1.920928955078125, + "rewards/rejected": -3.3103854656219482, + "step": 5232 + }, + { + "epoch": 0.6, + "learning_rate": 1.2083577197705724e-07, + "logits/chosen": -2.5362560749053955, + "logits/rejected": -2.4736227989196777, + "logps/chosen": -122.90847778320312, + "logps/rejected": -235.86337280273438, + "loss": 0.4122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8217965960502625, + "rewards/margins": 3.872715473175049, + "rewards/rejected": -4.694512367248535, + "step": 5233 + }, + { + "epoch": 0.6, + "learning_rate": 1.2080065550743297e-07, + "logits/chosen": -2.2778728008270264, + "logits/rejected": -2.0210049152374268, + "logps/chosen": -224.85609436035156, + "logps/rejected": -226.1277313232422, + "loss": 1.3826, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3737602233886719, + "rewards/margins": -0.26930326223373413, + "rewards/rejected": -1.1044570207595825, + "step": 5234 + }, + { + "epoch": 0.6, + "learning_rate": 1.2076553903780872e-07, + "logits/chosen": -2.2713851928710938, + "logits/rejected": -2.5629990100860596, + "logps/chosen": -268.27069091796875, + "logps/rejected": -219.19198608398438, + "loss": 0.2202, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04325849935412407, + "rewards/margins": 2.256927490234375, + "rewards/rejected": -2.2136688232421875, + "step": 5235 + }, + { + "epoch": 0.6, + "learning_rate": 1.2073042256818448e-07, + "logits/chosen": -1.703715205192566, + "logits/rejected": -2.074387550354004, + "logps/chosen": -407.9605712890625, + "logps/rejected": -283.25091552734375, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.102663516998291, + "rewards/margins": 1.464552879333496, + "rewards/rejected": -2.567216396331787, + "step": 5236 + }, + { + "epoch": 0.6, + "learning_rate": 1.2069530609856023e-07, + "logits/chosen": -2.776160955429077, + "logits/rejected": -2.7579691410064697, + "logps/chosen": -151.0760955810547, + "logps/rejected": -210.39141845703125, + "loss": 0.3659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6354429721832275, + "rewards/margins": 1.4810365438461304, + "rewards/rejected": -2.1164793968200684, + "step": 5237 + }, + { + "epoch": 0.6, + "learning_rate": 1.2066018962893596e-07, + "logits/chosen": -2.2855048179626465, + "logits/rejected": -2.289560317993164, + "logps/chosen": -216.5088348388672, + "logps/rejected": -246.0506591796875, + "loss": 0.2093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8770638108253479, + "rewards/margins": 1.787968397140503, + "rewards/rejected": -2.665032148361206, + "step": 5238 + }, + { + "epoch": 0.6, + "learning_rate": 1.2062507315931171e-07, + "logits/chosen": -2.1001391410827637, + "logits/rejected": -2.354151725769043, + "logps/chosen": -396.68878173828125, + "logps/rejected": -283.5289611816406, + "loss": 0.4345, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7042701840400696, + "rewards/margins": 1.5789594650268555, + "rewards/rejected": -2.2832298278808594, + "step": 5239 + }, + { + "epoch": 0.6, + "learning_rate": 1.2058995668968744e-07, + "logits/chosen": -2.4669134616851807, + "logits/rejected": -2.508183002471924, + "logps/chosen": -203.75604248046875, + "logps/rejected": -143.13653564453125, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.387195348739624, + "rewards/margins": 1.6116048097610474, + "rewards/rejected": -2.998800277709961, + "step": 5240 + }, + { + "epoch": 0.6, + "learning_rate": 1.2055484022006322e-07, + "logits/chosen": -2.8570189476013184, + "logits/rejected": -2.9479024410247803, + "logps/chosen": -203.89817810058594, + "logps/rejected": -248.90211486816406, + "loss": 0.4351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5978524088859558, + "rewards/margins": 2.1416239738464355, + "rewards/rejected": -2.739476203918457, + "step": 5241 + }, + { + "epoch": 0.6, + "learning_rate": 1.2051972375043895e-07, + "logits/chosen": -1.973388433456421, + "logits/rejected": -2.309626340866089, + "logps/chosen": -440.9832763671875, + "logps/rejected": -328.9373779296875, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47478777170181274, + "rewards/margins": 1.7117815017700195, + "rewards/rejected": -2.1865692138671875, + "step": 5242 + }, + { + "epoch": 0.6, + "learning_rate": 1.204846072808147e-07, + "logits/chosen": -2.281942844390869, + "logits/rejected": -2.5443243980407715, + "logps/chosen": -310.9380187988281, + "logps/rejected": -131.21820068359375, + "loss": 0.3444, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5395124554634094, + "rewards/margins": 1.4522391557693481, + "rewards/rejected": -1.9917516708374023, + "step": 5243 + }, + { + "epoch": 0.6, + "learning_rate": 1.2044949081119043e-07, + "logits/chosen": -2.4945030212402344, + "logits/rejected": -2.5594160556793213, + "logps/chosen": -130.4227294921875, + "logps/rejected": -281.5563659667969, + "loss": 0.8595, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4902572631835938, + "rewards/margins": 3.034054756164551, + "rewards/rejected": -4.5243120193481445, + "step": 5244 + }, + { + "epoch": 0.6, + "learning_rate": 1.2041437434156618e-07, + "logits/chosen": -2.553192615509033, + "logits/rejected": -2.662266254425049, + "logps/chosen": -239.14381408691406, + "logps/rejected": -150.38868713378906, + "loss": 0.5718, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8790937662124634, + "rewards/margins": 1.2332227230072021, + "rewards/rejected": -2.112316608428955, + "step": 5245 + }, + { + "epoch": 0.6, + "learning_rate": 1.2037925787194194e-07, + "logits/chosen": -2.1322271823883057, + "logits/rejected": -2.0859429836273193, + "logps/chosen": -225.89903259277344, + "logps/rejected": -213.21266174316406, + "loss": 0.31, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7887819409370422, + "rewards/margins": 2.2142035961151123, + "rewards/rejected": -3.0029854774475098, + "step": 5246 + }, + { + "epoch": 0.6, + "learning_rate": 1.2034414140231767e-07, + "logits/chosen": -2.3681862354278564, + "logits/rejected": -2.4347710609436035, + "logps/chosen": -271.1920166015625, + "logps/rejected": -336.5887756347656, + "loss": 0.5024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48600268363952637, + "rewards/margins": 2.0033061504364014, + "rewards/rejected": -2.4893088340759277, + "step": 5247 + }, + { + "epoch": 0.6, + "learning_rate": 1.2030902493269342e-07, + "logits/chosen": -2.0951287746429443, + "logits/rejected": -1.9735523462295532, + "logps/chosen": -216.42596435546875, + "logps/rejected": -228.39395141601562, + "loss": 0.3645, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8504533171653748, + "rewards/margins": 1.8641235828399658, + "rewards/rejected": -2.7145769596099854, + "step": 5248 + }, + { + "epoch": 0.61, + "learning_rate": 1.2027390846306917e-07, + "logits/chosen": -2.1757125854492188, + "logits/rejected": -2.391387939453125, + "logps/chosen": -401.46197509765625, + "logps/rejected": -212.49867248535156, + "loss": 0.2453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7423564195632935, + "rewards/margins": 2.02026104927063, + "rewards/rejected": -2.762617588043213, + "step": 5249 + }, + { + "epoch": 0.61, + "learning_rate": 1.2023879199344493e-07, + "logits/chosen": -2.0805716514587402, + "logits/rejected": -2.00065279006958, + "logps/chosen": -319.5083312988281, + "logps/rejected": -253.8655548095703, + "loss": 1.2721, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.028715133666992, + "rewards/margins": -0.06277215480804443, + "rewards/rejected": -1.9659427404403687, + "step": 5250 + }, + { + "epoch": 0.61, + "learning_rate": 1.2020367552382065e-07, + "logits/chosen": -2.202260732650757, + "logits/rejected": -1.9301460981369019, + "logps/chosen": -113.35295104980469, + "logps/rejected": -232.62147521972656, + "loss": 2.1821, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7755277156829834, + "rewards/margins": 0.07082545757293701, + "rewards/rejected": -3.846353054046631, + "step": 5251 + }, + { + "epoch": 0.61, + "learning_rate": 1.201685590541964e-07, + "logits/chosen": -2.1651251316070557, + "logits/rejected": -2.0566937923431396, + "logps/chosen": -392.48443603515625, + "logps/rejected": -380.8036804199219, + "loss": 0.9111, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9819517135620117, + "rewards/margins": 0.5225945711135864, + "rewards/rejected": -1.5045461654663086, + "step": 5252 + }, + { + "epoch": 0.61, + "learning_rate": 1.2013344258457216e-07, + "logits/chosen": -2.6988730430603027, + "logits/rejected": -2.6879661083221436, + "logps/chosen": -191.7349395751953, + "logps/rejected": -206.84005737304688, + "loss": 0.2127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5501146912574768, + "rewards/margins": 1.7095324993133545, + "rewards/rejected": -2.2596473693847656, + "step": 5253 + }, + { + "epoch": 0.61, + "learning_rate": 1.2009832611494792e-07, + "logits/chosen": -2.1552109718322754, + "logits/rejected": -2.402494430541992, + "logps/chosen": -210.6397705078125, + "logps/rejected": -156.88006591796875, + "loss": 0.2441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18493826687335968, + "rewards/margins": 1.9164094924926758, + "rewards/rejected": -2.1013479232788086, + "step": 5254 + }, + { + "epoch": 0.61, + "learning_rate": 1.2006320964532364e-07, + "logits/chosen": -2.3418853282928467, + "logits/rejected": -2.479167938232422, + "logps/chosen": -320.8595275878906, + "logps/rejected": -271.4126281738281, + "loss": 0.4681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25332146883010864, + "rewards/margins": 1.0021779537200928, + "rewards/rejected": -1.2554993629455566, + "step": 5255 + }, + { + "epoch": 0.61, + "learning_rate": 1.200280931756994e-07, + "logits/chosen": -2.747835874557495, + "logits/rejected": -2.727235794067383, + "logps/chosen": -204.9402313232422, + "logps/rejected": -208.4278564453125, + "loss": 0.3108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4681279957294464, + "rewards/margins": 2.737016439437866, + "rewards/rejected": -3.2051444053649902, + "step": 5256 + }, + { + "epoch": 0.61, + "learning_rate": 1.1999297670607515e-07, + "logits/chosen": -2.415562629699707, + "logits/rejected": -2.680436611175537, + "logps/chosen": -440.52288818359375, + "logps/rejected": -264.7852783203125, + "loss": 0.2636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27838000655174255, + "rewards/margins": 1.9471802711486816, + "rewards/rejected": -2.225560188293457, + "step": 5257 + }, + { + "epoch": 0.61, + "learning_rate": 1.199578602364509e-07, + "logits/chosen": -2.00614595413208, + "logits/rejected": -2.0317304134368896, + "logps/chosen": -259.76080322265625, + "logps/rejected": -352.1111755371094, + "loss": 0.3397, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9392711520195007, + "rewards/margins": 2.839817523956299, + "rewards/rejected": -3.7790889739990234, + "step": 5258 + }, + { + "epoch": 0.61, + "learning_rate": 1.1992274376682663e-07, + "logits/chosen": -2.312617540359497, + "logits/rejected": -2.1147236824035645, + "logps/chosen": -260.1307373046875, + "logps/rejected": -326.1502990722656, + "loss": 0.4619, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8150759935379028, + "rewards/margins": 1.2273433208465576, + "rewards/rejected": -2.042419195175171, + "step": 5259 + }, + { + "epoch": 0.61, + "learning_rate": 1.1988762729720239e-07, + "logits/chosen": -2.446321725845337, + "logits/rejected": -2.440417766571045, + "logps/chosen": -266.898681640625, + "logps/rejected": -256.04168701171875, + "loss": 0.1736, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6820745468139648, + "rewards/margins": 3.2183220386505127, + "rewards/rejected": -3.9003963470458984, + "step": 5260 + }, + { + "epoch": 0.61, + "learning_rate": 1.1985251082757814e-07, + "logits/chosen": -2.5326883792877197, + "logits/rejected": -2.7337570190429688, + "logps/chosen": -285.35107421875, + "logps/rejected": -181.5558319091797, + "loss": 0.3718, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6077631711959839, + "rewards/margins": 1.4367153644561768, + "rewards/rejected": -2.044478416442871, + "step": 5261 + }, + { + "epoch": 0.61, + "learning_rate": 1.1981739435795387e-07, + "logits/chosen": -2.1642441749572754, + "logits/rejected": -2.515996217727661, + "logps/chosen": -279.9645080566406, + "logps/rejected": -228.8147430419922, + "loss": 0.6274, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9079174995422363, + "rewards/margins": 1.0687706470489502, + "rewards/rejected": -2.9766881465911865, + "step": 5262 + }, + { + "epoch": 0.61, + "learning_rate": 1.1978227788832962e-07, + "logits/chosen": -2.83791184425354, + "logits/rejected": -2.9719254970550537, + "logps/chosen": -240.11614990234375, + "logps/rejected": -361.31207275390625, + "loss": 0.1807, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44742971658706665, + "rewards/margins": 4.504103660583496, + "rewards/rejected": -4.951533317565918, + "step": 5263 + }, + { + "epoch": 0.61, + "learning_rate": 1.1974716141870535e-07, + "logits/chosen": -3.007301092147827, + "logits/rejected": -3.0086019039154053, + "logps/chosen": -119.37835693359375, + "logps/rejected": -213.56167602539062, + "loss": 0.3354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9157172441482544, + "rewards/margins": 3.925114631652832, + "rewards/rejected": -4.840831756591797, + "step": 5264 + }, + { + "epoch": 0.61, + "learning_rate": 1.1971204494908113e-07, + "logits/chosen": -2.218656539916992, + "logits/rejected": -2.251793384552002, + "logps/chosen": -189.1025390625, + "logps/rejected": -204.01416015625, + "loss": 0.5001, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3742496967315674, + "rewards/margins": 1.8794634342193604, + "rewards/rejected": -3.253713369369507, + "step": 5265 + }, + { + "epoch": 0.61, + "learning_rate": 1.1967692847945686e-07, + "logits/chosen": -2.676490306854248, + "logits/rejected": -2.6896438598632812, + "logps/chosen": -385.8957824707031, + "logps/rejected": -360.2597351074219, + "loss": 0.3843, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9220395684242249, + "rewards/margins": 3.7788712978363037, + "rewards/rejected": -4.700910568237305, + "step": 5266 + }, + { + "epoch": 0.61, + "learning_rate": 1.196418120098326e-07, + "logits/chosen": -2.683879852294922, + "logits/rejected": -2.632425308227539, + "logps/chosen": -122.50836181640625, + "logps/rejected": -179.47320556640625, + "loss": 0.2495, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8107388019561768, + "rewards/margins": 2.1429951190948486, + "rewards/rejected": -2.9537339210510254, + "step": 5267 + }, + { + "epoch": 0.61, + "learning_rate": 1.1960669554020834e-07, + "logits/chosen": -1.9867444038391113, + "logits/rejected": -1.9159499406814575, + "logps/chosen": -240.5855712890625, + "logps/rejected": -338.6872863769531, + "loss": 0.2736, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3830242156982422, + "rewards/margins": 3.666710615158081, + "rewards/rejected": -5.049735069274902, + "step": 5268 + }, + { + "epoch": 0.61, + "learning_rate": 1.1957157907058412e-07, + "logits/chosen": -2.2181520462036133, + "logits/rejected": -2.5800046920776367, + "logps/chosen": -226.52987670898438, + "logps/rejected": -209.6242218017578, + "loss": 0.1157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5874203443527222, + "rewards/margins": 3.446950912475586, + "rewards/rejected": -4.034371376037598, + "step": 5269 + }, + { + "epoch": 0.61, + "learning_rate": 1.1953646260095984e-07, + "logits/chosen": -2.577054977416992, + "logits/rejected": -2.3248953819274902, + "logps/chosen": -241.84283447265625, + "logps/rejected": -271.5489501953125, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.341145396232605, + "rewards/margins": 3.8165667057037354, + "rewards/rejected": -4.157711982727051, + "step": 5270 + }, + { + "epoch": 0.61, + "learning_rate": 1.195013461313356e-07, + "logits/chosen": -2.3080809116363525, + "logits/rejected": -2.6411349773406982, + "logps/chosen": -402.50634765625, + "logps/rejected": -217.15765380859375, + "loss": 0.478, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3254604637622833, + "rewards/margins": 1.2524924278259277, + "rewards/rejected": -1.5779528617858887, + "step": 5271 + }, + { + "epoch": 0.61, + "learning_rate": 1.1946622966171133e-07, + "logits/chosen": -2.084010124206543, + "logits/rejected": -2.2785606384277344, + "logps/chosen": -419.9219970703125, + "logps/rejected": -382.53485107421875, + "loss": 0.4291, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4919620156288147, + "rewards/margins": 2.47885799407959, + "rewards/rejected": -2.9708199501037598, + "step": 5272 + }, + { + "epoch": 0.61, + "learning_rate": 1.1943111319208708e-07, + "logits/chosen": -2.5482847690582275, + "logits/rejected": -2.55757474899292, + "logps/chosen": -256.5091857910156, + "logps/rejected": -238.8883056640625, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6074199676513672, + "rewards/margins": 3.309047222137451, + "rewards/rejected": -3.9164671897888184, + "step": 5273 + }, + { + "epoch": 0.61, + "learning_rate": 1.1939599672246283e-07, + "logits/chosen": -2.3924496173858643, + "logits/rejected": -2.59629487991333, + "logps/chosen": -290.9903564453125, + "logps/rejected": -312.1172180175781, + "loss": 0.4463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7855943441390991, + "rewards/margins": 1.5220891237258911, + "rewards/rejected": -2.3076834678649902, + "step": 5274 + }, + { + "epoch": 0.61, + "learning_rate": 1.193608802528386e-07, + "logits/chosen": -2.4116392135620117, + "logits/rejected": -2.2699785232543945, + "logps/chosen": -212.32679748535156, + "logps/rejected": -323.56353759765625, + "loss": 0.3739, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1831855773925781, + "rewards/margins": 2.0577008724212646, + "rewards/rejected": -3.2408864498138428, + "step": 5275 + }, + { + "epoch": 0.61, + "learning_rate": 1.1932576378321432e-07, + "logits/chosen": -2.247119188308716, + "logits/rejected": -2.4321112632751465, + "logps/chosen": -409.0295104980469, + "logps/rejected": -390.2786865234375, + "loss": 0.4872, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8870528340339661, + "rewards/margins": 2.192833423614502, + "rewards/rejected": -3.0798864364624023, + "step": 5276 + }, + { + "epoch": 0.61, + "learning_rate": 1.1929064731359007e-07, + "logits/chosen": -2.716107130050659, + "logits/rejected": -2.608109474182129, + "logps/chosen": -146.78834533691406, + "logps/rejected": -238.97186279296875, + "loss": 0.1699, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6466749310493469, + "rewards/margins": 3.280489921569824, + "rewards/rejected": -3.9271647930145264, + "step": 5277 + }, + { + "epoch": 0.61, + "learning_rate": 1.1925553084396582e-07, + "logits/chosen": -2.128269910812378, + "logits/rejected": -2.4099295139312744, + "logps/chosen": -280.10711669921875, + "logps/rejected": -274.9610900878906, + "loss": 0.3788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5680408477783203, + "rewards/margins": 2.7495012283325195, + "rewards/rejected": -3.31754207611084, + "step": 5278 + }, + { + "epoch": 0.61, + "learning_rate": 1.1922041437434155e-07, + "logits/chosen": -2.2527215480804443, + "logits/rejected": -2.288602590560913, + "logps/chosen": -343.667724609375, + "logps/rejected": -273.8158264160156, + "loss": 0.1219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0726928636431694, + "rewards/margins": 3.7157254219055176, + "rewards/rejected": -3.7884182929992676, + "step": 5279 + }, + { + "epoch": 0.61, + "learning_rate": 1.191852979047173e-07, + "logits/chosen": -1.823460578918457, + "logits/rejected": -1.8672056198120117, + "logps/chosen": -293.12139892578125, + "logps/rejected": -269.56292724609375, + "loss": 0.1648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.407579243183136, + "rewards/margins": 2.6538562774658203, + "rewards/rejected": -3.0614356994628906, + "step": 5280 + }, + { + "epoch": 0.61, + "learning_rate": 1.1915018143509306e-07, + "logits/chosen": -2.6510422229766846, + "logits/rejected": -2.829820156097412, + "logps/chosen": -224.181396484375, + "logps/rejected": -205.59188842773438, + "loss": 0.9383, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7745726108551025, + "rewards/margins": 1.2897212505340576, + "rewards/rejected": -3.06429386138916, + "step": 5281 + }, + { + "epoch": 0.61, + "learning_rate": 1.191150649654688e-07, + "logits/chosen": -2.2828938961029053, + "logits/rejected": -2.6137900352478027, + "logps/chosen": -562.241943359375, + "logps/rejected": -235.59814453125, + "loss": 0.4013, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9234155416488647, + "rewards/margins": 2.309056282043457, + "rewards/rejected": -3.2324719429016113, + "step": 5282 + }, + { + "epoch": 0.61, + "learning_rate": 1.1907994849584454e-07, + "logits/chosen": -2.7054243087768555, + "logits/rejected": -2.3061745166778564, + "logps/chosen": -149.6876678466797, + "logps/rejected": -350.4064636230469, + "loss": 0.1401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5798724889755249, + "rewards/margins": 3.8841114044189453, + "rewards/rejected": -4.46398401260376, + "step": 5283 + }, + { + "epoch": 0.61, + "learning_rate": 1.1904483202622029e-07, + "logits/chosen": -2.9578585624694824, + "logits/rejected": -2.919867515563965, + "logps/chosen": -228.72459411621094, + "logps/rejected": -263.39947509765625, + "loss": 0.4491, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5275073051452637, + "rewards/margins": 1.7190401554107666, + "rewards/rejected": -3.2465476989746094, + "step": 5284 + }, + { + "epoch": 0.61, + "learning_rate": 1.1900971555659605e-07, + "logits/chosen": -2.3820366859436035, + "logits/rejected": -2.130167245864868, + "logps/chosen": -153.84881591796875, + "logps/rejected": -233.8221893310547, + "loss": 0.5495, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0142676830291748, + "rewards/margins": 1.2205936908721924, + "rewards/rejected": -2.234861373901367, + "step": 5285 + }, + { + "epoch": 0.61, + "learning_rate": 1.1897459908697179e-07, + "logits/chosen": -2.781670570373535, + "logits/rejected": -2.6677772998809814, + "logps/chosen": -294.43719482421875, + "logps/rejected": -282.2125244140625, + "loss": 0.2082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9582093954086304, + "rewards/margins": 2.854346990585327, + "rewards/rejected": -3.812556266784668, + "step": 5286 + }, + { + "epoch": 0.61, + "learning_rate": 1.1893948261734753e-07, + "logits/chosen": -2.608182430267334, + "logits/rejected": -2.4350452423095703, + "logps/chosen": -338.9859924316406, + "logps/rejected": -338.6726989746094, + "loss": 0.2674, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2784671783447266, + "rewards/margins": 2.185887575149536, + "rewards/rejected": -3.4643547534942627, + "step": 5287 + }, + { + "epoch": 0.61, + "learning_rate": 1.1890436614772327e-07, + "logits/chosen": -2.0954127311706543, + "logits/rejected": -2.575974225997925, + "logps/chosen": -418.2731628417969, + "logps/rejected": -267.92572021484375, + "loss": 0.3165, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.329087495803833, + "rewards/margins": 3.25565505027771, + "rewards/rejected": -4.584742546081543, + "step": 5288 + }, + { + "epoch": 0.61, + "learning_rate": 1.1886924967809901e-07, + "logits/chosen": -2.448378324508667, + "logits/rejected": -2.5204007625579834, + "logps/chosen": -221.14459228515625, + "logps/rejected": -164.15725708007812, + "loss": 0.411, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7609570026397705, + "rewards/margins": 1.3836923837661743, + "rewards/rejected": -2.1446495056152344, + "step": 5289 + }, + { + "epoch": 0.61, + "learning_rate": 1.1883413320847478e-07, + "logits/chosen": -1.824643850326538, + "logits/rejected": -2.066185474395752, + "logps/chosen": -263.80621337890625, + "logps/rejected": -213.94046020507812, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.131890296936035, + "rewards/margins": 0.9385585188865662, + "rewards/rejected": -3.070448875427246, + "step": 5290 + }, + { + "epoch": 0.61, + "learning_rate": 1.1879901673885052e-07, + "logits/chosen": -2.490846872329712, + "logits/rejected": -2.5792741775512695, + "logps/chosen": -254.77499389648438, + "logps/rejected": -148.52499389648438, + "loss": 1.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6207385063171387, + "rewards/margins": 0.2459844946861267, + "rewards/rejected": -1.8667229413986206, + "step": 5291 + }, + { + "epoch": 0.61, + "learning_rate": 1.1876390026922626e-07, + "logits/chosen": -2.0082154273986816, + "logits/rejected": -1.9884992837905884, + "logps/chosen": -251.5559844970703, + "logps/rejected": -274.8748779296875, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42866581678390503, + "rewards/margins": 1.4976627826690674, + "rewards/rejected": -1.926328420639038, + "step": 5292 + }, + { + "epoch": 0.61, + "learning_rate": 1.18728783799602e-07, + "logits/chosen": -2.2193093299865723, + "logits/rejected": -2.3961403369903564, + "logps/chosen": -487.4263000488281, + "logps/rejected": -356.3016357421875, + "loss": 0.236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.942311704158783, + "rewards/margins": 1.7347195148468018, + "rewards/rejected": -2.6770312786102295, + "step": 5293 + }, + { + "epoch": 0.61, + "learning_rate": 1.1869366732997777e-07, + "logits/chosen": -2.410543918609619, + "logits/rejected": -2.5423672199249268, + "logps/chosen": -335.0536193847656, + "logps/rejected": -476.632568359375, + "loss": 0.4924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7788617610931396, + "rewards/margins": 2.384239673614502, + "rewards/rejected": -3.1631014347076416, + "step": 5294 + }, + { + "epoch": 0.61, + "learning_rate": 1.186585508603535e-07, + "logits/chosen": -2.2367947101593018, + "logits/rejected": -2.5286130905151367, + "logps/chosen": -343.325927734375, + "logps/rejected": -301.6839599609375, + "loss": 0.4985, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7966296672821045, + "rewards/margins": 2.502110004425049, + "rewards/rejected": -4.298739433288574, + "step": 5295 + }, + { + "epoch": 0.61, + "learning_rate": 1.1862343439072925e-07, + "logits/chosen": -2.0787737369537354, + "logits/rejected": -2.508021831512451, + "logps/chosen": -205.05836486816406, + "logps/rejected": -171.7980499267578, + "loss": 0.7016, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3447362184524536, + "rewards/margins": 1.2032246589660645, + "rewards/rejected": -2.5479607582092285, + "step": 5296 + }, + { + "epoch": 0.61, + "learning_rate": 1.1858831792110499e-07, + "logits/chosen": -2.296032190322876, + "logits/rejected": -2.352954387664795, + "logps/chosen": -336.0308837890625, + "logps/rejected": -242.08151245117188, + "loss": 0.73, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8491241931915283, + "rewards/margins": 1.5926611423492432, + "rewards/rejected": -2.4417853355407715, + "step": 5297 + }, + { + "epoch": 0.61, + "learning_rate": 1.1855320145148074e-07, + "logits/chosen": -2.602036952972412, + "logits/rejected": -2.6324074268341064, + "logps/chosen": -237.73422241210938, + "logps/rejected": -226.67108154296875, + "loss": 0.2604, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0642964839935303, + "rewards/margins": 2.304025411605835, + "rewards/rejected": -3.3683218955993652, + "step": 5298 + }, + { + "epoch": 0.61, + "learning_rate": 1.1851808498185648e-07, + "logits/chosen": -2.422175645828247, + "logits/rejected": -2.3271825313568115, + "logps/chosen": -291.8004455566406, + "logps/rejected": -280.5736999511719, + "loss": 1.1853, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.338139772415161, + "rewards/margins": 0.6183207035064697, + "rewards/rejected": -2.9564602375030518, + "step": 5299 + }, + { + "epoch": 0.61, + "learning_rate": 1.1848296851223222e-07, + "logits/chosen": -1.834855079650879, + "logits/rejected": -2.1476263999938965, + "logps/chosen": -459.0288391113281, + "logps/rejected": -311.6556396484375, + "loss": 0.3889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6413432359695435, + "rewards/margins": 1.098174810409546, + "rewards/rejected": -1.7395180463790894, + "step": 5300 + }, + { + "epoch": 0.61, + "learning_rate": 1.1844785204260798e-07, + "logits/chosen": -2.2390291690826416, + "logits/rejected": -2.3660531044006348, + "logps/chosen": -201.8503875732422, + "logps/rejected": -312.63555908203125, + "loss": 0.1264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4642866849899292, + "rewards/margins": 3.5323193073272705, + "rewards/rejected": -3.9966063499450684, + "step": 5301 + }, + { + "epoch": 0.61, + "learning_rate": 1.1841273557298373e-07, + "logits/chosen": -2.085059642791748, + "logits/rejected": -2.2747862339019775, + "logps/chosen": -185.07293701171875, + "logps/rejected": -227.69100952148438, + "loss": 0.8876, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.7650306224822998, + "rewards/margins": 0.11068376898765564, + "rewards/rejected": -1.8757143020629883, + "step": 5302 + }, + { + "epoch": 0.61, + "learning_rate": 1.1837761910335947e-07, + "logits/chosen": -1.5979297161102295, + "logits/rejected": -2.0014147758483887, + "logps/chosen": -416.5557861328125, + "logps/rejected": -257.0513916015625, + "loss": 0.1214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3041865825653076, + "rewards/margins": 3.841696262359619, + "rewards/rejected": -4.145882606506348, + "step": 5303 + }, + { + "epoch": 0.61, + "learning_rate": 1.1834250263373521e-07, + "logits/chosen": -2.0048611164093018, + "logits/rejected": -2.258552312850952, + "logps/chosen": -377.899658203125, + "logps/rejected": -419.4195861816406, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0297778844833374, + "rewards/margins": 1.7697303295135498, + "rewards/rejected": -2.7995080947875977, + "step": 5304 + }, + { + "epoch": 0.61, + "learning_rate": 1.1830738616411095e-07, + "logits/chosen": -2.755769968032837, + "logits/rejected": -2.764385223388672, + "logps/chosen": -142.1779327392578, + "logps/rejected": -207.01168823242188, + "loss": 0.1755, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.226898193359375, + "rewards/margins": 2.8165175914764404, + "rewards/rejected": -4.0434160232543945, + "step": 5305 + }, + { + "epoch": 0.61, + "learning_rate": 1.1827226969448672e-07, + "logits/chosen": -1.9777356386184692, + "logits/rejected": -2.3397276401519775, + "logps/chosen": -401.15216064453125, + "logps/rejected": -285.54864501953125, + "loss": 0.5389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5850536823272705, + "rewards/margins": 0.7347179651260376, + "rewards/rejected": -2.3197717666625977, + "step": 5306 + }, + { + "epoch": 0.61, + "learning_rate": 1.1823715322486246e-07, + "logits/chosen": -2.2343499660491943, + "logits/rejected": -2.2980542182922363, + "logps/chosen": -231.645263671875, + "logps/rejected": -215.9371337890625, + "loss": 0.2475, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1236025094985962, + "rewards/margins": 2.1025774478912354, + "rewards/rejected": -3.2261803150177, + "step": 5307 + }, + { + "epoch": 0.61, + "learning_rate": 1.182020367552382e-07, + "logits/chosen": -2.4395570755004883, + "logits/rejected": -2.3953423500061035, + "logps/chosen": -189.8695068359375, + "logps/rejected": -138.08163452148438, + "loss": 0.6591, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0009102821350098, + "rewards/margins": 0.8389284610748291, + "rewards/rejected": -1.8398386240005493, + "step": 5308 + }, + { + "epoch": 0.61, + "learning_rate": 1.1816692028561394e-07, + "logits/chosen": -2.617405414581299, + "logits/rejected": -2.4104390144348145, + "logps/chosen": -314.79205322265625, + "logps/rejected": -364.2369384765625, + "loss": 0.5637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7510672807693481, + "rewards/margins": 3.245988130569458, + "rewards/rejected": -3.9970555305480957, + "step": 5309 + }, + { + "epoch": 0.61, + "learning_rate": 1.181318038159897e-07, + "logits/chosen": -2.190476179122925, + "logits/rejected": -2.0239429473876953, + "logps/chosen": -174.411865234375, + "logps/rejected": -358.7313232421875, + "loss": 0.251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024066872894763947, + "rewards/margins": 3.335222005844116, + "rewards/rejected": -3.3592889308929443, + "step": 5310 + }, + { + "epoch": 0.61, + "learning_rate": 1.1809668734636545e-07, + "logits/chosen": -2.8614683151245117, + "logits/rejected": -2.751142978668213, + "logps/chosen": -68.81102752685547, + "logps/rejected": -279.95623779296875, + "loss": 0.2266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4493747651576996, + "rewards/margins": 6.07639217376709, + "rewards/rejected": -6.5257673263549805, + "step": 5311 + }, + { + "epoch": 0.61, + "learning_rate": 1.1806157087674119e-07, + "logits/chosen": -2.3674697875976562, + "logits/rejected": -2.6140947341918945, + "logps/chosen": -219.40231323242188, + "logps/rejected": -207.92068481445312, + "loss": 0.498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7818887233734131, + "rewards/margins": 2.2031025886535645, + "rewards/rejected": -2.9849915504455566, + "step": 5312 + }, + { + "epoch": 0.61, + "learning_rate": 1.1802645440711693e-07, + "logits/chosen": -2.1207752227783203, + "logits/rejected": -2.330152750015259, + "logps/chosen": -195.8737030029297, + "logps/rejected": -199.23744201660156, + "loss": 0.6006, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8919495940208435, + "rewards/margins": 2.4257521629333496, + "rewards/rejected": -3.317701816558838, + "step": 5313 + }, + { + "epoch": 0.61, + "learning_rate": 1.1799133793749268e-07, + "logits/chosen": -2.41213321685791, + "logits/rejected": -2.575652599334717, + "logps/chosen": -345.4088439941406, + "logps/rejected": -210.1329345703125, + "loss": 0.386, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9962804317474365, + "rewards/margins": 2.3969311714172363, + "rewards/rejected": -3.393211603164673, + "step": 5314 + }, + { + "epoch": 0.61, + "learning_rate": 1.1795622146786842e-07, + "logits/chosen": -2.797417640686035, + "logits/rejected": -2.678356409072876, + "logps/chosen": -74.093505859375, + "logps/rejected": -207.44955444335938, + "loss": 0.2555, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5601485967636108, + "rewards/margins": 1.651524305343628, + "rewards/rejected": -2.211672782897949, + "step": 5315 + }, + { + "epoch": 0.61, + "learning_rate": 1.1792110499824416e-07, + "logits/chosen": -2.5882034301757812, + "logits/rejected": -2.656764507293701, + "logps/chosen": -206.87008666992188, + "logps/rejected": -261.11456298828125, + "loss": 0.1706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3250744938850403, + "rewards/margins": 2.2737374305725098, + "rewards/rejected": -2.5988118648529053, + "step": 5316 + }, + { + "epoch": 0.61, + "learning_rate": 1.178859885286199e-07, + "logits/chosen": -2.0600428581237793, + "logits/rejected": -2.1844775676727295, + "logps/chosen": -174.46160888671875, + "logps/rejected": -204.407958984375, + "loss": 0.6277, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7909114360809326, + "rewards/margins": 0.6901850700378418, + "rewards/rejected": -2.4810965061187744, + "step": 5317 + }, + { + "epoch": 0.61, + "learning_rate": 1.1785087205899567e-07, + "logits/chosen": -2.2465646266937256, + "logits/rejected": -2.0684022903442383, + "logps/chosen": -204.10153198242188, + "logps/rejected": -274.3660583496094, + "loss": 0.166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20849671959877014, + "rewards/margins": 2.203580379486084, + "rewards/rejected": -2.412076950073242, + "step": 5318 + }, + { + "epoch": 0.61, + "learning_rate": 1.1781575558937141e-07, + "logits/chosen": -2.302450656890869, + "logits/rejected": -2.4133713245391846, + "logps/chosen": -261.42059326171875, + "logps/rejected": -183.464111328125, + "loss": 0.5026, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.244792103767395, + "rewards/margins": 1.8309617042541504, + "rewards/rejected": -3.075753688812256, + "step": 5319 + }, + { + "epoch": 0.61, + "learning_rate": 1.1778063911974715e-07, + "logits/chosen": -2.683342218399048, + "logits/rejected": -2.586233139038086, + "logps/chosen": -123.0413589477539, + "logps/rejected": -249.68190002441406, + "loss": 0.2939, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7075990438461304, + "rewards/margins": 1.4636852741241455, + "rewards/rejected": -3.1712841987609863, + "step": 5320 + }, + { + "epoch": 0.61, + "learning_rate": 1.177455226501229e-07, + "logits/chosen": -2.0077970027923584, + "logits/rejected": -1.7763688564300537, + "logps/chosen": -264.9046630859375, + "logps/rejected": -316.5230407714844, + "loss": 0.4007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9659653902053833, + "rewards/margins": 1.1402028799057007, + "rewards/rejected": -2.106168270111084, + "step": 5321 + }, + { + "epoch": 0.61, + "learning_rate": 1.1771040618049866e-07, + "logits/chosen": -2.69364595413208, + "logits/rejected": -2.5658302307128906, + "logps/chosen": -251.99642944335938, + "logps/rejected": -336.88671875, + "loss": 0.2862, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6859337091445923, + "rewards/margins": 2.8497543334960938, + "rewards/rejected": -3.5356884002685547, + "step": 5322 + }, + { + "epoch": 0.61, + "learning_rate": 1.176752897108744e-07, + "logits/chosen": -2.125164031982422, + "logits/rejected": -1.9446626901626587, + "logps/chosen": -326.1508483886719, + "logps/rejected": -363.40142822265625, + "loss": 0.4826, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4106144607067108, + "rewards/margins": 2.5804591178894043, + "rewards/rejected": -2.9910733699798584, + "step": 5323 + }, + { + "epoch": 0.61, + "learning_rate": 1.1764017324125014e-07, + "logits/chosen": -1.9897005558013916, + "logits/rejected": -2.2325501441955566, + "logps/chosen": -286.8050231933594, + "logps/rejected": -336.3921813964844, + "loss": 0.197, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08168414235115051, + "rewards/margins": 3.21797251701355, + "rewards/rejected": -3.299656629562378, + "step": 5324 + }, + { + "epoch": 0.61, + "learning_rate": 1.1760505677162588e-07, + "logits/chosen": -2.0408153533935547, + "logits/rejected": -2.016515016555786, + "logps/chosen": -392.21197509765625, + "logps/rejected": -373.2381896972656, + "loss": 0.2375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.785601794719696, + "rewards/margins": 1.8812576532363892, + "rewards/rejected": -2.6668591499328613, + "step": 5325 + }, + { + "epoch": 0.61, + "learning_rate": 1.1756994030200164e-07, + "logits/chosen": -2.163140296936035, + "logits/rejected": -1.9682502746582031, + "logps/chosen": -246.2681884765625, + "logps/rejected": -257.14410400390625, + "loss": 0.4994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7951555848121643, + "rewards/margins": 2.313659906387329, + "rewards/rejected": -3.1088154315948486, + "step": 5326 + }, + { + "epoch": 0.61, + "learning_rate": 1.1753482383237738e-07, + "logits/chosen": -1.9857136011123657, + "logits/rejected": -2.0343360900878906, + "logps/chosen": -240.7485809326172, + "logps/rejected": -187.19851684570312, + "loss": 0.6461, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1519397497177124, + "rewards/margins": 0.7475970983505249, + "rewards/rejected": -1.8995368480682373, + "step": 5327 + }, + { + "epoch": 0.61, + "learning_rate": 1.1749970736275313e-07, + "logits/chosen": -2.431749105453491, + "logits/rejected": -2.3526217937469482, + "logps/chosen": -148.8150634765625, + "logps/rejected": -302.01318359375, + "loss": 0.3139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6333363056182861, + "rewards/margins": 2.7308743000030518, + "rewards/rejected": -3.364210844039917, + "step": 5328 + }, + { + "epoch": 0.61, + "learning_rate": 1.1746459089312887e-07, + "logits/chosen": -1.7735590934753418, + "logits/rejected": -2.1539008617401123, + "logps/chosen": -484.96234130859375, + "logps/rejected": -242.0457305908203, + "loss": 0.4912, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0598011016845703, + "rewards/margins": 1.777912974357605, + "rewards/rejected": -2.837714195251465, + "step": 5329 + }, + { + "epoch": 0.61, + "learning_rate": 1.1742947442350463e-07, + "logits/chosen": -2.558706283569336, + "logits/rejected": -2.574490547180176, + "logps/chosen": -300.59576416015625, + "logps/rejected": -391.3599548339844, + "loss": 0.3003, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9583199620246887, + "rewards/margins": 2.7774314880371094, + "rewards/rejected": -3.7357516288757324, + "step": 5330 + }, + { + "epoch": 0.61, + "learning_rate": 1.1739435795388037e-07, + "logits/chosen": -2.471952199935913, + "logits/rejected": -2.187502145767212, + "logps/chosen": -207.99542236328125, + "logps/rejected": -345.1603698730469, + "loss": 0.3962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19646307826042175, + "rewards/margins": 3.44863224029541, + "rewards/rejected": -3.6450955867767334, + "step": 5331 + }, + { + "epoch": 0.61, + "learning_rate": 1.1735924148425611e-07, + "logits/chosen": -2.5599544048309326, + "logits/rejected": -2.438447952270508, + "logps/chosen": -151.35189819335938, + "logps/rejected": -209.62582397460938, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2068268209695816, + "rewards/margins": 1.6786783933639526, + "rewards/rejected": -1.885505199432373, + "step": 5332 + }, + { + "epoch": 0.61, + "learning_rate": 1.1732412501463185e-07, + "logits/chosen": -2.594214916229248, + "logits/rejected": -2.6603128910064697, + "logps/chosen": -175.843017578125, + "logps/rejected": -193.3596954345703, + "loss": 0.3827, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40068763494491577, + "rewards/margins": 1.7173824310302734, + "rewards/rejected": -2.118070125579834, + "step": 5333 + }, + { + "epoch": 0.61, + "learning_rate": 1.1728900854500759e-07, + "logits/chosen": -2.489854097366333, + "logits/rejected": -2.2815170288085938, + "logps/chosen": -119.95993041992188, + "logps/rejected": -221.01910400390625, + "loss": 0.1895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9207738637924194, + "rewards/margins": 2.952218532562256, + "rewards/rejected": -3.872992753982544, + "step": 5334 + }, + { + "epoch": 0.62, + "learning_rate": 1.1725389207538336e-07, + "logits/chosen": -2.4200856685638428, + "logits/rejected": -2.3977558612823486, + "logps/chosen": -348.828369140625, + "logps/rejected": -286.15283203125, + "loss": 0.1419, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6047221422195435, + "rewards/margins": 2.864978313446045, + "rewards/rejected": -3.469700813293457, + "step": 5335 + }, + { + "epoch": 0.62, + "learning_rate": 1.172187756057591e-07, + "logits/chosen": -2.1073474884033203, + "logits/rejected": -1.8670254945755005, + "logps/chosen": -288.890380859375, + "logps/rejected": -302.942626953125, + "loss": 0.6598, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5015332698822021, + "rewards/margins": 1.4081501960754395, + "rewards/rejected": -1.909683346748352, + "step": 5336 + }, + { + "epoch": 0.62, + "learning_rate": 1.1718365913613484e-07, + "logits/chosen": -2.8764848709106445, + "logits/rejected": -2.9356436729431152, + "logps/chosen": -181.14126586914062, + "logps/rejected": -176.45599365234375, + "loss": 0.4244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7914073467254639, + "rewards/margins": 2.2470948696136475, + "rewards/rejected": -3.0385022163391113, + "step": 5337 + }, + { + "epoch": 0.62, + "learning_rate": 1.1714854266651058e-07, + "logits/chosen": -2.4568939208984375, + "logits/rejected": -2.2949471473693848, + "logps/chosen": -139.98025512695312, + "logps/rejected": -300.286865234375, + "loss": 0.1889, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3174937963485718, + "rewards/margins": 3.389219284057617, + "rewards/rejected": -4.70671272277832, + "step": 5338 + }, + { + "epoch": 0.62, + "learning_rate": 1.1711342619688634e-07, + "logits/chosen": -2.293799638748169, + "logits/rejected": -2.203556776046753, + "logps/chosen": -246.8248291015625, + "logps/rejected": -393.4810791015625, + "loss": 0.2184, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24611593782901764, + "rewards/margins": 3.2690839767456055, + "rewards/rejected": -3.515199899673462, + "step": 5339 + }, + { + "epoch": 0.62, + "learning_rate": 1.1707830972726209e-07, + "logits/chosen": -2.3387553691864014, + "logits/rejected": -2.3015966415405273, + "logps/chosen": -300.0653076171875, + "logps/rejected": -301.26226806640625, + "loss": 0.6803, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8727096915245056, + "rewards/margins": 1.8823654651641846, + "rewards/rejected": -2.755075216293335, + "step": 5340 + }, + { + "epoch": 0.62, + "learning_rate": 1.1704319325763783e-07, + "logits/chosen": -2.5093026161193848, + "logits/rejected": -2.477430820465088, + "logps/chosen": -336.78662109375, + "logps/rejected": -293.71282958984375, + "loss": 0.3648, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2474600076675415, + "rewards/margins": 4.417331218719482, + "rewards/rejected": -5.664791107177734, + "step": 5341 + }, + { + "epoch": 0.62, + "learning_rate": 1.1700807678801357e-07, + "logits/chosen": -2.3759706020355225, + "logits/rejected": -2.5223796367645264, + "logps/chosen": -280.96307373046875, + "logps/rejected": -237.1524200439453, + "loss": 0.9036, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4382450580596924, + "rewards/margins": 0.2797629237174988, + "rewards/rejected": -1.718008041381836, + "step": 5342 + }, + { + "epoch": 0.62, + "learning_rate": 1.1697296031838932e-07, + "logits/chosen": -2.022859811782837, + "logits/rejected": -1.952030897140503, + "logps/chosen": -224.29302978515625, + "logps/rejected": -347.0481262207031, + "loss": 0.2958, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1618136167526245, + "rewards/margins": 2.988699436187744, + "rewards/rejected": -3.150513172149658, + "step": 5343 + }, + { + "epoch": 0.62, + "learning_rate": 1.1693784384876506e-07, + "logits/chosen": -2.3774514198303223, + "logits/rejected": -2.5026497840881348, + "logps/chosen": -319.7191467285156, + "logps/rejected": -319.05560302734375, + "loss": 0.1773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7925291061401367, + "rewards/margins": 3.222180128097534, + "rewards/rejected": -4.01470947265625, + "step": 5344 + }, + { + "epoch": 0.62, + "learning_rate": 1.1690272737914081e-07, + "logits/chosen": -2.3854243755340576, + "logits/rejected": -2.2145609855651855, + "logps/chosen": -250.60899353027344, + "logps/rejected": -350.28173828125, + "loss": 0.3455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9382134675979614, + "rewards/margins": 3.3069517612457275, + "rewards/rejected": -4.245165824890137, + "step": 5345 + }, + { + "epoch": 0.62, + "learning_rate": 1.1686761090951656e-07, + "logits/chosen": -2.2612156867980957, + "logits/rejected": -2.221127986907959, + "logps/chosen": -139.4905548095703, + "logps/rejected": -228.86761474609375, + "loss": 0.5781, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.222481369972229, + "rewards/margins": 2.13678240776062, + "rewards/rejected": -3.3592638969421387, + "step": 5346 + }, + { + "epoch": 0.62, + "learning_rate": 1.1683249443989231e-07, + "logits/chosen": -2.6040725708007812, + "logits/rejected": -2.5595524311065674, + "logps/chosen": -173.6483917236328, + "logps/rejected": -336.4259033203125, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1510939598083496, + "rewards/margins": 1.5350412130355835, + "rewards/rejected": -2.6861350536346436, + "step": 5347 + }, + { + "epoch": 0.62, + "learning_rate": 1.1679737797026805e-07, + "logits/chosen": -2.4201407432556152, + "logits/rejected": -2.419053316116333, + "logps/chosen": -375.3688049316406, + "logps/rejected": -319.2771911621094, + "loss": 0.7112, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9617764949798584, + "rewards/margins": 1.1052770614624023, + "rewards/rejected": -2.0670535564422607, + "step": 5348 + }, + { + "epoch": 0.62, + "learning_rate": 1.1676226150064379e-07, + "logits/chosen": -2.443822145462036, + "logits/rejected": -2.5080666542053223, + "logps/chosen": -298.49658203125, + "logps/rejected": -177.47412109375, + "loss": 0.7145, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4388936758041382, + "rewards/margins": 1.3336596488952637, + "rewards/rejected": -2.7725532054901123, + "step": 5349 + }, + { + "epoch": 0.62, + "learning_rate": 1.1672714503101953e-07, + "logits/chosen": -1.9044177532196045, + "logits/rejected": -2.3091135025024414, + "logps/chosen": -410.21453857421875, + "logps/rejected": -290.1856994628906, + "loss": 0.1471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8507246375083923, + "rewards/margins": 2.804502487182617, + "rewards/rejected": -3.655226707458496, + "step": 5350 + }, + { + "epoch": 0.62, + "learning_rate": 1.166920285613953e-07, + "logits/chosen": -2.378260612487793, + "logits/rejected": -2.5177078247070312, + "logps/chosen": -192.41363525390625, + "logps/rejected": -217.85107421875, + "loss": 0.268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6526641845703125, + "rewards/margins": 2.560410976409912, + "rewards/rejected": -3.2130751609802246, + "step": 5351 + }, + { + "epoch": 0.62, + "learning_rate": 1.1665691209177104e-07, + "logits/chosen": -1.5819337368011475, + "logits/rejected": -1.748934268951416, + "logps/chosen": -295.8421325683594, + "logps/rejected": -258.809814453125, + "loss": 0.54, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6659493446350098, + "rewards/margins": 1.7741185426712036, + "rewards/rejected": -2.440067768096924, + "step": 5352 + }, + { + "epoch": 0.62, + "learning_rate": 1.1662179562214678e-07, + "logits/chosen": -2.7561323642730713, + "logits/rejected": -2.603034496307373, + "logps/chosen": -265.01519775390625, + "logps/rejected": -215.1693115234375, + "loss": 0.6151, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7129005193710327, + "rewards/margins": 1.9388811588287354, + "rewards/rejected": -3.6517817974090576, + "step": 5353 + }, + { + "epoch": 0.62, + "learning_rate": 1.1658667915252252e-07, + "logits/chosen": -2.8180136680603027, + "logits/rejected": -2.903200387954712, + "logps/chosen": -179.04794311523438, + "logps/rejected": -250.51205444335938, + "loss": 0.0949, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6621642112731934, + "rewards/margins": 3.1265010833740234, + "rewards/rejected": -3.788665294647217, + "step": 5354 + }, + { + "epoch": 0.62, + "learning_rate": 1.1655156268289827e-07, + "logits/chosen": -2.4349796772003174, + "logits/rejected": -2.4091227054595947, + "logps/chosen": -259.8687438964844, + "logps/rejected": -245.0784912109375, + "loss": 0.3209, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8777148127555847, + "rewards/margins": 1.3286347389221191, + "rewards/rejected": -2.2063493728637695, + "step": 5355 + }, + { + "epoch": 0.62, + "learning_rate": 1.1651644621327403e-07, + "logits/chosen": -2.776724338531494, + "logits/rejected": -2.74187970161438, + "logps/chosen": -297.1327819824219, + "logps/rejected": -282.83251953125, + "loss": 0.3556, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8919810056686401, + "rewards/margins": 1.3502709865570068, + "rewards/rejected": -2.2422521114349365, + "step": 5356 + }, + { + "epoch": 0.62, + "learning_rate": 1.1648132974364977e-07, + "logits/chosen": -2.0541329383850098, + "logits/rejected": -2.290245294570923, + "logps/chosen": -329.7704772949219, + "logps/rejected": -246.2041778564453, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11683956533670425, + "rewards/margins": 3.348094940185547, + "rewards/rejected": -3.4649343490600586, + "step": 5357 + }, + { + "epoch": 0.62, + "learning_rate": 1.1644621327402551e-07, + "logits/chosen": -2.2097604274749756, + "logits/rejected": -1.9785140752792358, + "logps/chosen": -157.3350372314453, + "logps/rejected": -165.32913208007812, + "loss": 1.0316, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3914000988006592, + "rewards/margins": 1.09339439868927, + "rewards/rejected": -2.4847943782806396, + "step": 5358 + }, + { + "epoch": 0.62, + "learning_rate": 1.1641109680440126e-07, + "logits/chosen": -2.2623515129089355, + "logits/rejected": -1.8858963251113892, + "logps/chosen": -241.49801635742188, + "logps/rejected": -486.3470764160156, + "loss": 0.4269, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1326584815979004, + "rewards/margins": 4.471732139587402, + "rewards/rejected": -5.604390621185303, + "step": 5359 + }, + { + "epoch": 0.62, + "learning_rate": 1.16375980334777e-07, + "logits/chosen": -2.030855655670166, + "logits/rejected": -1.913886308670044, + "logps/chosen": -344.56549072265625, + "logps/rejected": -316.75830078125, + "loss": 0.629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.287973403930664, + "rewards/margins": 1.2005740404129028, + "rewards/rejected": -2.4885475635528564, + "step": 5360 + }, + { + "epoch": 0.62, + "learning_rate": 1.1634086386515274e-07, + "logits/chosen": -2.000391721725464, + "logits/rejected": -1.9887663125991821, + "logps/chosen": -261.39892578125, + "logps/rejected": -355.41534423828125, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0970975011587143, + "rewards/margins": 1.2940337657928467, + "rewards/rejected": -1.3911312818527222, + "step": 5361 + }, + { + "epoch": 0.62, + "learning_rate": 1.163057473955285e-07, + "logits/chosen": -2.292585849761963, + "logits/rejected": -2.352238416671753, + "logps/chosen": -315.60321044921875, + "logps/rejected": -274.040771484375, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8661970496177673, + "rewards/margins": 3.055893898010254, + "rewards/rejected": -3.922091007232666, + "step": 5362 + }, + { + "epoch": 0.62, + "learning_rate": 1.1627063092590425e-07, + "logits/chosen": -2.231405258178711, + "logits/rejected": -1.9029228687286377, + "logps/chosen": -318.69232177734375, + "logps/rejected": -306.874755859375, + "loss": 1.3081, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.746023178100586, + "rewards/margins": 3.145822048187256, + "rewards/rejected": -5.891845226287842, + "step": 5363 + }, + { + "epoch": 0.62, + "learning_rate": 1.1623551445627999e-07, + "logits/chosen": -2.9571471214294434, + "logits/rejected": -2.70969295501709, + "logps/chosen": -261.4036865234375, + "logps/rejected": -277.9122009277344, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1088975667953491, + "rewards/margins": 2.3880536556243896, + "rewards/rejected": -3.496951103210449, + "step": 5364 + }, + { + "epoch": 0.62, + "learning_rate": 1.1620039798665573e-07, + "logits/chosen": -2.460329532623291, + "logits/rejected": -2.432948589324951, + "logps/chosen": -229.12783813476562, + "logps/rejected": -222.01644897460938, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5473877191543579, + "rewards/margins": 3.807968854904175, + "rewards/rejected": -4.355356693267822, + "step": 5365 + }, + { + "epoch": 0.62, + "learning_rate": 1.1616528151703147e-07, + "logits/chosen": -2.3313136100769043, + "logits/rejected": -2.295863151550293, + "logps/chosen": -259.6409606933594, + "logps/rejected": -226.403564453125, + "loss": 0.4694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9522616863250732, + "rewards/margins": 1.7141141891479492, + "rewards/rejected": -2.6663758754730225, + "step": 5366 + }, + { + "epoch": 0.62, + "learning_rate": 1.1613016504740724e-07, + "logits/chosen": -2.3278591632843018, + "logits/rejected": -2.4073076248168945, + "logps/chosen": -318.2768859863281, + "logps/rejected": -292.916015625, + "loss": 0.4003, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2334420680999756, + "rewards/margins": 2.0039520263671875, + "rewards/rejected": -3.237394094467163, + "step": 5367 + }, + { + "epoch": 0.62, + "learning_rate": 1.1609504857778298e-07, + "logits/chosen": -2.1993799209594727, + "logits/rejected": -2.270242691040039, + "logps/chosen": -182.8971405029297, + "logps/rejected": -209.26536560058594, + "loss": 1.4472, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0595176219940186, + "rewards/margins": 1.0259270668029785, + "rewards/rejected": -3.085444450378418, + "step": 5368 + }, + { + "epoch": 0.62, + "learning_rate": 1.1605993210815872e-07, + "logits/chosen": -2.1424736976623535, + "logits/rejected": -2.552548885345459, + "logps/chosen": -363.8639221191406, + "logps/rejected": -204.33026123046875, + "loss": 0.32, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25192978978157043, + "rewards/margins": 2.2288808822631836, + "rewards/rejected": -2.4808106422424316, + "step": 5369 + }, + { + "epoch": 0.62, + "learning_rate": 1.1602481563853446e-07, + "logits/chosen": -2.1054189205169678, + "logits/rejected": -2.4384989738464355, + "logps/chosen": -278.56585693359375, + "logps/rejected": -213.43800354003906, + "loss": 0.7979, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4998130798339844, + "rewards/margins": 1.1933071613311768, + "rewards/rejected": -2.693120241165161, + "step": 5370 + }, + { + "epoch": 0.62, + "learning_rate": 1.1598969916891022e-07, + "logits/chosen": -2.818024158477783, + "logits/rejected": -2.759274959564209, + "logps/chosen": -204.22409057617188, + "logps/rejected": -288.8422546386719, + "loss": 0.2887, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9458082914352417, + "rewards/margins": 1.961253046989441, + "rewards/rejected": -2.9070613384246826, + "step": 5371 + }, + { + "epoch": 0.62, + "learning_rate": 1.1595458269928596e-07, + "logits/chosen": -2.2063779830932617, + "logits/rejected": -2.1567790508270264, + "logps/chosen": -252.4805450439453, + "logps/rejected": -226.65512084960938, + "loss": 0.5403, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9986060857772827, + "rewards/margins": 1.2751822471618652, + "rewards/rejected": -2.2737884521484375, + "step": 5372 + }, + { + "epoch": 0.62, + "learning_rate": 1.1591946622966171e-07, + "logits/chosen": -2.273359775543213, + "logits/rejected": -2.33944034576416, + "logps/chosen": -118.73954772949219, + "logps/rejected": -247.76959228515625, + "loss": 0.3639, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.425309181213379, + "rewards/margins": 3.1405346393585205, + "rewards/rejected": -4.5658440589904785, + "step": 5373 + }, + { + "epoch": 0.62, + "learning_rate": 1.1588434976003745e-07, + "logits/chosen": -2.093430757522583, + "logits/rejected": -2.2576584815979004, + "logps/chosen": -200.72134399414062, + "logps/rejected": -238.9456024169922, + "loss": 0.5347, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.225793719291687, + "rewards/margins": 1.2262094020843506, + "rewards/rejected": -2.452003240585327, + "step": 5374 + }, + { + "epoch": 0.62, + "learning_rate": 1.158492332904132e-07, + "logits/chosen": -1.8606374263763428, + "logits/rejected": -1.6878392696380615, + "logps/chosen": -421.0030822753906, + "logps/rejected": -423.4737243652344, + "loss": 0.4168, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20289376378059387, + "rewards/margins": 1.2168077230453491, + "rewards/rejected": -1.4197014570236206, + "step": 5375 + }, + { + "epoch": 0.62, + "learning_rate": 1.1581411682078895e-07, + "logits/chosen": -2.3498356342315674, + "logits/rejected": -2.3956570625305176, + "logps/chosen": -263.1499938964844, + "logps/rejected": -368.1671142578125, + "loss": 0.7568, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7874255180358887, + "rewards/margins": 0.9083647131919861, + "rewards/rejected": -2.6957902908325195, + "step": 5376 + }, + { + "epoch": 0.62, + "learning_rate": 1.1577900035116469e-07, + "logits/chosen": -2.300663948059082, + "logits/rejected": -2.5787353515625, + "logps/chosen": -447.5445251464844, + "logps/rejected": -255.21771240234375, + "loss": 0.1365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17094777524471283, + "rewards/margins": 3.046576738357544, + "rewards/rejected": -3.217524528503418, + "step": 5377 + }, + { + "epoch": 0.62, + "learning_rate": 1.1574388388154043e-07, + "logits/chosen": -2.829353094100952, + "logits/rejected": -2.564110279083252, + "logps/chosen": -199.2858428955078, + "logps/rejected": -168.325439453125, + "loss": 0.7396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8981318473815918, + "rewards/margins": 1.3634262084960938, + "rewards/rejected": -2.2615580558776855, + "step": 5378 + }, + { + "epoch": 0.62, + "learning_rate": 1.157087674119162e-07, + "logits/chosen": -2.3734757900238037, + "logits/rejected": -2.4843926429748535, + "logps/chosen": -175.32379150390625, + "logps/rejected": -107.467041015625, + "loss": 1.3934, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.7744288444519043, + "rewards/margins": -0.8584328889846802, + "rewards/rejected": -0.9159958958625793, + "step": 5379 + }, + { + "epoch": 0.62, + "learning_rate": 1.1567365094229194e-07, + "logits/chosen": -2.7029242515563965, + "logits/rejected": -2.579683780670166, + "logps/chosen": -146.3284912109375, + "logps/rejected": -148.7958221435547, + "loss": 0.2392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7124354243278503, + "rewards/margins": 1.517227053642273, + "rewards/rejected": -2.2296624183654785, + "step": 5380 + }, + { + "epoch": 0.62, + "learning_rate": 1.1563853447266768e-07, + "logits/chosen": -2.5979440212249756, + "logits/rejected": -2.8010101318359375, + "logps/chosen": -182.98065185546875, + "logps/rejected": -107.67953491210938, + "loss": 0.6108, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3765792846679688, + "rewards/margins": 0.39474958181381226, + "rewards/rejected": -1.7713288068771362, + "step": 5381 + }, + { + "epoch": 0.62, + "learning_rate": 1.1560341800304342e-07, + "logits/chosen": -2.3702316284179688, + "logits/rejected": -2.425980567932129, + "logps/chosen": -232.45327758789062, + "logps/rejected": -256.088623046875, + "loss": 0.4596, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8637068271636963, + "rewards/margins": 2.497554063796997, + "rewards/rejected": -3.3612608909606934, + "step": 5382 + }, + { + "epoch": 0.62, + "learning_rate": 1.1556830153341916e-07, + "logits/chosen": -2.1894607543945312, + "logits/rejected": -2.0799052715301514, + "logps/chosen": -334.5174255371094, + "logps/rejected": -367.7757263183594, + "loss": 0.3074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6822738647460938, + "rewards/margins": 2.1835713386535645, + "rewards/rejected": -2.8658454418182373, + "step": 5383 + }, + { + "epoch": 0.62, + "learning_rate": 1.1553318506379492e-07, + "logits/chosen": -2.769996166229248, + "logits/rejected": -2.5166971683502197, + "logps/chosen": -515.9857788085938, + "logps/rejected": -256.1715393066406, + "loss": 0.6408, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.927295207977295, + "rewards/margins": 1.306774616241455, + "rewards/rejected": -3.23406982421875, + "step": 5384 + }, + { + "epoch": 0.62, + "learning_rate": 1.1549806859417066e-07, + "logits/chosen": -3.0268588066101074, + "logits/rejected": -2.9561667442321777, + "logps/chosen": -242.66351318359375, + "logps/rejected": -257.39434814453125, + "loss": 0.2969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7999719381332397, + "rewards/margins": 4.440176486968994, + "rewards/rejected": -5.240148544311523, + "step": 5385 + }, + { + "epoch": 0.62, + "learning_rate": 1.154629521245464e-07, + "logits/chosen": -2.383998394012451, + "logits/rejected": -2.606128454208374, + "logps/chosen": -281.8030700683594, + "logps/rejected": -290.3827819824219, + "loss": 0.1119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6334977149963379, + "rewards/margins": 2.757762908935547, + "rewards/rejected": -3.3912606239318848, + "step": 5386 + }, + { + "epoch": 0.62, + "learning_rate": 1.1542783565492215e-07, + "logits/chosen": -1.7635085582733154, + "logits/rejected": -2.2960281372070312, + "logps/chosen": -257.7177429199219, + "logps/rejected": -257.2599182128906, + "loss": 0.5448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22878393530845642, + "rewards/margins": 1.9541599750518799, + "rewards/rejected": -2.1829440593719482, + "step": 5387 + }, + { + "epoch": 0.62, + "learning_rate": 1.153927191852979e-07, + "logits/chosen": -2.7139840126037598, + "logits/rejected": -2.4388303756713867, + "logps/chosen": -397.7704162597656, + "logps/rejected": -374.826904296875, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6028299927711487, + "rewards/margins": 2.1932036876678467, + "rewards/rejected": -2.7960338592529297, + "step": 5388 + }, + { + "epoch": 0.62, + "learning_rate": 1.1535760271567364e-07, + "logits/chosen": -1.9785491228103638, + "logits/rejected": -1.9840260744094849, + "logps/chosen": -300.9521484375, + "logps/rejected": -209.1636505126953, + "loss": 0.3508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.919185221195221, + "rewards/margins": 2.0220794677734375, + "rewards/rejected": -2.9412646293640137, + "step": 5389 + }, + { + "epoch": 0.62, + "learning_rate": 1.153224862460494e-07, + "logits/chosen": -2.6753249168395996, + "logits/rejected": -2.424219846725464, + "logps/chosen": -146.4368896484375, + "logps/rejected": -225.45791625976562, + "loss": 0.1428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08102984726428986, + "rewards/margins": 3.9227068424224854, + "rewards/rejected": -4.00373649597168, + "step": 5390 + }, + { + "epoch": 0.62, + "learning_rate": 1.1528736977642513e-07, + "logits/chosen": -2.371455669403076, + "logits/rejected": -1.9880931377410889, + "logps/chosen": -173.67828369140625, + "logps/rejected": -392.8952941894531, + "loss": 0.456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.705518364906311, + "rewards/margins": 2.835723876953125, + "rewards/rejected": -3.5412425994873047, + "step": 5391 + }, + { + "epoch": 0.62, + "learning_rate": 1.1525225330680089e-07, + "logits/chosen": -2.051316738128662, + "logits/rejected": -1.797852635383606, + "logps/chosen": -386.4218444824219, + "logps/rejected": -352.65704345703125, + "loss": 0.3168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8879094123840332, + "rewards/margins": 1.6851495504379272, + "rewards/rejected": -2.57305908203125, + "step": 5392 + }, + { + "epoch": 0.62, + "learning_rate": 1.1521713683717663e-07, + "logits/chosen": -2.050642251968384, + "logits/rejected": -1.8499658107757568, + "logps/chosen": -123.98992919921875, + "logps/rejected": -226.3799285888672, + "loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24214743077754974, + "rewards/margins": 2.0135996341705322, + "rewards/rejected": -2.255746841430664, + "step": 5393 + }, + { + "epoch": 0.62, + "learning_rate": 1.1518202036755237e-07, + "logits/chosen": -2.3704564571380615, + "logits/rejected": -2.477128744125366, + "logps/chosen": -168.77777099609375, + "logps/rejected": -205.6742401123047, + "loss": 0.1825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6779804229736328, + "rewards/margins": 2.4674010276794434, + "rewards/rejected": -3.145381450653076, + "step": 5394 + }, + { + "epoch": 0.62, + "learning_rate": 1.1514690389792811e-07, + "logits/chosen": -2.3995139598846436, + "logits/rejected": -2.307478427886963, + "logps/chosen": -159.80824279785156, + "logps/rejected": -232.44021606445312, + "loss": 0.4418, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.222816824913025, + "rewards/margins": 1.5162279605865479, + "rewards/rejected": -2.739044666290283, + "step": 5395 + }, + { + "epoch": 0.62, + "learning_rate": 1.1511178742830388e-07, + "logits/chosen": -2.5220141410827637, + "logits/rejected": -2.523894786834717, + "logps/chosen": -464.92755126953125, + "logps/rejected": -323.13372802734375, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3111059367656708, + "rewards/margins": 3.2808656692504883, + "rewards/rejected": -3.5919713973999023, + "step": 5396 + }, + { + "epoch": 0.62, + "learning_rate": 1.1507667095867962e-07, + "logits/chosen": -2.2536745071411133, + "logits/rejected": -2.3564066886901855, + "logps/chosen": -235.78652954101562, + "logps/rejected": -288.1522521972656, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18000587821006775, + "rewards/margins": 2.470552682876587, + "rewards/rejected": -2.6505584716796875, + "step": 5397 + }, + { + "epoch": 0.62, + "learning_rate": 1.1504155448905536e-07, + "logits/chosen": -2.3445377349853516, + "logits/rejected": -2.41135835647583, + "logps/chosen": -283.2763366699219, + "logps/rejected": -259.7486877441406, + "loss": 0.3363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9418483972549438, + "rewards/margins": 2.77783465385437, + "rewards/rejected": -3.7196831703186035, + "step": 5398 + }, + { + "epoch": 0.62, + "learning_rate": 1.150064380194311e-07, + "logits/chosen": -2.2306811809539795, + "logits/rejected": -2.105801820755005, + "logps/chosen": -364.2410888671875, + "logps/rejected": -414.999755859375, + "loss": 0.1858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49054187536239624, + "rewards/margins": 2.390150547027588, + "rewards/rejected": -2.880692481994629, + "step": 5399 + }, + { + "epoch": 0.62, + "learning_rate": 1.1497132154980687e-07, + "logits/chosen": -2.5953147411346436, + "logits/rejected": -2.5388283729553223, + "logps/chosen": -336.47052001953125, + "logps/rejected": -374.70269775390625, + "loss": 0.5433, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0039902925491333, + "rewards/margins": 1.9345160722732544, + "rewards/rejected": -2.9385063648223877, + "step": 5400 + }, + { + "epoch": 0.62, + "learning_rate": 1.1493620508018261e-07, + "logits/chosen": -2.654736280441284, + "logits/rejected": -2.5983784198760986, + "logps/chosen": -260.7300109863281, + "logps/rejected": -345.0323181152344, + "loss": 0.5431, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3550513982772827, + "rewards/margins": 2.1775062084198, + "rewards/rejected": -3.532557487487793, + "step": 5401 + }, + { + "epoch": 0.62, + "learning_rate": 1.1490108861055835e-07, + "logits/chosen": -2.4119324684143066, + "logits/rejected": -2.312351942062378, + "logps/chosen": -475.77899169921875, + "logps/rejected": -298.5447692871094, + "loss": 0.4703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6224327087402344, + "rewards/margins": 1.858413815498352, + "rewards/rejected": -2.480846643447876, + "step": 5402 + }, + { + "epoch": 0.62, + "learning_rate": 1.1486597214093409e-07, + "logits/chosen": -1.9186275005340576, + "logits/rejected": -1.9563020467758179, + "logps/chosen": -469.30194091796875, + "logps/rejected": -388.20941162109375, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7751048803329468, + "rewards/margins": 1.1167230606079102, + "rewards/rejected": -1.8918280601501465, + "step": 5403 + }, + { + "epoch": 0.62, + "learning_rate": 1.1483085567130984e-07, + "logits/chosen": -2.016710042953491, + "logits/rejected": -2.0270023345947266, + "logps/chosen": -134.00782775878906, + "logps/rejected": -341.12518310546875, + "loss": 0.3899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5244331359863281, + "rewards/margins": 3.039349317550659, + "rewards/rejected": -3.5637826919555664, + "step": 5404 + }, + { + "epoch": 0.62, + "learning_rate": 1.1479573920168558e-07, + "logits/chosen": -2.487074851989746, + "logits/rejected": -2.218242645263672, + "logps/chosen": -291.2976989746094, + "logps/rejected": -255.40576171875, + "loss": 0.3114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5904275178909302, + "rewards/margins": 1.6248235702514648, + "rewards/rejected": -3.2152512073516846, + "step": 5405 + }, + { + "epoch": 0.62, + "learning_rate": 1.1476062273206132e-07, + "logits/chosen": -2.1070549488067627, + "logits/rejected": -2.20717191696167, + "logps/chosen": -336.5833435058594, + "logps/rejected": -308.97442626953125, + "loss": 0.4988, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1953881978988647, + "rewards/margins": 2.4128239154815674, + "rewards/rejected": -3.6082119941711426, + "step": 5406 + }, + { + "epoch": 0.62, + "learning_rate": 1.1472550626243708e-07, + "logits/chosen": -2.531639575958252, + "logits/rejected": -2.4945523738861084, + "logps/chosen": -184.91416931152344, + "logps/rejected": -219.89842224121094, + "loss": 0.181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7574025988578796, + "rewards/margins": 2.911071538925171, + "rewards/rejected": -3.668473958969116, + "step": 5407 + }, + { + "epoch": 0.62, + "learning_rate": 1.1469038979281283e-07, + "logits/chosen": -1.492102861404419, + "logits/rejected": -1.7855911254882812, + "logps/chosen": -481.26226806640625, + "logps/rejected": -403.33551025390625, + "loss": 0.6997, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.249938488006592, + "rewards/margins": 1.6083894968032837, + "rewards/rejected": -4.858327865600586, + "step": 5408 + }, + { + "epoch": 0.62, + "learning_rate": 1.1465527332318857e-07, + "logits/chosen": -2.16892147064209, + "logits/rejected": -2.1464953422546387, + "logps/chosen": -373.492919921875, + "logps/rejected": -469.7626647949219, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3069398403167725, + "rewards/margins": 3.2057459354400635, + "rewards/rejected": -4.512685298919678, + "step": 5409 + }, + { + "epoch": 0.62, + "learning_rate": 1.1462015685356431e-07, + "logits/chosen": -1.9342010021209717, + "logits/rejected": -1.879265308380127, + "logps/chosen": -256.0663146972656, + "logps/rejected": -258.2719421386719, + "loss": 0.4787, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.999902069568634, + "rewards/margins": 1.9011245965957642, + "rewards/rejected": -2.901026964187622, + "step": 5410 + }, + { + "epoch": 0.62, + "learning_rate": 1.1458504038394005e-07, + "logits/chosen": -2.256546974182129, + "logits/rejected": -2.3530571460723877, + "logps/chosen": -210.22064208984375, + "logps/rejected": -159.26434326171875, + "loss": 0.4119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5242147445678711, + "rewards/margins": 1.1373753547668457, + "rewards/rejected": -1.6615900993347168, + "step": 5411 + }, + { + "epoch": 0.62, + "learning_rate": 1.1454992391431582e-07, + "logits/chosen": -2.0544025897979736, + "logits/rejected": -1.8459665775299072, + "logps/chosen": -374.1295166015625, + "logps/rejected": -290.27655029296875, + "loss": 0.2772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9272937774658203, + "rewards/margins": 1.9236174821853638, + "rewards/rejected": -2.8509111404418945, + "step": 5412 + }, + { + "epoch": 0.62, + "learning_rate": 1.1451480744469156e-07, + "logits/chosen": -2.571016550064087, + "logits/rejected": -2.5802083015441895, + "logps/chosen": -223.7191619873047, + "logps/rejected": -341.2024841308594, + "loss": 0.3324, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6081141829490662, + "rewards/margins": 2.472254514694214, + "rewards/rejected": -3.0803685188293457, + "step": 5413 + }, + { + "epoch": 0.62, + "learning_rate": 1.144796909750673e-07, + "logits/chosen": -2.3102850914001465, + "logits/rejected": -2.5389914512634277, + "logps/chosen": -196.64553833007812, + "logps/rejected": -175.83297729492188, + "loss": 0.5612, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9168417453765869, + "rewards/margins": 0.8889888525009155, + "rewards/rejected": -1.8058305978775024, + "step": 5414 + }, + { + "epoch": 0.62, + "learning_rate": 1.1444457450544304e-07, + "logits/chosen": -2.7266547679901123, + "logits/rejected": -2.6225249767303467, + "logps/chosen": -91.62026977539062, + "logps/rejected": -151.73526000976562, + "loss": 0.1738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40744495391845703, + "rewards/margins": 2.320789337158203, + "rewards/rejected": -2.7282345294952393, + "step": 5415 + }, + { + "epoch": 0.62, + "learning_rate": 1.144094580358188e-07, + "logits/chosen": -1.5010051727294922, + "logits/rejected": -2.0725765228271484, + "logps/chosen": -412.0226135253906, + "logps/rejected": -253.66781616210938, + "loss": 0.5063, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0595983266830444, + "rewards/margins": 1.2233991622924805, + "rewards/rejected": -2.2829976081848145, + "step": 5416 + }, + { + "epoch": 0.62, + "learning_rate": 1.1437434156619455e-07, + "logits/chosen": -2.4018394947052, + "logits/rejected": -2.357416868209839, + "logps/chosen": -393.787841796875, + "logps/rejected": -269.7915344238281, + "loss": 0.3518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.627473771572113, + "rewards/margins": 1.4842078685760498, + "rewards/rejected": -2.1116816997528076, + "step": 5417 + }, + { + "epoch": 0.62, + "learning_rate": 1.1433922509657029e-07, + "logits/chosen": -2.5832338333129883, + "logits/rejected": -2.4260342121124268, + "logps/chosen": -150.24505615234375, + "logps/rejected": -250.91888427734375, + "loss": 0.4947, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8015336990356445, + "rewards/margins": 0.6868572235107422, + "rewards/rejected": -1.4883908033370972, + "step": 5418 + }, + { + "epoch": 0.62, + "learning_rate": 1.1430410862694603e-07, + "logits/chosen": -2.4633708000183105, + "logits/rejected": -2.3986575603485107, + "logps/chosen": -285.37384033203125, + "logps/rejected": -255.82662963867188, + "loss": 0.2164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43353381752967834, + "rewards/margins": 2.5894358158111572, + "rewards/rejected": -3.0229697227478027, + "step": 5419 + }, + { + "epoch": 0.62, + "learning_rate": 1.1426899215732178e-07, + "logits/chosen": -2.0466134548187256, + "logits/rejected": -1.722393274307251, + "logps/chosen": -175.87014770507812, + "logps/rejected": -321.9148864746094, + "loss": 0.2273, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4403825998306274, + "rewards/margins": 2.5663414001464844, + "rewards/rejected": -4.006723880767822, + "step": 5420 + }, + { + "epoch": 0.62, + "learning_rate": 1.1423387568769753e-07, + "logits/chosen": -2.0715341567993164, + "logits/rejected": -2.411234140396118, + "logps/chosen": -369.184326171875, + "logps/rejected": -312.81011962890625, + "loss": 0.5493, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9454295635223389, + "rewards/margins": 1.6205390691757202, + "rewards/rejected": -3.5659687519073486, + "step": 5421 + }, + { + "epoch": 0.63, + "learning_rate": 1.1419875921807327e-07, + "logits/chosen": -2.6687769889831543, + "logits/rejected": -2.6699092388153076, + "logps/chosen": -222.32826232910156, + "logps/rejected": -227.79293823242188, + "loss": 0.2712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8682392835617065, + "rewards/margins": 2.510807991027832, + "rewards/rejected": -3.37904691696167, + "step": 5422 + }, + { + "epoch": 0.63, + "learning_rate": 1.1416364274844901e-07, + "logits/chosen": -1.9558240175247192, + "logits/rejected": -2.0776448249816895, + "logps/chosen": -400.4325256347656, + "logps/rejected": -311.0220642089844, + "loss": 0.4892, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1889333724975586, + "rewards/margins": 1.9022717475891113, + "rewards/rejected": -3.09120512008667, + "step": 5423 + }, + { + "epoch": 0.63, + "learning_rate": 1.1412852627882477e-07, + "logits/chosen": -2.8959803581237793, + "logits/rejected": -2.9318747520446777, + "logps/chosen": -249.5180206298828, + "logps/rejected": -198.01144409179688, + "loss": 0.2153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8807402849197388, + "rewards/margins": 1.8686702251434326, + "rewards/rejected": -2.749410629272461, + "step": 5424 + }, + { + "epoch": 0.63, + "learning_rate": 1.1409340980920051e-07, + "logits/chosen": -2.6634137630462646, + "logits/rejected": -2.5885767936706543, + "logps/chosen": -297.7049865722656, + "logps/rejected": -193.63858032226562, + "loss": 0.2919, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0617327690124512, + "rewards/margins": 2.608635425567627, + "rewards/rejected": -3.670368194580078, + "step": 5425 + }, + { + "epoch": 0.63, + "learning_rate": 1.1405829333957626e-07, + "logits/chosen": -1.7757930755615234, + "logits/rejected": -1.8473458290100098, + "logps/chosen": -360.91778564453125, + "logps/rejected": -399.7983703613281, + "loss": 0.578, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0591819286346436, + "rewards/margins": 2.520230293273926, + "rewards/rejected": -3.5794124603271484, + "step": 5426 + }, + { + "epoch": 0.63, + "learning_rate": 1.14023176869952e-07, + "logits/chosen": -1.879016637802124, + "logits/rejected": -1.8484001159667969, + "logps/chosen": -336.000244140625, + "logps/rejected": -401.4143981933594, + "loss": 0.523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6054561138153076, + "rewards/margins": 1.4621776342391968, + "rewards/rejected": -2.067633867263794, + "step": 5427 + }, + { + "epoch": 0.63, + "learning_rate": 1.1398806040032776e-07, + "logits/chosen": -2.22798490524292, + "logits/rejected": -2.321321964263916, + "logps/chosen": -280.29022216796875, + "logps/rejected": -273.225830078125, + "loss": 0.716, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.515822172164917, + "rewards/margins": 0.9333008527755737, + "rewards/rejected": -2.4491231441497803, + "step": 5428 + }, + { + "epoch": 0.63, + "learning_rate": 1.139529439307035e-07, + "logits/chosen": -2.5931975841522217, + "logits/rejected": -2.223018169403076, + "logps/chosen": -210.9309539794922, + "logps/rejected": -307.43609619140625, + "loss": 0.9652, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1850521564483643, + "rewards/margins": 1.09921395778656, + "rewards/rejected": -2.2842659950256348, + "step": 5429 + }, + { + "epoch": 0.63, + "learning_rate": 1.1391782746107924e-07, + "logits/chosen": -1.5196986198425293, + "logits/rejected": -1.7116198539733887, + "logps/chosen": -628.6118774414062, + "logps/rejected": -594.5115356445312, + "loss": 0.4856, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7352502346038818, + "rewards/margins": 0.9266065359115601, + "rewards/rejected": -2.6618566513061523, + "step": 5430 + }, + { + "epoch": 0.63, + "learning_rate": 1.1388271099145498e-07, + "logits/chosen": -2.5350120067596436, + "logits/rejected": -2.6339168548583984, + "logps/chosen": -372.15087890625, + "logps/rejected": -320.5565185546875, + "loss": 0.1984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5753728151321411, + "rewards/margins": 3.11673903465271, + "rewards/rejected": -3.6921114921569824, + "step": 5431 + }, + { + "epoch": 0.63, + "learning_rate": 1.1384759452183073e-07, + "logits/chosen": -2.665975570678711, + "logits/rejected": -2.788450241088867, + "logps/chosen": -151.32977294921875, + "logps/rejected": -298.5323181152344, + "loss": 0.2243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7654847502708435, + "rewards/margins": 4.439288139343262, + "rewards/rejected": -5.204772472381592, + "step": 5432 + }, + { + "epoch": 0.63, + "learning_rate": 1.1381247805220648e-07, + "logits/chosen": -2.4136016368865967, + "logits/rejected": -2.4633867740631104, + "logps/chosen": -188.915771484375, + "logps/rejected": -257.95062255859375, + "loss": 0.4007, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4134001731872559, + "rewards/margins": 1.3075898885726929, + "rewards/rejected": -2.7209901809692383, + "step": 5433 + }, + { + "epoch": 0.63, + "learning_rate": 1.1377736158258223e-07, + "logits/chosen": -2.4816651344299316, + "logits/rejected": -2.577969551086426, + "logps/chosen": -227.40016174316406, + "logps/rejected": -342.7960205078125, + "loss": 0.7131, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3802950382232666, + "rewards/margins": 2.1486361026763916, + "rewards/rejected": -3.528931140899658, + "step": 5434 + }, + { + "epoch": 0.63, + "learning_rate": 1.1374224511295797e-07, + "logits/chosen": -2.1442582607269287, + "logits/rejected": -2.2393178939819336, + "logps/chosen": -400.8770446777344, + "logps/rejected": -308.212646484375, + "loss": 0.27, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7704224586486816, + "rewards/margins": 1.7482389211654663, + "rewards/rejected": -2.5186612606048584, + "step": 5435 + }, + { + "epoch": 0.63, + "learning_rate": 1.1370712864333371e-07, + "logits/chosen": -2.2850687503814697, + "logits/rejected": -2.63344669342041, + "logps/chosen": -184.00790405273438, + "logps/rejected": -162.56924438476562, + "loss": 0.0904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06700142472982407, + "rewards/margins": 3.7721543312072754, + "rewards/rejected": -3.839155912399292, + "step": 5436 + }, + { + "epoch": 0.63, + "learning_rate": 1.1367201217370947e-07, + "logits/chosen": -2.2334001064300537, + "logits/rejected": -2.532336711883545, + "logps/chosen": -225.44143676757812, + "logps/rejected": -341.98114013671875, + "loss": 0.4225, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4284932613372803, + "rewards/margins": 1.8124492168426514, + "rewards/rejected": -3.2409427165985107, + "step": 5437 + }, + { + "epoch": 0.63, + "learning_rate": 1.1363689570408521e-07, + "logits/chosen": -1.8127821683883667, + "logits/rejected": -1.7634660005569458, + "logps/chosen": -295.9225769042969, + "logps/rejected": -335.13385009765625, + "loss": 0.3524, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41188594698905945, + "rewards/margins": 1.7959468364715576, + "rewards/rejected": -2.2078328132629395, + "step": 5438 + }, + { + "epoch": 0.63, + "learning_rate": 1.1360177923446095e-07, + "logits/chosen": -2.341364860534668, + "logits/rejected": -2.242830276489258, + "logps/chosen": -92.18663024902344, + "logps/rejected": -228.44482421875, + "loss": 0.4113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6021777987480164, + "rewards/margins": 2.676131010055542, + "rewards/rejected": -3.278308868408203, + "step": 5439 + }, + { + "epoch": 0.63, + "learning_rate": 1.1356666276483669e-07, + "logits/chosen": -1.7327840328216553, + "logits/rejected": -1.839531660079956, + "logps/chosen": -258.60443115234375, + "logps/rejected": -259.8385314941406, + "loss": 0.944, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8076856136322021, + "rewards/margins": 2.0282368659973145, + "rewards/rejected": -3.8359227180480957, + "step": 5440 + }, + { + "epoch": 0.63, + "learning_rate": 1.1353154629521246e-07, + "logits/chosen": -2.2197694778442383, + "logits/rejected": -2.1365790367126465, + "logps/chosen": -215.75796508789062, + "logps/rejected": -298.4355773925781, + "loss": 0.4261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15099698305130005, + "rewards/margins": 1.0388638973236084, + "rewards/rejected": -1.1898608207702637, + "step": 5441 + }, + { + "epoch": 0.63, + "learning_rate": 1.134964298255882e-07, + "logits/chosen": -2.17966890335083, + "logits/rejected": -1.9707107543945312, + "logps/chosen": -443.5997009277344, + "logps/rejected": -486.2425842285156, + "loss": 0.524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0694912672042847, + "rewards/margins": 1.1181714534759521, + "rewards/rejected": -2.1876626014709473, + "step": 5442 + }, + { + "epoch": 0.63, + "learning_rate": 1.1346131335596394e-07, + "logits/chosen": -2.545380115509033, + "logits/rejected": -2.5656533241271973, + "logps/chosen": -187.98294067382812, + "logps/rejected": -206.4467315673828, + "loss": 0.4656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1575238704681396, + "rewards/margins": 3.2941761016845703, + "rewards/rejected": -4.451700210571289, + "step": 5443 + }, + { + "epoch": 0.63, + "learning_rate": 1.1342619688633968e-07, + "logits/chosen": -2.406581401824951, + "logits/rejected": -2.332364797592163, + "logps/chosen": -220.2755889892578, + "logps/rejected": -275.83270263671875, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.444041132926941, + "rewards/margins": 3.085299015045166, + "rewards/rejected": -4.5293402671813965, + "step": 5444 + }, + { + "epoch": 0.63, + "learning_rate": 1.1339108041671545e-07, + "logits/chosen": -2.2671217918395996, + "logits/rejected": -2.386080741882324, + "logps/chosen": -275.8888244628906, + "logps/rejected": -224.80841064453125, + "loss": 0.5186, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8246628046035767, + "rewards/margins": 1.5612666606903076, + "rewards/rejected": -2.385929584503174, + "step": 5445 + }, + { + "epoch": 0.63, + "learning_rate": 1.1335596394709119e-07, + "logits/chosen": -2.4268605709075928, + "logits/rejected": -2.485114574432373, + "logps/chosen": -191.7105255126953, + "logps/rejected": -165.86538696289062, + "loss": 0.36, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27618005871772766, + "rewards/margins": 2.0142927169799805, + "rewards/rejected": -2.290472984313965, + "step": 5446 + }, + { + "epoch": 0.63, + "learning_rate": 1.1332084747746693e-07, + "logits/chosen": -2.2479469776153564, + "logits/rejected": -2.066061496734619, + "logps/chosen": -273.01397705078125, + "logps/rejected": -260.7538757324219, + "loss": 0.6233, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.260165810585022, + "rewards/margins": 1.6318737268447876, + "rewards/rejected": -2.8920395374298096, + "step": 5447 + }, + { + "epoch": 0.63, + "learning_rate": 1.1328573100784267e-07, + "logits/chosen": -2.255322217941284, + "logits/rejected": -2.049644947052002, + "logps/chosen": -116.44789123535156, + "logps/rejected": -243.29380798339844, + "loss": 0.3351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22918111085891724, + "rewards/margins": 1.8547210693359375, + "rewards/rejected": -2.08390212059021, + "step": 5448 + }, + { + "epoch": 0.63, + "learning_rate": 1.1325061453821842e-07, + "logits/chosen": -1.9178907871246338, + "logits/rejected": -1.9242161512374878, + "logps/chosen": -241.75579833984375, + "logps/rejected": -183.72547912597656, + "loss": 0.6593, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47830623388290405, + "rewards/margins": 1.3575057983398438, + "rewards/rejected": -1.835811972618103, + "step": 5449 + }, + { + "epoch": 0.63, + "learning_rate": 1.1321549806859416e-07, + "logits/chosen": -2.677459955215454, + "logits/rejected": -2.6192660331726074, + "logps/chosen": -285.2259521484375, + "logps/rejected": -304.8778991699219, + "loss": 0.4516, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.670414447784424, + "rewards/margins": 2.122715473175049, + "rewards/rejected": -4.793129920959473, + "step": 5450 + }, + { + "epoch": 0.63, + "learning_rate": 1.1318038159896992e-07, + "logits/chosen": -2.271003007888794, + "logits/rejected": -2.6756339073181152, + "logps/chosen": -308.5283508300781, + "logps/rejected": -252.90591430664062, + "loss": 0.1814, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5259175300598145, + "rewards/margins": 3.9789342880249023, + "rewards/rejected": -4.504851818084717, + "step": 5451 + }, + { + "epoch": 0.63, + "learning_rate": 1.1314526512934566e-07, + "logits/chosen": -2.093282699584961, + "logits/rejected": -2.321119785308838, + "logps/chosen": -614.2615356445312, + "logps/rejected": -406.6205749511719, + "loss": 0.3113, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15627726912498474, + "rewards/margins": 2.813748836517334, + "rewards/rejected": -2.9700260162353516, + "step": 5452 + }, + { + "epoch": 0.63, + "learning_rate": 1.1311014865972141e-07, + "logits/chosen": -2.430126190185547, + "logits/rejected": -2.4412355422973633, + "logps/chosen": -234.07098388671875, + "logps/rejected": -261.7791748046875, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1978356838226318, + "rewards/margins": 2.473140239715576, + "rewards/rejected": -3.670976400375366, + "step": 5453 + }, + { + "epoch": 0.63, + "learning_rate": 1.1307503219009715e-07, + "logits/chosen": -2.5494484901428223, + "logits/rejected": -2.4953274726867676, + "logps/chosen": -136.62564086914062, + "logps/rejected": -245.68646240234375, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4350791573524475, + "rewards/margins": 3.4286577701568604, + "rewards/rejected": -3.863737106323242, + "step": 5454 + }, + { + "epoch": 0.63, + "learning_rate": 1.1303991572047289e-07, + "logits/chosen": -1.8874800205230713, + "logits/rejected": -2.345634937286377, + "logps/chosen": -391.50244140625, + "logps/rejected": -197.27728271484375, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36807167530059814, + "rewards/margins": 1.7518230676651, + "rewards/rejected": -2.1198947429656982, + "step": 5455 + }, + { + "epoch": 0.63, + "learning_rate": 1.1300479925084863e-07, + "logits/chosen": -2.6456165313720703, + "logits/rejected": -2.6131670475006104, + "logps/chosen": -234.95333862304688, + "logps/rejected": -119.00619506835938, + "loss": 0.4094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5362637042999268, + "rewards/margins": 1.360579013824463, + "rewards/rejected": -1.8968427181243896, + "step": 5456 + }, + { + "epoch": 0.63, + "learning_rate": 1.129696827812244e-07, + "logits/chosen": -2.6162221431732178, + "logits/rejected": -2.6054041385650635, + "logps/chosen": -470.7397766113281, + "logps/rejected": -343.6514587402344, + "loss": 0.3139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6505413055419922, + "rewards/margins": 2.3477494716644287, + "rewards/rejected": -2.998291015625, + "step": 5457 + }, + { + "epoch": 0.63, + "learning_rate": 1.1293456631160014e-07, + "logits/chosen": -2.4748826026916504, + "logits/rejected": -2.377842664718628, + "logps/chosen": -241.8787384033203, + "logps/rejected": -606.0159301757812, + "loss": 0.1927, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7518975138664246, + "rewards/margins": 3.9370310306549072, + "rewards/rejected": -4.688928604125977, + "step": 5458 + }, + { + "epoch": 0.63, + "learning_rate": 1.1289944984197588e-07, + "logits/chosen": -2.787813901901245, + "logits/rejected": -2.5414915084838867, + "logps/chosen": -255.99400329589844, + "logps/rejected": -431.27716064453125, + "loss": 0.2813, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2171398401260376, + "rewards/margins": 2.4081482887268066, + "rewards/rejected": -3.625288486480713, + "step": 5459 + }, + { + "epoch": 0.63, + "learning_rate": 1.1286433337235162e-07, + "logits/chosen": -1.9626514911651611, + "logits/rejected": -2.5091307163238525, + "logps/chosen": -418.3918151855469, + "logps/rejected": -289.7417297363281, + "loss": 0.4447, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36336749792099, + "rewards/margins": 2.6125993728637695, + "rewards/rejected": -2.9759669303894043, + "step": 5460 + }, + { + "epoch": 0.63, + "learning_rate": 1.1282921690272738e-07, + "logits/chosen": -2.6068296432495117, + "logits/rejected": -2.4838979244232178, + "logps/chosen": -373.4358825683594, + "logps/rejected": -219.95474243164062, + "loss": 0.7086, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3034895658493042, + "rewards/margins": 1.1135988235473633, + "rewards/rejected": -2.417088270187378, + "step": 5461 + }, + { + "epoch": 0.63, + "learning_rate": 1.1279410043310313e-07, + "logits/chosen": -1.6449172496795654, + "logits/rejected": -1.854158878326416, + "logps/chosen": -307.3636779785156, + "logps/rejected": -250.60769653320312, + "loss": 0.6236, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6742698550224304, + "rewards/margins": 1.1333190202713013, + "rewards/rejected": -1.8075886964797974, + "step": 5462 + }, + { + "epoch": 0.63, + "learning_rate": 1.1275898396347887e-07, + "logits/chosen": -2.697445869445801, + "logits/rejected": -2.6311216354370117, + "logps/chosen": -247.65066528320312, + "logps/rejected": -225.4961700439453, + "loss": 0.2076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6942193508148193, + "rewards/margins": 3.06760835647583, + "rewards/rejected": -4.7618279457092285, + "step": 5463 + }, + { + "epoch": 0.63, + "learning_rate": 1.1272386749385461e-07, + "logits/chosen": -2.5121588706970215, + "logits/rejected": -2.421703577041626, + "logps/chosen": -149.13397216796875, + "logps/rejected": -178.5073699951172, + "loss": 0.2099, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2258777916431427, + "rewards/margins": 2.033324956893921, + "rewards/rejected": -2.259202718734741, + "step": 5464 + }, + { + "epoch": 0.63, + "learning_rate": 1.1268875102423036e-07, + "logits/chosen": -2.389043092727661, + "logits/rejected": -2.663461446762085, + "logps/chosen": -502.3416442871094, + "logps/rejected": -348.53240966796875, + "loss": 0.5286, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.124276876449585, + "rewards/margins": 1.6856296062469482, + "rewards/rejected": -2.809906482696533, + "step": 5465 + }, + { + "epoch": 0.63, + "learning_rate": 1.126536345546061e-07, + "logits/chosen": -2.36320161819458, + "logits/rejected": -2.4937939643859863, + "logps/chosen": -334.62298583984375, + "logps/rejected": -285.7183532714844, + "loss": 0.2402, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1836555004119873, + "rewards/margins": 2.9954590797424316, + "rewards/rejected": -4.17911434173584, + "step": 5466 + }, + { + "epoch": 0.63, + "learning_rate": 1.1261851808498185e-07, + "logits/chosen": -1.8384675979614258, + "logits/rejected": -2.1756625175476074, + "logps/chosen": -455.585693359375, + "logps/rejected": -316.4755859375, + "loss": 0.2745, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6516450643539429, + "rewards/margins": 2.738966703414917, + "rewards/rejected": -3.3906118869781494, + "step": 5467 + }, + { + "epoch": 0.63, + "learning_rate": 1.1258340161535759e-07, + "logits/chosen": -2.0588769912719727, + "logits/rejected": -2.095445394515991, + "logps/chosen": -396.681884765625, + "logps/rejected": -296.0037841796875, + "loss": 0.4188, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6229843497276306, + "rewards/margins": 2.015803813934326, + "rewards/rejected": -2.6387882232666016, + "step": 5468 + }, + { + "epoch": 0.63, + "learning_rate": 1.1254828514573335e-07, + "logits/chosen": -2.5934290885925293, + "logits/rejected": -2.4982411861419678, + "logps/chosen": -250.21961975097656, + "logps/rejected": -208.74945068359375, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5913262367248535, + "rewards/margins": 1.8035986423492432, + "rewards/rejected": -2.3949246406555176, + "step": 5469 + }, + { + "epoch": 0.63, + "learning_rate": 1.125131686761091e-07, + "logits/chosen": -2.2996158599853516, + "logits/rejected": -2.5139334201812744, + "logps/chosen": -220.21337890625, + "logps/rejected": -129.7056427001953, + "loss": 0.9158, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3867863416671753, + "rewards/margins": 0.041236549615859985, + "rewards/rejected": -1.4280229806900024, + "step": 5470 + }, + { + "epoch": 0.63, + "learning_rate": 1.1247805220648483e-07, + "logits/chosen": -2.1424882411956787, + "logits/rejected": -2.1547915935516357, + "logps/chosen": -302.9087829589844, + "logps/rejected": -283.7110595703125, + "loss": 0.6674, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5093979835510254, + "rewards/margins": 1.883664846420288, + "rewards/rejected": -3.3930628299713135, + "step": 5471 + }, + { + "epoch": 0.63, + "learning_rate": 1.1244293573686058e-07, + "logits/chosen": -2.078779697418213, + "logits/rejected": -1.9963644742965698, + "logps/chosen": -342.9309387207031, + "logps/rejected": -266.26544189453125, + "loss": 0.355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7630707621574402, + "rewards/margins": 2.3896021842956543, + "rewards/rejected": -3.1526730060577393, + "step": 5472 + }, + { + "epoch": 0.63, + "learning_rate": 1.1240781926723634e-07, + "logits/chosen": -2.182058572769165, + "logits/rejected": -2.4075355529785156, + "logps/chosen": -119.86439514160156, + "logps/rejected": -126.20564270019531, + "loss": 0.3553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39696866273880005, + "rewards/margins": 2.732038974761963, + "rewards/rejected": -3.129007577896118, + "step": 5473 + }, + { + "epoch": 0.63, + "learning_rate": 1.1237270279761208e-07, + "logits/chosen": -2.6933932304382324, + "logits/rejected": -2.5116686820983887, + "logps/chosen": -348.2706604003906, + "logps/rejected": -301.6791076660156, + "loss": 0.386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7700435519218445, + "rewards/margins": 2.296135187149048, + "rewards/rejected": -3.066178798675537, + "step": 5474 + }, + { + "epoch": 0.63, + "learning_rate": 1.1233758632798782e-07, + "logits/chosen": -2.3181114196777344, + "logits/rejected": -2.4455108642578125, + "logps/chosen": -305.0810546875, + "logps/rejected": -186.36672973632812, + "loss": 0.1829, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24419888854026794, + "rewards/margins": 2.2780139446258545, + "rewards/rejected": -2.5222129821777344, + "step": 5475 + }, + { + "epoch": 0.63, + "learning_rate": 1.1230246985836356e-07, + "logits/chosen": -1.8259077072143555, + "logits/rejected": -1.9464497566223145, + "logps/chosen": -467.793701171875, + "logps/rejected": -358.5336608886719, + "loss": 0.426, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.347623586654663, + "rewards/margins": 1.2755696773529053, + "rewards/rejected": -2.6231932640075684, + "step": 5476 + }, + { + "epoch": 0.63, + "learning_rate": 1.122673533887393e-07, + "logits/chosen": -2.5832667350769043, + "logits/rejected": -2.517691135406494, + "logps/chosen": -190.8768310546875, + "logps/rejected": -282.978759765625, + "loss": 0.1944, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1770135909318924, + "rewards/margins": 2.668795585632324, + "rewards/rejected": -2.4917821884155273, + "step": 5477 + }, + { + "epoch": 0.63, + "learning_rate": 1.1223223691911506e-07, + "logits/chosen": -1.9239369630813599, + "logits/rejected": -1.9412219524383545, + "logps/chosen": -197.90330505371094, + "logps/rejected": -168.0760498046875, + "loss": 0.2527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34626504778862, + "rewards/margins": 2.1927449703216553, + "rewards/rejected": -2.5390100479125977, + "step": 5478 + }, + { + "epoch": 0.63, + "learning_rate": 1.1219712044949081e-07, + "logits/chosen": -2.704991340637207, + "logits/rejected": -2.706993579864502, + "logps/chosen": -192.03530883789062, + "logps/rejected": -182.62757873535156, + "loss": 0.3603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43015190958976746, + "rewards/margins": 2.7729146480560303, + "rewards/rejected": -3.203066349029541, + "step": 5479 + }, + { + "epoch": 0.63, + "learning_rate": 1.1216200397986655e-07, + "logits/chosen": -1.558610200881958, + "logits/rejected": -1.422545313835144, + "logps/chosen": -242.50393676757812, + "logps/rejected": -354.4683837890625, + "loss": 0.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.886618971824646, + "rewards/margins": 2.7541794776916504, + "rewards/rejected": -3.640798330307007, + "step": 5480 + }, + { + "epoch": 0.63, + "learning_rate": 1.121268875102423e-07, + "logits/chosen": -2.012453556060791, + "logits/rejected": -2.040300130844116, + "logps/chosen": -440.25372314453125, + "logps/rejected": -349.57196044921875, + "loss": 0.2824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9993611574172974, + "rewards/margins": 1.5796140432357788, + "rewards/rejected": -2.578975200653076, + "step": 5481 + }, + { + "epoch": 0.63, + "learning_rate": 1.1209177104061805e-07, + "logits/chosen": -2.609447956085205, + "logits/rejected": -2.5817291736602783, + "logps/chosen": -217.94403076171875, + "logps/rejected": -205.06834411621094, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34135761857032776, + "rewards/margins": 2.3521149158477783, + "rewards/rejected": -2.6934726238250732, + "step": 5482 + }, + { + "epoch": 0.63, + "learning_rate": 1.1205665457099379e-07, + "logits/chosen": -2.304633378982544, + "logits/rejected": -2.3237624168395996, + "logps/chosen": -333.0596923828125, + "logps/rejected": -270.71356201171875, + "loss": 0.2582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6085836887359619, + "rewards/margins": 3.3887572288513184, + "rewards/rejected": -3.997340679168701, + "step": 5483 + }, + { + "epoch": 0.63, + "learning_rate": 1.1202153810136953e-07, + "logits/chosen": -2.460526704788208, + "logits/rejected": -2.463899612426758, + "logps/chosen": -502.959716796875, + "logps/rejected": -318.19805908203125, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5252898931503296, + "rewards/margins": 2.183157205581665, + "rewards/rejected": -2.708446979522705, + "step": 5484 + }, + { + "epoch": 0.63, + "learning_rate": 1.1198642163174527e-07, + "logits/chosen": -2.1159093379974365, + "logits/rejected": -2.441014528274536, + "logps/chosen": -380.6527099609375, + "logps/rejected": -261.2157897949219, + "loss": 0.8854, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7260886430740356, + "rewards/margins": 0.09141272306442261, + "rewards/rejected": -0.8175013065338135, + "step": 5485 + }, + { + "epoch": 0.63, + "learning_rate": 1.1195130516212104e-07, + "logits/chosen": -2.196779489517212, + "logits/rejected": -2.100970506668091, + "logps/chosen": -236.87086486816406, + "logps/rejected": -338.3034362792969, + "loss": 0.3003, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0090270042419434, + "rewards/margins": 2.7343499660491943, + "rewards/rejected": -3.7433767318725586, + "step": 5486 + }, + { + "epoch": 0.63, + "learning_rate": 1.1191618869249678e-07, + "logits/chosen": -2.4125406742095947, + "logits/rejected": -2.421085834503174, + "logps/chosen": -254.10211181640625, + "logps/rejected": -260.246337890625, + "loss": 0.4654, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2143902778625488, + "rewards/margins": 1.6028519868850708, + "rewards/rejected": -2.81724214553833, + "step": 5487 + }, + { + "epoch": 0.63, + "learning_rate": 1.1188107222287252e-07, + "logits/chosen": -1.8127858638763428, + "logits/rejected": -1.7650768756866455, + "logps/chosen": -151.5987548828125, + "logps/rejected": -204.08999633789062, + "loss": 0.9717, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.465550184249878, + "rewards/margins": 0.6308083534240723, + "rewards/rejected": -2.0963587760925293, + "step": 5488 + }, + { + "epoch": 0.63, + "learning_rate": 1.1184595575324826e-07, + "logits/chosen": -2.098820209503174, + "logits/rejected": -1.8481892347335815, + "logps/chosen": -288.78973388671875, + "logps/rejected": -443.69415283203125, + "loss": 0.1971, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.040366604924201965, + "rewards/margins": 2.551400661468506, + "rewards/rejected": -2.5110342502593994, + "step": 5489 + }, + { + "epoch": 0.63, + "learning_rate": 1.1181083928362403e-07, + "logits/chosen": -1.9012484550476074, + "logits/rejected": -1.8915960788726807, + "logps/chosen": -231.07305908203125, + "logps/rejected": -270.1834411621094, + "loss": 0.3253, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5929027795791626, + "rewards/margins": 1.0602256059646606, + "rewards/rejected": -2.6531283855438232, + "step": 5490 + }, + { + "epoch": 0.63, + "learning_rate": 1.1177572281399977e-07, + "logits/chosen": -2.528989791870117, + "logits/rejected": -2.7322659492492676, + "logps/chosen": -343.5570068359375, + "logps/rejected": -256.86376953125, + "loss": 0.4122, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.526418924331665, + "rewards/margins": 1.2876968383789062, + "rewards/rejected": -1.8141157627105713, + "step": 5491 + }, + { + "epoch": 0.63, + "learning_rate": 1.117406063443755e-07, + "logits/chosen": -2.084629774093628, + "logits/rejected": -2.0080080032348633, + "logps/chosen": -231.6866455078125, + "logps/rejected": -229.61923217773438, + "loss": 1.1311, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.7332170009613037, + "rewards/margins": -0.17315316200256348, + "rewards/rejected": -2.5600638389587402, + "step": 5492 + }, + { + "epoch": 0.63, + "learning_rate": 1.1170548987475125e-07, + "logits/chosen": -1.8675024509429932, + "logits/rejected": -1.9839811325073242, + "logps/chosen": -349.17059326171875, + "logps/rejected": -303.9891662597656, + "loss": 0.4502, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.293508291244507, + "rewards/margins": 1.5872197151184082, + "rewards/rejected": -3.880728006362915, + "step": 5493 + }, + { + "epoch": 0.63, + "learning_rate": 1.11670373405127e-07, + "logits/chosen": -2.245471239089966, + "logits/rejected": -2.492793321609497, + "logps/chosen": -344.485595703125, + "logps/rejected": -308.9972229003906, + "loss": 0.634, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3099499940872192, + "rewards/margins": 1.103919267654419, + "rewards/rejected": -2.4138691425323486, + "step": 5494 + }, + { + "epoch": 0.63, + "learning_rate": 1.1163525693550274e-07, + "logits/chosen": -2.4646530151367188, + "logits/rejected": -2.732205390930176, + "logps/chosen": -313.4361572265625, + "logps/rejected": -164.47702026367188, + "loss": 0.4282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8270527124404907, + "rewards/margins": 1.787187933921814, + "rewards/rejected": -2.6142406463623047, + "step": 5495 + }, + { + "epoch": 0.63, + "learning_rate": 1.116001404658785e-07, + "logits/chosen": -2.7661428451538086, + "logits/rejected": -2.668213367462158, + "logps/chosen": -235.88162231445312, + "logps/rejected": -288.44366455078125, + "loss": 0.4874, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1787203550338745, + "rewards/margins": 2.8973305225372314, + "rewards/rejected": -4.076050758361816, + "step": 5496 + }, + { + "epoch": 0.63, + "learning_rate": 1.1156502399625424e-07, + "logits/chosen": -1.9371417760849, + "logits/rejected": -2.3464653491973877, + "logps/chosen": -384.45343017578125, + "logps/rejected": -287.8562316894531, + "loss": 0.5016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8573976755142212, + "rewards/margins": 2.5211760997772217, + "rewards/rejected": -3.3785736560821533, + "step": 5497 + }, + { + "epoch": 0.63, + "learning_rate": 1.1152990752662999e-07, + "logits/chosen": -2.43997859954834, + "logits/rejected": -2.4256341457366943, + "logps/chosen": -322.9646301269531, + "logps/rejected": -271.6009521484375, + "loss": 0.2241, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6982825398445129, + "rewards/margins": 1.9241347312927246, + "rewards/rejected": -2.622417449951172, + "step": 5498 + }, + { + "epoch": 0.63, + "learning_rate": 1.1149479105700573e-07, + "logits/chosen": -2.867121696472168, + "logits/rejected": -2.7045068740844727, + "logps/chosen": -139.77822875976562, + "logps/rejected": -241.1521759033203, + "loss": 0.2041, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5962227582931519, + "rewards/margins": 2.9819679260253906, + "rewards/rejected": -3.578190326690674, + "step": 5499 + }, + { + "epoch": 0.63, + "learning_rate": 1.1145967458738147e-07, + "logits/chosen": -1.857743263244629, + "logits/rejected": -2.301548957824707, + "logps/chosen": -674.794921875, + "logps/rejected": -439.75567626953125, + "loss": 0.4053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20720504224300385, + "rewards/margins": 1.5133311748504639, + "rewards/rejected": -1.7205361127853394, + "step": 5500 + }, + { + "epoch": 0.63, + "learning_rate": 1.1142455811775721e-07, + "logits/chosen": -2.2320446968078613, + "logits/rejected": -2.4492146968841553, + "logps/chosen": -417.688720703125, + "logps/rejected": -275.54107666015625, + "loss": 0.2934, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14202889800071716, + "rewards/margins": 1.869436502456665, + "rewards/rejected": -2.011465311050415, + "step": 5501 + }, + { + "epoch": 0.63, + "learning_rate": 1.1138944164813298e-07, + "logits/chosen": -1.9245822429656982, + "logits/rejected": -1.959216594696045, + "logps/chosen": -333.37591552734375, + "logps/rejected": -283.07574462890625, + "loss": 0.3363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.789535641670227, + "rewards/margins": 2.033851385116577, + "rewards/rejected": -2.8233871459960938, + "step": 5502 + }, + { + "epoch": 0.63, + "learning_rate": 1.1135432517850872e-07, + "logits/chosen": -2.7026634216308594, + "logits/rejected": -2.754913330078125, + "logps/chosen": -307.6568603515625, + "logps/rejected": -324.16192626953125, + "loss": 0.4173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7926677465438843, + "rewards/margins": 2.0092554092407227, + "rewards/rejected": -2.8019227981567383, + "step": 5503 + }, + { + "epoch": 0.63, + "learning_rate": 1.1131920870888446e-07, + "logits/chosen": -2.5598716735839844, + "logits/rejected": -2.665410041809082, + "logps/chosen": -341.57000732421875, + "logps/rejected": -307.872802734375, + "loss": 0.3022, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4113514423370361, + "rewards/margins": 3.8269076347351074, + "rewards/rejected": -5.238259315490723, + "step": 5504 + }, + { + "epoch": 0.63, + "learning_rate": 1.112840922392602e-07, + "logits/chosen": -2.36428165435791, + "logits/rejected": -2.3789501190185547, + "logps/chosen": -452.17816162109375, + "logps/rejected": -357.7655944824219, + "loss": 0.2886, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4126545190811157, + "rewards/margins": 2.643728256225586, + "rewards/rejected": -4.05638313293457, + "step": 5505 + }, + { + "epoch": 0.63, + "learning_rate": 1.1124897576963595e-07, + "logits/chosen": -2.5679328441619873, + "logits/rejected": -2.498276948928833, + "logps/chosen": -131.26210021972656, + "logps/rejected": -243.3218994140625, + "loss": 0.1572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2232084721326828, + "rewards/margins": 2.6184937953948975, + "rewards/rejected": -2.8417022228240967, + "step": 5506 + }, + { + "epoch": 0.63, + "learning_rate": 1.1121385930001171e-07, + "logits/chosen": -2.1052327156066895, + "logits/rejected": -2.252913475036621, + "logps/chosen": -397.78973388671875, + "logps/rejected": -272.2942199707031, + "loss": 0.4024, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.50583416223526, + "rewards/margins": 1.7010862827301025, + "rewards/rejected": -2.206920623779297, + "step": 5507 + }, + { + "epoch": 0.63, + "learning_rate": 1.1117874283038745e-07, + "logits/chosen": -1.8601802587509155, + "logits/rejected": -2.0048553943634033, + "logps/chosen": -361.2356872558594, + "logps/rejected": -282.5200500488281, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6012697219848633, + "rewards/margins": 1.2904384136199951, + "rewards/rejected": -1.8917081356048584, + "step": 5508 + }, + { + "epoch": 0.64, + "learning_rate": 1.1114362636076319e-07, + "logits/chosen": -2.473628520965576, + "logits/rejected": -2.431929111480713, + "logps/chosen": -274.25396728515625, + "logps/rejected": -282.70684814453125, + "loss": 0.3455, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2832317352294922, + "rewards/margins": 5.141310691833496, + "rewards/rejected": -6.424542427062988, + "step": 5509 + }, + { + "epoch": 0.64, + "learning_rate": 1.1110850989113894e-07, + "logits/chosen": -2.231903314590454, + "logits/rejected": -2.610936403274536, + "logps/chosen": -523.61767578125, + "logps/rejected": -304.8914489746094, + "loss": 1.0711, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4936854839324951, + "rewards/margins": 1.1012250185012817, + "rewards/rejected": -2.5949106216430664, + "step": 5510 + }, + { + "epoch": 0.64, + "learning_rate": 1.1107339342151468e-07, + "logits/chosen": -2.1744751930236816, + "logits/rejected": -1.9226876497268677, + "logps/chosen": -561.0772705078125, + "logps/rejected": -391.3957824707031, + "loss": 0.2363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9385483264923096, + "rewards/margins": 3.019773006439209, + "rewards/rejected": -3.9583210945129395, + "step": 5511 + }, + { + "epoch": 0.64, + "learning_rate": 1.1103827695189042e-07, + "logits/chosen": -2.6121559143066406, + "logits/rejected": -2.76943302154541, + "logps/chosen": -295.04010009765625, + "logps/rejected": -135.56289672851562, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7164534330368042, + "rewards/margins": 1.7710356712341309, + "rewards/rejected": -2.4874889850616455, + "step": 5512 + }, + { + "epoch": 0.64, + "learning_rate": 1.1100316048226618e-07, + "logits/chosen": -2.51597261428833, + "logits/rejected": -2.6213936805725098, + "logps/chosen": -162.2982177734375, + "logps/rejected": -267.1649169921875, + "loss": 0.4475, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.642615795135498, + "rewards/margins": 2.1473329067230225, + "rewards/rejected": -2.7899484634399414, + "step": 5513 + }, + { + "epoch": 0.64, + "learning_rate": 1.1096804401264193e-07, + "logits/chosen": -2.4587602615356445, + "logits/rejected": -2.5279557704925537, + "logps/chosen": -175.36898803710938, + "logps/rejected": -215.03158569335938, + "loss": 0.464, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.188178300857544, + "rewards/margins": 1.8569600582122803, + "rewards/rejected": -3.045138359069824, + "step": 5514 + }, + { + "epoch": 0.64, + "learning_rate": 1.1093292754301767e-07, + "logits/chosen": -1.9342350959777832, + "logits/rejected": -1.95188570022583, + "logps/chosen": -237.533203125, + "logps/rejected": -264.5866394042969, + "loss": 0.4502, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3204970061779022, + "rewards/margins": 1.751792311668396, + "rewards/rejected": -2.07228946685791, + "step": 5515 + }, + { + "epoch": 0.64, + "learning_rate": 1.1089781107339341e-07, + "logits/chosen": -2.4710097312927246, + "logits/rejected": -2.5887060165405273, + "logps/chosen": -226.98463439941406, + "logps/rejected": -264.1338195800781, + "loss": 0.4846, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3607786893844604, + "rewards/margins": 2.487375497817993, + "rewards/rejected": -3.848154067993164, + "step": 5516 + }, + { + "epoch": 0.64, + "learning_rate": 1.1086269460376915e-07, + "logits/chosen": -2.5153684616088867, + "logits/rejected": -2.5696616172790527, + "logps/chosen": -488.230712890625, + "logps/rejected": -362.4848327636719, + "loss": 0.3918, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7999332547187805, + "rewards/margins": 2.814695119857788, + "rewards/rejected": -3.614628314971924, + "step": 5517 + }, + { + "epoch": 0.64, + "learning_rate": 1.1082757813414492e-07, + "logits/chosen": -1.9549392461776733, + "logits/rejected": -2.3675854206085205, + "logps/chosen": -360.3967590332031, + "logps/rejected": -331.5201110839844, + "loss": 0.2627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.566526472568512, + "rewards/margins": 3.446712017059326, + "rewards/rejected": -4.013238430023193, + "step": 5518 + }, + { + "epoch": 0.64, + "learning_rate": 1.1079246166452066e-07, + "logits/chosen": -2.6088428497314453, + "logits/rejected": -2.3497672080993652, + "logps/chosen": -263.40771484375, + "logps/rejected": -445.9223327636719, + "loss": 0.4944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.555485725402832, + "rewards/margins": 2.568068742752075, + "rewards/rejected": -3.1235544681549072, + "step": 5519 + }, + { + "epoch": 0.64, + "learning_rate": 1.107573451948964e-07, + "logits/chosen": -2.0678582191467285, + "logits/rejected": -2.1280620098114014, + "logps/chosen": -166.9259796142578, + "logps/rejected": -215.9414825439453, + "loss": 0.3685, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1736061573028564, + "rewards/margins": 2.481637477874756, + "rewards/rejected": -3.6552436351776123, + "step": 5520 + }, + { + "epoch": 0.64, + "learning_rate": 1.1072222872527214e-07, + "logits/chosen": -1.934887409210205, + "logits/rejected": -1.8623541593551636, + "logps/chosen": -246.58877563476562, + "logps/rejected": -319.65185546875, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5229830741882324, + "rewards/margins": 2.9015567302703857, + "rewards/rejected": -4.424539566040039, + "step": 5521 + }, + { + "epoch": 0.64, + "learning_rate": 1.106871122556479e-07, + "logits/chosen": -2.9078595638275146, + "logits/rejected": -2.890972852706909, + "logps/chosen": -195.57180786132812, + "logps/rejected": -228.421630859375, + "loss": 0.3115, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1879899501800537, + "rewards/margins": 2.3790924549102783, + "rewards/rejected": -3.567082405090332, + "step": 5522 + }, + { + "epoch": 0.64, + "learning_rate": 1.1065199578602364e-07, + "logits/chosen": -1.7918477058410645, + "logits/rejected": -1.6572329998016357, + "logps/chosen": -105.83893585205078, + "logps/rejected": -149.80587768554688, + "loss": 0.5905, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.421041488647461, + "rewards/margins": 1.0284435749053955, + "rewards/rejected": -2.4494850635528564, + "step": 5523 + }, + { + "epoch": 0.64, + "learning_rate": 1.1061687931639939e-07, + "logits/chosen": -2.371248960494995, + "logits/rejected": -2.431252956390381, + "logps/chosen": -304.0951843261719, + "logps/rejected": -283.615234375, + "loss": 0.3411, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0883253812789917, + "rewards/margins": 2.8483901023864746, + "rewards/rejected": -3.9367153644561768, + "step": 5524 + }, + { + "epoch": 0.64, + "learning_rate": 1.1058176284677513e-07, + "logits/chosen": -2.724224090576172, + "logits/rejected": -2.5106618404388428, + "logps/chosen": -172.97410583496094, + "logps/rejected": -183.27569580078125, + "loss": 0.7768, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7690479159355164, + "rewards/margins": 1.1870365142822266, + "rewards/rejected": -1.9560844898223877, + "step": 5525 + }, + { + "epoch": 0.64, + "learning_rate": 1.1054664637715087e-07, + "logits/chosen": -2.3227574825286865, + "logits/rejected": -2.3293752670288086, + "logps/chosen": -174.0053253173828, + "logps/rejected": -168.82798767089844, + "loss": 0.2041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9489589929580688, + "rewards/margins": 1.8784199953079224, + "rewards/rejected": -2.8273792266845703, + "step": 5526 + }, + { + "epoch": 0.64, + "learning_rate": 1.1051152990752663e-07, + "logits/chosen": -2.783993721008301, + "logits/rejected": -2.6138646602630615, + "logps/chosen": -245.96841430664062, + "logps/rejected": -277.5014953613281, + "loss": 0.348, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.074092149734497, + "rewards/margins": 1.630502462387085, + "rewards/rejected": -2.704594612121582, + "step": 5527 + }, + { + "epoch": 0.64, + "learning_rate": 1.1047641343790237e-07, + "logits/chosen": -2.3982386589050293, + "logits/rejected": -2.5307459831237793, + "logps/chosen": -216.76565551757812, + "logps/rejected": -185.1531524658203, + "loss": 0.7893, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7405093908309937, + "rewards/margins": 1.171621322631836, + "rewards/rejected": -1.91213059425354, + "step": 5528 + }, + { + "epoch": 0.64, + "learning_rate": 1.1044129696827811e-07, + "logits/chosen": -1.9908465147018433, + "logits/rejected": -1.8561463356018066, + "logps/chosen": -361.026611328125, + "logps/rejected": -413.02374267578125, + "loss": 0.2895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13497883081436157, + "rewards/margins": 2.1996233463287354, + "rewards/rejected": -2.3346023559570312, + "step": 5529 + }, + { + "epoch": 0.64, + "learning_rate": 1.1040618049865386e-07, + "logits/chosen": -2.4026360511779785, + "logits/rejected": -2.186408281326294, + "logps/chosen": -135.56961059570312, + "logps/rejected": -244.62539672851562, + "loss": 0.4014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9476817846298218, + "rewards/margins": 2.705198287963867, + "rewards/rejected": -3.6528799533843994, + "step": 5530 + }, + { + "epoch": 0.64, + "learning_rate": 1.1037106402902962e-07, + "logits/chosen": -1.658682107925415, + "logits/rejected": -1.8532888889312744, + "logps/chosen": -276.60687255859375, + "logps/rejected": -297.80352783203125, + "loss": 0.2595, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0977966785430908, + "rewards/margins": 2.083993673324585, + "rewards/rejected": -3.181790351867676, + "step": 5531 + }, + { + "epoch": 0.64, + "learning_rate": 1.1033594755940536e-07, + "logits/chosen": -2.1734719276428223, + "logits/rejected": -2.224635601043701, + "logps/chosen": -342.1981201171875, + "logps/rejected": -266.532958984375, + "loss": 0.0769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09991827607154846, + "rewards/margins": 3.4698448181152344, + "rewards/rejected": -3.3699264526367188, + "step": 5532 + }, + { + "epoch": 0.64, + "learning_rate": 1.103008310897811e-07, + "logits/chosen": -2.4710988998413086, + "logits/rejected": -2.352175712585449, + "logps/chosen": -222.31924438476562, + "logps/rejected": -267.20635986328125, + "loss": 0.6325, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.863031268119812, + "rewards/margins": 2.147642135620117, + "rewards/rejected": -4.010673522949219, + "step": 5533 + }, + { + "epoch": 0.64, + "learning_rate": 1.1026571462015684e-07, + "logits/chosen": -2.1692051887512207, + "logits/rejected": -2.158723831176758, + "logps/chosen": -428.8233642578125, + "logps/rejected": -391.8507080078125, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2907596528530121, + "rewards/margins": 2.3264400959014893, + "rewards/rejected": -2.6171998977661133, + "step": 5534 + }, + { + "epoch": 0.64, + "learning_rate": 1.102305981505326e-07, + "logits/chosen": -2.084421157836914, + "logits/rejected": -2.2060022354125977, + "logps/chosen": -120.27510070800781, + "logps/rejected": -177.24462890625, + "loss": 0.408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7315717935562134, + "rewards/margins": 0.94380784034729, + "rewards/rejected": -1.6753796339035034, + "step": 5535 + }, + { + "epoch": 0.64, + "learning_rate": 1.1019548168090835e-07, + "logits/chosen": -1.9196196794509888, + "logits/rejected": -2.1621408462524414, + "logps/chosen": -463.7354431152344, + "logps/rejected": -293.4747009277344, + "loss": 0.1514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6600782871246338, + "rewards/margins": 2.4410595893859863, + "rewards/rejected": -3.101138114929199, + "step": 5536 + }, + { + "epoch": 0.64, + "learning_rate": 1.1016036521128409e-07, + "logits/chosen": -2.517125129699707, + "logits/rejected": -2.4905574321746826, + "logps/chosen": -219.61233520507812, + "logps/rejected": -307.85009765625, + "loss": 0.4297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2454501390457153, + "rewards/margins": 1.666011095046997, + "rewards/rejected": -2.911461114883423, + "step": 5537 + }, + { + "epoch": 0.64, + "learning_rate": 1.1012524874165983e-07, + "logits/chosen": -2.6104536056518555, + "logits/rejected": -2.7804155349731445, + "logps/chosen": -329.5884704589844, + "logps/rejected": -243.1458740234375, + "loss": 0.482, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8896388411521912, + "rewards/margins": 2.1109628677368164, + "rewards/rejected": -3.0006017684936523, + "step": 5538 + }, + { + "epoch": 0.64, + "learning_rate": 1.1009013227203558e-07, + "logits/chosen": -1.7923107147216797, + "logits/rejected": -1.8011860847473145, + "logps/chosen": -282.17193603515625, + "logps/rejected": -271.974609375, + "loss": 0.4239, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7214870452880859, + "rewards/margins": 2.209611654281616, + "rewards/rejected": -2.931098699569702, + "step": 5539 + }, + { + "epoch": 0.64, + "learning_rate": 1.1005501580241132e-07, + "logits/chosen": -2.2811801433563232, + "logits/rejected": -2.2868034839630127, + "logps/chosen": -119.14777374267578, + "logps/rejected": -178.03773498535156, + "loss": 0.5191, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.110123872756958, + "rewards/margins": 1.8757141828536987, + "rewards/rejected": -2.985837936401367, + "step": 5540 + }, + { + "epoch": 0.64, + "learning_rate": 1.1001989933278707e-07, + "logits/chosen": -2.527003765106201, + "logits/rejected": -2.4573793411254883, + "logps/chosen": -198.12265014648438, + "logps/rejected": -242.51272583007812, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6773009300231934, + "rewards/margins": 0.9471610188484192, + "rewards/rejected": -1.6244621276855469, + "step": 5541 + }, + { + "epoch": 0.64, + "learning_rate": 1.0998478286316282e-07, + "logits/chosen": -2.4098188877105713, + "logits/rejected": -2.2583229541778564, + "logps/chosen": -297.90203857421875, + "logps/rejected": -332.44110107421875, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6072774529457092, + "rewards/margins": 4.002562999725342, + "rewards/rejected": -4.609840393066406, + "step": 5542 + }, + { + "epoch": 0.64, + "learning_rate": 1.0994966639353857e-07, + "logits/chosen": -2.268745183944702, + "logits/rejected": -2.1674370765686035, + "logps/chosen": -251.98529052734375, + "logps/rejected": -248.5345458984375, + "loss": 0.3465, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.891592264175415, + "rewards/margins": 2.322603702545166, + "rewards/rejected": -3.21419620513916, + "step": 5543 + }, + { + "epoch": 0.64, + "learning_rate": 1.0991454992391431e-07, + "logits/chosen": -1.906010389328003, + "logits/rejected": -1.8362767696380615, + "logps/chosen": -221.3837890625, + "logps/rejected": -262.4632263183594, + "loss": 0.2892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4286724030971527, + "rewards/margins": 1.279693603515625, + "rewards/rejected": -1.7083659172058105, + "step": 5544 + }, + { + "epoch": 0.64, + "learning_rate": 1.0987943345429005e-07, + "logits/chosen": -2.3237202167510986, + "logits/rejected": -2.4946789741516113, + "logps/chosen": -254.0275421142578, + "logps/rejected": -346.60467529296875, + "loss": 0.2105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5830567479133606, + "rewards/margins": 2.7973520755767822, + "rewards/rejected": -3.380409002304077, + "step": 5545 + }, + { + "epoch": 0.64, + "learning_rate": 1.0984431698466579e-07, + "logits/chosen": -2.1747469902038574, + "logits/rejected": -2.1434006690979004, + "logps/chosen": -222.7962646484375, + "logps/rejected": -223.33319091796875, + "loss": 0.3335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3371819853782654, + "rewards/margins": 1.3589471578598022, + "rewards/rejected": -1.696129322052002, + "step": 5546 + }, + { + "epoch": 0.64, + "learning_rate": 1.0980920051504156e-07, + "logits/chosen": -2.460968255996704, + "logits/rejected": -2.3450021743774414, + "logps/chosen": -224.14178466796875, + "logps/rejected": -192.32693481445312, + "loss": 0.4973, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.689854621887207, + "rewards/margins": 1.6999342441558838, + "rewards/rejected": -3.389788866043091, + "step": 5547 + }, + { + "epoch": 0.64, + "learning_rate": 1.097740840454173e-07, + "logits/chosen": -2.170053005218506, + "logits/rejected": -2.574366569519043, + "logps/chosen": -337.612060546875, + "logps/rejected": -277.0309753417969, + "loss": 0.3264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8944666385650635, + "rewards/margins": 2.1232120990753174, + "rewards/rejected": -3.01767897605896, + "step": 5548 + }, + { + "epoch": 0.64, + "learning_rate": 1.0973896757579304e-07, + "logits/chosen": -2.8213958740234375, + "logits/rejected": -2.69346284866333, + "logps/chosen": -240.3663787841797, + "logps/rejected": -271.0744934082031, + "loss": 0.2232, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6323970556259155, + "rewards/margins": 2.3366928100585938, + "rewards/rejected": -2.9690897464752197, + "step": 5549 + }, + { + "epoch": 0.64, + "learning_rate": 1.0970385110616878e-07, + "logits/chosen": -2.356048107147217, + "logits/rejected": -2.195204257965088, + "logps/chosen": -181.2312774658203, + "logps/rejected": -214.54354858398438, + "loss": 0.2931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5154659748077393, + "rewards/margins": 1.8112125396728516, + "rewards/rejected": -2.326678514480591, + "step": 5550 + }, + { + "epoch": 0.64, + "learning_rate": 1.0966873463654455e-07, + "logits/chosen": -2.756639003753662, + "logits/rejected": -2.6087000370025635, + "logps/chosen": -167.60574340820312, + "logps/rejected": -318.7206726074219, + "loss": 0.089, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0177587270736694, + "rewards/margins": 4.339715003967285, + "rewards/rejected": -5.357473850250244, + "step": 5551 + }, + { + "epoch": 0.64, + "learning_rate": 1.0963361816692029e-07, + "logits/chosen": -1.7398278713226318, + "logits/rejected": -1.9225804805755615, + "logps/chosen": -386.6575012207031, + "logps/rejected": -335.17230224609375, + "loss": 0.6405, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2972692251205444, + "rewards/margins": 1.1005157232284546, + "rewards/rejected": -2.397784948348999, + "step": 5552 + }, + { + "epoch": 0.64, + "learning_rate": 1.0959850169729603e-07, + "logits/chosen": -1.388869285583496, + "logits/rejected": -1.6465202569961548, + "logps/chosen": -496.0787353515625, + "logps/rejected": -425.1123352050781, + "loss": 0.1584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8022287487983704, + "rewards/margins": 1.898432970046997, + "rewards/rejected": -2.7006618976593018, + "step": 5553 + }, + { + "epoch": 0.64, + "learning_rate": 1.0956338522767177e-07, + "logits/chosen": -2.1677136421203613, + "logits/rejected": -2.348090887069702, + "logps/chosen": -480.81939697265625, + "logps/rejected": -503.7747802734375, + "loss": 0.1979, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1710140705108643, + "rewards/margins": 2.660999298095703, + "rewards/rejected": -3.8320133686065674, + "step": 5554 + }, + { + "epoch": 0.64, + "learning_rate": 1.0952826875804752e-07, + "logits/chosen": -2.7069036960601807, + "logits/rejected": -2.49348783493042, + "logps/chosen": -369.5573425292969, + "logps/rejected": -382.29669189453125, + "loss": 0.1325, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6330816745758057, + "rewards/margins": 2.8396944999694824, + "rewards/rejected": -4.472776412963867, + "step": 5555 + }, + { + "epoch": 0.64, + "learning_rate": 1.0949315228842326e-07, + "logits/chosen": -2.3491435050964355, + "logits/rejected": -2.3708157539367676, + "logps/chosen": -270.6493835449219, + "logps/rejected": -178.67144775390625, + "loss": 0.3128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41420799493789673, + "rewards/margins": 2.070742607116699, + "rewards/rejected": -2.484950542449951, + "step": 5556 + }, + { + "epoch": 0.64, + "learning_rate": 1.09458035818799e-07, + "logits/chosen": -2.8265581130981445, + "logits/rejected": -2.6894094944000244, + "logps/chosen": -144.6244659423828, + "logps/rejected": -172.55059814453125, + "loss": 0.6159, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7256641387939453, + "rewards/margins": 0.5254242420196533, + "rewards/rejected": -2.2510883808135986, + "step": 5557 + }, + { + "epoch": 0.64, + "learning_rate": 1.0942291934917476e-07, + "logits/chosen": -2.361126184463501, + "logits/rejected": -2.1471877098083496, + "logps/chosen": -282.4525451660156, + "logps/rejected": -264.6736755371094, + "loss": 0.6621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41703492403030396, + "rewards/margins": 0.693306565284729, + "rewards/rejected": -1.1103414297103882, + "step": 5558 + }, + { + "epoch": 0.64, + "learning_rate": 1.0938780287955051e-07, + "logits/chosen": -2.4173483848571777, + "logits/rejected": -2.445361852645874, + "logps/chosen": -378.16741943359375, + "logps/rejected": -321.18792724609375, + "loss": 0.196, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6776261329650879, + "rewards/margins": 3.800227165222168, + "rewards/rejected": -4.477853775024414, + "step": 5559 + }, + { + "epoch": 0.64, + "learning_rate": 1.0935268640992625e-07, + "logits/chosen": -2.1468565464019775, + "logits/rejected": -2.1396102905273438, + "logps/chosen": -268.8265075683594, + "logps/rejected": -309.95758056640625, + "loss": 0.3786, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6377863883972168, + "rewards/margins": 1.829252004623413, + "rewards/rejected": -2.467038631439209, + "step": 5560 + }, + { + "epoch": 0.64, + "learning_rate": 1.0931756994030199e-07, + "logits/chosen": -2.638500452041626, + "logits/rejected": -2.6937355995178223, + "logps/chosen": -428.03564453125, + "logps/rejected": -294.43853759765625, + "loss": 0.3674, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.177924871444702, + "rewards/margins": 1.623688817024231, + "rewards/rejected": -3.8016133308410645, + "step": 5561 + }, + { + "epoch": 0.64, + "learning_rate": 1.0928245347067773e-07, + "logits/chosen": -2.5229363441467285, + "logits/rejected": -2.564765214920044, + "logps/chosen": -245.19497680664062, + "logps/rejected": -268.2645263671875, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21495136618614197, + "rewards/margins": 2.9537014961242676, + "rewards/rejected": -3.1686525344848633, + "step": 5562 + }, + { + "epoch": 0.64, + "learning_rate": 1.092473370010535e-07, + "logits/chosen": -2.4072132110595703, + "logits/rejected": -2.4426357746124268, + "logps/chosen": -266.6216125488281, + "logps/rejected": -289.98663330078125, + "loss": 0.1981, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4027960300445557, + "rewards/margins": 2.7453715801239014, + "rewards/rejected": -4.148167610168457, + "step": 5563 + }, + { + "epoch": 0.64, + "learning_rate": 1.0921222053142924e-07, + "logits/chosen": -2.1865084171295166, + "logits/rejected": -2.327721357345581, + "logps/chosen": -254.16552734375, + "logps/rejected": -254.553955078125, + "loss": 0.4211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5063619613647461, + "rewards/margins": 1.6391568183898926, + "rewards/rejected": -2.1455187797546387, + "step": 5564 + }, + { + "epoch": 0.64, + "learning_rate": 1.0917710406180498e-07, + "logits/chosen": -2.444479465484619, + "logits/rejected": -2.573730230331421, + "logps/chosen": -147.16232299804688, + "logps/rejected": -213.8670654296875, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4231875240802765, + "rewards/margins": 4.136362075805664, + "rewards/rejected": -4.559549808502197, + "step": 5565 + }, + { + "epoch": 0.64, + "learning_rate": 1.0914198759218072e-07, + "logits/chosen": -2.471564531326294, + "logits/rejected": -2.199169397354126, + "logps/chosen": -221.87628173828125, + "logps/rejected": -298.2196350097656, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2192538976669312, + "rewards/margins": 1.788335919380188, + "rewards/rejected": -3.007589817047119, + "step": 5566 + }, + { + "epoch": 0.64, + "learning_rate": 1.0910687112255648e-07, + "logits/chosen": -1.9626668691635132, + "logits/rejected": -2.3353991508483887, + "logps/chosen": -516.2098999023438, + "logps/rejected": -279.40203857421875, + "loss": 0.255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6549344062805176, + "rewards/margins": 1.8272250890731812, + "rewards/rejected": -2.482159376144409, + "step": 5567 + }, + { + "epoch": 0.64, + "learning_rate": 1.0907175465293223e-07, + "logits/chosen": -1.8397397994995117, + "logits/rejected": -2.034342050552368, + "logps/chosen": -410.5095520019531, + "logps/rejected": -290.1748352050781, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8454284071922302, + "rewards/margins": 1.5735955238342285, + "rewards/rejected": -2.4190239906311035, + "step": 5568 + }, + { + "epoch": 0.64, + "learning_rate": 1.0903663818330797e-07, + "logits/chosen": -2.211507797241211, + "logits/rejected": -2.447836399078369, + "logps/chosen": -237.5921173095703, + "logps/rejected": -216.11297607421875, + "loss": 0.6202, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.958911657333374, + "rewards/margins": 1.1393274068832397, + "rewards/rejected": -2.098238945007324, + "step": 5569 + }, + { + "epoch": 0.64, + "learning_rate": 1.0900152171368371e-07, + "logits/chosen": -2.2301084995269775, + "logits/rejected": -1.9524781703948975, + "logps/chosen": -134.12081909179688, + "logps/rejected": -292.4876403808594, + "loss": 0.0917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5851396918296814, + "rewards/margins": 3.709113597869873, + "rewards/rejected": -4.294253349304199, + "step": 5570 + }, + { + "epoch": 0.64, + "learning_rate": 1.0896640524405947e-07, + "logits/chosen": -2.1229472160339355, + "logits/rejected": -2.36175274848938, + "logps/chosen": -439.3664245605469, + "logps/rejected": -257.3489990234375, + "loss": 0.9155, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5474340915679932, + "rewards/margins": 0.7826644778251648, + "rewards/rejected": -2.3300986289978027, + "step": 5571 + }, + { + "epoch": 0.64, + "learning_rate": 1.089312887744352e-07, + "logits/chosen": -2.4150567054748535, + "logits/rejected": -1.824306607246399, + "logps/chosen": -159.84495544433594, + "logps/rejected": -358.83001708984375, + "loss": 0.352, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7172228097915649, + "rewards/margins": 2.1605656147003174, + "rewards/rejected": -2.877788543701172, + "step": 5572 + }, + { + "epoch": 0.64, + "learning_rate": 1.0889617230481095e-07, + "logits/chosen": -1.9664883613586426, + "logits/rejected": -2.2892069816589355, + "logps/chosen": -310.44647216796875, + "logps/rejected": -227.49322509765625, + "loss": 0.3871, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9796493053436279, + "rewards/margins": 2.4980478286743164, + "rewards/rejected": -3.4776968955993652, + "step": 5573 + }, + { + "epoch": 0.64, + "learning_rate": 1.0886105583518669e-07, + "logits/chosen": -2.3930506706237793, + "logits/rejected": -2.3166615962982178, + "logps/chosen": -304.6869812011719, + "logps/rejected": -507.9873962402344, + "loss": 0.7872, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9860292673110962, + "rewards/margins": 0.959740400314331, + "rewards/rejected": -1.9457696676254272, + "step": 5574 + }, + { + "epoch": 0.64, + "learning_rate": 1.0882593936556244e-07, + "logits/chosen": -2.2879509925842285, + "logits/rejected": -2.4337520599365234, + "logps/chosen": -192.58383178710938, + "logps/rejected": -245.15029907226562, + "loss": 0.2112, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8396120667457581, + "rewards/margins": 2.9191806316375732, + "rewards/rejected": -3.7587928771972656, + "step": 5575 + }, + { + "epoch": 0.64, + "learning_rate": 1.087908228959382e-07, + "logits/chosen": -1.9354324340820312, + "logits/rejected": -2.0458319187164307, + "logps/chosen": -353.0391540527344, + "logps/rejected": -249.40969848632812, + "loss": 0.4217, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6115959882736206, + "rewards/margins": 1.3576488494873047, + "rewards/rejected": -2.9692447185516357, + "step": 5576 + }, + { + "epoch": 0.64, + "learning_rate": 1.0875570642631394e-07, + "logits/chosen": -2.4494619369506836, + "logits/rejected": -2.4082210063934326, + "logps/chosen": -187.1236572265625, + "logps/rejected": -394.10687255859375, + "loss": 0.2019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9941889643669128, + "rewards/margins": 3.245906352996826, + "rewards/rejected": -4.240095138549805, + "step": 5577 + }, + { + "epoch": 0.64, + "learning_rate": 1.0872058995668968e-07, + "logits/chosen": -1.8896100521087646, + "logits/rejected": -1.801406979560852, + "logps/chosen": -189.24835205078125, + "logps/rejected": -276.0801696777344, + "loss": 0.4646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6654526591300964, + "rewards/margins": 1.294908881187439, + "rewards/rejected": -1.9603615999221802, + "step": 5578 + }, + { + "epoch": 0.64, + "learning_rate": 1.0868547348706542e-07, + "logits/chosen": -2.1619911193847656, + "logits/rejected": -2.2783608436584473, + "logps/chosen": -172.52169799804688, + "logps/rejected": -182.97164916992188, + "loss": 0.6624, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20922963321208954, + "rewards/margins": 0.7564989328384399, + "rewards/rejected": -0.9657285809516907, + "step": 5579 + }, + { + "epoch": 0.64, + "learning_rate": 1.0865035701744118e-07, + "logits/chosen": -2.455559492111206, + "logits/rejected": -2.425769567489624, + "logps/chosen": -381.9794616699219, + "logps/rejected": -374.3108215332031, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4399669170379639, + "rewards/margins": 2.131523609161377, + "rewards/rejected": -3.5714902877807617, + "step": 5580 + }, + { + "epoch": 0.64, + "learning_rate": 1.0861524054781692e-07, + "logits/chosen": -1.9247658252716064, + "logits/rejected": -1.823671817779541, + "logps/chosen": -446.347900390625, + "logps/rejected": -515.0973510742188, + "loss": 0.5573, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.379678726196289, + "rewards/margins": 1.6504451036453247, + "rewards/rejected": -3.030123710632324, + "step": 5581 + }, + { + "epoch": 0.64, + "learning_rate": 1.0858012407819267e-07, + "logits/chosen": -2.732410430908203, + "logits/rejected": -2.824697494506836, + "logps/chosen": -298.08050537109375, + "logps/rejected": -434.65863037109375, + "loss": 0.7239, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3860970735549927, + "rewards/margins": 2.2119970321655273, + "rewards/rejected": -3.5980944633483887, + "step": 5582 + }, + { + "epoch": 0.64, + "learning_rate": 1.085450076085684e-07, + "logits/chosen": -1.9530447721481323, + "logits/rejected": -1.9315264225006104, + "logps/chosen": -322.1188659667969, + "logps/rejected": -278.862548828125, + "loss": 0.5321, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3829708099365234, + "rewards/margins": 1.53606379032135, + "rewards/rejected": -2.919034719467163, + "step": 5583 + }, + { + "epoch": 0.64, + "learning_rate": 1.0850989113894416e-07, + "logits/chosen": -1.8645405769348145, + "logits/rejected": -1.790459394454956, + "logps/chosen": -257.4512939453125, + "logps/rejected": -252.94349670410156, + "loss": 0.5759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43504154682159424, + "rewards/margins": 1.4803849458694458, + "rewards/rejected": -1.9154266119003296, + "step": 5584 + }, + { + "epoch": 0.64, + "learning_rate": 1.0847477466931991e-07, + "logits/chosen": -2.001213788986206, + "logits/rejected": -1.999053716659546, + "logps/chosen": -418.18731689453125, + "logps/rejected": -315.06280517578125, + "loss": 0.3102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8101486563682556, + "rewards/margins": 1.3036597967147827, + "rewards/rejected": -2.1138083934783936, + "step": 5585 + }, + { + "epoch": 0.64, + "learning_rate": 1.0843965819969565e-07, + "logits/chosen": -2.5611929893493652, + "logits/rejected": -2.6193950176239014, + "logps/chosen": -322.8603210449219, + "logps/rejected": -209.74102783203125, + "loss": 0.5386, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5503841638565063, + "rewards/margins": 2.168978452682495, + "rewards/rejected": -3.719362497329712, + "step": 5586 + }, + { + "epoch": 0.64, + "learning_rate": 1.084045417300714e-07, + "logits/chosen": -1.8840150833129883, + "logits/rejected": -2.3594179153442383, + "logps/chosen": -331.8907165527344, + "logps/rejected": -328.7940979003906, + "loss": 0.2932, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0341200828552246, + "rewards/margins": 2.150590419769287, + "rewards/rejected": -4.184710502624512, + "step": 5587 + }, + { + "epoch": 0.64, + "learning_rate": 1.0836942526044715e-07, + "logits/chosen": -2.3791275024414062, + "logits/rejected": -2.505131244659424, + "logps/chosen": -239.43856811523438, + "logps/rejected": -193.56382751464844, + "loss": 0.2364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7207792401313782, + "rewards/margins": 3.677166700363159, + "rewards/rejected": -4.397945880889893, + "step": 5588 + }, + { + "epoch": 0.64, + "learning_rate": 1.0833430879082289e-07, + "logits/chosen": -2.2880499362945557, + "logits/rejected": -2.4117517471313477, + "logps/chosen": -301.909423828125, + "logps/rejected": -212.70619201660156, + "loss": 0.5504, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.60026216506958, + "rewards/margins": 1.4124491214752197, + "rewards/rejected": -3.012711524963379, + "step": 5589 + }, + { + "epoch": 0.64, + "learning_rate": 1.0829919232119863e-07, + "logits/chosen": -1.9369009733200073, + "logits/rejected": -2.2024333477020264, + "logps/chosen": -527.4369506835938, + "logps/rejected": -277.4558410644531, + "loss": 0.8335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8332406878471375, + "rewards/margins": 1.774963617324829, + "rewards/rejected": -2.6082043647766113, + "step": 5590 + }, + { + "epoch": 0.64, + "learning_rate": 1.0826407585157437e-07, + "logits/chosen": -1.8434786796569824, + "logits/rejected": -2.1908059120178223, + "logps/chosen": -415.8649597167969, + "logps/rejected": -263.9734191894531, + "loss": 0.381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5045161247253418, + "rewards/margins": 1.4477251768112183, + "rewards/rejected": -1.9522411823272705, + "step": 5591 + }, + { + "epoch": 0.64, + "learning_rate": 1.0822895938195014e-07, + "logits/chosen": -1.8057587146759033, + "logits/rejected": -1.400474190711975, + "logps/chosen": -413.08026123046875, + "logps/rejected": -457.78240966796875, + "loss": 0.5426, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3500701189041138, + "rewards/margins": 1.5425841808319092, + "rewards/rejected": -2.8926541805267334, + "step": 5592 + }, + { + "epoch": 0.64, + "learning_rate": 1.0819384291232588e-07, + "logits/chosen": -2.178710699081421, + "logits/rejected": -2.351886749267578, + "logps/chosen": -247.4307403564453, + "logps/rejected": -326.09283447265625, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.997645914554596, + "rewards/margins": 4.379858016967773, + "rewards/rejected": -5.377503871917725, + "step": 5593 + }, + { + "epoch": 0.64, + "learning_rate": 1.0815872644270162e-07, + "logits/chosen": -2.0295114517211914, + "logits/rejected": -2.302011489868164, + "logps/chosen": -296.95355224609375, + "logps/rejected": -199.27291870117188, + "loss": 0.1873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9091602563858032, + "rewards/margins": 3.282567024230957, + "rewards/rejected": -4.191727161407471, + "step": 5594 + }, + { + "epoch": 0.64, + "learning_rate": 1.0812360997307736e-07, + "logits/chosen": -1.9718683958053589, + "logits/rejected": -2.00360369682312, + "logps/chosen": -335.1490478515625, + "logps/rejected": -278.7640686035156, + "loss": 0.3346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5854823589324951, + "rewards/margins": 1.7481465339660645, + "rewards/rejected": -2.3336288928985596, + "step": 5595 + }, + { + "epoch": 0.65, + "learning_rate": 1.0808849350345313e-07, + "logits/chosen": -1.996229648590088, + "logits/rejected": -1.878828763961792, + "logps/chosen": -389.683349609375, + "logps/rejected": -439.95391845703125, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5403751730918884, + "rewards/margins": 2.159529685974121, + "rewards/rejected": -2.699904680252075, + "step": 5596 + }, + { + "epoch": 0.65, + "learning_rate": 1.0805337703382887e-07, + "logits/chosen": -2.0390567779541016, + "logits/rejected": -2.049042224884033, + "logps/chosen": -355.02093505859375, + "logps/rejected": -299.6063537597656, + "loss": 0.8792, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9055866003036499, + "rewards/margins": 0.7443209886550903, + "rewards/rejected": -1.6499077081680298, + "step": 5597 + }, + { + "epoch": 0.65, + "learning_rate": 1.0801826056420461e-07, + "logits/chosen": -1.717152714729309, + "logits/rejected": -2.033006191253662, + "logps/chosen": -468.76312255859375, + "logps/rejected": -351.53759765625, + "loss": 0.1197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1294159889221191, + "rewards/margins": 3.0576748847961426, + "rewards/rejected": -4.187090873718262, + "step": 5598 + }, + { + "epoch": 0.65, + "learning_rate": 1.0798314409458035e-07, + "logits/chosen": -2.233872413635254, + "logits/rejected": -2.160238027572632, + "logps/chosen": -263.58758544921875, + "logps/rejected": -229.39566040039062, + "loss": 0.4159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10782565176486969, + "rewards/margins": 1.6999895572662354, + "rewards/rejected": -1.8078150749206543, + "step": 5599 + }, + { + "epoch": 0.65, + "learning_rate": 1.079480276249561e-07, + "logits/chosen": -2.4013795852661133, + "logits/rejected": -2.407835006713867, + "logps/chosen": -228.54019165039062, + "logps/rejected": -190.7622528076172, + "loss": 0.5403, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14045463502407074, + "rewards/margins": 1.2648131847381592, + "rewards/rejected": -1.4052678346633911, + "step": 5600 + }, + { + "epoch": 0.65, + "learning_rate": 1.0791291115533184e-07, + "logits/chosen": -2.1785454750061035, + "logits/rejected": -2.4887988567352295, + "logps/chosen": -398.0982666015625, + "logps/rejected": -291.22802734375, + "loss": 0.1482, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4517725706100464, + "rewards/margins": 3.2952332496643066, + "rewards/rejected": -3.7470054626464844, + "step": 5601 + }, + { + "epoch": 0.65, + "learning_rate": 1.078777946857076e-07, + "logits/chosen": -2.2058980464935303, + "logits/rejected": -2.2972564697265625, + "logps/chosen": -439.3061828613281, + "logps/rejected": -386.358154296875, + "loss": 0.2602, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2512134611606598, + "rewards/margins": 2.157348155975342, + "rewards/rejected": -2.4085617065429688, + "step": 5602 + }, + { + "epoch": 0.65, + "learning_rate": 1.0784267821608334e-07, + "logits/chosen": -2.2819292545318604, + "logits/rejected": -2.4838690757751465, + "logps/chosen": -425.13897705078125, + "logps/rejected": -348.1236267089844, + "loss": 0.5336, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5481464862823486, + "rewards/margins": 1.721369743347168, + "rewards/rejected": -2.2695159912109375, + "step": 5603 + }, + { + "epoch": 0.65, + "learning_rate": 1.0780756174645909e-07, + "logits/chosen": -2.0345189571380615, + "logits/rejected": -2.2497363090515137, + "logps/chosen": -685.9989624023438, + "logps/rejected": -549.169677734375, + "loss": 0.4617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2099250853061676, + "rewards/margins": 1.9796102046966553, + "rewards/rejected": -2.18953537940979, + "step": 5604 + }, + { + "epoch": 0.65, + "learning_rate": 1.0777244527683483e-07, + "logits/chosen": -2.0433013439178467, + "logits/rejected": -1.7987258434295654, + "logps/chosen": -326.54534912109375, + "logps/rejected": -336.5845947265625, + "loss": 0.2176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6944723129272461, + "rewards/margins": 4.010419845581055, + "rewards/rejected": -4.704892635345459, + "step": 5605 + }, + { + "epoch": 0.65, + "learning_rate": 1.0773732880721057e-07, + "logits/chosen": -2.455110788345337, + "logits/rejected": -2.2323079109191895, + "logps/chosen": -366.7408447265625, + "logps/rejected": -380.2358703613281, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6092917919158936, + "rewards/margins": 1.9642760753631592, + "rewards/rejected": -3.5735678672790527, + "step": 5606 + }, + { + "epoch": 0.65, + "learning_rate": 1.0770221233758631e-07, + "logits/chosen": -2.127835273742676, + "logits/rejected": -2.0448732376098633, + "logps/chosen": -253.0057830810547, + "logps/rejected": -387.5718994140625, + "loss": 0.3838, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12459655106067657, + "rewards/margins": 2.3586678504943848, + "rewards/rejected": -2.483264207839966, + "step": 5607 + }, + { + "epoch": 0.65, + "learning_rate": 1.0766709586796208e-07, + "logits/chosen": -2.329587936401367, + "logits/rejected": -2.5643258094787598, + "logps/chosen": -300.1282958984375, + "logps/rejected": -278.5567626953125, + "loss": 0.4111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8517460227012634, + "rewards/margins": 2.3316450119018555, + "rewards/rejected": -3.1833908557891846, + "step": 5608 + }, + { + "epoch": 0.65, + "learning_rate": 1.0763197939833782e-07, + "logits/chosen": -2.557091236114502, + "logits/rejected": -2.6145660877227783, + "logps/chosen": -288.943603515625, + "logps/rejected": -260.17852783203125, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9093581438064575, + "rewards/margins": 2.2731032371520996, + "rewards/rejected": -3.1824612617492676, + "step": 5609 + }, + { + "epoch": 0.65, + "learning_rate": 1.0759686292871356e-07, + "logits/chosen": -2.4029834270477295, + "logits/rejected": -2.3982601165771484, + "logps/chosen": -327.260009765625, + "logps/rejected": -282.06768798828125, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5124120712280273, + "rewards/margins": 3.99528431892395, + "rewards/rejected": -4.507696628570557, + "step": 5610 + }, + { + "epoch": 0.65, + "learning_rate": 1.075617464590893e-07, + "logits/chosen": -2.0922040939331055, + "logits/rejected": -2.122896194458008, + "logps/chosen": -147.6241455078125, + "logps/rejected": -153.61187744140625, + "loss": 0.5254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43518802523612976, + "rewards/margins": 0.9587024450302124, + "rewards/rejected": -1.3938905000686646, + "step": 5611 + }, + { + "epoch": 0.65, + "learning_rate": 1.0752662998946506e-07, + "logits/chosen": -2.450577735900879, + "logits/rejected": -2.467482805252075, + "logps/chosen": -309.4425964355469, + "logps/rejected": -330.58843994140625, + "loss": 0.2548, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0797570943832397, + "rewards/margins": 1.8771159648895264, + "rewards/rejected": -2.9568734169006348, + "step": 5612 + }, + { + "epoch": 0.65, + "learning_rate": 1.0749151351984081e-07, + "logits/chosen": -2.3028435707092285, + "logits/rejected": -2.2042300701141357, + "logps/chosen": -333.9439697265625, + "logps/rejected": -380.3712463378906, + "loss": 0.1465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6185139417648315, + "rewards/margins": 3.1087608337402344, + "rewards/rejected": -3.7272746562957764, + "step": 5613 + }, + { + "epoch": 0.65, + "learning_rate": 1.0745639705021655e-07, + "logits/chosen": -2.117309808731079, + "logits/rejected": -2.2813258171081543, + "logps/chosen": -246.82359313964844, + "logps/rejected": -178.05038452148438, + "loss": 0.6188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6931211948394775, + "rewards/margins": 1.167994499206543, + "rewards/rejected": -1.8611156940460205, + "step": 5614 + }, + { + "epoch": 0.65, + "learning_rate": 1.0742128058059229e-07, + "logits/chosen": -2.314840793609619, + "logits/rejected": -2.5595169067382812, + "logps/chosen": -405.5207214355469, + "logps/rejected": -230.56126403808594, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.559601068496704, + "rewards/margins": 1.0992881059646606, + "rewards/rejected": -3.658889055252075, + "step": 5615 + }, + { + "epoch": 0.65, + "learning_rate": 1.0738616411096804e-07, + "logits/chosen": -3.0658230781555176, + "logits/rejected": -3.013986349105835, + "logps/chosen": -222.94912719726562, + "logps/rejected": -265.2013854980469, + "loss": 0.1987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6741982698440552, + "rewards/margins": 2.871654510498047, + "rewards/rejected": -3.5458528995513916, + "step": 5616 + }, + { + "epoch": 0.65, + "learning_rate": 1.0735104764134379e-07, + "logits/chosen": -1.635995626449585, + "logits/rejected": -1.9761683940887451, + "logps/chosen": -207.94688415527344, + "logps/rejected": -198.60931396484375, + "loss": 0.3111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5475813150405884, + "rewards/margins": 1.3875553607940674, + "rewards/rejected": -1.9351366758346558, + "step": 5617 + }, + { + "epoch": 0.65, + "learning_rate": 1.0731593117171953e-07, + "logits/chosen": -2.1910436153411865, + "logits/rejected": -2.4972901344299316, + "logps/chosen": -442.8512878417969, + "logps/rejected": -257.710693359375, + "loss": 0.5765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.591421365737915, + "rewards/margins": 1.3347042798995972, + "rewards/rejected": -2.9261255264282227, + "step": 5618 + }, + { + "epoch": 0.65, + "learning_rate": 1.0728081470209528e-07, + "logits/chosen": -2.125455856323242, + "logits/rejected": -1.833089828491211, + "logps/chosen": -305.8532409667969, + "logps/rejected": -355.3576354980469, + "loss": 0.5027, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6512921452522278, + "rewards/margins": 0.9856712818145752, + "rewards/rejected": -1.6369634866714478, + "step": 5619 + }, + { + "epoch": 0.65, + "learning_rate": 1.0724569823247102e-07, + "logits/chosen": -2.1550841331481934, + "logits/rejected": -2.1846325397491455, + "logps/chosen": -347.1925354003906, + "logps/rejected": -350.3404846191406, + "loss": 0.4911, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44188573956489563, + "rewards/margins": 2.226423740386963, + "rewards/rejected": -2.6683096885681152, + "step": 5620 + }, + { + "epoch": 0.65, + "learning_rate": 1.0721058176284677e-07, + "logits/chosen": -1.9006553888320923, + "logits/rejected": -2.1182827949523926, + "logps/chosen": -181.38673400878906, + "logps/rejected": -158.2355499267578, + "loss": 0.4258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8381028771400452, + "rewards/margins": 0.7708994150161743, + "rewards/rejected": -1.6090022325515747, + "step": 5621 + }, + { + "epoch": 0.65, + "learning_rate": 1.0717546529322252e-07, + "logits/chosen": -2.1325886249542236, + "logits/rejected": -2.256268262863159, + "logps/chosen": -362.17486572265625, + "logps/rejected": -319.835693359375, + "loss": 0.4866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7508196234703064, + "rewards/margins": 1.4601390361785889, + "rewards/rejected": -2.210958480834961, + "step": 5622 + }, + { + "epoch": 0.65, + "learning_rate": 1.0714034882359826e-07, + "logits/chosen": -2.5514633655548096, + "logits/rejected": -2.36921763420105, + "logps/chosen": -270.3897705078125, + "logps/rejected": -371.35406494140625, + "loss": 0.2, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9938151836395264, + "rewards/margins": 2.980790376663208, + "rewards/rejected": -3.9746057987213135, + "step": 5623 + }, + { + "epoch": 0.65, + "learning_rate": 1.07105232353974e-07, + "logits/chosen": -2.371365785598755, + "logits/rejected": -2.4686384201049805, + "logps/chosen": -324.34100341796875, + "logps/rejected": -285.80340576171875, + "loss": 0.1377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.413335382938385, + "rewards/margins": 3.5004358291625977, + "rewards/rejected": -3.913771390914917, + "step": 5624 + }, + { + "epoch": 0.65, + "learning_rate": 1.0707011588434976e-07, + "logits/chosen": -2.3603098392486572, + "logits/rejected": -2.241528034210205, + "logps/chosen": -397.17303466796875, + "logps/rejected": -392.22039794921875, + "loss": 0.3724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.729623556137085, + "rewards/margins": 2.5205764770507812, + "rewards/rejected": -4.250200271606445, + "step": 5625 + }, + { + "epoch": 0.65, + "learning_rate": 1.070349994147255e-07, + "logits/chosen": -2.4640543460845947, + "logits/rejected": -2.5000972747802734, + "logps/chosen": -195.0125732421875, + "logps/rejected": -226.14080810546875, + "loss": 0.4498, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0513452291488647, + "rewards/margins": 1.7887089252471924, + "rewards/rejected": -2.8400542736053467, + "step": 5626 + }, + { + "epoch": 0.65, + "learning_rate": 1.0699988294510124e-07, + "logits/chosen": -1.8685075044631958, + "logits/rejected": -2.069225788116455, + "logps/chosen": -379.5164794921875, + "logps/rejected": -395.8551330566406, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5976929068565369, + "rewards/margins": 3.258848190307617, + "rewards/rejected": -3.856541156768799, + "step": 5627 + }, + { + "epoch": 0.65, + "learning_rate": 1.0696476647547699e-07, + "logits/chosen": -1.9062961339950562, + "logits/rejected": -1.8766331672668457, + "logps/chosen": -223.44879150390625, + "logps/rejected": -213.9352264404297, + "loss": 0.5111, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1833162307739258, + "rewards/margins": 1.4715023040771484, + "rewards/rejected": -2.654818534851074, + "step": 5628 + }, + { + "epoch": 0.65, + "learning_rate": 1.0692965000585274e-07, + "logits/chosen": -2.812434673309326, + "logits/rejected": -2.4676930904388428, + "logps/chosen": -177.1428680419922, + "logps/rejected": -193.751953125, + "loss": 0.7868, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1902793645858765, + "rewards/margins": 1.0837252140045166, + "rewards/rejected": -2.2740046977996826, + "step": 5629 + }, + { + "epoch": 0.65, + "learning_rate": 1.0689453353622849e-07, + "logits/chosen": -2.0272135734558105, + "logits/rejected": -2.3740599155426025, + "logps/chosen": -264.66754150390625, + "logps/rejected": -175.6949462890625, + "loss": 0.5511, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0671846866607666, + "rewards/margins": 2.500473737716675, + "rewards/rejected": -3.5676581859588623, + "step": 5630 + }, + { + "epoch": 0.65, + "learning_rate": 1.0685941706660423e-07, + "logits/chosen": -1.7309497594833374, + "logits/rejected": -1.6793313026428223, + "logps/chosen": -254.61270141601562, + "logps/rejected": -255.1759490966797, + "loss": 0.5233, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5166069269180298, + "rewards/margins": 2.443819522857666, + "rewards/rejected": -3.9604265689849854, + "step": 5631 + }, + { + "epoch": 0.65, + "learning_rate": 1.0682430059697997e-07, + "logits/chosen": -1.806485652923584, + "logits/rejected": -1.3924081325531006, + "logps/chosen": -183.73434448242188, + "logps/rejected": -346.2339172363281, + "loss": 0.2727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24158629775047302, + "rewards/margins": 2.0396595001220703, + "rewards/rejected": -2.2812459468841553, + "step": 5632 + }, + { + "epoch": 0.65, + "learning_rate": 1.0678918412735573e-07, + "logits/chosen": -2.9140076637268066, + "logits/rejected": -2.9188828468322754, + "logps/chosen": -172.00897216796875, + "logps/rejected": -210.37319946289062, + "loss": 0.2967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2149558663368225, + "rewards/margins": 1.6648229360580444, + "rewards/rejected": -1.8797786235809326, + "step": 5633 + }, + { + "epoch": 0.65, + "learning_rate": 1.0675406765773147e-07, + "logits/chosen": -2.4724903106689453, + "logits/rejected": -2.424015522003174, + "logps/chosen": -269.67333984375, + "logps/rejected": -349.21484375, + "loss": 0.206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.921817421913147, + "rewards/margins": 3.3271288871765137, + "rewards/rejected": -4.248946189880371, + "step": 5634 + }, + { + "epoch": 0.65, + "learning_rate": 1.0671895118810721e-07, + "logits/chosen": -2.6048238277435303, + "logits/rejected": -2.769782543182373, + "logps/chosen": -312.489501953125, + "logps/rejected": -363.16583251953125, + "loss": 0.1902, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.300346851348877, + "rewards/margins": 3.847957134246826, + "rewards/rejected": -5.148303985595703, + "step": 5635 + }, + { + "epoch": 0.65, + "learning_rate": 1.0668383471848296e-07, + "logits/chosen": -2.407343626022339, + "logits/rejected": -2.534348487854004, + "logps/chosen": -367.62353515625, + "logps/rejected": -235.82501220703125, + "loss": 0.2419, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7707759141921997, + "rewards/margins": 2.4347386360168457, + "rewards/rejected": -3.205514669418335, + "step": 5636 + }, + { + "epoch": 0.65, + "learning_rate": 1.0664871824885872e-07, + "logits/chosen": -2.305006504058838, + "logits/rejected": -2.3481664657592773, + "logps/chosen": -349.2642822265625, + "logps/rejected": -264.667724609375, + "loss": 0.6912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49092626571655273, + "rewards/margins": 0.958369255065918, + "rewards/rejected": -1.4492955207824707, + "step": 5637 + }, + { + "epoch": 0.65, + "learning_rate": 1.0661360177923446e-07, + "logits/chosen": -1.9832887649536133, + "logits/rejected": -2.3566579818725586, + "logps/chosen": -174.01889038085938, + "logps/rejected": -158.16358947753906, + "loss": 0.3151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5263004899024963, + "rewards/margins": 1.878779649734497, + "rewards/rejected": -2.4050800800323486, + "step": 5638 + }, + { + "epoch": 0.65, + "learning_rate": 1.065784853096102e-07, + "logits/chosen": -2.2318129539489746, + "logits/rejected": -2.323765277862549, + "logps/chosen": -230.85797119140625, + "logps/rejected": -236.1198272705078, + "loss": 0.21, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5241443514823914, + "rewards/margins": 3.2818410396575928, + "rewards/rejected": -3.805985450744629, + "step": 5639 + }, + { + "epoch": 0.65, + "learning_rate": 1.0654336883998594e-07, + "logits/chosen": -2.139678716659546, + "logits/rejected": -2.3132195472717285, + "logps/chosen": -405.6937255859375, + "logps/rejected": -348.08465576171875, + "loss": 0.3765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.210774540901184, + "rewards/margins": 1.9715893268585205, + "rewards/rejected": -3.182363986968994, + "step": 5640 + }, + { + "epoch": 0.65, + "learning_rate": 1.065082523703617e-07, + "logits/chosen": -2.2731804847717285, + "logits/rejected": -2.237326145172119, + "logps/chosen": -390.6229248046875, + "logps/rejected": -273.4651794433594, + "loss": 0.6326, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4170808792114258, + "rewards/margins": 1.9511241912841797, + "rewards/rejected": -3.3682053089141846, + "step": 5641 + }, + { + "epoch": 0.65, + "learning_rate": 1.0647313590073745e-07, + "logits/chosen": -1.8641865253448486, + "logits/rejected": -1.8124009370803833, + "logps/chosen": -252.398193359375, + "logps/rejected": -346.5149841308594, + "loss": 1.2822, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3993488550186157, + "rewards/margins": -0.030715972185134888, + "rewards/rejected": -1.3686329126358032, + "step": 5642 + }, + { + "epoch": 0.65, + "learning_rate": 1.0643801943111319e-07, + "logits/chosen": -2.1011972427368164, + "logits/rejected": -2.0411744117736816, + "logps/chosen": -471.7186279296875, + "logps/rejected": -393.56787109375, + "loss": 0.1784, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0004043579101562, + "rewards/margins": 2.1873908042907715, + "rewards/rejected": -3.1877946853637695, + "step": 5643 + }, + { + "epoch": 0.65, + "learning_rate": 1.0640290296148893e-07, + "logits/chosen": -2.8499255180358887, + "logits/rejected": -2.739769220352173, + "logps/chosen": -209.415283203125, + "logps/rejected": -261.223876953125, + "loss": 0.9503, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1522139310836792, + "rewards/margins": 1.7671856880187988, + "rewards/rejected": -2.9193994998931885, + "step": 5644 + }, + { + "epoch": 0.65, + "learning_rate": 1.0636778649186468e-07, + "logits/chosen": -2.449686050415039, + "logits/rejected": -2.7535858154296875, + "logps/chosen": -248.28294372558594, + "logps/rejected": -252.53126525878906, + "loss": 0.3613, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37471458315849304, + "rewards/margins": 1.958021879196167, + "rewards/rejected": -2.3327364921569824, + "step": 5645 + }, + { + "epoch": 0.65, + "learning_rate": 1.0633267002224042e-07, + "logits/chosen": -2.7482030391693115, + "logits/rejected": -2.709479331970215, + "logps/chosen": -127.42088317871094, + "logps/rejected": -280.70782470703125, + "loss": 0.2204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5550940632820129, + "rewards/margins": 2.7461905479431152, + "rewards/rejected": -3.3012845516204834, + "step": 5646 + }, + { + "epoch": 0.65, + "learning_rate": 1.0629755355261618e-07, + "logits/chosen": -2.313114643096924, + "logits/rejected": -2.508451223373413, + "logps/chosen": -203.09161376953125, + "logps/rejected": -251.91159057617188, + "loss": 0.4695, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9922420978546143, + "rewards/margins": 2.9146928787231445, + "rewards/rejected": -3.906935214996338, + "step": 5647 + }, + { + "epoch": 0.65, + "learning_rate": 1.0626243708299192e-07, + "logits/chosen": -1.8519558906555176, + "logits/rejected": -2.10243821144104, + "logps/chosen": -456.8333435058594, + "logps/rejected": -325.2626647949219, + "loss": 0.5027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22830884158611298, + "rewards/margins": 1.862246036529541, + "rewards/rejected": -2.09055495262146, + "step": 5648 + }, + { + "epoch": 0.65, + "learning_rate": 1.0622732061336767e-07, + "logits/chosen": -2.6316661834716797, + "logits/rejected": -2.6460537910461426, + "logps/chosen": -382.28643798828125, + "logps/rejected": -257.8642578125, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2938566505908966, + "rewards/margins": 3.851048707962036, + "rewards/rejected": -4.1449055671691895, + "step": 5649 + }, + { + "epoch": 0.65, + "learning_rate": 1.0619220414374341e-07, + "logits/chosen": -2.6458542346954346, + "logits/rejected": -2.7372851371765137, + "logps/chosen": -170.66017150878906, + "logps/rejected": -125.43811798095703, + "loss": 0.4208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5566586852073669, + "rewards/margins": 0.8988291025161743, + "rewards/rejected": -1.455487847328186, + "step": 5650 + }, + { + "epoch": 0.65, + "learning_rate": 1.0615708767411915e-07, + "logits/chosen": -2.451138973236084, + "logits/rejected": -2.4854722023010254, + "logps/chosen": -338.141845703125, + "logps/rejected": -326.40625, + "loss": 0.4747, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6066317558288574, + "rewards/margins": 1.0784398317337036, + "rewards/rejected": -2.6850714683532715, + "step": 5651 + }, + { + "epoch": 0.65, + "learning_rate": 1.0612197120449489e-07, + "logits/chosen": -2.1198909282684326, + "logits/rejected": -2.1757965087890625, + "logps/chosen": -292.1712646484375, + "logps/rejected": -304.87274169921875, + "loss": 0.3487, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.18336820602417, + "rewards/margins": 1.8963401317596436, + "rewards/rejected": -3.0797083377838135, + "step": 5652 + }, + { + "epoch": 0.65, + "learning_rate": 1.0608685473487066e-07, + "logits/chosen": -2.5779097080230713, + "logits/rejected": -2.5212156772613525, + "logps/chosen": -383.8000793457031, + "logps/rejected": -328.08709716796875, + "loss": 0.4149, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8195387125015259, + "rewards/margins": 1.4220458269119263, + "rewards/rejected": -2.241584539413452, + "step": 5653 + }, + { + "epoch": 0.65, + "learning_rate": 1.060517382652464e-07, + "logits/chosen": -2.540013313293457, + "logits/rejected": -2.707318067550659, + "logps/chosen": -314.0069885253906, + "logps/rejected": -285.8877868652344, + "loss": 0.6672, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7095750570297241, + "rewards/margins": 0.6664403676986694, + "rewards/rejected": -2.3760156631469727, + "step": 5654 + }, + { + "epoch": 0.65, + "learning_rate": 1.0601662179562214e-07, + "logits/chosen": -2.1300480365753174, + "logits/rejected": -2.2666566371917725, + "logps/chosen": -202.470947265625, + "logps/rejected": -259.21466064453125, + "loss": 0.1714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9343557953834534, + "rewards/margins": 4.6085076332092285, + "rewards/rejected": -5.542863368988037, + "step": 5655 + }, + { + "epoch": 0.65, + "learning_rate": 1.0598150532599788e-07, + "logits/chosen": -1.9161779880523682, + "logits/rejected": -2.1477365493774414, + "logps/chosen": -355.4100036621094, + "logps/rejected": -275.2769775390625, + "loss": 0.1853, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2644076943397522, + "rewards/margins": 2.367096185684204, + "rewards/rejected": -2.6315038204193115, + "step": 5656 + }, + { + "epoch": 0.65, + "learning_rate": 1.0594638885637364e-07, + "logits/chosen": -2.1417553424835205, + "logits/rejected": -2.1187193393707275, + "logps/chosen": -371.9355163574219, + "logps/rejected": -280.91650390625, + "loss": 0.2516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9348464608192444, + "rewards/margins": 1.838855266571045, + "rewards/rejected": -2.7737016677856445, + "step": 5657 + }, + { + "epoch": 0.65, + "learning_rate": 1.0591127238674939e-07, + "logits/chosen": -2.335456132888794, + "logits/rejected": -2.4510414600372314, + "logps/chosen": -343.34722900390625, + "logps/rejected": -425.6786804199219, + "loss": 0.1765, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0883607864379883, + "rewards/margins": 3.223012685775757, + "rewards/rejected": -4.311373710632324, + "step": 5658 + }, + { + "epoch": 0.65, + "learning_rate": 1.0587615591712513e-07, + "logits/chosen": -2.1657283306121826, + "logits/rejected": -2.238218307495117, + "logps/chosen": -394.17535400390625, + "logps/rejected": -285.6178283691406, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2833927869796753, + "rewards/margins": 2.1366286277770996, + "rewards/rejected": -3.4200215339660645, + "step": 5659 + }, + { + "epoch": 0.65, + "learning_rate": 1.0584103944750087e-07, + "logits/chosen": -2.38238525390625, + "logits/rejected": -1.9769011735916138, + "logps/chosen": -151.308349609375, + "logps/rejected": -304.7209167480469, + "loss": 0.9265, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6869240999221802, + "rewards/margins": 0.9533371329307556, + "rewards/rejected": -2.640261173248291, + "step": 5660 + }, + { + "epoch": 0.65, + "learning_rate": 1.0580592297787662e-07, + "logits/chosen": -1.912308692932129, + "logits/rejected": -1.8542673587799072, + "logps/chosen": -219.14695739746094, + "logps/rejected": -287.70111083984375, + "loss": 0.495, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.57902592420578, + "rewards/margins": 1.0080444812774658, + "rewards/rejected": -1.587070345878601, + "step": 5661 + }, + { + "epoch": 0.65, + "learning_rate": 1.0577080650825236e-07, + "logits/chosen": -2.7387471199035645, + "logits/rejected": -2.6675548553466797, + "logps/chosen": -145.60780334472656, + "logps/rejected": -224.58880615234375, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8437469601631165, + "rewards/margins": 3.9355430603027344, + "rewards/rejected": -4.779290199279785, + "step": 5662 + }, + { + "epoch": 0.65, + "learning_rate": 1.057356900386281e-07, + "logits/chosen": -2.3204104900360107, + "logits/rejected": -2.389822483062744, + "logps/chosen": -162.6722412109375, + "logps/rejected": -206.92437744140625, + "loss": 0.3189, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.163535714149475, + "rewards/margins": 2.331393003463745, + "rewards/rejected": -3.4949283599853516, + "step": 5663 + }, + { + "epoch": 0.65, + "learning_rate": 1.0570057356900386e-07, + "logits/chosen": -2.5674283504486084, + "logits/rejected": -2.3997786045074463, + "logps/chosen": -131.04714965820312, + "logps/rejected": -223.2749786376953, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8401740193367004, + "rewards/margins": 3.7120823860168457, + "rewards/rejected": -4.5522565841674805, + "step": 5664 + }, + { + "epoch": 0.65, + "learning_rate": 1.0566545709937961e-07, + "logits/chosen": -2.417459487915039, + "logits/rejected": -2.347984790802002, + "logps/chosen": -376.0937194824219, + "logps/rejected": -347.620361328125, + "loss": 0.7647, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8980207443237305, + "rewards/margins": 1.7594138383865356, + "rewards/rejected": -2.6574344635009766, + "step": 5665 + }, + { + "epoch": 0.65, + "learning_rate": 1.0563034062975535e-07, + "logits/chosen": -2.136585235595703, + "logits/rejected": -2.05348539352417, + "logps/chosen": -229.47042846679688, + "logps/rejected": -253.47024536132812, + "loss": 0.3507, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0591403245925903, + "rewards/margins": 1.6952770948410034, + "rewards/rejected": -2.7544174194335938, + "step": 5666 + }, + { + "epoch": 0.65, + "learning_rate": 1.055952241601311e-07, + "logits/chosen": -2.581386089324951, + "logits/rejected": -2.8544070720672607, + "logps/chosen": -272.68499755859375, + "logps/rejected": -278.5211181640625, + "loss": 0.5425, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.45875883102417, + "rewards/margins": 2.8793811798095703, + "rewards/rejected": -4.338139533996582, + "step": 5667 + }, + { + "epoch": 0.65, + "learning_rate": 1.0556010769050684e-07, + "logits/chosen": -2.2590126991271973, + "logits/rejected": -1.9395931959152222, + "logps/chosen": -176.84017944335938, + "logps/rejected": -314.5833435058594, + "loss": 0.1554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22662262618541718, + "rewards/margins": 3.5194973945617676, + "rewards/rejected": -3.292874813079834, + "step": 5668 + }, + { + "epoch": 0.65, + "learning_rate": 1.0552499122088258e-07, + "logits/chosen": -2.5076005458831787, + "logits/rejected": -2.422013282775879, + "logps/chosen": -246.11256408691406, + "logps/rejected": -184.1165771484375, + "loss": 0.7394, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2656922340393066, + "rewards/margins": 0.9693318009376526, + "rewards/rejected": -2.2350239753723145, + "step": 5669 + }, + { + "epoch": 0.65, + "learning_rate": 1.0548987475125834e-07, + "logits/chosen": -2.298521041870117, + "logits/rejected": -2.3364081382751465, + "logps/chosen": -225.97506713867188, + "logps/rejected": -251.31854248046875, + "loss": 0.1734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9442916512489319, + "rewards/margins": 4.122259140014648, + "rewards/rejected": -5.0665507316589355, + "step": 5670 + }, + { + "epoch": 0.65, + "learning_rate": 1.0545475828163408e-07, + "logits/chosen": -2.719184160232544, + "logits/rejected": -2.57948637008667, + "logps/chosen": -136.55389404296875, + "logps/rejected": -288.19342041015625, + "loss": 0.142, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09439735114574432, + "rewards/margins": 4.287075519561768, + "rewards/rejected": -4.192677974700928, + "step": 5671 + }, + { + "epoch": 0.65, + "learning_rate": 1.0541964181200982e-07, + "logits/chosen": -2.5185983180999756, + "logits/rejected": -2.632885456085205, + "logps/chosen": -124.23957824707031, + "logps/rejected": -173.80499267578125, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8598602414131165, + "rewards/margins": 2.4440598487854004, + "rewards/rejected": -3.303920030593872, + "step": 5672 + }, + { + "epoch": 0.65, + "learning_rate": 1.0538452534238556e-07, + "logits/chosen": -1.7944927215576172, + "logits/rejected": -1.9280427694320679, + "logps/chosen": -284.15985107421875, + "logps/rejected": -202.96231079101562, + "loss": 0.5053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42961305379867554, + "rewards/margins": 1.5387709140777588, + "rewards/rejected": -1.9683837890625, + "step": 5673 + }, + { + "epoch": 0.65, + "learning_rate": 1.0534940887276132e-07, + "logits/chosen": -2.298281669616699, + "logits/rejected": -2.470494508743286, + "logps/chosen": -405.09576416015625, + "logps/rejected": -352.04290771484375, + "loss": 0.4868, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2720073461532593, + "rewards/margins": 2.5363125801086426, + "rewards/rejected": -3.8083198070526123, + "step": 5674 + }, + { + "epoch": 0.65, + "learning_rate": 1.0531429240313707e-07, + "logits/chosen": -2.219029188156128, + "logits/rejected": -2.5608296394348145, + "logps/chosen": -425.3847351074219, + "logps/rejected": -274.70806884765625, + "loss": 0.2117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6170457005500793, + "rewards/margins": 2.044600009918213, + "rewards/rejected": -2.6616456508636475, + "step": 5675 + }, + { + "epoch": 0.65, + "learning_rate": 1.0527917593351281e-07, + "logits/chosen": -2.1080684661865234, + "logits/rejected": -1.9169576168060303, + "logps/chosen": -287.586669921875, + "logps/rejected": -270.7770080566406, + "loss": 0.3005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1690387725830078, + "rewards/margins": 1.9831522703170776, + "rewards/rejected": -3.152191162109375, + "step": 5676 + }, + { + "epoch": 0.65, + "learning_rate": 1.0524405946388855e-07, + "logits/chosen": -2.651211738586426, + "logits/rejected": -2.363330602645874, + "logps/chosen": -175.14865112304688, + "logps/rejected": -270.2941589355469, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6016370058059692, + "rewards/margins": 2.72334623336792, + "rewards/rejected": -3.3249831199645996, + "step": 5677 + }, + { + "epoch": 0.65, + "learning_rate": 1.0520894299426431e-07, + "logits/chosen": -2.173828363418579, + "logits/rejected": -2.071603775024414, + "logps/chosen": -415.19696044921875, + "logps/rejected": -461.73846435546875, + "loss": 0.1606, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.280199646949768, + "rewards/margins": 3.4798593521118164, + "rewards/rejected": -4.760058879852295, + "step": 5678 + }, + { + "epoch": 0.65, + "learning_rate": 1.0517382652464005e-07, + "logits/chosen": -2.9446043968200684, + "logits/rejected": -2.8157870769500732, + "logps/chosen": -150.44650268554688, + "logps/rejected": -288.963623046875, + "loss": 0.2521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29867225885391235, + "rewards/margins": 1.8686686754226685, + "rewards/rejected": -2.1673407554626465, + "step": 5679 + }, + { + "epoch": 0.65, + "learning_rate": 1.0513871005501579e-07, + "logits/chosen": -2.345489978790283, + "logits/rejected": -2.2938952445983887, + "logps/chosen": -505.8835754394531, + "logps/rejected": -322.29998779296875, + "loss": 0.2951, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0550458431243896, + "rewards/margins": 2.308913230895996, + "rewards/rejected": -3.363959312438965, + "step": 5680 + }, + { + "epoch": 0.65, + "learning_rate": 1.0510359358539154e-07, + "logits/chosen": -2.3227031230926514, + "logits/rejected": -2.3997042179107666, + "logps/chosen": -294.25518798828125, + "logps/rejected": -225.22137451171875, + "loss": 0.7558, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2227768898010254, + "rewards/margins": 0.7148479223251343, + "rewards/rejected": -1.9376246929168701, + "step": 5681 + }, + { + "epoch": 0.66, + "learning_rate": 1.050684771157673e-07, + "logits/chosen": -1.9755089282989502, + "logits/rejected": -1.9157469272613525, + "logps/chosen": -391.5302734375, + "logps/rejected": -298.05572509765625, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5045655965805054, + "rewards/margins": 2.3746652603149414, + "rewards/rejected": -2.8792309761047363, + "step": 5682 + }, + { + "epoch": 0.66, + "learning_rate": 1.0503336064614304e-07, + "logits/chosen": -2.4700534343719482, + "logits/rejected": -2.442023515701294, + "logps/chosen": -276.93267822265625, + "logps/rejected": -537.17431640625, + "loss": 0.1508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7490823268890381, + "rewards/margins": 4.3788228034973145, + "rewards/rejected": -5.127905368804932, + "step": 5683 + }, + { + "epoch": 0.66, + "learning_rate": 1.0499824417651878e-07, + "logits/chosen": -1.921546459197998, + "logits/rejected": -2.3695452213287354, + "logps/chosen": -423.7071533203125, + "logps/rejected": -215.4486083984375, + "loss": 0.2961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7939810752868652, + "rewards/margins": 2.0178279876708984, + "rewards/rejected": -2.8118090629577637, + "step": 5684 + }, + { + "epoch": 0.66, + "learning_rate": 1.0496312770689452e-07, + "logits/chosen": -2.5501065254211426, + "logits/rejected": -2.5665245056152344, + "logps/chosen": -233.47096252441406, + "logps/rejected": -190.79493713378906, + "loss": 0.8834, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8877928256988525, + "rewards/margins": 0.8280408978462219, + "rewards/rejected": -2.715833902359009, + "step": 5685 + }, + { + "epoch": 0.66, + "learning_rate": 1.0492801123727029e-07, + "logits/chosen": -2.234172821044922, + "logits/rejected": -1.9856493473052979, + "logps/chosen": -184.27760314941406, + "logps/rejected": -302.53314208984375, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31428587436676025, + "rewards/margins": 2.8526554107666016, + "rewards/rejected": -3.1669411659240723, + "step": 5686 + }, + { + "epoch": 0.66, + "learning_rate": 1.0489289476764603e-07, + "logits/chosen": -2.303079128265381, + "logits/rejected": -2.417276382446289, + "logps/chosen": -240.9273223876953, + "logps/rejected": -284.09326171875, + "loss": 0.4136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9340453743934631, + "rewards/margins": 1.8214831352233887, + "rewards/rejected": -2.755528450012207, + "step": 5687 + }, + { + "epoch": 0.66, + "learning_rate": 1.0485777829802177e-07, + "logits/chosen": -2.156590700149536, + "logits/rejected": -1.9298861026763916, + "logps/chosen": -138.56259155273438, + "logps/rejected": -310.51116943359375, + "loss": 0.2862, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5507123470306396, + "rewards/margins": 2.7627382278442383, + "rewards/rejected": -3.313450813293457, + "step": 5688 + }, + { + "epoch": 0.66, + "learning_rate": 1.0482266182839751e-07, + "logits/chosen": -2.2891130447387695, + "logits/rejected": -2.3379080295562744, + "logps/chosen": -385.4124755859375, + "logps/rejected": -361.45611572265625, + "loss": 0.3407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5394293665885925, + "rewards/margins": 3.196798324584961, + "rewards/rejected": -3.736227512359619, + "step": 5689 + }, + { + "epoch": 0.66, + "learning_rate": 1.0478754535877326e-07, + "logits/chosen": -2.4775166511535645, + "logits/rejected": -2.6067442893981934, + "logps/chosen": -329.22760009765625, + "logps/rejected": -183.94009399414062, + "loss": 0.1764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2114008367061615, + "rewards/margins": 2.041354179382324, + "rewards/rejected": -2.2527551651000977, + "step": 5690 + }, + { + "epoch": 0.66, + "learning_rate": 1.04752428889149e-07, + "logits/chosen": -1.9750969409942627, + "logits/rejected": -2.085726499557495, + "logps/chosen": -265.07464599609375, + "logps/rejected": -227.4579315185547, + "loss": 0.5145, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.996993899345398, + "rewards/margins": 0.9955333471298218, + "rewards/rejected": -1.9925271272659302, + "step": 5691 + }, + { + "epoch": 0.66, + "learning_rate": 1.0471731241952476e-07, + "logits/chosen": -1.6703102588653564, + "logits/rejected": -1.4915144443511963, + "logps/chosen": -348.66802978515625, + "logps/rejected": -418.78912353515625, + "loss": 0.2381, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6815681457519531, + "rewards/margins": 2.7500290870666504, + "rewards/rejected": -4.4315972328186035, + "step": 5692 + }, + { + "epoch": 0.66, + "learning_rate": 1.046821959499005e-07, + "logits/chosen": -2.6127572059631348, + "logits/rejected": -2.4581902027130127, + "logps/chosen": -82.60704803466797, + "logps/rejected": -138.285400390625, + "loss": 0.3714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.019514113664627075, + "rewards/margins": 1.8444013595581055, + "rewards/rejected": -1.8639154434204102, + "step": 5693 + }, + { + "epoch": 0.66, + "learning_rate": 1.0464707948027625e-07, + "logits/chosen": -2.152885913848877, + "logits/rejected": -2.2404067516326904, + "logps/chosen": -323.420166015625, + "logps/rejected": -260.9515686035156, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.733629584312439, + "rewards/margins": 0.9388543367385864, + "rewards/rejected": -1.672484040260315, + "step": 5694 + }, + { + "epoch": 0.66, + "learning_rate": 1.0461196301065199e-07, + "logits/chosen": -2.2371981143951416, + "logits/rejected": -2.4369029998779297, + "logps/chosen": -224.87960815429688, + "logps/rejected": -202.58773803710938, + "loss": 0.3635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5701181292533875, + "rewards/margins": 1.1959998607635498, + "rewards/rejected": -1.7661179304122925, + "step": 5695 + }, + { + "epoch": 0.66, + "learning_rate": 1.0457684654102773e-07, + "logits/chosen": -1.9629602432250977, + "logits/rejected": -1.963181734085083, + "logps/chosen": -574.3848266601562, + "logps/rejected": -381.58636474609375, + "loss": 0.1888, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9546892642974854, + "rewards/margins": 2.9975173473358154, + "rewards/rejected": -3.952206611633301, + "step": 5696 + }, + { + "epoch": 0.66, + "learning_rate": 1.0454173007140347e-07, + "logits/chosen": -2.466557502746582, + "logits/rejected": -2.621596336364746, + "logps/chosen": -141.0678253173828, + "logps/rejected": -135.29347229003906, + "loss": 0.2034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6601530313491821, + "rewards/margins": 2.3164241313934326, + "rewards/rejected": -2.976577043533325, + "step": 5697 + }, + { + "epoch": 0.66, + "learning_rate": 1.0450661360177924e-07, + "logits/chosen": -2.8207778930664062, + "logits/rejected": -2.7943308353424072, + "logps/chosen": -198.5325927734375, + "logps/rejected": -164.2638702392578, + "loss": 0.6455, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3254730701446533, + "rewards/margins": 1.2059451341629028, + "rewards/rejected": -2.5314178466796875, + "step": 5698 + }, + { + "epoch": 0.66, + "learning_rate": 1.0447149713215498e-07, + "logits/chosen": -2.0074989795684814, + "logits/rejected": -2.1015431880950928, + "logps/chosen": -338.10650634765625, + "logps/rejected": -265.97216796875, + "loss": 0.3856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5720778703689575, + "rewards/margins": 1.375298023223877, + "rewards/rejected": -1.9473758935928345, + "step": 5699 + }, + { + "epoch": 0.66, + "learning_rate": 1.0443638066253072e-07, + "logits/chosen": -2.7385709285736084, + "logits/rejected": -2.6709582805633545, + "logps/chosen": -295.9437255859375, + "logps/rejected": -256.158203125, + "loss": 0.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1277034282684326, + "rewards/margins": 2.9576072692871094, + "rewards/rejected": -4.085310459136963, + "step": 5700 + }, + { + "epoch": 0.66, + "learning_rate": 1.0440126419290646e-07, + "logits/chosen": -3.0656261444091797, + "logits/rejected": -2.9948127269744873, + "logps/chosen": -241.9866943359375, + "logps/rejected": -156.59381103515625, + "loss": 0.5495, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0609720945358276, + "rewards/margins": 1.202999472618103, + "rewards/rejected": -2.2639715671539307, + "step": 5701 + }, + { + "epoch": 0.66, + "learning_rate": 1.0436614772328223e-07, + "logits/chosen": -1.9386740922927856, + "logits/rejected": -2.1603896617889404, + "logps/chosen": -609.4570922851562, + "logps/rejected": -373.91717529296875, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04111519455909729, + "rewards/margins": 2.1379001140594482, + "rewards/rejected": -2.096784830093384, + "step": 5702 + }, + { + "epoch": 0.66, + "learning_rate": 1.0433103125365797e-07, + "logits/chosen": -2.3202919960021973, + "logits/rejected": -2.4443297386169434, + "logps/chosen": -233.43174743652344, + "logps/rejected": -295.2108459472656, + "loss": 0.2665, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3245539665222168, + "rewards/margins": 2.1804087162017822, + "rewards/rejected": -3.504962921142578, + "step": 5703 + }, + { + "epoch": 0.66, + "learning_rate": 1.0429591478403371e-07, + "logits/chosen": -2.3717217445373535, + "logits/rejected": -2.1728713512420654, + "logps/chosen": -424.6638488769531, + "logps/rejected": -357.93743896484375, + "loss": 0.3591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6052929162979126, + "rewards/margins": 1.706444263458252, + "rewards/rejected": -3.311737298965454, + "step": 5704 + }, + { + "epoch": 0.66, + "learning_rate": 1.0426079831440945e-07, + "logits/chosen": -1.785844087600708, + "logits/rejected": -1.7854957580566406, + "logps/chosen": -174.42269897460938, + "logps/rejected": -231.7388458251953, + "loss": 0.2683, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7294593453407288, + "rewards/margins": 3.1340508460998535, + "rewards/rejected": -3.8635103702545166, + "step": 5705 + }, + { + "epoch": 0.66, + "learning_rate": 1.042256818447852e-07, + "logits/chosen": -2.035210132598877, + "logits/rejected": -2.184805154800415, + "logps/chosen": -186.44113159179688, + "logps/rejected": -172.67144775390625, + "loss": 0.5824, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.150192141532898, + "rewards/margins": 0.43023961782455444, + "rewards/rejected": -1.5804316997528076, + "step": 5706 + }, + { + "epoch": 0.66, + "learning_rate": 1.0419056537516094e-07, + "logits/chosen": -2.5392239093780518, + "logits/rejected": -2.643209934234619, + "logps/chosen": -161.57931518554688, + "logps/rejected": -209.2503662109375, + "loss": 0.274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2500506341457367, + "rewards/margins": 3.3888814449310303, + "rewards/rejected": -3.638932228088379, + "step": 5707 + }, + { + "epoch": 0.66, + "learning_rate": 1.0415544890553668e-07, + "logits/chosen": -1.7633156776428223, + "logits/rejected": -2.2284533977508545, + "logps/chosen": -435.9176330566406, + "logps/rejected": -481.9520568847656, + "loss": 0.5818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9884277582168579, + "rewards/margins": 1.3457752466201782, + "rewards/rejected": -2.334202766418457, + "step": 5708 + }, + { + "epoch": 0.66, + "learning_rate": 1.0412033243591244e-07, + "logits/chosen": -2.8011467456817627, + "logits/rejected": -2.520160436630249, + "logps/chosen": -238.44436645507812, + "logps/rejected": -359.74920654296875, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44218945503234863, + "rewards/margins": 3.5146713256835938, + "rewards/rejected": -3.9568605422973633, + "step": 5709 + }, + { + "epoch": 0.66, + "learning_rate": 1.0408521596628819e-07, + "logits/chosen": -2.5002825260162354, + "logits/rejected": -2.7217118740081787, + "logps/chosen": -262.3436279296875, + "logps/rejected": -210.7422637939453, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5573402643203735, + "rewards/margins": 2.2496848106384277, + "rewards/rejected": -2.8070249557495117, + "step": 5710 + }, + { + "epoch": 0.66, + "learning_rate": 1.0405009949666393e-07, + "logits/chosen": -2.7550952434539795, + "logits/rejected": -2.7177863121032715, + "logps/chosen": -285.30035400390625, + "logps/rejected": -171.1215057373047, + "loss": 0.3159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9162176847457886, + "rewards/margins": 1.674346923828125, + "rewards/rejected": -2.590564727783203, + "step": 5711 + }, + { + "epoch": 0.66, + "learning_rate": 1.0401498302703967e-07, + "logits/chosen": -2.1477396488189697, + "logits/rejected": -2.1626768112182617, + "logps/chosen": -226.95428466796875, + "logps/rejected": -267.4537658691406, + "loss": 0.4574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9913842678070068, + "rewards/margins": 1.6327577829360962, + "rewards/rejected": -2.6241421699523926, + "step": 5712 + }, + { + "epoch": 0.66, + "learning_rate": 1.0397986655741541e-07, + "logits/chosen": -2.329615354537964, + "logits/rejected": -2.6895484924316406, + "logps/chosen": -252.43252563476562, + "logps/rejected": -257.3902282714844, + "loss": 0.3232, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7277745604515076, + "rewards/margins": 1.568662405014038, + "rewards/rejected": -2.2964370250701904, + "step": 5713 + }, + { + "epoch": 0.66, + "learning_rate": 1.0394475008779118e-07, + "logits/chosen": -2.8019940853118896, + "logits/rejected": -2.675474166870117, + "logps/chosen": -271.59716796875, + "logps/rejected": -304.78076171875, + "loss": 0.1793, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0437378883361816, + "rewards/margins": 3.4706954956054688, + "rewards/rejected": -4.514432907104492, + "step": 5714 + }, + { + "epoch": 0.66, + "learning_rate": 1.0390963361816692e-07, + "logits/chosen": -2.9676547050476074, + "logits/rejected": -2.9903006553649902, + "logps/chosen": -118.59181213378906, + "logps/rejected": -233.93560791015625, + "loss": 0.4185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8872296810150146, + "rewards/margins": 2.613929510116577, + "rewards/rejected": -3.501159191131592, + "step": 5715 + }, + { + "epoch": 0.66, + "learning_rate": 1.0387451714854266e-07, + "logits/chosen": -2.5413718223571777, + "logits/rejected": -2.7260096073150635, + "logps/chosen": -306.9344482421875, + "logps/rejected": -254.91322326660156, + "loss": 0.2356, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1909322738647461, + "rewards/margins": 1.7354875802993774, + "rewards/rejected": -1.5445551872253418, + "step": 5716 + }, + { + "epoch": 0.66, + "learning_rate": 1.038394006789184e-07, + "logits/chosen": -2.507018566131592, + "logits/rejected": -2.313147783279419, + "logps/chosen": -304.9520568847656, + "logps/rejected": -406.07769775390625, + "loss": 0.2806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6595777869224548, + "rewards/margins": 3.1311697959899902, + "rewards/rejected": -3.79074764251709, + "step": 5717 + }, + { + "epoch": 0.66, + "learning_rate": 1.0380428420929414e-07, + "logits/chosen": -2.4249134063720703, + "logits/rejected": -2.1288821697235107, + "logps/chosen": -178.94927978515625, + "logps/rejected": -288.40966796875, + "loss": 0.4605, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.031457781791687, + "rewards/margins": 2.1030497550964355, + "rewards/rejected": -3.134507656097412, + "step": 5718 + }, + { + "epoch": 0.66, + "learning_rate": 1.0376916773966991e-07, + "logits/chosen": -2.4656498432159424, + "logits/rejected": -2.714864730834961, + "logps/chosen": -243.68521118164062, + "logps/rejected": -200.15281677246094, + "loss": 0.5729, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1591217517852783, + "rewards/margins": 1.1501410007476807, + "rewards/rejected": -2.309262752532959, + "step": 5719 + }, + { + "epoch": 0.66, + "learning_rate": 1.0373405127004565e-07, + "logits/chosen": -2.6968984603881836, + "logits/rejected": -2.5647103786468506, + "logps/chosen": -179.543701171875, + "logps/rejected": -188.81240844726562, + "loss": 0.4201, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8223052024841309, + "rewards/margins": 1.444286584854126, + "rewards/rejected": -3.2665915489196777, + "step": 5720 + }, + { + "epoch": 0.66, + "learning_rate": 1.0369893480042139e-07, + "logits/chosen": -2.291816234588623, + "logits/rejected": -2.69321870803833, + "logps/chosen": -357.4403991699219, + "logps/rejected": -315.59442138671875, + "loss": 0.4697, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3053053617477417, + "rewards/margins": 1.4985462427139282, + "rewards/rejected": -2.803851842880249, + "step": 5721 + }, + { + "epoch": 0.66, + "learning_rate": 1.0366381833079713e-07, + "logits/chosen": -2.5653905868530273, + "logits/rejected": -2.371644973754883, + "logps/chosen": -253.13668823242188, + "logps/rejected": -269.6716613769531, + "loss": 0.2009, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3141050338745117, + "rewards/margins": 3.4303066730499268, + "rewards/rejected": -4.744411468505859, + "step": 5722 + }, + { + "epoch": 0.66, + "learning_rate": 1.0362870186117289e-07, + "logits/chosen": -2.474900960922241, + "logits/rejected": -2.5758140087127686, + "logps/chosen": -159.88973999023438, + "logps/rejected": -327.5445251464844, + "loss": 0.2215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29797011613845825, + "rewards/margins": 2.7032580375671387, + "rewards/rejected": -3.0012283325195312, + "step": 5723 + }, + { + "epoch": 0.66, + "learning_rate": 1.0359358539154863e-07, + "logits/chosen": -2.055781602859497, + "logits/rejected": -2.4604218006134033, + "logps/chosen": -241.7636260986328, + "logps/rejected": -197.8364715576172, + "loss": 0.9123, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.288061261177063, + "rewards/margins": 1.2242389917373657, + "rewards/rejected": -2.5123000144958496, + "step": 5724 + }, + { + "epoch": 0.66, + "learning_rate": 1.0355846892192437e-07, + "logits/chosen": -2.5089468955993652, + "logits/rejected": -2.381629228591919, + "logps/chosen": -310.4273986816406, + "logps/rejected": -251.5543975830078, + "loss": 0.4476, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0255155563354492, + "rewards/margins": 1.0527997016906738, + "rewards/rejected": -2.078315258026123, + "step": 5725 + }, + { + "epoch": 0.66, + "learning_rate": 1.0352335245230012e-07, + "logits/chosen": -1.9565938711166382, + "logits/rejected": -1.9590569734573364, + "logps/chosen": -184.97808837890625, + "logps/rejected": -163.8845672607422, + "loss": 0.6529, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.403435230255127, + "rewards/margins": 1.3752846717834473, + "rewards/rejected": -2.778719902038574, + "step": 5726 + }, + { + "epoch": 0.66, + "learning_rate": 1.0348823598267588e-07, + "logits/chosen": -2.6491541862487793, + "logits/rejected": -2.676936149597168, + "logps/chosen": -332.49383544921875, + "logps/rejected": -360.8451232910156, + "loss": 0.3321, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0506870746612549, + "rewards/margins": 1.6553637981414795, + "rewards/rejected": -2.7060508728027344, + "step": 5727 + }, + { + "epoch": 0.66, + "learning_rate": 1.0345311951305162e-07, + "logits/chosen": -1.9545543193817139, + "logits/rejected": -1.9132130146026611, + "logps/chosen": -324.6972351074219, + "logps/rejected": -322.864013671875, + "loss": 0.1884, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2158675193786621, + "rewards/margins": 2.735900640487671, + "rewards/rejected": -2.951768159866333, + "step": 5728 + }, + { + "epoch": 0.66, + "learning_rate": 1.0341800304342736e-07, + "logits/chosen": -2.4907193183898926, + "logits/rejected": -2.5974342823028564, + "logps/chosen": -269.1514892578125, + "logps/rejected": -311.33697509765625, + "loss": 0.7435, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3471369743347168, + "rewards/margins": 0.9533823728561401, + "rewards/rejected": -2.3005192279815674, + "step": 5729 + }, + { + "epoch": 0.66, + "learning_rate": 1.033828865738031e-07, + "logits/chosen": -2.5680992603302, + "logits/rejected": -2.730490207672119, + "logps/chosen": -176.63958740234375, + "logps/rejected": -175.8253173828125, + "loss": 0.312, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7767866849899292, + "rewards/margins": 1.873018741607666, + "rewards/rejected": -2.6498053073883057, + "step": 5730 + }, + { + "epoch": 0.66, + "learning_rate": 1.0334777010417886e-07, + "logits/chosen": -2.8148436546325684, + "logits/rejected": -2.8260810375213623, + "logps/chosen": -202.02487182617188, + "logps/rejected": -206.17047119140625, + "loss": 0.6067, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5113780498504639, + "rewards/margins": 1.7987616062164307, + "rewards/rejected": -3.3101396560668945, + "step": 5731 + }, + { + "epoch": 0.66, + "learning_rate": 1.033126536345546e-07, + "logits/chosen": -2.3554999828338623, + "logits/rejected": -2.433112144470215, + "logps/chosen": -320.044921875, + "logps/rejected": -345.0216369628906, + "loss": 0.4606, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6682032942771912, + "rewards/margins": 2.156243085861206, + "rewards/rejected": -2.824446201324463, + "step": 5732 + }, + { + "epoch": 0.66, + "learning_rate": 1.0327753716493035e-07, + "logits/chosen": -2.279508590698242, + "logits/rejected": -2.2670257091522217, + "logps/chosen": -296.9471740722656, + "logps/rejected": -324.8597412109375, + "loss": 0.5909, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4840614795684814, + "rewards/margins": 1.730691909790039, + "rewards/rejected": -3.2147533893585205, + "step": 5733 + }, + { + "epoch": 0.66, + "learning_rate": 1.0324242069530609e-07, + "logits/chosen": -2.738952398300171, + "logits/rejected": -2.664668560028076, + "logps/chosen": -255.69790649414062, + "logps/rejected": -322.28515625, + "loss": 0.4537, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.290281891822815, + "rewards/margins": 1.5269134044647217, + "rewards/rejected": -2.817195415496826, + "step": 5734 + }, + { + "epoch": 0.66, + "learning_rate": 1.0320730422568184e-07, + "logits/chosen": -2.6271159648895264, + "logits/rejected": -2.443211555480957, + "logps/chosen": -401.49554443359375, + "logps/rejected": -455.66485595703125, + "loss": 0.1709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7698580026626587, + "rewards/margins": 2.540374994277954, + "rewards/rejected": -3.3102328777313232, + "step": 5735 + }, + { + "epoch": 0.66, + "learning_rate": 1.031721877560576e-07, + "logits/chosen": -1.8890104293823242, + "logits/rejected": -2.535559892654419, + "logps/chosen": -359.26947021484375, + "logps/rejected": -235.79440307617188, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2986747920513153, + "rewards/margins": 2.555190086364746, + "rewards/rejected": -2.2565150260925293, + "step": 5736 + }, + { + "epoch": 0.66, + "learning_rate": 1.0313707128643333e-07, + "logits/chosen": -2.049762725830078, + "logits/rejected": -2.597771167755127, + "logps/chosen": -483.8975524902344, + "logps/rejected": -307.39111328125, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7763696908950806, + "rewards/margins": 2.9110069274902344, + "rewards/rejected": -3.6873764991760254, + "step": 5737 + }, + { + "epoch": 0.66, + "learning_rate": 1.0310195481680908e-07, + "logits/chosen": -2.5682287216186523, + "logits/rejected": -2.5477845668792725, + "logps/chosen": -163.78221130371094, + "logps/rejected": -180.71780395507812, + "loss": 0.3223, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08506549894809723, + "rewards/margins": 1.8973002433776855, + "rewards/rejected": -1.9823658466339111, + "step": 5738 + }, + { + "epoch": 0.66, + "learning_rate": 1.0306683834718483e-07, + "logits/chosen": -2.2826128005981445, + "logits/rejected": -2.469999074935913, + "logps/chosen": -482.241455078125, + "logps/rejected": -419.9446105957031, + "loss": 0.2141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01264868676662445, + "rewards/margins": 3.080867052078247, + "rewards/rejected": -3.0935158729553223, + "step": 5739 + }, + { + "epoch": 0.66, + "learning_rate": 1.0303172187756057e-07, + "logits/chosen": -2.2141008377075195, + "logits/rejected": -2.285714626312256, + "logps/chosen": -211.00677490234375, + "logps/rejected": -179.84471130371094, + "loss": 0.6041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7044484615325928, + "rewards/margins": 1.0390536785125732, + "rewards/rejected": -1.743502140045166, + "step": 5740 + }, + { + "epoch": 0.66, + "learning_rate": 1.0299660540793631e-07, + "logits/chosen": -2.4067182540893555, + "logits/rejected": -2.3694379329681396, + "logps/chosen": -303.5843200683594, + "logps/rejected": -329.01715087890625, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5766245126724243, + "rewards/margins": 1.457743763923645, + "rewards/rejected": -2.0343685150146484, + "step": 5741 + }, + { + "epoch": 0.66, + "learning_rate": 1.0296148893831205e-07, + "logits/chosen": -2.2192835807800293, + "logits/rejected": -1.8990222215652466, + "logps/chosen": -269.4852294921875, + "logps/rejected": -249.796142578125, + "loss": 0.2033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45179906487464905, + "rewards/margins": 2.0734810829162598, + "rewards/rejected": -2.525280475616455, + "step": 5742 + }, + { + "epoch": 0.66, + "learning_rate": 1.0292637246868782e-07, + "logits/chosen": -2.534597635269165, + "logits/rejected": -2.626574754714966, + "logps/chosen": -301.3621520996094, + "logps/rejected": -236.03904724121094, + "loss": 0.2921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3807438611984253, + "rewards/margins": 1.939186453819275, + "rewards/rejected": -3.3199303150177, + "step": 5743 + }, + { + "epoch": 0.66, + "learning_rate": 1.0289125599906356e-07, + "logits/chosen": -2.3341944217681885, + "logits/rejected": -2.4175450801849365, + "logps/chosen": -288.64483642578125, + "logps/rejected": -282.1702575683594, + "loss": 0.3534, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5436795949935913, + "rewards/margins": 3.0637640953063965, + "rewards/rejected": -4.607443809509277, + "step": 5744 + }, + { + "epoch": 0.66, + "learning_rate": 1.028561395294393e-07, + "logits/chosen": -2.6094963550567627, + "logits/rejected": -2.304442882537842, + "logps/chosen": -358.95367431640625, + "logps/rejected": -211.79888916015625, + "loss": 0.5993, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.777927041053772, + "rewards/margins": 1.4006092548370361, + "rewards/rejected": -2.1785364151000977, + "step": 5745 + }, + { + "epoch": 0.66, + "learning_rate": 1.0282102305981504e-07, + "logits/chosen": -2.979985475540161, + "logits/rejected": -2.992115020751953, + "logps/chosen": -196.01123046875, + "logps/rejected": -178.87420654296875, + "loss": 0.7111, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9266474843025208, + "rewards/margins": 1.129477858543396, + "rewards/rejected": -2.0561254024505615, + "step": 5746 + }, + { + "epoch": 0.66, + "learning_rate": 1.0278590659019081e-07, + "logits/chosen": -2.7230138778686523, + "logits/rejected": -2.6069114208221436, + "logps/chosen": -240.11529541015625, + "logps/rejected": -257.0327453613281, + "loss": 0.259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3855454921722412, + "rewards/margins": 2.395850896835327, + "rewards/rejected": -3.7813961505889893, + "step": 5747 + }, + { + "epoch": 0.66, + "learning_rate": 1.0275079012056655e-07, + "logits/chosen": -2.424903392791748, + "logits/rejected": -2.5467004776000977, + "logps/chosen": -308.0525207519531, + "logps/rejected": -233.62725830078125, + "loss": 0.8076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8607410192489624, + "rewards/margins": 2.942540407180786, + "rewards/rejected": -3.803281307220459, + "step": 5748 + }, + { + "epoch": 0.66, + "learning_rate": 1.0271567365094229e-07, + "logits/chosen": -2.7660584449768066, + "logits/rejected": -2.769490957260132, + "logps/chosen": -252.9109344482422, + "logps/rejected": -399.31927490234375, + "loss": 0.2999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.476326584815979, + "rewards/margins": 1.9505536556243896, + "rewards/rejected": -2.426880359649658, + "step": 5749 + }, + { + "epoch": 0.66, + "learning_rate": 1.0268055718131803e-07, + "logits/chosen": -2.327259063720703, + "logits/rejected": -2.437598943710327, + "logps/chosen": -184.0611572265625, + "logps/rejected": -159.24899291992188, + "loss": 0.2381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14388743042945862, + "rewards/margins": 2.179372787475586, + "rewards/rejected": -2.3232603073120117, + "step": 5750 + }, + { + "epoch": 0.66, + "learning_rate": 1.0264544071169378e-07, + "logits/chosen": -1.5329999923706055, + "logits/rejected": -2.2509496212005615, + "logps/chosen": -718.4505615234375, + "logps/rejected": -426.14971923828125, + "loss": 0.5225, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.851439356803894, + "rewards/margins": 2.1068716049194336, + "rewards/rejected": -2.958310842514038, + "step": 5751 + }, + { + "epoch": 0.66, + "learning_rate": 1.0261032424206952e-07, + "logits/chosen": -1.924591064453125, + "logits/rejected": -2.3919308185577393, + "logps/chosen": -347.8149719238281, + "logps/rejected": -289.066162109375, + "loss": 0.2787, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2398467063903809, + "rewards/margins": 1.5666049718856812, + "rewards/rejected": -2.8064517974853516, + "step": 5752 + }, + { + "epoch": 0.66, + "learning_rate": 1.0257520777244528e-07, + "logits/chosen": -2.838191509246826, + "logits/rejected": -2.80674409866333, + "logps/chosen": -212.5035400390625, + "logps/rejected": -294.6371765136719, + "loss": 0.4872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7987270951271057, + "rewards/margins": 1.7270948886871338, + "rewards/rejected": -2.5258219242095947, + "step": 5753 + }, + { + "epoch": 0.66, + "learning_rate": 1.0254009130282102e-07, + "logits/chosen": -2.096773862838745, + "logits/rejected": -2.065037488937378, + "logps/chosen": -390.42864990234375, + "logps/rejected": -247.386962890625, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49891549348831177, + "rewards/margins": 2.9379265308380127, + "rewards/rejected": -3.436842441558838, + "step": 5754 + }, + { + "epoch": 0.66, + "learning_rate": 1.0250497483319677e-07, + "logits/chosen": -2.10544753074646, + "logits/rejected": -2.0663156509399414, + "logps/chosen": -200.29908752441406, + "logps/rejected": -278.55517578125, + "loss": 0.3349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9470057487487793, + "rewards/margins": 1.6992268562316895, + "rewards/rejected": -3.6462326049804688, + "step": 5755 + }, + { + "epoch": 0.66, + "learning_rate": 1.0246985836357251e-07, + "logits/chosen": -2.7304868698120117, + "logits/rejected": -2.610189437866211, + "logps/chosen": -239.25076293945312, + "logps/rejected": -296.9072265625, + "loss": 0.151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.831462025642395, + "rewards/margins": 2.8475236892700195, + "rewards/rejected": -3.678985595703125, + "step": 5756 + }, + { + "epoch": 0.66, + "learning_rate": 1.0243474189394825e-07, + "logits/chosen": -2.182795286178589, + "logits/rejected": -2.3067376613616943, + "logps/chosen": -216.99905395507812, + "logps/rejected": -340.1896667480469, + "loss": 0.7603, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2817652225494385, + "rewards/margins": 1.8306169509887695, + "rewards/rejected": -3.112382411956787, + "step": 5757 + }, + { + "epoch": 0.66, + "learning_rate": 1.02399625424324e-07, + "logits/chosen": -2.3828046321868896, + "logits/rejected": -2.301320791244507, + "logps/chosen": -230.26988220214844, + "logps/rejected": -220.81466674804688, + "loss": 0.7569, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5240811109542847, + "rewards/margins": 1.1060924530029297, + "rewards/rejected": -2.630173683166504, + "step": 5758 + }, + { + "epoch": 0.66, + "learning_rate": 1.0236450895469976e-07, + "logits/chosen": -2.603799819946289, + "logits/rejected": -2.6482486724853516, + "logps/chosen": -335.5642395019531, + "logps/rejected": -343.1947021484375, + "loss": 0.562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.726197361946106, + "rewards/margins": 1.3762643337249756, + "rewards/rejected": -2.102461814880371, + "step": 5759 + }, + { + "epoch": 0.66, + "learning_rate": 1.023293924850755e-07, + "logits/chosen": -2.3432931900024414, + "logits/rejected": -2.5522899627685547, + "logps/chosen": -215.5906982421875, + "logps/rejected": -258.0415954589844, + "loss": 0.2314, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1663219928741455, + "rewards/margins": 2.170258045196533, + "rewards/rejected": -3.336580276489258, + "step": 5760 + }, + { + "epoch": 0.66, + "learning_rate": 1.0229427601545124e-07, + "logits/chosen": -2.04732084274292, + "logits/rejected": -2.1141550540924072, + "logps/chosen": -460.85345458984375, + "logps/rejected": -327.984619140625, + "loss": 0.122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6200993061065674, + "rewards/margins": 3.078240394592285, + "rewards/rejected": -3.6983397006988525, + "step": 5761 + }, + { + "epoch": 0.66, + "learning_rate": 1.0225915954582698e-07, + "logits/chosen": -2.7802352905273438, + "logits/rejected": -2.594757318496704, + "logps/chosen": -161.83447265625, + "logps/rejected": -166.03936767578125, + "loss": 0.4385, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6712431907653809, + "rewards/margins": 2.270164966583252, + "rewards/rejected": -2.941408157348633, + "step": 5762 + }, + { + "epoch": 0.66, + "learning_rate": 1.0222404307620272e-07, + "logits/chosen": -2.531865358352661, + "logits/rejected": -2.6272354125976562, + "logps/chosen": -306.8194580078125, + "logps/rejected": -191.28579711914062, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5398118495941162, + "rewards/margins": 2.694230556488037, + "rewards/rejected": -3.2340424060821533, + "step": 5763 + }, + { + "epoch": 0.66, + "learning_rate": 1.0218892660657849e-07, + "logits/chosen": -2.1430463790893555, + "logits/rejected": -2.511880874633789, + "logps/chosen": -450.4476623535156, + "logps/rejected": -309.68182373046875, + "loss": 0.2999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4979545474052429, + "rewards/margins": 3.0586416721343994, + "rewards/rejected": -3.556596040725708, + "step": 5764 + }, + { + "epoch": 0.66, + "learning_rate": 1.0215381013695423e-07, + "logits/chosen": -2.600262403488159, + "logits/rejected": -2.5722265243530273, + "logps/chosen": -131.06051635742188, + "logps/rejected": -96.82307434082031, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.575637936592102, + "rewards/margins": 1.8516716957092285, + "rewards/rejected": -2.427309513092041, + "step": 5765 + }, + { + "epoch": 0.66, + "learning_rate": 1.0211869366732997e-07, + "logits/chosen": -2.0984981060028076, + "logits/rejected": -2.3929429054260254, + "logps/chosen": -454.6372375488281, + "logps/rejected": -233.19700622558594, + "loss": 0.4284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5476158261299133, + "rewards/margins": 1.3563112020492554, + "rewards/rejected": -1.9039270877838135, + "step": 5766 + }, + { + "epoch": 0.66, + "learning_rate": 1.0208357719770571e-07, + "logits/chosen": -2.103215217590332, + "logits/rejected": -1.941687822341919, + "logps/chosen": -310.0687561035156, + "logps/rejected": -478.34228515625, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.052842848002910614, + "rewards/margins": 4.574426651000977, + "rewards/rejected": -4.5215840339660645, + "step": 5767 + }, + { + "epoch": 0.66, + "learning_rate": 1.0204846072808147e-07, + "logits/chosen": -2.169823169708252, + "logits/rejected": -2.4504194259643555, + "logps/chosen": -303.6185607910156, + "logps/rejected": -215.54022216796875, + "loss": 0.447, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.528468132019043, + "rewards/margins": 1.6027257442474365, + "rewards/rejected": -3.1311936378479004, + "step": 5768 + }, + { + "epoch": 0.67, + "learning_rate": 1.0201334425845721e-07, + "logits/chosen": -2.5440683364868164, + "logits/rejected": -2.3233137130737305, + "logps/chosen": -190.89022827148438, + "logps/rejected": -253.24037170410156, + "loss": 0.5008, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5194852948188782, + "rewards/margins": 1.8669583797454834, + "rewards/rejected": -2.386443853378296, + "step": 5769 + }, + { + "epoch": 0.67, + "learning_rate": 1.0197822778883296e-07, + "logits/chosen": -1.8523482084274292, + "logits/rejected": -2.1972362995147705, + "logps/chosen": -356.4728088378906, + "logps/rejected": -258.375732421875, + "loss": 0.4656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9691612720489502, + "rewards/margins": 1.2059656381607056, + "rewards/rejected": -2.1751270294189453, + "step": 5770 + }, + { + "epoch": 0.67, + "learning_rate": 1.019431113192087e-07, + "logits/chosen": -2.366286516189575, + "logits/rejected": -2.1996541023254395, + "logps/chosen": -157.89764404296875, + "logps/rejected": -245.29981994628906, + "loss": 0.2668, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5076597332954407, + "rewards/margins": 2.43727445602417, + "rewards/rejected": -2.944934368133545, + "step": 5771 + }, + { + "epoch": 0.67, + "learning_rate": 1.0190799484958446e-07, + "logits/chosen": -2.209228277206421, + "logits/rejected": -2.2646780014038086, + "logps/chosen": -354.840576171875, + "logps/rejected": -268.95166015625, + "loss": 0.9996, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.1572391986846924, + "rewards/margins": -0.26805347204208374, + "rewards/rejected": -1.8891856670379639, + "step": 5772 + }, + { + "epoch": 0.67, + "learning_rate": 1.018728783799602e-07, + "logits/chosen": -1.8258512020111084, + "logits/rejected": -1.703895926475525, + "logps/chosen": -508.70849609375, + "logps/rejected": -443.2032165527344, + "loss": 0.2237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7586466670036316, + "rewards/margins": 2.4679274559020996, + "rewards/rejected": -3.226574420928955, + "step": 5773 + }, + { + "epoch": 0.67, + "learning_rate": 1.0183776191033594e-07, + "logits/chosen": -2.1507797241210938, + "logits/rejected": -2.1305453777313232, + "logps/chosen": -215.55332946777344, + "logps/rejected": -359.96246337890625, + "loss": 0.3192, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7736083269119263, + "rewards/margins": 3.5791079998016357, + "rewards/rejected": -4.352715969085693, + "step": 5774 + }, + { + "epoch": 0.67, + "learning_rate": 1.0180264544071168e-07, + "logits/chosen": -2.432033061981201, + "logits/rejected": -2.363323211669922, + "logps/chosen": -127.31592559814453, + "logps/rejected": -141.54592895507812, + "loss": 0.6823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9855304956436157, + "rewards/margins": 1.2565829753875732, + "rewards/rejected": -2.2421133518218994, + "step": 5775 + }, + { + "epoch": 0.67, + "learning_rate": 1.0176752897108744e-07, + "logits/chosen": -1.968848705291748, + "logits/rejected": -2.481837749481201, + "logps/chosen": -390.8147277832031, + "logps/rejected": -294.9786376953125, + "loss": 0.503, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.012742519378662, + "rewards/margins": 1.1004981994628906, + "rewards/rejected": -2.1132407188415527, + "step": 5776 + }, + { + "epoch": 0.67, + "learning_rate": 1.0173241250146318e-07, + "logits/chosen": -1.9504578113555908, + "logits/rejected": -2.1193652153015137, + "logps/chosen": -336.303466796875, + "logps/rejected": -318.717041015625, + "loss": 0.372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9595567584037781, + "rewards/margins": 1.8188047409057617, + "rewards/rejected": -2.7783613204956055, + "step": 5777 + }, + { + "epoch": 0.67, + "learning_rate": 1.0169729603183893e-07, + "logits/chosen": -2.4715211391448975, + "logits/rejected": -2.5372023582458496, + "logps/chosen": -350.3348388671875, + "logps/rejected": -293.1446228027344, + "loss": 1.1454, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4838535785675049, + "rewards/margins": 0.9814044237136841, + "rewards/rejected": -2.4652581214904785, + "step": 5778 + }, + { + "epoch": 0.67, + "learning_rate": 1.0166217956221467e-07, + "logits/chosen": -2.4898762702941895, + "logits/rejected": -2.4396865367889404, + "logps/chosen": -141.17762756347656, + "logps/rejected": -157.96865844726562, + "loss": 0.2567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.305696964263916, + "rewards/margins": 2.310729503631592, + "rewards/rejected": -2.616426467895508, + "step": 5779 + }, + { + "epoch": 0.67, + "learning_rate": 1.0162706309259042e-07, + "logits/chosen": -2.2673020362854004, + "logits/rejected": -2.413107395172119, + "logps/chosen": -223.58799743652344, + "logps/rejected": -214.6735382080078, + "loss": 0.6989, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7975004315376282, + "rewards/margins": 2.7421610355377197, + "rewards/rejected": -3.5396616458892822, + "step": 5780 + }, + { + "epoch": 0.67, + "learning_rate": 1.0159194662296617e-07, + "logits/chosen": -2.4149932861328125, + "logits/rejected": -2.498514413833618, + "logps/chosen": -199.2200469970703, + "logps/rejected": -245.16015625, + "loss": 0.8397, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.093600273132324, + "rewards/margins": 2.1141269207000732, + "rewards/rejected": -4.207727432250977, + "step": 5781 + }, + { + "epoch": 0.67, + "learning_rate": 1.0155683015334191e-07, + "logits/chosen": -2.9589521884918213, + "logits/rejected": -2.7621874809265137, + "logps/chosen": -341.3348693847656, + "logps/rejected": -303.90069580078125, + "loss": 0.1706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7897987961769104, + "rewards/margins": 3.614424228668213, + "rewards/rejected": -4.4042229652404785, + "step": 5782 + }, + { + "epoch": 0.67, + "learning_rate": 1.0152171368371765e-07, + "logits/chosen": -2.543236017227173, + "logits/rejected": -2.6027748584747314, + "logps/chosen": -351.3167419433594, + "logps/rejected": -208.47848510742188, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1971062570810318, + "rewards/margins": 2.464439868927002, + "rewards/rejected": -2.661546230316162, + "step": 5783 + }, + { + "epoch": 0.67, + "learning_rate": 1.0148659721409341e-07, + "logits/chosen": -2.4043564796447754, + "logits/rejected": -2.08957839012146, + "logps/chosen": -223.07969665527344, + "logps/rejected": -331.665283203125, + "loss": 0.1149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.873035192489624, + "rewards/margins": 3.0127406120300293, + "rewards/rejected": -3.8857758045196533, + "step": 5784 + }, + { + "epoch": 0.67, + "learning_rate": 1.0145148074446915e-07, + "logits/chosen": -2.0386579036712646, + "logits/rejected": -1.9393212795257568, + "logps/chosen": -171.784912109375, + "logps/rejected": -283.13287353515625, + "loss": 0.2447, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1767466068267822, + "rewards/margins": 2.3034725189208984, + "rewards/rejected": -3.4802193641662598, + "step": 5785 + }, + { + "epoch": 0.67, + "learning_rate": 1.0141636427484489e-07, + "logits/chosen": -2.557353973388672, + "logits/rejected": -2.874896764755249, + "logps/chosen": -317.61517333984375, + "logps/rejected": -234.8154296875, + "loss": 0.5099, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5640363693237305, + "rewards/margins": 2.5912516117095947, + "rewards/rejected": -4.155287742614746, + "step": 5786 + }, + { + "epoch": 0.67, + "learning_rate": 1.0138124780522064e-07, + "logits/chosen": -3.0023176670074463, + "logits/rejected": -2.9590015411376953, + "logps/chosen": -187.7206573486328, + "logps/rejected": -262.4122619628906, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.329331636428833, + "rewards/margins": 2.9573707580566406, + "rewards/rejected": -4.2867021560668945, + "step": 5787 + }, + { + "epoch": 0.67, + "learning_rate": 1.013461313355964e-07, + "logits/chosen": -2.24813175201416, + "logits/rejected": -2.214661121368408, + "logps/chosen": -227.33360290527344, + "logps/rejected": -348.6695251464844, + "loss": 0.3431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6006250381469727, + "rewards/margins": 2.6279749870300293, + "rewards/rejected": -3.228600025177002, + "step": 5788 + }, + { + "epoch": 0.67, + "learning_rate": 1.0131101486597214e-07, + "logits/chosen": -2.4367499351501465, + "logits/rejected": -2.4905526638031006, + "logps/chosen": -163.49131774902344, + "logps/rejected": -248.06796264648438, + "loss": 0.2593, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6885973811149597, + "rewards/margins": 3.241292953491211, + "rewards/rejected": -3.9298906326293945, + "step": 5789 + }, + { + "epoch": 0.67, + "learning_rate": 1.0127589839634788e-07, + "logits/chosen": -2.0364062786102295, + "logits/rejected": -2.046666145324707, + "logps/chosen": -192.34494018554688, + "logps/rejected": -256.6676025390625, + "loss": 0.5121, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5375380516052246, + "rewards/margins": 1.6641196012496948, + "rewards/rejected": -3.20165753364563, + "step": 5790 + }, + { + "epoch": 0.67, + "learning_rate": 1.0124078192672362e-07, + "logits/chosen": -2.0723938941955566, + "logits/rejected": -2.1776323318481445, + "logps/chosen": -180.7769317626953, + "logps/rejected": -183.048828125, + "loss": 0.3219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4249597191810608, + "rewards/margins": 2.0448246002197266, + "rewards/rejected": -2.4697842597961426, + "step": 5791 + }, + { + "epoch": 0.67, + "learning_rate": 1.0120566545709939e-07, + "logits/chosen": -1.8932777643203735, + "logits/rejected": -1.5869066715240479, + "logps/chosen": -259.4695129394531, + "logps/rejected": -270.1017150878906, + "loss": 0.5838, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3925509452819824, + "rewards/margins": 1.3271286487579346, + "rewards/rejected": -2.719679355621338, + "step": 5792 + }, + { + "epoch": 0.67, + "learning_rate": 1.0117054898747513e-07, + "logits/chosen": -2.3413829803466797, + "logits/rejected": -2.4371066093444824, + "logps/chosen": -445.5179138183594, + "logps/rejected": -382.7377014160156, + "loss": 0.5332, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0778610706329346, + "rewards/margins": 1.7772059440612793, + "rewards/rejected": -2.855067014694214, + "step": 5793 + }, + { + "epoch": 0.67, + "learning_rate": 1.0113543251785087e-07, + "logits/chosen": -2.7010226249694824, + "logits/rejected": -2.8642125129699707, + "logps/chosen": -462.3601989746094, + "logps/rejected": -356.4249572753906, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7674515247344971, + "rewards/margins": 3.2559828758239746, + "rewards/rejected": -4.023434638977051, + "step": 5794 + }, + { + "epoch": 0.67, + "learning_rate": 1.0110031604822661e-07, + "logits/chosen": -2.318119764328003, + "logits/rejected": -2.611132860183716, + "logps/chosen": -427.9949035644531, + "logps/rejected": -323.0230712890625, + "loss": 0.31, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7257928252220154, + "rewards/margins": 2.8142971992492676, + "rewards/rejected": -3.5400900840759277, + "step": 5795 + }, + { + "epoch": 0.67, + "learning_rate": 1.0106519957860236e-07, + "logits/chosen": -2.388810873031616, + "logits/rejected": -2.342639446258545, + "logps/chosen": -284.4624938964844, + "logps/rejected": -202.46961975097656, + "loss": 0.1773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.550275444984436, + "rewards/margins": 3.1946542263031006, + "rewards/rejected": -3.744929790496826, + "step": 5796 + }, + { + "epoch": 0.67, + "learning_rate": 1.010300831089781e-07, + "logits/chosen": -1.6645551919937134, + "logits/rejected": -1.4902710914611816, + "logps/chosen": -340.2247009277344, + "logps/rejected": -436.4329833984375, + "loss": 0.6005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22605226933956146, + "rewards/margins": 0.6608957052230835, + "rewards/rejected": -0.8869479894638062, + "step": 5797 + }, + { + "epoch": 0.67, + "learning_rate": 1.0099496663935386e-07, + "logits/chosen": -2.011002540588379, + "logits/rejected": -2.162799119949341, + "logps/chosen": -363.4708251953125, + "logps/rejected": -332.3183288574219, + "loss": 0.4479, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9816151261329651, + "rewards/margins": 1.0177018642425537, + "rewards/rejected": -1.9993170499801636, + "step": 5798 + }, + { + "epoch": 0.67, + "learning_rate": 1.009598501697296e-07, + "logits/chosen": -2.1650068759918213, + "logits/rejected": -2.1103806495666504, + "logps/chosen": -334.1598205566406, + "logps/rejected": -203.33389282226562, + "loss": 0.657, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2594349384307861, + "rewards/margins": 0.3821430504322052, + "rewards/rejected": -1.641577959060669, + "step": 5799 + }, + { + "epoch": 0.67, + "learning_rate": 1.0092473370010535e-07, + "logits/chosen": -2.5013251304626465, + "logits/rejected": -2.6360795497894287, + "logps/chosen": -333.14813232421875, + "logps/rejected": -197.67068481445312, + "loss": 0.1766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38291099667549133, + "rewards/margins": 2.327216148376465, + "rewards/rejected": -2.710127353668213, + "step": 5800 + }, + { + "epoch": 0.67, + "learning_rate": 1.0088961723048109e-07, + "logits/chosen": -2.6074204444885254, + "logits/rejected": -2.47025465965271, + "logps/chosen": -227.99916076660156, + "logps/rejected": -224.25033569335938, + "loss": 0.4578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9914069175720215, + "rewards/margins": 1.94358491897583, + "rewards/rejected": -2.9349915981292725, + "step": 5801 + }, + { + "epoch": 0.67, + "learning_rate": 1.0085450076085683e-07, + "logits/chosen": -2.345914840698242, + "logits/rejected": -2.2995972633361816, + "logps/chosen": -284.1635437011719, + "logps/rejected": -337.15313720703125, + "loss": 0.1945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01574118807911873, + "rewards/margins": 3.0465548038482666, + "rewards/rejected": -3.030813694000244, + "step": 5802 + }, + { + "epoch": 0.67, + "learning_rate": 1.0081938429123257e-07, + "logits/chosen": -1.9840166568756104, + "logits/rejected": -2.0114009380340576, + "logps/chosen": -181.5311279296875, + "logps/rejected": -242.80532836914062, + "loss": 0.8773, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8423576354980469, + "rewards/margins": 1.8306379318237305, + "rewards/rejected": -2.6729955673217773, + "step": 5803 + }, + { + "epoch": 0.67, + "learning_rate": 1.0078426782160834e-07, + "logits/chosen": -2.4374637603759766, + "logits/rejected": -2.437361001968384, + "logps/chosen": -267.7131652832031, + "logps/rejected": -274.6151123046875, + "loss": 0.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6826760768890381, + "rewards/margins": 3.0142977237701416, + "rewards/rejected": -3.6969738006591797, + "step": 5804 + }, + { + "epoch": 0.67, + "learning_rate": 1.0074915135198408e-07, + "logits/chosen": -2.3488805294036865, + "logits/rejected": -2.323212146759033, + "logps/chosen": -174.93179321289062, + "logps/rejected": -263.0335998535156, + "loss": 0.4807, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.578291654586792, + "rewards/margins": 1.519524335861206, + "rewards/rejected": -2.097815990447998, + "step": 5805 + }, + { + "epoch": 0.67, + "learning_rate": 1.0071403488235982e-07, + "logits/chosen": -1.9262744188308716, + "logits/rejected": -1.6877388954162598, + "logps/chosen": -363.0496520996094, + "logps/rejected": -522.5819091796875, + "loss": 0.4204, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6882649660110474, + "rewards/margins": 2.373194932937622, + "rewards/rejected": -3.06145977973938, + "step": 5806 + }, + { + "epoch": 0.67, + "learning_rate": 1.0067891841273556e-07, + "logits/chosen": -2.5547232627868652, + "logits/rejected": -2.5805556774139404, + "logps/chosen": -172.67367553710938, + "logps/rejected": -123.34284973144531, + "loss": 0.765, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7209538221359253, + "rewards/margins": 0.43889114260673523, + "rewards/rejected": -2.1598451137542725, + "step": 5807 + }, + { + "epoch": 0.67, + "learning_rate": 1.0064380194311133e-07, + "logits/chosen": -2.3354103565216064, + "logits/rejected": -2.288517713546753, + "logps/chosen": -302.26141357421875, + "logps/rejected": -420.87945556640625, + "loss": 0.161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6470586061477661, + "rewards/margins": 4.2209391593933105, + "rewards/rejected": -5.867997646331787, + "step": 5808 + }, + { + "epoch": 0.67, + "learning_rate": 1.0060868547348707e-07, + "logits/chosen": -2.6656224727630615, + "logits/rejected": -2.716975688934326, + "logps/chosen": -172.0455322265625, + "logps/rejected": -146.86886596679688, + "loss": 0.3393, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7674158811569214, + "rewards/margins": 1.6164729595184326, + "rewards/rejected": -2.3838887214660645, + "step": 5809 + }, + { + "epoch": 0.67, + "learning_rate": 1.0057356900386281e-07, + "logits/chosen": -2.0928964614868164, + "logits/rejected": -2.090266227722168, + "logps/chosen": -189.58154296875, + "logps/rejected": -263.04437255859375, + "loss": 0.3753, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7825819253921509, + "rewards/margins": 2.9724063873291016, + "rewards/rejected": -3.754988431930542, + "step": 5810 + }, + { + "epoch": 0.67, + "learning_rate": 1.0053845253423855e-07, + "logits/chosen": -2.5593314170837402, + "logits/rejected": -2.811067581176758, + "logps/chosen": -224.5653839111328, + "logps/rejected": -156.93878173828125, + "loss": 0.2555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5580888390541077, + "rewards/margins": 1.8424553871154785, + "rewards/rejected": -2.4005441665649414, + "step": 5811 + }, + { + "epoch": 0.67, + "learning_rate": 1.0050333606461429e-07, + "logits/chosen": -1.9107921123504639, + "logits/rejected": -2.260237693786621, + "logps/chosen": -255.713134765625, + "logps/rejected": -217.40750122070312, + "loss": 0.1216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12262755632400513, + "rewards/margins": 3.366689920425415, + "rewards/rejected": -3.2440624237060547, + "step": 5812 + }, + { + "epoch": 0.67, + "learning_rate": 1.0046821959499005e-07, + "logits/chosen": -2.6910674571990967, + "logits/rejected": -2.681023120880127, + "logps/chosen": -226.87367248535156, + "logps/rejected": -216.6140899658203, + "loss": 0.5609, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5244983434677124, + "rewards/margins": 1.7417902946472168, + "rewards/rejected": -3.2662885189056396, + "step": 5813 + }, + { + "epoch": 0.67, + "learning_rate": 1.0043310312536579e-07, + "logits/chosen": -2.304356098175049, + "logits/rejected": -2.4897842407226562, + "logps/chosen": -430.0655517578125, + "logps/rejected": -252.20339965820312, + "loss": 0.3191, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3755683898925781, + "rewards/margins": 2.539069890975952, + "rewards/rejected": -3.9146382808685303, + "step": 5814 + }, + { + "epoch": 0.67, + "learning_rate": 1.0039798665574154e-07, + "logits/chosen": -2.6202173233032227, + "logits/rejected": -2.738725185394287, + "logps/chosen": -117.13456726074219, + "logps/rejected": -173.1875762939453, + "loss": 0.3241, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1643667221069336, + "rewards/margins": 1.8836064338684082, + "rewards/rejected": -3.047973155975342, + "step": 5815 + }, + { + "epoch": 0.67, + "learning_rate": 1.0036287018611728e-07, + "logits/chosen": -2.577619791030884, + "logits/rejected": -2.358668327331543, + "logps/chosen": -323.02813720703125, + "logps/rejected": -323.84942626953125, + "loss": 0.4911, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9872258901596069, + "rewards/margins": 3.076256275177002, + "rewards/rejected": -4.063482284545898, + "step": 5816 + }, + { + "epoch": 0.67, + "learning_rate": 1.0032775371649303e-07, + "logits/chosen": -2.154141426086426, + "logits/rejected": -2.204366683959961, + "logps/chosen": -286.3234558105469, + "logps/rejected": -230.71170043945312, + "loss": 0.5676, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9723010063171387, + "rewards/margins": 0.8447703123092651, + "rewards/rejected": -1.8170711994171143, + "step": 5817 + }, + { + "epoch": 0.67, + "learning_rate": 1.0029263724686878e-07, + "logits/chosen": -2.101533889770508, + "logits/rejected": -2.3262939453125, + "logps/chosen": -284.7434997558594, + "logps/rejected": -227.63800048828125, + "loss": 0.4205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9547522664070129, + "rewards/margins": 0.8372941017150879, + "rewards/rejected": -1.792046308517456, + "step": 5818 + }, + { + "epoch": 0.67, + "learning_rate": 1.0025752077724452e-07, + "logits/chosen": -2.412888526916504, + "logits/rejected": -2.5921974182128906, + "logps/chosen": -508.0835266113281, + "logps/rejected": -289.2279968261719, + "loss": 0.1258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5935472249984741, + "rewards/margins": 3.740022659301758, + "rewards/rejected": -4.33357048034668, + "step": 5819 + }, + { + "epoch": 0.67, + "learning_rate": 1.0022240430762026e-07, + "logits/chosen": -2.876749277114868, + "logits/rejected": -2.8974130153656006, + "logps/chosen": -124.74503326416016, + "logps/rejected": -187.21266174316406, + "loss": 0.2812, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19040901958942413, + "rewards/margins": 2.184680938720703, + "rewards/rejected": -2.3750901222229004, + "step": 5820 + }, + { + "epoch": 0.67, + "learning_rate": 1.0018728783799602e-07, + "logits/chosen": -1.9879119396209717, + "logits/rejected": -1.8886771202087402, + "logps/chosen": -433.78558349609375, + "logps/rejected": -400.4073486328125, + "loss": 0.4017, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.727816104888916, + "rewards/margins": 2.361711263656616, + "rewards/rejected": -3.0895276069641113, + "step": 5821 + }, + { + "epoch": 0.67, + "learning_rate": 1.0015217136837176e-07, + "logits/chosen": -2.2142348289489746, + "logits/rejected": -2.1573328971862793, + "logps/chosen": -228.61856079101562, + "logps/rejected": -235.3550567626953, + "loss": 0.7228, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2906259298324585, + "rewards/margins": 1.2232048511505127, + "rewards/rejected": -2.5138306617736816, + "step": 5822 + }, + { + "epoch": 0.67, + "learning_rate": 1.001170548987475e-07, + "logits/chosen": -2.2179393768310547, + "logits/rejected": -2.660137891769409, + "logps/chosen": -382.8157958984375, + "logps/rejected": -354.5162353515625, + "loss": 0.5578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8731977343559265, + "rewards/margins": 3.329010486602783, + "rewards/rejected": -4.202208518981934, + "step": 5823 + }, + { + "epoch": 0.67, + "learning_rate": 1.0008193842912325e-07, + "logits/chosen": -2.3447184562683105, + "logits/rejected": -2.16922664642334, + "logps/chosen": -182.07061767578125, + "logps/rejected": -245.0863037109375, + "loss": 0.2562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5548027157783508, + "rewards/margins": 1.61906898021698, + "rewards/rejected": -2.1738715171813965, + "step": 5824 + }, + { + "epoch": 0.67, + "learning_rate": 1.0004682195949901e-07, + "logits/chosen": -2.4906888008117676, + "logits/rejected": -2.3971807956695557, + "logps/chosen": -231.6450958251953, + "logps/rejected": -291.85235595703125, + "loss": 0.5127, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0562688112258911, + "rewards/margins": 2.5288150310516357, + "rewards/rejected": -3.5850837230682373, + "step": 5825 + }, + { + "epoch": 0.67, + "learning_rate": 1.0001170548987475e-07, + "logits/chosen": -2.6778454780578613, + "logits/rejected": -2.8040361404418945, + "logps/chosen": -248.83389282226562, + "logps/rejected": -174.6170654296875, + "loss": 0.4081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20327290892601013, + "rewards/margins": 1.4420571327209473, + "rewards/rejected": -1.6453299522399902, + "step": 5826 + }, + { + "epoch": 0.67, + "learning_rate": 9.99765890202505e-08, + "logits/chosen": -2.1897969245910645, + "logits/rejected": -2.325848340988159, + "logps/chosen": -219.1000213623047, + "logps/rejected": -189.06851196289062, + "loss": 0.1484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1313297003507614, + "rewards/margins": 3.2975144386291504, + "rewards/rejected": -3.166184902191162, + "step": 5827 + }, + { + "epoch": 0.67, + "learning_rate": 9.994147255062623e-08, + "logits/chosen": -2.6934597492218018, + "logits/rejected": -2.6888623237609863, + "logps/chosen": -229.66683959960938, + "logps/rejected": -228.57095336914062, + "loss": 0.4421, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5761322975158691, + "rewards/margins": 1.5012733936309814, + "rewards/rejected": -3.0774056911468506, + "step": 5828 + }, + { + "epoch": 0.67, + "learning_rate": 9.990635608100199e-08, + "logits/chosen": -2.0821003913879395, + "logits/rejected": -2.0682437419891357, + "logps/chosen": -607.0096435546875, + "logps/rejected": -344.0458068847656, + "loss": 0.1599, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0308988094329834, + "rewards/margins": 2.4780211448669434, + "rewards/rejected": -3.508920192718506, + "step": 5829 + }, + { + "epoch": 0.67, + "learning_rate": 9.987123961137773e-08, + "logits/chosen": -1.8641406297683716, + "logits/rejected": -1.953963041305542, + "logps/chosen": -216.04315185546875, + "logps/rejected": -214.38523864746094, + "loss": 0.395, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.16926109790802, + "rewards/margins": 2.1929025650024414, + "rewards/rejected": -3.362163543701172, + "step": 5830 + }, + { + "epoch": 0.67, + "learning_rate": 9.983612314175347e-08, + "logits/chosen": -2.5036087036132812, + "logits/rejected": -2.32686448097229, + "logps/chosen": -233.43771362304688, + "logps/rejected": -259.1863708496094, + "loss": 0.3167, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9226987957954407, + "rewards/margins": 2.849259376525879, + "rewards/rejected": -3.771958112716675, + "step": 5831 + }, + { + "epoch": 0.67, + "learning_rate": 9.980100667212922e-08, + "logits/chosen": -2.29482364654541, + "logits/rejected": -2.382972478866577, + "logps/chosen": -345.36859130859375, + "logps/rejected": -309.82159423828125, + "loss": 0.549, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.315504550933838, + "rewards/margins": 1.0359902381896973, + "rewards/rejected": -2.351494789123535, + "step": 5832 + }, + { + "epoch": 0.67, + "learning_rate": 9.976589020250498e-08, + "logits/chosen": -2.125720262527466, + "logits/rejected": -2.3740148544311523, + "logps/chosen": -279.8603210449219, + "logps/rejected": -183.1529541015625, + "loss": 0.6415, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1984037160873413, + "rewards/margins": 0.3677480220794678, + "rewards/rejected": -1.5661516189575195, + "step": 5833 + }, + { + "epoch": 0.67, + "learning_rate": 9.973077373288072e-08, + "logits/chosen": -2.457681655883789, + "logits/rejected": -2.5843241214752197, + "logps/chosen": -256.34906005859375, + "logps/rejected": -283.82586669921875, + "loss": 0.3147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4532787799835205, + "rewards/margins": 2.234022617340088, + "rewards/rejected": -2.6873013973236084, + "step": 5834 + }, + { + "epoch": 0.67, + "learning_rate": 9.969565726325646e-08, + "logits/chosen": -2.644286870956421, + "logits/rejected": -2.8445777893066406, + "logps/chosen": -282.73193359375, + "logps/rejected": -314.9376525878906, + "loss": 0.3937, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.422476053237915, + "rewards/margins": 2.0059633255004883, + "rewards/rejected": -3.4284396171569824, + "step": 5835 + }, + { + "epoch": 0.67, + "learning_rate": 9.96605407936322e-08, + "logits/chosen": -2.5926015377044678, + "logits/rejected": -2.404954671859741, + "logps/chosen": -343.68096923828125, + "logps/rejected": -361.80804443359375, + "loss": 0.4269, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2192113399505615, + "rewards/margins": 3.4202446937561035, + "rewards/rejected": -4.639455795288086, + "step": 5836 + }, + { + "epoch": 0.67, + "learning_rate": 9.962542432400797e-08, + "logits/chosen": -2.4053564071655273, + "logits/rejected": -2.5371012687683105, + "logps/chosen": -411.634033203125, + "logps/rejected": -303.0355529785156, + "loss": 0.1107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10559970140457153, + "rewards/margins": 4.0463457107543945, + "rewards/rejected": -4.151945114135742, + "step": 5837 + }, + { + "epoch": 0.67, + "learning_rate": 9.959030785438371e-08, + "logits/chosen": -2.631190776824951, + "logits/rejected": -2.712791919708252, + "logps/chosen": -111.68368530273438, + "logps/rejected": -175.854736328125, + "loss": 0.167, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6056941747665405, + "rewards/margins": 3.18589186668396, + "rewards/rejected": -3.791586399078369, + "step": 5838 + }, + { + "epoch": 0.67, + "learning_rate": 9.955519138475945e-08, + "logits/chosen": -2.211214542388916, + "logits/rejected": -2.0451791286468506, + "logps/chosen": -395.52679443359375, + "logps/rejected": -455.8987121582031, + "loss": 0.4972, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4574626386165619, + "rewards/margins": 1.8790602684020996, + "rewards/rejected": -2.3365228176116943, + "step": 5839 + }, + { + "epoch": 0.67, + "learning_rate": 9.952007491513519e-08, + "logits/chosen": -1.9083116054534912, + "logits/rejected": -2.148108959197998, + "logps/chosen": -323.9768371582031, + "logps/rejected": -335.24945068359375, + "loss": 0.549, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5130279064178467, + "rewards/margins": 2.048982620239258, + "rewards/rejected": -3.5620107650756836, + "step": 5840 + }, + { + "epoch": 0.67, + "learning_rate": 9.948495844551094e-08, + "logits/chosen": -2.7893872261047363, + "logits/rejected": -2.523491621017456, + "logps/chosen": -320.56109619140625, + "logps/rejected": -256.9158630371094, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35102593898773193, + "rewards/margins": 2.224493980407715, + "rewards/rejected": -2.5755200386047363, + "step": 5841 + }, + { + "epoch": 0.67, + "learning_rate": 9.94498419758867e-08, + "logits/chosen": -2.2854785919189453, + "logits/rejected": -1.9920728206634521, + "logps/chosen": -375.69671630859375, + "logps/rejected": -476.099609375, + "loss": 0.1762, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2585437297821045, + "rewards/margins": 3.2528464794158936, + "rewards/rejected": -5.51138973236084, + "step": 5842 + }, + { + "epoch": 0.67, + "learning_rate": 9.941472550626244e-08, + "logits/chosen": -2.7191684246063232, + "logits/rejected": -2.5805795192718506, + "logps/chosen": -204.3260498046875, + "logps/rejected": -244.52113342285156, + "loss": 0.8919, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.07234787940979, + "rewards/margins": 0.39505070447921753, + "rewards/rejected": -2.4673986434936523, + "step": 5843 + }, + { + "epoch": 0.67, + "learning_rate": 9.937960903663818e-08, + "logits/chosen": -2.143219232559204, + "logits/rejected": -2.271467685699463, + "logps/chosen": -331.7947082519531, + "logps/rejected": -341.6451416015625, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22111913561820984, + "rewards/margins": 2.3687801361083984, + "rewards/rejected": -2.5898990631103516, + "step": 5844 + }, + { + "epoch": 0.67, + "learning_rate": 9.934449256701393e-08, + "logits/chosen": -2.376866340637207, + "logits/rejected": -2.5444130897521973, + "logps/chosen": -156.0446319580078, + "logps/rejected": -210.255859375, + "loss": 0.4355, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2954686880111694, + "rewards/margins": 1.5025452375411987, + "rewards/rejected": -2.798013687133789, + "step": 5845 + }, + { + "epoch": 0.67, + "learning_rate": 9.930937609738967e-08, + "logits/chosen": -2.636573314666748, + "logits/rejected": -2.7562904357910156, + "logps/chosen": -213.6164093017578, + "logps/rejected": -248.05764770507812, + "loss": 0.5955, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3136953115463257, + "rewards/margins": 0.9621015787124634, + "rewards/rejected": -2.275796890258789, + "step": 5846 + }, + { + "epoch": 0.67, + "learning_rate": 9.927425962776541e-08, + "logits/chosen": -2.689483880996704, + "logits/rejected": -2.6712915897369385, + "logps/chosen": -439.1947326660156, + "logps/rejected": -359.22381591796875, + "loss": 0.2355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6534265875816345, + "rewards/margins": 2.3475351333618164, + "rewards/rejected": -3.0009617805480957, + "step": 5847 + }, + { + "epoch": 0.67, + "learning_rate": 9.923914315814115e-08, + "logits/chosen": -2.580860137939453, + "logits/rejected": -2.6792359352111816, + "logps/chosen": -249.42449951171875, + "logps/rejected": -274.2841491699219, + "loss": 0.4018, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5852203369140625, + "rewards/margins": 1.4741036891937256, + "rewards/rejected": -3.059323787689209, + "step": 5848 + }, + { + "epoch": 0.67, + "learning_rate": 9.920402668851692e-08, + "logits/chosen": -2.2755672931671143, + "logits/rejected": -2.6021554470062256, + "logps/chosen": -223.35726928710938, + "logps/rejected": -233.2545928955078, + "loss": 0.4727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5244094133377075, + "rewards/margins": 2.1065592765808105, + "rewards/rejected": -2.6309685707092285, + "step": 5849 + }, + { + "epoch": 0.67, + "learning_rate": 9.916891021889266e-08, + "logits/chosen": -2.1838483810424805, + "logits/rejected": -2.3279805183410645, + "logps/chosen": -210.9104766845703, + "logps/rejected": -196.75338745117188, + "loss": 0.245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17687305808067322, + "rewards/margins": 2.30863881111145, + "rewards/rejected": -2.4855120182037354, + "step": 5850 + }, + { + "epoch": 0.67, + "learning_rate": 9.91337937492684e-08, + "logits/chosen": -2.2390406131744385, + "logits/rejected": -2.397803544998169, + "logps/chosen": -255.7369384765625, + "logps/rejected": -303.3858337402344, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5907735824584961, + "rewards/margins": 2.869858741760254, + "rewards/rejected": -3.46063232421875, + "step": 5851 + }, + { + "epoch": 0.67, + "learning_rate": 9.909867727964414e-08, + "logits/chosen": -2.6970765590667725, + "logits/rejected": -2.45837664604187, + "logps/chosen": -238.12149047851562, + "logps/rejected": -229.48733520507812, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8949124217033386, + "rewards/margins": 2.2176568508148193, + "rewards/rejected": -3.1125693321228027, + "step": 5852 + }, + { + "epoch": 0.67, + "learning_rate": 9.906356081001991e-08, + "logits/chosen": -2.4984750747680664, + "logits/rejected": -2.7338409423828125, + "logps/chosen": -264.2468566894531, + "logps/rejected": -155.5898895263672, + "loss": 0.772, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1317110061645508, + "rewards/margins": 0.8484952449798584, + "rewards/rejected": -1.98020601272583, + "step": 5853 + }, + { + "epoch": 0.67, + "learning_rate": 9.902844434039565e-08, + "logits/chosen": -2.653135299682617, + "logits/rejected": -2.524137496948242, + "logps/chosen": -106.94476318359375, + "logps/rejected": -181.14382934570312, + "loss": 0.3301, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47966673970222473, + "rewards/margins": 2.6057119369506836, + "rewards/rejected": -3.085378646850586, + "step": 5854 + }, + { + "epoch": 0.67, + "learning_rate": 9.899332787077139e-08, + "logits/chosen": -2.3991198539733887, + "logits/rejected": -2.671229600906372, + "logps/chosen": -356.1474609375, + "logps/rejected": -190.0288543701172, + "loss": 0.1778, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0853288173675537, + "rewards/margins": 2.936403751373291, + "rewards/rejected": -4.021732330322266, + "step": 5855 + }, + { + "epoch": 0.68, + "learning_rate": 9.895821140114713e-08, + "logits/chosen": -2.339034080505371, + "logits/rejected": -2.4099948406219482, + "logps/chosen": -156.89500427246094, + "logps/rejected": -214.2078857421875, + "loss": 0.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5128605961799622, + "rewards/margins": 2.5492115020751953, + "rewards/rejected": -3.062072277069092, + "step": 5856 + }, + { + "epoch": 0.68, + "learning_rate": 9.892309493152288e-08, + "logits/chosen": -2.378849744796753, + "logits/rejected": -2.295527219772339, + "logps/chosen": -335.2061767578125, + "logps/rejected": -296.38897705078125, + "loss": 0.5012, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05511057376861572, + "rewards/margins": 1.6167824268341064, + "rewards/rejected": -1.5616719722747803, + "step": 5857 + }, + { + "epoch": 0.68, + "learning_rate": 9.888797846189862e-08, + "logits/chosen": -2.328953742980957, + "logits/rejected": -2.3770952224731445, + "logps/chosen": -333.0902099609375, + "logps/rejected": -317.4951171875, + "loss": 0.2163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6872669458389282, + "rewards/margins": 1.9220763444900513, + "rewards/rejected": -2.6093432903289795, + "step": 5858 + }, + { + "epoch": 0.68, + "learning_rate": 9.885286199227438e-08, + "logits/chosen": -2.6191067695617676, + "logits/rejected": -2.4679527282714844, + "logps/chosen": -180.22344970703125, + "logps/rejected": -365.866455078125, + "loss": 0.3558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7807741761207581, + "rewards/margins": 2.6295857429504395, + "rewards/rejected": -3.4103598594665527, + "step": 5859 + }, + { + "epoch": 0.68, + "learning_rate": 9.881774552265012e-08, + "logits/chosen": -2.4070286750793457, + "logits/rejected": -2.5253746509552, + "logps/chosen": -329.89483642578125, + "logps/rejected": -281.4985656738281, + "loss": 0.6588, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4529600143432617, + "rewards/margins": 1.4082236289978027, + "rewards/rejected": -1.8611836433410645, + "step": 5860 + }, + { + "epoch": 0.68, + "learning_rate": 9.878262905302586e-08, + "logits/chosen": -2.431056499481201, + "logits/rejected": -2.7049784660339355, + "logps/chosen": -207.1802215576172, + "logps/rejected": -183.36402893066406, + "loss": 0.2658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.520931601524353, + "rewards/margins": 2.637509346008301, + "rewards/rejected": -3.1584410667419434, + "step": 5861 + }, + { + "epoch": 0.68, + "learning_rate": 9.874751258340161e-08, + "logits/chosen": -2.6588551998138428, + "logits/rejected": -2.656928300857544, + "logps/chosen": -231.4663543701172, + "logps/rejected": -262.3645324707031, + "loss": 0.1794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6305428743362427, + "rewards/margins": 2.4801526069641113, + "rewards/rejected": -3.1106953620910645, + "step": 5862 + }, + { + "epoch": 0.68, + "learning_rate": 9.871239611377735e-08, + "logits/chosen": -2.6300082206726074, + "logits/rejected": -2.6879634857177734, + "logps/chosen": -229.76629638671875, + "logps/rejected": -304.06494140625, + "loss": 0.1743, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3029382526874542, + "rewards/margins": 3.6901955604553223, + "rewards/rejected": -3.993134021759033, + "step": 5863 + }, + { + "epoch": 0.68, + "learning_rate": 9.86772796441531e-08, + "logits/chosen": -2.4665188789367676, + "logits/rejected": -2.2146565914154053, + "logps/chosen": -371.21258544921875, + "logps/rejected": -404.5174560546875, + "loss": 0.3304, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4399399757385254, + "rewards/margins": 3.415588140487671, + "rewards/rejected": -4.855527877807617, + "step": 5864 + }, + { + "epoch": 0.68, + "learning_rate": 9.864216317452884e-08, + "logits/chosen": -1.775364875793457, + "logits/rejected": -1.5587488412857056, + "logps/chosen": -175.10086059570312, + "logps/rejected": -201.87429809570312, + "loss": 0.3173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8135349154472351, + "rewards/margins": 1.516288161277771, + "rewards/rejected": -2.3298230171203613, + "step": 5865 + }, + { + "epoch": 0.68, + "learning_rate": 9.86070467049046e-08, + "logits/chosen": -2.655165910720825, + "logits/rejected": -2.7053279876708984, + "logps/chosen": -161.35311889648438, + "logps/rejected": -292.0706481933594, + "loss": 0.2887, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.071936011314392, + "rewards/margins": 2.404486656188965, + "rewards/rejected": -3.4764227867126465, + "step": 5866 + }, + { + "epoch": 0.68, + "learning_rate": 9.857193023528034e-08, + "logits/chosen": -2.4716572761535645, + "logits/rejected": -2.4668524265289307, + "logps/chosen": -299.8739318847656, + "logps/rejected": -245.63726806640625, + "loss": 0.4249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9038283228874207, + "rewards/margins": 1.7715507745742798, + "rewards/rejected": -2.6753792762756348, + "step": 5867 + }, + { + "epoch": 0.68, + "learning_rate": 9.853681376565608e-08, + "logits/chosen": -1.6887320280075073, + "logits/rejected": -1.3978204727172852, + "logps/chosen": -249.39332580566406, + "logps/rejected": -374.22607421875, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5933932065963745, + "rewards/margins": 3.1598126888275146, + "rewards/rejected": -3.7532060146331787, + "step": 5868 + }, + { + "epoch": 0.68, + "learning_rate": 9.850169729603182e-08, + "logits/chosen": -2.060507297515869, + "logits/rejected": -2.0728507041931152, + "logps/chosen": -464.0037536621094, + "logps/rejected": -326.7062072753906, + "loss": 0.4427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.288403183221817, + "rewards/margins": 2.4660305976867676, + "rewards/rejected": -2.7544336318969727, + "step": 5869 + }, + { + "epoch": 0.68, + "learning_rate": 9.846658082640759e-08, + "logits/chosen": -2.3195769786834717, + "logits/rejected": -2.2507357597351074, + "logps/chosen": -293.63897705078125, + "logps/rejected": -259.10272216796875, + "loss": 0.3293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6053926944732666, + "rewards/margins": 1.7391595840454102, + "rewards/rejected": -2.3445520401000977, + "step": 5870 + }, + { + "epoch": 0.68, + "learning_rate": 9.843146435678333e-08, + "logits/chosen": -2.242309331893921, + "logits/rejected": -2.522296190261841, + "logps/chosen": -417.08453369140625, + "logps/rejected": -279.15478515625, + "loss": 0.3749, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5631556510925293, + "rewards/margins": 2.0684077739715576, + "rewards/rejected": -3.631563186645508, + "step": 5871 + }, + { + "epoch": 0.68, + "learning_rate": 9.839634788715907e-08, + "logits/chosen": -2.017993927001953, + "logits/rejected": -2.056020498275757, + "logps/chosen": -445.89385986328125, + "logps/rejected": -305.0931396484375, + "loss": 0.3629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3161582946777344, + "rewards/margins": 2.2922768592834473, + "rewards/rejected": -3.6084351539611816, + "step": 5872 + }, + { + "epoch": 0.68, + "learning_rate": 9.836123141753481e-08, + "logits/chosen": -2.689974308013916, + "logits/rejected": -2.497271776199341, + "logps/chosen": -278.74554443359375, + "logps/rejected": -215.18499755859375, + "loss": 0.3337, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.070467233657837, + "rewards/margins": 1.6438968181610107, + "rewards/rejected": -2.7143640518188477, + "step": 5873 + }, + { + "epoch": 0.68, + "learning_rate": 9.832611494791057e-08, + "logits/chosen": -2.28355073928833, + "logits/rejected": -2.22744083404541, + "logps/chosen": -155.2389678955078, + "logps/rejected": -265.2984924316406, + "loss": 0.1897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24677830934524536, + "rewards/margins": 3.2944021224975586, + "rewards/rejected": -3.5411806106567383, + "step": 5874 + }, + { + "epoch": 0.68, + "learning_rate": 9.829099847828631e-08, + "logits/chosen": -2.1553001403808594, + "logits/rejected": -2.4603874683380127, + "logps/chosen": -425.5547180175781, + "logps/rejected": -394.45953369140625, + "loss": 0.2704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.217241108417511, + "rewards/margins": 2.609262466430664, + "rewards/rejected": -2.8265035152435303, + "step": 5875 + }, + { + "epoch": 0.68, + "learning_rate": 9.825588200866206e-08, + "logits/chosen": -2.049682855606079, + "logits/rejected": -2.187194585800171, + "logps/chosen": -430.7407531738281, + "logps/rejected": -348.6118469238281, + "loss": 0.7752, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4506709575653076, + "rewards/margins": 0.8054149150848389, + "rewards/rejected": -3.2560858726501465, + "step": 5876 + }, + { + "epoch": 0.68, + "learning_rate": 9.82207655390378e-08, + "logits/chosen": -2.2802324295043945, + "logits/rejected": -2.5725629329681396, + "logps/chosen": -258.2132263183594, + "logps/rejected": -226.36485290527344, + "loss": 0.5321, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2576465606689453, + "rewards/margins": 1.2883967161178589, + "rewards/rejected": -2.5460433959960938, + "step": 5877 + }, + { + "epoch": 0.68, + "learning_rate": 9.818564906941356e-08, + "logits/chosen": -2.2997326850891113, + "logits/rejected": -2.6496357917785645, + "logps/chosen": -287.0594177246094, + "logps/rejected": -191.71429443359375, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49937477707862854, + "rewards/margins": 2.6782822608947754, + "rewards/rejected": -3.177656650543213, + "step": 5878 + }, + { + "epoch": 0.68, + "learning_rate": 9.81505325997893e-08, + "logits/chosen": -1.9265496730804443, + "logits/rejected": -2.0141489505767822, + "logps/chosen": -305.12811279296875, + "logps/rejected": -266.13714599609375, + "loss": 0.3394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8039377927780151, + "rewards/margins": 2.562197208404541, + "rewards/rejected": -3.3661351203918457, + "step": 5879 + }, + { + "epoch": 0.68, + "learning_rate": 9.811541613016504e-08, + "logits/chosen": -2.179687738418579, + "logits/rejected": -2.2278964519500732, + "logps/chosen": -260.74298095703125, + "logps/rejected": -242.8230743408203, + "loss": 0.2464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8811929821968079, + "rewards/margins": 1.756737232208252, + "rewards/rejected": -2.637930393218994, + "step": 5880 + }, + { + "epoch": 0.68, + "learning_rate": 9.808029966054078e-08, + "logits/chosen": -2.035377264022827, + "logits/rejected": -2.159425735473633, + "logps/chosen": -315.95550537109375, + "logps/rejected": -320.70703125, + "loss": 0.2306, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5728726387023926, + "rewards/margins": 2.6708085536956787, + "rewards/rejected": -5.243680953979492, + "step": 5881 + }, + { + "epoch": 0.68, + "learning_rate": 9.804518319091655e-08, + "logits/chosen": -2.648106575012207, + "logits/rejected": -2.870697259902954, + "logps/chosen": -249.56924438476562, + "logps/rejected": -197.55126953125, + "loss": 0.2378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48440268635749817, + "rewards/margins": 2.4518063068389893, + "rewards/rejected": -2.936209201812744, + "step": 5882 + }, + { + "epoch": 0.68, + "learning_rate": 9.801006672129229e-08, + "logits/chosen": -2.218578815460205, + "logits/rejected": -2.332797050476074, + "logps/chosen": -266.54443359375, + "logps/rejected": -312.66583251953125, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5865565538406372, + "rewards/margins": 3.209064483642578, + "rewards/rejected": -3.7956206798553467, + "step": 5883 + }, + { + "epoch": 0.68, + "learning_rate": 9.797495025166803e-08, + "logits/chosen": -2.483747959136963, + "logits/rejected": -2.1148762702941895, + "logps/chosen": -197.16067504882812, + "logps/rejected": -293.08905029296875, + "loss": 0.4668, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8808627128601074, + "rewards/margins": 1.8169039487838745, + "rewards/rejected": -2.6977667808532715, + "step": 5884 + }, + { + "epoch": 0.68, + "learning_rate": 9.793983378204377e-08, + "logits/chosen": -1.9589927196502686, + "logits/rejected": -2.214646339416504, + "logps/chosen": -279.1494140625, + "logps/rejected": -202.86306762695312, + "loss": 0.4133, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1222554445266724, + "rewards/margins": 1.1616665124893188, + "rewards/rejected": -2.283921718597412, + "step": 5885 + }, + { + "epoch": 0.68, + "learning_rate": 9.790471731241952e-08, + "logits/chosen": -1.8006926774978638, + "logits/rejected": -1.535479187965393, + "logps/chosen": -198.99832153320312, + "logps/rejected": -334.17822265625, + "loss": 0.6667, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0874698162078857, + "rewards/margins": 1.0418689250946045, + "rewards/rejected": -2.1293387413024902, + "step": 5886 + }, + { + "epoch": 0.68, + "learning_rate": 9.786960084279527e-08, + "logits/chosen": -2.061830997467041, + "logits/rejected": -2.475558280944824, + "logps/chosen": -543.4737548828125, + "logps/rejected": -252.203125, + "loss": 0.6116, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.548776388168335, + "rewards/margins": 1.0616443157196045, + "rewards/rejected": -2.6104207038879395, + "step": 5887 + }, + { + "epoch": 0.68, + "learning_rate": 9.783448437317102e-08, + "logits/chosen": -2.391063690185547, + "logits/rejected": -2.57651948928833, + "logps/chosen": -270.288818359375, + "logps/rejected": -262.4133605957031, + "loss": 0.1278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.988178014755249, + "rewards/margins": 4.206567287445068, + "rewards/rejected": -5.194745063781738, + "step": 5888 + }, + { + "epoch": 0.68, + "learning_rate": 9.779936790354676e-08, + "logits/chosen": -2.114365816116333, + "logits/rejected": -2.215757369995117, + "logps/chosen": -215.66851806640625, + "logps/rejected": -259.708740234375, + "loss": 0.1813, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5910957455635071, + "rewards/margins": 3.1215243339538574, + "rewards/rejected": -3.712620258331299, + "step": 5889 + }, + { + "epoch": 0.68, + "learning_rate": 9.776425143392251e-08, + "logits/chosen": -2.3701977729797363, + "logits/rejected": -2.20200777053833, + "logps/chosen": -296.2707824707031, + "logps/rejected": -282.8710632324219, + "loss": 0.3934, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4732928276062012, + "rewards/margins": 1.9439635276794434, + "rewards/rejected": -3.4172563552856445, + "step": 5890 + }, + { + "epoch": 0.68, + "learning_rate": 9.772913496429825e-08, + "logits/chosen": -1.5394607782363892, + "logits/rejected": -1.6851561069488525, + "logps/chosen": -366.64080810546875, + "logps/rejected": -273.72467041015625, + "loss": 1.2333, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6970367431640625, + "rewards/margins": -0.15972593426704407, + "rewards/rejected": -1.5373108386993408, + "step": 5891 + }, + { + "epoch": 0.68, + "learning_rate": 9.769401849467399e-08, + "logits/chosen": -2.5858404636383057, + "logits/rejected": -2.4701924324035645, + "logps/chosen": -293.1413269042969, + "logps/rejected": -259.7341613769531, + "loss": 0.6209, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6510505676269531, + "rewards/margins": 1.2179052829742432, + "rewards/rejected": -2.8689558506011963, + "step": 5892 + }, + { + "epoch": 0.68, + "learning_rate": 9.765890202504975e-08, + "logits/chosen": -1.9450618028640747, + "logits/rejected": -1.7499048709869385, + "logps/chosen": -341.08148193359375, + "logps/rejected": -494.57904052734375, + "loss": 0.1084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29181742668151855, + "rewards/margins": 3.507110595703125, + "rewards/rejected": -3.7989282608032227, + "step": 5893 + }, + { + "epoch": 0.68, + "learning_rate": 9.76237855554255e-08, + "logits/chosen": -2.3416202068328857, + "logits/rejected": -2.2522671222686768, + "logps/chosen": -340.9111022949219, + "logps/rejected": -343.3782043457031, + "loss": 0.6135, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0078935623168945, + "rewards/margins": 1.6830086708068848, + "rewards/rejected": -2.6909022331237793, + "step": 5894 + }, + { + "epoch": 0.68, + "learning_rate": 9.758866908580124e-08, + "logits/chosen": -2.4442191123962402, + "logits/rejected": -2.5241713523864746, + "logps/chosen": -245.97154235839844, + "logps/rejected": -233.8253936767578, + "loss": 0.2627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7518553137779236, + "rewards/margins": 2.369044303894043, + "rewards/rejected": -3.1208994388580322, + "step": 5895 + }, + { + "epoch": 0.68, + "learning_rate": 9.755355261617698e-08, + "logits/chosen": -2.7661337852478027, + "logits/rejected": -2.8051600456237793, + "logps/chosen": -298.1178894042969, + "logps/rejected": -156.05247497558594, + "loss": 0.3233, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1710796356201172, + "rewards/margins": 1.6722978353500366, + "rewards/rejected": -2.8433775901794434, + "step": 5896 + }, + { + "epoch": 0.68, + "learning_rate": 9.751843614655272e-08, + "logits/chosen": -2.803196907043457, + "logits/rejected": -2.6753454208374023, + "logps/chosen": -154.01055908203125, + "logps/rejected": -216.89373779296875, + "loss": 0.1873, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.339582085609436, + "rewards/margins": 1.8864303827285767, + "rewards/rejected": -3.226012706756592, + "step": 5897 + }, + { + "epoch": 0.68, + "learning_rate": 9.748331967692849e-08, + "logits/chosen": -2.5798819065093994, + "logits/rejected": -2.5176095962524414, + "logps/chosen": -191.46630859375, + "logps/rejected": -229.15892028808594, + "loss": 0.343, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5608572959899902, + "rewards/margins": 2.3492889404296875, + "rewards/rejected": -3.9101462364196777, + "step": 5898 + }, + { + "epoch": 0.68, + "learning_rate": 9.744820320730423e-08, + "logits/chosen": -2.173393964767456, + "logits/rejected": -1.7104063034057617, + "logps/chosen": -138.08677673339844, + "logps/rejected": -274.3212890625, + "loss": 0.3818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9852153658866882, + "rewards/margins": 1.9111943244934082, + "rewards/rejected": -2.896409511566162, + "step": 5899 + }, + { + "epoch": 0.68, + "learning_rate": 9.741308673767997e-08, + "logits/chosen": -3.0516316890716553, + "logits/rejected": -3.0160701274871826, + "logps/chosen": -184.6030731201172, + "logps/rejected": -302.3101501464844, + "loss": 0.5307, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.177588939666748, + "rewards/margins": 1.4421426057815552, + "rewards/rejected": -2.6197314262390137, + "step": 5900 + }, + { + "epoch": 0.68, + "learning_rate": 9.737797026805571e-08, + "logits/chosen": -2.009225845336914, + "logits/rejected": -1.9746214151382446, + "logps/chosen": -343.68603515625, + "logps/rejected": -446.7064514160156, + "loss": 0.8375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9425944089889526, + "rewards/margins": 0.5788329839706421, + "rewards/rejected": -1.5214273929595947, + "step": 5901 + }, + { + "epoch": 0.68, + "learning_rate": 9.734285379843146e-08, + "logits/chosen": -2.559412956237793, + "logits/rejected": -2.34832501411438, + "logps/chosen": -133.65313720703125, + "logps/rejected": -197.17828369140625, + "loss": 0.5119, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5479003190994263, + "rewards/margins": 1.2341747283935547, + "rewards/rejected": -2.7820751667022705, + "step": 5902 + }, + { + "epoch": 0.68, + "learning_rate": 9.73077373288072e-08, + "logits/chosen": -2.4237751960754395, + "logits/rejected": -2.6381404399871826, + "logps/chosen": -165.43765258789062, + "logps/rejected": -148.16461181640625, + "loss": 0.6036, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2515816688537598, + "rewards/margins": 1.8711447715759277, + "rewards/rejected": -3.1227264404296875, + "step": 5903 + }, + { + "epoch": 0.68, + "learning_rate": 9.727262085918296e-08, + "logits/chosen": -1.9899649620056152, + "logits/rejected": -2.0737571716308594, + "logps/chosen": -295.04693603515625, + "logps/rejected": -334.3018798828125, + "loss": 0.5004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2331485003232956, + "rewards/margins": 0.8793476819992065, + "rewards/rejected": -1.1124961376190186, + "step": 5904 + }, + { + "epoch": 0.68, + "learning_rate": 9.72375043895587e-08, + "logits/chosen": -2.617586135864258, + "logits/rejected": -2.5963680744171143, + "logps/chosen": -244.44131469726562, + "logps/rejected": -225.257568359375, + "loss": 0.3389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9671054482460022, + "rewards/margins": 2.779876947402954, + "rewards/rejected": -3.7469825744628906, + "step": 5905 + }, + { + "epoch": 0.68, + "learning_rate": 9.720238791993444e-08, + "logits/chosen": -1.831240177154541, + "logits/rejected": -1.876550555229187, + "logps/chosen": -358.7291564941406, + "logps/rejected": -329.97393798828125, + "loss": 0.8439, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5892256498336792, + "rewards/margins": 2.282827615737915, + "rewards/rejected": -3.872053384780884, + "step": 5906 + }, + { + "epoch": 0.68, + "learning_rate": 9.716727145031019e-08, + "logits/chosen": -2.3018720149993896, + "logits/rejected": -2.6091859340667725, + "logps/chosen": -181.65621948242188, + "logps/rejected": -261.93719482421875, + "loss": 0.3591, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5023682117462158, + "rewards/margins": 2.251295566558838, + "rewards/rejected": -2.7536637783050537, + "step": 5907 + }, + { + "epoch": 0.68, + "learning_rate": 9.713215498068593e-08, + "logits/chosen": -2.470827579498291, + "logits/rejected": -2.2529358863830566, + "logps/chosen": -304.5045166015625, + "logps/rejected": -270.88824462890625, + "loss": 0.343, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2775154113769531, + "rewards/margins": 2.8495912551879883, + "rewards/rejected": -4.127106666564941, + "step": 5908 + }, + { + "epoch": 0.68, + "learning_rate": 9.709703851106167e-08, + "logits/chosen": -2.121851921081543, + "logits/rejected": -2.3493950366973877, + "logps/chosen": -202.61329650878906, + "logps/rejected": -242.3651580810547, + "loss": 0.3043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16453343629837036, + "rewards/margins": 3.3071787357330322, + "rewards/rejected": -3.471712112426758, + "step": 5909 + }, + { + "epoch": 0.68, + "learning_rate": 9.706192204143743e-08, + "logits/chosen": -2.3632965087890625, + "logits/rejected": -2.7178306579589844, + "logps/chosen": -524.746826171875, + "logps/rejected": -252.09413146972656, + "loss": 0.5889, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.605858564376831, + "rewards/margins": 2.0857045650482178, + "rewards/rejected": -3.691563129425049, + "step": 5910 + }, + { + "epoch": 0.68, + "learning_rate": 9.702680557181318e-08, + "logits/chosen": -2.71992826461792, + "logits/rejected": -2.6681084632873535, + "logps/chosen": -299.4682312011719, + "logps/rejected": -399.3896484375, + "loss": 0.6779, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5218631029129028, + "rewards/margins": 2.060413122177124, + "rewards/rejected": -3.5822761058807373, + "step": 5911 + }, + { + "epoch": 0.68, + "learning_rate": 9.699168910218892e-08, + "logits/chosen": -2.185333728790283, + "logits/rejected": -2.045022964477539, + "logps/chosen": -361.6159362792969, + "logps/rejected": -340.84552001953125, + "loss": 0.7274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7646030187606812, + "rewards/margins": 0.7626093029975891, + "rewards/rejected": -1.527212381362915, + "step": 5912 + }, + { + "epoch": 0.68, + "learning_rate": 9.695657263256466e-08, + "logits/chosen": -2.703279495239258, + "logits/rejected": -2.760476589202881, + "logps/chosen": -284.23248291015625, + "logps/rejected": -257.06671142578125, + "loss": 0.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4601329267024994, + "rewards/margins": 4.805935382843018, + "rewards/rejected": -5.266068458557129, + "step": 5913 + }, + { + "epoch": 0.68, + "learning_rate": 9.69214561629404e-08, + "logits/chosen": -1.5711064338684082, + "logits/rejected": -1.6576673984527588, + "logps/chosen": -245.51319885253906, + "logps/rejected": -263.14404296875, + "loss": 0.445, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0237839221954346, + "rewards/margins": 2.7251133918762207, + "rewards/rejected": -3.7488975524902344, + "step": 5914 + }, + { + "epoch": 0.68, + "learning_rate": 9.688633969331617e-08, + "logits/chosen": -2.2667036056518555, + "logits/rejected": -2.354959011077881, + "logps/chosen": -384.1712646484375, + "logps/rejected": -191.26361083984375, + "loss": 0.675, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.424706220626831, + "rewards/margins": 1.6487452983856201, + "rewards/rejected": -4.073451519012451, + "step": 5915 + }, + { + "epoch": 0.68, + "learning_rate": 9.685122322369191e-08, + "logits/chosen": -2.867250919342041, + "logits/rejected": -2.7567877769470215, + "logps/chosen": -326.19451904296875, + "logps/rejected": -254.13299560546875, + "loss": 0.5635, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.761209487915039, + "rewards/margins": 1.026589035987854, + "rewards/rejected": -2.7877981662750244, + "step": 5916 + }, + { + "epoch": 0.68, + "learning_rate": 9.681610675406765e-08, + "logits/chosen": -2.1803102493286133, + "logits/rejected": -2.3258652687072754, + "logps/chosen": -316.6592712402344, + "logps/rejected": -173.376953125, + "loss": 0.8816, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1997902393341064, + "rewards/margins": 1.064765453338623, + "rewards/rejected": -3.2645554542541504, + "step": 5917 + }, + { + "epoch": 0.68, + "learning_rate": 9.678099028444339e-08, + "logits/chosen": -2.579665184020996, + "logits/rejected": -2.7761318683624268, + "logps/chosen": -222.3617706298828, + "logps/rejected": -264.1124267578125, + "loss": 0.1971, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6166342496871948, + "rewards/margins": 2.907386302947998, + "rewards/rejected": -4.524020671844482, + "step": 5918 + }, + { + "epoch": 0.68, + "learning_rate": 9.674587381481915e-08, + "logits/chosen": -1.9738539457321167, + "logits/rejected": -2.0219452381134033, + "logps/chosen": -157.28086853027344, + "logps/rejected": -188.32415771484375, + "loss": 1.4992, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.581118583679199, + "rewards/margins": -0.3494851589202881, + "rewards/rejected": -2.2316336631774902, + "step": 5919 + }, + { + "epoch": 0.68, + "learning_rate": 9.671075734519489e-08, + "logits/chosen": -1.771165132522583, + "logits/rejected": -2.030792713165283, + "logps/chosen": -311.8389892578125, + "logps/rejected": -283.66009521484375, + "loss": 0.5169, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1417630910873413, + "rewards/margins": 1.7900030612945557, + "rewards/rejected": -2.9317662715911865, + "step": 5920 + }, + { + "epoch": 0.68, + "learning_rate": 9.667564087557064e-08, + "logits/chosen": -2.321472406387329, + "logits/rejected": -2.4486587047576904, + "logps/chosen": -208.80865478515625, + "logps/rejected": -212.38211059570312, + "loss": 0.4637, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2990212440490723, + "rewards/margins": 1.7157738208770752, + "rewards/rejected": -3.0147950649261475, + "step": 5921 + }, + { + "epoch": 0.68, + "learning_rate": 9.664052440594638e-08, + "logits/chosen": -2.1427011489868164, + "logits/rejected": -2.232790470123291, + "logps/chosen": -233.0938262939453, + "logps/rejected": -258.9798583984375, + "loss": 0.5632, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0309193134307861, + "rewards/margins": 2.565411329269409, + "rewards/rejected": -3.5963306427001953, + "step": 5922 + }, + { + "epoch": 0.68, + "learning_rate": 9.660540793632214e-08, + "logits/chosen": -2.2991623878479004, + "logits/rejected": -2.236881971359253, + "logps/chosen": -400.3951416015625, + "logps/rejected": -359.4482421875, + "loss": 0.0811, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6249203681945801, + "rewards/margins": 4.372523307800293, + "rewards/rejected": -4.997443675994873, + "step": 5923 + }, + { + "epoch": 0.68, + "learning_rate": 9.657029146669788e-08, + "logits/chosen": -2.2128467559814453, + "logits/rejected": -2.252427101135254, + "logps/chosen": -249.81668090820312, + "logps/rejected": -273.4050598144531, + "loss": 0.1453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2396380454301834, + "rewards/margins": 3.1033248901367188, + "rewards/rejected": -3.342963218688965, + "step": 5924 + }, + { + "epoch": 0.68, + "learning_rate": 9.653517499707362e-08, + "logits/chosen": -2.0213208198547363, + "logits/rejected": -2.2505226135253906, + "logps/chosen": -333.415283203125, + "logps/rejected": -327.62103271484375, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9233433604240417, + "rewards/margins": 2.9878764152526855, + "rewards/rejected": -3.911219596862793, + "step": 5925 + }, + { + "epoch": 0.68, + "learning_rate": 9.650005852744936e-08, + "logits/chosen": -2.2454795837402344, + "logits/rejected": -2.385504722595215, + "logps/chosen": -381.904052734375, + "logps/rejected": -360.2966613769531, + "loss": 0.5441, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4101252555847168, + "rewards/margins": 2.1468257904052734, + "rewards/rejected": -3.5569510459899902, + "step": 5926 + }, + { + "epoch": 0.68, + "learning_rate": 9.646494205782512e-08, + "logits/chosen": -2.6177265644073486, + "logits/rejected": -2.60563325881958, + "logps/chosen": -279.9067077636719, + "logps/rejected": -305.0775146484375, + "loss": 0.6716, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9610737562179565, + "rewards/margins": 2.0131654739379883, + "rewards/rejected": -2.9742391109466553, + "step": 5927 + }, + { + "epoch": 0.68, + "learning_rate": 9.642982558820087e-08, + "logits/chosen": -2.298828363418579, + "logits/rejected": -2.102752208709717, + "logps/chosen": -200.90206909179688, + "logps/rejected": -172.13446044921875, + "loss": 0.6664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6456950306892395, + "rewards/margins": 1.1825392246246338, + "rewards/rejected": -1.828234314918518, + "step": 5928 + }, + { + "epoch": 0.68, + "learning_rate": 9.63947091185766e-08, + "logits/chosen": -2.3235561847686768, + "logits/rejected": -2.261519193649292, + "logps/chosen": -424.626953125, + "logps/rejected": -483.98211669921875, + "loss": 0.2608, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5310120582580566, + "rewards/margins": 2.9232163429260254, + "rewards/rejected": -4.454228401184082, + "step": 5929 + }, + { + "epoch": 0.68, + "learning_rate": 9.635959264895235e-08, + "logits/chosen": -2.193279981613159, + "logits/rejected": -2.168783187866211, + "logps/chosen": -196.66534423828125, + "logps/rejected": -344.75872802734375, + "loss": 0.154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3071674704551697, + "rewards/margins": 3.7979657649993896, + "rewards/rejected": -4.105133533477783, + "step": 5930 + }, + { + "epoch": 0.68, + "learning_rate": 9.63244761793281e-08, + "logits/chosen": -2.2100491523742676, + "logits/rejected": -2.33394718170166, + "logps/chosen": -274.7901306152344, + "logps/rejected": -186.6984100341797, + "loss": 0.4478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.442754864692688, + "rewards/margins": 1.2694785594940186, + "rewards/rejected": -1.712233304977417, + "step": 5931 + }, + { + "epoch": 0.68, + "learning_rate": 9.628935970970385e-08, + "logits/chosen": -2.242906093597412, + "logits/rejected": -2.373746156692505, + "logps/chosen": -229.796142578125, + "logps/rejected": -343.0554504394531, + "loss": 0.3428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7271361351013184, + "rewards/margins": 2.5388078689575195, + "rewards/rejected": -3.265944004058838, + "step": 5932 + }, + { + "epoch": 0.68, + "learning_rate": 9.62542432400796e-08, + "logits/chosen": -2.2134203910827637, + "logits/rejected": -2.5119943618774414, + "logps/chosen": -397.7553405761719, + "logps/rejected": -300.7170104980469, + "loss": 0.1472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28322985768318176, + "rewards/margins": 3.2859365940093994, + "rewards/rejected": -3.569166660308838, + "step": 5933 + }, + { + "epoch": 0.68, + "learning_rate": 9.621912677045534e-08, + "logits/chosen": -2.354097366333008, + "logits/rejected": -2.6107473373413086, + "logps/chosen": -238.7542266845703, + "logps/rejected": -251.48663330078125, + "loss": 0.2509, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4355705976486206, + "rewards/margins": 3.620750665664673, + "rewards/rejected": -5.056321620941162, + "step": 5934 + }, + { + "epoch": 0.68, + "learning_rate": 9.618401030083109e-08, + "logits/chosen": -2.065506935119629, + "logits/rejected": -1.8840605020523071, + "logps/chosen": -397.262451171875, + "logps/rejected": -401.5599670410156, + "loss": 0.3027, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0127981901168823, + "rewards/margins": 2.7312846183776855, + "rewards/rejected": -3.7440829277038574, + "step": 5935 + }, + { + "epoch": 0.68, + "learning_rate": 9.614889383120683e-08, + "logits/chosen": -2.7090649604797363, + "logits/rejected": -2.4079816341400146, + "logps/chosen": -219.1239776611328, + "logps/rejected": -249.48355102539062, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018746942281723022, + "rewards/margins": 2.91499400138855, + "rewards/rejected": -2.9337408542633057, + "step": 5936 + }, + { + "epoch": 0.68, + "learning_rate": 9.611377736158257e-08, + "logits/chosen": -2.072629928588867, + "logits/rejected": -2.126481056213379, + "logps/chosen": -336.76605224609375, + "logps/rejected": -252.45758056640625, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.003393605351448059, + "rewards/margins": 2.043282985687256, + "rewards/rejected": -2.039889335632324, + "step": 5937 + }, + { + "epoch": 0.68, + "learning_rate": 9.607866089195832e-08, + "logits/chosen": -2.6279330253601074, + "logits/rejected": -2.516753673553467, + "logps/chosen": -283.472900390625, + "logps/rejected": -324.623779296875, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.142038106918335, + "rewards/margins": 3.3853509426116943, + "rewards/rejected": -4.527389049530029, + "step": 5938 + }, + { + "epoch": 0.68, + "learning_rate": 9.604354442233408e-08, + "logits/chosen": -1.8265975713729858, + "logits/rejected": -2.260502815246582, + "logps/chosen": -386.86712646484375, + "logps/rejected": -230.07208251953125, + "loss": 0.2474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22327494621276855, + "rewards/margins": 2.1082026958465576, + "rewards/rejected": -2.331477642059326, + "step": 5939 + }, + { + "epoch": 0.68, + "learning_rate": 9.600842795270982e-08, + "logits/chosen": -2.7083845138549805, + "logits/rejected": -2.604569911956787, + "logps/chosen": -270.76458740234375, + "logps/rejected": -272.6612854003906, + "loss": 0.2192, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7796770930290222, + "rewards/margins": 3.822317600250244, + "rewards/rejected": -4.601994514465332, + "step": 5940 + }, + { + "epoch": 0.68, + "learning_rate": 9.597331148308556e-08, + "logits/chosen": -2.5750815868377686, + "logits/rejected": -2.570061206817627, + "logps/chosen": -231.6147003173828, + "logps/rejected": -272.1545104980469, + "loss": 0.3533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1812918186187744, + "rewards/margins": 1.616747498512268, + "rewards/rejected": -2.798039197921753, + "step": 5941 + }, + { + "epoch": 0.68, + "learning_rate": 9.59381950134613e-08, + "logits/chosen": -2.0372676849365234, + "logits/rejected": -2.23587965965271, + "logps/chosen": -378.9277038574219, + "logps/rejected": -245.69705200195312, + "loss": 0.2183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1507989466190338, + "rewards/margins": 2.9557695388793945, + "rewards/rejected": -3.1065688133239746, + "step": 5942 + }, + { + "epoch": 0.69, + "learning_rate": 9.590307854383707e-08, + "logits/chosen": -2.647002935409546, + "logits/rejected": -2.589541435241699, + "logps/chosen": -424.1471862792969, + "logps/rejected": -265.90985107421875, + "loss": 0.3956, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.052579402923584, + "rewards/margins": 1.487472653388977, + "rewards/rejected": -2.5400519371032715, + "step": 5943 + }, + { + "epoch": 0.69, + "learning_rate": 9.586796207421281e-08, + "logits/chosen": -2.1723687648773193, + "logits/rejected": -2.1981804370880127, + "logps/chosen": -485.38836669921875, + "logps/rejected": -393.1559753417969, + "loss": 0.5001, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3253989219665527, + "rewards/margins": 2.678997039794922, + "rewards/rejected": -4.004395961761475, + "step": 5944 + }, + { + "epoch": 0.69, + "learning_rate": 9.583284560458855e-08, + "logits/chosen": -1.8818944692611694, + "logits/rejected": -2.2458436489105225, + "logps/chosen": -287.3106689453125, + "logps/rejected": -188.34698486328125, + "loss": 0.3897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9657428860664368, + "rewards/margins": 1.0487455129623413, + "rewards/rejected": -2.014488458633423, + "step": 5945 + }, + { + "epoch": 0.69, + "learning_rate": 9.579772913496429e-08, + "logits/chosen": -1.9040768146514893, + "logits/rejected": -2.0115489959716797, + "logps/chosen": -261.96453857421875, + "logps/rejected": -326.2763977050781, + "loss": 0.2152, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8501332402229309, + "rewards/margins": 2.7946994304656982, + "rewards/rejected": -3.6448326110839844, + "step": 5946 + }, + { + "epoch": 0.69, + "learning_rate": 9.576261266534004e-08, + "logits/chosen": -2.327833652496338, + "logits/rejected": -2.6877408027648926, + "logps/chosen": -361.0320129394531, + "logps/rejected": -281.1632995605469, + "loss": 0.165, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3554188013076782, + "rewards/margins": 3.2980899810791016, + "rewards/rejected": -4.65350866317749, + "step": 5947 + }, + { + "epoch": 0.69, + "learning_rate": 9.572749619571578e-08, + "logits/chosen": -2.596139907836914, + "logits/rejected": -2.644249677658081, + "logps/chosen": -217.42141723632812, + "logps/rejected": -375.0761413574219, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3506464958190918, + "rewards/margins": 3.0576982498168945, + "rewards/rejected": -4.408344745635986, + "step": 5948 + }, + { + "epoch": 0.69, + "learning_rate": 9.569237972609154e-08, + "logits/chosen": -2.6526288986206055, + "logits/rejected": -2.7842211723327637, + "logps/chosen": -141.6610565185547, + "logps/rejected": -138.69403076171875, + "loss": 0.8508, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4613677263259888, + "rewards/margins": 2.1564438343048096, + "rewards/rejected": -3.617811679840088, + "step": 5949 + }, + { + "epoch": 0.69, + "learning_rate": 9.565726325646728e-08, + "logits/chosen": -1.8414716720581055, + "logits/rejected": -2.033193588256836, + "logps/chosen": -707.9764404296875, + "logps/rejected": -477.2734069824219, + "loss": 0.4455, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1049467325210571, + "rewards/margins": 2.623344659805298, + "rewards/rejected": -3.7282912731170654, + "step": 5950 + }, + { + "epoch": 0.69, + "learning_rate": 9.562214678684303e-08, + "logits/chosen": -2.0836429595947266, + "logits/rejected": -1.7561333179473877, + "logps/chosen": -235.34506225585938, + "logps/rejected": -281.2230224609375, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1289069652557373, + "rewards/margins": 0.12657080590724945, + "rewards/rejected": -2.2554779052734375, + "step": 5951 + }, + { + "epoch": 0.69, + "learning_rate": 9.558703031721877e-08, + "logits/chosen": -2.031999349594116, + "logits/rejected": -2.2388556003570557, + "logps/chosen": -241.16427612304688, + "logps/rejected": -238.68736267089844, + "loss": 0.8936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4621806144714355, + "rewards/margins": 1.6048979759216309, + "rewards/rejected": -3.0670785903930664, + "step": 5952 + }, + { + "epoch": 0.69, + "learning_rate": 9.555191384759451e-08, + "logits/chosen": -2.709951877593994, + "logits/rejected": -2.5932562351226807, + "logps/chosen": -190.8139190673828, + "logps/rejected": -200.07937622070312, + "loss": 0.4484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9868578910827637, + "rewards/margins": 1.5605186223983765, + "rewards/rejected": -2.5473766326904297, + "step": 5953 + }, + { + "epoch": 0.69, + "learning_rate": 9.551679737797025e-08, + "logits/chosen": -1.9027020931243896, + "logits/rejected": -2.040745735168457, + "logps/chosen": -305.6412048339844, + "logps/rejected": -192.61224365234375, + "loss": 0.4598, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2700889110565186, + "rewards/margins": 1.7184839248657227, + "rewards/rejected": -2.9885730743408203, + "step": 5954 + }, + { + "epoch": 0.69, + "learning_rate": 9.548168090834601e-08, + "logits/chosen": -2.5826754570007324, + "logits/rejected": -2.347727060317993, + "logps/chosen": -307.1794738769531, + "logps/rejected": -413.2513427734375, + "loss": 0.2645, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1260714530944824, + "rewards/margins": 2.4397661685943604, + "rewards/rejected": -3.5658373832702637, + "step": 5955 + }, + { + "epoch": 0.69, + "learning_rate": 9.544656443872176e-08, + "logits/chosen": -2.619035482406616, + "logits/rejected": -2.6048073768615723, + "logps/chosen": -326.826904296875, + "logps/rejected": -264.8863525390625, + "loss": 0.3956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7101635336875916, + "rewards/margins": 2.0858123302459717, + "rewards/rejected": -2.795975923538208, + "step": 5956 + }, + { + "epoch": 0.69, + "learning_rate": 9.54114479690975e-08, + "logits/chosen": -2.3819501399993896, + "logits/rejected": -2.4544901847839355, + "logps/chosen": -306.53857421875, + "logps/rejected": -242.5723419189453, + "loss": 0.2417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7327648997306824, + "rewards/margins": 2.281155586242676, + "rewards/rejected": -3.013920783996582, + "step": 5957 + }, + { + "epoch": 0.69, + "learning_rate": 9.537633149947324e-08, + "logits/chosen": -1.9832515716552734, + "logits/rejected": -2.2826271057128906, + "logps/chosen": -375.6788024902344, + "logps/rejected": -270.3933410644531, + "loss": 0.7695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5482692718505859, + "rewards/margins": 0.9796749949455261, + "rewards/rejected": -1.5279443264007568, + "step": 5958 + }, + { + "epoch": 0.69, + "learning_rate": 9.534121502984898e-08, + "logits/chosen": -2.6071457862854004, + "logits/rejected": -2.971723794937134, + "logps/chosen": -305.45306396484375, + "logps/rejected": -387.1809387207031, + "loss": 0.2176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40449604392051697, + "rewards/margins": 3.4095914363861084, + "rewards/rejected": -3.814087152481079, + "step": 5959 + }, + { + "epoch": 0.69, + "learning_rate": 9.530609856022475e-08, + "logits/chosen": -2.0496339797973633, + "logits/rejected": -2.0270097255706787, + "logps/chosen": -319.63824462890625, + "logps/rejected": -238.59591674804688, + "loss": 0.5743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7440630197525024, + "rewards/margins": 1.4089192152023315, + "rewards/rejected": -2.152982234954834, + "step": 5960 + }, + { + "epoch": 0.69, + "learning_rate": 9.527098209060049e-08, + "logits/chosen": -2.738217830657959, + "logits/rejected": -2.6368026733398438, + "logps/chosen": -206.11094665527344, + "logps/rejected": -257.92364501953125, + "loss": 0.3274, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27177390456199646, + "rewards/margins": 2.6502432823181152, + "rewards/rejected": -2.9220173358917236, + "step": 5961 + }, + { + "epoch": 0.69, + "learning_rate": 9.523586562097623e-08, + "logits/chosen": -1.9473119974136353, + "logits/rejected": -1.9720216989517212, + "logps/chosen": -284.7404479980469, + "logps/rejected": -354.7353210449219, + "loss": 1.0004, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1871031522750854, + "rewards/margins": -0.08200028538703918, + "rewards/rejected": -1.1051026582717896, + "step": 5962 + }, + { + "epoch": 0.69, + "learning_rate": 9.520074915135197e-08, + "logits/chosen": -2.4136343002319336, + "logits/rejected": -2.2821578979492188, + "logps/chosen": -237.30198669433594, + "logps/rejected": -380.0972900390625, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020951420068740845, + "rewards/margins": 4.476369857788086, + "rewards/rejected": -4.497321128845215, + "step": 5963 + }, + { + "epoch": 0.69, + "learning_rate": 9.516563268172773e-08, + "logits/chosen": -1.5871236324310303, + "logits/rejected": -1.9586645364761353, + "logps/chosen": -289.40167236328125, + "logps/rejected": -218.79925537109375, + "loss": 0.6713, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8872052431106567, + "rewards/margins": 0.6686692833900452, + "rewards/rejected": -1.5558743476867676, + "step": 5964 + }, + { + "epoch": 0.69, + "learning_rate": 9.513051621210347e-08, + "logits/chosen": -2.2719314098358154, + "logits/rejected": -2.459393262863159, + "logps/chosen": -596.01025390625, + "logps/rejected": -414.46014404296875, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8075354099273682, + "rewards/margins": 2.4614174365997314, + "rewards/rejected": -3.2689528465270996, + "step": 5965 + }, + { + "epoch": 0.69, + "learning_rate": 9.509539974247922e-08, + "logits/chosen": -2.7026309967041016, + "logits/rejected": -2.661684989929199, + "logps/chosen": -219.99166870117188, + "logps/rejected": -255.339111328125, + "loss": 0.1256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9088590145111084, + "rewards/margins": 3.9640631675720215, + "rewards/rejected": -4.872921943664551, + "step": 5966 + }, + { + "epoch": 0.69, + "learning_rate": 9.506028327285496e-08, + "logits/chosen": -2.6545541286468506, + "logits/rejected": -2.4689505100250244, + "logps/chosen": -129.9110107421875, + "logps/rejected": -241.42774963378906, + "loss": 0.343, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3740221858024597, + "rewards/margins": 2.696873664855957, + "rewards/rejected": -3.0708956718444824, + "step": 5967 + }, + { + "epoch": 0.69, + "learning_rate": 9.502516680323072e-08, + "logits/chosen": -2.3017258644104004, + "logits/rejected": -1.9819267988204956, + "logps/chosen": -244.527587890625, + "logps/rejected": -433.4997863769531, + "loss": 0.3133, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0152146816253662, + "rewards/margins": 3.824099063873291, + "rewards/rejected": -4.839313507080078, + "step": 5968 + }, + { + "epoch": 0.69, + "learning_rate": 9.499005033360646e-08, + "logits/chosen": -2.154026508331299, + "logits/rejected": -2.3967010974884033, + "logps/chosen": -143.14404296875, + "logps/rejected": -205.7082061767578, + "loss": 0.5036, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1510214805603027, + "rewards/margins": 2.1999549865722656, + "rewards/rejected": -3.3509764671325684, + "step": 5969 + }, + { + "epoch": 0.69, + "learning_rate": 9.49549338639822e-08, + "logits/chosen": -2.2206993103027344, + "logits/rejected": -2.1769802570343018, + "logps/chosen": -277.8770751953125, + "logps/rejected": -378.7822265625, + "loss": 0.2185, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.212496042251587, + "rewards/margins": 3.643003463745117, + "rewards/rejected": -4.855500221252441, + "step": 5970 + }, + { + "epoch": 0.69, + "learning_rate": 9.491981739435794e-08, + "logits/chosen": -2.521219491958618, + "logits/rejected": -2.284978151321411, + "logps/chosen": -206.39044189453125, + "logps/rejected": -266.56005859375, + "loss": 0.2424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9217272996902466, + "rewards/margins": 2.165983200073242, + "rewards/rejected": -3.087710380554199, + "step": 5971 + }, + { + "epoch": 0.69, + "learning_rate": 9.48847009247337e-08, + "logits/chosen": -2.194582939147949, + "logits/rejected": -1.9901292324066162, + "logps/chosen": -207.50064086914062, + "logps/rejected": -314.946044921875, + "loss": 0.1071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10863161087036133, + "rewards/margins": 3.930204153060913, + "rewards/rejected": -4.0388360023498535, + "step": 5972 + }, + { + "epoch": 0.69, + "learning_rate": 9.484958445510944e-08, + "logits/chosen": -2.13435697555542, + "logits/rejected": -2.1458213329315186, + "logps/chosen": -374.87115478515625, + "logps/rejected": -416.7578125, + "loss": 0.3093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13428840041160583, + "rewards/margins": 1.3331246376037598, + "rewards/rejected": -1.198836326599121, + "step": 5973 + }, + { + "epoch": 0.69, + "learning_rate": 9.481446798548519e-08, + "logits/chosen": -2.3956122398376465, + "logits/rejected": -2.319406509399414, + "logps/chosen": -203.3160400390625, + "logps/rejected": -254.53399658203125, + "loss": 0.2498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3486107587814331, + "rewards/margins": 2.2894554138183594, + "rewards/rejected": -2.638066053390503, + "step": 5974 + }, + { + "epoch": 0.69, + "learning_rate": 9.477935151586093e-08, + "logits/chosen": -1.7221782207489014, + "logits/rejected": -2.1753478050231934, + "logps/chosen": -396.03515625, + "logps/rejected": -275.3739929199219, + "loss": 0.6141, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3372793197631836, + "rewards/margins": 2.130782127380371, + "rewards/rejected": -3.468061685562134, + "step": 5975 + }, + { + "epoch": 0.69, + "learning_rate": 9.474423504623669e-08, + "logits/chosen": -2.292358636856079, + "logits/rejected": -2.695232391357422, + "logps/chosen": -319.7359313964844, + "logps/rejected": -277.2606201171875, + "loss": 0.2885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43554869294166565, + "rewards/margins": 3.756648540496826, + "rewards/rejected": -4.192196846008301, + "step": 5976 + }, + { + "epoch": 0.69, + "learning_rate": 9.470911857661243e-08, + "logits/chosen": -2.1350865364074707, + "logits/rejected": -2.0813028812408447, + "logps/chosen": -380.15655517578125, + "logps/rejected": -250.79354858398438, + "loss": 0.6197, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9030795693397522, + "rewards/margins": 1.2749691009521484, + "rewards/rejected": -2.178048610687256, + "step": 5977 + }, + { + "epoch": 0.69, + "learning_rate": 9.467400210698817e-08, + "logits/chosen": -2.2731714248657227, + "logits/rejected": -2.316472291946411, + "logps/chosen": -458.34716796875, + "logps/rejected": -377.15966796875, + "loss": 0.2712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5603494644165039, + "rewards/margins": 1.9987282752990723, + "rewards/rejected": -2.5590779781341553, + "step": 5978 + }, + { + "epoch": 0.69, + "learning_rate": 9.463888563736391e-08, + "logits/chosen": -1.8840585947036743, + "logits/rejected": -2.066044807434082, + "logps/chosen": -428.9197998046875, + "logps/rejected": -308.66619873046875, + "loss": 0.5878, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3508716821670532, + "rewards/margins": 1.9930334091186523, + "rewards/rejected": -3.343905210494995, + "step": 5979 + }, + { + "epoch": 0.69, + "learning_rate": 9.460376916773967e-08, + "logits/chosen": -2.5894296169281006, + "logits/rejected": -2.6012587547302246, + "logps/chosen": -462.85003662109375, + "logps/rejected": -148.69876098632812, + "loss": 1.3797, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.068213939666748, + "rewards/margins": -0.2648397386074066, + "rewards/rejected": -1.803374171257019, + "step": 5980 + }, + { + "epoch": 0.69, + "learning_rate": 9.456865269811541e-08, + "logits/chosen": -1.9888560771942139, + "logits/rejected": -1.9451481103897095, + "logps/chosen": -286.5230407714844, + "logps/rejected": -289.64013671875, + "loss": 0.3344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17813773453235626, + "rewards/margins": 2.247560501098633, + "rewards/rejected": -2.4256982803344727, + "step": 5981 + }, + { + "epoch": 0.69, + "learning_rate": 9.453353622849115e-08, + "logits/chosen": -2.658245325088501, + "logits/rejected": -2.6979572772979736, + "logps/chosen": -356.996826171875, + "logps/rejected": -288.465576171875, + "loss": 0.418, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5583994388580322, + "rewards/margins": 1.887337327003479, + "rewards/rejected": -3.4457366466522217, + "step": 5982 + }, + { + "epoch": 0.69, + "learning_rate": 9.44984197588669e-08, + "logits/chosen": -2.70174503326416, + "logits/rejected": -2.732656955718994, + "logps/chosen": -474.74847412109375, + "logps/rejected": -404.0440673828125, + "loss": 0.7118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5188184380531311, + "rewards/margins": 0.8786634206771851, + "rewards/rejected": -1.397481918334961, + "step": 5983 + }, + { + "epoch": 0.69, + "learning_rate": 9.446330328924266e-08, + "logits/chosen": -2.379761219024658, + "logits/rejected": -1.9846501350402832, + "logps/chosen": -214.34181213378906, + "logps/rejected": -300.8945617675781, + "loss": 0.6633, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5550726652145386, + "rewards/margins": 1.5661911964416504, + "rewards/rejected": -3.1212637424468994, + "step": 5984 + }, + { + "epoch": 0.69, + "learning_rate": 9.44281868196184e-08, + "logits/chosen": -2.084810733795166, + "logits/rejected": -2.2346932888031006, + "logps/chosen": -193.8028106689453, + "logps/rejected": -170.73707580566406, + "loss": 0.2135, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22161757946014404, + "rewards/margins": 2.4847288131713867, + "rewards/rejected": -2.2631113529205322, + "step": 5985 + }, + { + "epoch": 0.69, + "learning_rate": 9.439307034999414e-08, + "logits/chosen": -2.5996718406677246, + "logits/rejected": -2.5339512825012207, + "logps/chosen": -336.24200439453125, + "logps/rejected": -363.3848876953125, + "loss": 0.2714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.951052725315094, + "rewards/margins": 2.591172218322754, + "rewards/rejected": -3.542224884033203, + "step": 5986 + }, + { + "epoch": 0.69, + "learning_rate": 9.435795388036988e-08, + "logits/chosen": -2.4796924591064453, + "logits/rejected": -2.6306729316711426, + "logps/chosen": -339.3733825683594, + "logps/rejected": -235.39093017578125, + "loss": 0.6835, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0239853858947754, + "rewards/margins": 1.4818499088287354, + "rewards/rejected": -2.5058352947235107, + "step": 5987 + }, + { + "epoch": 0.69, + "learning_rate": 9.432283741074565e-08, + "logits/chosen": -2.1777286529541016, + "logits/rejected": -2.4232685565948486, + "logps/chosen": -221.280517578125, + "logps/rejected": -244.74514770507812, + "loss": 0.8653, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.148838520050049, + "rewards/margins": 1.1362906694412231, + "rewards/rejected": -3.2851293087005615, + "step": 5988 + }, + { + "epoch": 0.69, + "learning_rate": 9.428772094112139e-08, + "logits/chosen": -2.265195369720459, + "logits/rejected": -2.4277288913726807, + "logps/chosen": -399.97186279296875, + "logps/rejected": -329.6835021972656, + "loss": 0.3886, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1999967098236084, + "rewards/margins": 1.7998549938201904, + "rewards/rejected": -2.999851703643799, + "step": 5989 + }, + { + "epoch": 0.69, + "learning_rate": 9.425260447149713e-08, + "logits/chosen": -2.301534652709961, + "logits/rejected": -1.9846458435058594, + "logps/chosen": -478.6919250488281, + "logps/rejected": -307.1087646484375, + "loss": 0.4455, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.024544358253479, + "rewards/margins": 2.240969657897949, + "rewards/rejected": -3.2655141353607178, + "step": 5990 + }, + { + "epoch": 0.69, + "learning_rate": 9.421748800187287e-08, + "logits/chosen": -2.307227373123169, + "logits/rejected": -2.058861255645752, + "logps/chosen": -197.83944702148438, + "logps/rejected": -245.55789184570312, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1321998834609985, + "rewards/margins": 0.8695933818817139, + "rewards/rejected": -2.001793146133423, + "step": 5991 + }, + { + "epoch": 0.69, + "learning_rate": 9.418237153224862e-08, + "logits/chosen": -2.8862078189849854, + "logits/rejected": -2.809372901916504, + "logps/chosen": -209.11981201171875, + "logps/rejected": -183.2496337890625, + "loss": 0.4494, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6849541068077087, + "rewards/margins": 2.4938197135925293, + "rewards/rejected": -3.1787734031677246, + "step": 5992 + }, + { + "epoch": 0.69, + "learning_rate": 9.414725506262438e-08, + "logits/chosen": -1.4630820751190186, + "logits/rejected": -1.0994439125061035, + "logps/chosen": -316.8012390136719, + "logps/rejected": -390.0954895019531, + "loss": 0.3702, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6213783025741577, + "rewards/margins": 2.3430533409118652, + "rewards/rejected": -2.9644315242767334, + "step": 5993 + }, + { + "epoch": 0.69, + "learning_rate": 9.411213859300012e-08, + "logits/chosen": -2.0561084747314453, + "logits/rejected": -2.129819393157959, + "logps/chosen": -414.20965576171875, + "logps/rejected": -315.29290771484375, + "loss": 0.3752, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33437082171440125, + "rewards/margins": 1.8911246061325073, + "rewards/rejected": -2.2254953384399414, + "step": 5994 + }, + { + "epoch": 0.69, + "learning_rate": 9.407702212337586e-08, + "logits/chosen": -2.3166162967681885, + "logits/rejected": -2.313007354736328, + "logps/chosen": -146.5743865966797, + "logps/rejected": -233.04257202148438, + "loss": 0.1207, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1006057262420654, + "rewards/margins": 5.001900672912598, + "rewards/rejected": -6.102506637573242, + "step": 5995 + }, + { + "epoch": 0.69, + "learning_rate": 9.404190565375161e-08, + "logits/chosen": -2.568479537963867, + "logits/rejected": -2.8012747764587402, + "logps/chosen": -250.25770568847656, + "logps/rejected": -246.3077850341797, + "loss": 0.3557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31088364124298096, + "rewards/margins": 2.346133232116699, + "rewards/rejected": -2.657017230987549, + "step": 5996 + }, + { + "epoch": 0.69, + "learning_rate": 9.400678918412735e-08, + "logits/chosen": -2.008059501647949, + "logits/rejected": -1.708351731300354, + "logps/chosen": -247.32647705078125, + "logps/rejected": -363.0456848144531, + "loss": 0.8066, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3744025230407715, + "rewards/margins": 3.1432323455810547, + "rewards/rejected": -5.517634868621826, + "step": 5997 + }, + { + "epoch": 0.69, + "learning_rate": 9.397167271450309e-08, + "logits/chosen": -2.673511028289795, + "logits/rejected": -2.5705573558807373, + "logps/chosen": -347.96435546875, + "logps/rejected": -192.36392211914062, + "loss": 0.3092, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7467451691627502, + "rewards/margins": 1.6562604904174805, + "rewards/rejected": -2.403005599975586, + "step": 5998 + }, + { + "epoch": 0.69, + "learning_rate": 9.393655624487883e-08, + "logits/chosen": -2.227837324142456, + "logits/rejected": -2.4924566745758057, + "logps/chosen": -457.7298889160156, + "logps/rejected": -244.7939453125, + "loss": 0.469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5677222609519958, + "rewards/margins": 2.366203784942627, + "rewards/rejected": -2.9339261054992676, + "step": 5999 + }, + { + "epoch": 0.69, + "learning_rate": 9.39014397752546e-08, + "logits/chosen": -2.323263168334961, + "logits/rejected": -2.421065330505371, + "logps/chosen": -435.8626403808594, + "logps/rejected": -270.777587890625, + "loss": 0.3106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24794965982437134, + "rewards/margins": 2.1245007514953613, + "rewards/rejected": -2.372450351715088, + "step": 6000 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -1.6403625011444092, + "eval_logits/rejected": -1.516456127166748, + "eval_logps/chosen": -300.6683044433594, + "eval_logps/rejected": -275.23345947265625, + "eval_loss": 0.33387646079063416, + "eval_rewards/accuracies": 0.8285714387893677, + "eval_rewards/chosen": -0.7988124489784241, + "eval_rewards/margins": 2.205794095993042, + "eval_rewards/rejected": -3.0046064853668213, + "eval_runtime": 24.2041, + "eval_samples_per_second": 2.892, + "eval_steps_per_second": 1.446, + "step": 6000 + }, + { + "epoch": 0.69, + "learning_rate": 9.386632330563034e-08, + "logits/chosen": -2.914450168609619, + "logits/rejected": -2.718535900115967, + "logps/chosen": -189.4487762451172, + "logps/rejected": -288.47039794921875, + "loss": 0.5317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5834684371948242, + "rewards/margins": 1.9595532417297363, + "rewards/rejected": -2.5430214405059814, + "step": 6001 + }, + { + "epoch": 0.69, + "learning_rate": 9.383120683600608e-08, + "logits/chosen": -2.4571733474731445, + "logits/rejected": -2.4738550186157227, + "logps/chosen": -351.97637939453125, + "logps/rejected": -285.15362548828125, + "loss": 0.4599, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1855462789535522, + "rewards/margins": 1.1914527416229248, + "rewards/rejected": -2.3769991397857666, + "step": 6002 + }, + { + "epoch": 0.69, + "learning_rate": 9.379609036638182e-08, + "logits/chosen": -2.7960453033447266, + "logits/rejected": -2.7162041664123535, + "logps/chosen": -184.0650634765625, + "logps/rejected": -208.0610809326172, + "loss": 0.345, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6493286490440369, + "rewards/margins": 1.485764503479004, + "rewards/rejected": -2.1350932121276855, + "step": 6003 + }, + { + "epoch": 0.69, + "learning_rate": 9.376097389675756e-08, + "logits/chosen": -2.014759063720703, + "logits/rejected": -2.412592887878418, + "logps/chosen": -325.9913635253906, + "logps/rejected": -159.013671875, + "loss": 0.9011, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5234992504119873, + "rewards/margins": 0.7283097505569458, + "rewards/rejected": -2.2518091201782227, + "step": 6004 + }, + { + "epoch": 0.69, + "learning_rate": 9.372585742713333e-08, + "logits/chosen": -2.455319404602051, + "logits/rejected": -2.703672409057617, + "logps/chosen": -144.07708740234375, + "logps/rejected": -240.46115112304688, + "loss": 0.1616, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6579487323760986, + "rewards/margins": 2.78964900970459, + "rewards/rejected": -3.4475979804992676, + "step": 6005 + }, + { + "epoch": 0.69, + "learning_rate": 9.369074095750907e-08, + "logits/chosen": -2.2890830039978027, + "logits/rejected": -2.7095022201538086, + "logps/chosen": -263.0517272949219, + "logps/rejected": -223.2722930908203, + "loss": 0.2296, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7060090899467468, + "rewards/margins": 2.7878456115722656, + "rewards/rejected": -3.493854522705078, + "step": 6006 + }, + { + "epoch": 0.69, + "learning_rate": 9.365562448788481e-08, + "logits/chosen": -2.6321890354156494, + "logits/rejected": -2.8625972270965576, + "logps/chosen": -309.0157775878906, + "logps/rejected": -297.4720458984375, + "loss": 0.1491, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0303455591201782, + "rewards/margins": 3.673849105834961, + "rewards/rejected": -4.704195022583008, + "step": 6007 + }, + { + "epoch": 0.69, + "learning_rate": 9.362050801826055e-08, + "logits/chosen": -2.771127700805664, + "logits/rejected": -2.716874122619629, + "logps/chosen": -123.25067138671875, + "logps/rejected": -151.19085693359375, + "loss": 0.2821, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7449687719345093, + "rewards/margins": 2.611074924468994, + "rewards/rejected": -3.356043815612793, + "step": 6008 + }, + { + "epoch": 0.69, + "learning_rate": 9.35853915486363e-08, + "logits/chosen": -2.4119620323181152, + "logits/rejected": -2.4699628353118896, + "logps/chosen": -304.6172790527344, + "logps/rejected": -326.6138916015625, + "loss": 0.1099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0001449584960938, + "rewards/margins": 4.456973075866699, + "rewards/rejected": -5.457117557525635, + "step": 6009 + }, + { + "epoch": 0.69, + "learning_rate": 9.355027507901206e-08, + "logits/chosen": -2.6924984455108643, + "logits/rejected": -2.6115188598632812, + "logps/chosen": -328.373291015625, + "logps/rejected": -198.0625762939453, + "loss": 0.5576, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9113095998764038, + "rewards/margins": 1.5804492235183716, + "rewards/rejected": -2.4917588233947754, + "step": 6010 + }, + { + "epoch": 0.69, + "learning_rate": 9.35151586093878e-08, + "logits/chosen": -2.654625415802002, + "logits/rejected": -2.5133774280548096, + "logps/chosen": -194.13584899902344, + "logps/rejected": -207.51795959472656, + "loss": 0.4178, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4817705750465393, + "rewards/margins": 2.8875043392181396, + "rewards/rejected": -3.369274616241455, + "step": 6011 + }, + { + "epoch": 0.69, + "learning_rate": 9.348004213976354e-08, + "logits/chosen": -1.918081521987915, + "logits/rejected": -1.9568582773208618, + "logps/chosen": -236.2742462158203, + "logps/rejected": -320.24603271484375, + "loss": 0.1447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1582513153553009, + "rewards/margins": 3.7246475219726562, + "rewards/rejected": -3.8828988075256348, + "step": 6012 + }, + { + "epoch": 0.69, + "learning_rate": 9.34449256701393e-08, + "logits/chosen": -2.128321886062622, + "logits/rejected": -2.355919122695923, + "logps/chosen": -334.30560302734375, + "logps/rejected": -300.83544921875, + "loss": 0.1572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2795487642288208, + "rewards/margins": 3.1014068126678467, + "rewards/rejected": -3.380955696105957, + "step": 6013 + }, + { + "epoch": 0.69, + "learning_rate": 9.340980920051504e-08, + "logits/chosen": -2.7232141494750977, + "logits/rejected": -2.5948402881622314, + "logps/chosen": -193.44137573242188, + "logps/rejected": -229.1077423095703, + "loss": 1.0696, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4595061540603638, + "rewards/margins": 0.2547004222869873, + "rewards/rejected": -1.714206576347351, + "step": 6014 + }, + { + "epoch": 0.69, + "learning_rate": 9.337469273089078e-08, + "logits/chosen": -1.9151302576065063, + "logits/rejected": -2.1101255416870117, + "logps/chosen": -431.0374755859375, + "logps/rejected": -270.75299072265625, + "loss": 0.2912, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4392200708389282, + "rewards/margins": 1.556518793106079, + "rewards/rejected": -2.995738983154297, + "step": 6015 + }, + { + "epoch": 0.69, + "learning_rate": 9.333957626126652e-08, + "logits/chosen": -2.2024502754211426, + "logits/rejected": -2.431135654449463, + "logps/chosen": -243.6868133544922, + "logps/rejected": -197.44143676757812, + "loss": 0.3054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1963174045085907, + "rewards/margins": 1.4716248512268066, + "rewards/rejected": -1.6679422855377197, + "step": 6016 + }, + { + "epoch": 0.69, + "learning_rate": 9.330445979164228e-08, + "logits/chosen": -2.198483943939209, + "logits/rejected": -1.9259898662567139, + "logps/chosen": -275.1714782714844, + "logps/rejected": -350.1452331542969, + "loss": 0.203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7407848238945007, + "rewards/margins": 2.506074905395508, + "rewards/rejected": -3.246859550476074, + "step": 6017 + }, + { + "epoch": 0.69, + "learning_rate": 9.326934332201802e-08, + "logits/chosen": -2.181814670562744, + "logits/rejected": -2.048032760620117, + "logps/chosen": -261.8662414550781, + "logps/rejected": -262.6600036621094, + "loss": 0.3775, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3077380657196045, + "rewards/margins": 2.0468568801879883, + "rewards/rejected": -3.3545949459075928, + "step": 6018 + }, + { + "epoch": 0.69, + "learning_rate": 9.323422685239376e-08, + "logits/chosen": -2.415172576904297, + "logits/rejected": -2.489560604095459, + "logps/chosen": -358.97601318359375, + "logps/rejected": -252.47393798828125, + "loss": 0.4522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9549793004989624, + "rewards/margins": 1.3022818565368652, + "rewards/rejected": -2.257261276245117, + "step": 6019 + }, + { + "epoch": 0.69, + "learning_rate": 9.31991103827695e-08, + "logits/chosen": -2.4585483074188232, + "logits/rejected": -2.4999992847442627, + "logps/chosen": -197.996826171875, + "logps/rejected": -156.9735107421875, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22745609283447266, + "rewards/margins": 2.2448983192443848, + "rewards/rejected": -2.4723544120788574, + "step": 6020 + }, + { + "epoch": 0.69, + "learning_rate": 9.316399391314527e-08, + "logits/chosen": -2.0662245750427246, + "logits/rejected": -2.296830177307129, + "logps/chosen": -284.0589599609375, + "logps/rejected": -180.39431762695312, + "loss": 1.3289, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.1084766387939453, + "rewards/margins": -0.6328690052032471, + "rewards/rejected": -1.4756076335906982, + "step": 6021 + }, + { + "epoch": 0.69, + "learning_rate": 9.312887744352101e-08, + "logits/chosen": -1.7785272598266602, + "logits/rejected": -1.7673554420471191, + "logps/chosen": -222.17843627929688, + "logps/rejected": -277.27471923828125, + "loss": 0.4508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9733016490936279, + "rewards/margins": 3.091355800628662, + "rewards/rejected": -4.064657211303711, + "step": 6022 + }, + { + "epoch": 0.69, + "learning_rate": 9.309376097389675e-08, + "logits/chosen": -1.6343908309936523, + "logits/rejected": -1.5924687385559082, + "logps/chosen": -391.9369201660156, + "logps/rejected": -404.6414794921875, + "loss": 0.2313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24919061362743378, + "rewards/margins": 2.7039783000946045, + "rewards/rejected": -2.953169107437134, + "step": 6023 + }, + { + "epoch": 0.69, + "learning_rate": 9.30586445042725e-08, + "logits/chosen": -2.4275963306427, + "logits/rejected": -2.1567864418029785, + "logps/chosen": -178.23867797851562, + "logps/rejected": -266.61627197265625, + "loss": 0.3625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0846009254455566, + "rewards/margins": 2.1463232040405273, + "rewards/rejected": -3.230923891067505, + "step": 6024 + }, + { + "epoch": 0.69, + "learning_rate": 9.302352803464825e-08, + "logits/chosen": -1.8129632472991943, + "logits/rejected": -2.2150087356567383, + "logps/chosen": -422.5513916015625, + "logps/rejected": -272.0251159667969, + "loss": 1.1252, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2695162296295166, + "rewards/margins": 0.7695901393890381, + "rewards/rejected": -2.0391061305999756, + "step": 6025 + }, + { + "epoch": 0.69, + "learning_rate": 9.298841156502399e-08, + "logits/chosen": -2.8067092895507812, + "logits/rejected": -2.7533016204833984, + "logps/chosen": -367.3408203125, + "logps/rejected": -443.75872802734375, + "loss": 0.2664, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2593039274215698, + "rewards/margins": 3.2550415992736816, + "rewards/rejected": -4.514345645904541, + "step": 6026 + }, + { + "epoch": 0.69, + "learning_rate": 9.295329509539974e-08, + "logits/chosen": -2.667269229888916, + "logits/rejected": -2.613478899002075, + "logps/chosen": -168.03895568847656, + "logps/rejected": -212.18707275390625, + "loss": 0.2324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2200331687927246, + "rewards/margins": 2.5821456909179688, + "rewards/rejected": -2.8021788597106934, + "step": 6027 + }, + { + "epoch": 0.69, + "learning_rate": 9.291817862577548e-08, + "logits/chosen": -2.4735641479492188, + "logits/rejected": -2.157461643218994, + "logps/chosen": -112.9428939819336, + "logps/rejected": -318.3248291015625, + "loss": 0.5364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4732843041419983, + "rewards/margins": 0.9413328170776367, + "rewards/rejected": -1.4146170616149902, + "step": 6028 + }, + { + "epoch": 0.7, + "learning_rate": 9.288306215615124e-08, + "logits/chosen": -2.3329999446868896, + "logits/rejected": -2.32413649559021, + "logps/chosen": -187.9293212890625, + "logps/rejected": -222.6494598388672, + "loss": 0.3223, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6148558259010315, + "rewards/margins": 2.282191276550293, + "rewards/rejected": -2.8970470428466797, + "step": 6029 + }, + { + "epoch": 0.7, + "learning_rate": 9.284794568652698e-08, + "logits/chosen": -2.815364122390747, + "logits/rejected": -2.7932610511779785, + "logps/chosen": -135.5255889892578, + "logps/rejected": -209.11810302734375, + "loss": 0.1416, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3190436363220215, + "rewards/margins": 3.087862968444824, + "rewards/rejected": -4.4069061279296875, + "step": 6030 + }, + { + "epoch": 0.7, + "learning_rate": 9.281282921690272e-08, + "logits/chosen": -3.017498016357422, + "logits/rejected": -3.0520238876342773, + "logps/chosen": -202.30812072753906, + "logps/rejected": -211.36642456054688, + "loss": 0.2381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7881907224655151, + "rewards/margins": 2.4273264408111572, + "rewards/rejected": -3.215517044067383, + "step": 6031 + }, + { + "epoch": 0.7, + "learning_rate": 9.277771274727846e-08, + "logits/chosen": -2.2786049842834473, + "logits/rejected": -2.3146586418151855, + "logps/chosen": -263.91094970703125, + "logps/rejected": -315.1242370605469, + "loss": 0.3506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8947294354438782, + "rewards/margins": 3.5100934505462646, + "rewards/rejected": -4.404822826385498, + "step": 6032 + }, + { + "epoch": 0.7, + "learning_rate": 9.274259627765423e-08, + "logits/chosen": -1.8633947372436523, + "logits/rejected": -2.420423746109009, + "logps/chosen": -547.3413696289062, + "logps/rejected": -294.3759765625, + "loss": 0.2546, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3919968605041504, + "rewards/margins": 1.5861362218856812, + "rewards/rejected": -2.978133201599121, + "step": 6033 + }, + { + "epoch": 0.7, + "learning_rate": 9.270747980802997e-08, + "logits/chosen": -1.5085406303405762, + "logits/rejected": -2.0291757583618164, + "logps/chosen": -763.5537109375, + "logps/rejected": -452.86663818359375, + "loss": 0.306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3575572371482849, + "rewards/margins": 2.484161376953125, + "rewards/rejected": -2.8417184352874756, + "step": 6034 + }, + { + "epoch": 0.7, + "learning_rate": 9.267236333840571e-08, + "logits/chosen": -2.4532597064971924, + "logits/rejected": -2.5843136310577393, + "logps/chosen": -255.29893493652344, + "logps/rejected": -248.88584899902344, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6747654676437378, + "rewards/margins": 2.7040793895721436, + "rewards/rejected": -3.378844738006592, + "step": 6035 + }, + { + "epoch": 0.7, + "learning_rate": 9.263724686878145e-08, + "logits/chosen": -2.05354642868042, + "logits/rejected": -2.089918851852417, + "logps/chosen": -344.2689208984375, + "logps/rejected": -201.5328826904297, + "loss": 0.2671, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41863465309143066, + "rewards/margins": 1.561349868774414, + "rewards/rejected": -1.9799845218658447, + "step": 6036 + }, + { + "epoch": 0.7, + "learning_rate": 9.26021303991572e-08, + "logits/chosen": -2.0834178924560547, + "logits/rejected": -2.191586494445801, + "logps/chosen": -303.4660949707031, + "logps/rejected": -378.8426513671875, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6030334830284119, + "rewards/margins": 2.0546209812164307, + "rewards/rejected": -2.6576547622680664, + "step": 6037 + }, + { + "epoch": 0.7, + "learning_rate": 9.256701392953296e-08, + "logits/chosen": -2.4598679542541504, + "logits/rejected": -2.3251266479492188, + "logps/chosen": -199.62570190429688, + "logps/rejected": -198.02023315429688, + "loss": 0.3686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.948102593421936, + "rewards/margins": 2.2042198181152344, + "rewards/rejected": -3.152322769165039, + "step": 6038 + }, + { + "epoch": 0.7, + "learning_rate": 9.25318974599087e-08, + "logits/chosen": -2.3875954151153564, + "logits/rejected": -2.2804982662200928, + "logps/chosen": -170.57435607910156, + "logps/rejected": -189.1092071533203, + "loss": 0.7947, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8994736671447754, + "rewards/margins": 1.4156988859176636, + "rewards/rejected": -2.3151726722717285, + "step": 6039 + }, + { + "epoch": 0.7, + "learning_rate": 9.249678099028444e-08, + "logits/chosen": -2.3107750415802, + "logits/rejected": -2.2040724754333496, + "logps/chosen": -134.3994140625, + "logps/rejected": -318.46844482421875, + "loss": 0.2487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6453785300254822, + "rewards/margins": 4.281998634338379, + "rewards/rejected": -4.927376747131348, + "step": 6040 + }, + { + "epoch": 0.7, + "learning_rate": 9.246166452066019e-08, + "logits/chosen": -2.3065268993377686, + "logits/rejected": -2.1390671730041504, + "logps/chosen": -382.1268005371094, + "logps/rejected": -341.0967712402344, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3923703134059906, + "rewards/margins": 2.42275333404541, + "rewards/rejected": -2.8151237964630127, + "step": 6041 + }, + { + "epoch": 0.7, + "learning_rate": 9.242654805103593e-08, + "logits/chosen": -1.4189714193344116, + "logits/rejected": -1.827667474746704, + "logps/chosen": -460.96014404296875, + "logps/rejected": -312.17193603515625, + "loss": 0.6073, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.554799199104309, + "rewards/margins": 0.9610637426376343, + "rewards/rejected": -2.5158629417419434, + "step": 6042 + }, + { + "epoch": 0.7, + "learning_rate": 9.239143158141167e-08, + "logits/chosen": -2.0591022968292236, + "logits/rejected": -1.7497501373291016, + "logps/chosen": -318.9335021972656, + "logps/rejected": -391.9284973144531, + "loss": 1.0275, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1745266914367676, + "rewards/margins": 0.20261861383914948, + "rewards/rejected": -1.3771452903747559, + "step": 6043 + }, + { + "epoch": 0.7, + "learning_rate": 9.235631511178743e-08, + "logits/chosen": -2.619204044342041, + "logits/rejected": -2.566314458847046, + "logps/chosen": -226.07260131835938, + "logps/rejected": -187.2613067626953, + "loss": 0.329, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3478190898895264, + "rewards/margins": 2.716977834701538, + "rewards/rejected": -4.0647969245910645, + "step": 6044 + }, + { + "epoch": 0.7, + "learning_rate": 9.232119864216318e-08, + "logits/chosen": -2.0313894748687744, + "logits/rejected": -2.1697754859924316, + "logps/chosen": -413.12896728515625, + "logps/rejected": -414.0924072265625, + "loss": 0.5689, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0425910949707031, + "rewards/margins": 1.4168579578399658, + "rewards/rejected": -2.45944881439209, + "step": 6045 + }, + { + "epoch": 0.7, + "learning_rate": 9.228608217253892e-08, + "logits/chosen": -2.3839030265808105, + "logits/rejected": -2.3940203189849854, + "logps/chosen": -230.7241668701172, + "logps/rejected": -284.66680908203125, + "loss": 0.4857, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5200244784355164, + "rewards/margins": 1.6872754096984863, + "rewards/rejected": -2.2072999477386475, + "step": 6046 + }, + { + "epoch": 0.7, + "learning_rate": 9.225096570291466e-08, + "logits/chosen": -2.6413185596466064, + "logits/rejected": -2.777343988418579, + "logps/chosen": -278.3441467285156, + "logps/rejected": -306.32781982421875, + "loss": 0.579, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5405857563018799, + "rewards/margins": 3.0929391384124756, + "rewards/rejected": -3.6335253715515137, + "step": 6047 + }, + { + "epoch": 0.7, + "learning_rate": 9.22158492332904e-08, + "logits/chosen": -2.5528478622436523, + "logits/rejected": -2.749799966812134, + "logps/chosen": -415.31292724609375, + "logps/rejected": -268.780517578125, + "loss": 0.184, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3223453760147095, + "rewards/margins": 2.6272835731506348, + "rewards/rejected": -2.9496285915374756, + "step": 6048 + }, + { + "epoch": 0.7, + "learning_rate": 9.218073276366617e-08, + "logits/chosen": -2.6671512126922607, + "logits/rejected": -2.605898141860962, + "logps/chosen": -245.50205993652344, + "logps/rejected": -262.3852233886719, + "loss": 0.572, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5344506502151489, + "rewards/margins": 1.3796125650405884, + "rewards/rejected": -1.9140632152557373, + "step": 6049 + }, + { + "epoch": 0.7, + "learning_rate": 9.214561629404191e-08, + "logits/chosen": -2.711176633834839, + "logits/rejected": -2.712951898574829, + "logps/chosen": -399.3323059082031, + "logps/rejected": -321.98455810546875, + "loss": 0.3082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.255420446395874, + "rewards/margins": 2.5712997913360596, + "rewards/rejected": -2.8267202377319336, + "step": 6050 + }, + { + "epoch": 0.7, + "learning_rate": 9.211049982441765e-08, + "logits/chosen": -2.88053560256958, + "logits/rejected": -2.3504929542541504, + "logps/chosen": -343.2284851074219, + "logps/rejected": -299.3956298828125, + "loss": 0.6696, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3285987377166748, + "rewards/margins": 2.2626471519470215, + "rewards/rejected": -3.5912461280822754, + "step": 6051 + }, + { + "epoch": 0.7, + "learning_rate": 9.207538335479339e-08, + "logits/chosen": -2.4468789100646973, + "logits/rejected": -2.363882064819336, + "logps/chosen": -216.85926818847656, + "logps/rejected": -289.4908752441406, + "loss": 0.4011, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1352407932281494, + "rewards/margins": 3.5302786827087402, + "rewards/rejected": -4.665519714355469, + "step": 6052 + }, + { + "epoch": 0.7, + "learning_rate": 9.204026688516913e-08, + "logits/chosen": -2.1185646057128906, + "logits/rejected": -1.953892707824707, + "logps/chosen": -229.5150146484375, + "logps/rejected": -343.24664306640625, + "loss": 0.6047, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2460556030273438, + "rewards/margins": 2.540813684463501, + "rewards/rejected": -3.786869525909424, + "step": 6053 + }, + { + "epoch": 0.7, + "learning_rate": 9.200515041554488e-08, + "logits/chosen": -2.357841968536377, + "logits/rejected": -2.559157609939575, + "logps/chosen": -424.5704650878906, + "logps/rejected": -296.73284912109375, + "loss": 0.3352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6290737390518188, + "rewards/margins": 2.6320676803588867, + "rewards/rejected": -3.261141300201416, + "step": 6054 + }, + { + "epoch": 0.7, + "learning_rate": 9.197003394592064e-08, + "logits/chosen": -2.097902536392212, + "logits/rejected": -2.4168362617492676, + "logps/chosen": -225.81326293945312, + "logps/rejected": -174.63604736328125, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7608486413955688, + "rewards/margins": 1.2794275283813477, + "rewards/rejected": -2.040276288986206, + "step": 6055 + }, + { + "epoch": 0.7, + "learning_rate": 9.193491747629638e-08, + "logits/chosen": -2.497854709625244, + "logits/rejected": -2.60089373588562, + "logps/chosen": -205.7002410888672, + "logps/rejected": -310.4296569824219, + "loss": 0.711, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3775787353515625, + "rewards/margins": 2.4161975383758545, + "rewards/rejected": -4.793776512145996, + "step": 6056 + }, + { + "epoch": 0.7, + "learning_rate": 9.189980100667212e-08, + "logits/chosen": -2.4807653427124023, + "logits/rejected": -2.463392972946167, + "logps/chosen": -233.71234130859375, + "logps/rejected": -179.40402221679688, + "loss": 0.2478, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8600165247917175, + "rewards/margins": 2.6546006202697754, + "rewards/rejected": -3.514617443084717, + "step": 6057 + }, + { + "epoch": 0.7, + "learning_rate": 9.186468453704787e-08, + "logits/chosen": -2.1290273666381836, + "logits/rejected": -2.013575315475464, + "logps/chosen": -149.43621826171875, + "logps/rejected": -336.8380126953125, + "loss": 0.307, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6113423109054565, + "rewards/margins": 1.9769707918167114, + "rewards/rejected": -2.588313102722168, + "step": 6058 + }, + { + "epoch": 0.7, + "learning_rate": 9.182956806742361e-08, + "logits/chosen": -2.8379952907562256, + "logits/rejected": -2.849984645843506, + "logps/chosen": -360.39154052734375, + "logps/rejected": -269.68316650390625, + "loss": 0.1851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025176823139190674, + "rewards/margins": 2.715923547744751, + "rewards/rejected": -2.741100311279297, + "step": 6059 + }, + { + "epoch": 0.7, + "learning_rate": 9.179445159779936e-08, + "logits/chosen": -2.6582717895507812, + "logits/rejected": -2.646254062652588, + "logps/chosen": -160.3884735107422, + "logps/rejected": -198.54388427734375, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5683735013008118, + "rewards/margins": 2.1344714164733887, + "rewards/rejected": -2.7028450965881348, + "step": 6060 + }, + { + "epoch": 0.7, + "learning_rate": 9.175933512817511e-08, + "logits/chosen": -2.2392170429229736, + "logits/rejected": -2.175208806991577, + "logps/chosen": -305.38287353515625, + "logps/rejected": -272.3172607421875, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9149765968322754, + "rewards/margins": 2.734248638153076, + "rewards/rejected": -4.649224758148193, + "step": 6061 + }, + { + "epoch": 0.7, + "learning_rate": 9.172421865855086e-08, + "logits/chosen": -2.066453218460083, + "logits/rejected": -2.0282015800476074, + "logps/chosen": -209.70294189453125, + "logps/rejected": -311.35980224609375, + "loss": 0.3006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8263659477233887, + "rewards/margins": 2.5450215339660645, + "rewards/rejected": -3.3713877201080322, + "step": 6062 + }, + { + "epoch": 0.7, + "learning_rate": 9.16891021889266e-08, + "logits/chosen": -2.626450300216675, + "logits/rejected": -2.8772096633911133, + "logps/chosen": -266.0572509765625, + "logps/rejected": -300.48944091796875, + "loss": 0.2398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7599086761474609, + "rewards/margins": 1.9075181484222412, + "rewards/rejected": -2.667426824569702, + "step": 6063 + }, + { + "epoch": 0.7, + "learning_rate": 9.165398571930234e-08, + "logits/chosen": -2.0654492378234863, + "logits/rejected": -1.8805046081542969, + "logps/chosen": -285.2037353515625, + "logps/rejected": -405.9892578125, + "loss": 0.8987, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9335017204284668, + "rewards/margins": 0.7155274152755737, + "rewards/rejected": -1.6490291357040405, + "step": 6064 + }, + { + "epoch": 0.7, + "learning_rate": 9.161886924967808e-08, + "logits/chosen": -2.383409261703491, + "logits/rejected": -2.540316104888916, + "logps/chosen": -243.58096313476562, + "logps/rejected": -255.0548553466797, + "loss": 0.2386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3485856056213379, + "rewards/margins": 2.7335293292999268, + "rewards/rejected": -3.0821151733398438, + "step": 6065 + }, + { + "epoch": 0.7, + "learning_rate": 9.158375278005385e-08, + "logits/chosen": -2.1452536582946777, + "logits/rejected": -2.3384461402893066, + "logps/chosen": -452.43408203125, + "logps/rejected": -313.0594787597656, + "loss": 0.1553, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.003443717956543, + "rewards/margins": 2.7672834396362305, + "rewards/rejected": -3.7707271575927734, + "step": 6066 + }, + { + "epoch": 0.7, + "learning_rate": 9.154863631042959e-08, + "logits/chosen": -2.7237417697906494, + "logits/rejected": -2.594980001449585, + "logps/chosen": -225.73831176757812, + "logps/rejected": -255.04837036132812, + "loss": 0.5078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9261542558670044, + "rewards/margins": 1.1693512201309204, + "rewards/rejected": -2.095505475997925, + "step": 6067 + }, + { + "epoch": 0.7, + "learning_rate": 9.151351984080533e-08, + "logits/chosen": -2.663827896118164, + "logits/rejected": -2.5198097229003906, + "logps/chosen": -279.23333740234375, + "logps/rejected": -258.04119873046875, + "loss": 0.1018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46151843667030334, + "rewards/margins": 2.8479416370391846, + "rewards/rejected": -3.309460163116455, + "step": 6068 + }, + { + "epoch": 0.7, + "learning_rate": 9.147840337118107e-08, + "logits/chosen": -2.2257497310638428, + "logits/rejected": -2.2216944694519043, + "logps/chosen": -322.33013916015625, + "logps/rejected": -466.45574951171875, + "loss": 0.3236, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.642626166343689, + "rewards/margins": 1.7987874746322632, + "rewards/rejected": -2.441413640975952, + "step": 6069 + }, + { + "epoch": 0.7, + "learning_rate": 9.144328690155683e-08, + "logits/chosen": -2.313450813293457, + "logits/rejected": -2.39968204498291, + "logps/chosen": -277.35955810546875, + "logps/rejected": -252.26885986328125, + "loss": 0.3274, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0733577013015747, + "rewards/margins": 2.4503421783447266, + "rewards/rejected": -3.523699998855591, + "step": 6070 + }, + { + "epoch": 0.7, + "learning_rate": 9.140817043193257e-08, + "logits/chosen": -1.6233069896697998, + "logits/rejected": -1.9909138679504395, + "logps/chosen": -236.85076904296875, + "logps/rejected": -172.46353149414062, + "loss": 0.3989, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6169875860214233, + "rewards/margins": 1.0906460285186768, + "rewards/rejected": -1.7076336145401, + "step": 6071 + }, + { + "epoch": 0.7, + "learning_rate": 9.137305396230832e-08, + "logits/chosen": -2.268157482147217, + "logits/rejected": -2.6195926666259766, + "logps/chosen": -430.1422424316406, + "logps/rejected": -255.8083953857422, + "loss": 0.287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3810442090034485, + "rewards/margins": 1.9833154678344727, + "rewards/rejected": -2.3643598556518555, + "step": 6072 + }, + { + "epoch": 0.7, + "learning_rate": 9.133793749268406e-08, + "logits/chosen": -2.0536417961120605, + "logits/rejected": -2.2252583503723145, + "logps/chosen": -259.6264343261719, + "logps/rejected": -246.66259765625, + "loss": 0.5814, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7084987163543701, + "rewards/margins": 0.8145111203193665, + "rewards/rejected": -2.523010015487671, + "step": 6073 + }, + { + "epoch": 0.7, + "learning_rate": 9.130282102305982e-08, + "logits/chosen": -1.7497096061706543, + "logits/rejected": -2.173398017883301, + "logps/chosen": -449.710205078125, + "logps/rejected": -286.6781005859375, + "loss": 0.3478, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0142745971679688, + "rewards/margins": 2.0319981575012207, + "rewards/rejected": -3.0462727546691895, + "step": 6074 + }, + { + "epoch": 0.7, + "learning_rate": 9.126770455343556e-08, + "logits/chosen": -1.9423695802688599, + "logits/rejected": -1.836917519569397, + "logps/chosen": -260.2427978515625, + "logps/rejected": -406.6575012207031, + "loss": 0.4101, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7072453498840332, + "rewards/margins": 2.8950090408325195, + "rewards/rejected": -3.602254629135132, + "step": 6075 + }, + { + "epoch": 0.7, + "learning_rate": 9.12325880838113e-08, + "logits/chosen": -2.2296602725982666, + "logits/rejected": -2.6464922428131104, + "logps/chosen": -218.22084045410156, + "logps/rejected": -165.37355041503906, + "loss": 0.8014, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.528538703918457, + "rewards/margins": 0.6045668125152588, + "rewards/rejected": -3.1331052780151367, + "step": 6076 + }, + { + "epoch": 0.7, + "learning_rate": 9.119747161418704e-08, + "logits/chosen": -2.7322638034820557, + "logits/rejected": -2.813605546951294, + "logps/chosen": -286.8382873535156, + "logps/rejected": -210.1479034423828, + "loss": 2.0124, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.3111376762390137, + "rewards/margins": -0.3515458106994629, + "rewards/rejected": -2.9595916271209717, + "step": 6077 + }, + { + "epoch": 0.7, + "learning_rate": 9.11623551445628e-08, + "logits/chosen": -2.1815128326416016, + "logits/rejected": -2.285156726837158, + "logps/chosen": -204.93496704101562, + "logps/rejected": -275.1066589355469, + "loss": 0.4083, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8094245195388794, + "rewards/margins": 1.9453128576278687, + "rewards/rejected": -3.754737377166748, + "step": 6078 + }, + { + "epoch": 0.7, + "learning_rate": 9.112723867493855e-08, + "logits/chosen": -2.806317090988159, + "logits/rejected": -2.632626533508301, + "logps/chosen": -151.15745544433594, + "logps/rejected": -305.0195007324219, + "loss": 0.1018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39969998598098755, + "rewards/margins": 3.126113176345825, + "rewards/rejected": -3.525813102722168, + "step": 6079 + }, + { + "epoch": 0.7, + "learning_rate": 9.109212220531429e-08, + "logits/chosen": -2.474453926086426, + "logits/rejected": -2.632815361022949, + "logps/chosen": -401.77587890625, + "logps/rejected": -259.293701171875, + "loss": 0.6722, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.097545862197876, + "rewards/margins": 2.2688186168670654, + "rewards/rejected": -3.3663642406463623, + "step": 6080 + }, + { + "epoch": 0.7, + "learning_rate": 9.105700573569003e-08, + "logits/chosen": -2.610790491104126, + "logits/rejected": -2.7020950317382812, + "logps/chosen": -160.89610290527344, + "logps/rejected": -121.808837890625, + "loss": 0.4928, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1964125633239746, + "rewards/margins": 0.779996395111084, + "rewards/rejected": -1.9764089584350586, + "step": 6081 + }, + { + "epoch": 0.7, + "learning_rate": 9.10218892660658e-08, + "logits/chosen": -2.656421184539795, + "logits/rejected": -2.5565459728240967, + "logps/chosen": -140.84214782714844, + "logps/rejected": -286.2243347167969, + "loss": 0.3511, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.790370762348175, + "rewards/margins": 2.3586630821228027, + "rewards/rejected": -3.149034023284912, + "step": 6082 + }, + { + "epoch": 0.7, + "learning_rate": 9.098677279644153e-08, + "logits/chosen": -2.8850207328796387, + "logits/rejected": -3.026606798171997, + "logps/chosen": -157.0398712158203, + "logps/rejected": -171.591064453125, + "loss": 0.2334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.639042854309082, + "rewards/margins": 2.9132261276245117, + "rewards/rejected": -3.5522689819335938, + "step": 6083 + }, + { + "epoch": 0.7, + "learning_rate": 9.095165632681728e-08, + "logits/chosen": -2.0298609733581543, + "logits/rejected": -1.9037927389144897, + "logps/chosen": -384.67327880859375, + "logps/rejected": -458.96728515625, + "loss": 0.2322, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3119359016418457, + "rewards/margins": 2.884725332260132, + "rewards/rejected": -4.196661472320557, + "step": 6084 + }, + { + "epoch": 0.7, + "learning_rate": 9.091653985719302e-08, + "logits/chosen": -2.3108463287353516, + "logits/rejected": -2.401837110519409, + "logps/chosen": -350.689208984375, + "logps/rejected": -430.6939392089844, + "loss": 0.2004, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21640683710575104, + "rewards/margins": 3.588820695877075, + "rewards/rejected": -3.3724138736724854, + "step": 6085 + }, + { + "epoch": 0.7, + "learning_rate": 9.088142338756877e-08, + "logits/chosen": -2.7962772846221924, + "logits/rejected": -2.7057156562805176, + "logps/chosen": -303.6152038574219, + "logps/rejected": -319.5945739746094, + "loss": 0.1433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9988689422607422, + "rewards/margins": 3.368952512741089, + "rewards/rejected": -4.36782169342041, + "step": 6086 + }, + { + "epoch": 0.7, + "learning_rate": 9.084630691794451e-08, + "logits/chosen": -2.196558713912964, + "logits/rejected": -2.2271382808685303, + "logps/chosen": -251.28646850585938, + "logps/rejected": -237.97560119628906, + "loss": 0.4153, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3910335302352905, + "rewards/margins": 1.7061039209365845, + "rewards/rejected": -3.097137451171875, + "step": 6087 + }, + { + "epoch": 0.7, + "learning_rate": 9.081119044832025e-08, + "logits/chosen": -2.295334815979004, + "logits/rejected": -2.192410469055176, + "logps/chosen": -270.8932800292969, + "logps/rejected": -438.99700927734375, + "loss": 0.1536, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1333361864089966, + "rewards/margins": 4.399778366088867, + "rewards/rejected": -5.533114433288574, + "step": 6088 + }, + { + "epoch": 0.7, + "learning_rate": 9.0776073978696e-08, + "logits/chosen": -2.8280091285705566, + "logits/rejected": -2.6013619899749756, + "logps/chosen": -243.79518127441406, + "logps/rejected": -270.0487976074219, + "loss": 0.222, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5434753894805908, + "rewards/margins": 2.2006304264068604, + "rewards/rejected": -3.744105815887451, + "step": 6089 + }, + { + "epoch": 0.7, + "learning_rate": 9.074095750907176e-08, + "logits/chosen": -2.112873077392578, + "logits/rejected": -2.1528854370117188, + "logps/chosen": -354.19873046875, + "logps/rejected": -308.7135925292969, + "loss": 0.3675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8939628601074219, + "rewards/margins": 2.5562527179718018, + "rewards/rejected": -3.4502155780792236, + "step": 6090 + }, + { + "epoch": 0.7, + "learning_rate": 9.07058410394475e-08, + "logits/chosen": -2.0785083770751953, + "logits/rejected": -2.1618006229400635, + "logps/chosen": -252.21726989746094, + "logps/rejected": -293.1405944824219, + "loss": 0.3889, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7915440797805786, + "rewards/margins": 3.25028657913208, + "rewards/rejected": -4.041830539703369, + "step": 6091 + }, + { + "epoch": 0.7, + "learning_rate": 9.067072456982324e-08, + "logits/chosen": -2.1962428092956543, + "logits/rejected": -2.1556901931762695, + "logps/chosen": -256.8179626464844, + "logps/rejected": -282.80206298828125, + "loss": 0.4005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.542863130569458, + "rewards/margins": 2.6654701232910156, + "rewards/rejected": -3.2083334922790527, + "step": 6092 + }, + { + "epoch": 0.7, + "learning_rate": 9.063560810019898e-08, + "logits/chosen": -2.497468948364258, + "logits/rejected": -2.4551517963409424, + "logps/chosen": -192.83871459960938, + "logps/rejected": -243.23370361328125, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41034555435180664, + "rewards/margins": 3.025937080383301, + "rewards/rejected": -3.4362826347351074, + "step": 6093 + }, + { + "epoch": 0.7, + "learning_rate": 9.060049163057475e-08, + "logits/chosen": -1.940855622291565, + "logits/rejected": -1.996640920639038, + "logps/chosen": -290.8397216796875, + "logps/rejected": -270.15960693359375, + "loss": 0.4965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4025501608848572, + "rewards/margins": 1.8477318286895752, + "rewards/rejected": -2.250281810760498, + "step": 6094 + }, + { + "epoch": 0.7, + "learning_rate": 9.056537516095049e-08, + "logits/chosen": -2.146371603012085, + "logits/rejected": -1.9846683740615845, + "logps/chosen": -261.3391418457031, + "logps/rejected": -225.1836395263672, + "loss": 0.4242, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.148637056350708, + "rewards/margins": 1.5632381439208984, + "rewards/rejected": -2.7118749618530273, + "step": 6095 + }, + { + "epoch": 0.7, + "learning_rate": 9.053025869132623e-08, + "logits/chosen": -2.277174472808838, + "logits/rejected": -2.3362085819244385, + "logps/chosen": -274.1580810546875, + "logps/rejected": -244.8883056640625, + "loss": 0.4685, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2799217700958252, + "rewards/margins": 1.1884372234344482, + "rewards/rejected": -2.4683589935302734, + "step": 6096 + }, + { + "epoch": 0.7, + "learning_rate": 9.049514222170197e-08, + "logits/chosen": -2.38399076461792, + "logits/rejected": -2.331813335418701, + "logps/chosen": -164.30564880371094, + "logps/rejected": -266.353271484375, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6689045429229736, + "rewards/margins": 3.8773419857025146, + "rewards/rejected": -4.5462470054626465, + "step": 6097 + }, + { + "epoch": 0.7, + "learning_rate": 9.046002575207771e-08, + "logits/chosen": -2.489548444747925, + "logits/rejected": -2.705186605453491, + "logps/chosen": -218.4268798828125, + "logps/rejected": -267.13018798828125, + "loss": 0.2922, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7575407028198242, + "rewards/margins": 2.1347815990448, + "rewards/rejected": -3.892322063446045, + "step": 6098 + }, + { + "epoch": 0.7, + "learning_rate": 9.042490928245348e-08, + "logits/chosen": -2.3652448654174805, + "logits/rejected": -2.4244754314422607, + "logps/chosen": -326.3142395019531, + "logps/rejected": -337.103759765625, + "loss": 0.1789, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.018585443496704, + "rewards/margins": 4.149412631988525, + "rewards/rejected": -5.167998313903809, + "step": 6099 + }, + { + "epoch": 0.7, + "learning_rate": 9.038979281282922e-08, + "logits/chosen": -2.0188546180725098, + "logits/rejected": -2.1195929050445557, + "logps/chosen": -432.4693603515625, + "logps/rejected": -221.3465118408203, + "loss": 0.436, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0437607765197754, + "rewards/margins": 1.3250956535339355, + "rewards/rejected": -2.368856430053711, + "step": 6100 + }, + { + "epoch": 0.7, + "learning_rate": 9.035467634320496e-08, + "logits/chosen": -2.847350597381592, + "logits/rejected": -2.8895602226257324, + "logps/chosen": -131.0765838623047, + "logps/rejected": -150.47703552246094, + "loss": 0.2743, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0340982675552368, + "rewards/margins": 2.392296314239502, + "rewards/rejected": -3.4263947010040283, + "step": 6101 + }, + { + "epoch": 0.7, + "learning_rate": 9.03195598735807e-08, + "logits/chosen": -2.2960290908813477, + "logits/rejected": -2.0596365928649902, + "logps/chosen": -176.66287231445312, + "logps/rejected": -320.7604675292969, + "loss": 0.2107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4428701400756836, + "rewards/margins": 3.936331272125244, + "rewards/rejected": -4.379201889038086, + "step": 6102 + }, + { + "epoch": 0.7, + "learning_rate": 9.028444340395645e-08, + "logits/chosen": -2.197458267211914, + "logits/rejected": -2.430166721343994, + "logps/chosen": -575.4521484375, + "logps/rejected": -237.2190399169922, + "loss": 0.5123, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1251893043518066, + "rewards/margins": 0.8283931016921997, + "rewards/rejected": -1.9535824060440063, + "step": 6103 + }, + { + "epoch": 0.7, + "learning_rate": 9.02493269343322e-08, + "logits/chosen": -2.096060276031494, + "logits/rejected": -2.319383382797241, + "logps/chosen": -370.24615478515625, + "logps/rejected": -345.7449951171875, + "loss": 0.4839, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0246026515960693, + "rewards/margins": 1.4290101528167725, + "rewards/rejected": -2.453612804412842, + "step": 6104 + }, + { + "epoch": 0.7, + "learning_rate": 9.021421046470793e-08, + "logits/chosen": -2.5132415294647217, + "logits/rejected": -2.442438840866089, + "logps/chosen": -273.9084167480469, + "logps/rejected": -275.3569030761719, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5104299783706665, + "rewards/margins": 2.7880163192749023, + "rewards/rejected": -3.2984461784362793, + "step": 6105 + }, + { + "epoch": 0.7, + "learning_rate": 9.017909399508369e-08, + "logits/chosen": -2.3015377521514893, + "logits/rejected": -2.4222750663757324, + "logps/chosen": -261.1833190917969, + "logps/rejected": -231.75234985351562, + "loss": 0.5454, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1605298519134521, + "rewards/margins": 1.0598162412643433, + "rewards/rejected": -2.220345973968506, + "step": 6106 + }, + { + "epoch": 0.7, + "learning_rate": 9.014397752545944e-08, + "logits/chosen": -2.0930094718933105, + "logits/rejected": -2.1750848293304443, + "logps/chosen": -315.0576171875, + "logps/rejected": -222.39944458007812, + "loss": 0.2567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.545478343963623, + "rewards/margins": 2.196843385696411, + "rewards/rejected": -2.742321729660034, + "step": 6107 + }, + { + "epoch": 0.7, + "learning_rate": 9.010886105583518e-08, + "logits/chosen": -1.5973973274230957, + "logits/rejected": -1.5819926261901855, + "logps/chosen": -240.2794647216797, + "logps/rejected": -285.83221435546875, + "loss": 0.7512, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0345209836959839, + "rewards/margins": 0.9356305599212646, + "rewards/rejected": -1.970151424407959, + "step": 6108 + }, + { + "epoch": 0.7, + "learning_rate": 9.007374458621092e-08, + "logits/chosen": -2.5603537559509277, + "logits/rejected": -2.3631350994110107, + "logps/chosen": -147.57716369628906, + "logps/rejected": -231.0362091064453, + "loss": 0.3728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17646369338035583, + "rewards/margins": 1.5773206949234009, + "rewards/rejected": -1.7537842988967896, + "step": 6109 + }, + { + "epoch": 0.7, + "learning_rate": 9.003862811658666e-08, + "logits/chosen": -2.166748285293579, + "logits/rejected": -2.3997597694396973, + "logps/chosen": -390.64776611328125, + "logps/rejected": -348.7510986328125, + "loss": 0.2986, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2932277917861938, + "rewards/margins": 1.8098875284194946, + "rewards/rejected": -3.1031153202056885, + "step": 6110 + }, + { + "epoch": 0.7, + "learning_rate": 9.000351164696243e-08, + "logits/chosen": -2.0373668670654297, + "logits/rejected": -2.248993396759033, + "logps/chosen": -347.1954345703125, + "logps/rejected": -277.07501220703125, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5690158605575562, + "rewards/margins": 2.5872268676757812, + "rewards/rejected": -2.0182108879089355, + "step": 6111 + }, + { + "epoch": 0.7, + "learning_rate": 8.996839517733817e-08, + "logits/chosen": -2.2109174728393555, + "logits/rejected": -2.2749457359313965, + "logps/chosen": -196.2833251953125, + "logps/rejected": -268.78814697265625, + "loss": 0.3899, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0131326913833618, + "rewards/margins": 2.242460250854492, + "rewards/rejected": -3.2555930614471436, + "step": 6112 + }, + { + "epoch": 0.7, + "learning_rate": 8.993327870771391e-08, + "logits/chosen": -2.529012441635132, + "logits/rejected": -2.349348545074463, + "logps/chosen": -152.39190673828125, + "logps/rejected": -323.6757507324219, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6848295331001282, + "rewards/margins": 2.681802272796631, + "rewards/rejected": -3.366631507873535, + "step": 6113 + }, + { + "epoch": 0.7, + "learning_rate": 8.989816223808965e-08, + "logits/chosen": -2.658557891845703, + "logits/rejected": -2.672640323638916, + "logps/chosen": -266.7954406738281, + "logps/rejected": -333.3808288574219, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.988750696182251, + "rewards/margins": 1.9183460474014282, + "rewards/rejected": -2.9070966243743896, + "step": 6114 + }, + { + "epoch": 0.7, + "learning_rate": 8.986304576846541e-08, + "logits/chosen": -2.1746349334716797, + "logits/rejected": -1.9935827255249023, + "logps/chosen": -261.2762756347656, + "logps/rejected": -317.6075134277344, + "loss": 0.2707, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6536039113998413, + "rewards/margins": 2.173698663711548, + "rewards/rejected": -3.8273026943206787, + "step": 6115 + }, + { + "epoch": 0.71, + "learning_rate": 8.982792929884116e-08, + "logits/chosen": -2.8277945518493652, + "logits/rejected": -2.6748390197753906, + "logps/chosen": -109.9328384399414, + "logps/rejected": -156.9752655029297, + "loss": 0.1829, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9331077337265015, + "rewards/margins": 2.3547449111938477, + "rewards/rejected": -3.2878522872924805, + "step": 6116 + }, + { + "epoch": 0.71, + "learning_rate": 8.97928128292169e-08, + "logits/chosen": -2.6330783367156982, + "logits/rejected": -2.5492310523986816, + "logps/chosen": -317.43292236328125, + "logps/rejected": -196.96876525878906, + "loss": 0.3646, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1628271341323853, + "rewards/margins": 1.356303334236145, + "rewards/rejected": -2.5191304683685303, + "step": 6117 + }, + { + "epoch": 0.71, + "learning_rate": 8.975769635959264e-08, + "logits/chosen": -2.555044651031494, + "logits/rejected": -2.346109390258789, + "logps/chosen": -271.5126953125, + "logps/rejected": -249.34442138671875, + "loss": 0.7019, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6914716958999634, + "rewards/margins": 0.6800728440284729, + "rewards/rejected": -2.371544361114502, + "step": 6118 + }, + { + "epoch": 0.71, + "learning_rate": 8.97225798899684e-08, + "logits/chosen": -2.5934882164001465, + "logits/rejected": -2.6651718616485596, + "logps/chosen": -125.58202362060547, + "logps/rejected": -214.47996520996094, + "loss": 0.2394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3544688820838928, + "rewards/margins": 3.117337465286255, + "rewards/rejected": -3.471806287765503, + "step": 6119 + }, + { + "epoch": 0.71, + "learning_rate": 8.968746342034414e-08, + "logits/chosen": -2.598773956298828, + "logits/rejected": -2.4138991832733154, + "logps/chosen": -230.00045776367188, + "logps/rejected": -298.6784973144531, + "loss": 0.5239, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.135131597518921, + "rewards/margins": 2.1409621238708496, + "rewards/rejected": -4.276093482971191, + "step": 6120 + }, + { + "epoch": 0.71, + "learning_rate": 8.965234695071988e-08, + "logits/chosen": -1.8508914709091187, + "logits/rejected": -1.7490363121032715, + "logps/chosen": -402.9183349609375, + "logps/rejected": -237.3831787109375, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1811159998178482, + "rewards/margins": 1.61514151096344, + "rewards/rejected": -1.796257495880127, + "step": 6121 + }, + { + "epoch": 0.71, + "learning_rate": 8.961723048109562e-08, + "logits/chosen": -2.019672393798828, + "logits/rejected": -2.3779923915863037, + "logps/chosen": -169.24349975585938, + "logps/rejected": -188.91641235351562, + "loss": 1.0805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9095783233642578, + "rewards/margins": 2.2481558322906494, + "rewards/rejected": -4.157734394073486, + "step": 6122 + }, + { + "epoch": 0.71, + "learning_rate": 8.958211401147138e-08, + "logits/chosen": -2.4682161808013916, + "logits/rejected": -2.5538156032562256, + "logps/chosen": -313.90350341796875, + "logps/rejected": -373.7087707519531, + "loss": 0.4635, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6222796440124512, + "rewards/margins": 1.4966480731964111, + "rewards/rejected": -3.1189277172088623, + "step": 6123 + }, + { + "epoch": 0.71, + "learning_rate": 8.954699754184713e-08, + "logits/chosen": -2.1356353759765625, + "logits/rejected": -2.574097156524658, + "logps/chosen": -268.2745361328125, + "logps/rejected": -202.589599609375, + "loss": 0.8242, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9599130153656006, + "rewards/margins": 0.34853416681289673, + "rewards/rejected": -2.3084471225738525, + "step": 6124 + }, + { + "epoch": 0.71, + "learning_rate": 8.951188107222287e-08, + "logits/chosen": -2.154444694519043, + "logits/rejected": -2.59126353263855, + "logps/chosen": -353.0377502441406, + "logps/rejected": -221.9720458984375, + "loss": 0.5882, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5893027782440186, + "rewards/margins": 2.3154828548431396, + "rewards/rejected": -3.904785633087158, + "step": 6125 + }, + { + "epoch": 0.71, + "learning_rate": 8.94767646025986e-08, + "logits/chosen": -2.3135592937469482, + "logits/rejected": -2.3227298259735107, + "logps/chosen": -248.86639404296875, + "logps/rejected": -134.67286682128906, + "loss": 0.6681, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3207417726516724, + "rewards/margins": 0.8972508311271667, + "rewards/rejected": -2.2179925441741943, + "step": 6126 + }, + { + "epoch": 0.71, + "learning_rate": 8.944164813297437e-08, + "logits/chosen": -2.4001708030700684, + "logits/rejected": -2.4109086990356445, + "logps/chosen": -371.990234375, + "logps/rejected": -280.27081298828125, + "loss": 0.1369, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1175740957260132, + "rewards/margins": 2.8405439853668213, + "rewards/rejected": -3.958118200302124, + "step": 6127 + }, + { + "epoch": 0.71, + "learning_rate": 8.940653166335011e-08, + "logits/chosen": -2.5538575649261475, + "logits/rejected": -2.5187273025512695, + "logps/chosen": -113.33338165283203, + "logps/rejected": -165.78121948242188, + "loss": 0.369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5716372132301331, + "rewards/margins": 2.2626636028289795, + "rewards/rejected": -2.8343007564544678, + "step": 6128 + }, + { + "epoch": 0.71, + "learning_rate": 8.937141519372585e-08, + "logits/chosen": -2.1050846576690674, + "logits/rejected": -1.8078359365463257, + "logps/chosen": -183.17193603515625, + "logps/rejected": -281.12969970703125, + "loss": 0.7202, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7360919713974, + "rewards/margins": 1.2545721530914307, + "rewards/rejected": -2.99066424369812, + "step": 6129 + }, + { + "epoch": 0.71, + "learning_rate": 8.93362987241016e-08, + "logits/chosen": -2.3991339206695557, + "logits/rejected": -2.3849613666534424, + "logps/chosen": -95.38685607910156, + "logps/rejected": -192.43963623046875, + "loss": 0.3256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09402275085449219, + "rewards/margins": 1.9844456911087036, + "rewards/rejected": -2.0784685611724854, + "step": 6130 + }, + { + "epoch": 0.71, + "learning_rate": 8.930118225447735e-08, + "logits/chosen": -2.240178108215332, + "logits/rejected": -2.33815336227417, + "logps/chosen": -257.9399719238281, + "logps/rejected": -264.226806640625, + "loss": 0.404, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37571874260902405, + "rewards/margins": 1.870579719543457, + "rewards/rejected": -2.246298313140869, + "step": 6131 + }, + { + "epoch": 0.71, + "learning_rate": 8.926606578485309e-08, + "logits/chosen": -2.492788791656494, + "logits/rejected": -2.458845615386963, + "logps/chosen": -226.44918823242188, + "logps/rejected": -174.14785766601562, + "loss": 0.3955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5542712211608887, + "rewards/margins": 1.985424280166626, + "rewards/rejected": -2.5396957397460938, + "step": 6132 + }, + { + "epoch": 0.71, + "learning_rate": 8.923094931522884e-08, + "logits/chosen": -2.7773211002349854, + "logits/rejected": -2.877485990524292, + "logps/chosen": -366.8025207519531, + "logps/rejected": -294.9124755859375, + "loss": 0.2725, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0968321561813354, + "rewards/margins": 1.7270824909210205, + "rewards/rejected": -2.8239145278930664, + "step": 6133 + }, + { + "epoch": 0.71, + "learning_rate": 8.919583284560458e-08, + "logits/chosen": -2.1681227684020996, + "logits/rejected": -2.0768425464630127, + "logps/chosen": -276.9809265136719, + "logps/rejected": -268.83001708984375, + "loss": 0.2071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.630979061126709, + "rewards/margins": 4.142856597900391, + "rewards/rejected": -3.5118770599365234, + "step": 6134 + }, + { + "epoch": 0.71, + "learning_rate": 8.916071637598034e-08, + "logits/chosen": -1.7355139255523682, + "logits/rejected": -1.4270737171173096, + "logps/chosen": -313.8917236328125, + "logps/rejected": -447.778076171875, + "loss": 0.2022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.718410849571228, + "rewards/margins": 3.42305850982666, + "rewards/rejected": -4.141469478607178, + "step": 6135 + }, + { + "epoch": 0.71, + "learning_rate": 8.912559990635608e-08, + "logits/chosen": -2.137974739074707, + "logits/rejected": -2.2995765209198, + "logps/chosen": -306.6200866699219, + "logps/rejected": -317.6770935058594, + "loss": 0.5952, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7638098001480103, + "rewards/margins": 1.0555262565612793, + "rewards/rejected": -2.819336175918579, + "step": 6136 + }, + { + "epoch": 0.71, + "learning_rate": 8.909048343673182e-08, + "logits/chosen": -2.2134463787078857, + "logits/rejected": -2.3734703063964844, + "logps/chosen": -227.5125732421875, + "logps/rejected": -202.81076049804688, + "loss": 0.5939, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5272024869918823, + "rewards/margins": 0.7996586561203003, + "rewards/rejected": -2.3268611431121826, + "step": 6137 + }, + { + "epoch": 0.71, + "learning_rate": 8.905536696710756e-08, + "logits/chosen": -2.778536081314087, + "logits/rejected": -2.8285610675811768, + "logps/chosen": -114.69459533691406, + "logps/rejected": -262.6452941894531, + "loss": 0.5842, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.244386076927185, + "rewards/margins": 2.8926124572753906, + "rewards/rejected": -4.136998653411865, + "step": 6138 + }, + { + "epoch": 0.71, + "learning_rate": 8.902025049748333e-08, + "logits/chosen": -2.2868406772613525, + "logits/rejected": -2.501722574234009, + "logps/chosen": -253.80860900878906, + "logps/rejected": -211.8414306640625, + "loss": 0.2397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6499472260475159, + "rewards/margins": 2.915842294692993, + "rewards/rejected": -3.5657894611358643, + "step": 6139 + }, + { + "epoch": 0.71, + "learning_rate": 8.898513402785907e-08, + "logits/chosen": -2.5463616847991943, + "logits/rejected": -2.448099374771118, + "logps/chosen": -222.86846923828125, + "logps/rejected": -240.45654296875, + "loss": 0.3571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6658477187156677, + "rewards/margins": 2.059762954711914, + "rewards/rejected": -2.7256107330322266, + "step": 6140 + }, + { + "epoch": 0.71, + "learning_rate": 8.895001755823481e-08, + "logits/chosen": -2.1005847454071045, + "logits/rejected": -2.23909854888916, + "logps/chosen": -323.1762390136719, + "logps/rejected": -272.0298156738281, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4726940989494324, + "rewards/margins": 1.8792415857315063, + "rewards/rejected": -2.351935863494873, + "step": 6141 + }, + { + "epoch": 0.71, + "learning_rate": 8.891490108861055e-08, + "logits/chosen": -2.3607001304626465, + "logits/rejected": -2.1469614505767822, + "logps/chosen": -183.81268310546875, + "logps/rejected": -220.02430725097656, + "loss": 0.4635, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2558032274246216, + "rewards/margins": 2.6350507736206055, + "rewards/rejected": -3.8908538818359375, + "step": 6142 + }, + { + "epoch": 0.71, + "learning_rate": 8.88797846189863e-08, + "logits/chosen": -2.694230794906616, + "logits/rejected": -2.765622615814209, + "logps/chosen": -266.7296447753906, + "logps/rejected": -283.1936950683594, + "loss": 0.2565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.608512282371521, + "rewards/margins": 2.2362709045410156, + "rewards/rejected": -2.844783306121826, + "step": 6143 + }, + { + "epoch": 0.71, + "learning_rate": 8.884466814936206e-08, + "logits/chosen": -2.3617355823516846, + "logits/rejected": -2.574936628341675, + "logps/chosen": -354.94976806640625, + "logps/rejected": -225.63186645507812, + "loss": 0.2795, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17770150303840637, + "rewards/margins": 2.151124954223633, + "rewards/rejected": -1.9734233617782593, + "step": 6144 + }, + { + "epoch": 0.71, + "learning_rate": 8.88095516797378e-08, + "logits/chosen": -1.9068537950515747, + "logits/rejected": -1.794187307357788, + "logps/chosen": -307.09075927734375, + "logps/rejected": -314.496337890625, + "loss": 0.5418, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41888630390167236, + "rewards/margins": 3.291337251663208, + "rewards/rejected": -3.71022367477417, + "step": 6145 + }, + { + "epoch": 0.71, + "learning_rate": 8.877443521011354e-08, + "logits/chosen": -2.215986728668213, + "logits/rejected": -2.3929624557495117, + "logps/chosen": -178.47683715820312, + "logps/rejected": -157.8839111328125, + "loss": 0.7978, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3939528465270996, + "rewards/margins": 1.904208779335022, + "rewards/rejected": -3.298161506652832, + "step": 6146 + }, + { + "epoch": 0.71, + "learning_rate": 8.873931874048928e-08, + "logits/chosen": -1.9648983478546143, + "logits/rejected": -2.247304916381836, + "logps/chosen": -552.2230834960938, + "logps/rejected": -230.31369018554688, + "loss": 0.6009, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9739532470703125, + "rewards/margins": 1.0930547714233398, + "rewards/rejected": -2.0670080184936523, + "step": 6147 + }, + { + "epoch": 0.71, + "learning_rate": 8.870420227086503e-08, + "logits/chosen": -2.6179890632629395, + "logits/rejected": -2.5146231651306152, + "logps/chosen": -360.48785400390625, + "logps/rejected": -300.08441162109375, + "loss": 0.1463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6329514384269714, + "rewards/margins": 2.7537193298339844, + "rewards/rejected": -3.3866708278656006, + "step": 6148 + }, + { + "epoch": 0.71, + "learning_rate": 8.866908580124077e-08, + "logits/chosen": -2.3098976612091064, + "logits/rejected": -2.680781126022339, + "logps/chosen": -298.0061340332031, + "logps/rejected": -301.6170959472656, + "loss": 0.5774, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5387417078018188, + "rewards/margins": 2.6052098274230957, + "rewards/rejected": -4.143951416015625, + "step": 6149 + }, + { + "epoch": 0.71, + "learning_rate": 8.863396933161653e-08, + "logits/chosen": -2.0894227027893066, + "logits/rejected": -2.1264638900756836, + "logps/chosen": -382.95184326171875, + "logps/rejected": -426.2306213378906, + "loss": 0.3967, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2775492668151855, + "rewards/margins": 2.800584554672241, + "rewards/rejected": -4.078134059906006, + "step": 6150 + }, + { + "epoch": 0.71, + "learning_rate": 8.859885286199227e-08, + "logits/chosen": -2.1027684211730957, + "logits/rejected": -2.195805072784424, + "logps/chosen": -284.9039001464844, + "logps/rejected": -358.63916015625, + "loss": 0.8137, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2547563314437866, + "rewards/margins": 1.868159532546997, + "rewards/rejected": -3.122915744781494, + "step": 6151 + }, + { + "epoch": 0.71, + "learning_rate": 8.856373639236802e-08, + "logits/chosen": -2.4216208457946777, + "logits/rejected": -2.201171398162842, + "logps/chosen": -344.0113830566406, + "logps/rejected": -318.802001953125, + "loss": 0.226, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8421778678894043, + "rewards/margins": 2.634335994720459, + "rewards/rejected": -3.4765138626098633, + "step": 6152 + }, + { + "epoch": 0.71, + "learning_rate": 8.852861992274376e-08, + "logits/chosen": -2.160922050476074, + "logits/rejected": -2.1362879276275635, + "logps/chosen": -260.1789245605469, + "logps/rejected": -239.5329132080078, + "loss": 0.3688, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5339524745941162, + "rewards/margins": 1.7837122678756714, + "rewards/rejected": -3.3176651000976562, + "step": 6153 + }, + { + "epoch": 0.71, + "learning_rate": 8.84935034531195e-08, + "logits/chosen": -2.0777018070220947, + "logits/rejected": -2.2267026901245117, + "logps/chosen": -340.8007507324219, + "logps/rejected": -240.7168426513672, + "loss": 0.4076, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3881120681762695, + "rewards/margins": 1.9614410400390625, + "rewards/rejected": -3.349553346633911, + "step": 6154 + }, + { + "epoch": 0.71, + "learning_rate": 8.845838698349524e-08, + "logits/chosen": -2.1008992195129395, + "logits/rejected": -1.9771218299865723, + "logps/chosen": -259.20465087890625, + "logps/rejected": -275.423828125, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4104125499725342, + "rewards/margins": 0.8989276885986328, + "rewards/rejected": -2.309340476989746, + "step": 6155 + }, + { + "epoch": 0.71, + "learning_rate": 8.842327051387101e-08, + "logits/chosen": -1.962599754333496, + "logits/rejected": -2.388118267059326, + "logps/chosen": -278.3966979980469, + "logps/rejected": -240.79135131835938, + "loss": 0.587, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1807327270507812, + "rewards/margins": 1.6864612102508545, + "rewards/rejected": -3.867194175720215, + "step": 6156 + }, + { + "epoch": 0.71, + "learning_rate": 8.838815404424675e-08, + "logits/chosen": -1.6875964403152466, + "logits/rejected": -1.7606004476547241, + "logps/chosen": -403.45458984375, + "logps/rejected": -531.972412109375, + "loss": 0.5461, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9841209650039673, + "rewards/margins": 1.4287974834442139, + "rewards/rejected": -2.4129185676574707, + "step": 6157 + }, + { + "epoch": 0.71, + "learning_rate": 8.835303757462249e-08, + "logits/chosen": -2.816200017929077, + "logits/rejected": -2.692749500274658, + "logps/chosen": -150.14520263671875, + "logps/rejected": -242.1930389404297, + "loss": 0.1518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30175477266311646, + "rewards/margins": 2.863424777984619, + "rewards/rejected": -3.16517972946167, + "step": 6158 + }, + { + "epoch": 0.71, + "learning_rate": 8.831792110499823e-08, + "logits/chosen": -2.711773633956909, + "logits/rejected": -2.862509250640869, + "logps/chosen": -307.47412109375, + "logps/rejected": -177.81405639648438, + "loss": 0.5096, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1433566808700562, + "rewards/margins": 1.3615580797195435, + "rewards/rejected": -2.5049147605895996, + "step": 6159 + }, + { + "epoch": 0.71, + "learning_rate": 8.828280463537399e-08, + "logits/chosen": -1.8780901432037354, + "logits/rejected": -2.0687918663024902, + "logps/chosen": -319.620361328125, + "logps/rejected": -241.978271484375, + "loss": 0.1514, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3707530498504639, + "rewards/margins": 2.556392192840576, + "rewards/rejected": -3.927145004272461, + "step": 6160 + }, + { + "epoch": 0.71, + "learning_rate": 8.824768816574974e-08, + "logits/chosen": -2.178844690322876, + "logits/rejected": -1.680445671081543, + "logps/chosen": -331.2449645996094, + "logps/rejected": -428.6101379394531, + "loss": 0.3355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5868672728538513, + "rewards/margins": 3.773357391357422, + "rewards/rejected": -4.360224723815918, + "step": 6161 + }, + { + "epoch": 0.71, + "learning_rate": 8.821257169612548e-08, + "logits/chosen": -1.7089107036590576, + "logits/rejected": -1.767610788345337, + "logps/chosen": -520.6619873046875, + "logps/rejected": -450.2231750488281, + "loss": 0.5083, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4492224454879761, + "rewards/margins": 2.820680618286133, + "rewards/rejected": -3.2699034214019775, + "step": 6162 + }, + { + "epoch": 0.71, + "learning_rate": 8.817745522650122e-08, + "logits/chosen": -2.2025299072265625, + "logits/rejected": -2.321361541748047, + "logps/chosen": -299.6578063964844, + "logps/rejected": -191.7101593017578, + "loss": 0.4306, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2243162393569946, + "rewards/margins": 0.9653903245925903, + "rewards/rejected": -2.189706325531006, + "step": 6163 + }, + { + "epoch": 0.71, + "learning_rate": 8.814233875687698e-08, + "logits/chosen": -2.4232802391052246, + "logits/rejected": -2.396778106689453, + "logps/chosen": -345.24847412109375, + "logps/rejected": -332.3375244140625, + "loss": 0.5068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6821130514144897, + "rewards/margins": 1.8727120161056519, + "rewards/rejected": -2.5548248291015625, + "step": 6164 + }, + { + "epoch": 0.71, + "learning_rate": 8.810722228725272e-08, + "logits/chosen": -2.316709041595459, + "logits/rejected": -2.3270411491394043, + "logps/chosen": -331.1153564453125, + "logps/rejected": -286.8310546875, + "loss": 0.3628, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12353324890136719, + "rewards/margins": 2.033554792404175, + "rewards/rejected": -2.157087802886963, + "step": 6165 + }, + { + "epoch": 0.71, + "learning_rate": 8.807210581762846e-08, + "logits/chosen": -2.3775181770324707, + "logits/rejected": -2.551212787628174, + "logps/chosen": -217.720947265625, + "logps/rejected": -429.530029296875, + "loss": 0.3091, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0008635520935059, + "rewards/margins": 2.3300294876098633, + "rewards/rejected": -3.33089280128479, + "step": 6166 + }, + { + "epoch": 0.71, + "learning_rate": 8.80369893480042e-08, + "logits/chosen": -2.2404654026031494, + "logits/rejected": -1.7929389476776123, + "logps/chosen": -159.4123992919922, + "logps/rejected": -321.33111572265625, + "loss": 0.0863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40147101879119873, + "rewards/margins": 3.3995065689086914, + "rewards/rejected": -3.8009777069091797, + "step": 6167 + }, + { + "epoch": 0.71, + "learning_rate": 8.800187287837996e-08, + "logits/chosen": -2.7610855102539062, + "logits/rejected": -2.7978734970092773, + "logps/chosen": -317.4927978515625, + "logps/rejected": -256.9922180175781, + "loss": 0.2033, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6109966039657593, + "rewards/margins": 3.1621103286743164, + "rewards/rejected": -4.773107051849365, + "step": 6168 + }, + { + "epoch": 0.71, + "learning_rate": 8.79667564087557e-08, + "logits/chosen": -2.049494743347168, + "logits/rejected": -2.1746389865875244, + "logps/chosen": -284.025390625, + "logps/rejected": -189.8956756591797, + "loss": 0.4041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2513868808746338, + "rewards/margins": 1.4658844470977783, + "rewards/rejected": -1.717271327972412, + "step": 6169 + }, + { + "epoch": 0.71, + "learning_rate": 8.793163993913145e-08, + "logits/chosen": -2.6788039207458496, + "logits/rejected": -2.299433946609497, + "logps/chosen": -203.0902862548828, + "logps/rejected": -334.51275634765625, + "loss": 0.549, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3948184251785278, + "rewards/margins": 3.2731761932373047, + "rewards/rejected": -4.667994022369385, + "step": 6170 + }, + { + "epoch": 0.71, + "learning_rate": 8.789652346950719e-08, + "logits/chosen": -2.3940203189849854, + "logits/rejected": -2.0526034832000732, + "logps/chosen": -293.32427978515625, + "logps/rejected": -371.64105224609375, + "loss": 0.3957, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48971083760261536, + "rewards/margins": 1.831725835800171, + "rewards/rejected": -2.321436643600464, + "step": 6171 + }, + { + "epoch": 0.71, + "learning_rate": 8.786140699988295e-08, + "logits/chosen": -2.3459835052490234, + "logits/rejected": -2.094160318374634, + "logps/chosen": -229.19004821777344, + "logps/rejected": -344.9610900878906, + "loss": 0.2133, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9459050893783569, + "rewards/margins": 2.7694404125213623, + "rewards/rejected": -3.7153451442718506, + "step": 6172 + }, + { + "epoch": 0.71, + "learning_rate": 8.78262905302587e-08, + "logits/chosen": -1.6830039024353027, + "logits/rejected": -1.748537540435791, + "logps/chosen": -394.5217590332031, + "logps/rejected": -370.7452697753906, + "loss": 0.4102, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4854365587234497, + "rewards/margins": 0.8642475008964539, + "rewards/rejected": -2.349684000015259, + "step": 6173 + }, + { + "epoch": 0.71, + "learning_rate": 8.779117406063443e-08, + "logits/chosen": -2.2515439987182617, + "logits/rejected": -2.444427251815796, + "logps/chosen": -384.309326171875, + "logps/rejected": -399.4783935546875, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6917544603347778, + "rewards/margins": 2.652799367904663, + "rewards/rejected": -4.3445539474487305, + "step": 6174 + }, + { + "epoch": 0.71, + "learning_rate": 8.775605759101017e-08, + "logits/chosen": -1.973874807357788, + "logits/rejected": -2.206209659576416, + "logps/chosen": -263.089111328125, + "logps/rejected": -176.90396118164062, + "loss": 0.6552, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8009255528450012, + "rewards/margins": 0.6103115677833557, + "rewards/rejected": -1.411237120628357, + "step": 6175 + }, + { + "epoch": 0.71, + "learning_rate": 8.772094112138593e-08, + "logits/chosen": -2.458078622817993, + "logits/rejected": -2.488713264465332, + "logps/chosen": -85.55491638183594, + "logps/rejected": -105.53778839111328, + "loss": 0.3291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21544095873832703, + "rewards/margins": 1.9578044414520264, + "rewards/rejected": -2.173245429992676, + "step": 6176 + }, + { + "epoch": 0.71, + "learning_rate": 8.768582465176167e-08, + "logits/chosen": -2.1026411056518555, + "logits/rejected": -2.3044726848602295, + "logps/chosen": -358.8095703125, + "logps/rejected": -300.1336975097656, + "loss": 0.198, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5571789741516113, + "rewards/margins": 2.756686210632324, + "rewards/rejected": -3.3138651847839355, + "step": 6177 + }, + { + "epoch": 0.71, + "learning_rate": 8.765070818213742e-08, + "logits/chosen": -2.385298013687134, + "logits/rejected": -2.3448987007141113, + "logps/chosen": -288.7516174316406, + "logps/rejected": -243.1588897705078, + "loss": 0.3096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48636573553085327, + "rewards/margins": 1.7514667510986328, + "rewards/rejected": -2.237832546234131, + "step": 6178 + }, + { + "epoch": 0.71, + "learning_rate": 8.761559171251316e-08, + "logits/chosen": -2.337354898452759, + "logits/rejected": -2.7028117179870605, + "logps/chosen": -208.15176391601562, + "logps/rejected": -241.18115234375, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.329330176115036, + "rewards/margins": 4.71844482421875, + "rewards/rejected": -5.0477752685546875, + "step": 6179 + }, + { + "epoch": 0.71, + "learning_rate": 8.758047524288892e-08, + "logits/chosen": -2.4483394622802734, + "logits/rejected": -2.131082534790039, + "logps/chosen": -150.32733154296875, + "logps/rejected": -198.20355224609375, + "loss": 0.4574, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.085697889328003, + "rewards/margins": 0.8271303176879883, + "rewards/rejected": -1.9128282070159912, + "step": 6180 + }, + { + "epoch": 0.71, + "learning_rate": 8.754535877326466e-08, + "logits/chosen": -2.3778555393218994, + "logits/rejected": -2.294978618621826, + "logps/chosen": -261.01556396484375, + "logps/rejected": -304.4202575683594, + "loss": 0.5629, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4390363693237305, + "rewards/margins": 1.027552604675293, + "rewards/rejected": -2.4665889739990234, + "step": 6181 + }, + { + "epoch": 0.71, + "learning_rate": 8.75102423036404e-08, + "logits/chosen": -2.0908660888671875, + "logits/rejected": -2.293255090713501, + "logps/chosen": -385.5350341796875, + "logps/rejected": -295.1632385253906, + "loss": 0.8249, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.080095648765564, + "rewards/margins": 1.6279594898223877, + "rewards/rejected": -2.708055019378662, + "step": 6182 + }, + { + "epoch": 0.71, + "learning_rate": 8.747512583401614e-08, + "logits/chosen": -2.212101459503174, + "logits/rejected": -2.7455718517303467, + "logps/chosen": -378.6505126953125, + "logps/rejected": -285.29254150390625, + "loss": 0.1466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5899208784103394, + "rewards/margins": 3.5056264400482178, + "rewards/rejected": -4.095547676086426, + "step": 6183 + }, + { + "epoch": 0.71, + "learning_rate": 8.744000936439191e-08, + "logits/chosen": -2.310612440109253, + "logits/rejected": -2.5761003494262695, + "logps/chosen": -492.63800048828125, + "logps/rejected": -210.136474609375, + "loss": 1.393, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2206711769104004, + "rewards/margins": 0.977224588394165, + "rewards/rejected": -3.1978955268859863, + "step": 6184 + }, + { + "epoch": 0.71, + "learning_rate": 8.740489289476765e-08, + "logits/chosen": -2.2157676219940186, + "logits/rejected": -2.5786190032958984, + "logps/chosen": -298.37213134765625, + "logps/rejected": -225.1540069580078, + "loss": 0.3868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5402271747589111, + "rewards/margins": 1.5173423290252686, + "rewards/rejected": -2.0575695037841797, + "step": 6185 + }, + { + "epoch": 0.71, + "learning_rate": 8.736977642514339e-08, + "logits/chosen": -2.467773914337158, + "logits/rejected": -2.291151762008667, + "logps/chosen": -203.16676330566406, + "logps/rejected": -321.2940979003906, + "loss": 0.2629, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19823291897773743, + "rewards/margins": 2.2042503356933594, + "rewards/rejected": -2.4024834632873535, + "step": 6186 + }, + { + "epoch": 0.71, + "learning_rate": 8.733465995551913e-08, + "logits/chosen": -1.7396914958953857, + "logits/rejected": -2.087843894958496, + "logps/chosen": -380.0852966308594, + "logps/rejected": -293.2144775390625, + "loss": 0.4186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5532118678092957, + "rewards/margins": 1.9113185405731201, + "rewards/rejected": -2.4645304679870605, + "step": 6187 + }, + { + "epoch": 0.71, + "learning_rate": 8.729954348589488e-08, + "logits/chosen": -2.033709764480591, + "logits/rejected": -2.342491626739502, + "logps/chosen": -310.9771728515625, + "logps/rejected": -338.4034118652344, + "loss": 0.2007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5981388092041016, + "rewards/margins": 3.754718542098999, + "rewards/rejected": -4.35285758972168, + "step": 6188 + }, + { + "epoch": 0.71, + "learning_rate": 8.726442701627064e-08, + "logits/chosen": -2.467287540435791, + "logits/rejected": -2.4262216091156006, + "logps/chosen": -94.55195617675781, + "logps/rejected": -199.69998168945312, + "loss": 0.2864, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6405789852142334, + "rewards/margins": 2.0471296310424805, + "rewards/rejected": -2.687708854675293, + "step": 6189 + }, + { + "epoch": 0.71, + "learning_rate": 8.722931054664638e-08, + "logits/chosen": -2.316804885864258, + "logits/rejected": -2.484097480773926, + "logps/chosen": -331.2386779785156, + "logps/rejected": -206.89999389648438, + "loss": 0.2403, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2173006534576416, + "rewards/margins": 2.336652994155884, + "rewards/rejected": -3.5539536476135254, + "step": 6190 + }, + { + "epoch": 0.71, + "learning_rate": 8.719419407702212e-08, + "logits/chosen": -1.589504361152649, + "logits/rejected": -1.777808666229248, + "logps/chosen": -519.73095703125, + "logps/rejected": -324.07794189453125, + "loss": 0.3439, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8344559073448181, + "rewards/margins": 1.8916982412338257, + "rewards/rejected": -2.726154088973999, + "step": 6191 + }, + { + "epoch": 0.71, + "learning_rate": 8.715907760739787e-08, + "logits/chosen": -1.536590337753296, + "logits/rejected": -1.8822869062423706, + "logps/chosen": -272.4548034667969, + "logps/rejected": -196.37103271484375, + "loss": 0.6412, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8957796096801758, + "rewards/margins": 1.1203229427337646, + "rewards/rejected": -2.0161027908325195, + "step": 6192 + }, + { + "epoch": 0.71, + "learning_rate": 8.712396113777361e-08, + "logits/chosen": -2.4647459983825684, + "logits/rejected": -2.423598527908325, + "logps/chosen": -271.8888854980469, + "logps/rejected": -205.6934814453125, + "loss": 0.2488, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8955461382865906, + "rewards/margins": 1.7816190719604492, + "rewards/rejected": -2.6771650314331055, + "step": 6193 + }, + { + "epoch": 0.71, + "learning_rate": 8.708884466814935e-08, + "logits/chosen": -2.8840909004211426, + "logits/rejected": -2.8646762371063232, + "logps/chosen": -215.15853881835938, + "logps/rejected": -220.98397827148438, + "loss": 0.4184, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0996675491333008, + "rewards/margins": 2.0482964515686035, + "rewards/rejected": -3.1479642391204834, + "step": 6194 + }, + { + "epoch": 0.71, + "learning_rate": 8.70537281985251e-08, + "logits/chosen": -2.7241976261138916, + "logits/rejected": -2.5369856357574463, + "logps/chosen": -240.04177856445312, + "logps/rejected": -315.6644287109375, + "loss": 0.857, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.874184250831604, + "rewards/margins": 1.3053021430969238, + "rewards/rejected": -3.1794862747192383, + "step": 6195 + }, + { + "epoch": 0.71, + "learning_rate": 8.701861172890085e-08, + "logits/chosen": -2.5188465118408203, + "logits/rejected": -2.531362533569336, + "logps/chosen": -173.82467651367188, + "logps/rejected": -137.5878448486328, + "loss": 0.378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9109394550323486, + "rewards/margins": 1.4735020399093628, + "rewards/rejected": -2.384441375732422, + "step": 6196 + }, + { + "epoch": 0.71, + "learning_rate": 8.69834952592766e-08, + "logits/chosen": -2.7166085243225098, + "logits/rejected": -2.6546764373779297, + "logps/chosen": -162.39845275878906, + "logps/rejected": -264.8187255859375, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1082184910774231, + "rewards/margins": 3.3365530967712402, + "rewards/rejected": -3.4447717666625977, + "step": 6197 + }, + { + "epoch": 0.71, + "learning_rate": 8.694837878965234e-08, + "logits/chosen": -1.9175708293914795, + "logits/rejected": -2.0489423274993896, + "logps/chosen": -308.520751953125, + "logps/rejected": -237.20892333984375, + "loss": 0.7136, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1692239046096802, + "rewards/margins": 0.8018376231193542, + "rewards/rejected": -1.9710614681243896, + "step": 6198 + }, + { + "epoch": 0.71, + "learning_rate": 8.691326232002808e-08, + "logits/chosen": -2.5873405933380127, + "logits/rejected": -2.487699031829834, + "logps/chosen": -473.41162109375, + "logps/rejected": -281.9477233886719, + "loss": 0.1924, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2237601280212402, + "rewards/margins": 1.8057142496109009, + "rewards/rejected": -3.0294742584228516, + "step": 6199 + }, + { + "epoch": 0.71, + "learning_rate": 8.687814585040382e-08, + "logits/chosen": -2.6799938678741455, + "logits/rejected": -2.709049940109253, + "logps/chosen": -196.27890014648438, + "logps/rejected": -242.76177978515625, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20987120270729065, + "rewards/margins": 5.073108673095703, + "rewards/rejected": -5.282979965209961, + "step": 6200 + }, + { + "epoch": 0.71, + "learning_rate": 8.684302938077959e-08, + "logits/chosen": -2.445223808288574, + "logits/rejected": -2.323923349380493, + "logps/chosen": -193.24786376953125, + "logps/rejected": -215.07884216308594, + "loss": 0.4469, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.633120059967041, + "rewards/margins": 1.8511652946472168, + "rewards/rejected": -3.484285354614258, + "step": 6201 + }, + { + "epoch": 0.71, + "learning_rate": 8.680791291115533e-08, + "logits/chosen": -2.9592502117156982, + "logits/rejected": -2.840940475463867, + "logps/chosen": -259.636474609375, + "logps/rejected": -156.18875122070312, + "loss": 0.5043, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7022807002067566, + "rewards/margins": 1.7158386707305908, + "rewards/rejected": -2.418119430541992, + "step": 6202 + }, + { + "epoch": 0.72, + "learning_rate": 8.677279644153107e-08, + "logits/chosen": -2.0127007961273193, + "logits/rejected": -2.0989596843719482, + "logps/chosen": -451.509765625, + "logps/rejected": -327.6063537597656, + "loss": 0.3919, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.534195065498352, + "rewards/margins": 1.4910197257995605, + "rewards/rejected": -2.025214910507202, + "step": 6203 + }, + { + "epoch": 0.72, + "learning_rate": 8.673767997190681e-08, + "logits/chosen": -2.349172353744507, + "logits/rejected": -2.316131114959717, + "logps/chosen": -355.39300537109375, + "logps/rejected": -378.6082763671875, + "loss": 0.7494, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.6011815071105957, + "rewards/margins": 0.8386721611022949, + "rewards/rejected": -3.4398536682128906, + "step": 6204 + }, + { + "epoch": 0.72, + "learning_rate": 8.670256350228257e-08, + "logits/chosen": -1.9987077713012695, + "logits/rejected": -2.1818203926086426, + "logps/chosen": -355.9614562988281, + "logps/rejected": -342.7575378417969, + "loss": 0.2639, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4726236164569855, + "rewards/margins": 1.960407018661499, + "rewards/rejected": -2.433030605316162, + "step": 6205 + }, + { + "epoch": 0.72, + "learning_rate": 8.666744703265832e-08, + "logits/chosen": -2.449876546859741, + "logits/rejected": -2.6645820140838623, + "logps/chosen": -214.16163635253906, + "logps/rejected": -216.79159545898438, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.868526041507721, + "rewards/margins": 1.9818106889724731, + "rewards/rejected": -2.850336790084839, + "step": 6206 + }, + { + "epoch": 0.72, + "learning_rate": 8.663233056303406e-08, + "logits/chosen": -2.671157121658325, + "logits/rejected": -2.502305269241333, + "logps/chosen": -365.13836669921875, + "logps/rejected": -375.2608642578125, + "loss": 0.7819, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.634743571281433, + "rewards/margins": 0.6805524230003357, + "rewards/rejected": -2.315295934677124, + "step": 6207 + }, + { + "epoch": 0.72, + "learning_rate": 8.65972140934098e-08, + "logits/chosen": -2.2759830951690674, + "logits/rejected": -2.5965301990509033, + "logps/chosen": -223.31591796875, + "logps/rejected": -184.59384155273438, + "loss": 0.2795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2041105180978775, + "rewards/margins": 2.721184253692627, + "rewards/rejected": -2.9252943992614746, + "step": 6208 + }, + { + "epoch": 0.72, + "learning_rate": 8.656209762378555e-08, + "logits/chosen": -2.0678625106811523, + "logits/rejected": -2.3158485889434814, + "logps/chosen": -227.2475128173828, + "logps/rejected": -243.40866088867188, + "loss": 1.4, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.284606456756592, + "rewards/margins": 0.11539855599403381, + "rewards/rejected": -2.4000051021575928, + "step": 6209 + }, + { + "epoch": 0.72, + "learning_rate": 8.65269811541613e-08, + "logits/chosen": -1.8168821334838867, + "logits/rejected": -1.7870240211486816, + "logps/chosen": -153.252685546875, + "logps/rejected": -218.20721435546875, + "loss": 0.404, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7985476851463318, + "rewards/margins": 1.6614161729812622, + "rewards/rejected": -2.459963798522949, + "step": 6210 + }, + { + "epoch": 0.72, + "learning_rate": 8.649186468453704e-08, + "logits/chosen": -2.2352347373962402, + "logits/rejected": -2.4274001121520996, + "logps/chosen": -302.0332336425781, + "logps/rejected": -214.2034912109375, + "loss": 0.2606, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8095235824584961, + "rewards/margins": 3.641233444213867, + "rewards/rejected": -4.450757026672363, + "step": 6211 + }, + { + "epoch": 0.72, + "learning_rate": 8.645674821491279e-08, + "logits/chosen": -2.3122451305389404, + "logits/rejected": -2.198265314102173, + "logps/chosen": -169.28871154785156, + "logps/rejected": -274.2882995605469, + "loss": 0.2133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5922648906707764, + "rewards/margins": 2.2477073669433594, + "rewards/rejected": -2.8399720191955566, + "step": 6212 + }, + { + "epoch": 0.72, + "learning_rate": 8.642163174528854e-08, + "logits/chosen": -2.169801950454712, + "logits/rejected": -2.353966474533081, + "logps/chosen": -384.175048828125, + "logps/rejected": -332.61749267578125, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3165304958820343, + "rewards/margins": 3.637519598007202, + "rewards/rejected": -3.954050064086914, + "step": 6213 + }, + { + "epoch": 0.72, + "learning_rate": 8.638651527566428e-08, + "logits/chosen": -3.0360028743743896, + "logits/rejected": -3.033855438232422, + "logps/chosen": -289.463623046875, + "logps/rejected": -304.6182556152344, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5975638628005981, + "rewards/margins": 3.2501847743988037, + "rewards/rejected": -3.8477487564086914, + "step": 6214 + }, + { + "epoch": 0.72, + "learning_rate": 8.635139880604002e-08, + "logits/chosen": -2.374530553817749, + "logits/rejected": -2.6108243465423584, + "logps/chosen": -227.43910217285156, + "logps/rejected": -340.95367431640625, + "loss": 0.391, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4685827493667603, + "rewards/margins": 2.0610849857330322, + "rewards/rejected": -3.529667615890503, + "step": 6215 + }, + { + "epoch": 0.72, + "learning_rate": 8.631628233641577e-08, + "logits/chosen": -2.4087674617767334, + "logits/rejected": -2.5313873291015625, + "logps/chosen": -295.79058837890625, + "logps/rejected": -419.7264404296875, + "loss": 0.4465, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9196458458900452, + "rewards/margins": 2.3339061737060547, + "rewards/rejected": -3.253551959991455, + "step": 6216 + }, + { + "epoch": 0.72, + "learning_rate": 8.628116586679153e-08, + "logits/chosen": -2.311591625213623, + "logits/rejected": -2.556169271469116, + "logps/chosen": -261.9896240234375, + "logps/rejected": -282.6431884765625, + "loss": 0.244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6495741009712219, + "rewards/margins": 2.539793014526367, + "rewards/rejected": -3.1893670558929443, + "step": 6217 + }, + { + "epoch": 0.72, + "learning_rate": 8.624604939716727e-08, + "logits/chosen": -2.535062074661255, + "logits/rejected": -2.3243207931518555, + "logps/chosen": -170.52117919921875, + "logps/rejected": -169.43997192382812, + "loss": 0.4544, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2071293592453003, + "rewards/margins": 2.3715157508850098, + "rewards/rejected": -3.5786452293395996, + "step": 6218 + }, + { + "epoch": 0.72, + "learning_rate": 8.621093292754301e-08, + "logits/chosen": -2.083336114883423, + "logits/rejected": -2.4134180545806885, + "logps/chosen": -200.167236328125, + "logps/rejected": -198.14930725097656, + "loss": 0.4066, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.307860791683197, + "rewards/margins": 1.8719635009765625, + "rewards/rejected": -2.1798243522644043, + "step": 6219 + }, + { + "epoch": 0.72, + "learning_rate": 8.617581645791875e-08, + "logits/chosen": -2.6232190132141113, + "logits/rejected": -2.694903612136841, + "logps/chosen": -192.19752502441406, + "logps/rejected": -183.9937286376953, + "loss": 0.5121, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3324905633926392, + "rewards/margins": 1.3167157173156738, + "rewards/rejected": -2.6492061614990234, + "step": 6220 + }, + { + "epoch": 0.72, + "learning_rate": 8.614069998829451e-08, + "logits/chosen": -2.531536817550659, + "logits/rejected": -2.3981587886810303, + "logps/chosen": -275.71917724609375, + "logps/rejected": -300.60601806640625, + "loss": 0.5318, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7655932903289795, + "rewards/margins": 3.06807541847229, + "rewards/rejected": -5.833669185638428, + "step": 6221 + }, + { + "epoch": 0.72, + "learning_rate": 8.610558351867025e-08, + "logits/chosen": -3.0250625610351562, + "logits/rejected": -2.9628164768218994, + "logps/chosen": -372.26104736328125, + "logps/rejected": -400.2683410644531, + "loss": 0.4667, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9666861295700073, + "rewards/margins": 2.0843493938446045, + "rewards/rejected": -4.051035404205322, + "step": 6222 + }, + { + "epoch": 0.72, + "learning_rate": 8.6070467049046e-08, + "logits/chosen": -2.445547580718994, + "logits/rejected": -2.5548698902130127, + "logps/chosen": -229.5793914794922, + "logps/rejected": -209.2664337158203, + "loss": 0.4751, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8843400478363037, + "rewards/margins": 2.271664619445801, + "rewards/rejected": -4.156004905700684, + "step": 6223 + }, + { + "epoch": 0.72, + "learning_rate": 8.603535057942174e-08, + "logits/chosen": -1.9585689306259155, + "logits/rejected": -1.9865995645523071, + "logps/chosen": -249.76052856445312, + "logps/rejected": -275.6453552246094, + "loss": 0.5526, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3818783462047577, + "rewards/margins": 1.9093761444091797, + "rewards/rejected": -2.2912545204162598, + "step": 6224 + }, + { + "epoch": 0.72, + "learning_rate": 8.60002341097975e-08, + "logits/chosen": -2.756777048110962, + "logits/rejected": -2.6371142864227295, + "logps/chosen": -299.7702941894531, + "logps/rejected": -203.30587768554688, + "loss": 0.3375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.673280656337738, + "rewards/margins": 2.406269073486328, + "rewards/rejected": -3.079549789428711, + "step": 6225 + }, + { + "epoch": 0.72, + "learning_rate": 8.596511764017324e-08, + "logits/chosen": -2.351699113845825, + "logits/rejected": -2.5207221508026123, + "logps/chosen": -221.9137725830078, + "logps/rejected": -169.52728271484375, + "loss": 0.3857, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3787599503993988, + "rewards/margins": 2.2964401245117188, + "rewards/rejected": -2.6752002239227295, + "step": 6226 + }, + { + "epoch": 0.72, + "learning_rate": 8.593000117054898e-08, + "logits/chosen": -2.3508238792419434, + "logits/rejected": -2.672532558441162, + "logps/chosen": -328.59722900390625, + "logps/rejected": -252.14012145996094, + "loss": 0.1714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5166865587234497, + "rewards/margins": 2.5261361598968506, + "rewards/rejected": -3.04282283782959, + "step": 6227 + }, + { + "epoch": 0.72, + "learning_rate": 8.589488470092472e-08, + "logits/chosen": -2.9600658416748047, + "logits/rejected": -2.8548715114593506, + "logps/chosen": -421.419189453125, + "logps/rejected": -271.3509521484375, + "loss": 0.1933, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2512402534484863, + "rewards/margins": 2.490645408630371, + "rewards/rejected": -3.7418856620788574, + "step": 6228 + }, + { + "epoch": 0.72, + "learning_rate": 8.585976823130049e-08, + "logits/chosen": -2.4556198120117188, + "logits/rejected": -2.5138907432556152, + "logps/chosen": -262.9068603515625, + "logps/rejected": -348.6590576171875, + "loss": 0.5294, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.190985918045044, + "rewards/margins": 3.152900218963623, + "rewards/rejected": -4.343886375427246, + "step": 6229 + }, + { + "epoch": 0.72, + "learning_rate": 8.582465176167623e-08, + "logits/chosen": -2.3180813789367676, + "logits/rejected": -2.2863142490386963, + "logps/chosen": -290.7713623046875, + "logps/rejected": -285.36016845703125, + "loss": 0.1526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40277954936027527, + "rewards/margins": 2.637160301208496, + "rewards/rejected": -3.0399398803710938, + "step": 6230 + }, + { + "epoch": 0.72, + "learning_rate": 8.578953529205197e-08, + "logits/chosen": -2.0656368732452393, + "logits/rejected": -2.1217451095581055, + "logps/chosen": -183.75135803222656, + "logps/rejected": -236.05514526367188, + "loss": 0.4123, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.49361252784729, + "rewards/margins": 2.2187347412109375, + "rewards/rejected": -3.7123472690582275, + "step": 6231 + }, + { + "epoch": 0.72, + "learning_rate": 8.575441882242771e-08, + "logits/chosen": -2.7887349128723145, + "logits/rejected": -2.631556987762451, + "logps/chosen": -190.48135375976562, + "logps/rejected": -197.94378662109375, + "loss": 0.77, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2911311388015747, + "rewards/margins": 3.2840347290039062, + "rewards/rejected": -4.575165748596191, + "step": 6232 + }, + { + "epoch": 0.72, + "learning_rate": 8.571930235280347e-08, + "logits/chosen": -1.8201475143432617, + "logits/rejected": -2.0469186305999756, + "logps/chosen": -168.93081665039062, + "logps/rejected": -199.08407592773438, + "loss": 0.6117, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4113811254501343, + "rewards/margins": 1.8956215381622314, + "rewards/rejected": -3.3070027828216553, + "step": 6233 + }, + { + "epoch": 0.72, + "learning_rate": 8.568418588317922e-08, + "logits/chosen": -1.9375218152999878, + "logits/rejected": -2.3784291744232178, + "logps/chosen": -360.78564453125, + "logps/rejected": -365.2537536621094, + "loss": 0.653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5580658912658691, + "rewards/margins": 1.247122049331665, + "rewards/rejected": -1.8051881790161133, + "step": 6234 + }, + { + "epoch": 0.72, + "learning_rate": 8.564906941355496e-08, + "logits/chosen": -2.1194241046905518, + "logits/rejected": -2.2172021865844727, + "logps/chosen": -363.1198425292969, + "logps/rejected": -270.3354187011719, + "loss": 0.6474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4765533208847046, + "rewards/margins": 0.9412487745285034, + "rewards/rejected": -1.4178022146224976, + "step": 6235 + }, + { + "epoch": 0.72, + "learning_rate": 8.56139529439307e-08, + "logits/chosen": -2.040945053100586, + "logits/rejected": -1.896899938583374, + "logps/chosen": -234.91273498535156, + "logps/rejected": -290.121337890625, + "loss": 0.1161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11177724599838257, + "rewards/margins": 4.518779754638672, + "rewards/rejected": -4.630557537078857, + "step": 6236 + }, + { + "epoch": 0.72, + "learning_rate": 8.557883647430645e-08, + "logits/chosen": -1.8312091827392578, + "logits/rejected": -2.085848569869995, + "logps/chosen": -374.21234130859375, + "logps/rejected": -212.74676513671875, + "loss": 1.1495, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5103250741958618, + "rewards/margins": 0.6943408846855164, + "rewards/rejected": -2.2046661376953125, + "step": 6237 + }, + { + "epoch": 0.72, + "learning_rate": 8.554372000468219e-08, + "logits/chosen": -2.625276803970337, + "logits/rejected": -2.6213512420654297, + "logps/chosen": -278.2691650390625, + "logps/rejected": -323.13800048828125, + "loss": 0.406, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1650588512420654, + "rewards/margins": 2.5468244552612305, + "rewards/rejected": -4.711883544921875, + "step": 6238 + }, + { + "epoch": 0.72, + "learning_rate": 8.550860353505793e-08, + "logits/chosen": -2.496263027191162, + "logits/rejected": -2.6129894256591797, + "logps/chosen": -232.45614624023438, + "logps/rejected": -270.8470458984375, + "loss": 0.85, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8769919872283936, + "rewards/margins": 1.6290640830993652, + "rewards/rejected": -3.506056070327759, + "step": 6239 + }, + { + "epoch": 0.72, + "learning_rate": 8.547348706543369e-08, + "logits/chosen": -2.3486509323120117, + "logits/rejected": -2.171541452407837, + "logps/chosen": -213.16464233398438, + "logps/rejected": -215.64056396484375, + "loss": 0.2296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0578620582818985, + "rewards/margins": 2.4038729667663574, + "rewards/rejected": -2.461735248565674, + "step": 6240 + }, + { + "epoch": 0.72, + "learning_rate": 8.543837059580943e-08, + "logits/chosen": -2.305352210998535, + "logits/rejected": -2.2518153190612793, + "logps/chosen": -169.7694854736328, + "logps/rejected": -204.9095916748047, + "loss": 0.3967, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3925912380218506, + "rewards/margins": 1.5049256086349487, + "rewards/rejected": -2.8975167274475098, + "step": 6241 + }, + { + "epoch": 0.72, + "learning_rate": 8.540325412618518e-08, + "logits/chosen": -2.344578742980957, + "logits/rejected": -2.304237127304077, + "logps/chosen": -166.95640563964844, + "logps/rejected": -240.83468627929688, + "loss": 0.4956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8200570940971375, + "rewards/margins": 2.377554178237915, + "rewards/rejected": -3.1976113319396973, + "step": 6242 + }, + { + "epoch": 0.72, + "learning_rate": 8.536813765656092e-08, + "logits/chosen": -2.4772255420684814, + "logits/rejected": -2.200425148010254, + "logps/chosen": -219.22738647460938, + "logps/rejected": -228.9664306640625, + "loss": 0.4696, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4954410195350647, + "rewards/margins": 1.5583279132843018, + "rewards/rejected": -2.0537688732147217, + "step": 6243 + }, + { + "epoch": 0.72, + "learning_rate": 8.533302118693666e-08, + "logits/chosen": -2.0537524223327637, + "logits/rejected": -2.1026902198791504, + "logps/chosen": -391.232421875, + "logps/rejected": -251.3817138671875, + "loss": 0.2312, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3428499698638916, + "rewards/margins": 2.8687703609466553, + "rewards/rejected": -4.211620330810547, + "step": 6244 + }, + { + "epoch": 0.72, + "learning_rate": 8.52979047173124e-08, + "logits/chosen": -2.002061605453491, + "logits/rejected": -2.1964375972747803, + "logps/chosen": -405.4421081542969, + "logps/rejected": -193.75973510742188, + "loss": 0.5161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7963440418243408, + "rewards/margins": 0.7347193956375122, + "rewards/rejected": -1.5310635566711426, + "step": 6245 + }, + { + "epoch": 0.72, + "learning_rate": 8.526278824768817e-08, + "logits/chosen": -2.469667434692383, + "logits/rejected": -2.825885057449341, + "logps/chosen": -223.54409790039062, + "logps/rejected": -146.03575134277344, + "loss": 0.5422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8746291399002075, + "rewards/margins": 1.878902554512024, + "rewards/rejected": -2.7535319328308105, + "step": 6246 + }, + { + "epoch": 0.72, + "learning_rate": 8.522767177806391e-08, + "logits/chosen": -2.809100866317749, + "logits/rejected": -2.5367591381073, + "logps/chosen": -76.53536987304688, + "logps/rejected": -246.2604217529297, + "loss": 0.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5731302499771118, + "rewards/margins": 4.548472881317139, + "rewards/rejected": -5.121603012084961, + "step": 6247 + }, + { + "epoch": 0.72, + "learning_rate": 8.519255530843965e-08, + "logits/chosen": -2.6114556789398193, + "logits/rejected": -2.454530715942383, + "logps/chosen": -95.20970153808594, + "logps/rejected": -197.60986328125, + "loss": 0.2944, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8919050097465515, + "rewards/margins": 2.4475255012512207, + "rewards/rejected": -3.339430809020996, + "step": 6248 + }, + { + "epoch": 0.72, + "learning_rate": 8.515743883881539e-08, + "logits/chosen": -1.847048044204712, + "logits/rejected": -1.9666264057159424, + "logps/chosen": -266.1700744628906, + "logps/rejected": -246.45993041992188, + "loss": 0.4323, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7637683153152466, + "rewards/margins": 1.3202540874481201, + "rewards/rejected": -2.084022283554077, + "step": 6249 + }, + { + "epoch": 0.72, + "learning_rate": 8.512232236919116e-08, + "logits/chosen": -1.9692966938018799, + "logits/rejected": -2.0541789531707764, + "logps/chosen": -270.97442626953125, + "logps/rejected": -274.58642578125, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06111513823270798, + "rewards/margins": 3.236833095550537, + "rewards/rejected": -3.2979483604431152, + "step": 6250 + }, + { + "epoch": 0.72, + "learning_rate": 8.50872058995669e-08, + "logits/chosen": -1.8663251399993896, + "logits/rejected": -1.8740090131759644, + "logps/chosen": -178.181396484375, + "logps/rejected": -169.3384246826172, + "loss": 0.442, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2971038818359375, + "rewards/margins": 2.8156981468200684, + "rewards/rejected": -4.112802028656006, + "step": 6251 + }, + { + "epoch": 0.72, + "learning_rate": 8.505208942994264e-08, + "logits/chosen": -2.2685301303863525, + "logits/rejected": -2.3917012214660645, + "logps/chosen": -379.77362060546875, + "logps/rejected": -268.8301696777344, + "loss": 0.5677, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1470048427581787, + "rewards/margins": 2.2153847217559814, + "rewards/rejected": -3.36238956451416, + "step": 6252 + }, + { + "epoch": 0.72, + "learning_rate": 8.501697296031838e-08, + "logits/chosen": -2.723034620285034, + "logits/rejected": -2.8447399139404297, + "logps/chosen": -196.20950317382812, + "logps/rejected": -233.71905517578125, + "loss": 0.1691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7362836003303528, + "rewards/margins": 2.956698417663574, + "rewards/rejected": -3.6929819583892822, + "step": 6253 + }, + { + "epoch": 0.72, + "learning_rate": 8.498185649069413e-08, + "logits/chosen": -2.102679491043091, + "logits/rejected": -2.3206653594970703, + "logps/chosen": -482.04986572265625, + "logps/rejected": -539.90673828125, + "loss": 0.3567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19942015409469604, + "rewards/margins": 2.1541764736175537, + "rewards/rejected": -2.3535966873168945, + "step": 6254 + }, + { + "epoch": 0.72, + "learning_rate": 8.494674002106987e-08, + "logits/chosen": -2.736917495727539, + "logits/rejected": -2.5941720008850098, + "logps/chosen": -332.55609130859375, + "logps/rejected": -274.94403076171875, + "loss": 0.4428, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0850908756256104, + "rewards/margins": 1.2626042366027832, + "rewards/rejected": -2.3476951122283936, + "step": 6255 + }, + { + "epoch": 0.72, + "learning_rate": 8.491162355144562e-08, + "logits/chosen": -1.905430793762207, + "logits/rejected": -2.2293131351470947, + "logps/chosen": -290.757080078125, + "logps/rejected": -193.73532104492188, + "loss": 0.5543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2021278142929077, + "rewards/margins": 1.489453673362732, + "rewards/rejected": -2.6915814876556396, + "step": 6256 + }, + { + "epoch": 0.72, + "learning_rate": 8.487650708182137e-08, + "logits/chosen": -2.1031835079193115, + "logits/rejected": -2.313049793243408, + "logps/chosen": -485.8734130859375, + "logps/rejected": -298.0501403808594, + "loss": 0.3664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26816433668136597, + "rewards/margins": 2.8454794883728027, + "rewards/rejected": -3.1136438846588135, + "step": 6257 + }, + { + "epoch": 0.72, + "learning_rate": 8.484139061219712e-08, + "logits/chosen": -2.2178072929382324, + "logits/rejected": -2.1641619205474854, + "logps/chosen": -343.9017333984375, + "logps/rejected": -361.4326171875, + "loss": 0.6295, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0334972143173218, + "rewards/margins": 1.526493787765503, + "rewards/rejected": -2.5599911212921143, + "step": 6258 + }, + { + "epoch": 0.72, + "learning_rate": 8.480627414257286e-08, + "logits/chosen": -2.298251152038574, + "logits/rejected": -2.6258068084716797, + "logps/chosen": -206.75650024414062, + "logps/rejected": -183.8100128173828, + "loss": 0.387, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4347076416015625, + "rewards/margins": 1.8768974542617798, + "rewards/rejected": -3.311605215072632, + "step": 6259 + }, + { + "epoch": 0.72, + "learning_rate": 8.47711576729486e-08, + "logits/chosen": -2.20767879486084, + "logits/rejected": -2.4020462036132812, + "logps/chosen": -334.8534851074219, + "logps/rejected": -189.71958923339844, + "loss": 0.6038, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.464292049407959, + "rewards/margins": 1.5928560495376587, + "rewards/rejected": -3.057147979736328, + "step": 6260 + }, + { + "epoch": 0.72, + "learning_rate": 8.473604120332434e-08, + "logits/chosen": -2.0267252922058105, + "logits/rejected": -2.1763687133789062, + "logps/chosen": -363.3067626953125, + "logps/rejected": -287.04180908203125, + "loss": 0.3053, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0419611930847168, + "rewards/margins": 2.112278938293457, + "rewards/rejected": -3.154240369796753, + "step": 6261 + }, + { + "epoch": 0.72, + "learning_rate": 8.470092473370011e-08, + "logits/chosen": -2.7240254878997803, + "logits/rejected": -2.664196252822876, + "logps/chosen": -266.8621826171875, + "logps/rejected": -233.25686645507812, + "loss": 0.4963, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8420568108558655, + "rewards/margins": 2.4828171730041504, + "rewards/rejected": -3.324873924255371, + "step": 6262 + }, + { + "epoch": 0.72, + "learning_rate": 8.466580826407585e-08, + "logits/chosen": -2.4712893962860107, + "logits/rejected": -2.7003746032714844, + "logps/chosen": -330.08673095703125, + "logps/rejected": -329.222412109375, + "loss": 0.3473, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5522294044494629, + "rewards/margins": 2.4697670936584473, + "rewards/rejected": -3.021996259689331, + "step": 6263 + }, + { + "epoch": 0.72, + "learning_rate": 8.463069179445159e-08, + "logits/chosen": -2.586808919906616, + "logits/rejected": -2.333498001098633, + "logps/chosen": -62.43632888793945, + "logps/rejected": -302.2931213378906, + "loss": 0.1639, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.550913393497467, + "rewards/margins": 2.49265718460083, + "rewards/rejected": -3.0435705184936523, + "step": 6264 + }, + { + "epoch": 0.72, + "learning_rate": 8.459557532482733e-08, + "logits/chosen": -2.4490041732788086, + "logits/rejected": -2.5682499408721924, + "logps/chosen": -317.95556640625, + "logps/rejected": -306.7420349121094, + "loss": 0.1493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5610074996948242, + "rewards/margins": 2.746495246887207, + "rewards/rejected": -3.307502508163452, + "step": 6265 + }, + { + "epoch": 0.72, + "learning_rate": 8.456045885520309e-08, + "logits/chosen": -2.346405029296875, + "logits/rejected": -2.443240165710449, + "logps/chosen": -359.6944580078125, + "logps/rejected": -312.82666015625, + "loss": 0.2765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7803565263748169, + "rewards/margins": 2.509843587875366, + "rewards/rejected": -3.2902002334594727, + "step": 6266 + }, + { + "epoch": 0.72, + "learning_rate": 8.452534238557884e-08, + "logits/chosen": -1.8842633962631226, + "logits/rejected": -2.275463104248047, + "logps/chosen": -480.1339111328125, + "logps/rejected": -322.9457702636719, + "loss": 0.7562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6630345582962036, + "rewards/margins": 2.324298620223999, + "rewards/rejected": -2.987333059310913, + "step": 6267 + }, + { + "epoch": 0.72, + "learning_rate": 8.449022591595458e-08, + "logits/chosen": -2.668201208114624, + "logits/rejected": -2.3942019939422607, + "logps/chosen": -238.50042724609375, + "logps/rejected": -262.74176025390625, + "loss": 0.1275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8481181859970093, + "rewards/margins": 2.9891152381896973, + "rewards/rejected": -3.837233543395996, + "step": 6268 + }, + { + "epoch": 0.72, + "learning_rate": 8.445510944633032e-08, + "logits/chosen": -2.053083896636963, + "logits/rejected": -1.5361039638519287, + "logps/chosen": -211.11328125, + "logps/rejected": -415.10101318359375, + "loss": 0.2588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6360751390457153, + "rewards/margins": 3.3985862731933594, + "rewards/rejected": -4.034661293029785, + "step": 6269 + }, + { + "epoch": 0.72, + "learning_rate": 8.441999297670608e-08, + "logits/chosen": -2.3112072944641113, + "logits/rejected": -2.3207337856292725, + "logps/chosen": -172.83441162109375, + "logps/rejected": -274.0652770996094, + "loss": 0.2199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3971767723560333, + "rewards/margins": 3.6018712520599365, + "rewards/rejected": -3.9990479946136475, + "step": 6270 + }, + { + "epoch": 0.72, + "learning_rate": 8.438487650708182e-08, + "logits/chosen": -2.828967809677124, + "logits/rejected": -2.9233131408691406, + "logps/chosen": -206.46612548828125, + "logps/rejected": -236.363037109375, + "loss": 0.3766, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1695082187652588, + "rewards/margins": 1.5451996326446533, + "rewards/rejected": -2.714708089828491, + "step": 6271 + }, + { + "epoch": 0.72, + "learning_rate": 8.434976003745756e-08, + "logits/chosen": -2.152393102645874, + "logits/rejected": -2.1815567016601562, + "logps/chosen": -282.8763732910156, + "logps/rejected": -218.88328552246094, + "loss": 0.3726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9461559653282166, + "rewards/margins": 2.489351749420166, + "rewards/rejected": -3.4355077743530273, + "step": 6272 + }, + { + "epoch": 0.72, + "learning_rate": 8.43146435678333e-08, + "logits/chosen": -1.810276746749878, + "logits/rejected": -1.7185372114181519, + "logps/chosen": -406.4959716796875, + "logps/rejected": -332.263671875, + "loss": 0.5358, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.675748348236084, + "rewards/margins": 2.423628807067871, + "rewards/rejected": -4.099377155303955, + "step": 6273 + }, + { + "epoch": 0.72, + "learning_rate": 8.427952709820907e-08, + "logits/chosen": -2.127286672592163, + "logits/rejected": -1.7812621593475342, + "logps/chosen": -216.92111206054688, + "logps/rejected": -312.0364990234375, + "loss": 0.5642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7922180891036987, + "rewards/margins": 1.2021229267120361, + "rewards/rejected": -1.9943408966064453, + "step": 6274 + }, + { + "epoch": 0.72, + "learning_rate": 8.42444106285848e-08, + "logits/chosen": -1.9467281103134155, + "logits/rejected": -2.1019036769866943, + "logps/chosen": -362.27581787109375, + "logps/rejected": -426.86846923828125, + "loss": 0.3371, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1283092498779297, + "rewards/margins": 1.807726263999939, + "rewards/rejected": -1.9360355138778687, + "step": 6275 + }, + { + "epoch": 0.72, + "learning_rate": 8.420929415896055e-08, + "logits/chosen": -2.051866292953491, + "logits/rejected": -2.3566157817840576, + "logps/chosen": -476.6446228027344, + "logps/rejected": -290.12213134765625, + "loss": 0.3055, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0853092670440674, + "rewards/margins": 2.03243088722229, + "rewards/rejected": -3.1177401542663574, + "step": 6276 + }, + { + "epoch": 0.72, + "learning_rate": 8.417417768933629e-08, + "logits/chosen": -2.632045030593872, + "logits/rejected": -2.7655510902404785, + "logps/chosen": -186.8202667236328, + "logps/rejected": -195.79934692382812, + "loss": 1.0093, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.532071590423584, + "rewards/margins": 1.1283445358276367, + "rewards/rejected": -2.6604161262512207, + "step": 6277 + }, + { + "epoch": 0.72, + "learning_rate": 8.413906121971205e-08, + "logits/chosen": -1.9841128587722778, + "logits/rejected": -2.275911569595337, + "logps/chosen": -389.71502685546875, + "logps/rejected": -331.7031555175781, + "loss": 0.5265, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.102161169052124, + "rewards/margins": 1.8850029706954956, + "rewards/rejected": -2.987164258956909, + "step": 6278 + }, + { + "epoch": 0.72, + "learning_rate": 8.41039447500878e-08, + "logits/chosen": -1.9617255926132202, + "logits/rejected": -2.1431198120117188, + "logps/chosen": -298.78302001953125, + "logps/rejected": -237.45753479003906, + "loss": 0.4396, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0033448934555054, + "rewards/margins": 1.6836977005004883, + "rewards/rejected": -2.687042713165283, + "step": 6279 + }, + { + "epoch": 0.72, + "learning_rate": 8.406882828046354e-08, + "logits/chosen": -2.3424384593963623, + "logits/rejected": -2.6702613830566406, + "logps/chosen": -240.81494140625, + "logps/rejected": -187.25103759765625, + "loss": 0.7918, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3188762664794922, + "rewards/margins": 1.3201498985290527, + "rewards/rejected": -2.639026165008545, + "step": 6280 + }, + { + "epoch": 0.72, + "learning_rate": 8.403371181083928e-08, + "logits/chosen": -2.358591079711914, + "logits/rejected": -2.4465436935424805, + "logps/chosen": -358.1065368652344, + "logps/rejected": -328.54620361328125, + "loss": 0.4671, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1768549680709839, + "rewards/margins": 2.955094814300537, + "rewards/rejected": -4.131949424743652, + "step": 6281 + }, + { + "epoch": 0.72, + "learning_rate": 8.399859534121503e-08, + "logits/chosen": -2.4385628700256348, + "logits/rejected": -2.216125965118408, + "logps/chosen": -352.11053466796875, + "logps/rejected": -418.1222839355469, + "loss": 0.6321, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9493862390518188, + "rewards/margins": 2.265047788619995, + "rewards/rejected": -3.2144339084625244, + "step": 6282 + }, + { + "epoch": 0.72, + "learning_rate": 8.396347887159077e-08, + "logits/chosen": -2.225247621536255, + "logits/rejected": -2.426213026046753, + "logps/chosen": -637.1297607421875, + "logps/rejected": -316.78021240234375, + "loss": 1.2261, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.608068823814392, + "rewards/margins": -0.11638599634170532, + "rewards/rejected": -1.491682767868042, + "step": 6283 + }, + { + "epoch": 0.72, + "learning_rate": 8.392836240196652e-08, + "logits/chosen": -2.2592520713806152, + "logits/rejected": -2.203092098236084, + "logps/chosen": -260.341064453125, + "logps/rejected": -301.6064453125, + "loss": 0.1669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32679247856140137, + "rewards/margins": 3.5268356800079346, + "rewards/rejected": -3.853628158569336, + "step": 6284 + }, + { + "epoch": 0.72, + "learning_rate": 8.389324593234227e-08, + "logits/chosen": -1.6544750928878784, + "logits/rejected": -1.975681185722351, + "logps/chosen": -409.515380859375, + "logps/rejected": -378.61444091796875, + "loss": 0.7566, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1734788417816162, + "rewards/margins": 1.588842749595642, + "rewards/rejected": -2.7623214721679688, + "step": 6285 + }, + { + "epoch": 0.72, + "learning_rate": 8.385812946271802e-08, + "logits/chosen": -2.1219124794006348, + "logits/rejected": -2.240558385848999, + "logps/chosen": -140.54751586914062, + "logps/rejected": -187.87840270996094, + "loss": 0.4092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.634811520576477, + "rewards/margins": 2.2331316471099854, + "rewards/rejected": -2.867943048477173, + "step": 6286 + }, + { + "epoch": 0.72, + "learning_rate": 8.382301299309376e-08, + "logits/chosen": -2.3203067779541016, + "logits/rejected": -2.18489670753479, + "logps/chosen": -198.83216857910156, + "logps/rejected": -307.10565185546875, + "loss": 0.3622, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7816671133041382, + "rewards/margins": 3.2575345039367676, + "rewards/rejected": -5.039201259613037, + "step": 6287 + }, + { + "epoch": 0.72, + "learning_rate": 8.37878965234695e-08, + "logits/chosen": -2.854043960571289, + "logits/rejected": -2.7232589721679688, + "logps/chosen": -242.0472412109375, + "logps/rejected": -313.05743408203125, + "loss": 0.4033, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.702271819114685, + "rewards/margins": 2.4782330989837646, + "rewards/rejected": -4.18050479888916, + "step": 6288 + }, + { + "epoch": 0.72, + "learning_rate": 8.375278005384524e-08, + "logits/chosen": -3.053872585296631, + "logits/rejected": -2.90291166305542, + "logps/chosen": -276.424560546875, + "logps/rejected": -211.84278869628906, + "loss": 0.1232, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5409419536590576, + "rewards/margins": 4.198063850402832, + "rewards/rejected": -4.739006042480469, + "step": 6289 + }, + { + "epoch": 0.73, + "learning_rate": 8.371766358422098e-08, + "logits/chosen": -2.2917733192443848, + "logits/rejected": -2.283764362335205, + "logps/chosen": -372.230712890625, + "logps/rejected": -320.65576171875, + "loss": 0.4618, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.674868106842041, + "rewards/margins": 1.6205614805221558, + "rewards/rejected": -2.2954297065734863, + "step": 6290 + }, + { + "epoch": 0.73, + "learning_rate": 8.368254711459675e-08, + "logits/chosen": -1.7775756120681763, + "logits/rejected": -2.191040277481079, + "logps/chosen": -212.28848266601562, + "logps/rejected": -204.53616333007812, + "loss": 0.1002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2428429126739502, + "rewards/margins": 3.0034098625183105, + "rewards/rejected": -3.2462525367736816, + "step": 6291 + }, + { + "epoch": 0.73, + "learning_rate": 8.364743064497249e-08, + "logits/chosen": -2.1569011211395264, + "logits/rejected": -2.085763692855835, + "logps/chosen": -133.96307373046875, + "logps/rejected": -173.4049072265625, + "loss": 0.5716, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5472131967544556, + "rewards/margins": 1.4652241468429565, + "rewards/rejected": -2.012437343597412, + "step": 6292 + }, + { + "epoch": 0.73, + "learning_rate": 8.361231417534823e-08, + "logits/chosen": -1.9337258338928223, + "logits/rejected": -1.976317048072815, + "logps/chosen": -216.0757293701172, + "logps/rejected": -228.01596069335938, + "loss": 0.2141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6833990216255188, + "rewards/margins": 2.5269076824188232, + "rewards/rejected": -3.2103066444396973, + "step": 6293 + }, + { + "epoch": 0.73, + "learning_rate": 8.357719770572397e-08, + "logits/chosen": -2.576507091522217, + "logits/rejected": -2.3936142921447754, + "logps/chosen": -180.05120849609375, + "logps/rejected": -200.62567138671875, + "loss": 0.539, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9071810245513916, + "rewards/margins": 1.5646092891693115, + "rewards/rejected": -3.471790313720703, + "step": 6294 + }, + { + "epoch": 0.73, + "learning_rate": 8.354208123609974e-08, + "logits/chosen": -1.9053280353546143, + "logits/rejected": -1.6827921867370605, + "logps/chosen": -255.16571044921875, + "logps/rejected": -289.9258728027344, + "loss": 0.4344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3726153373718262, + "rewards/margins": 1.2428689002990723, + "rewards/rejected": -2.6154842376708984, + "step": 6295 + }, + { + "epoch": 0.73, + "learning_rate": 8.350696476647548e-08, + "logits/chosen": -2.432304859161377, + "logits/rejected": -2.5747594833374023, + "logps/chosen": -269.69451904296875, + "logps/rejected": -234.71612548828125, + "loss": 0.2642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5094896554946899, + "rewards/margins": 2.246695041656494, + "rewards/rejected": -2.7561848163604736, + "step": 6296 + }, + { + "epoch": 0.73, + "learning_rate": 8.347184829685122e-08, + "logits/chosen": -1.7535724639892578, + "logits/rejected": -2.0683133602142334, + "logps/chosen": -557.0764770507812, + "logps/rejected": -452.79510498046875, + "loss": 0.4149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13574692606925964, + "rewards/margins": 0.936600923538208, + "rewards/rejected": -1.07234787940979, + "step": 6297 + }, + { + "epoch": 0.73, + "learning_rate": 8.343673182722696e-08, + "logits/chosen": -2.7656803131103516, + "logits/rejected": -2.583937883377075, + "logps/chosen": -155.14825439453125, + "logps/rejected": -172.0719757080078, + "loss": 0.1783, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1086208820343018, + "rewards/margins": 2.252397060394287, + "rewards/rejected": -3.361018180847168, + "step": 6298 + }, + { + "epoch": 0.73, + "learning_rate": 8.340161535760271e-08, + "logits/chosen": -2.7234582901000977, + "logits/rejected": -2.7397091388702393, + "logps/chosen": -222.14132690429688, + "logps/rejected": -210.48622131347656, + "loss": 0.2491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8498482704162598, + "rewards/margins": 2.850170373916626, + "rewards/rejected": -3.7000186443328857, + "step": 6299 + }, + { + "epoch": 0.73, + "learning_rate": 8.336649888797845e-08, + "logits/chosen": -2.5049099922180176, + "logits/rejected": -2.6073670387268066, + "logps/chosen": -407.0989990234375, + "logps/rejected": -792.45751953125, + "loss": 0.5228, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.206594467163086, + "rewards/margins": 2.4738521575927734, + "rewards/rejected": -3.6804466247558594, + "step": 6300 + }, + { + "epoch": 0.73, + "learning_rate": 8.333138241835421e-08, + "logits/chosen": -2.4933619499206543, + "logits/rejected": -2.5508651733398438, + "logps/chosen": -246.44764709472656, + "logps/rejected": -260.18536376953125, + "loss": 0.2643, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6284351944923401, + "rewards/margins": 2.5107979774475098, + "rewards/rejected": -3.139233112335205, + "step": 6301 + }, + { + "epoch": 0.73, + "learning_rate": 8.329626594872995e-08, + "logits/chosen": -1.9370530843734741, + "logits/rejected": -2.5714504718780518, + "logps/chosen": -408.90924072265625, + "logps/rejected": -255.67584228515625, + "loss": 0.2367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45627665519714355, + "rewards/margins": 2.7804512977600098, + "rewards/rejected": -3.2367279529571533, + "step": 6302 + }, + { + "epoch": 0.73, + "learning_rate": 8.32611494791057e-08, + "logits/chosen": -2.3081648349761963, + "logits/rejected": -2.237351179122925, + "logps/chosen": -87.1420669555664, + "logps/rejected": -149.77261352539062, + "loss": 0.4214, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9402146339416504, + "rewards/margins": 1.3505504131317139, + "rewards/rejected": -2.2907652854919434, + "step": 6303 + }, + { + "epoch": 0.73, + "learning_rate": 8.322603300948144e-08, + "logits/chosen": -2.658010244369507, + "logits/rejected": -2.9151315689086914, + "logps/chosen": -286.6846923828125, + "logps/rejected": -220.746337890625, + "loss": 0.0763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9516284465789795, + "rewards/margins": 3.6833229064941406, + "rewards/rejected": -4.634951114654541, + "step": 6304 + }, + { + "epoch": 0.73, + "learning_rate": 8.319091653985718e-08, + "logits/chosen": -2.4577794075012207, + "logits/rejected": -2.422764301300049, + "logps/chosen": -130.8128204345703, + "logps/rejected": -166.583740234375, + "loss": 0.361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8331954479217529, + "rewards/margins": 1.2731245756149292, + "rewards/rejected": -2.1063199043273926, + "step": 6305 + }, + { + "epoch": 0.73, + "learning_rate": 8.315580007023292e-08, + "logits/chosen": -1.735997200012207, + "logits/rejected": -1.9524285793304443, + "logps/chosen": -502.53558349609375, + "logps/rejected": -295.09979248046875, + "loss": 0.2718, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1807554960250854, + "rewards/margins": 1.688530683517456, + "rewards/rejected": -2.869286298751831, + "step": 6306 + }, + { + "epoch": 0.73, + "learning_rate": 8.312068360060869e-08, + "logits/chosen": -2.261676788330078, + "logits/rejected": -2.0945165157318115, + "logps/chosen": -136.16497802734375, + "logps/rejected": -254.39349365234375, + "loss": 0.3536, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7004179954528809, + "rewards/margins": 2.580507755279541, + "rewards/rejected": -3.280925750732422, + "step": 6307 + }, + { + "epoch": 0.73, + "learning_rate": 8.308556713098443e-08, + "logits/chosen": -2.558598518371582, + "logits/rejected": -2.8362417221069336, + "logps/chosen": -333.75445556640625, + "logps/rejected": -275.148193359375, + "loss": 0.2515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4476224482059479, + "rewards/margins": 2.029632806777954, + "rewards/rejected": -2.477255344390869, + "step": 6308 + }, + { + "epoch": 0.73, + "learning_rate": 8.305045066136017e-08, + "logits/chosen": -2.5991063117980957, + "logits/rejected": -2.787263870239258, + "logps/chosen": -241.02005004882812, + "logps/rejected": -238.08570861816406, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2706470489501953, + "rewards/margins": 2.0298314094543457, + "rewards/rejected": -3.300478458404541, + "step": 6309 + }, + { + "epoch": 0.73, + "learning_rate": 8.301533419173591e-08, + "logits/chosen": -2.4461774826049805, + "logits/rejected": -2.8656060695648193, + "logps/chosen": -381.4349670410156, + "logps/rejected": -246.439697265625, + "loss": 0.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2611300945281982, + "rewards/margins": 2.3638296127319336, + "rewards/rejected": -3.6249594688415527, + "step": 6310 + }, + { + "epoch": 0.73, + "learning_rate": 8.298021772211167e-08, + "logits/chosen": -2.311814069747925, + "logits/rejected": -2.1738498210906982, + "logps/chosen": -292.8114013671875, + "logps/rejected": -346.2243347167969, + "loss": 0.6282, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2429578304290771, + "rewards/margins": 1.461952567100525, + "rewards/rejected": -2.7049105167388916, + "step": 6311 + }, + { + "epoch": 0.73, + "learning_rate": 8.294510125248742e-08, + "logits/chosen": -2.598478317260742, + "logits/rejected": -2.697014093399048, + "logps/chosen": -150.05722045898438, + "logps/rejected": -204.35845947265625, + "loss": 0.2235, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27323588728904724, + "rewards/margins": 3.542309522628784, + "rewards/rejected": -3.8155455589294434, + "step": 6312 + }, + { + "epoch": 0.73, + "learning_rate": 8.290998478286316e-08, + "logits/chosen": -2.0312795639038086, + "logits/rejected": -1.8510476350784302, + "logps/chosen": -402.5633239746094, + "logps/rejected": -472.817626953125, + "loss": 0.9803, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2052173614501953, + "rewards/margins": -0.23508606851100922, + "rewards/rejected": -1.970131278038025, + "step": 6313 + }, + { + "epoch": 0.73, + "learning_rate": 8.28748683132389e-08, + "logits/chosen": -2.680542469024658, + "logits/rejected": -2.7087812423706055, + "logps/chosen": -468.7441711425781, + "logps/rejected": -484.3500061035156, + "loss": 0.372, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4851504564285278, + "rewards/margins": 3.1842026710510254, + "rewards/rejected": -4.669353008270264, + "step": 6314 + }, + { + "epoch": 0.73, + "learning_rate": 8.283975184361466e-08, + "logits/chosen": -2.3808321952819824, + "logits/rejected": -2.7338552474975586, + "logps/chosen": -280.03533935546875, + "logps/rejected": -219.5653076171875, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1728891283273697, + "rewards/margins": 2.579183578491211, + "rewards/rejected": -2.752072811126709, + "step": 6315 + }, + { + "epoch": 0.73, + "learning_rate": 8.28046353739904e-08, + "logits/chosen": -2.1603496074676514, + "logits/rejected": -2.326261520385742, + "logps/chosen": -364.1602783203125, + "logps/rejected": -377.2039489746094, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4827654361724854, + "rewards/margins": 0.7810717225074768, + "rewards/rejected": -2.2638370990753174, + "step": 6316 + }, + { + "epoch": 0.73, + "learning_rate": 8.276951890436614e-08, + "logits/chosen": -2.5820555686950684, + "logits/rejected": -2.7995035648345947, + "logps/chosen": -245.86297607421875, + "logps/rejected": -168.4051055908203, + "loss": 0.3584, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1203818321228027, + "rewards/margins": 2.1043167114257812, + "rewards/rejected": -3.224698543548584, + "step": 6317 + }, + { + "epoch": 0.73, + "learning_rate": 8.273440243474189e-08, + "logits/chosen": -1.6403486728668213, + "logits/rejected": -1.912435531616211, + "logps/chosen": -303.31011962890625, + "logps/rejected": -266.8112487792969, + "loss": 0.3765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3808796405792236, + "rewards/margins": 1.2044973373413086, + "rewards/rejected": -2.5853769779205322, + "step": 6318 + }, + { + "epoch": 0.73, + "learning_rate": 8.269928596511764e-08, + "logits/chosen": -2.099250078201294, + "logits/rejected": -2.278073787689209, + "logps/chosen": -449.8609619140625, + "logps/rejected": -278.2136535644531, + "loss": 0.4461, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4488494396209717, + "rewards/margins": 1.751326322555542, + "rewards/rejected": -3.2001757621765137, + "step": 6319 + }, + { + "epoch": 0.73, + "learning_rate": 8.266416949549339e-08, + "logits/chosen": -2.1773674488067627, + "logits/rejected": -2.059082269668579, + "logps/chosen": -252.90711975097656, + "logps/rejected": -347.54412841796875, + "loss": 0.3297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2883305549621582, + "rewards/margins": 1.572295904159546, + "rewards/rejected": -1.860626459121704, + "step": 6320 + }, + { + "epoch": 0.73, + "learning_rate": 8.262905302586913e-08, + "logits/chosen": -2.0481014251708984, + "logits/rejected": -2.0817267894744873, + "logps/chosen": -165.64231872558594, + "logps/rejected": -251.417724609375, + "loss": 0.2675, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6483482122421265, + "rewards/margins": 2.4120402336120605, + "rewards/rejected": -3.0603883266448975, + "step": 6321 + }, + { + "epoch": 0.73, + "learning_rate": 8.259393655624487e-08, + "logits/chosen": -2.0930862426757812, + "logits/rejected": -2.0147383213043213, + "logps/chosen": -365.9786376953125, + "logps/rejected": -294.86956787109375, + "loss": 0.5105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.774067223072052, + "rewards/margins": 1.3825147151947021, + "rewards/rejected": -2.1565818786621094, + "step": 6322 + }, + { + "epoch": 0.73, + "learning_rate": 8.255882008662063e-08, + "logits/chosen": -2.0992512702941895, + "logits/rejected": -2.255445957183838, + "logps/chosen": -386.14404296875, + "logps/rejected": -382.43798828125, + "loss": 1.3444, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9352588653564453, + "rewards/margins": 0.8255227208137512, + "rewards/rejected": -2.760781764984131, + "step": 6323 + }, + { + "epoch": 0.73, + "learning_rate": 8.252370361699637e-08, + "logits/chosen": -2.579922914505005, + "logits/rejected": -2.681164503097534, + "logps/chosen": -207.6590576171875, + "logps/rejected": -145.95529174804688, + "loss": 0.7416, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5555047988891602, + "rewards/margins": 1.5136170387268066, + "rewards/rejected": -3.069121837615967, + "step": 6324 + }, + { + "epoch": 0.73, + "learning_rate": 8.248858714737211e-08, + "logits/chosen": -2.739790678024292, + "logits/rejected": -2.7181785106658936, + "logps/chosen": -115.60224151611328, + "logps/rejected": -220.58192443847656, + "loss": 0.261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1995336413383484, + "rewards/margins": 3.1064181327819824, + "rewards/rejected": -3.3059515953063965, + "step": 6325 + }, + { + "epoch": 0.73, + "learning_rate": 8.245347067774786e-08, + "logits/chosen": -2.205444574356079, + "logits/rejected": -2.2704739570617676, + "logps/chosen": -295.0021057128906, + "logps/rejected": -392.83428955078125, + "loss": 0.453, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8465416431427002, + "rewards/margins": 1.9516528844833374, + "rewards/rejected": -3.798194408416748, + "step": 6326 + }, + { + "epoch": 0.73, + "learning_rate": 8.241835420812361e-08, + "logits/chosen": -2.8811004161834717, + "logits/rejected": -2.9405784606933594, + "logps/chosen": -275.05731201171875, + "logps/rejected": -261.36761474609375, + "loss": 0.0813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6310133337974548, + "rewards/margins": 3.575018882751465, + "rewards/rejected": -4.2060322761535645, + "step": 6327 + }, + { + "epoch": 0.73, + "learning_rate": 8.238323773849935e-08, + "logits/chosen": -2.9718008041381836, + "logits/rejected": -2.9058055877685547, + "logps/chosen": -172.51541137695312, + "logps/rejected": -187.9818878173828, + "loss": 0.4153, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5655808448791504, + "rewards/margins": 1.770822525024414, + "rewards/rejected": -3.3364036083221436, + "step": 6328 + }, + { + "epoch": 0.73, + "learning_rate": 8.23481212688751e-08, + "logits/chosen": -1.8822696208953857, + "logits/rejected": -2.287810802459717, + "logps/chosen": -574.9346313476562, + "logps/rejected": -374.4867248535156, + "loss": 0.1808, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7697880864143372, + "rewards/margins": 2.742781639099121, + "rewards/rejected": -3.5125699043273926, + "step": 6329 + }, + { + "epoch": 0.73, + "learning_rate": 8.231300479925084e-08, + "logits/chosen": -2.4246649742126465, + "logits/rejected": -2.139294147491455, + "logps/chosen": -224.69412231445312, + "logps/rejected": -312.7685546875, + "loss": 0.1413, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2247076034545898, + "rewards/margins": 2.3694968223571777, + "rewards/rejected": -3.5942039489746094, + "step": 6330 + }, + { + "epoch": 0.73, + "learning_rate": 8.22778883296266e-08, + "logits/chosen": -2.7355690002441406, + "logits/rejected": -2.629284381866455, + "logps/chosen": -174.86404418945312, + "logps/rejected": -230.54818725585938, + "loss": 0.6041, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5500149130821228, + "rewards/margins": 1.9035766124725342, + "rewards/rejected": -2.453591823577881, + "step": 6331 + }, + { + "epoch": 0.73, + "learning_rate": 8.224277186000234e-08, + "logits/chosen": -2.463467836380005, + "logits/rejected": -2.288808584213257, + "logps/chosen": -320.1103820800781, + "logps/rejected": -295.31298828125, + "loss": 0.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1424460411071777, + "rewards/margins": 3.7055978775024414, + "rewards/rejected": -4.848043441772461, + "step": 6332 + }, + { + "epoch": 0.73, + "learning_rate": 8.220765539037808e-08, + "logits/chosen": -1.7650851011276245, + "logits/rejected": -1.8801289796829224, + "logps/chosen": -350.5284423828125, + "logps/rejected": -298.7724914550781, + "loss": 0.4886, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1203511953353882, + "rewards/margins": 0.876106321811676, + "rewards/rejected": -1.996457576751709, + "step": 6333 + }, + { + "epoch": 0.73, + "learning_rate": 8.217253892075382e-08, + "logits/chosen": -1.869362711906433, + "logits/rejected": -1.9223203659057617, + "logps/chosen": -369.6859130859375, + "logps/rejected": -424.1309814453125, + "loss": 0.4455, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4077531695365906, + "rewards/margins": 1.405908465385437, + "rewards/rejected": -1.8136615753173828, + "step": 6334 + }, + { + "epoch": 0.73, + "learning_rate": 8.213742245112959e-08, + "logits/chosen": -2.434760570526123, + "logits/rejected": -2.507133722305298, + "logps/chosen": -195.1566925048828, + "logps/rejected": -205.16087341308594, + "loss": 0.4322, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1788511276245117, + "rewards/margins": 2.3693594932556152, + "rewards/rejected": -3.548210859298706, + "step": 6335 + }, + { + "epoch": 0.73, + "learning_rate": 8.210230598150533e-08, + "logits/chosen": -2.2666547298431396, + "logits/rejected": -2.731625556945801, + "logps/chosen": -327.86138916015625, + "logps/rejected": -171.06529235839844, + "loss": 0.3021, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6182619333267212, + "rewards/margins": 2.0705270767211914, + "rewards/rejected": -2.688788890838623, + "step": 6336 + }, + { + "epoch": 0.73, + "learning_rate": 8.206718951188107e-08, + "logits/chosen": -2.130941390991211, + "logits/rejected": -2.0676229000091553, + "logps/chosen": -241.40109252929688, + "logps/rejected": -254.65008544921875, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8870254158973694, + "rewards/margins": 1.2933261394500732, + "rewards/rejected": -2.180351495742798, + "step": 6337 + }, + { + "epoch": 0.73, + "learning_rate": 8.203207304225681e-08, + "logits/chosen": -2.488656520843506, + "logits/rejected": -2.6010260581970215, + "logps/chosen": -265.89276123046875, + "logps/rejected": -201.60171508789062, + "loss": 0.2514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.137716606259346, + "rewards/margins": 2.6144933700561523, + "rewards/rejected": -2.7522101402282715, + "step": 6338 + }, + { + "epoch": 0.73, + "learning_rate": 8.199695657263255e-08, + "logits/chosen": -2.4059457778930664, + "logits/rejected": -2.6443071365356445, + "logps/chosen": -321.11517333984375, + "logps/rejected": -205.2763214111328, + "loss": 0.2129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4175490736961365, + "rewards/margins": 2.2336456775665283, + "rewards/rejected": -2.6511945724487305, + "step": 6339 + }, + { + "epoch": 0.73, + "learning_rate": 8.196184010300832e-08, + "logits/chosen": -2.130620002746582, + "logits/rejected": -2.2012245655059814, + "logps/chosen": -260.7149353027344, + "logps/rejected": -270.0628662109375, + "loss": 0.3249, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1248072385787964, + "rewards/margins": 2.2075963020324707, + "rewards/rejected": -3.3324034214019775, + "step": 6340 + }, + { + "epoch": 0.73, + "learning_rate": 8.192672363338406e-08, + "logits/chosen": -2.7342145442962646, + "logits/rejected": -2.869736671447754, + "logps/chosen": -333.152587890625, + "logps/rejected": -214.92230224609375, + "loss": 0.4802, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2198526859283447, + "rewards/margins": 2.1876723766326904, + "rewards/rejected": -3.407525062561035, + "step": 6341 + }, + { + "epoch": 0.73, + "learning_rate": 8.18916071637598e-08, + "logits/chosen": -2.309976816177368, + "logits/rejected": -2.316288948059082, + "logps/chosen": -343.37109375, + "logps/rejected": -200.9020233154297, + "loss": 0.4282, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0135836601257324, + "rewards/margins": 1.5553159713745117, + "rewards/rejected": -2.568899631500244, + "step": 6342 + }, + { + "epoch": 0.73, + "learning_rate": 8.185649069413554e-08, + "logits/chosen": -2.5239453315734863, + "logits/rejected": -2.4720537662506104, + "logps/chosen": -263.7887268066406, + "logps/rejected": -329.9689636230469, + "loss": 0.112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37481689453125, + "rewards/margins": 3.1687703132629395, + "rewards/rejected": -3.5435869693756104, + "step": 6343 + }, + { + "epoch": 0.73, + "learning_rate": 8.182137422451129e-08, + "logits/chosen": -2.390998363494873, + "logits/rejected": -2.161921977996826, + "logps/chosen": -80.63632202148438, + "logps/rejected": -201.97872924804688, + "loss": 0.2668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4494555592536926, + "rewards/margins": 1.3509294986724854, + "rewards/rejected": -1.8003849983215332, + "step": 6344 + }, + { + "epoch": 0.73, + "learning_rate": 8.178625775488703e-08, + "logits/chosen": -2.823690176010132, + "logits/rejected": -2.910547971725464, + "logps/chosen": -190.60546875, + "logps/rejected": -237.73574829101562, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8142039775848389, + "rewards/margins": 2.607410430908203, + "rewards/rejected": -4.421614646911621, + "step": 6345 + }, + { + "epoch": 0.73, + "learning_rate": 8.175114128526279e-08, + "logits/chosen": -2.482372522354126, + "logits/rejected": -2.581169843673706, + "logps/chosen": -128.9905242919922, + "logps/rejected": -252.28872680664062, + "loss": 0.2369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9971991181373596, + "rewards/margins": 2.3234171867370605, + "rewards/rejected": -3.3206162452697754, + "step": 6346 + }, + { + "epoch": 0.73, + "learning_rate": 8.171602481563853e-08, + "logits/chosen": -1.858801245689392, + "logits/rejected": -1.9574872255325317, + "logps/chosen": -230.8712615966797, + "logps/rejected": -320.7490234375, + "loss": 0.3839, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.008293628692627, + "rewards/margins": 3.370222568511963, + "rewards/rejected": -4.37851619720459, + "step": 6347 + }, + { + "epoch": 0.73, + "learning_rate": 8.168090834601428e-08, + "logits/chosen": -2.3160006999969482, + "logits/rejected": -2.2043659687042236, + "logps/chosen": -357.45404052734375, + "logps/rejected": -316.69677734375, + "loss": 0.2392, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1023449897766113, + "rewards/margins": 1.8960977792739868, + "rewards/rejected": -2.9984426498413086, + "step": 6348 + }, + { + "epoch": 0.73, + "learning_rate": 8.164579187639002e-08, + "logits/chosen": -2.2949202060699463, + "logits/rejected": -2.245363235473633, + "logps/chosen": -393.33343505859375, + "logps/rejected": -431.662841796875, + "loss": 0.2169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6600640416145325, + "rewards/margins": 1.8105084896087646, + "rewards/rejected": -2.4705724716186523, + "step": 6349 + }, + { + "epoch": 0.73, + "learning_rate": 8.161067540676576e-08, + "logits/chosen": -2.2637951374053955, + "logits/rejected": -2.3149495124816895, + "logps/chosen": -283.7700500488281, + "logps/rejected": -380.4562683105469, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10341956466436386, + "rewards/margins": 4.639217376708984, + "rewards/rejected": -4.535798072814941, + "step": 6350 + }, + { + "epoch": 0.73, + "learning_rate": 8.15755589371415e-08, + "logits/chosen": -1.6531705856323242, + "logits/rejected": -1.8525285720825195, + "logps/chosen": -463.3060302734375, + "logps/rejected": -338.70611572265625, + "loss": 0.6218, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1300806999206543, + "rewards/margins": 1.5060198307037354, + "rewards/rejected": -2.6361002922058105, + "step": 6351 + }, + { + "epoch": 0.73, + "learning_rate": 8.154044246751727e-08, + "logits/chosen": -2.255352020263672, + "logits/rejected": -2.5544281005859375, + "logps/chosen": -565.666748046875, + "logps/rejected": -379.1965026855469, + "loss": 0.2558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7482801079750061, + "rewards/margins": 2.4713292121887207, + "rewards/rejected": -3.219609260559082, + "step": 6352 + }, + { + "epoch": 0.73, + "learning_rate": 8.150532599789301e-08, + "logits/chosen": -1.8975636959075928, + "logits/rejected": -2.0520801544189453, + "logps/chosen": -436.720947265625, + "logps/rejected": -362.0289306640625, + "loss": 0.7435, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.18914794921875, + "rewards/margins": 0.2827470898628235, + "rewards/rejected": -2.4718949794769287, + "step": 6353 + }, + { + "epoch": 0.73, + "learning_rate": 8.147020952826875e-08, + "logits/chosen": -2.806766986846924, + "logits/rejected": -2.793584108352661, + "logps/chosen": -120.37533569335938, + "logps/rejected": -199.9814453125, + "loss": 0.3709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19323423504829407, + "rewards/margins": 3.294577121734619, + "rewards/rejected": -3.487811326980591, + "step": 6354 + }, + { + "epoch": 0.73, + "learning_rate": 8.143509305864449e-08, + "logits/chosen": -2.0028045177459717, + "logits/rejected": -1.924503207206726, + "logps/chosen": -270.8369140625, + "logps/rejected": -225.6135711669922, + "loss": 0.6269, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.235102891921997, + "rewards/margins": 0.4417527914047241, + "rewards/rejected": -1.6768555641174316, + "step": 6355 + }, + { + "epoch": 0.73, + "learning_rate": 8.139997658902025e-08, + "logits/chosen": -2.2853167057037354, + "logits/rejected": -2.096168279647827, + "logps/chosen": -237.3240966796875, + "logps/rejected": -360.6643981933594, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8597375750541687, + "rewards/margins": 6.0605974197387695, + "rewards/rejected": -6.920334815979004, + "step": 6356 + }, + { + "epoch": 0.73, + "learning_rate": 8.1364860119396e-08, + "logits/chosen": -2.394841194152832, + "logits/rejected": -2.3641326427459717, + "logps/chosen": -363.62237548828125, + "logps/rejected": -352.436767578125, + "loss": 0.3063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4609930217266083, + "rewards/margins": 1.5598292350769043, + "rewards/rejected": -2.020822286605835, + "step": 6357 + }, + { + "epoch": 0.73, + "learning_rate": 8.132974364977174e-08, + "logits/chosen": -2.2497401237487793, + "logits/rejected": -2.5336315631866455, + "logps/chosen": -268.3782653808594, + "logps/rejected": -169.74700927734375, + "loss": 0.5569, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.331679344177246, + "rewards/margins": 1.05213463306427, + "rewards/rejected": -2.3838138580322266, + "step": 6358 + }, + { + "epoch": 0.73, + "learning_rate": 8.129462718014748e-08, + "logits/chosen": -1.6703121662139893, + "logits/rejected": -1.8668413162231445, + "logps/chosen": -263.4383239746094, + "logps/rejected": -286.4892578125, + "loss": 0.6445, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9325589537620544, + "rewards/margins": 0.706446647644043, + "rewards/rejected": -1.6390056610107422, + "step": 6359 + }, + { + "epoch": 0.73, + "learning_rate": 8.125951071052324e-08, + "logits/chosen": -2.5112390518188477, + "logits/rejected": -2.711806297302246, + "logps/chosen": -331.37945556640625, + "logps/rejected": -288.34686279296875, + "loss": 0.368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8653655648231506, + "rewards/margins": 1.3231080770492554, + "rewards/rejected": -2.188473701477051, + "step": 6360 + }, + { + "epoch": 0.73, + "learning_rate": 8.122439424089898e-08, + "logits/chosen": -2.519791841506958, + "logits/rejected": -2.5204505920410156, + "logps/chosen": -306.25335693359375, + "logps/rejected": -265.5081787109375, + "loss": 0.3138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3386424779891968, + "rewards/margins": 1.9262290000915527, + "rewards/rejected": -2.26487135887146, + "step": 6361 + }, + { + "epoch": 0.73, + "learning_rate": 8.118927777127472e-08, + "logits/chosen": -2.3090872764587402, + "logits/rejected": -2.2214293479919434, + "logps/chosen": -466.62652587890625, + "logps/rejected": -529.0289916992188, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5240232348442078, + "rewards/margins": 2.822031021118164, + "rewards/rejected": -3.3460543155670166, + "step": 6362 + }, + { + "epoch": 0.73, + "learning_rate": 8.115416130165047e-08, + "logits/chosen": -1.7580111026763916, + "logits/rejected": -2.021451711654663, + "logps/chosen": -468.05438232421875, + "logps/rejected": -326.173828125, + "loss": 0.3078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8243921995162964, + "rewards/margins": 1.7820764780044556, + "rewards/rejected": -2.606468677520752, + "step": 6363 + }, + { + "epoch": 0.73, + "learning_rate": 8.111904483202622e-08, + "logits/chosen": -2.5790743827819824, + "logits/rejected": -2.7491400241851807, + "logps/chosen": -291.67108154296875, + "logps/rejected": -213.4188232421875, + "loss": 0.1796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6585649251937866, + "rewards/margins": 3.517352819442749, + "rewards/rejected": -4.175917625427246, + "step": 6364 + }, + { + "epoch": 0.73, + "learning_rate": 8.108392836240196e-08, + "logits/chosen": -2.3361470699310303, + "logits/rejected": -2.224724054336548, + "logps/chosen": -231.9283905029297, + "logps/rejected": -269.0089416503906, + "loss": 0.4064, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8580654859542847, + "rewards/margins": 2.6829113960266113, + "rewards/rejected": -3.5409770011901855, + "step": 6365 + }, + { + "epoch": 0.73, + "learning_rate": 8.10488118927777e-08, + "logits/chosen": -2.768564224243164, + "logits/rejected": -2.5820932388305664, + "logps/chosen": -217.0152130126953, + "logps/rejected": -220.43556213378906, + "loss": 0.4404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33282244205474854, + "rewards/margins": 1.3418736457824707, + "rewards/rejected": -1.6746959686279297, + "step": 6366 + }, + { + "epoch": 0.73, + "learning_rate": 8.101369542315345e-08, + "logits/chosen": -2.3345324993133545, + "logits/rejected": -2.244513988494873, + "logps/chosen": -249.9547119140625, + "logps/rejected": -286.08819580078125, + "loss": 0.4319, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4310364723205566, + "rewards/margins": 1.3876878023147583, + "rewards/rejected": -2.8187241554260254, + "step": 6367 + }, + { + "epoch": 0.73, + "learning_rate": 8.097857895352921e-08, + "logits/chosen": -2.266221284866333, + "logits/rejected": -2.297248125076294, + "logps/chosen": -180.04598999023438, + "logps/rejected": -221.8602294921875, + "loss": 0.2971, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7717633247375488, + "rewards/margins": 2.591947078704834, + "rewards/rejected": -3.363710403442383, + "step": 6368 + }, + { + "epoch": 0.73, + "learning_rate": 8.094346248390495e-08, + "logits/chosen": -2.61250901222229, + "logits/rejected": -2.58681321144104, + "logps/chosen": -251.95245361328125, + "logps/rejected": -253.99545288085938, + "loss": 0.4844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.772936999797821, + "rewards/margins": 2.3300046920776367, + "rewards/rejected": -3.1029417514801025, + "step": 6369 + }, + { + "epoch": 0.73, + "learning_rate": 8.09083460142807e-08, + "logits/chosen": -2.5553829669952393, + "logits/rejected": -2.4301791191101074, + "logps/chosen": -126.31010437011719, + "logps/rejected": -205.8458251953125, + "loss": 0.7886, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8211429119110107, + "rewards/margins": 0.9629863500595093, + "rewards/rejected": -1.7841291427612305, + "step": 6370 + }, + { + "epoch": 0.73, + "learning_rate": 8.087322954465643e-08, + "logits/chosen": -1.948150873184204, + "logits/rejected": -2.0341827869415283, + "logps/chosen": -370.7632141113281, + "logps/rejected": -295.3415222167969, + "loss": 0.3934, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7262762188911438, + "rewards/margins": 1.8208978176116943, + "rewards/rejected": -2.5471739768981934, + "step": 6371 + }, + { + "epoch": 0.73, + "learning_rate": 8.083811307503219e-08, + "logits/chosen": -2.8742988109588623, + "logits/rejected": -2.921616554260254, + "logps/chosen": -202.2168731689453, + "logps/rejected": -283.3342590332031, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23763859272003174, + "rewards/margins": 4.310562610626221, + "rewards/rejected": -4.548201560974121, + "step": 6372 + }, + { + "epoch": 0.73, + "learning_rate": 8.080299660540793e-08, + "logits/chosen": -1.3768284320831299, + "logits/rejected": -1.5724561214447021, + "logps/chosen": -794.1969604492188, + "logps/rejected": -531.1336059570312, + "loss": 0.116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9047077298164368, + "rewards/margins": 3.2718091011047363, + "rewards/rejected": -2.3671014308929443, + "step": 6373 + }, + { + "epoch": 0.73, + "learning_rate": 8.076788013578368e-08, + "logits/chosen": -2.066056251525879, + "logits/rejected": -2.0103328227996826, + "logps/chosen": -258.2409973144531, + "logps/rejected": -278.01605224609375, + "loss": 0.1977, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3902689218521118, + "rewards/margins": 2.1864895820617676, + "rewards/rejected": -3.576758623123169, + "step": 6374 + }, + { + "epoch": 0.73, + "learning_rate": 8.073276366615942e-08, + "logits/chosen": -2.45991587638855, + "logits/rejected": -2.242097854614258, + "logps/chosen": -302.597900390625, + "logps/rejected": -277.66632080078125, + "loss": 0.1754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09176439791917801, + "rewards/margins": 2.537973642349243, + "rewards/rejected": -2.6297380924224854, + "step": 6375 + }, + { + "epoch": 0.74, + "learning_rate": 8.069764719653518e-08, + "logits/chosen": -2.4948744773864746, + "logits/rejected": -2.447111129760742, + "logps/chosen": -213.85787963867188, + "logps/rejected": -233.74899291992188, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2965967655181885, + "rewards/margins": 3.984081745147705, + "rewards/rejected": -5.2806782722473145, + "step": 6376 + }, + { + "epoch": 0.74, + "learning_rate": 8.066253072691092e-08, + "logits/chosen": -2.5758352279663086, + "logits/rejected": -2.632309675216675, + "logps/chosen": -193.438232421875, + "logps/rejected": -150.88259887695312, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0917656272649765, + "rewards/margins": 2.7318274974823, + "rewards/rejected": -2.8235931396484375, + "step": 6377 + }, + { + "epoch": 0.74, + "learning_rate": 8.062741425728666e-08, + "logits/chosen": -2.487255573272705, + "logits/rejected": -2.461381435394287, + "logps/chosen": -278.92169189453125, + "logps/rejected": -251.03634643554688, + "loss": 0.343, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5720669031143188, + "rewards/margins": 2.720614433288574, + "rewards/rejected": -3.2926812171936035, + "step": 6378 + }, + { + "epoch": 0.74, + "learning_rate": 8.05922977876624e-08, + "logits/chosen": -2.732553720474243, + "logits/rejected": -2.8043510913848877, + "logps/chosen": -244.06129455566406, + "logps/rejected": -326.8020935058594, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5959762334823608, + "rewards/margins": 3.772116184234619, + "rewards/rejected": -4.3680925369262695, + "step": 6379 + }, + { + "epoch": 0.74, + "learning_rate": 8.055718131803817e-08, + "logits/chosen": -2.7097082138061523, + "logits/rejected": -2.684781074523926, + "logps/chosen": -152.98069763183594, + "logps/rejected": -237.6201934814453, + "loss": 0.2068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0992136001586914, + "rewards/margins": 2.562668800354004, + "rewards/rejected": -3.6618824005126953, + "step": 6380 + }, + { + "epoch": 0.74, + "learning_rate": 8.052206484841391e-08, + "logits/chosen": -2.6834774017333984, + "logits/rejected": -2.6130194664001465, + "logps/chosen": -445.918212890625, + "logps/rejected": -308.94622802734375, + "loss": 0.5191, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3532713651657104, + "rewards/margins": 0.8192808628082275, + "rewards/rejected": -2.1725521087646484, + "step": 6381 + }, + { + "epoch": 0.74, + "learning_rate": 8.048694837878965e-08, + "logits/chosen": -2.255643606185913, + "logits/rejected": -2.6175029277801514, + "logps/chosen": -245.38607788085938, + "logps/rejected": -204.0383758544922, + "loss": 0.5939, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.290708303451538, + "rewards/margins": 2.542229652404785, + "rewards/rejected": -3.832937717437744, + "step": 6382 + }, + { + "epoch": 0.74, + "learning_rate": 8.045183190916539e-08, + "logits/chosen": -2.272500991821289, + "logits/rejected": -2.5522613525390625, + "logps/chosen": -365.3365783691406, + "logps/rejected": -372.71820068359375, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6581963300704956, + "rewards/margins": 3.5990185737609863, + "rewards/rejected": -4.257214546203613, + "step": 6383 + }, + { + "epoch": 0.74, + "learning_rate": 8.041671543954113e-08, + "logits/chosen": -2.1730363368988037, + "logits/rejected": -1.8182759284973145, + "logps/chosen": -238.58482360839844, + "logps/rejected": -227.3902130126953, + "loss": 0.7565, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.096449375152588, + "rewards/margins": 0.38243913650512695, + "rewards/rejected": -2.4788882732391357, + "step": 6384 + }, + { + "epoch": 0.74, + "learning_rate": 8.03815989699169e-08, + "logits/chosen": -2.781376600265503, + "logits/rejected": -3.0339813232421875, + "logps/chosen": -386.65106201171875, + "logps/rejected": -254.91329956054688, + "loss": 0.8309, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4083740711212158, + "rewards/margins": 0.9212251305580139, + "rewards/rejected": -2.329599142074585, + "step": 6385 + }, + { + "epoch": 0.74, + "learning_rate": 8.034648250029264e-08, + "logits/chosen": -1.5968824625015259, + "logits/rejected": -1.9977178573608398, + "logps/chosen": -532.6448364257812, + "logps/rejected": -370.3000183105469, + "loss": 0.5513, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4319171905517578, + "rewards/margins": 1.8617665767669678, + "rewards/rejected": -3.2936837673187256, + "step": 6386 + }, + { + "epoch": 0.74, + "learning_rate": 8.031136603066838e-08, + "logits/chosen": -2.138545513153076, + "logits/rejected": -2.2540013790130615, + "logps/chosen": -426.86651611328125, + "logps/rejected": -335.76470947265625, + "loss": 0.4627, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5455410480499268, + "rewards/margins": 2.2751266956329346, + "rewards/rejected": -3.8206675052642822, + "step": 6387 + }, + { + "epoch": 0.74, + "learning_rate": 8.027624956104412e-08, + "logits/chosen": -2.246570587158203, + "logits/rejected": -2.315622329711914, + "logps/chosen": -272.1458435058594, + "logps/rejected": -273.05108642578125, + "loss": 0.5677, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1001663208007812, + "rewards/margins": 1.9030261039733887, + "rewards/rejected": -3.00319242477417, + "step": 6388 + }, + { + "epoch": 0.74, + "learning_rate": 8.024113309141987e-08, + "logits/chosen": -2.0697903633117676, + "logits/rejected": -2.1557161808013916, + "logps/chosen": -351.2956237792969, + "logps/rejected": -424.05126953125, + "loss": 0.8872, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2370117902755737, + "rewards/margins": 0.5124993920326233, + "rewards/rejected": -1.7495112419128418, + "step": 6389 + }, + { + "epoch": 0.74, + "learning_rate": 8.020601662179561e-08, + "logits/chosen": -2.162193775177002, + "logits/rejected": -2.469771385192871, + "logps/chosen": -377.821044921875, + "logps/rejected": -218.41156005859375, + "loss": 0.2649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9606779217720032, + "rewards/margins": 2.058166742324829, + "rewards/rejected": -3.0188450813293457, + "step": 6390 + }, + { + "epoch": 0.74, + "learning_rate": 8.017090015217137e-08, + "logits/chosen": -2.9660146236419678, + "logits/rejected": -2.8854427337646484, + "logps/chosen": -348.4544982910156, + "logps/rejected": -283.0543212890625, + "loss": 0.0665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00573354959487915, + "rewards/margins": 4.998013496398926, + "rewards/rejected": -5.00374698638916, + "step": 6391 + }, + { + "epoch": 0.74, + "learning_rate": 8.013578368254711e-08, + "logits/chosen": -2.6124448776245117, + "logits/rejected": -2.7763028144836426, + "logps/chosen": -453.7452087402344, + "logps/rejected": -345.0311584472656, + "loss": 0.4382, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0088281631469727, + "rewards/margins": 2.7396328449249268, + "rewards/rejected": -3.7484612464904785, + "step": 6392 + }, + { + "epoch": 0.74, + "learning_rate": 8.010066721292286e-08, + "logits/chosen": -2.3813838958740234, + "logits/rejected": -2.5342705249786377, + "logps/chosen": -282.2845153808594, + "logps/rejected": -279.1298828125, + "loss": 0.3752, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1282429695129395, + "rewards/margins": 1.5379365682601929, + "rewards/rejected": -2.6661794185638428, + "step": 6393 + }, + { + "epoch": 0.74, + "learning_rate": 8.00655507432986e-08, + "logits/chosen": -2.48172664642334, + "logits/rejected": -2.4220025539398193, + "logps/chosen": -243.76797485351562, + "logps/rejected": -248.43336486816406, + "loss": 0.2959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9995435476303101, + "rewards/margins": 2.834808349609375, + "rewards/rejected": -3.8343520164489746, + "step": 6394 + }, + { + "epoch": 0.74, + "learning_rate": 8.003043427367434e-08, + "logits/chosen": -1.945418119430542, + "logits/rejected": -1.86582612991333, + "logps/chosen": -369.0548095703125, + "logps/rejected": -414.2465515136719, + "loss": 0.3213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5927164554595947, + "rewards/margins": 2.632634162902832, + "rewards/rejected": -3.225350856781006, + "step": 6395 + }, + { + "epoch": 0.74, + "learning_rate": 7.999531780405008e-08, + "logits/chosen": -2.653665065765381, + "logits/rejected": -2.700356960296631, + "logps/chosen": -172.72482299804688, + "logps/rejected": -203.80426025390625, + "loss": 0.6608, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3890851140022278, + "rewards/margins": 2.369786024093628, + "rewards/rejected": -2.75887131690979, + "step": 6396 + }, + { + "epoch": 0.74, + "learning_rate": 7.996020133442585e-08, + "logits/chosen": -2.669726848602295, + "logits/rejected": -2.524165630340576, + "logps/chosen": -257.71990966796875, + "logps/rejected": -184.6854248046875, + "loss": 0.3854, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6765429973602295, + "rewards/margins": 1.2921046018600464, + "rewards/rejected": -2.9686474800109863, + "step": 6397 + }, + { + "epoch": 0.74, + "learning_rate": 7.992508486480159e-08, + "logits/chosen": -2.180379867553711, + "logits/rejected": -2.097355604171753, + "logps/chosen": -257.0735168457031, + "logps/rejected": -247.42364501953125, + "loss": 0.7912, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2193245887756348, + "rewards/margins": 1.1020574569702148, + "rewards/rejected": -2.3213820457458496, + "step": 6398 + }, + { + "epoch": 0.74, + "learning_rate": 7.988996839517733e-08, + "logits/chosen": -2.722238063812256, + "logits/rejected": -2.9235591888427734, + "logps/chosen": -237.24237060546875, + "logps/rejected": -189.39865112304688, + "loss": 0.4851, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.083184838294983, + "rewards/margins": 2.5028064250946045, + "rewards/rejected": -3.585991144180298, + "step": 6399 + }, + { + "epoch": 0.74, + "learning_rate": 7.985485192555307e-08, + "logits/chosen": -1.6158764362335205, + "logits/rejected": -1.9502838850021362, + "logps/chosen": -448.6182556152344, + "logps/rejected": -396.8519287109375, + "loss": 0.4017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5739383697509766, + "rewards/margins": 1.6110262870788574, + "rewards/rejected": -2.184964418411255, + "step": 6400 + }, + { + "epoch": 0.74, + "learning_rate": 7.981973545592884e-08, + "logits/chosen": -2.506779670715332, + "logits/rejected": -2.308211326599121, + "logps/chosen": -262.9653015136719, + "logps/rejected": -371.61138916015625, + "loss": 0.4415, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45531612634658813, + "rewards/margins": 1.0279085636138916, + "rewards/rejected": -1.483224630355835, + "step": 6401 + }, + { + "epoch": 0.74, + "learning_rate": 7.978461898630458e-08, + "logits/chosen": -2.10707426071167, + "logits/rejected": -1.7326923608779907, + "logps/chosen": -285.470458984375, + "logps/rejected": -394.638671875, + "loss": 0.8362, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8227816224098206, + "rewards/margins": 1.529876708984375, + "rewards/rejected": -2.352658271789551, + "step": 6402 + }, + { + "epoch": 0.74, + "learning_rate": 7.974950251668032e-08, + "logits/chosen": -2.3302435874938965, + "logits/rejected": -2.4216527938842773, + "logps/chosen": -175.2929229736328, + "logps/rejected": -278.3038635253906, + "loss": 0.3045, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.746164321899414, + "rewards/margins": 2.1515235900878906, + "rewards/rejected": -3.8976879119873047, + "step": 6403 + }, + { + "epoch": 0.74, + "learning_rate": 7.971438604705606e-08, + "logits/chosen": -2.4256796836853027, + "logits/rejected": -2.5446412563323975, + "logps/chosen": -227.4824676513672, + "logps/rejected": -174.8285675048828, + "loss": 0.2375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042053669691085815, + "rewards/margins": 1.6868724822998047, + "rewards/rejected": -1.728926181793213, + "step": 6404 + }, + { + "epoch": 0.74, + "learning_rate": 7.967926957743181e-08, + "logits/chosen": -2.22690749168396, + "logits/rejected": -2.29463529586792, + "logps/chosen": -374.98614501953125, + "logps/rejected": -255.7134552001953, + "loss": 0.554, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4797999858856201, + "rewards/margins": 1.3305943012237549, + "rewards/rejected": -2.810394287109375, + "step": 6405 + }, + { + "epoch": 0.74, + "learning_rate": 7.964415310780756e-08, + "logits/chosen": -2.309441328048706, + "logits/rejected": -2.3212080001831055, + "logps/chosen": -307.76617431640625, + "logps/rejected": -246.54434204101562, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9418140649795532, + "rewards/margins": 2.6336476802825928, + "rewards/rejected": -3.5754618644714355, + "step": 6406 + }, + { + "epoch": 0.74, + "learning_rate": 7.96090366381833e-08, + "logits/chosen": -2.4623544216156006, + "logits/rejected": -2.5658891201019287, + "logps/chosen": -386.93328857421875, + "logps/rejected": -140.93704223632812, + "loss": 0.9728, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8736176490783691, + "rewards/margins": 1.0964233875274658, + "rewards/rejected": -2.970041036605835, + "step": 6407 + }, + { + "epoch": 0.74, + "learning_rate": 7.957392016855905e-08, + "logits/chosen": -2.8318049907684326, + "logits/rejected": -2.626105785369873, + "logps/chosen": -137.23582458496094, + "logps/rejected": -344.92486572265625, + "loss": 0.7604, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3234087228775024, + "rewards/margins": 3.319016695022583, + "rewards/rejected": -4.642425537109375, + "step": 6408 + }, + { + "epoch": 0.74, + "learning_rate": 7.95388036989348e-08, + "logits/chosen": -2.455880641937256, + "logits/rejected": -2.577216148376465, + "logps/chosen": -331.03045654296875, + "logps/rejected": -288.5938720703125, + "loss": 0.2764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11959246546030045, + "rewards/margins": 2.621522903442383, + "rewards/rejected": -2.7411155700683594, + "step": 6409 + }, + { + "epoch": 0.74, + "learning_rate": 7.950368722931054e-08, + "logits/chosen": -2.213174343109131, + "logits/rejected": -2.2856104373931885, + "logps/chosen": -294.5087890625, + "logps/rejected": -374.08111572265625, + "loss": 0.5857, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2300142049789429, + "rewards/margins": 2.2285032272338867, + "rewards/rejected": -3.458517551422119, + "step": 6410 + }, + { + "epoch": 0.74, + "learning_rate": 7.946857075968628e-08, + "logits/chosen": -1.3620768785476685, + "logits/rejected": -1.5563318729400635, + "logps/chosen": -505.12890625, + "logps/rejected": -405.96502685546875, + "loss": 1.2042, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8606098890304565, + "rewards/margins": 2.127412796020508, + "rewards/rejected": -3.988022565841675, + "step": 6411 + }, + { + "epoch": 0.74, + "learning_rate": 7.943345429006203e-08, + "logits/chosen": -1.6535234451293945, + "logits/rejected": -2.107401132583618, + "logps/chosen": -350.5997009277344, + "logps/rejected": -333.0582275390625, + "loss": 0.2219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9591453671455383, + "rewards/margins": 2.9232308864593506, + "rewards/rejected": -3.882376194000244, + "step": 6412 + }, + { + "epoch": 0.74, + "learning_rate": 7.939833782043779e-08, + "logits/chosen": -2.4607157707214355, + "logits/rejected": -2.1406097412109375, + "logps/chosen": -153.85824584960938, + "logps/rejected": -286.51904296875, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5715046525001526, + "rewards/margins": 2.5354812145233154, + "rewards/rejected": -3.1069858074188232, + "step": 6413 + }, + { + "epoch": 0.74, + "learning_rate": 7.936322135081353e-08, + "logits/chosen": -1.7988370656967163, + "logits/rejected": -1.9334993362426758, + "logps/chosen": -538.4217529296875, + "logps/rejected": -362.9787292480469, + "loss": 0.4981, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3033035397529602, + "rewards/margins": 2.029696464538574, + "rewards/rejected": -2.3329999446868896, + "step": 6414 + }, + { + "epoch": 0.74, + "learning_rate": 7.932810488118927e-08, + "logits/chosen": -2.216418743133545, + "logits/rejected": -2.053710460662842, + "logps/chosen": -235.7281036376953, + "logps/rejected": -148.7792510986328, + "loss": 0.6709, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8406486511230469, + "rewards/margins": 0.7119261622428894, + "rewards/rejected": -2.552574872970581, + "step": 6415 + }, + { + "epoch": 0.74, + "learning_rate": 7.929298841156501e-08, + "logits/chosen": -2.6197280883789062, + "logits/rejected": -2.36440110206604, + "logps/chosen": -311.80902099609375, + "logps/rejected": -218.7776336669922, + "loss": 0.553, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7082455158233643, + "rewards/margins": 2.877552032470703, + "rewards/rejected": -4.585797309875488, + "step": 6416 + }, + { + "epoch": 0.74, + "learning_rate": 7.925787194194077e-08, + "logits/chosen": -2.720468759536743, + "logits/rejected": -2.725301504135132, + "logps/chosen": -221.96942138671875, + "logps/rejected": -238.26394653320312, + "loss": 0.3666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6392718553543091, + "rewards/margins": 1.8233892917633057, + "rewards/rejected": -2.4626612663269043, + "step": 6417 + }, + { + "epoch": 0.74, + "learning_rate": 7.922275547231652e-08, + "logits/chosen": -1.735724925994873, + "logits/rejected": -2.142540693283081, + "logps/chosen": -499.6145324707031, + "logps/rejected": -315.0863037109375, + "loss": 0.4944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5765542984008789, + "rewards/margins": 1.2061653137207031, + "rewards/rejected": -1.7827198505401611, + "step": 6418 + }, + { + "epoch": 0.74, + "learning_rate": 7.918763900269226e-08, + "logits/chosen": -2.370474338531494, + "logits/rejected": -2.2836248874664307, + "logps/chosen": -393.06536865234375, + "logps/rejected": -288.4326171875, + "loss": 0.3215, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3553296327590942, + "rewards/margins": 3.765162944793701, + "rewards/rejected": -5.120492935180664, + "step": 6419 + }, + { + "epoch": 0.74, + "learning_rate": 7.9152522533068e-08, + "logits/chosen": -2.1072311401367188, + "logits/rejected": -2.2619709968566895, + "logps/chosen": -190.51950073242188, + "logps/rejected": -157.4500732421875, + "loss": 1.2369, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1856937408447266, + "rewards/margins": 0.45386117696762085, + "rewards/rejected": -2.639554977416992, + "step": 6420 + }, + { + "epoch": 0.74, + "learning_rate": 7.911740606344376e-08, + "logits/chosen": -2.379077911376953, + "logits/rejected": -2.2015795707702637, + "logps/chosen": -261.4081115722656, + "logps/rejected": -394.0387878417969, + "loss": 0.3558, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3546395301818848, + "rewards/margins": 2.031014919281006, + "rewards/rejected": -3.3856544494628906, + "step": 6421 + }, + { + "epoch": 0.74, + "learning_rate": 7.90822895938195e-08, + "logits/chosen": -2.819373369216919, + "logits/rejected": -2.8321642875671387, + "logps/chosen": -371.3328857421875, + "logps/rejected": -278.80029296875, + "loss": 0.3441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8624457120895386, + "rewards/margins": 3.117760181427002, + "rewards/rejected": -3.980205774307251, + "step": 6422 + }, + { + "epoch": 0.74, + "learning_rate": 7.904717312419524e-08, + "logits/chosen": -1.857879638671875, + "logits/rejected": -2.054764986038208, + "logps/chosen": -530.8917236328125, + "logps/rejected": -518.1456909179688, + "loss": 0.1813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7746454477310181, + "rewards/margins": 3.3920633792877197, + "rewards/rejected": -4.166708469390869, + "step": 6423 + }, + { + "epoch": 0.74, + "learning_rate": 7.901205665457098e-08, + "logits/chosen": -1.8557720184326172, + "logits/rejected": -2.1491153240203857, + "logps/chosen": -424.20562744140625, + "logps/rejected": -215.3413848876953, + "loss": 0.717, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3785886764526367, + "rewards/margins": 1.2979018688201904, + "rewards/rejected": -2.676490306854248, + "step": 6424 + }, + { + "epoch": 0.74, + "learning_rate": 7.897694018494675e-08, + "logits/chosen": -2.637580394744873, + "logits/rejected": -2.407466173171997, + "logps/chosen": -98.4737319946289, + "logps/rejected": -225.00840759277344, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1784374713897705, + "rewards/margins": 2.199950933456421, + "rewards/rejected": -3.3783884048461914, + "step": 6425 + }, + { + "epoch": 0.74, + "learning_rate": 7.894182371532249e-08, + "logits/chosen": -2.3134500980377197, + "logits/rejected": -2.131568193435669, + "logps/chosen": -111.96435546875, + "logps/rejected": -200.433349609375, + "loss": 0.4073, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8545811176300049, + "rewards/margins": 3.1020326614379883, + "rewards/rejected": -3.956613540649414, + "step": 6426 + }, + { + "epoch": 0.74, + "learning_rate": 7.890670724569823e-08, + "logits/chosen": -1.9434306621551514, + "logits/rejected": -2.1838417053222656, + "logps/chosen": -347.0010070800781, + "logps/rejected": -258.3846435546875, + "loss": 0.767, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4488327503204346, + "rewards/margins": 0.4761721193790436, + "rewards/rejected": -1.9250049591064453, + "step": 6427 + }, + { + "epoch": 0.74, + "learning_rate": 7.887159077607397e-08, + "logits/chosen": -2.722557783126831, + "logits/rejected": -2.5542032718658447, + "logps/chosen": -262.83056640625, + "logps/rejected": -242.60928344726562, + "loss": 0.2973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43471547961235046, + "rewards/margins": 2.5775246620178223, + "rewards/rejected": -3.012239933013916, + "step": 6428 + }, + { + "epoch": 0.74, + "learning_rate": 7.883647430644973e-08, + "logits/chosen": -2.2877869606018066, + "logits/rejected": -2.2301974296569824, + "logps/chosen": -163.96327209472656, + "logps/rejected": -319.1167297363281, + "loss": 0.2933, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9663816690444946, + "rewards/margins": 1.8517011404037476, + "rewards/rejected": -2.818082809448242, + "step": 6429 + }, + { + "epoch": 0.74, + "learning_rate": 7.880135783682548e-08, + "logits/chosen": -2.7155580520629883, + "logits/rejected": -2.6351821422576904, + "logps/chosen": -219.98056030273438, + "logps/rejected": -147.95458984375, + "loss": 0.7075, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.6444718837738037, + "rewards/margins": 0.4376877248287201, + "rewards/rejected": -3.0821595191955566, + "step": 6430 + }, + { + "epoch": 0.74, + "learning_rate": 7.876624136720122e-08, + "logits/chosen": -1.9922778606414795, + "logits/rejected": -2.037260055541992, + "logps/chosen": -193.55072021484375, + "logps/rejected": -202.72698974609375, + "loss": 0.2609, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.52667635679245, + "rewards/margins": 2.8003146648406982, + "rewards/rejected": -3.326991081237793, + "step": 6431 + }, + { + "epoch": 0.74, + "learning_rate": 7.873112489757696e-08, + "logits/chosen": -1.70274019241333, + "logits/rejected": -1.6631757020950317, + "logps/chosen": -394.8659973144531, + "logps/rejected": -335.3757629394531, + "loss": 0.7651, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0694142580032349, + "rewards/margins": 0.5193524956703186, + "rewards/rejected": -1.5887668132781982, + "step": 6432 + }, + { + "epoch": 0.74, + "learning_rate": 7.86960084279527e-08, + "logits/chosen": -2.3300750255584717, + "logits/rejected": -2.3137431144714355, + "logps/chosen": -269.3847961425781, + "logps/rejected": -267.39764404296875, + "loss": 0.442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7233819365501404, + "rewards/margins": 1.9817614555358887, + "rewards/rejected": -2.705143451690674, + "step": 6433 + }, + { + "epoch": 0.74, + "learning_rate": 7.866089195832845e-08, + "logits/chosen": -2.4147818088531494, + "logits/rejected": -2.6532108783721924, + "logps/chosen": -169.53817749023438, + "logps/rejected": -144.2789306640625, + "loss": 0.5888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3313031792640686, + "rewards/margins": 1.128083348274231, + "rewards/rejected": -1.4593865871429443, + "step": 6434 + }, + { + "epoch": 0.74, + "learning_rate": 7.86257754887042e-08, + "logits/chosen": -2.5771708488464355, + "logits/rejected": -2.7032878398895264, + "logps/chosen": -567.4727172851562, + "logps/rejected": -427.50323486328125, + "loss": 0.5176, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1873751878738403, + "rewards/margins": 2.1483325958251953, + "rewards/rejected": -3.3357081413269043, + "step": 6435 + }, + { + "epoch": 0.74, + "learning_rate": 7.859065901907995e-08, + "logits/chosen": -1.969817042350769, + "logits/rejected": -1.9935349225997925, + "logps/chosen": -508.6150207519531, + "logps/rejected": -385.6596984863281, + "loss": 0.1001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29846641421318054, + "rewards/margins": 4.9982523918151855, + "rewards/rejected": -4.6997857093811035, + "step": 6436 + }, + { + "epoch": 0.74, + "learning_rate": 7.855554254945569e-08, + "logits/chosen": -2.258298397064209, + "logits/rejected": -2.2839980125427246, + "logps/chosen": -358.30133056640625, + "logps/rejected": -307.3736267089844, + "loss": 0.2978, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9639204144477844, + "rewards/margins": 3.1303770542144775, + "rewards/rejected": -4.094297409057617, + "step": 6437 + }, + { + "epoch": 0.74, + "learning_rate": 7.852042607983144e-08, + "logits/chosen": -2.2648444175720215, + "logits/rejected": -2.016960859298706, + "logps/chosen": -320.35498046875, + "logps/rejected": -295.598876953125, + "loss": 0.8387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6584441661834717, + "rewards/margins": 2.0686872005462646, + "rewards/rejected": -2.727130889892578, + "step": 6438 + }, + { + "epoch": 0.74, + "learning_rate": 7.848530961020718e-08, + "logits/chosen": -2.680231809616089, + "logits/rejected": -2.6456551551818848, + "logps/chosen": -282.062255859375, + "logps/rejected": -214.152587890625, + "loss": 0.444, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1018109321594238, + "rewards/margins": 1.464250087738037, + "rewards/rejected": -2.566061019897461, + "step": 6439 + }, + { + "epoch": 0.74, + "learning_rate": 7.845019314058292e-08, + "logits/chosen": -2.050248146057129, + "logits/rejected": -2.2360692024230957, + "logps/chosen": -293.7068176269531, + "logps/rejected": -188.82896423339844, + "loss": 0.3453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6356180906295776, + "rewards/margins": 1.3826115131378174, + "rewards/rejected": -2.0182294845581055, + "step": 6440 + }, + { + "epoch": 0.74, + "learning_rate": 7.841507667095866e-08, + "logits/chosen": -2.28641939163208, + "logits/rejected": -2.4408228397369385, + "logps/chosen": -238.4624786376953, + "logps/rejected": -195.0098876953125, + "loss": 0.4035, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6720303297042847, + "rewards/margins": 1.701149344444275, + "rewards/rejected": -2.3731796741485596, + "step": 6441 + }, + { + "epoch": 0.74, + "learning_rate": 7.837996020133443e-08, + "logits/chosen": -2.3619396686553955, + "logits/rejected": -2.1763460636138916, + "logps/chosen": -223.72113037109375, + "logps/rejected": -358.62353515625, + "loss": 0.2047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4501643180847168, + "rewards/margins": 2.043321371078491, + "rewards/rejected": -2.493485689163208, + "step": 6442 + }, + { + "epoch": 0.74, + "learning_rate": 7.834484373171017e-08, + "logits/chosen": -1.8848910331726074, + "logits/rejected": -2.023162603378296, + "logps/chosen": -338.12408447265625, + "logps/rejected": -216.5050048828125, + "loss": 0.684, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0991652011871338, + "rewards/margins": 0.9076979160308838, + "rewards/rejected": -2.0068631172180176, + "step": 6443 + }, + { + "epoch": 0.74, + "learning_rate": 7.830972726208591e-08, + "logits/chosen": -2.832934856414795, + "logits/rejected": -2.592909097671509, + "logps/chosen": -229.02769470214844, + "logps/rejected": -273.10748291015625, + "loss": 0.5224, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3246642351150513, + "rewards/margins": 1.7036811113357544, + "rewards/rejected": -3.0283453464508057, + "step": 6444 + }, + { + "epoch": 0.74, + "learning_rate": 7.827461079246165e-08, + "logits/chosen": -1.912327527999878, + "logits/rejected": -2.174506902694702, + "logps/chosen": -346.92327880859375, + "logps/rejected": -169.9529571533203, + "loss": 1.2712, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.078052282333374, + "rewards/margins": 0.40852177143096924, + "rewards/rejected": -2.486574172973633, + "step": 6445 + }, + { + "epoch": 0.74, + "learning_rate": 7.823949432283742e-08, + "logits/chosen": -1.7650127410888672, + "logits/rejected": -1.8926111459732056, + "logps/chosen": -644.5081787109375, + "logps/rejected": -531.48388671875, + "loss": 0.2979, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1092402935028076, + "rewards/margins": 2.170907974243164, + "rewards/rejected": -3.2801482677459717, + "step": 6446 + }, + { + "epoch": 0.74, + "learning_rate": 7.820437785321316e-08, + "logits/chosen": -2.6793313026428223, + "logits/rejected": -2.679818630218506, + "logps/chosen": -197.7867889404297, + "logps/rejected": -264.0374450683594, + "loss": 0.3122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6365486979484558, + "rewards/margins": 3.5393316745758057, + "rewards/rejected": -4.175880432128906, + "step": 6447 + }, + { + "epoch": 0.74, + "learning_rate": 7.81692613835889e-08, + "logits/chosen": -2.415499687194824, + "logits/rejected": -2.4725821018218994, + "logps/chosen": -226.4801025390625, + "logps/rejected": -247.71986389160156, + "loss": 0.4398, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.380735993385315, + "rewards/margins": 2.653061866760254, + "rewards/rejected": -4.033797740936279, + "step": 6448 + }, + { + "epoch": 0.74, + "learning_rate": 7.813414491396464e-08, + "logits/chosen": -2.317375898361206, + "logits/rejected": -2.331166982650757, + "logps/chosen": -379.4063720703125, + "logps/rejected": -275.5355529785156, + "loss": 0.6067, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4475027322769165, + "rewards/margins": 1.8581048250198364, + "rewards/rejected": -3.305607557296753, + "step": 6449 + }, + { + "epoch": 0.74, + "learning_rate": 7.80990284443404e-08, + "logits/chosen": -2.903177261352539, + "logits/rejected": -3.011775493621826, + "logps/chosen": -236.05715942382812, + "logps/rejected": -284.01251220703125, + "loss": 0.2052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6547107100486755, + "rewards/margins": 3.0255746841430664, + "rewards/rejected": -3.6802852153778076, + "step": 6450 + }, + { + "epoch": 0.74, + "learning_rate": 7.806391197471613e-08, + "logits/chosen": -2.0783092975616455, + "logits/rejected": -2.1809611320495605, + "logps/chosen": -414.64605712890625, + "logps/rejected": -380.7646484375, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7232978343963623, + "rewards/margins": 3.0545098781585693, + "rewards/rejected": -3.7778077125549316, + "step": 6451 + }, + { + "epoch": 0.74, + "learning_rate": 7.802879550509189e-08, + "logits/chosen": -2.606480598449707, + "logits/rejected": -2.767270088195801, + "logps/chosen": -297.9279479980469, + "logps/rejected": -217.21234130859375, + "loss": 0.1366, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10294044017791748, + "rewards/margins": 3.393662691116333, + "rewards/rejected": -3.290722370147705, + "step": 6452 + }, + { + "epoch": 0.74, + "learning_rate": 7.799367903546763e-08, + "logits/chosen": -1.9880716800689697, + "logits/rejected": -1.9042458534240723, + "logps/chosen": -321.5135192871094, + "logps/rejected": -403.478271484375, + "loss": 0.3837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5273724794387817, + "rewards/margins": 3.0594444274902344, + "rewards/rejected": -3.5868167877197266, + "step": 6453 + }, + { + "epoch": 0.74, + "learning_rate": 7.795856256584338e-08, + "logits/chosen": -2.498074531555176, + "logits/rejected": -2.5496766567230225, + "logps/chosen": -259.3530578613281, + "logps/rejected": -304.1964111328125, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5506477952003479, + "rewards/margins": 2.963754653930664, + "rewards/rejected": -3.5144026279449463, + "step": 6454 + }, + { + "epoch": 0.74, + "learning_rate": 7.792344609621912e-08, + "logits/chosen": -2.4887397289276123, + "logits/rejected": -2.4132375717163086, + "logps/chosen": -214.7025604248047, + "logps/rejected": -254.07061767578125, + "loss": 0.379, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.026071548461914, + "rewards/margins": 1.348911166191101, + "rewards/rejected": -2.3749825954437256, + "step": 6455 + }, + { + "epoch": 0.74, + "learning_rate": 7.788832962659486e-08, + "logits/chosen": -2.1765995025634766, + "logits/rejected": -2.181950330734253, + "logps/chosen": -281.34326171875, + "logps/rejected": -252.37261962890625, + "loss": 0.215, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0675787925720215, + "rewards/margins": 2.122422218322754, + "rewards/rejected": -4.190000534057617, + "step": 6456 + }, + { + "epoch": 0.74, + "learning_rate": 7.78532131569706e-08, + "logits/chosen": -2.072711706161499, + "logits/rejected": -2.375929594039917, + "logps/chosen": -324.7490539550781, + "logps/rejected": -331.50439453125, + "loss": 0.4401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40106475353240967, + "rewards/margins": 1.3583180904388428, + "rewards/rejected": -1.759382724761963, + "step": 6457 + }, + { + "epoch": 0.74, + "learning_rate": 7.781809668734637e-08, + "logits/chosen": -2.347142457962036, + "logits/rejected": -2.47988224029541, + "logps/chosen": -240.67274475097656, + "logps/rejected": -280.4209289550781, + "loss": 0.214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6158212423324585, + "rewards/margins": 2.072762966156006, + "rewards/rejected": -2.688584327697754, + "step": 6458 + }, + { + "epoch": 0.74, + "learning_rate": 7.778298021772211e-08, + "logits/chosen": -2.6794614791870117, + "logits/rejected": -2.47979736328125, + "logps/chosen": -110.50767517089844, + "logps/rejected": -154.73573303222656, + "loss": 0.6089, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9727452993392944, + "rewards/margins": 1.5328203439712524, + "rewards/rejected": -2.505565643310547, + "step": 6459 + }, + { + "epoch": 0.74, + "learning_rate": 7.774786374809785e-08, + "logits/chosen": -2.2160696983337402, + "logits/rejected": -2.457637071609497, + "logps/chosen": -315.83563232421875, + "logps/rejected": -266.3427734375, + "loss": 0.395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6930453777313232, + "rewards/margins": 2.726325035095215, + "rewards/rejected": -3.419370651245117, + "step": 6460 + }, + { + "epoch": 0.74, + "learning_rate": 7.77127472784736e-08, + "logits/chosen": -2.4251668453216553, + "logits/rejected": -2.5947484970092773, + "logps/chosen": -215.13075256347656, + "logps/rejected": -231.08673095703125, + "loss": 0.7755, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7164854407310486, + "rewards/margins": 3.1472415924072266, + "rewards/rejected": -3.8637266159057617, + "step": 6461 + }, + { + "epoch": 0.74, + "learning_rate": 7.767763080884935e-08, + "logits/chosen": -1.8866777420043945, + "logits/rejected": -1.7498369216918945, + "logps/chosen": -403.05316162109375, + "logps/rejected": -434.16156005859375, + "loss": 0.2121, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16899627447128296, + "rewards/margins": 1.8433284759521484, + "rewards/rejected": -2.012324810028076, + "step": 6462 + }, + { + "epoch": 0.75, + "learning_rate": 7.76425143392251e-08, + "logits/chosen": -2.758273124694824, + "logits/rejected": -2.876363515853882, + "logps/chosen": -300.9287109375, + "logps/rejected": -370.828125, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7106441259384155, + "rewards/margins": 3.8336968421936035, + "rewards/rejected": -4.544341087341309, + "step": 6463 + }, + { + "epoch": 0.75, + "learning_rate": 7.760739786960084e-08, + "logits/chosen": -2.2251758575439453, + "logits/rejected": -2.33439302444458, + "logps/chosen": -430.48858642578125, + "logps/rejected": -358.87762451171875, + "loss": 0.2712, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.822379231452942, + "rewards/margins": 1.893376350402832, + "rewards/rejected": -3.7157554626464844, + "step": 6464 + }, + { + "epoch": 0.75, + "learning_rate": 7.757228139997658e-08, + "logits/chosen": -2.192392587661743, + "logits/rejected": -2.2670679092407227, + "logps/chosen": -415.9221496582031, + "logps/rejected": -310.71337890625, + "loss": 0.4758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8885762691497803, + "rewards/margins": 1.7743630409240723, + "rewards/rejected": -2.6629390716552734, + "step": 6465 + }, + { + "epoch": 0.75, + "learning_rate": 7.753716493035234e-08, + "logits/chosen": -2.0446925163269043, + "logits/rejected": -2.2638275623321533, + "logps/chosen": -288.3824462890625, + "logps/rejected": -211.31930541992188, + "loss": 0.3476, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10227110236883163, + "rewards/margins": 2.252305746078491, + "rewards/rejected": -2.1500344276428223, + "step": 6466 + }, + { + "epoch": 0.75, + "learning_rate": 7.750204846072808e-08, + "logits/chosen": -2.253478527069092, + "logits/rejected": -2.077934503555298, + "logps/chosen": -259.87567138671875, + "logps/rejected": -266.9906921386719, + "loss": 0.2538, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6499896049499512, + "rewards/margins": 2.248368740081787, + "rewards/rejected": -2.8983585834503174, + "step": 6467 + }, + { + "epoch": 0.75, + "learning_rate": 7.746693199110382e-08, + "logits/chosen": -2.173880100250244, + "logits/rejected": -2.0433952808380127, + "logps/chosen": -246.80105590820312, + "logps/rejected": -320.85552978515625, + "loss": 0.3556, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05316971242427826, + "rewards/margins": 2.830954074859619, + "rewards/rejected": -2.8841238021850586, + "step": 6468 + }, + { + "epoch": 0.75, + "learning_rate": 7.743181552147957e-08, + "logits/chosen": -2.685704231262207, + "logits/rejected": -2.7442378997802734, + "logps/chosen": -165.49127197265625, + "logps/rejected": -230.98651123046875, + "loss": 0.1221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7995060086250305, + "rewards/margins": 3.158522844314575, + "rewards/rejected": -3.95802903175354, + "step": 6469 + }, + { + "epoch": 0.75, + "learning_rate": 7.739669905185533e-08, + "logits/chosen": -1.9478700160980225, + "logits/rejected": -1.789145827293396, + "logps/chosen": -448.3033447265625, + "logps/rejected": -319.44915771484375, + "loss": 1.644, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.3934171199798584, + "rewards/margins": -1.1276413202285767, + "rewards/rejected": -1.2657759189605713, + "step": 6470 + }, + { + "epoch": 0.75, + "learning_rate": 7.736158258223107e-08, + "logits/chosen": -2.2000579833984375, + "logits/rejected": -2.1360456943511963, + "logps/chosen": -236.28794860839844, + "logps/rejected": -305.7265319824219, + "loss": 0.442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.725384533405304, + "rewards/margins": 1.7610623836517334, + "rewards/rejected": -2.4864468574523926, + "step": 6471 + }, + { + "epoch": 0.75, + "learning_rate": 7.732646611260681e-08, + "logits/chosen": -2.201890230178833, + "logits/rejected": -2.3426687717437744, + "logps/chosen": -214.81680297851562, + "logps/rejected": -229.14779663085938, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3438813090324402, + "rewards/margins": 1.8016855716705322, + "rewards/rejected": -2.145566940307617, + "step": 6472 + }, + { + "epoch": 0.75, + "learning_rate": 7.729134964298255e-08, + "logits/chosen": -2.517183303833008, + "logits/rejected": -2.5893726348876953, + "logps/chosen": -259.83782958984375, + "logps/rejected": -146.97825622558594, + "loss": 0.7993, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.691172480583191, + "rewards/margins": 0.508095920085907, + "rewards/rejected": -2.199268341064453, + "step": 6473 + }, + { + "epoch": 0.75, + "learning_rate": 7.725623317335831e-08, + "logits/chosen": -1.960783839225769, + "logits/rejected": -2.223646879196167, + "logps/chosen": -526.09375, + "logps/rejected": -411.030029296875, + "loss": 0.4106, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1662111282348633, + "rewards/margins": 1.4049971103668213, + "rewards/rejected": -2.5712082386016846, + "step": 6474 + }, + { + "epoch": 0.75, + "learning_rate": 7.722111670373405e-08, + "logits/chosen": -2.6032605171203613, + "logits/rejected": -2.4501490592956543, + "logps/chosen": -185.9112091064453, + "logps/rejected": -217.99652099609375, + "loss": 0.8407, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.798976182937622, + "rewards/margins": 1.307187795639038, + "rewards/rejected": -4.10616397857666, + "step": 6475 + }, + { + "epoch": 0.75, + "learning_rate": 7.71860002341098e-08, + "logits/chosen": -2.3290598392486572, + "logits/rejected": -2.2024855613708496, + "logps/chosen": -149.993896484375, + "logps/rejected": -251.70176696777344, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7632923126220703, + "rewards/margins": 1.6520280838012695, + "rewards/rejected": -2.41532039642334, + "step": 6476 + }, + { + "epoch": 0.75, + "learning_rate": 7.715088376448554e-08, + "logits/chosen": -2.469768762588501, + "logits/rejected": -2.2430741786956787, + "logps/chosen": -158.18600463867188, + "logps/rejected": -317.341064453125, + "loss": 0.1712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26854658126831055, + "rewards/margins": 4.583853721618652, + "rewards/rejected": -4.852400302886963, + "step": 6477 + }, + { + "epoch": 0.75, + "learning_rate": 7.711576729486129e-08, + "logits/chosen": -2.0767250061035156, + "logits/rejected": -2.21982741355896, + "logps/chosen": -415.956298828125, + "logps/rejected": -403.03631591796875, + "loss": 0.2211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6734244227409363, + "rewards/margins": 3.778588056564331, + "rewards/rejected": -4.452012538909912, + "step": 6478 + }, + { + "epoch": 0.75, + "learning_rate": 7.708065082523703e-08, + "logits/chosen": -2.5502030849456787, + "logits/rejected": -2.3850131034851074, + "logps/chosen": -222.0767059326172, + "logps/rejected": -311.1519470214844, + "loss": 0.6756, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2007946968078613, + "rewards/margins": 1.7595317363739014, + "rewards/rejected": -2.9603264331817627, + "step": 6479 + }, + { + "epoch": 0.75, + "learning_rate": 7.704553435561278e-08, + "logits/chosen": -2.0229430198669434, + "logits/rejected": -1.980771541595459, + "logps/chosen": -286.2564697265625, + "logps/rejected": -248.24533081054688, + "loss": 0.5432, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0094361305236816, + "rewards/margins": 1.519949197769165, + "rewards/rejected": -3.5293850898742676, + "step": 6480 + }, + { + "epoch": 0.75, + "learning_rate": 7.701041788598853e-08, + "logits/chosen": -1.850069522857666, + "logits/rejected": -1.7068840265274048, + "logps/chosen": -249.18072509765625, + "logps/rejected": -312.6878662109375, + "loss": 1.067, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3956055641174316, + "rewards/margins": 0.3271951377391815, + "rewards/rejected": -1.7228004932403564, + "step": 6481 + }, + { + "epoch": 0.75, + "learning_rate": 7.697530141636427e-08, + "logits/chosen": -2.1871349811553955, + "logits/rejected": -2.2871479988098145, + "logps/chosen": -238.86981201171875, + "logps/rejected": -214.60415649414062, + "loss": 0.3232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.74144446849823, + "rewards/margins": 2.7162628173828125, + "rewards/rejected": -3.457707166671753, + "step": 6482 + }, + { + "epoch": 0.75, + "learning_rate": 7.694018494674002e-08, + "logits/chosen": -2.129310369491577, + "logits/rejected": -1.9144517183303833, + "logps/chosen": -154.54473876953125, + "logps/rejected": -208.02503967285156, + "loss": 0.3, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5302218198776245, + "rewards/margins": 2.3674416542053223, + "rewards/rejected": -2.8976633548736572, + "step": 6483 + }, + { + "epoch": 0.75, + "learning_rate": 7.690506847711576e-08, + "logits/chosen": -1.739438533782959, + "logits/rejected": -1.9052987098693848, + "logps/chosen": -193.02511596679688, + "logps/rejected": -176.11343383789062, + "loss": 0.7002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1895110607147217, + "rewards/margins": 1.047856092453003, + "rewards/rejected": -2.2373671531677246, + "step": 6484 + }, + { + "epoch": 0.75, + "learning_rate": 7.68699520074915e-08, + "logits/chosen": -1.9042754173278809, + "logits/rejected": -1.9840686321258545, + "logps/chosen": -308.4671630859375, + "logps/rejected": -214.16470336914062, + "loss": 0.4172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9079703092575073, + "rewards/margins": 1.8565707206726074, + "rewards/rejected": -2.764540910720825, + "step": 6485 + }, + { + "epoch": 0.75, + "learning_rate": 7.683483553786725e-08, + "logits/chosen": -2.3319125175476074, + "logits/rejected": -2.474045753479004, + "logps/chosen": -247.04884338378906, + "logps/rejected": -224.63404846191406, + "loss": 0.3073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30757901072502136, + "rewards/margins": 2.0029287338256836, + "rewards/rejected": -2.3105077743530273, + "step": 6486 + }, + { + "epoch": 0.75, + "learning_rate": 7.679971906824301e-08, + "logits/chosen": -1.8910465240478516, + "logits/rejected": -2.3559327125549316, + "logps/chosen": -498.9220275878906, + "logps/rejected": -352.7589416503906, + "loss": 0.1684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4249948561191559, + "rewards/margins": 3.207672119140625, + "rewards/rejected": -3.632667303085327, + "step": 6487 + }, + { + "epoch": 0.75, + "learning_rate": 7.676460259861875e-08, + "logits/chosen": -2.5066328048706055, + "logits/rejected": -2.6416282653808594, + "logps/chosen": -245.66726684570312, + "logps/rejected": -188.88156127929688, + "loss": 0.1888, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17780187726020813, + "rewards/margins": 2.5033740997314453, + "rewards/rejected": -2.681175947189331, + "step": 6488 + }, + { + "epoch": 0.75, + "learning_rate": 7.672948612899449e-08, + "logits/chosen": -1.6922242641448975, + "logits/rejected": -1.9426212310791016, + "logps/chosen": -336.20367431640625, + "logps/rejected": -206.18521118164062, + "loss": 1.066, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7395671606063843, + "rewards/margins": 0.42822396755218506, + "rewards/rejected": -2.1677911281585693, + "step": 6489 + }, + { + "epoch": 0.75, + "learning_rate": 7.669436965937023e-08, + "logits/chosen": -2.6009817123413086, + "logits/rejected": -2.8170647621154785, + "logps/chosen": -177.3418426513672, + "logps/rejected": -127.65711975097656, + "loss": 0.1894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23389926552772522, + "rewards/margins": 2.6672275066375732, + "rewards/rejected": -2.9011266231536865, + "step": 6490 + }, + { + "epoch": 0.75, + "learning_rate": 7.6659253189746e-08, + "logits/chosen": -1.7711868286132812, + "logits/rejected": -1.8260753154754639, + "logps/chosen": -416.677001953125, + "logps/rejected": -289.44049072265625, + "loss": 0.3117, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4051016569137573, + "rewards/margins": 1.6917157173156738, + "rewards/rejected": -3.0968172550201416, + "step": 6491 + }, + { + "epoch": 0.75, + "learning_rate": 7.662413672012174e-08, + "logits/chosen": -1.9583958387374878, + "logits/rejected": -1.6283719539642334, + "logps/chosen": -248.11241149902344, + "logps/rejected": -368.11383056640625, + "loss": 0.8842, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1776360273361206, + "rewards/margins": 1.2929913997650146, + "rewards/rejected": -2.470627784729004, + "step": 6492 + }, + { + "epoch": 0.75, + "learning_rate": 7.658902025049748e-08, + "logits/chosen": -2.0132639408111572, + "logits/rejected": -1.9503183364868164, + "logps/chosen": -301.8848876953125, + "logps/rejected": -405.71856689453125, + "loss": 0.5138, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0242477655410767, + "rewards/margins": 2.196105718612671, + "rewards/rejected": -3.220353603363037, + "step": 6493 + }, + { + "epoch": 0.75, + "learning_rate": 7.655390378087322e-08, + "logits/chosen": -2.267557144165039, + "logits/rejected": -2.5001509189605713, + "logps/chosen": -292.73565673828125, + "logps/rejected": -259.6685791015625, + "loss": 0.2271, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.929923415184021, + "rewards/margins": 2.1006388664245605, + "rewards/rejected": -3.030562400817871, + "step": 6494 + }, + { + "epoch": 0.75, + "learning_rate": 7.651878731124897e-08, + "logits/chosen": -2.268791437149048, + "logits/rejected": -1.494250774383545, + "logps/chosen": -173.14205932617188, + "logps/rejected": -336.1485595703125, + "loss": 0.292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5809130072593689, + "rewards/margins": 1.7123229503631592, + "rewards/rejected": -2.293236017227173, + "step": 6495 + }, + { + "epoch": 0.75, + "learning_rate": 7.648367084162471e-08, + "logits/chosen": -2.2383058071136475, + "logits/rejected": -2.4391872882843018, + "logps/chosen": -237.95669555664062, + "logps/rejected": -191.58140563964844, + "loss": 0.6903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.847679615020752, + "rewards/margins": 1.319159746170044, + "rewards/rejected": -2.166839122772217, + "step": 6496 + }, + { + "epoch": 0.75, + "learning_rate": 7.644855437200047e-08, + "logits/chosen": -2.045574903488159, + "logits/rejected": -2.082995891571045, + "logps/chosen": -383.91046142578125, + "logps/rejected": -317.5429992675781, + "loss": 0.2542, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4630291163921356, + "rewards/margins": 2.2668778896331787, + "rewards/rejected": -2.7299070358276367, + "step": 6497 + }, + { + "epoch": 0.75, + "learning_rate": 7.641343790237621e-08, + "logits/chosen": -2.501556634902954, + "logits/rejected": -2.341615915298462, + "logps/chosen": -257.84442138671875, + "logps/rejected": -269.99169921875, + "loss": 0.6355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8454974889755249, + "rewards/margins": 2.2847256660461426, + "rewards/rejected": -3.130223035812378, + "step": 6498 + }, + { + "epoch": 0.75, + "learning_rate": 7.637832143275196e-08, + "logits/chosen": -2.3816046714782715, + "logits/rejected": -2.6761558055877686, + "logps/chosen": -226.40771484375, + "logps/rejected": -214.3669891357422, + "loss": 0.4775, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4329633712768555, + "rewards/margins": 1.205554723739624, + "rewards/rejected": -2.6385180950164795, + "step": 6499 + }, + { + "epoch": 0.75, + "learning_rate": 7.63432049631277e-08, + "logits/chosen": -2.2832648754119873, + "logits/rejected": -2.303704261779785, + "logps/chosen": -214.30006408691406, + "logps/rejected": -191.6842041015625, + "loss": 0.6969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6004682779312134, + "rewards/margins": 1.2417933940887451, + "rewards/rejected": -1.842261791229248, + "step": 6500 + }, + { + "epoch": 0.75, + "learning_rate": 7.630808849350344e-08, + "logits/chosen": -2.005276918411255, + "logits/rejected": -1.94741690158844, + "logps/chosen": -428.4644775390625, + "logps/rejected": -335.8933410644531, + "loss": 0.6511, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0604852437973022, + "rewards/margins": 1.5340880155563354, + "rewards/rejected": -2.5945730209350586, + "step": 6501 + }, + { + "epoch": 0.75, + "learning_rate": 7.627297202387918e-08, + "logits/chosen": -2.9867658615112305, + "logits/rejected": -2.942768096923828, + "logps/chosen": -464.26300048828125, + "logps/rejected": -298.5577087402344, + "loss": 0.2551, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49452534317970276, + "rewards/margins": 2.887249231338501, + "rewards/rejected": -3.381774425506592, + "step": 6502 + }, + { + "epoch": 0.75, + "learning_rate": 7.623785555425495e-08, + "logits/chosen": -2.064805507659912, + "logits/rejected": -2.069572925567627, + "logps/chosen": -341.5733947753906, + "logps/rejected": -291.64739990234375, + "loss": 0.5591, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7619124054908752, + "rewards/margins": 1.0303221940994263, + "rewards/rejected": -1.7922346591949463, + "step": 6503 + }, + { + "epoch": 0.75, + "learning_rate": 7.620273908463069e-08, + "logits/chosen": -2.709573984146118, + "logits/rejected": -2.6088316440582275, + "logps/chosen": -173.1109619140625, + "logps/rejected": -208.47991943359375, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3686271905899048, + "rewards/margins": 2.7250561714172363, + "rewards/rejected": -3.0936834812164307, + "step": 6504 + }, + { + "epoch": 0.75, + "learning_rate": 7.616762261500643e-08, + "logits/chosen": -2.211313247680664, + "logits/rejected": -2.196807861328125, + "logps/chosen": -435.70123291015625, + "logps/rejected": -274.5632019042969, + "loss": 0.648, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2147281169891357, + "rewards/margins": 1.5201786756515503, + "rewards/rejected": -2.7349066734313965, + "step": 6505 + }, + { + "epoch": 0.75, + "learning_rate": 7.613250614538217e-08, + "logits/chosen": -2.051121950149536, + "logits/rejected": -2.0336287021636963, + "logps/chosen": -410.99505615234375, + "logps/rejected": -250.73709106445312, + "loss": 0.6339, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1950275897979736, + "rewards/margins": 2.0298655033111572, + "rewards/rejected": -3.224893093109131, + "step": 6506 + }, + { + "epoch": 0.75, + "learning_rate": 7.609738967575794e-08, + "logits/chosen": -2.279683828353882, + "logits/rejected": -2.5178983211517334, + "logps/chosen": -519.961181640625, + "logps/rejected": -287.3538818359375, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4544575810432434, + "rewards/margins": 2.5642402172088623, + "rewards/rejected": -3.01869797706604, + "step": 6507 + }, + { + "epoch": 0.75, + "learning_rate": 7.606227320613368e-08, + "logits/chosen": -2.2389440536499023, + "logits/rejected": -2.3212814331054688, + "logps/chosen": -331.82379150390625, + "logps/rejected": -234.5792694091797, + "loss": 0.6861, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9200719594955444, + "rewards/margins": 0.8365552425384521, + "rewards/rejected": -1.756627082824707, + "step": 6508 + }, + { + "epoch": 0.75, + "learning_rate": 7.602715673650942e-08, + "logits/chosen": -1.8031575679779053, + "logits/rejected": -2.0056111812591553, + "logps/chosen": -264.5646667480469, + "logps/rejected": -187.95199584960938, + "loss": 0.5223, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3304610550403595, + "rewards/margins": 1.1296597719192505, + "rewards/rejected": -1.4601209163665771, + "step": 6509 + }, + { + "epoch": 0.75, + "learning_rate": 7.599204026688516e-08, + "logits/chosen": -1.9961929321289062, + "logits/rejected": -2.2890443801879883, + "logps/chosen": -405.5023498535156, + "logps/rejected": -258.30230712890625, + "loss": 0.3033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0799020528793335, + "rewards/margins": 1.8078974485397339, + "rewards/rejected": -1.8877995014190674, + "step": 6510 + }, + { + "epoch": 0.75, + "learning_rate": 7.595692379726092e-08, + "logits/chosen": -1.8363971710205078, + "logits/rejected": -1.9028605222702026, + "logps/chosen": -233.09954833984375, + "logps/rejected": -227.43063354492188, + "loss": 0.65, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7573280334472656, + "rewards/margins": 0.7344821691513062, + "rewards/rejected": -1.4918102025985718, + "step": 6511 + }, + { + "epoch": 0.75, + "learning_rate": 7.592180732763666e-08, + "logits/chosen": -2.5111567974090576, + "logits/rejected": -2.415351390838623, + "logps/chosen": -153.11068725585938, + "logps/rejected": -235.86837768554688, + "loss": 0.3681, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3229566812515259, + "rewards/margins": 2.685537815093994, + "rewards/rejected": -4.0084943771362305, + "step": 6512 + }, + { + "epoch": 0.75, + "learning_rate": 7.58866908580124e-08, + "logits/chosen": -2.5124893188476562, + "logits/rejected": -2.412489652633667, + "logps/chosen": -344.7152404785156, + "logps/rejected": -323.8879089355469, + "loss": 0.2923, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0501716136932373, + "rewards/margins": 2.8354883193969727, + "rewards/rejected": -3.885659694671631, + "step": 6513 + }, + { + "epoch": 0.75, + "learning_rate": 7.585157438838815e-08, + "logits/chosen": -2.0317864418029785, + "logits/rejected": -1.6914316415786743, + "logps/chosen": -227.73728942871094, + "logps/rejected": -343.60302734375, + "loss": 0.3372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4235062599182129, + "rewards/margins": 1.795004963874817, + "rewards/rejected": -2.2185111045837402, + "step": 6514 + }, + { + "epoch": 0.75, + "learning_rate": 7.58164579187639e-08, + "logits/chosen": -2.3783464431762695, + "logits/rejected": -2.0359983444213867, + "logps/chosen": -178.5748748779297, + "logps/rejected": -249.02517700195312, + "loss": 0.4877, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1635771989822388, + "rewards/margins": 1.2228915691375732, + "rewards/rejected": -2.3864688873291016, + "step": 6515 + }, + { + "epoch": 0.75, + "learning_rate": 7.578134144913965e-08, + "logits/chosen": -2.103872537612915, + "logits/rejected": -2.3991751670837402, + "logps/chosen": -444.3415832519531, + "logps/rejected": -281.360107421875, + "loss": 0.2768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2542705535888672, + "rewards/margins": 2.952610492706299, + "rewards/rejected": -3.206881046295166, + "step": 6516 + }, + { + "epoch": 0.75, + "learning_rate": 7.574622497951539e-08, + "logits/chosen": -2.367840528488159, + "logits/rejected": -2.3791844844818115, + "logps/chosen": -235.33074951171875, + "logps/rejected": -232.7765350341797, + "loss": 0.3635, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0908557176589966, + "rewards/margins": 2.5173451900482178, + "rewards/rejected": -3.608200788497925, + "step": 6517 + }, + { + "epoch": 0.75, + "learning_rate": 7.571110850989113e-08, + "logits/chosen": -2.391164779663086, + "logits/rejected": -2.6222333908081055, + "logps/chosen": -267.7840881347656, + "logps/rejected": -182.9072723388672, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7840769290924072, + "rewards/margins": 1.0841069221496582, + "rewards/rejected": -1.8681838512420654, + "step": 6518 + }, + { + "epoch": 0.75, + "learning_rate": 7.56759920402669e-08, + "logits/chosen": -2.464383602142334, + "logits/rejected": -2.6007137298583984, + "logps/chosen": -199.37884521484375, + "logps/rejected": -241.95547485351562, + "loss": 0.4396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5202879309654236, + "rewards/margins": 1.5222256183624268, + "rewards/rejected": -2.042513608932495, + "step": 6519 + }, + { + "epoch": 0.75, + "learning_rate": 7.564087557064263e-08, + "logits/chosen": -2.185819387435913, + "logits/rejected": -2.348371982574463, + "logps/chosen": -299.1833190917969, + "logps/rejected": -521.4767456054688, + "loss": 0.35, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8447004556655884, + "rewards/margins": 3.5025737285614014, + "rewards/rejected": -4.347273826599121, + "step": 6520 + }, + { + "epoch": 0.75, + "learning_rate": 7.560575910101837e-08, + "logits/chosen": -2.9716358184814453, + "logits/rejected": -2.920715093612671, + "logps/chosen": -423.7923278808594, + "logps/rejected": -299.03717041015625, + "loss": 0.2424, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3835979700088501, + "rewards/margins": 2.9728941917419434, + "rewards/rejected": -3.356492042541504, + "step": 6521 + }, + { + "epoch": 0.75, + "learning_rate": 7.557064263139412e-08, + "logits/chosen": -2.1132888793945312, + "logits/rejected": -2.0375471115112305, + "logps/chosen": -208.23886108398438, + "logps/rejected": -268.71337890625, + "loss": 0.3183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3438490629196167, + "rewards/margins": 2.910587787628174, + "rewards/rejected": -3.25443696975708, + "step": 6522 + }, + { + "epoch": 0.75, + "learning_rate": 7.553552616176987e-08, + "logits/chosen": -2.1164724826812744, + "logits/rejected": -2.1015710830688477, + "logps/chosen": -233.88063049316406, + "logps/rejected": -271.7886962890625, + "loss": 0.2604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7895898818969727, + "rewards/margins": 2.6725893020629883, + "rewards/rejected": -3.46217942237854, + "step": 6523 + }, + { + "epoch": 0.75, + "learning_rate": 7.550040969214562e-08, + "logits/chosen": -2.3494067192077637, + "logits/rejected": -2.3935399055480957, + "logps/chosen": -350.94854736328125, + "logps/rejected": -245.5450897216797, + "loss": 0.4034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44041019678115845, + "rewards/margins": 1.6430797576904297, + "rewards/rejected": -2.0834898948669434, + "step": 6524 + }, + { + "epoch": 0.75, + "learning_rate": 7.546529322252136e-08, + "logits/chosen": -2.4237725734710693, + "logits/rejected": -2.4851176738739014, + "logps/chosen": -245.5638427734375, + "logps/rejected": -209.06771850585938, + "loss": 0.3296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6310980916023254, + "rewards/margins": 2.7168939113616943, + "rewards/rejected": -3.347992181777954, + "step": 6525 + }, + { + "epoch": 0.75, + "learning_rate": 7.54301767528971e-08, + "logits/chosen": -2.8471598625183105, + "logits/rejected": -2.8922555446624756, + "logps/chosen": -162.2968292236328, + "logps/rejected": -201.74087524414062, + "loss": 0.3825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6267181634902954, + "rewards/margins": 2.235734462738037, + "rewards/rejected": -2.862452507019043, + "step": 6526 + }, + { + "epoch": 0.75, + "learning_rate": 7.539506028327285e-08, + "logits/chosen": -2.8730292320251465, + "logits/rejected": -2.5550527572631836, + "logps/chosen": -233.63046264648438, + "logps/rejected": -357.41748046875, + "loss": 0.4766, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5150270462036133, + "rewards/margins": 1.1016277074813843, + "rewards/rejected": -2.616654872894287, + "step": 6527 + }, + { + "epoch": 0.75, + "learning_rate": 7.53599438136486e-08, + "logits/chosen": -2.3865556716918945, + "logits/rejected": -2.5275461673736572, + "logps/chosen": -345.87255859375, + "logps/rejected": -371.1375427246094, + "loss": 1.2884, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9495620727539062, + "rewards/margins": 1.434854507446289, + "rewards/rejected": -4.384416580200195, + "step": 6528 + }, + { + "epoch": 0.75, + "learning_rate": 7.532482734402434e-08, + "logits/chosen": -2.1130130290985107, + "logits/rejected": -1.9513466358184814, + "logps/chosen": -227.86856079101562, + "logps/rejected": -283.8628234863281, + "loss": 0.3194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7803319692611694, + "rewards/margins": 2.5000288486480713, + "rewards/rejected": -3.2803609371185303, + "step": 6529 + }, + { + "epoch": 0.75, + "learning_rate": 7.528971087440008e-08, + "logits/chosen": -2.4571988582611084, + "logits/rejected": -2.5181427001953125, + "logps/chosen": -130.8956298828125, + "logps/rejected": -308.6048889160156, + "loss": 0.8704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7510507702827454, + "rewards/margins": 1.8523708581924438, + "rewards/rejected": -2.603421688079834, + "step": 6530 + }, + { + "epoch": 0.75, + "learning_rate": 7.525459440477583e-08, + "logits/chosen": -2.451343059539795, + "logits/rejected": -2.508185863494873, + "logps/chosen": -261.58966064453125, + "logps/rejected": -254.5670623779297, + "loss": 0.2495, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08154930174350739, + "rewards/margins": 2.0503978729248047, + "rewards/rejected": -1.9688483476638794, + "step": 6531 + }, + { + "epoch": 0.75, + "learning_rate": 7.521947793515159e-08, + "logits/chosen": -2.5904693603515625, + "logits/rejected": -2.6953654289245605, + "logps/chosen": -304.54266357421875, + "logps/rejected": -305.7421875, + "loss": 0.2551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33403921127319336, + "rewards/margins": 2.014207363128662, + "rewards/rejected": -2.3482465744018555, + "step": 6532 + }, + { + "epoch": 0.75, + "learning_rate": 7.518436146552733e-08, + "logits/chosen": -2.675515651702881, + "logits/rejected": -2.4208860397338867, + "logps/chosen": -368.9052734375, + "logps/rejected": -381.0400085449219, + "loss": 0.8407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.852917194366455, + "rewards/margins": 1.8000595569610596, + "rewards/rejected": -3.6529767513275146, + "step": 6533 + }, + { + "epoch": 0.75, + "learning_rate": 7.514924499590307e-08, + "logits/chosen": -2.325376510620117, + "logits/rejected": -2.142184257507324, + "logps/chosen": -248.01797485351562, + "logps/rejected": -373.1733703613281, + "loss": 0.3891, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03692248463630676, + "rewards/margins": 2.621149778366089, + "rewards/rejected": -2.5842273235321045, + "step": 6534 + }, + { + "epoch": 0.75, + "learning_rate": 7.511412852627881e-08, + "logits/chosen": -2.10790753364563, + "logits/rejected": -2.431419849395752, + "logps/chosen": -380.67620849609375, + "logps/rejected": -302.0724182128906, + "loss": 0.6, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8414757251739502, + "rewards/margins": 1.406511902809143, + "rewards/rejected": -3.2479875087738037, + "step": 6535 + }, + { + "epoch": 0.75, + "learning_rate": 7.507901205665458e-08, + "logits/chosen": -2.257270336151123, + "logits/rejected": -2.576538324356079, + "logps/chosen": -212.97474670410156, + "logps/rejected": -215.43321228027344, + "loss": 0.6653, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.021593689918518, + "rewards/margins": 1.9753073453903198, + "rewards/rejected": -2.9969005584716797, + "step": 6536 + }, + { + "epoch": 0.75, + "learning_rate": 7.504389558703032e-08, + "logits/chosen": -2.4870450496673584, + "logits/rejected": -2.4572136402130127, + "logps/chosen": -300.9888916015625, + "logps/rejected": -231.99237060546875, + "loss": 0.5142, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.292267084121704, + "rewards/margins": 1.6817960739135742, + "rewards/rejected": -2.9740633964538574, + "step": 6537 + }, + { + "epoch": 0.75, + "learning_rate": 7.500877911740606e-08, + "logits/chosen": -2.4689488410949707, + "logits/rejected": -2.3120083808898926, + "logps/chosen": -163.42176818847656, + "logps/rejected": -255.65383911132812, + "loss": 0.4618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9387460350990295, + "rewards/margins": 1.5746393203735352, + "rewards/rejected": -2.51338529586792, + "step": 6538 + }, + { + "epoch": 0.75, + "learning_rate": 7.497366264778181e-08, + "logits/chosen": -2.371561050415039, + "logits/rejected": -2.3300094604492188, + "logps/chosen": -227.23574829101562, + "logps/rejected": -215.56820678710938, + "loss": 0.1636, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3543737530708313, + "rewards/margins": 2.5975265502929688, + "rewards/rejected": -2.9519004821777344, + "step": 6539 + }, + { + "epoch": 0.75, + "learning_rate": 7.493854617815755e-08, + "logits/chosen": -2.7664542198181152, + "logits/rejected": -2.6186680793762207, + "logps/chosen": -265.27862548828125, + "logps/rejected": -225.8507080078125, + "loss": 0.7724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8945990800857544, + "rewards/margins": 1.1368459463119507, + "rewards/rejected": -3.031445026397705, + "step": 6540 + }, + { + "epoch": 0.75, + "learning_rate": 7.49034297085333e-08, + "logits/chosen": -2.4153647422790527, + "logits/rejected": -2.4168612957000732, + "logps/chosen": -202.83535766601562, + "logps/rejected": -212.5263671875, + "loss": 0.5008, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0059704780578613, + "rewards/margins": 1.8065561056137085, + "rewards/rejected": -2.8125267028808594, + "step": 6541 + }, + { + "epoch": 0.75, + "learning_rate": 7.486831323890905e-08, + "logits/chosen": -2.9329519271850586, + "logits/rejected": -2.72926664352417, + "logps/chosen": -185.321533203125, + "logps/rejected": -189.74676513671875, + "loss": 0.1718, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7872130870819092, + "rewards/margins": 3.2370212078094482, + "rewards/rejected": -4.024234294891357, + "step": 6542 + }, + { + "epoch": 0.75, + "learning_rate": 7.483319676928479e-08, + "logits/chosen": -1.9831657409667969, + "logits/rejected": -1.9184350967407227, + "logps/chosen": -361.3648376464844, + "logps/rejected": -250.9932861328125, + "loss": 0.9632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8539090752601624, + "rewards/margins": 1.1721525192260742, + "rewards/rejected": -2.026061534881592, + "step": 6543 + }, + { + "epoch": 0.75, + "learning_rate": 7.479808029966053e-08, + "logits/chosen": -2.154444932937622, + "logits/rejected": -2.1258625984191895, + "logps/chosen": -216.89759826660156, + "logps/rejected": -194.31024169921875, + "loss": 0.4222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9849873185157776, + "rewards/margins": 1.9286086559295654, + "rewards/rejected": -2.9135961532592773, + "step": 6544 + }, + { + "epoch": 0.75, + "learning_rate": 7.476296383003628e-08, + "logits/chosen": -2.008776903152466, + "logits/rejected": -2.0637404918670654, + "logps/chosen": -312.03253173828125, + "logps/rejected": -370.0095520019531, + "loss": 0.336, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0047041177749634, + "rewards/margins": 2.989426374435425, + "rewards/rejected": -3.9941306114196777, + "step": 6545 + }, + { + "epoch": 0.75, + "learning_rate": 7.472784736041202e-08, + "logits/chosen": -2.572160243988037, + "logits/rejected": -2.653804063796997, + "logps/chosen": -220.6324005126953, + "logps/rejected": -361.3514404296875, + "loss": 0.765, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0395500659942627, + "rewards/margins": 1.799611210823059, + "rewards/rejected": -2.8391611576080322, + "step": 6546 + }, + { + "epoch": 0.75, + "learning_rate": 7.469273089078778e-08, + "logits/chosen": -2.842376708984375, + "logits/rejected": -2.879152774810791, + "logps/chosen": -126.06982421875, + "logps/rejected": -188.45046997070312, + "loss": 0.5795, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3957488536834717, + "rewards/margins": 1.2772650718688965, + "rewards/rejected": -2.673013687133789, + "step": 6547 + }, + { + "epoch": 0.75, + "learning_rate": 7.465761442116352e-08, + "logits/chosen": -1.8646867275238037, + "logits/rejected": -1.9415132999420166, + "logps/chosen": -183.50521850585938, + "logps/rejected": -184.82643127441406, + "loss": 0.387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9253400564193726, + "rewards/margins": 1.9354753494262695, + "rewards/rejected": -2.8608155250549316, + "step": 6548 + }, + { + "epoch": 0.75, + "learning_rate": 7.462249795153927e-08, + "logits/chosen": -2.358790159225464, + "logits/rejected": -2.0125820636749268, + "logps/chosen": -267.31195068359375, + "logps/rejected": -308.25762939453125, + "loss": 0.2801, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0028194189071655, + "rewards/margins": 1.5927910804748535, + "rewards/rejected": -2.5956103801727295, + "step": 6549 + }, + { + "epoch": 0.76, + "learning_rate": 7.458738148191501e-08, + "logits/chosen": -1.8952430486679077, + "logits/rejected": -1.6923158168792725, + "logps/chosen": -297.2174072265625, + "logps/rejected": -361.2698059082031, + "loss": 0.1426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6330310106277466, + "rewards/margins": 3.020524024963379, + "rewards/rejected": -3.653554916381836, + "step": 6550 + }, + { + "epoch": 0.76, + "learning_rate": 7.455226501229077e-08, + "logits/chosen": -2.5216784477233887, + "logits/rejected": -2.416544198989868, + "logps/chosen": -275.9848937988281, + "logps/rejected": -299.71026611328125, + "loss": 0.5614, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6461527347564697, + "rewards/margins": 0.5863826274871826, + "rewards/rejected": -1.2325353622436523, + "step": 6551 + }, + { + "epoch": 0.76, + "learning_rate": 7.45171485426665e-08, + "logits/chosen": -1.8184508085250854, + "logits/rejected": -2.041632890701294, + "logps/chosen": -331.14892578125, + "logps/rejected": -369.91851806640625, + "loss": 0.2374, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0888283252716064, + "rewards/margins": 2.6773321628570557, + "rewards/rejected": -3.766160488128662, + "step": 6552 + }, + { + "epoch": 0.76, + "learning_rate": 7.448203207304226e-08, + "logits/chosen": -2.263753652572632, + "logits/rejected": -2.3863837718963623, + "logps/chosen": -478.02496337890625, + "logps/rejected": -342.0985412597656, + "loss": 0.2564, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0690285786986351, + "rewards/margins": 2.699859857559204, + "rewards/rejected": -2.768888473510742, + "step": 6553 + }, + { + "epoch": 0.76, + "learning_rate": 7.4446915603418e-08, + "logits/chosen": -1.9955990314483643, + "logits/rejected": -2.055795669555664, + "logps/chosen": -225.96987915039062, + "logps/rejected": -263.33721923828125, + "loss": 0.2331, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.243265986442566, + "rewards/margins": 3.0684075355529785, + "rewards/rejected": -4.311673164367676, + "step": 6554 + }, + { + "epoch": 0.76, + "learning_rate": 7.441179913379375e-08, + "logits/chosen": -2.5525379180908203, + "logits/rejected": -2.7120401859283447, + "logps/chosen": -231.28079223632812, + "logps/rejected": -257.22021484375, + "loss": 0.2983, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42234623432159424, + "rewards/margins": 1.4651424884796143, + "rewards/rejected": -1.8874887228012085, + "step": 6555 + }, + { + "epoch": 0.76, + "learning_rate": 7.43766826641695e-08, + "logits/chosen": -2.211019992828369, + "logits/rejected": -2.231252908706665, + "logps/chosen": -186.75820922851562, + "logps/rejected": -278.146728515625, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.00223708152771, + "rewards/margins": 1.482954740524292, + "rewards/rejected": -2.485191822052002, + "step": 6556 + }, + { + "epoch": 0.76, + "learning_rate": 7.434156619454524e-08, + "logits/chosen": -2.65130877494812, + "logits/rejected": -2.5271048545837402, + "logps/chosen": -287.383544921875, + "logps/rejected": -356.7437744140625, + "loss": 0.3192, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1636954545974731, + "rewards/margins": 1.7053905725479126, + "rewards/rejected": -2.8690860271453857, + "step": 6557 + }, + { + "epoch": 0.76, + "learning_rate": 7.430644972492099e-08, + "logits/chosen": -2.0250144004821777, + "logits/rejected": -2.162489891052246, + "logps/chosen": -286.7723693847656, + "logps/rejected": -272.2680358886719, + "loss": 0.3532, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.543433666229248, + "rewards/margins": 1.9273301362991333, + "rewards/rejected": -2.470763683319092, + "step": 6558 + }, + { + "epoch": 0.76, + "learning_rate": 7.427133325529673e-08, + "logits/chosen": -2.372511386871338, + "logits/rejected": -2.4671077728271484, + "logps/chosen": -314.7593994140625, + "logps/rejected": -258.40234375, + "loss": 0.3977, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.073610782623291, + "rewards/margins": 1.463909387588501, + "rewards/rejected": -2.537520408630371, + "step": 6559 + }, + { + "epoch": 0.76, + "learning_rate": 7.423621678567247e-08, + "logits/chosen": -2.3915603160858154, + "logits/rejected": -2.524447202682495, + "logps/chosen": -207.2569580078125, + "logps/rejected": -167.04458618164062, + "loss": 0.602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9495978355407715, + "rewards/margins": 1.5112944841384888, + "rewards/rejected": -2.4608922004699707, + "step": 6560 + }, + { + "epoch": 0.76, + "learning_rate": 7.420110031604822e-08, + "logits/chosen": -2.041560649871826, + "logits/rejected": -1.9854917526245117, + "logps/chosen": -312.84906005859375, + "logps/rejected": -256.93896484375, + "loss": 0.7648, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.545680284500122, + "rewards/margins": 1.4228872060775757, + "rewards/rejected": -2.968567371368408, + "step": 6561 + }, + { + "epoch": 0.76, + "learning_rate": 7.416598384642397e-08, + "logits/chosen": -2.7140684127807617, + "logits/rejected": -2.456681251525879, + "logps/chosen": -121.26030731201172, + "logps/rejected": -196.5677490234375, + "loss": 0.4487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5589035153388977, + "rewards/margins": 1.5082623958587646, + "rewards/rejected": -2.0671658515930176, + "step": 6562 + }, + { + "epoch": 0.76, + "learning_rate": 7.413086737679972e-08, + "logits/chosen": -2.634596824645996, + "logits/rejected": -2.3143391609191895, + "logps/chosen": -355.01934814453125, + "logps/rejected": -335.652099609375, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2961922883987427, + "rewards/margins": 1.9363828897476196, + "rewards/rejected": -3.2325754165649414, + "step": 6563 + }, + { + "epoch": 0.76, + "learning_rate": 7.409575090717546e-08, + "logits/chosen": -2.4834771156311035, + "logits/rejected": -2.354696750640869, + "logps/chosen": -114.84613800048828, + "logps/rejected": -208.1353759765625, + "loss": 0.3837, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8268216848373413, + "rewards/margins": 2.337830066680908, + "rewards/rejected": -3.16465163230896, + "step": 6564 + }, + { + "epoch": 0.76, + "learning_rate": 7.406063443755121e-08, + "logits/chosen": -2.6374833583831787, + "logits/rejected": -2.5853912830352783, + "logps/chosen": -257.8128356933594, + "logps/rejected": -269.9455261230469, + "loss": 0.3664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49639269709587097, + "rewards/margins": 3.126317024230957, + "rewards/rejected": -3.6227097511291504, + "step": 6565 + }, + { + "epoch": 0.76, + "learning_rate": 7.402551796792695e-08, + "logits/chosen": -2.8005611896514893, + "logits/rejected": -2.5086755752563477, + "logps/chosen": -239.63534545898438, + "logps/rejected": -344.18731689453125, + "loss": 0.4128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7744991183280945, + "rewards/margins": 2.213545322418213, + "rewards/rejected": -2.988044261932373, + "step": 6566 + }, + { + "epoch": 0.76, + "learning_rate": 7.399040149830271e-08, + "logits/chosen": -2.7895588874816895, + "logits/rejected": -2.7739291191101074, + "logps/chosen": -293.31640625, + "logps/rejected": -192.71632385253906, + "loss": 0.2688, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7427619099617004, + "rewards/margins": 1.8814003467559814, + "rewards/rejected": -2.624162435531616, + "step": 6567 + }, + { + "epoch": 0.76, + "learning_rate": 7.395528502867845e-08, + "logits/chosen": -2.834423065185547, + "logits/rejected": -2.804849863052368, + "logps/chosen": -214.93899536132812, + "logps/rejected": -228.59039306640625, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8908200860023499, + "rewards/margins": 1.956373691558838, + "rewards/rejected": -2.847193717956543, + "step": 6568 + }, + { + "epoch": 0.76, + "learning_rate": 7.39201685590542e-08, + "logits/chosen": -2.2467141151428223, + "logits/rejected": -2.041076183319092, + "logps/chosen": -283.466552734375, + "logps/rejected": -436.61627197265625, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2436492443084717, + "rewards/margins": 1.08223295211792, + "rewards/rejected": -2.3258824348449707, + "step": 6569 + }, + { + "epoch": 0.76, + "learning_rate": 7.388505208942994e-08, + "logits/chosen": -2.3257620334625244, + "logits/rejected": -2.4218854904174805, + "logps/chosen": -367.0195617675781, + "logps/rejected": -210.86672973632812, + "loss": 0.203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5212609171867371, + "rewards/margins": 3.43440580368042, + "rewards/rejected": -3.95566725730896, + "step": 6570 + }, + { + "epoch": 0.76, + "learning_rate": 7.384993561980568e-08, + "logits/chosen": -2.196176528930664, + "logits/rejected": -2.1333696842193604, + "logps/chosen": -470.4134826660156, + "logps/rejected": -394.4831848144531, + "loss": 0.2064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20945584774017334, + "rewards/margins": 3.0995101928710938, + "rewards/rejected": -3.3089659214019775, + "step": 6571 + }, + { + "epoch": 0.76, + "learning_rate": 7.381481915018144e-08, + "logits/chosen": -1.7130157947540283, + "logits/rejected": -2.0039749145507812, + "logps/chosen": -452.6478576660156, + "logps/rejected": -322.0187683105469, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4567682445049286, + "rewards/margins": 3.051999092102051, + "rewards/rejected": -3.5087671279907227, + "step": 6572 + }, + { + "epoch": 0.76, + "learning_rate": 7.377970268055718e-08, + "logits/chosen": -2.5567288398742676, + "logits/rejected": -2.5184102058410645, + "logps/chosen": -217.3290252685547, + "logps/rejected": -255.51336669921875, + "loss": 0.3386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5566130876541138, + "rewards/margins": 1.7482980489730835, + "rewards/rejected": -2.3049111366271973, + "step": 6573 + }, + { + "epoch": 0.76, + "learning_rate": 7.374458621093292e-08, + "logits/chosen": -1.5472066402435303, + "logits/rejected": -2.191619396209717, + "logps/chosen": -365.0072937011719, + "logps/rejected": -168.0869140625, + "loss": 0.5875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4284160137176514, + "rewards/margins": 1.1593319177627563, + "rewards/rejected": -2.5877480506896973, + "step": 6574 + }, + { + "epoch": 0.76, + "learning_rate": 7.370946974130867e-08, + "logits/chosen": -2.611938238143921, + "logits/rejected": -2.688091516494751, + "logps/chosen": -276.21856689453125, + "logps/rejected": -303.7895202636719, + "loss": 0.5968, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.476509690284729, + "rewards/margins": 1.3675659894943237, + "rewards/rejected": -2.8440756797790527, + "step": 6575 + }, + { + "epoch": 0.76, + "learning_rate": 7.367435327168441e-08, + "logits/chosen": -2.2597904205322266, + "logits/rejected": -2.4996111392974854, + "logps/chosen": -311.42926025390625, + "logps/rejected": -210.2392578125, + "loss": 0.2839, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.505340576171875, + "rewards/margins": 1.4675142765045166, + "rewards/rejected": -1.9728548526763916, + "step": 6576 + }, + { + "epoch": 0.76, + "learning_rate": 7.363923680206015e-08, + "logits/chosen": -2.7835841178894043, + "logits/rejected": -2.629908800125122, + "logps/chosen": -130.85293579101562, + "logps/rejected": -251.51821899414062, + "loss": 0.4515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4980401396751404, + "rewards/margins": 1.8049132823944092, + "rewards/rejected": -2.3029534816741943, + "step": 6577 + }, + { + "epoch": 0.76, + "learning_rate": 7.360412033243591e-08, + "logits/chosen": -2.6325600147247314, + "logits/rejected": -2.564394950866699, + "logps/chosen": -287.59844970703125, + "logps/rejected": -208.68807983398438, + "loss": 1.6042, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.798750400543213, + "rewards/margins": -0.7476745247840881, + "rewards/rejected": -2.0510761737823486, + "step": 6578 + }, + { + "epoch": 0.76, + "learning_rate": 7.356900386281165e-08, + "logits/chosen": -2.437028408050537, + "logits/rejected": -2.459392786026001, + "logps/chosen": -281.8453063964844, + "logps/rejected": -302.4399108886719, + "loss": 0.5611, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.546616792678833, + "rewards/margins": 2.4129655361175537, + "rewards/rejected": -3.9595823287963867, + "step": 6579 + }, + { + "epoch": 0.76, + "learning_rate": 7.35338873931874e-08, + "logits/chosen": -1.9046369791030884, + "logits/rejected": -1.9541404247283936, + "logps/chosen": -419.6246337890625, + "logps/rejected": -254.93795776367188, + "loss": 0.3023, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1876707077026367, + "rewards/margins": 2.4051451683044434, + "rewards/rejected": -3.59281587600708, + "step": 6580 + }, + { + "epoch": 0.76, + "learning_rate": 7.349877092356314e-08, + "logits/chosen": -2.401534080505371, + "logits/rejected": -2.1959035396575928, + "logps/chosen": -168.58084106445312, + "logps/rejected": -289.45782470703125, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3012022972106934, + "rewards/margins": 1.1061756610870361, + "rewards/rejected": -2.4073779582977295, + "step": 6581 + }, + { + "epoch": 0.76, + "learning_rate": 7.34636544539389e-08, + "logits/chosen": -2.6729252338409424, + "logits/rejected": -2.765552282333374, + "logps/chosen": -376.34625244140625, + "logps/rejected": -356.3989562988281, + "loss": 0.1831, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.015237540006637573, + "rewards/margins": 3.6852195262908936, + "rewards/rejected": -3.7004568576812744, + "step": 6582 + }, + { + "epoch": 0.76, + "learning_rate": 7.342853798431464e-08, + "logits/chosen": -2.017737865447998, + "logits/rejected": -2.3938307762145996, + "logps/chosen": -432.1676025390625, + "logps/rejected": -213.11270141601562, + "loss": 0.5671, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8585206866264343, + "rewards/margins": 2.0337889194488525, + "rewards/rejected": -2.8923096656799316, + "step": 6583 + }, + { + "epoch": 0.76, + "learning_rate": 7.339342151469039e-08, + "logits/chosen": -2.286989212036133, + "logits/rejected": -2.4330549240112305, + "logps/chosen": -412.40899658203125, + "logps/rejected": -352.7737731933594, + "loss": 0.2044, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7383686304092407, + "rewards/margins": 3.1922600269317627, + "rewards/rejected": -4.930628299713135, + "step": 6584 + }, + { + "epoch": 0.76, + "learning_rate": 7.335830504506613e-08, + "logits/chosen": -2.957315444946289, + "logits/rejected": -2.8954811096191406, + "logps/chosen": -215.84852600097656, + "logps/rejected": -209.32989501953125, + "loss": 0.7001, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.28403902053833, + "rewards/margins": 1.6776708364486694, + "rewards/rejected": -2.96170973777771, + "step": 6585 + }, + { + "epoch": 0.76, + "learning_rate": 7.332318857544189e-08, + "logits/chosen": -2.3982954025268555, + "logits/rejected": -2.6581361293792725, + "logps/chosen": -324.9609069824219, + "logps/rejected": -296.5204772949219, + "loss": 0.3534, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46773651242256165, + "rewards/margins": 2.962765693664551, + "rewards/rejected": -3.430501937866211, + "step": 6586 + }, + { + "epoch": 0.76, + "learning_rate": 7.328807210581763e-08, + "logits/chosen": -2.498394727706909, + "logits/rejected": -2.6469500064849854, + "logps/chosen": -203.19271850585938, + "logps/rejected": -185.5350341796875, + "loss": 0.2946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42344141006469727, + "rewards/margins": 1.9482018947601318, + "rewards/rejected": -2.371643304824829, + "step": 6587 + }, + { + "epoch": 0.76, + "learning_rate": 7.325295563619337e-08, + "logits/chosen": -2.784658670425415, + "logits/rejected": -2.680602550506592, + "logps/chosen": -280.6339416503906, + "logps/rejected": -393.4029846191406, + "loss": 0.2617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7935371398925781, + "rewards/margins": 2.337416648864746, + "rewards/rejected": -3.130953788757324, + "step": 6588 + }, + { + "epoch": 0.76, + "learning_rate": 7.321783916656912e-08, + "logits/chosen": -2.47892427444458, + "logits/rejected": -2.5817699432373047, + "logps/chosen": -463.1778869628906, + "logps/rejected": -291.6208801269531, + "loss": 0.4902, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.373926043510437, + "rewards/margins": 2.657534122467041, + "rewards/rejected": -4.031460285186768, + "step": 6589 + }, + { + "epoch": 0.76, + "learning_rate": 7.318272269694486e-08, + "logits/chosen": -2.142310619354248, + "logits/rejected": -2.1699774265289307, + "logps/chosen": -310.402099609375, + "logps/rejected": -355.59417724609375, + "loss": 0.2073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.568584680557251, + "rewards/margins": 3.3260912895202637, + "rewards/rejected": -3.8946759700775146, + "step": 6590 + }, + { + "epoch": 0.76, + "learning_rate": 7.31476062273206e-08, + "logits/chosen": -2.2416329383850098, + "logits/rejected": -2.2963762283325195, + "logps/chosen": -379.60308837890625, + "logps/rejected": -275.83526611328125, + "loss": 0.2953, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7959184050559998, + "rewards/margins": 2.4863853454589844, + "rewards/rejected": -3.28230357170105, + "step": 6591 + }, + { + "epoch": 0.76, + "learning_rate": 7.311248975769636e-08, + "logits/chosen": -2.1255650520324707, + "logits/rejected": -2.041063070297241, + "logps/chosen": -277.0828552246094, + "logps/rejected": -292.6999206542969, + "loss": 0.3072, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5700984001159668, + "rewards/margins": 2.5986952781677246, + "rewards/rejected": -4.168793678283691, + "step": 6592 + }, + { + "epoch": 0.76, + "learning_rate": 7.30773732880721e-08, + "logits/chosen": -2.6184167861938477, + "logits/rejected": -2.5675034523010254, + "logps/chosen": -195.41470336914062, + "logps/rejected": -212.43106079101562, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09418143332004547, + "rewards/margins": 2.903935432434082, + "rewards/rejected": -2.8097541332244873, + "step": 6593 + }, + { + "epoch": 0.76, + "learning_rate": 7.304225681844785e-08, + "logits/chosen": -2.2467994689941406, + "logits/rejected": -2.1661181449890137, + "logps/chosen": -179.3992919921875, + "logps/rejected": -284.5915222167969, + "loss": 0.2663, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3967955112457275, + "rewards/margins": 2.888115167617798, + "rewards/rejected": -4.284910678863525, + "step": 6594 + }, + { + "epoch": 0.76, + "learning_rate": 7.300714034882359e-08, + "logits/chosen": -2.2350986003875732, + "logits/rejected": -2.002697706222534, + "logps/chosen": -143.65359497070312, + "logps/rejected": -242.56759643554688, + "loss": 0.3316, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.010109543800354, + "rewards/margins": 1.9343818426132202, + "rewards/rejected": -2.9444916248321533, + "step": 6595 + }, + { + "epoch": 0.76, + "learning_rate": 7.297202387919934e-08, + "logits/chosen": -2.563735008239746, + "logits/rejected": -2.430177688598633, + "logps/chosen": -398.97308349609375, + "logps/rejected": -313.1370849609375, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1243537962436676, + "rewards/margins": 1.8521254062652588, + "rewards/rejected": -1.976479172706604, + "step": 6596 + }, + { + "epoch": 0.76, + "learning_rate": 7.293690740957509e-08, + "logits/chosen": -2.528378486633301, + "logits/rejected": -2.5862021446228027, + "logps/chosen": -323.8849792480469, + "logps/rejected": -278.6942138671875, + "loss": 0.4497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7279007434844971, + "rewards/margins": 1.5155599117279053, + "rewards/rejected": -2.2434606552124023, + "step": 6597 + }, + { + "epoch": 0.76, + "learning_rate": 7.290179093995084e-08, + "logits/chosen": -2.185154438018799, + "logits/rejected": -2.1501991748809814, + "logps/chosen": -471.0985107421875, + "logps/rejected": -462.4435119628906, + "loss": 0.2889, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3198137283325195, + "rewards/margins": 3.003755569458008, + "rewards/rejected": -4.323569297790527, + "step": 6598 + }, + { + "epoch": 0.76, + "learning_rate": 7.286667447032658e-08, + "logits/chosen": -2.231168270111084, + "logits/rejected": -2.4333651065826416, + "logps/chosen": -212.90109252929688, + "logps/rejected": -188.47264099121094, + "loss": 0.2319, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3626757860183716, + "rewards/margins": 2.292198657989502, + "rewards/rejected": -2.654874801635742, + "step": 6599 + }, + { + "epoch": 0.76, + "learning_rate": 7.283155800070233e-08, + "logits/chosen": -1.8090993165969849, + "logits/rejected": -1.906659722328186, + "logps/chosen": -317.9287414550781, + "logps/rejected": -308.2829284667969, + "loss": 0.5314, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6422858238220215, + "rewards/margins": 2.0823616981506348, + "rewards/rejected": -3.7246475219726562, + "step": 6600 + }, + { + "epoch": 0.76, + "learning_rate": 7.279644153107807e-08, + "logits/chosen": -2.617766857147217, + "logits/rejected": -2.4368526935577393, + "logps/chosen": -284.4976501464844, + "logps/rejected": -234.57962036132812, + "loss": 0.238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2557376623153687, + "rewards/margins": 3.0636487007141113, + "rewards/rejected": -4.3193864822387695, + "step": 6601 + }, + { + "epoch": 0.76, + "learning_rate": 7.276132506145382e-08, + "logits/chosen": -2.2863454818725586, + "logits/rejected": -2.61448335647583, + "logps/chosen": -371.38214111328125, + "logps/rejected": -242.08778381347656, + "loss": 0.3088, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9491421580314636, + "rewards/margins": 1.4175398349761963, + "rewards/rejected": -2.3666818141937256, + "step": 6602 + }, + { + "epoch": 0.76, + "learning_rate": 7.272620859182957e-08, + "logits/chosen": -2.438598871231079, + "logits/rejected": -2.7348599433898926, + "logps/chosen": -265.7698059082031, + "logps/rejected": -198.6767578125, + "loss": 0.2986, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12247505784034729, + "rewards/margins": 2.7686986923217773, + "rewards/rejected": -2.891174077987671, + "step": 6603 + }, + { + "epoch": 0.76, + "learning_rate": 7.269109212220531e-08, + "logits/chosen": -1.9311282634735107, + "logits/rejected": -2.1137351989746094, + "logps/chosen": -300.0704040527344, + "logps/rejected": -222.742919921875, + "loss": 0.2893, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40256619453430176, + "rewards/margins": 1.3041784763336182, + "rewards/rejected": -1.7067445516586304, + "step": 6604 + }, + { + "epoch": 0.76, + "learning_rate": 7.265597565258105e-08, + "logits/chosen": -1.9400432109832764, + "logits/rejected": -2.0882747173309326, + "logps/chosen": -352.583984375, + "logps/rejected": -355.96270751953125, + "loss": 1.2258, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.974793791770935, + "rewards/margins": 0.9408553838729858, + "rewards/rejected": -2.915649175643921, + "step": 6605 + }, + { + "epoch": 0.76, + "learning_rate": 7.26208591829568e-08, + "logits/chosen": -2.3062849044799805, + "logits/rejected": -2.565511703491211, + "logps/chosen": -437.82391357421875, + "logps/rejected": -244.30642700195312, + "loss": 0.3761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6056733131408691, + "rewards/margins": 1.532941222190857, + "rewards/rejected": -2.1386146545410156, + "step": 6606 + }, + { + "epoch": 0.76, + "learning_rate": 7.258574271333254e-08, + "logits/chosen": -1.48651123046875, + "logits/rejected": -2.2317004203796387, + "logps/chosen": -483.4029541015625, + "logps/rejected": -253.63717651367188, + "loss": 0.7378, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.227910041809082, + "rewards/margins": 0.6418951749801636, + "rewards/rejected": -1.8698053359985352, + "step": 6607 + }, + { + "epoch": 0.76, + "learning_rate": 7.25506262437083e-08, + "logits/chosen": -2.5285558700561523, + "logits/rejected": -2.5355467796325684, + "logps/chosen": -378.374755859375, + "logps/rejected": -205.38497924804688, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4017135798931122, + "rewards/margins": 2.832773447036743, + "rewards/rejected": -3.2344868183135986, + "step": 6608 + }, + { + "epoch": 0.76, + "learning_rate": 7.251550977408404e-08, + "logits/chosen": -2.315748691558838, + "logits/rejected": -2.5525519847869873, + "logps/chosen": -314.86834716796875, + "logps/rejected": -187.89559936523438, + "loss": 0.298, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0312168598175049, + "rewards/margins": 2.155311107635498, + "rewards/rejected": -3.186527729034424, + "step": 6609 + }, + { + "epoch": 0.76, + "learning_rate": 7.248039330445979e-08, + "logits/chosen": -2.0819945335388184, + "logits/rejected": -1.9693776369094849, + "logps/chosen": -267.1115417480469, + "logps/rejected": -237.85768127441406, + "loss": 0.4482, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5762979984283447, + "rewards/margins": 0.8707927465438843, + "rewards/rejected": -1.447090744972229, + "step": 6610 + }, + { + "epoch": 0.76, + "learning_rate": 7.244527683483553e-08, + "logits/chosen": -2.6588051319122314, + "logits/rejected": -2.5955734252929688, + "logps/chosen": -123.91313171386719, + "logps/rejected": -368.10968017578125, + "loss": 0.3659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9694968461990356, + "rewards/margins": 2.4469375610351562, + "rewards/rejected": -3.4164345264434814, + "step": 6611 + }, + { + "epoch": 0.76, + "learning_rate": 7.241016036521129e-08, + "logits/chosen": -2.1346940994262695, + "logits/rejected": -2.3000593185424805, + "logps/chosen": -303.23541259765625, + "logps/rejected": -260.87200927734375, + "loss": 0.5844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.480230689048767, + "rewards/margins": 1.2645267248153687, + "rewards/rejected": -2.744757652282715, + "step": 6612 + }, + { + "epoch": 0.76, + "learning_rate": 7.237504389558703e-08, + "logits/chosen": -2.143786907196045, + "logits/rejected": -2.288109064102173, + "logps/chosen": -195.0227508544922, + "logps/rejected": -302.1187438964844, + "loss": 0.401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7917927503585815, + "rewards/margins": 1.954554557800293, + "rewards/rejected": -2.746347188949585, + "step": 6613 + }, + { + "epoch": 0.76, + "learning_rate": 7.233992742596278e-08, + "logits/chosen": -2.4091379642486572, + "logits/rejected": -2.3132259845733643, + "logps/chosen": -283.0199279785156, + "logps/rejected": -232.9501190185547, + "loss": 0.4503, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.687978982925415, + "rewards/margins": 1.6174061298370361, + "rewards/rejected": -3.305385112762451, + "step": 6614 + }, + { + "epoch": 0.76, + "learning_rate": 7.230481095633852e-08, + "logits/chosen": -1.8619410991668701, + "logits/rejected": -2.154661178588867, + "logps/chosen": -329.353515625, + "logps/rejected": -352.0446472167969, + "loss": 0.4261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5950424671173096, + "rewards/margins": 3.425875663757324, + "rewards/rejected": -4.020917892456055, + "step": 6615 + }, + { + "epoch": 0.76, + "learning_rate": 7.226969448671426e-08, + "logits/chosen": -2.7122578620910645, + "logits/rejected": -2.868542194366455, + "logps/chosen": -208.04763793945312, + "logps/rejected": -269.9236755371094, + "loss": 0.1943, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1376838684082031, + "rewards/margins": 2.82210636138916, + "rewards/rejected": -3.9597902297973633, + "step": 6616 + }, + { + "epoch": 0.76, + "learning_rate": 7.223457801709002e-08, + "logits/chosen": -1.8008925914764404, + "logits/rejected": -2.1433706283569336, + "logps/chosen": -343.53778076171875, + "logps/rejected": -281.83502197265625, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08608091622591019, + "rewards/margins": 2.8699915409088135, + "rewards/rejected": -2.9560723304748535, + "step": 6617 + }, + { + "epoch": 0.76, + "learning_rate": 7.219946154746576e-08, + "logits/chosen": -2.7985053062438965, + "logits/rejected": -2.8169453144073486, + "logps/chosen": -295.3107604980469, + "logps/rejected": -256.7712707519531, + "loss": 0.5403, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3363258838653564, + "rewards/margins": 1.2768781185150146, + "rewards/rejected": -2.613204002380371, + "step": 6618 + }, + { + "epoch": 0.76, + "learning_rate": 7.21643450778415e-08, + "logits/chosen": -2.7100725173950195, + "logits/rejected": -2.7161123752593994, + "logps/chosen": -309.1927185058594, + "logps/rejected": -306.06787109375, + "loss": 0.3911, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0395057201385498, + "rewards/margins": 2.5139570236206055, + "rewards/rejected": -3.5534627437591553, + "step": 6619 + }, + { + "epoch": 0.76, + "learning_rate": 7.212922860821725e-08, + "logits/chosen": -2.3864426612854004, + "logits/rejected": -2.224055767059326, + "logps/chosen": -255.00660705566406, + "logps/rejected": -340.20263671875, + "loss": 0.2873, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6877084970474243, + "rewards/margins": 1.7278980016708374, + "rewards/rejected": -2.415606737136841, + "step": 6620 + }, + { + "epoch": 0.76, + "learning_rate": 7.209411213859299e-08, + "logits/chosen": -2.222851276397705, + "logits/rejected": -2.178166389465332, + "logps/chosen": -250.4063720703125, + "logps/rejected": -300.24896240234375, + "loss": 0.4544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6697916388511658, + "rewards/margins": 1.8554027080535889, + "rewards/rejected": -2.5251944065093994, + "step": 6621 + }, + { + "epoch": 0.76, + "learning_rate": 7.205899566896873e-08, + "logits/chosen": -1.3920515775680542, + "logits/rejected": -1.5849072933197021, + "logps/chosen": -497.8637390136719, + "logps/rejected": -394.89300537109375, + "loss": 0.292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1379123032093048, + "rewards/margins": 4.647264003753662, + "rewards/rejected": -4.7851762771606445, + "step": 6622 + }, + { + "epoch": 0.76, + "learning_rate": 7.202387919934449e-08, + "logits/chosen": -2.441420078277588, + "logits/rejected": -2.060556411743164, + "logps/chosen": -131.15036010742188, + "logps/rejected": -254.08920288085938, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5589475035667419, + "rewards/margins": 0.8844887614250183, + "rewards/rejected": -1.4434363842010498, + "step": 6623 + }, + { + "epoch": 0.76, + "learning_rate": 7.198876272972023e-08, + "logits/chosen": -2.185793161392212, + "logits/rejected": -2.379706859588623, + "logps/chosen": -299.26104736328125, + "logps/rejected": -241.14076232910156, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1591835021972656, + "rewards/margins": 2.01769757270813, + "rewards/rejected": -3.1768813133239746, + "step": 6624 + }, + { + "epoch": 0.76, + "learning_rate": 7.195364626009598e-08, + "logits/chosen": -2.4371981620788574, + "logits/rejected": -2.181095600128174, + "logps/chosen": -260.89569091796875, + "logps/rejected": -335.1754150390625, + "loss": 0.4793, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5741699934005737, + "rewards/margins": 1.6320455074310303, + "rewards/rejected": -2.2062156200408936, + "step": 6625 + }, + { + "epoch": 0.76, + "learning_rate": 7.191852979047172e-08, + "logits/chosen": -2.132554769515991, + "logits/rejected": -2.2779788970947266, + "logps/chosen": -319.11688232421875, + "logps/rejected": -316.0263977050781, + "loss": 0.4366, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07425469160079956, + "rewards/margins": 1.5968244075775146, + "rewards/rejected": -1.671079158782959, + "step": 6626 + }, + { + "epoch": 0.76, + "learning_rate": 7.188341332084748e-08, + "logits/chosen": -2.44722580909729, + "logits/rejected": -2.4340291023254395, + "logps/chosen": -291.0631408691406, + "logps/rejected": -275.71551513671875, + "loss": 0.4578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5718967914581299, + "rewards/margins": 2.0638582706451416, + "rewards/rejected": -2.6357550621032715, + "step": 6627 + }, + { + "epoch": 0.76, + "learning_rate": 7.184829685122322e-08, + "logits/chosen": -2.7695412635803223, + "logits/rejected": -2.818105697631836, + "logps/chosen": -186.42742919921875, + "logps/rejected": -200.4744873046875, + "loss": 0.7051, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.597866177558899, + "rewards/margins": 1.4935550689697266, + "rewards/rejected": -3.091421604156494, + "step": 6628 + }, + { + "epoch": 0.76, + "learning_rate": 7.181318038159897e-08, + "logits/chosen": -2.3689475059509277, + "logits/rejected": -2.344979763031006, + "logps/chosen": -354.6333923339844, + "logps/rejected": -212.07627868652344, + "loss": 0.1981, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42707204818725586, + "rewards/margins": 2.1798460483551025, + "rewards/rejected": -2.6069180965423584, + "step": 6629 + }, + { + "epoch": 0.76, + "learning_rate": 7.177806391197471e-08, + "logits/chosen": -2.1117889881134033, + "logits/rejected": -1.9801361560821533, + "logps/chosen": -202.88360595703125, + "logps/rejected": -278.56060791015625, + "loss": 0.3697, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.28791344165802, + "rewards/margins": 0.9206695556640625, + "rewards/rejected": -2.208583116531372, + "step": 6630 + }, + { + "epoch": 0.76, + "learning_rate": 7.174294744235047e-08, + "logits/chosen": -2.4711477756500244, + "logits/rejected": -2.5114822387695312, + "logps/chosen": -277.0702209472656, + "logps/rejected": -284.3976135253906, + "loss": 0.2985, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5287920832633972, + "rewards/margins": 1.7474228143692017, + "rewards/rejected": -2.276214838027954, + "step": 6631 + }, + { + "epoch": 0.76, + "learning_rate": 7.17078309727262e-08, + "logits/chosen": -2.5736236572265625, + "logits/rejected": -2.5790178775787354, + "logps/chosen": -278.5941162109375, + "logps/rejected": -324.6014404296875, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9386100769042969, + "rewards/margins": 3.9223735332489014, + "rewards/rejected": -4.860983848571777, + "step": 6632 + }, + { + "epoch": 0.76, + "learning_rate": 7.167271450310195e-08, + "logits/chosen": -2.2694835662841797, + "logits/rejected": -2.6108412742614746, + "logps/chosen": -287.8614501953125, + "logps/rejected": -204.55496215820312, + "loss": 0.7688, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3927386403083801, + "rewards/margins": 1.1464695930480957, + "rewards/rejected": -1.539207935333252, + "step": 6633 + }, + { + "epoch": 0.76, + "learning_rate": 7.16375980334777e-08, + "logits/chosen": -2.5501368045806885, + "logits/rejected": -2.499814748764038, + "logps/chosen": -133.93191528320312, + "logps/rejected": -151.9707489013672, + "loss": 0.5682, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6122276186943054, + "rewards/margins": 1.5311548709869385, + "rewards/rejected": -2.1433825492858887, + "step": 6634 + }, + { + "epoch": 0.76, + "learning_rate": 7.160248156385344e-08, + "logits/chosen": -2.52129864692688, + "logits/rejected": -2.51596999168396, + "logps/chosen": -285.4609375, + "logps/rejected": -385.142822265625, + "loss": 0.2062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7954022884368896, + "rewards/margins": 2.043616533279419, + "rewards/rejected": -3.8390188217163086, + "step": 6635 + }, + { + "epoch": 0.77, + "learning_rate": 7.156736509422918e-08, + "logits/chosen": -2.7884185314178467, + "logits/rejected": -2.8817081451416016, + "logps/chosen": -198.52052307128906, + "logps/rejected": -275.2896728515625, + "loss": 0.3558, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6985940933227539, + "rewards/margins": 1.8547452688217163, + "rewards/rejected": -2.5533394813537598, + "step": 6636 + }, + { + "epoch": 0.77, + "learning_rate": 7.153224862460494e-08, + "logits/chosen": -2.3733885288238525, + "logits/rejected": -2.310013771057129, + "logps/chosen": -394.68817138671875, + "logps/rejected": -326.8552551269531, + "loss": 0.4706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7004455924034119, + "rewards/margins": 2.294874906539917, + "rewards/rejected": -2.9953205585479736, + "step": 6637 + }, + { + "epoch": 0.77, + "learning_rate": 7.149713215498068e-08, + "logits/chosen": -2.710007429122925, + "logits/rejected": -2.717602014541626, + "logps/chosen": -159.2116241455078, + "logps/rejected": -251.26870727539062, + "loss": 0.2853, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10743259638547897, + "rewards/margins": 1.8636634349822998, + "rewards/rejected": -1.7562309503555298, + "step": 6638 + }, + { + "epoch": 0.77, + "learning_rate": 7.146201568535643e-08, + "logits/chosen": -2.0386080741882324, + "logits/rejected": -2.286372184753418, + "logps/chosen": -273.00140380859375, + "logps/rejected": -276.4611511230469, + "loss": 0.9953, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4381412267684937, + "rewards/margins": 1.9247448444366455, + "rewards/rejected": -3.3628859519958496, + "step": 6639 + }, + { + "epoch": 0.77, + "learning_rate": 7.142689921573217e-08, + "logits/chosen": -2.333872079849243, + "logits/rejected": -2.2770869731903076, + "logps/chosen": -257.40234375, + "logps/rejected": -318.359619140625, + "loss": 0.5435, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3101937770843506, + "rewards/margins": 1.566184639930725, + "rewards/rejected": -2.8763785362243652, + "step": 6640 + }, + { + "epoch": 0.77, + "learning_rate": 7.139178274610792e-08, + "logits/chosen": -1.7831604480743408, + "logits/rejected": -1.460348129272461, + "logps/chosen": -351.4964599609375, + "logps/rejected": -469.0722961425781, + "loss": 0.6309, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7639276385307312, + "rewards/margins": 2.331630229949951, + "rewards/rejected": -3.095557928085327, + "step": 6641 + }, + { + "epoch": 0.77, + "learning_rate": 7.135666627648366e-08, + "logits/chosen": -1.965080976486206, + "logits/rejected": -2.20934796333313, + "logps/chosen": -267.78485107421875, + "logps/rejected": -274.2578125, + "loss": 0.145, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5524245500564575, + "rewards/margins": 2.264028787612915, + "rewards/rejected": -3.816453456878662, + "step": 6642 + }, + { + "epoch": 0.77, + "learning_rate": 7.132154980685942e-08, + "logits/chosen": -2.222813844680786, + "logits/rejected": -2.1314659118652344, + "logps/chosen": -450.8258056640625, + "logps/rejected": -458.4003601074219, + "loss": 0.5875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0648208856582642, + "rewards/margins": 1.9091533422470093, + "rewards/rejected": -2.9739742279052734, + "step": 6643 + }, + { + "epoch": 0.77, + "learning_rate": 7.128643333723516e-08, + "logits/chosen": -2.3750860691070557, + "logits/rejected": -2.3355636596679688, + "logps/chosen": -264.5611877441406, + "logps/rejected": -378.7880859375, + "loss": 0.7995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6023264527320862, + "rewards/margins": 1.038507342338562, + "rewards/rejected": -1.6408339738845825, + "step": 6644 + }, + { + "epoch": 0.77, + "learning_rate": 7.125131686761091e-08, + "logits/chosen": -2.3957679271698, + "logits/rejected": -2.482029914855957, + "logps/chosen": -269.5824279785156, + "logps/rejected": -486.72528076171875, + "loss": 0.3433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2733010947704315, + "rewards/margins": 4.989021301269531, + "rewards/rejected": -5.262322425842285, + "step": 6645 + }, + { + "epoch": 0.77, + "learning_rate": 7.121620039798665e-08, + "logits/chosen": -2.0328431129455566, + "logits/rejected": -1.785301923751831, + "logps/chosen": -348.91668701171875, + "logps/rejected": -500.7066345214844, + "loss": 0.7426, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4314830303192139, + "rewards/margins": 2.1884829998016357, + "rewards/rejected": -3.6199660301208496, + "step": 6646 + }, + { + "epoch": 0.77, + "learning_rate": 7.11810839283624e-08, + "logits/chosen": -2.6029913425445557, + "logits/rejected": -2.526090621948242, + "logps/chosen": -167.82225036621094, + "logps/rejected": -304.88641357421875, + "loss": 0.2055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.153076410293579, + "rewards/margins": 3.5176544189453125, + "rewards/rejected": -4.670731067657471, + "step": 6647 + }, + { + "epoch": 0.77, + "learning_rate": 7.114596745873815e-08, + "logits/chosen": -2.3027520179748535, + "logits/rejected": -2.216878890991211, + "logps/chosen": -268.637939453125, + "logps/rejected": -319.3964538574219, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5999893546104431, + "rewards/margins": 2.450369358062744, + "rewards/rejected": -3.050358772277832, + "step": 6648 + }, + { + "epoch": 0.77, + "learning_rate": 7.111085098911389e-08, + "logits/chosen": -2.279813766479492, + "logits/rejected": -2.1595828533172607, + "logps/chosen": -417.2252502441406, + "logps/rejected": -402.1839599609375, + "loss": 0.7794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6174497604370117, + "rewards/margins": 1.1522210836410522, + "rewards/rejected": -1.769670844078064, + "step": 6649 + }, + { + "epoch": 0.77, + "learning_rate": 7.107573451948963e-08, + "logits/chosen": -2.0698928833007812, + "logits/rejected": -2.393535614013672, + "logps/chosen": -550.119873046875, + "logps/rejected": -360.8065185546875, + "loss": 0.2822, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7823317050933838, + "rewards/margins": 2.208890676498413, + "rewards/rejected": -2.991222381591797, + "step": 6650 + }, + { + "epoch": 0.77, + "learning_rate": 7.104061804986538e-08, + "logits/chosen": -2.2568604946136475, + "logits/rejected": -2.14437198638916, + "logps/chosen": -174.57992553710938, + "logps/rejected": -296.3221130371094, + "loss": 0.2252, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.229756474494934, + "rewards/margins": 3.3949360847473145, + "rewards/rejected": -4.624692916870117, + "step": 6651 + }, + { + "epoch": 0.77, + "learning_rate": 7.100550158024112e-08, + "logits/chosen": -2.184828758239746, + "logits/rejected": -2.1870028972625732, + "logps/chosen": -249.91981506347656, + "logps/rejected": -288.65228271484375, + "loss": 0.9448, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0637788772583008, + "rewards/margins": 0.8674069046974182, + "rewards/rejected": -1.9311856031417847, + "step": 6652 + }, + { + "epoch": 0.77, + "learning_rate": 7.097038511061688e-08, + "logits/chosen": -2.3692939281463623, + "logits/rejected": -2.2138023376464844, + "logps/chosen": -330.43170166015625, + "logps/rejected": -270.65692138671875, + "loss": 0.9772, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2998123168945312, + "rewards/margins": 0.827606201171875, + "rewards/rejected": -2.1274185180664062, + "step": 6653 + }, + { + "epoch": 0.77, + "learning_rate": 7.093526864099262e-08, + "logits/chosen": -1.8861517906188965, + "logits/rejected": -2.0191290378570557, + "logps/chosen": -352.6588134765625, + "logps/rejected": -381.59783935546875, + "loss": 0.6094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7772753834724426, + "rewards/margins": 1.8504093885421753, + "rewards/rejected": -2.6276848316192627, + "step": 6654 + }, + { + "epoch": 0.77, + "learning_rate": 7.090015217136837e-08, + "logits/chosen": -2.072643995285034, + "logits/rejected": -2.350127696990967, + "logps/chosen": -317.91754150390625, + "logps/rejected": -176.4956817626953, + "loss": 0.2814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33601081371307373, + "rewards/margins": 1.524003267288208, + "rewards/rejected": -1.8600142002105713, + "step": 6655 + }, + { + "epoch": 0.77, + "learning_rate": 7.086503570174411e-08, + "logits/chosen": -2.1827728748321533, + "logits/rejected": -2.1750142574310303, + "logps/chosen": -283.655029296875, + "logps/rejected": -331.434326171875, + "loss": 0.2331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9779973030090332, + "rewards/margins": 1.7950780391693115, + "rewards/rejected": -2.773075580596924, + "step": 6656 + }, + { + "epoch": 0.77, + "learning_rate": 7.082991923211987e-08, + "logits/chosen": -1.813084363937378, + "logits/rejected": -2.2749319076538086, + "logps/chosen": -704.039306640625, + "logps/rejected": -343.8106689453125, + "loss": 0.234, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14798887073993683, + "rewards/margins": 2.7017929553985596, + "rewards/rejected": -2.5538039207458496, + "step": 6657 + }, + { + "epoch": 0.77, + "learning_rate": 7.079480276249561e-08, + "logits/chosen": -2.220482349395752, + "logits/rejected": -2.007847785949707, + "logps/chosen": -217.0960235595703, + "logps/rejected": -425.6585998535156, + "loss": 0.514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8239858150482178, + "rewards/margins": 1.0241811275482178, + "rewards/rejected": -1.8481669425964355, + "step": 6658 + }, + { + "epoch": 0.77, + "learning_rate": 7.075968629287136e-08, + "logits/chosen": -2.9681222438812256, + "logits/rejected": -2.9564170837402344, + "logps/chosen": -405.97039794921875, + "logps/rejected": -354.3171691894531, + "loss": 0.2931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8125168085098267, + "rewards/margins": 2.942134380340576, + "rewards/rejected": -3.7546510696411133, + "step": 6659 + }, + { + "epoch": 0.77, + "learning_rate": 7.07245698232471e-08, + "logits/chosen": -2.7305006980895996, + "logits/rejected": -2.5144970417022705, + "logps/chosen": -264.73046875, + "logps/rejected": -346.4132995605469, + "loss": 0.161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4196627140045166, + "rewards/margins": 3.637057065963745, + "rewards/rejected": -5.056719779968262, + "step": 6660 + }, + { + "epoch": 0.77, + "learning_rate": 7.068945335362286e-08, + "logits/chosen": -2.0773165225982666, + "logits/rejected": -2.2103161811828613, + "logps/chosen": -415.56903076171875, + "logps/rejected": -348.16790771484375, + "loss": 0.2935, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9938632845878601, + "rewards/margins": 2.214933395385742, + "rewards/rejected": -3.208796501159668, + "step": 6661 + }, + { + "epoch": 0.77, + "learning_rate": 7.06543368839986e-08, + "logits/chosen": -2.3844354152679443, + "logits/rejected": -2.4711053371429443, + "logps/chosen": -293.5450744628906, + "logps/rejected": -393.40936279296875, + "loss": 0.7353, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9538382291793823, + "rewards/margins": 1.3180160522460938, + "rewards/rejected": -3.2718544006347656, + "step": 6662 + }, + { + "epoch": 0.77, + "learning_rate": 7.061922041437434e-08, + "logits/chosen": -2.5033559799194336, + "logits/rejected": -2.4870378971099854, + "logps/chosen": -254.18190002441406, + "logps/rejected": -323.4244384765625, + "loss": 0.5355, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2165428400039673, + "rewards/margins": 0.7042321562767029, + "rewards/rejected": -1.9207749366760254, + "step": 6663 + }, + { + "epoch": 0.77, + "learning_rate": 7.058410394475008e-08, + "logits/chosen": -2.4469728469848633, + "logits/rejected": -2.409763813018799, + "logps/chosen": -243.98846435546875, + "logps/rejected": -255.831787109375, + "loss": 0.494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8237525224685669, + "rewards/margins": 1.6707022190093994, + "rewards/rejected": -2.4944546222686768, + "step": 6664 + }, + { + "epoch": 0.77, + "learning_rate": 7.054898747512583e-08, + "logits/chosen": -1.8271164894104004, + "logits/rejected": -2.1292757987976074, + "logps/chosen": -255.5673065185547, + "logps/rejected": -182.53799438476562, + "loss": 0.4124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6737954616546631, + "rewards/margins": 1.6756551265716553, + "rewards/rejected": -2.3494503498077393, + "step": 6665 + }, + { + "epoch": 0.77, + "learning_rate": 7.051387100550157e-08, + "logits/chosen": -2.419255495071411, + "logits/rejected": -2.4634084701538086, + "logps/chosen": -285.1763916015625, + "logps/rejected": -294.8009033203125, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1805657148361206, + "rewards/margins": 2.189134120941162, + "rewards/rejected": -3.369699716567993, + "step": 6666 + }, + { + "epoch": 0.77, + "learning_rate": 7.047875453587733e-08, + "logits/chosen": -2.331864833831787, + "logits/rejected": -2.5453243255615234, + "logps/chosen": -278.716552734375, + "logps/rejected": -379.5384521484375, + "loss": 0.1247, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0536227226257324, + "rewards/margins": 3.6241683959960938, + "rewards/rejected": -4.677791118621826, + "step": 6667 + }, + { + "epoch": 0.77, + "learning_rate": 7.044363806625307e-08, + "logits/chosen": -2.459848642349243, + "logits/rejected": -2.237161159515381, + "logps/chosen": -338.3894958496094, + "logps/rejected": -404.4575500488281, + "loss": 0.2049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9860031604766846, + "rewards/margins": 2.1903364658355713, + "rewards/rejected": -3.176339626312256, + "step": 6668 + }, + { + "epoch": 0.77, + "learning_rate": 7.040852159662882e-08, + "logits/chosen": -2.387862205505371, + "logits/rejected": -2.3465416431427, + "logps/chosen": -223.95755004882812, + "logps/rejected": -338.654052734375, + "loss": 0.3536, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7776167392730713, + "rewards/margins": 3.748176336288452, + "rewards/rejected": -4.525793075561523, + "step": 6669 + }, + { + "epoch": 0.77, + "learning_rate": 7.037340512700456e-08, + "logits/chosen": -1.9978386163711548, + "logits/rejected": -2.257354259490967, + "logps/chosen": -538.5094604492188, + "logps/rejected": -465.3037414550781, + "loss": 0.202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6144254207611084, + "rewards/margins": 2.5862228870391846, + "rewards/rejected": -3.200648307800293, + "step": 6670 + }, + { + "epoch": 0.77, + "learning_rate": 7.03382886573803e-08, + "logits/chosen": -2.192404270172119, + "logits/rejected": -2.137925148010254, + "logps/chosen": -336.56036376953125, + "logps/rejected": -281.96197509765625, + "loss": 0.2679, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6038158535957336, + "rewards/margins": 3.045823812484741, + "rewards/rejected": -3.649639844894409, + "step": 6671 + }, + { + "epoch": 0.77, + "learning_rate": 7.030317218775606e-08, + "logits/chosen": -2.8039727210998535, + "logits/rejected": -2.6786060333251953, + "logps/chosen": -183.0984649658203, + "logps/rejected": -143.2879638671875, + "loss": 0.5208, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1276135444641113, + "rewards/margins": 1.2629650831222534, + "rewards/rejected": -3.3905787467956543, + "step": 6672 + }, + { + "epoch": 0.77, + "learning_rate": 7.02680557181318e-08, + "logits/chosen": -2.756143093109131, + "logits/rejected": -2.6356470584869385, + "logps/chosen": -243.47708129882812, + "logps/rejected": -311.344482421875, + "loss": 0.2837, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5396296977996826, + "rewards/margins": 4.189191818237305, + "rewards/rejected": -5.728821754455566, + "step": 6673 + }, + { + "epoch": 0.77, + "learning_rate": 7.023293924850755e-08, + "logits/chosen": -1.7428600788116455, + "logits/rejected": -1.9756419658660889, + "logps/chosen": -412.53607177734375, + "logps/rejected": -456.5758361816406, + "loss": 0.305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32615458965301514, + "rewards/margins": 3.126408576965332, + "rewards/rejected": -3.4525630474090576, + "step": 6674 + }, + { + "epoch": 0.77, + "learning_rate": 7.019782277888329e-08, + "logits/chosen": -2.3834919929504395, + "logits/rejected": -2.040296792984009, + "logps/chosen": -258.19683837890625, + "logps/rejected": -313.76715087890625, + "loss": 0.4465, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.46639084815979, + "rewards/margins": 1.6773066520690918, + "rewards/rejected": -3.143697500228882, + "step": 6675 + }, + { + "epoch": 0.77, + "learning_rate": 7.016270630925904e-08, + "logits/chosen": -2.41153621673584, + "logits/rejected": -2.2828762531280518, + "logps/chosen": -264.454833984375, + "logps/rejected": -254.7000732421875, + "loss": 0.603, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8967492580413818, + "rewards/margins": 1.4105899333953857, + "rewards/rejected": -2.3073389530181885, + "step": 6676 + }, + { + "epoch": 0.77, + "learning_rate": 7.012758983963479e-08, + "logits/chosen": -1.6613078117370605, + "logits/rejected": -1.912219762802124, + "logps/chosen": -471.01934814453125, + "logps/rejected": -479.4209289550781, + "loss": 0.2702, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6742492914199829, + "rewards/margins": 2.258509635925293, + "rewards/rejected": -2.9327588081359863, + "step": 6677 + }, + { + "epoch": 0.77, + "learning_rate": 7.009247337001054e-08, + "logits/chosen": -1.9337022304534912, + "logits/rejected": -2.153909683227539, + "logps/chosen": -236.76797485351562, + "logps/rejected": -320.4718017578125, + "loss": 0.5937, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3952947854995728, + "rewards/margins": 2.8668792247772217, + "rewards/rejected": -4.262174129486084, + "step": 6678 + }, + { + "epoch": 0.77, + "learning_rate": 7.005735690038628e-08, + "logits/chosen": -2.339879035949707, + "logits/rejected": -2.439502000808716, + "logps/chosen": -309.5301818847656, + "logps/rejected": -354.8804016113281, + "loss": 0.3977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9906840920448303, + "rewards/margins": 2.5133938789367676, + "rewards/rejected": -3.5040781497955322, + "step": 6679 + }, + { + "epoch": 0.77, + "learning_rate": 7.002224043076202e-08, + "logits/chosen": -1.7636220455169678, + "logits/rejected": -2.180699348449707, + "logps/chosen": -485.38409423828125, + "logps/rejected": -309.9059753417969, + "loss": 0.7889, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2234880924224854, + "rewards/margins": 1.2125449180603027, + "rewards/rejected": -3.436033010482788, + "step": 6680 + }, + { + "epoch": 0.77, + "learning_rate": 6.998712396113776e-08, + "logits/chosen": -2.664578914642334, + "logits/rejected": -2.6728079319000244, + "logps/chosen": -238.37225341796875, + "logps/rejected": -514.2366943359375, + "loss": 0.2594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.342194139957428, + "rewards/margins": 2.5897278785705566, + "rewards/rejected": -2.93192195892334, + "step": 6681 + }, + { + "epoch": 0.77, + "learning_rate": 6.995200749151351e-08, + "logits/chosen": -2.4200069904327393, + "logits/rejected": -2.212472438812256, + "logps/chosen": -216.6984100341797, + "logps/rejected": -251.43601989746094, + "loss": 0.3052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.553771436214447, + "rewards/margins": 1.5679337978363037, + "rewards/rejected": -2.1217050552368164, + "step": 6682 + }, + { + "epoch": 0.77, + "learning_rate": 6.991689102188926e-08, + "logits/chosen": -2.6119937896728516, + "logits/rejected": -2.619546413421631, + "logps/chosen": -343.4845886230469, + "logps/rejected": -316.22100830078125, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15900202095508575, + "rewards/margins": 3.1242940425872803, + "rewards/rejected": -3.2832958698272705, + "step": 6683 + }, + { + "epoch": 0.77, + "learning_rate": 6.988177455226501e-08, + "logits/chosen": -2.0351977348327637, + "logits/rejected": -2.343087673187256, + "logps/chosen": -635.7061157226562, + "logps/rejected": -403.1670227050781, + "loss": 0.3344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.504303514957428, + "rewards/margins": 1.186483383178711, + "rewards/rejected": -1.6907869577407837, + "step": 6684 + }, + { + "epoch": 0.77, + "learning_rate": 6.984665808264075e-08, + "logits/chosen": -2.932283639907837, + "logits/rejected": -2.8298282623291016, + "logps/chosen": -283.55218505859375, + "logps/rejected": -153.82373046875, + "loss": 0.2429, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6139329671859741, + "rewards/margins": 1.642127513885498, + "rewards/rejected": -2.2560606002807617, + "step": 6685 + }, + { + "epoch": 0.77, + "learning_rate": 6.98115416130165e-08, + "logits/chosen": -1.9292421340942383, + "logits/rejected": -1.7105920314788818, + "logps/chosen": -195.61863708496094, + "logps/rejected": -315.07183837890625, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9737096428871155, + "rewards/margins": 3.6154937744140625, + "rewards/rejected": -4.589203357696533, + "step": 6686 + }, + { + "epoch": 0.77, + "learning_rate": 6.977642514339224e-08, + "logits/chosen": -2.994924545288086, + "logits/rejected": -2.9803617000579834, + "logps/chosen": -187.88897705078125, + "logps/rejected": -219.99172973632812, + "loss": 0.3818, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8355489373207092, + "rewards/margins": 3.9951958656311035, + "rewards/rejected": -4.830744743347168, + "step": 6687 + }, + { + "epoch": 0.77, + "learning_rate": 6.9741308673768e-08, + "logits/chosen": -2.660395860671997, + "logits/rejected": -2.6100728511810303, + "logps/chosen": -377.8171691894531, + "logps/rejected": -451.81597900390625, + "loss": 0.4293, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1421983242034912, + "rewards/margins": 0.8054568767547607, + "rewards/rejected": -1.9476553201675415, + "step": 6688 + }, + { + "epoch": 0.77, + "learning_rate": 6.970619220414374e-08, + "logits/chosen": -1.9195563793182373, + "logits/rejected": -1.7944326400756836, + "logps/chosen": -162.62413024902344, + "logps/rejected": -280.0356140136719, + "loss": 0.8932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8762900829315186, + "rewards/margins": 1.4634872674942017, + "rewards/rejected": -2.3397772312164307, + "step": 6689 + }, + { + "epoch": 0.77, + "learning_rate": 6.967107573451949e-08, + "logits/chosen": -2.3453638553619385, + "logits/rejected": -2.53228759765625, + "logps/chosen": -294.3785400390625, + "logps/rejected": -184.370849609375, + "loss": 0.5254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5385358929634094, + "rewards/margins": 2.865504503250122, + "rewards/rejected": -3.4040403366088867, + "step": 6690 + }, + { + "epoch": 0.77, + "learning_rate": 6.963595926489523e-08, + "logits/chosen": -2.1498539447784424, + "logits/rejected": -2.400094747543335, + "logps/chosen": -294.10211181640625, + "logps/rejected": -339.94805908203125, + "loss": 0.3556, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7467180490493774, + "rewards/margins": 3.6471846103668213, + "rewards/rejected": -5.393902778625488, + "step": 6691 + }, + { + "epoch": 0.77, + "learning_rate": 6.960084279527099e-08, + "logits/chosen": -1.8392174243927002, + "logits/rejected": -1.8155105113983154, + "logps/chosen": -238.42953491210938, + "logps/rejected": -193.55844116210938, + "loss": 0.709, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5157294273376465, + "rewards/margins": 0.6092759370803833, + "rewards/rejected": -2.1250052452087402, + "step": 6692 + }, + { + "epoch": 0.77, + "learning_rate": 6.956572632564673e-08, + "logits/chosen": -2.52777099609375, + "logits/rejected": -2.5506081581115723, + "logps/chosen": -241.331787109375, + "logps/rejected": -302.6144104003906, + "loss": 0.2668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6408417820930481, + "rewards/margins": 2.6004202365875244, + "rewards/rejected": -3.2412619590759277, + "step": 6693 + }, + { + "epoch": 0.77, + "learning_rate": 6.953060985602247e-08, + "logits/chosen": -2.9488143920898438, + "logits/rejected": -2.997279644012451, + "logps/chosen": -279.1662292480469, + "logps/rejected": -304.5806884765625, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6873432397842407, + "rewards/margins": 2.4695992469787598, + "rewards/rejected": -3.156942367553711, + "step": 6694 + }, + { + "epoch": 0.77, + "learning_rate": 6.949549338639822e-08, + "logits/chosen": -2.187013626098633, + "logits/rejected": -2.1650314331054688, + "logps/chosen": -175.84222412109375, + "logps/rejected": -243.56951904296875, + "loss": 0.2012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9230594635009766, + "rewards/margins": 1.9253065586090088, + "rewards/rejected": -2.8483660221099854, + "step": 6695 + }, + { + "epoch": 0.77, + "learning_rate": 6.946037691677396e-08, + "logits/chosen": -2.053874969482422, + "logits/rejected": -2.0610713958740234, + "logps/chosen": -308.840087890625, + "logps/rejected": -312.69549560546875, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.695216178894043, + "rewards/margins": 4.9729228019714355, + "rewards/rejected": -5.6681389808654785, + "step": 6696 + }, + { + "epoch": 0.77, + "learning_rate": 6.94252604471497e-08, + "logits/chosen": -1.682120442390442, + "logits/rejected": -1.7478764057159424, + "logps/chosen": -308.6492004394531, + "logps/rejected": -181.69235229492188, + "loss": 0.3555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2480722665786743, + "rewards/margins": 1.8888306617736816, + "rewards/rejected": -3.1369028091430664, + "step": 6697 + }, + { + "epoch": 0.77, + "learning_rate": 6.939014397752546e-08, + "logits/chosen": -2.199450731277466, + "logits/rejected": -2.2975873947143555, + "logps/chosen": -230.27716064453125, + "logps/rejected": -314.9104919433594, + "loss": 0.4612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.088137686252594, + "rewards/margins": 2.202650785446167, + "rewards/rejected": -2.290788412094116, + "step": 6698 + }, + { + "epoch": 0.77, + "learning_rate": 6.93550275079012e-08, + "logits/chosen": -2.092144250869751, + "logits/rejected": -2.307640552520752, + "logps/chosen": -408.0372314453125, + "logps/rejected": -325.94696044921875, + "loss": 0.3821, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8559666872024536, + "rewards/margins": 1.5919098854064941, + "rewards/rejected": -2.4478766918182373, + "step": 6699 + }, + { + "epoch": 0.77, + "learning_rate": 6.931991103827695e-08, + "logits/chosen": -2.4803261756896973, + "logits/rejected": -2.389061450958252, + "logps/chosen": -244.63623046875, + "logps/rejected": -345.865478515625, + "loss": 0.2101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7193007469177246, + "rewards/margins": 2.3583152294158936, + "rewards/rejected": -3.077615976333618, + "step": 6700 + }, + { + "epoch": 0.77, + "learning_rate": 6.928479456865269e-08, + "logits/chosen": -2.076972723007202, + "logits/rejected": -1.8387677669525146, + "logps/chosen": -216.87200927734375, + "logps/rejected": -393.7132568359375, + "loss": 0.1229, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4892551898956299, + "rewards/margins": 4.025708198547363, + "rewards/rejected": -5.5149641036987305, + "step": 6701 + }, + { + "epoch": 0.77, + "learning_rate": 6.924967809902845e-08, + "logits/chosen": -2.5698342323303223, + "logits/rejected": -2.601274013519287, + "logps/chosen": -260.08782958984375, + "logps/rejected": -243.79672241210938, + "loss": 0.4876, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0494881868362427, + "rewards/margins": 1.184032917022705, + "rewards/rejected": -2.233520984649658, + "step": 6702 + }, + { + "epoch": 0.77, + "learning_rate": 6.921456162940419e-08, + "logits/chosen": -2.3321187496185303, + "logits/rejected": -2.1709043979644775, + "logps/chosen": -195.76031494140625, + "logps/rejected": -216.59970092773438, + "loss": 0.7379, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4668716788291931, + "rewards/margins": 0.453809529542923, + "rewards/rejected": -0.9206812381744385, + "step": 6703 + }, + { + "epoch": 0.77, + "learning_rate": 6.917944515977994e-08, + "logits/chosen": -1.8849859237670898, + "logits/rejected": -2.190915584564209, + "logps/chosen": -174.98699951171875, + "logps/rejected": -140.4114227294922, + "loss": 0.843, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.458606243133545, + "rewards/margins": 0.012782968580722809, + "rewards/rejected": -1.4713892936706543, + "step": 6704 + }, + { + "epoch": 0.77, + "learning_rate": 6.914432869015568e-08, + "logits/chosen": -2.71390700340271, + "logits/rejected": -2.8381803035736084, + "logps/chosen": -208.2774200439453, + "logps/rejected": -266.2516174316406, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4084419012069702, + "rewards/margins": 5.189676284790039, + "rewards/rejected": -6.598117828369141, + "step": 6705 + }, + { + "epoch": 0.77, + "learning_rate": 6.910921222053144e-08, + "logits/chosen": -2.2043848037719727, + "logits/rejected": -2.2306787967681885, + "logps/chosen": -236.7258758544922, + "logps/rejected": -237.10458374023438, + "loss": 0.4971, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6011323928833008, + "rewards/margins": 1.9283406734466553, + "rewards/rejected": -3.529472827911377, + "step": 6706 + }, + { + "epoch": 0.77, + "learning_rate": 6.907409575090718e-08, + "logits/chosen": -2.4481735229492188, + "logits/rejected": -2.3565847873687744, + "logps/chosen": -343.40704345703125, + "logps/rejected": -311.79608154296875, + "loss": 0.4841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7922295928001404, + "rewards/margins": 1.9093494415283203, + "rewards/rejected": -2.7015790939331055, + "step": 6707 + }, + { + "epoch": 0.77, + "learning_rate": 6.903897928128292e-08, + "logits/chosen": -2.4036478996276855, + "logits/rejected": -2.5604841709136963, + "logps/chosen": -428.21575927734375, + "logps/rejected": -332.6805419921875, + "loss": 0.317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43616926670074463, + "rewards/margins": 2.134819984436035, + "rewards/rejected": -2.5709891319274902, + "step": 6708 + }, + { + "epoch": 0.77, + "learning_rate": 6.900386281165867e-08, + "logits/chosen": -1.7111815214157104, + "logits/rejected": -1.2251849174499512, + "logps/chosen": -453.6209411621094, + "logps/rejected": -564.198486328125, + "loss": 0.2716, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44010859727859497, + "rewards/margins": 2.3928890228271484, + "rewards/rejected": -2.8329975605010986, + "step": 6709 + }, + { + "epoch": 0.77, + "learning_rate": 6.896874634203441e-08, + "logits/chosen": -1.8132636547088623, + "logits/rejected": -2.1342036724090576, + "logps/chosen": -375.5966796875, + "logps/rejected": -316.0268859863281, + "loss": 0.6643, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7243872880935669, + "rewards/margins": 0.7098809480667114, + "rewards/rejected": -1.4342682361602783, + "step": 6710 + }, + { + "epoch": 0.77, + "learning_rate": 6.893362987241015e-08, + "logits/chosen": -2.2398104667663574, + "logits/rejected": -2.281796932220459, + "logps/chosen": -308.53759765625, + "logps/rejected": -254.76519775390625, + "loss": 0.4235, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32069826126098633, + "rewards/margins": 0.9435168504714966, + "rewards/rejected": -1.264215111732483, + "step": 6711 + }, + { + "epoch": 0.77, + "learning_rate": 6.88985134027859e-08, + "logits/chosen": -2.610139846801758, + "logits/rejected": -2.4927382469177246, + "logps/chosen": -184.14520263671875, + "logps/rejected": -200.5106201171875, + "loss": 0.4445, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0341622829437256, + "rewards/margins": 1.614742398262024, + "rewards/rejected": -2.64890456199646, + "step": 6712 + }, + { + "epoch": 0.77, + "learning_rate": 6.886339693316165e-08, + "logits/chosen": -1.9004400968551636, + "logits/rejected": -2.284421920776367, + "logps/chosen": -362.677978515625, + "logps/rejected": -383.91046142578125, + "loss": 0.4853, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39425039291381836, + "rewards/margins": 2.0802254676818848, + "rewards/rejected": -2.474475860595703, + "step": 6713 + }, + { + "epoch": 0.77, + "learning_rate": 6.88282804635374e-08, + "logits/chosen": -2.522247552871704, + "logits/rejected": -2.458831787109375, + "logps/chosen": -147.76625061035156, + "logps/rejected": -232.4400634765625, + "loss": 0.3645, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6392085552215576, + "rewards/margins": 1.6319715976715088, + "rewards/rejected": -3.2711801528930664, + "step": 6714 + }, + { + "epoch": 0.77, + "learning_rate": 6.879316399391314e-08, + "logits/chosen": -2.4341437816619873, + "logits/rejected": -2.691225051879883, + "logps/chosen": -217.6982421875, + "logps/rejected": -112.67585754394531, + "loss": 1.7371, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.880936622619629, + "rewards/margins": -0.32142889499664307, + "rewards/rejected": -2.5595076084136963, + "step": 6715 + }, + { + "epoch": 0.77, + "learning_rate": 6.87580475242889e-08, + "logits/chosen": -2.3677783012390137, + "logits/rejected": -2.3081724643707275, + "logps/chosen": -342.00018310546875, + "logps/rejected": -328.4078369140625, + "loss": 0.7901, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7127505540847778, + "rewards/margins": 0.5245767831802368, + "rewards/rejected": -2.2373273372650146, + "step": 6716 + }, + { + "epoch": 0.77, + "learning_rate": 6.872293105466463e-08, + "logits/chosen": -2.3426902294158936, + "logits/rejected": -2.5827760696411133, + "logps/chosen": -357.72216796875, + "logps/rejected": -281.29364013671875, + "loss": 0.6096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7562234401702881, + "rewards/margins": 2.4747653007507324, + "rewards/rejected": -3.2309887409210205, + "step": 6717 + }, + { + "epoch": 0.77, + "learning_rate": 6.868781458504038e-08, + "logits/chosen": -2.3596787452697754, + "logits/rejected": -2.3389084339141846, + "logps/chosen": -186.93238830566406, + "logps/rejected": -241.70733642578125, + "loss": 0.4783, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2079066038131714, + "rewards/margins": 1.587318778038025, + "rewards/rejected": -2.7952253818511963, + "step": 6718 + }, + { + "epoch": 0.77, + "learning_rate": 6.865269811541613e-08, + "logits/chosen": -2.257779121398926, + "logits/rejected": -2.4232845306396484, + "logps/chosen": -135.4590606689453, + "logps/rejected": -184.20945739746094, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3578875660896301, + "rewards/margins": 2.3958842754364014, + "rewards/rejected": -2.7537717819213867, + "step": 6719 + }, + { + "epoch": 0.77, + "learning_rate": 6.861758164579187e-08, + "logits/chosen": -2.2364754676818848, + "logits/rejected": -2.1068968772888184, + "logps/chosen": -294.05902099609375, + "logps/rejected": -403.3668518066406, + "loss": 0.5464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.956768274307251, + "rewards/margins": 0.8974238634109497, + "rewards/rejected": -1.8541920185089111, + "step": 6720 + }, + { + "epoch": 0.77, + "learning_rate": 6.858246517616762e-08, + "logits/chosen": -2.5469226837158203, + "logits/rejected": -2.304722547531128, + "logps/chosen": -281.9194641113281, + "logps/rejected": -249.95700073242188, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40206781029701233, + "rewards/margins": 2.707293748855591, + "rewards/rejected": -3.109361410140991, + "step": 6721 + }, + { + "epoch": 0.77, + "learning_rate": 6.854734870654336e-08, + "logits/chosen": -2.110722303390503, + "logits/rejected": -2.1584880352020264, + "logps/chosen": -313.741455078125, + "logps/rejected": -325.2815856933594, + "loss": 0.1227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5499436855316162, + "rewards/margins": 2.619649648666382, + "rewards/rejected": -3.169593572616577, + "step": 6722 + }, + { + "epoch": 0.78, + "learning_rate": 6.851223223691912e-08, + "logits/chosen": -2.071920156478882, + "logits/rejected": -2.1655197143554688, + "logps/chosen": -160.62347412109375, + "logps/rejected": -212.9387664794922, + "loss": 0.3078, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.34006929397583, + "rewards/margins": 2.044863224029541, + "rewards/rejected": -3.384932518005371, + "step": 6723 + }, + { + "epoch": 0.78, + "learning_rate": 6.847711576729486e-08, + "logits/chosen": -2.7086992263793945, + "logits/rejected": -2.670099973678589, + "logps/chosen": -287.3293151855469, + "logps/rejected": -294.02777099609375, + "loss": 0.4048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06737788021564484, + "rewards/margins": 1.530268907546997, + "rewards/rejected": -1.597646713256836, + "step": 6724 + }, + { + "epoch": 0.78, + "learning_rate": 6.84419992976706e-08, + "logits/chosen": -1.9941571950912476, + "logits/rejected": -1.9007185697555542, + "logps/chosen": -376.92236328125, + "logps/rejected": -262.0211181640625, + "loss": 1.1195, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.254122734069824, + "rewards/margins": 1.9133291244506836, + "rewards/rejected": -4.16745138168335, + "step": 6725 + }, + { + "epoch": 0.78, + "learning_rate": 6.840688282804635e-08, + "logits/chosen": -1.7950130701065063, + "logits/rejected": -1.9629592895507812, + "logps/chosen": -380.46697998046875, + "logps/rejected": -320.4427490234375, + "loss": 0.2805, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9200702905654907, + "rewards/margins": 2.521136999130249, + "rewards/rejected": -3.441206932067871, + "step": 6726 + }, + { + "epoch": 0.78, + "learning_rate": 6.83717663584221e-08, + "logits/chosen": -2.435770273208618, + "logits/rejected": -2.6334943771362305, + "logps/chosen": -261.5721130371094, + "logps/rejected": -334.029052734375, + "loss": 0.333, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1530113220214844, + "rewards/margins": 3.412742853164673, + "rewards/rejected": -4.565753936767578, + "step": 6727 + }, + { + "epoch": 0.78, + "learning_rate": 6.833664988879783e-08, + "logits/chosen": -2.0724642276763916, + "logits/rejected": -2.088942527770996, + "logps/chosen": -159.28170776367188, + "logps/rejected": -173.5338134765625, + "loss": 0.4413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4802561402320862, + "rewards/margins": 2.3562281131744385, + "rewards/rejected": -2.836483955383301, + "step": 6728 + }, + { + "epoch": 0.78, + "learning_rate": 6.830153341917359e-08, + "logits/chosen": -2.0413806438446045, + "logits/rejected": -2.085826873779297, + "logps/chosen": -394.3807373046875, + "logps/rejected": -332.4981994628906, + "loss": 0.5055, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41364243626594543, + "rewards/margins": 0.8301815390586853, + "rewards/rejected": -1.2438240051269531, + "step": 6729 + }, + { + "epoch": 0.78, + "learning_rate": 6.826641694954933e-08, + "logits/chosen": -2.6862616539001465, + "logits/rejected": -2.865792989730835, + "logps/chosen": -268.1058044433594, + "logps/rejected": -221.5977020263672, + "loss": 0.2284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5949941277503967, + "rewards/margins": 2.418497323989868, + "rewards/rejected": -3.01349139213562, + "step": 6730 + }, + { + "epoch": 0.78, + "learning_rate": 6.823130047992508e-08, + "logits/chosen": -1.995114803314209, + "logits/rejected": -1.7002314329147339, + "logps/chosen": -413.95684814453125, + "logps/rejected": -439.14508056640625, + "loss": 0.4942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7224481105804443, + "rewards/margins": 1.567549228668213, + "rewards/rejected": -2.2899973392486572, + "step": 6731 + }, + { + "epoch": 0.78, + "learning_rate": 6.819618401030082e-08, + "logits/chosen": -2.330484628677368, + "logits/rejected": -2.35072660446167, + "logps/chosen": -240.6108856201172, + "logps/rejected": -191.61265563964844, + "loss": 0.5301, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2294518947601318, + "rewards/margins": 0.9714474678039551, + "rewards/rejected": -2.200899362564087, + "step": 6732 + }, + { + "epoch": 0.78, + "learning_rate": 6.816106754067658e-08, + "logits/chosen": -2.4596974849700928, + "logits/rejected": -2.557394504547119, + "logps/chosen": -235.11187744140625, + "logps/rejected": -237.45196533203125, + "loss": 0.2035, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0717333555221558, + "rewards/margins": 2.422877550125122, + "rewards/rejected": -3.4946110248565674, + "step": 6733 + }, + { + "epoch": 0.78, + "learning_rate": 6.812595107105232e-08, + "logits/chosen": -2.44350004196167, + "logits/rejected": -2.5103728771209717, + "logps/chosen": -178.7794952392578, + "logps/rejected": -196.1624298095703, + "loss": 0.6409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8669584393501282, + "rewards/margins": 2.116638660430908, + "rewards/rejected": -2.9835970401763916, + "step": 6734 + }, + { + "epoch": 0.78, + "learning_rate": 6.809083460142807e-08, + "logits/chosen": -2.1108617782592773, + "logits/rejected": -2.119063377380371, + "logps/chosen": -328.6092834472656, + "logps/rejected": -472.3523864746094, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6108553409576416, + "rewards/margins": 3.145212173461914, + "rewards/rejected": -3.7560672760009766, + "step": 6735 + }, + { + "epoch": 0.78, + "learning_rate": 6.805571813180381e-08, + "logits/chosen": -1.9930908679962158, + "logits/rejected": -2.0819313526153564, + "logps/chosen": -327.0570068359375, + "logps/rejected": -205.13009643554688, + "loss": 0.2411, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6513717174530029, + "rewards/margins": 3.030007839202881, + "rewards/rejected": -3.681379556655884, + "step": 6736 + }, + { + "epoch": 0.78, + "learning_rate": 6.802060166217957e-08, + "logits/chosen": -2.530909776687622, + "logits/rejected": -2.4021825790405273, + "logps/chosen": -167.8187255859375, + "logps/rejected": -219.34219360351562, + "loss": 0.9467, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0094399452209473, + "rewards/margins": 0.05032829940319061, + "rewards/rejected": -2.0597684383392334, + "step": 6737 + }, + { + "epoch": 0.78, + "learning_rate": 6.798548519255531e-08, + "logits/chosen": -2.071357250213623, + "logits/rejected": -2.2555766105651855, + "logps/chosen": -421.1507873535156, + "logps/rejected": -281.18023681640625, + "loss": 1.4135, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3051280975341797, + "rewards/margins": -0.2884330749511719, + "rewards/rejected": -2.016695022583008, + "step": 6738 + }, + { + "epoch": 0.78, + "learning_rate": 6.795036872293105e-08, + "logits/chosen": -2.1231942176818848, + "logits/rejected": -2.191234827041626, + "logps/chosen": -259.568603515625, + "logps/rejected": -236.04058837890625, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.584865927696228, + "rewards/margins": 2.9034790992736816, + "rewards/rejected": -3.48834490776062, + "step": 6739 + }, + { + "epoch": 0.78, + "learning_rate": 6.79152522533068e-08, + "logits/chosen": -2.1572012901306152, + "logits/rejected": -2.3489997386932373, + "logps/chosen": -324.45660400390625, + "logps/rejected": -294.21868896484375, + "loss": 0.3106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3674315810203552, + "rewards/margins": 2.080003499984741, + "rewards/rejected": -2.447435140609741, + "step": 6740 + }, + { + "epoch": 0.78, + "learning_rate": 6.788013578368254e-08, + "logits/chosen": -2.7618861198425293, + "logits/rejected": -2.863643169403076, + "logps/chosen": -236.96994018554688, + "logps/rejected": -225.96591186523438, + "loss": 0.7964, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6347997188568115, + "rewards/margins": 0.5968242883682251, + "rewards/rejected": -2.231623888015747, + "step": 6741 + }, + { + "epoch": 0.78, + "learning_rate": 6.784501931405828e-08, + "logits/chosen": -2.574768304824829, + "logits/rejected": -2.6293134689331055, + "logps/chosen": -298.2076110839844, + "logps/rejected": -262.523681640625, + "loss": 0.6995, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7433321475982666, + "rewards/margins": 2.8223984241485596, + "rewards/rejected": -4.565730094909668, + "step": 6742 + }, + { + "epoch": 0.78, + "learning_rate": 6.780990284443404e-08, + "logits/chosen": -2.1157681941986084, + "logits/rejected": -2.197146415710449, + "logps/chosen": -292.79986572265625, + "logps/rejected": -300.8026428222656, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6513869762420654, + "rewards/margins": 1.3964964151382446, + "rewards/rejected": -3.0478832721710205, + "step": 6743 + }, + { + "epoch": 0.78, + "learning_rate": 6.777478637480978e-08, + "logits/chosen": -2.336273670196533, + "logits/rejected": -2.4008865356445312, + "logps/chosen": -183.41891479492188, + "logps/rejected": -175.87307739257812, + "loss": 0.4037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7054240107536316, + "rewards/margins": 0.9977059364318848, + "rewards/rejected": -1.7031300067901611, + "step": 6744 + }, + { + "epoch": 0.78, + "learning_rate": 6.773966990518553e-08, + "logits/chosen": -2.2042388916015625, + "logits/rejected": -2.2008016109466553, + "logps/chosen": -382.7676696777344, + "logps/rejected": -260.98211669921875, + "loss": 0.209, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1514668464660645, + "rewards/margins": 2.299259901046753, + "rewards/rejected": -3.4507269859313965, + "step": 6745 + }, + { + "epoch": 0.78, + "learning_rate": 6.770455343556127e-08, + "logits/chosen": -1.7062939405441284, + "logits/rejected": -1.933680772781372, + "logps/chosen": -626.3251953125, + "logps/rejected": -493.66162109375, + "loss": 0.1382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.286014586687088, + "rewards/margins": 2.962904453277588, + "rewards/rejected": -3.2489190101623535, + "step": 6746 + }, + { + "epoch": 0.78, + "learning_rate": 6.766943696593703e-08, + "logits/chosen": -2.6966538429260254, + "logits/rejected": -2.677140951156616, + "logps/chosen": -221.00814819335938, + "logps/rejected": -141.9742431640625, + "loss": 0.719, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.802748203277588, + "rewards/margins": 1.2476435899734497, + "rewards/rejected": -3.050391674041748, + "step": 6747 + }, + { + "epoch": 0.78, + "learning_rate": 6.763432049631277e-08, + "logits/chosen": -2.011991500854492, + "logits/rejected": -2.3218023777008057, + "logps/chosen": -273.30474853515625, + "logps/rejected": -171.07669067382812, + "loss": 0.3309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5432555675506592, + "rewards/margins": 2.826922655105591, + "rewards/rejected": -3.37017822265625, + "step": 6748 + }, + { + "epoch": 0.78, + "learning_rate": 6.759920402668852e-08, + "logits/chosen": -2.1936569213867188, + "logits/rejected": -2.140475273132324, + "logps/chosen": -395.368896484375, + "logps/rejected": -333.08172607421875, + "loss": 0.4526, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7065732479095459, + "rewards/margins": 1.2458465099334717, + "rewards/rejected": -1.9524197578430176, + "step": 6749 + }, + { + "epoch": 0.78, + "learning_rate": 6.756408755706426e-08, + "logits/chosen": -2.0482866764068604, + "logits/rejected": -2.0105032920837402, + "logps/chosen": -256.34808349609375, + "logps/rejected": -333.928466796875, + "loss": 0.6735, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4019403457641602, + "rewards/margins": 2.534996271133423, + "rewards/rejected": -3.936936378479004, + "step": 6750 + }, + { + "epoch": 0.78, + "learning_rate": 6.752897108744001e-08, + "logits/chosen": -1.8107874393463135, + "logits/rejected": -2.0105295181274414, + "logps/chosen": -473.51702880859375, + "logps/rejected": -345.30108642578125, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2824500501155853, + "rewards/margins": 2.25250244140625, + "rewards/rejected": -2.534952402114868, + "step": 6751 + }, + { + "epoch": 0.78, + "learning_rate": 6.749385461781576e-08, + "logits/chosen": -2.6773886680603027, + "logits/rejected": -2.826993465423584, + "logps/chosen": -256.1320495605469, + "logps/rejected": -271.9828186035156, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.150073766708374, + "rewards/margins": 3.541935920715332, + "rewards/rejected": -4.692009449005127, + "step": 6752 + }, + { + "epoch": 0.78, + "learning_rate": 6.74587381481915e-08, + "logits/chosen": -2.914210319519043, + "logits/rejected": -2.718595504760742, + "logps/chosen": -199.5098114013672, + "logps/rejected": -315.6686096191406, + "loss": 0.0817, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7106266021728516, + "rewards/margins": 3.369297981262207, + "rewards/rejected": -4.079924583435059, + "step": 6753 + }, + { + "epoch": 0.78, + "learning_rate": 6.742362167856725e-08, + "logits/chosen": -2.5597078800201416, + "logits/rejected": -2.3808703422546387, + "logps/chosen": -200.01513671875, + "logps/rejected": -261.0828857421875, + "loss": 0.3526, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.178149700164795, + "rewards/margins": 1.8171756267547607, + "rewards/rejected": -2.9953250885009766, + "step": 6754 + }, + { + "epoch": 0.78, + "learning_rate": 6.738850520894299e-08, + "logits/chosen": -2.0992627143859863, + "logits/rejected": -2.0525896549224854, + "logps/chosen": -438.15338134765625, + "logps/rejected": -307.8097839355469, + "loss": 0.2879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5296735763549805, + "rewards/margins": 2.046416759490967, + "rewards/rejected": -2.5760903358459473, + "step": 6755 + }, + { + "epoch": 0.78, + "learning_rate": 6.735338873931873e-08, + "logits/chosen": -2.021305799484253, + "logits/rejected": -1.9387986660003662, + "logps/chosen": -250.940673828125, + "logps/rejected": -289.9202880859375, + "loss": 0.5993, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0702810287475586, + "rewards/margins": 0.6084862351417542, + "rewards/rejected": -1.6787670850753784, + "step": 6756 + }, + { + "epoch": 0.78, + "learning_rate": 6.731827226969448e-08, + "logits/chosen": -2.0168612003326416, + "logits/rejected": -1.851918339729309, + "logps/chosen": -201.95555114746094, + "logps/rejected": -340.2947998046875, + "loss": 0.072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.188016414642334, + "rewards/margins": 4.151844024658203, + "rewards/rejected": -5.339860439300537, + "step": 6757 + }, + { + "epoch": 0.78, + "learning_rate": 6.728315580007023e-08, + "logits/chosen": -1.856821060180664, + "logits/rejected": -1.957711935043335, + "logps/chosen": -324.37548828125, + "logps/rejected": -233.42721557617188, + "loss": 0.6658, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0829427242279053, + "rewards/margins": 2.028001546859741, + "rewards/rejected": -3.1109442710876465, + "step": 6758 + }, + { + "epoch": 0.78, + "learning_rate": 6.724803933044598e-08, + "logits/chosen": -1.7226048707962036, + "logits/rejected": -2.172693967819214, + "logps/chosen": -411.015625, + "logps/rejected": -174.05392456054688, + "loss": 0.6557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5601551532745361, + "rewards/margins": 1.0359444618225098, + "rewards/rejected": -1.5960993766784668, + "step": 6759 + }, + { + "epoch": 0.78, + "learning_rate": 6.721292286082172e-08, + "logits/chosen": -2.9917635917663574, + "logits/rejected": -3.0348973274230957, + "logps/chosen": -243.10836791992188, + "logps/rejected": -232.8865203857422, + "loss": 0.44, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7461118698120117, + "rewards/margins": 1.5473310947418213, + "rewards/rejected": -2.293442964553833, + "step": 6760 + }, + { + "epoch": 0.78, + "learning_rate": 6.717780639119747e-08, + "logits/chosen": -1.9500919580459595, + "logits/rejected": -1.768445611000061, + "logps/chosen": -270.3331604003906, + "logps/rejected": -336.6565246582031, + "loss": 1.1384, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8699797987937927, + "rewards/margins": 0.6286569833755493, + "rewards/rejected": -1.4986367225646973, + "step": 6761 + }, + { + "epoch": 0.78, + "learning_rate": 6.714268992157321e-08, + "logits/chosen": -2.8200371265411377, + "logits/rejected": -2.782599449157715, + "logps/chosen": -146.34422302246094, + "logps/rejected": -225.95578002929688, + "loss": 0.3857, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7270694971084595, + "rewards/margins": 1.3599169254302979, + "rewards/rejected": -2.086986541748047, + "step": 6762 + }, + { + "epoch": 0.78, + "learning_rate": 6.710757345194897e-08, + "logits/chosen": -2.0945863723754883, + "logits/rejected": -2.08369517326355, + "logps/chosen": -489.478515625, + "logps/rejected": -371.29058837890625, + "loss": 0.6772, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8564751744270325, + "rewards/margins": 2.305636405944824, + "rewards/rejected": -3.162111282348633, + "step": 6763 + }, + { + "epoch": 0.78, + "learning_rate": 6.707245698232471e-08, + "logits/chosen": -2.6738686561584473, + "logits/rejected": -2.5709304809570312, + "logps/chosen": -203.7376708984375, + "logps/rejected": -304.89239501953125, + "loss": 0.1866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5141488909721375, + "rewards/margins": 4.311595916748047, + "rewards/rejected": -4.82574462890625, + "step": 6764 + }, + { + "epoch": 0.78, + "learning_rate": 6.703734051270045e-08, + "logits/chosen": -2.2680013179779053, + "logits/rejected": -2.2800562381744385, + "logps/chosen": -475.9147033691406, + "logps/rejected": -494.0921630859375, + "loss": 0.4331, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7031159996986389, + "rewards/margins": 3.7422990798950195, + "rewards/rejected": -4.445415496826172, + "step": 6765 + }, + { + "epoch": 0.78, + "learning_rate": 6.70022240430762e-08, + "logits/chosen": -2.4707326889038086, + "logits/rejected": -2.695087194442749, + "logps/chosen": -336.06268310546875, + "logps/rejected": -200.15663146972656, + "loss": 1.3773, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.055373191833496, + "rewards/margins": 0.050832346081733704, + "rewards/rejected": -2.1062052249908447, + "step": 6766 + }, + { + "epoch": 0.78, + "learning_rate": 6.696710757345194e-08, + "logits/chosen": -2.60652756690979, + "logits/rejected": -2.6623077392578125, + "logps/chosen": -339.21319580078125, + "logps/rejected": -263.388427734375, + "loss": 0.4346, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7031300067901611, + "rewards/margins": 2.076204538345337, + "rewards/rejected": -3.779334545135498, + "step": 6767 + }, + { + "epoch": 0.78, + "learning_rate": 6.69319911038277e-08, + "logits/chosen": -2.627056837081909, + "logits/rejected": -2.5336861610412598, + "logps/chosen": -216.57086181640625, + "logps/rejected": -183.80751037597656, + "loss": 0.7149, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2248358726501465, + "rewards/margins": 1.3720977306365967, + "rewards/rejected": -3.596933364868164, + "step": 6768 + }, + { + "epoch": 0.78, + "learning_rate": 6.689687463420344e-08, + "logits/chosen": -2.4166347980499268, + "logits/rejected": -2.4694600105285645, + "logps/chosen": -451.06536865234375, + "logps/rejected": -315.8016662597656, + "loss": 0.5307, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0778346061706543, + "rewards/margins": 0.9430211782455444, + "rewards/rejected": -2.020855665206909, + "step": 6769 + }, + { + "epoch": 0.78, + "learning_rate": 6.686175816457918e-08, + "logits/chosen": -1.7808895111083984, + "logits/rejected": -2.0010509490966797, + "logps/chosen": -273.568115234375, + "logps/rejected": -268.51116943359375, + "loss": 0.541, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4335670471191406, + "rewards/margins": 0.7033036947250366, + "rewards/rejected": -2.136870861053467, + "step": 6770 + }, + { + "epoch": 0.78, + "learning_rate": 6.682664169495493e-08, + "logits/chosen": -2.5190796852111816, + "logits/rejected": -2.516016960144043, + "logps/chosen": -155.55191040039062, + "logps/rejected": -178.68014526367188, + "loss": 0.523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7718937397003174, + "rewards/margins": 0.9838066697120667, + "rewards/rejected": -1.7557003498077393, + "step": 6771 + }, + { + "epoch": 0.78, + "learning_rate": 6.679152522533067e-08, + "logits/chosen": -2.283121109008789, + "logits/rejected": -1.9472428560256958, + "logps/chosen": -215.85305786132812, + "logps/rejected": -282.6402282714844, + "loss": 0.3425, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9406121969223022, + "rewards/margins": 1.473179578781128, + "rewards/rejected": -2.4137916564941406, + "step": 6772 + }, + { + "epoch": 0.78, + "learning_rate": 6.675640875570641e-08, + "logits/chosen": -2.288024425506592, + "logits/rejected": -2.3393540382385254, + "logps/chosen": -243.01585388183594, + "logps/rejected": -202.59898376464844, + "loss": 0.7595, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5901129245758057, + "rewards/margins": 2.15895414352417, + "rewards/rejected": -3.7490670680999756, + "step": 6773 + }, + { + "epoch": 0.78, + "learning_rate": 6.672129228608217e-08, + "logits/chosen": -2.1407065391540527, + "logits/rejected": -2.159972906112671, + "logps/chosen": -290.40924072265625, + "logps/rejected": -360.22210693359375, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9491012096405029, + "rewards/margins": 1.617375373840332, + "rewards/rejected": -2.566476821899414, + "step": 6774 + }, + { + "epoch": 0.78, + "learning_rate": 6.668617581645791e-08, + "logits/chosen": -2.6825766563415527, + "logits/rejected": -2.467323064804077, + "logps/chosen": -185.65261840820312, + "logps/rejected": -213.68675231933594, + "loss": 0.4813, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9815826416015625, + "rewards/margins": 1.582878589630127, + "rewards/rejected": -2.5644612312316895, + "step": 6775 + }, + { + "epoch": 0.78, + "learning_rate": 6.665105934683366e-08, + "logits/chosen": -2.601382255554199, + "logits/rejected": -2.66544246673584, + "logps/chosen": -234.02471923828125, + "logps/rejected": -184.940185546875, + "loss": 0.3165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6036818623542786, + "rewards/margins": 2.0944695472717285, + "rewards/rejected": -2.6981515884399414, + "step": 6776 + }, + { + "epoch": 0.78, + "learning_rate": 6.66159428772094e-08, + "logits/chosen": -2.4188427925109863, + "logits/rejected": -2.6484527587890625, + "logps/chosen": -306.64459228515625, + "logps/rejected": -273.6948547363281, + "loss": 0.2824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7049412727355957, + "rewards/margins": 2.0789570808410645, + "rewards/rejected": -2.78389835357666, + "step": 6777 + }, + { + "epoch": 0.78, + "learning_rate": 6.658082640758516e-08, + "logits/chosen": -1.9719576835632324, + "logits/rejected": -2.1610329151153564, + "logps/chosen": -293.5367736816406, + "logps/rejected": -259.7406005859375, + "loss": 1.0076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6181349158287048, + "rewards/margins": 2.3515548706054688, + "rewards/rejected": -2.9696898460388184, + "step": 6778 + }, + { + "epoch": 0.78, + "learning_rate": 6.65457099379609e-08, + "logits/chosen": -2.2257375717163086, + "logits/rejected": -2.036381959915161, + "logps/chosen": -237.18875122070312, + "logps/rejected": -325.569580078125, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6959257125854492, + "rewards/margins": 2.919947624206543, + "rewards/rejected": -3.615873336791992, + "step": 6779 + }, + { + "epoch": 0.78, + "learning_rate": 6.651059346833665e-08, + "logits/chosen": -2.8000502586364746, + "logits/rejected": -2.6132843494415283, + "logps/chosen": -310.22235107421875, + "logps/rejected": -278.76629638671875, + "loss": 0.1579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9703480005264282, + "rewards/margins": 2.4926693439483643, + "rewards/rejected": -3.463017225265503, + "step": 6780 + }, + { + "epoch": 0.78, + "learning_rate": 6.647547699871239e-08, + "logits/chosen": -2.607866048812866, + "logits/rejected": -2.7053604125976562, + "logps/chosen": -221.77098083496094, + "logps/rejected": -211.52182006835938, + "loss": 0.5192, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5397509336471558, + "rewards/margins": 1.957136631011963, + "rewards/rejected": -3.496887445449829, + "step": 6781 + }, + { + "epoch": 0.78, + "learning_rate": 6.644036052908815e-08, + "logits/chosen": -2.5807485580444336, + "logits/rejected": -2.830258369445801, + "logps/chosen": -346.17791748046875, + "logps/rejected": -223.7142791748047, + "loss": 0.4304, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0659537315368652, + "rewards/margins": 1.4967985153198242, + "rewards/rejected": -2.5627522468566895, + "step": 6782 + }, + { + "epoch": 0.78, + "learning_rate": 6.640524405946389e-08, + "logits/chosen": -1.5728838443756104, + "logits/rejected": -1.8070924282073975, + "logps/chosen": -329.43267822265625, + "logps/rejected": -307.4179382324219, + "loss": 0.5749, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4524281024932861, + "rewards/margins": 1.7923364639282227, + "rewards/rejected": -3.244764566421509, + "step": 6783 + }, + { + "epoch": 0.78, + "learning_rate": 6.637012758983963e-08, + "logits/chosen": -2.369983196258545, + "logits/rejected": -2.4812400341033936, + "logps/chosen": -279.7174377441406, + "logps/rejected": -267.1177062988281, + "loss": 0.6542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9046896696090698, + "rewards/margins": 1.1285772323608398, + "rewards/rejected": -2.03326678276062, + "step": 6784 + }, + { + "epoch": 0.78, + "learning_rate": 6.633501112021538e-08, + "logits/chosen": -2.6439661979675293, + "logits/rejected": -2.7908198833465576, + "logps/chosen": -161.96359252929688, + "logps/rejected": -236.6552734375, + "loss": 0.1327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3712241053581238, + "rewards/margins": 3.1658849716186523, + "rewards/rejected": -3.537109136581421, + "step": 6785 + }, + { + "epoch": 0.78, + "learning_rate": 6.629989465059112e-08, + "logits/chosen": -2.2999491691589355, + "logits/rejected": -2.2283339500427246, + "logps/chosen": -188.72544860839844, + "logps/rejected": -227.15415954589844, + "loss": 0.2759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8326164484024048, + "rewards/margins": 2.068042039871216, + "rewards/rejected": -2.900658369064331, + "step": 6786 + }, + { + "epoch": 0.78, + "learning_rate": 6.626477818096686e-08, + "logits/chosen": -2.431715965270996, + "logits/rejected": -2.4571220874786377, + "logps/chosen": -263.57806396484375, + "logps/rejected": -396.55255126953125, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5492641925811768, + "rewards/margins": 4.878589630126953, + "rewards/rejected": -5.427853584289551, + "step": 6787 + }, + { + "epoch": 0.78, + "learning_rate": 6.622966171134262e-08, + "logits/chosen": -2.5556631088256836, + "logits/rejected": -2.879878044128418, + "logps/chosen": -198.22027587890625, + "logps/rejected": -277.026123046875, + "loss": 0.5471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9067612290382385, + "rewards/margins": 1.7037497758865356, + "rewards/rejected": -2.61051082611084, + "step": 6788 + }, + { + "epoch": 0.78, + "learning_rate": 6.619454524171836e-08, + "logits/chosen": -2.0479695796966553, + "logits/rejected": -2.123567581176758, + "logps/chosen": -395.83221435546875, + "logps/rejected": -366.9845275878906, + "loss": 0.5825, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8108676075935364, + "rewards/margins": 1.7491034269332886, + "rewards/rejected": -2.5599710941314697, + "step": 6789 + }, + { + "epoch": 0.78, + "learning_rate": 6.615942877209411e-08, + "logits/chosen": -2.6478304862976074, + "logits/rejected": -2.867030620574951, + "logps/chosen": -127.4749755859375, + "logps/rejected": -167.6136474609375, + "loss": 0.3591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7514568567276001, + "rewards/margins": 2.579129695892334, + "rewards/rejected": -3.3305869102478027, + "step": 6790 + }, + { + "epoch": 0.78, + "learning_rate": 6.612431230246985e-08, + "logits/chosen": -1.858854055404663, + "logits/rejected": -2.2152907848358154, + "logps/chosen": -244.54669189453125, + "logps/rejected": -231.148193359375, + "loss": 0.6029, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1451268196105957, + "rewards/margins": 1.766560673713684, + "rewards/rejected": -2.9116876125335693, + "step": 6791 + }, + { + "epoch": 0.78, + "learning_rate": 6.60891958328456e-08, + "logits/chosen": -2.129737377166748, + "logits/rejected": -2.2087056636810303, + "logps/chosen": -416.76104736328125, + "logps/rejected": -456.5767822265625, + "loss": 0.3312, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5949267745018005, + "rewards/margins": 2.0101327896118164, + "rewards/rejected": -2.6050596237182617, + "step": 6792 + }, + { + "epoch": 0.78, + "learning_rate": 6.605407936322135e-08, + "logits/chosen": -2.4513185024261475, + "logits/rejected": -2.4295406341552734, + "logps/chosen": -440.2815856933594, + "logps/rejected": -393.01361083984375, + "loss": 0.4574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9690696001052856, + "rewards/margins": 1.7270572185516357, + "rewards/rejected": -2.696126699447632, + "step": 6793 + }, + { + "epoch": 0.78, + "learning_rate": 6.60189628935971e-08, + "logits/chosen": -1.8550225496292114, + "logits/rejected": -2.1323044300079346, + "logps/chosen": -394.95294189453125, + "logps/rejected": -283.1553955078125, + "loss": 0.8319, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6435580253601074, + "rewards/margins": 0.3194997012615204, + "rewards/rejected": -1.9630576372146606, + "step": 6794 + }, + { + "epoch": 0.78, + "learning_rate": 6.598384642397284e-08, + "logits/chosen": -2.2899467945098877, + "logits/rejected": -2.110233783721924, + "logps/chosen": -219.33274841308594, + "logps/rejected": -334.7505187988281, + "loss": 0.3736, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39988672733306885, + "rewards/margins": 2.3014886379241943, + "rewards/rejected": -2.7013754844665527, + "step": 6795 + }, + { + "epoch": 0.78, + "learning_rate": 6.59487299543486e-08, + "logits/chosen": -2.632880210876465, + "logits/rejected": -2.718238353729248, + "logps/chosen": -214.6731414794922, + "logps/rejected": -303.58758544921875, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3535502254962921, + "rewards/margins": 2.367724895477295, + "rewards/rejected": -2.7212750911712646, + "step": 6796 + }, + { + "epoch": 0.78, + "learning_rate": 6.591361348472433e-08, + "logits/chosen": -2.4504032135009766, + "logits/rejected": -2.6165547370910645, + "logps/chosen": -299.236328125, + "logps/rejected": -229.42076110839844, + "loss": 0.2939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5733634829521179, + "rewards/margins": 2.281489133834839, + "rewards/rejected": -2.8548526763916016, + "step": 6797 + }, + { + "epoch": 0.78, + "learning_rate": 6.587849701510009e-08, + "logits/chosen": -2.6449739933013916, + "logits/rejected": -2.618968963623047, + "logps/chosen": -194.58843994140625, + "logps/rejected": -208.524658203125, + "loss": 0.4716, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0409224033355713, + "rewards/margins": 1.7705823183059692, + "rewards/rejected": -2.81150484085083, + "step": 6798 + }, + { + "epoch": 0.78, + "learning_rate": 6.584338054547583e-08, + "logits/chosen": -2.1908280849456787, + "logits/rejected": -2.297477960586548, + "logps/chosen": -349.4020080566406, + "logps/rejected": -310.21630859375, + "loss": 0.4219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7595160007476807, + "rewards/margins": 1.814941167831421, + "rewards/rejected": -2.5744571685791016, + "step": 6799 + }, + { + "epoch": 0.78, + "learning_rate": 6.580826407585157e-08, + "logits/chosen": -3.01666259765625, + "logits/rejected": -3.0405287742614746, + "logps/chosen": -246.61863708496094, + "logps/rejected": -319.7878723144531, + "loss": 0.2424, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2549418210983276, + "rewards/margins": 3.731464147567749, + "rewards/rejected": -4.986405849456787, + "step": 6800 + }, + { + "epoch": 0.78, + "learning_rate": 6.577314760622731e-08, + "logits/chosen": -2.414414644241333, + "logits/rejected": -2.7276084423065186, + "logps/chosen": -275.30255126953125, + "logps/rejected": -274.4233703613281, + "loss": 0.2491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5587469935417175, + "rewards/margins": 3.147867202758789, + "rewards/rejected": -3.7066142559051514, + "step": 6801 + }, + { + "epoch": 0.78, + "learning_rate": 6.573803113660306e-08, + "logits/chosen": -2.633885383605957, + "logits/rejected": -2.345235824584961, + "logps/chosen": -164.71096801757812, + "logps/rejected": -201.78366088867188, + "loss": 0.1624, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8660491704940796, + "rewards/margins": 2.498040199279785, + "rewards/rejected": -3.364089250564575, + "step": 6802 + }, + { + "epoch": 0.78, + "learning_rate": 6.57029146669788e-08, + "logits/chosen": -2.690528392791748, + "logits/rejected": -2.7245538234710693, + "logps/chosen": -132.86856079101562, + "logps/rejected": -193.3789520263672, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.772701621055603, + "rewards/margins": 2.325575828552246, + "rewards/rejected": -3.0982775688171387, + "step": 6803 + }, + { + "epoch": 0.78, + "learning_rate": 6.566779819735456e-08, + "logits/chosen": -1.9803295135498047, + "logits/rejected": -2.2819485664367676, + "logps/chosen": -436.168212890625, + "logps/rejected": -186.91065979003906, + "loss": 0.4137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7849528789520264, + "rewards/margins": 1.1400883197784424, + "rewards/rejected": -1.9250411987304688, + "step": 6804 + }, + { + "epoch": 0.78, + "learning_rate": 6.56326817277303e-08, + "logits/chosen": -2.44053053855896, + "logits/rejected": -2.423001527786255, + "logps/chosen": -262.9609375, + "logps/rejected": -194.73268127441406, + "loss": 1.0019, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.795204758644104, + "rewards/margins": 0.6542066931724548, + "rewards/rejected": -2.449411392211914, + "step": 6805 + }, + { + "epoch": 0.78, + "learning_rate": 6.559756525810605e-08, + "logits/chosen": -2.1194980144500732, + "logits/rejected": -2.526887893676758, + "logps/chosen": -435.6983337402344, + "logps/rejected": -271.51727294921875, + "loss": 0.4612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8332343697547913, + "rewards/margins": 1.4786070585250854, + "rewards/rejected": -2.3118412494659424, + "step": 6806 + }, + { + "epoch": 0.78, + "learning_rate": 6.55624487884818e-08, + "logits/chosen": -2.840635061264038, + "logits/rejected": -2.7609715461730957, + "logps/chosen": -160.47560119628906, + "logps/rejected": -294.90960693359375, + "loss": 0.2144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42821693420410156, + "rewards/margins": 2.6196584701538086, + "rewards/rejected": -3.0478756427764893, + "step": 6807 + }, + { + "epoch": 0.78, + "learning_rate": 6.552733231885755e-08, + "logits/chosen": -2.370821475982666, + "logits/rejected": -2.510779857635498, + "logps/chosen": -158.31326293945312, + "logps/rejected": -185.69964599609375, + "loss": 0.2342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19807982444763184, + "rewards/margins": 2.9066529273986816, + "rewards/rejected": -3.1047329902648926, + "step": 6808 + }, + { + "epoch": 0.78, + "learning_rate": 6.549221584923329e-08, + "logits/chosen": -2.2150158882141113, + "logits/rejected": -2.5084586143493652, + "logps/chosen": -367.19744873046875, + "logps/rejected": -362.8836669921875, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5443204641342163, + "rewards/margins": 4.466496467590332, + "rewards/rejected": -5.010817050933838, + "step": 6809 + }, + { + "epoch": 0.79, + "learning_rate": 6.545709937960904e-08, + "logits/chosen": -2.3195881843566895, + "logits/rejected": -2.156916618347168, + "logps/chosen": -197.03472900390625, + "logps/rejected": -354.0972900390625, + "loss": 0.2504, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6872421503067017, + "rewards/margins": 4.046545028686523, + "rewards/rejected": -4.7337870597839355, + "step": 6810 + }, + { + "epoch": 0.79, + "learning_rate": 6.542198290998478e-08, + "logits/chosen": -2.377964973449707, + "logits/rejected": -2.270233154296875, + "logps/chosen": -349.19775390625, + "logps/rejected": -394.40863037109375, + "loss": 1.3352, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.310338020324707, + "rewards/margins": 1.0567057132720947, + "rewards/rejected": -2.3670437335968018, + "step": 6811 + }, + { + "epoch": 0.79, + "learning_rate": 6.538686644036054e-08, + "logits/chosen": -2.422901153564453, + "logits/rejected": -2.2961535453796387, + "logps/chosen": -205.50009155273438, + "logps/rejected": -271.65765380859375, + "loss": 0.6314, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.117859363555908, + "rewards/margins": 1.093971610069275, + "rewards/rejected": -3.2118310928344727, + "step": 6812 + }, + { + "epoch": 0.79, + "learning_rate": 6.535174997073628e-08, + "logits/chosen": -2.564755439758301, + "logits/rejected": -2.408167600631714, + "logps/chosen": -275.1306457519531, + "logps/rejected": -228.74917602539062, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5148214101791382, + "rewards/margins": 1.9296672344207764, + "rewards/rejected": -2.444488763809204, + "step": 6813 + }, + { + "epoch": 0.79, + "learning_rate": 6.531663350111202e-08, + "logits/chosen": -2.029031753540039, + "logits/rejected": -2.2683656215667725, + "logps/chosen": -461.0339050292969, + "logps/rejected": -306.6171569824219, + "loss": 0.3915, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0473973751068115, + "rewards/margins": 1.4416292905807495, + "rewards/rejected": -2.4890265464782715, + "step": 6814 + }, + { + "epoch": 0.79, + "learning_rate": 6.528151703148777e-08, + "logits/chosen": -2.333099842071533, + "logits/rejected": -2.1921796798706055, + "logps/chosen": -177.70770263671875, + "logps/rejected": -195.26622009277344, + "loss": 0.3916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39783722162246704, + "rewards/margins": 1.548140287399292, + "rewards/rejected": -1.9459775686264038, + "step": 6815 + }, + { + "epoch": 0.79, + "learning_rate": 6.524640056186351e-08, + "logits/chosen": -1.8335320949554443, + "logits/rejected": -2.005237102508545, + "logps/chosen": -281.1922607421875, + "logps/rejected": -277.9041442871094, + "loss": 0.5892, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8489654064178467, + "rewards/margins": 1.429627537727356, + "rewards/rejected": -2.278593063354492, + "step": 6816 + }, + { + "epoch": 0.79, + "learning_rate": 6.521128409223925e-08, + "logits/chosen": -2.308351516723633, + "logits/rejected": -2.4641335010528564, + "logps/chosen": -331.9294128417969, + "logps/rejected": -282.03173828125, + "loss": 0.5632, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.37554132938385, + "rewards/margins": 1.157702922821045, + "rewards/rejected": -2.5332443714141846, + "step": 6817 + }, + { + "epoch": 0.79, + "learning_rate": 6.5176167622615e-08, + "logits/chosen": -2.6345221996307373, + "logits/rejected": -2.3743748664855957, + "logps/chosen": -404.1719970703125, + "logps/rejected": -375.53936767578125, + "loss": 0.1674, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.15598464012146, + "rewards/margins": 2.79022479057312, + "rewards/rejected": -3.94620943069458, + "step": 6818 + }, + { + "epoch": 0.79, + "learning_rate": 6.514105115299075e-08, + "logits/chosen": -2.1076221466064453, + "logits/rejected": -1.9791494607925415, + "logps/chosen": -276.4073486328125, + "logps/rejected": -338.9691162109375, + "loss": 0.1217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4180932939052582, + "rewards/margins": 3.7584335803985596, + "rewards/rejected": -4.1765265464782715, + "step": 6819 + }, + { + "epoch": 0.79, + "learning_rate": 6.510593468336649e-08, + "logits/chosen": -2.310253143310547, + "logits/rejected": -2.2261106967926025, + "logps/chosen": -191.516357421875, + "logps/rejected": -293.79681396484375, + "loss": 0.2871, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1884664297103882, + "rewards/margins": 2.6409661769866943, + "rewards/rejected": -3.829432487487793, + "step": 6820 + }, + { + "epoch": 0.79, + "learning_rate": 6.507081821374224e-08, + "logits/chosen": -2.4626314640045166, + "logits/rejected": -2.574373722076416, + "logps/chosen": -149.79049682617188, + "logps/rejected": -177.18678283691406, + "loss": 0.7905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9192783236503601, + "rewards/margins": 1.8702671527862549, + "rewards/rejected": -2.7895452976226807, + "step": 6821 + }, + { + "epoch": 0.79, + "learning_rate": 6.503570174411798e-08, + "logits/chosen": -2.3150784969329834, + "logits/rejected": -2.411648750305176, + "logps/chosen": -305.6068115234375, + "logps/rejected": -258.31396484375, + "loss": 1.066, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.5867841243743896, + "rewards/margins": 1.3947479724884033, + "rewards/rejected": -3.981532335281372, + "step": 6822 + }, + { + "epoch": 0.79, + "learning_rate": 6.500058527449374e-08, + "logits/chosen": -1.9190587997436523, + "logits/rejected": -2.3275065422058105, + "logps/chosen": -462.8649597167969, + "logps/rejected": -257.3349304199219, + "loss": 0.337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9986214637756348, + "rewards/margins": 1.9594573974609375, + "rewards/rejected": -2.9580788612365723, + "step": 6823 + }, + { + "epoch": 0.79, + "learning_rate": 6.496546880486948e-08, + "logits/chosen": -2.427703380584717, + "logits/rejected": -2.2299232482910156, + "logps/chosen": -215.10244750976562, + "logps/rejected": -243.4240264892578, + "loss": 0.4565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3598549962043762, + "rewards/margins": 2.00165057182312, + "rewards/rejected": -2.3615055084228516, + "step": 6824 + }, + { + "epoch": 0.79, + "learning_rate": 6.493035233524523e-08, + "logits/chosen": -2.1420722007751465, + "logits/rejected": -2.3309669494628906, + "logps/chosen": -337.75738525390625, + "logps/rejected": -336.37457275390625, + "loss": 0.4139, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0740549564361572, + "rewards/margins": 1.8362916707992554, + "rewards/rejected": -2.910346508026123, + "step": 6825 + }, + { + "epoch": 0.79, + "learning_rate": 6.489523586562097e-08, + "logits/chosen": -1.9277005195617676, + "logits/rejected": -2.1954822540283203, + "logps/chosen": -293.2989501953125, + "logps/rejected": -252.59158325195312, + "loss": 0.7789, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0362436771392822, + "rewards/margins": 1.519094467163086, + "rewards/rejected": -3.555338144302368, + "step": 6826 + }, + { + "epoch": 0.79, + "learning_rate": 6.486011939599673e-08, + "logits/chosen": -2.3743696212768555, + "logits/rejected": -2.612933397293091, + "logps/chosen": -274.4525451660156, + "logps/rejected": -350.62359619140625, + "loss": 0.1631, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0627318620681763, + "rewards/margins": 2.9363229274749756, + "rewards/rejected": -3.9990549087524414, + "step": 6827 + }, + { + "epoch": 0.79, + "learning_rate": 6.482500292637247e-08, + "logits/chosen": -1.9750678539276123, + "logits/rejected": -2.1997618675231934, + "logps/chosen": -165.7993927001953, + "logps/rejected": -203.3197021484375, + "loss": 0.4468, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0829401016235352, + "rewards/margins": 2.762280225753784, + "rewards/rejected": -3.8452203273773193, + "step": 6828 + }, + { + "epoch": 0.79, + "learning_rate": 6.478988645674822e-08, + "logits/chosen": -1.974446177482605, + "logits/rejected": -2.358427047729492, + "logps/chosen": -312.3114013671875, + "logps/rejected": -241.9840087890625, + "loss": 0.2146, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013189882040023804, + "rewards/margins": 2.3445987701416016, + "rewards/rejected": -2.331408739089966, + "step": 6829 + }, + { + "epoch": 0.79, + "learning_rate": 6.475476998712396e-08, + "logits/chosen": -2.3530309200286865, + "logits/rejected": -2.571535348892212, + "logps/chosen": -416.4429016113281, + "logps/rejected": -387.11114501953125, + "loss": 0.4774, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1344430446624756, + "rewards/margins": 1.4807789325714111, + "rewards/rejected": -2.6152219772338867, + "step": 6830 + }, + { + "epoch": 0.79, + "learning_rate": 6.47196535174997e-08, + "logits/chosen": -2.542938709259033, + "logits/rejected": -2.1237635612487793, + "logps/chosen": -211.597900390625, + "logps/rejected": -362.9634704589844, + "loss": 0.4122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7042100429534912, + "rewards/margins": 1.692026138305664, + "rewards/rejected": -2.3962361812591553, + "step": 6831 + }, + { + "epoch": 0.79, + "learning_rate": 6.468453704787545e-08, + "logits/chosen": -2.0462450981140137, + "logits/rejected": -2.2375783920288086, + "logps/chosen": -386.71844482421875, + "logps/rejected": -428.4761657714844, + "loss": 0.3629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1949161291122437, + "rewards/margins": 2.329390525817871, + "rewards/rejected": -3.524306297302246, + "step": 6832 + }, + { + "epoch": 0.79, + "learning_rate": 6.46494205782512e-08, + "logits/chosen": -2.0522820949554443, + "logits/rejected": -1.947401762008667, + "logps/chosen": -330.91363525390625, + "logps/rejected": -294.4639892578125, + "loss": 0.4545, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8996899127960205, + "rewards/margins": 1.784024953842163, + "rewards/rejected": -2.6837148666381836, + "step": 6833 + }, + { + "epoch": 0.79, + "learning_rate": 6.461430410862694e-08, + "logits/chosen": -2.075944662094116, + "logits/rejected": -2.2201309204101562, + "logps/chosen": -293.6044006347656, + "logps/rejected": -239.86990356445312, + "loss": 0.7585, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8055740594863892, + "rewards/margins": 1.0578060150146484, + "rewards/rejected": -2.863380193710327, + "step": 6834 + }, + { + "epoch": 0.79, + "learning_rate": 6.457918763900269e-08, + "logits/chosen": -2.471877098083496, + "logits/rejected": -2.4153575897216797, + "logps/chosen": -164.34921264648438, + "logps/rejected": -160.5894012451172, + "loss": 0.4078, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0895023345947266, + "rewards/margins": 2.1414265632629395, + "rewards/rejected": -3.230928897857666, + "step": 6835 + }, + { + "epoch": 0.79, + "learning_rate": 6.454407116937843e-08, + "logits/chosen": -2.41898512840271, + "logits/rejected": -2.6531333923339844, + "logps/chosen": -312.7174072265625, + "logps/rejected": -209.5396728515625, + "loss": 0.4553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9965577125549316, + "rewards/margins": 1.9555190801620483, + "rewards/rejected": -2.9520769119262695, + "step": 6836 + }, + { + "epoch": 0.79, + "learning_rate": 6.450895469975418e-08, + "logits/chosen": -2.5791001319885254, + "logits/rejected": -2.7482657432556152, + "logps/chosen": -373.66790771484375, + "logps/rejected": -262.7955322265625, + "loss": 0.7128, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5181938409805298, + "rewards/margins": 0.8524351119995117, + "rewards/rejected": -2.370629072189331, + "step": 6837 + }, + { + "epoch": 0.79, + "learning_rate": 6.447383823012992e-08, + "logits/chosen": -1.7851808071136475, + "logits/rejected": -1.7632691860198975, + "logps/chosen": -802.8873291015625, + "logps/rejected": -452.00238037109375, + "loss": 0.2785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9930119514465332, + "rewards/margins": 2.506011724472046, + "rewards/rejected": -3.499023675918579, + "step": 6838 + }, + { + "epoch": 0.79, + "learning_rate": 6.443872176050568e-08, + "logits/chosen": -2.3639183044433594, + "logits/rejected": -2.375178813934326, + "logps/chosen": -264.9368591308594, + "logps/rejected": -229.76292419433594, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3064592480659485, + "rewards/margins": 3.2044870853424072, + "rewards/rejected": -3.510946273803711, + "step": 6839 + }, + { + "epoch": 0.79, + "learning_rate": 6.440360529088142e-08, + "logits/chosen": -2.352050542831421, + "logits/rejected": -2.2617876529693604, + "logps/chosen": -318.64544677734375, + "logps/rejected": -343.67547607421875, + "loss": 0.2753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35248616337776184, + "rewards/margins": 3.182133674621582, + "rewards/rejected": -3.5346200466156006, + "step": 6840 + }, + { + "epoch": 0.79, + "learning_rate": 6.436848882125717e-08, + "logits/chosen": -2.241523027420044, + "logits/rejected": -2.2472829818725586, + "logps/chosen": -188.7084197998047, + "logps/rejected": -137.16224670410156, + "loss": 0.2698, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1981644630432129, + "rewards/margins": 1.9589509963989258, + "rewards/rejected": -2.1571154594421387, + "step": 6841 + }, + { + "epoch": 0.79, + "learning_rate": 6.433337235163291e-08, + "logits/chosen": -1.9848047494888306, + "logits/rejected": -2.12013578414917, + "logps/chosen": -255.08023071289062, + "logps/rejected": -310.8824462890625, + "loss": 0.5284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6529523134231567, + "rewards/margins": 1.7044692039489746, + "rewards/rejected": -2.357421875, + "step": 6842 + }, + { + "epoch": 0.79, + "learning_rate": 6.429825588200867e-08, + "logits/chosen": -2.328629732131958, + "logits/rejected": -2.41884446144104, + "logps/chosen": -555.843017578125, + "logps/rejected": -631.0687866210938, + "loss": 0.1603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9164315462112427, + "rewards/margins": 2.746680974960327, + "rewards/rejected": -3.6631126403808594, + "step": 6843 + }, + { + "epoch": 0.79, + "learning_rate": 6.426313941238441e-08, + "logits/chosen": -2.7623915672302246, + "logits/rejected": -2.6425015926361084, + "logps/chosen": -233.03855895996094, + "logps/rejected": -261.3377685546875, + "loss": 0.3897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8095080852508545, + "rewards/margins": 1.8139317035675049, + "rewards/rejected": -2.6234397888183594, + "step": 6844 + }, + { + "epoch": 0.79, + "learning_rate": 6.422802294276015e-08, + "logits/chosen": -2.29732608795166, + "logits/rejected": -2.4286327362060547, + "logps/chosen": -452.2447509765625, + "logps/rejected": -274.87188720703125, + "loss": 0.7316, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5810626745224, + "rewards/margins": 0.36363816261291504, + "rewards/rejected": -1.944700837135315, + "step": 6845 + }, + { + "epoch": 0.79, + "learning_rate": 6.41929064731359e-08, + "logits/chosen": -2.522590398788452, + "logits/rejected": -2.6361217498779297, + "logps/chosen": -212.847412109375, + "logps/rejected": -272.34210205078125, + "loss": 0.3113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1234307810664177, + "rewards/margins": 2.5776185989379883, + "rewards/rejected": -2.701049566268921, + "step": 6846 + }, + { + "epoch": 0.79, + "learning_rate": 6.415779000351164e-08, + "logits/chosen": -2.561873435974121, + "logits/rejected": -2.438279151916504, + "logps/chosen": -246.45062255859375, + "logps/rejected": -171.06649780273438, + "loss": 0.3219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3283289074897766, + "rewards/margins": 1.9808437824249268, + "rewards/rejected": -2.3091726303100586, + "step": 6847 + }, + { + "epoch": 0.79, + "learning_rate": 6.412267353388738e-08, + "logits/chosen": -2.3890380859375, + "logits/rejected": -2.456540822982788, + "logps/chosen": -253.11215209960938, + "logps/rejected": -220.4935302734375, + "loss": 0.5184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.852449357509613, + "rewards/margins": 2.0934383869171143, + "rewards/rejected": -2.945887804031372, + "step": 6848 + }, + { + "epoch": 0.79, + "learning_rate": 6.408755706426314e-08, + "logits/chosen": -2.405627727508545, + "logits/rejected": -2.3335044384002686, + "logps/chosen": -339.4437561035156, + "logps/rejected": -346.9122619628906, + "loss": 0.763, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3032546043395996, + "rewards/margins": 1.8179336786270142, + "rewards/rejected": -3.1211884021759033, + "step": 6849 + }, + { + "epoch": 0.79, + "learning_rate": 6.405244059463888e-08, + "logits/chosen": -2.7865729331970215, + "logits/rejected": -2.6825437545776367, + "logps/chosen": -174.97384643554688, + "logps/rejected": -217.71876525878906, + "loss": 0.2096, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3188386559486389, + "rewards/margins": 2.4356346130371094, + "rewards/rejected": -2.7544734477996826, + "step": 6850 + }, + { + "epoch": 0.79, + "learning_rate": 6.401732412501463e-08, + "logits/chosen": -1.980782151222229, + "logits/rejected": -1.9559051990509033, + "logps/chosen": -188.8347930908203, + "logps/rejected": -276.75830078125, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5051185488700867, + "rewards/margins": 3.5288565158843994, + "rewards/rejected": -4.033975124359131, + "step": 6851 + }, + { + "epoch": 0.79, + "learning_rate": 6.398220765539037e-08, + "logits/chosen": -2.476848602294922, + "logits/rejected": -2.473994493484497, + "logps/chosen": -321.37237548828125, + "logps/rejected": -245.04705810546875, + "loss": 0.2076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1710931062698364, + "rewards/margins": 2.1021790504455566, + "rewards/rejected": -3.2732722759246826, + "step": 6852 + }, + { + "epoch": 0.79, + "learning_rate": 6.394709118576613e-08, + "logits/chosen": -2.2122044563293457, + "logits/rejected": -2.1674370765686035, + "logps/chosen": -245.7066192626953, + "logps/rejected": -301.84088134765625, + "loss": 0.3985, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3319779634475708, + "rewards/margins": 3.184788703918457, + "rewards/rejected": -4.516766548156738, + "step": 6853 + }, + { + "epoch": 0.79, + "learning_rate": 6.391197471614187e-08, + "logits/chosen": -2.5855062007904053, + "logits/rejected": -2.76871657371521, + "logps/chosen": -115.5383071899414, + "logps/rejected": -102.14781188964844, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.201623797416687, + "rewards/margins": 0.6798991560935974, + "rewards/rejected": -1.8815228939056396, + "step": 6854 + }, + { + "epoch": 0.79, + "learning_rate": 6.387685824651762e-08, + "logits/chosen": -2.19474196434021, + "logits/rejected": -2.095452070236206, + "logps/chosen": -157.486083984375, + "logps/rejected": -270.01275634765625, + "loss": 0.8189, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1672322750091553, + "rewards/margins": 3.260958194732666, + "rewards/rejected": -5.4281907081604, + "step": 6855 + }, + { + "epoch": 0.79, + "learning_rate": 6.384174177689336e-08, + "logits/chosen": -2.2883403301239014, + "logits/rejected": -2.3432774543762207, + "logps/chosen": -409.0180969238281, + "logps/rejected": -288.69818115234375, + "loss": 0.5497, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.258650779724121, + "rewards/margins": 1.0020880699157715, + "rewards/rejected": -2.2607388496398926, + "step": 6856 + }, + { + "epoch": 0.79, + "learning_rate": 6.380662530726912e-08, + "logits/chosen": -2.0561225414276123, + "logits/rejected": -2.0925235748291016, + "logps/chosen": -260.0471496582031, + "logps/rejected": -261.42242431640625, + "loss": 0.3127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4454728066921234, + "rewards/margins": 2.4799437522888184, + "rewards/rejected": -2.9254167079925537, + "step": 6857 + }, + { + "epoch": 0.79, + "learning_rate": 6.377150883764486e-08, + "logits/chosen": -2.8362350463867188, + "logits/rejected": -2.8313217163085938, + "logps/chosen": -431.5497741699219, + "logps/rejected": -430.9704284667969, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1349737644195557, + "rewards/margins": 4.2230448722839355, + "rewards/rejected": -5.358018398284912, + "step": 6858 + }, + { + "epoch": 0.79, + "learning_rate": 6.37363923680206e-08, + "logits/chosen": -1.941334843635559, + "logits/rejected": -2.0553860664367676, + "logps/chosen": -290.3048400878906, + "logps/rejected": -317.4251708984375, + "loss": 0.5043, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8443710803985596, + "rewards/margins": 1.4701385498046875, + "rewards/rejected": -3.314509868621826, + "step": 6859 + }, + { + "epoch": 0.79, + "learning_rate": 6.370127589839635e-08, + "logits/chosen": -2.4433717727661133, + "logits/rejected": -2.543614387512207, + "logps/chosen": -179.03347778320312, + "logps/rejected": -228.56668090820312, + "loss": 1.2302, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9606106281280518, + "rewards/margins": 0.49325740337371826, + "rewards/rejected": -2.4538679122924805, + "step": 6860 + }, + { + "epoch": 0.79, + "learning_rate": 6.366615942877209e-08, + "logits/chosen": -2.0535523891448975, + "logits/rejected": -1.9469530582427979, + "logps/chosen": -385.1776428222656, + "logps/rejected": -351.9556579589844, + "loss": 0.2766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1757131814956665, + "rewards/margins": 4.023181438446045, + "rewards/rejected": -5.198894500732422, + "step": 6861 + }, + { + "epoch": 0.79, + "learning_rate": 6.363104295914783e-08, + "logits/chosen": -2.002330780029297, + "logits/rejected": -2.064260721206665, + "logps/chosen": -468.7704772949219, + "logps/rejected": -334.62432861328125, + "loss": 0.2664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4013006091117859, + "rewards/margins": 2.224086284637451, + "rewards/rejected": -2.625386953353882, + "step": 6862 + }, + { + "epoch": 0.79, + "learning_rate": 6.359592648952359e-08, + "logits/chosen": -1.9053471088409424, + "logits/rejected": -2.232149124145508, + "logps/chosen": -327.67413330078125, + "logps/rejected": -251.69424438476562, + "loss": 0.5954, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.16550874710083, + "rewards/margins": 1.534170150756836, + "rewards/rejected": -2.699678897857666, + "step": 6863 + }, + { + "epoch": 0.79, + "learning_rate": 6.356081001989933e-08, + "logits/chosen": -2.653484344482422, + "logits/rejected": -2.5663256645202637, + "logps/chosen": -306.480224609375, + "logps/rejected": -244.6038055419922, + "loss": 0.3943, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8726951479911804, + "rewards/margins": 3.2201199531555176, + "rewards/rejected": -4.092815399169922, + "step": 6864 + }, + { + "epoch": 0.79, + "learning_rate": 6.352569355027507e-08, + "logits/chosen": -2.805711269378662, + "logits/rejected": -2.7897825241088867, + "logps/chosen": -293.3004455566406, + "logps/rejected": -362.9182434082031, + "loss": 0.3574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4653940796852112, + "rewards/margins": 2.420046091079712, + "rewards/rejected": -2.8854401111602783, + "step": 6865 + }, + { + "epoch": 0.79, + "learning_rate": 6.349057708065082e-08, + "logits/chosen": -2.381333112716675, + "logits/rejected": -2.268808603286743, + "logps/chosen": -200.94654846191406, + "logps/rejected": -227.01856994628906, + "loss": 0.7506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9685395359992981, + "rewards/margins": 2.006700038909912, + "rewards/rejected": -2.9752395153045654, + "step": 6866 + }, + { + "epoch": 0.79, + "learning_rate": 6.345546061102656e-08, + "logits/chosen": -1.3966131210327148, + "logits/rejected": -1.774094581604004, + "logps/chosen": -274.8880920410156, + "logps/rejected": -230.21701049804688, + "loss": 0.5916, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.160051703453064, + "rewards/margins": 0.9385606050491333, + "rewards/rejected": -2.0986123085021973, + "step": 6867 + }, + { + "epoch": 0.79, + "learning_rate": 6.342034414140232e-08, + "logits/chosen": -2.272901773452759, + "logits/rejected": -2.60251522064209, + "logps/chosen": -302.82470703125, + "logps/rejected": -307.0090026855469, + "loss": 0.5641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6641680002212524, + "rewards/margins": 1.6391286849975586, + "rewards/rejected": -2.3032968044281006, + "step": 6868 + }, + { + "epoch": 0.79, + "learning_rate": 6.338522767177806e-08, + "logits/chosen": -2.5315983295440674, + "logits/rejected": -2.74898362159729, + "logps/chosen": -224.58033752441406, + "logps/rejected": -223.52682495117188, + "loss": 0.7253, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1090835332870483, + "rewards/margins": 1.0236388444900513, + "rewards/rejected": -2.1327223777770996, + "step": 6869 + }, + { + "epoch": 0.79, + "learning_rate": 6.335011120215381e-08, + "logits/chosen": -2.344853401184082, + "logits/rejected": -2.477792739868164, + "logps/chosen": -375.9382019042969, + "logps/rejected": -437.7358703613281, + "loss": 0.1085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.296114981174469, + "rewards/margins": 4.086948394775391, + "rewards/rejected": -4.383063316345215, + "step": 6870 + }, + { + "epoch": 0.79, + "learning_rate": 6.331499473252955e-08, + "logits/chosen": -2.8248960971832275, + "logits/rejected": -2.928961753845215, + "logps/chosen": -197.8685302734375, + "logps/rejected": -211.15480041503906, + "loss": 0.3451, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5728464722633362, + "rewards/margins": 2.0828752517700195, + "rewards/rejected": -2.65572190284729, + "step": 6871 + }, + { + "epoch": 0.79, + "learning_rate": 6.32798782629053e-08, + "logits/chosen": -2.6072752475738525, + "logits/rejected": -2.5262198448181152, + "logps/chosen": -330.0268249511719, + "logps/rejected": -283.559814453125, + "loss": 0.2218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7718624472618103, + "rewards/margins": 2.0364339351654053, + "rewards/rejected": -2.8082964420318604, + "step": 6872 + }, + { + "epoch": 0.79, + "learning_rate": 6.324476179328105e-08, + "logits/chosen": -2.685004711151123, + "logits/rejected": -2.1042919158935547, + "logps/chosen": -183.63504028320312, + "logps/rejected": -366.6676025390625, + "loss": 0.1503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6141288876533508, + "rewards/margins": 3.460083484649658, + "rewards/rejected": -4.074212074279785, + "step": 6873 + }, + { + "epoch": 0.79, + "learning_rate": 6.32096453236568e-08, + "logits/chosen": -2.6815526485443115, + "logits/rejected": -2.621519088745117, + "logps/chosen": -323.91241455078125, + "logps/rejected": -331.79925537109375, + "loss": 0.5833, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6208256483078003, + "rewards/margins": 0.9754137396812439, + "rewards/rejected": -1.5962393283843994, + "step": 6874 + }, + { + "epoch": 0.79, + "learning_rate": 6.317452885403254e-08, + "logits/chosen": -1.827791452407837, + "logits/rejected": -1.9543964862823486, + "logps/chosen": -331.7216491699219, + "logps/rejected": -228.23263549804688, + "loss": 0.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21668128669261932, + "rewards/margins": 2.645789861679077, + "rewards/rejected": -2.4291086196899414, + "step": 6875 + }, + { + "epoch": 0.79, + "learning_rate": 6.313941238440828e-08, + "logits/chosen": -2.2843451499938965, + "logits/rejected": -2.3738770484924316, + "logps/chosen": -376.1451416015625, + "logps/rejected": -376.57904052734375, + "loss": 0.3032, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7141990661621094, + "rewards/margins": 1.664091944694519, + "rewards/rejected": -2.378290891647339, + "step": 6876 + }, + { + "epoch": 0.79, + "learning_rate": 6.310429591478403e-08, + "logits/chosen": -2.3193390369415283, + "logits/rejected": -2.5497288703918457, + "logps/chosen": -378.51470947265625, + "logps/rejected": -257.2939453125, + "loss": 0.3767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8351219892501831, + "rewards/margins": 2.1825335025787354, + "rewards/rejected": -3.017655372619629, + "step": 6877 + }, + { + "epoch": 0.79, + "learning_rate": 6.306917944515977e-08, + "logits/chosen": -2.158205032348633, + "logits/rejected": -2.167006015777588, + "logps/chosen": -314.0841979980469, + "logps/rejected": -196.7584686279297, + "loss": 0.8009, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8571518659591675, + "rewards/margins": 0.3502312898635864, + "rewards/rejected": -1.207383155822754, + "step": 6878 + }, + { + "epoch": 0.79, + "learning_rate": 6.303406297553552e-08, + "logits/chosen": -2.4413928985595703, + "logits/rejected": -2.481924057006836, + "logps/chosen": -170.3463134765625, + "logps/rejected": -137.23739624023438, + "loss": 0.4932, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0329415798187256, + "rewards/margins": 1.2113935947418213, + "rewards/rejected": -2.244335174560547, + "step": 6879 + }, + { + "epoch": 0.79, + "learning_rate": 6.299894650591127e-08, + "logits/chosen": -2.312366008758545, + "logits/rejected": -2.1441683769226074, + "logps/chosen": -329.7878723144531, + "logps/rejected": -323.20904541015625, + "loss": 0.4758, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6264584064483643, + "rewards/margins": 0.8790335059165955, + "rewards/rejected": -2.5054919719696045, + "step": 6880 + }, + { + "epoch": 0.79, + "learning_rate": 6.296383003628701e-08, + "logits/chosen": -2.61234188079834, + "logits/rejected": -2.803431749343872, + "logps/chosen": -310.60968017578125, + "logps/rejected": -275.960693359375, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36338010430336, + "rewards/margins": 3.1340460777282715, + "rewards/rejected": -3.4974265098571777, + "step": 6881 + }, + { + "epoch": 0.79, + "learning_rate": 6.292871356666276e-08, + "logits/chosen": -1.880882740020752, + "logits/rejected": -2.149312973022461, + "logps/chosen": -377.808837890625, + "logps/rejected": -227.36349487304688, + "loss": 0.2675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15989214181900024, + "rewards/margins": 1.8822795152664185, + "rewards/rejected": -2.0421714782714844, + "step": 6882 + }, + { + "epoch": 0.79, + "learning_rate": 6.28935970970385e-08, + "logits/chosen": -2.66367769241333, + "logits/rejected": -2.729940891265869, + "logps/chosen": -166.4844970703125, + "logps/rejected": -201.95628356933594, + "loss": 0.2419, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.015511736273765564, + "rewards/margins": 2.052943229675293, + "rewards/rejected": -2.0684549808502197, + "step": 6883 + }, + { + "epoch": 0.79, + "learning_rate": 6.285848062741426e-08, + "logits/chosen": -2.111520767211914, + "logits/rejected": -2.2900099754333496, + "logps/chosen": -424.6336669921875, + "logps/rejected": -320.0793762207031, + "loss": 0.6583, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33184292912483215, + "rewards/margins": 1.509871244430542, + "rewards/rejected": -1.8417142629623413, + "step": 6884 + }, + { + "epoch": 0.79, + "learning_rate": 6.282336415779e-08, + "logits/chosen": -1.9746522903442383, + "logits/rejected": -2.0304806232452393, + "logps/chosen": -259.2525329589844, + "logps/rejected": -244.4387969970703, + "loss": 0.3163, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0843321084976196, + "rewards/margins": 1.985378384590149, + "rewards/rejected": -3.0697102546691895, + "step": 6885 + }, + { + "epoch": 0.79, + "learning_rate": 6.278824768816575e-08, + "logits/chosen": -2.1468822956085205, + "logits/rejected": -1.751647710800171, + "logps/chosen": -321.7969970703125, + "logps/rejected": -342.9257507324219, + "loss": 0.134, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09193085879087448, + "rewards/margins": 3.819622039794922, + "rewards/rejected": -3.911552906036377, + "step": 6886 + }, + { + "epoch": 0.79, + "learning_rate": 6.275313121854149e-08, + "logits/chosen": -1.8659043312072754, + "logits/rejected": -1.6246849298477173, + "logps/chosen": -385.3845520019531, + "logps/rejected": -420.17340087890625, + "loss": 0.4737, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2012865543365479, + "rewards/margins": 1.5471917390823364, + "rewards/rejected": -2.7484781742095947, + "step": 6887 + }, + { + "epoch": 0.79, + "learning_rate": 6.271801474891725e-08, + "logits/chosen": -1.731594204902649, + "logits/rejected": -2.198716163635254, + "logps/chosen": -169.16331481933594, + "logps/rejected": -124.32424926757812, + "loss": 0.5807, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9391964673995972, + "rewards/margins": 1.5453846454620361, + "rewards/rejected": -2.4845809936523438, + "step": 6888 + }, + { + "epoch": 0.79, + "learning_rate": 6.268289827929299e-08, + "logits/chosen": -1.903830885887146, + "logits/rejected": -2.3820297718048096, + "logps/chosen": -572.901611328125, + "logps/rejected": -318.2956848144531, + "loss": 0.324, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6673647165298462, + "rewards/margins": 1.9371541738510132, + "rewards/rejected": -2.6045188903808594, + "step": 6889 + }, + { + "epoch": 0.79, + "learning_rate": 6.264778180966873e-08, + "logits/chosen": -2.469695568084717, + "logits/rejected": -2.500458002090454, + "logps/chosen": -241.4749755859375, + "logps/rejected": -176.3022003173828, + "loss": 0.2637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39088553190231323, + "rewards/margins": 1.6323943138122559, + "rewards/rejected": -2.023279905319214, + "step": 6890 + }, + { + "epoch": 0.79, + "learning_rate": 6.261266534004448e-08, + "logits/chosen": -2.297314167022705, + "logits/rejected": -2.454897165298462, + "logps/chosen": -404.67034912109375, + "logps/rejected": -321.5630187988281, + "loss": 1.0113, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2532944679260254, + "rewards/margins": 1.1752396821975708, + "rewards/rejected": -3.4285340309143066, + "step": 6891 + }, + { + "epoch": 0.79, + "learning_rate": 6.257754887042022e-08, + "logits/chosen": -1.9674410820007324, + "logits/rejected": -1.8422348499298096, + "logps/chosen": -271.63232421875, + "logps/rejected": -279.911376953125, + "loss": 0.2667, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0831695795059204, + "rewards/margins": 1.946428656578064, + "rewards/rejected": -3.0295982360839844, + "step": 6892 + }, + { + "epoch": 0.79, + "learning_rate": 6.254243240079596e-08, + "logits/chosen": -2.1558265686035156, + "logits/rejected": -2.3131353855133057, + "logps/chosen": -249.77566528320312, + "logps/rejected": -145.00845336914062, + "loss": 0.6962, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6423943042755127, + "rewards/margins": 1.1229751110076904, + "rewards/rejected": -2.7653696537017822, + "step": 6893 + }, + { + "epoch": 0.79, + "learning_rate": 6.250731593117172e-08, + "logits/chosen": -2.0348119735717773, + "logits/rejected": -2.400914192199707, + "logps/chosen": -219.8411407470703, + "logps/rejected": -165.42547607421875, + "loss": 0.5353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9326061010360718, + "rewards/margins": 1.2840402126312256, + "rewards/rejected": -2.216646194458008, + "step": 6894 + }, + { + "epoch": 0.79, + "learning_rate": 6.247219946154746e-08, + "logits/chosen": -2.6853320598602295, + "logits/rejected": -2.952836275100708, + "logps/chosen": -283.6237487792969, + "logps/rejected": -165.14952087402344, + "loss": 0.3134, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0877394676208496, + "rewards/margins": 2.5921919345855713, + "rewards/rejected": -3.679931640625, + "step": 6895 + }, + { + "epoch": 0.79, + "learning_rate": 6.243708299192321e-08, + "logits/chosen": -2.344827890396118, + "logits/rejected": -2.382887125015259, + "logps/chosen": -218.59515380859375, + "logps/rejected": -219.05892944335938, + "loss": 0.3403, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6893579959869385, + "rewards/margins": 1.6959856748580933, + "rewards/rejected": -2.385343551635742, + "step": 6896 + }, + { + "epoch": 0.8, + "learning_rate": 6.240196652229895e-08, + "logits/chosen": -2.437051773071289, + "logits/rejected": -2.4791667461395264, + "logps/chosen": -192.020751953125, + "logps/rejected": -295.0605163574219, + "loss": 0.4169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9621539115905762, + "rewards/margins": 2.246706962585449, + "rewards/rejected": -3.2088606357574463, + "step": 6897 + }, + { + "epoch": 0.8, + "learning_rate": 6.23668500526747e-08, + "logits/chosen": -2.5893266201019287, + "logits/rejected": -2.2457194328308105, + "logps/chosen": -253.92010498046875, + "logps/rejected": -274.1339416503906, + "loss": 0.3429, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.002807140350342, + "rewards/margins": 2.6656250953674316, + "rewards/rejected": -4.668432235717773, + "step": 6898 + }, + { + "epoch": 0.8, + "learning_rate": 6.233173358305045e-08, + "logits/chosen": -2.643446683883667, + "logits/rejected": -2.628882884979248, + "logps/chosen": -181.9536590576172, + "logps/rejected": -321.6918640136719, + "loss": 0.4177, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5003721714019775, + "rewards/margins": 3.7735483646392822, + "rewards/rejected": -5.27392053604126, + "step": 6899 + }, + { + "epoch": 0.8, + "learning_rate": 6.22966171134262e-08, + "logits/chosen": -2.5448436737060547, + "logits/rejected": -2.5651936531066895, + "logps/chosen": -411.567138671875, + "logps/rejected": -383.07733154296875, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6777936220169067, + "rewards/margins": 2.663693904876709, + "rewards/rejected": -3.341487407684326, + "step": 6900 + }, + { + "epoch": 0.8, + "learning_rate": 6.226150064380194e-08, + "logits/chosen": -1.998391032218933, + "logits/rejected": -2.102417230606079, + "logps/chosen": -334.2958068847656, + "logps/rejected": -215.9373779296875, + "loss": 0.5903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9331172108650208, + "rewards/margins": 1.0596455335617065, + "rewards/rejected": -1.9927629232406616, + "step": 6901 + }, + { + "epoch": 0.8, + "learning_rate": 6.22263841741777e-08, + "logits/chosen": -2.1378743648529053, + "logits/rejected": -2.2564215660095215, + "logps/chosen": -236.0991973876953, + "logps/rejected": -194.99600219726562, + "loss": 0.4046, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0120856761932373, + "rewards/margins": 2.075699806213379, + "rewards/rejected": -3.087785482406616, + "step": 6902 + }, + { + "epoch": 0.8, + "learning_rate": 6.219126770455344e-08, + "logits/chosen": -2.851962089538574, + "logits/rejected": -2.8930070400238037, + "logps/chosen": -153.6956024169922, + "logps/rejected": -248.16677856445312, + "loss": 0.2098, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6684133410453796, + "rewards/margins": 3.2051987648010254, + "rewards/rejected": -3.8736119270324707, + "step": 6903 + }, + { + "epoch": 0.8, + "learning_rate": 6.215615123492918e-08, + "logits/chosen": -2.5759613513946533, + "logits/rejected": -2.8076071739196777, + "logps/chosen": -307.44793701171875, + "logps/rejected": -236.70236206054688, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7795007824897766, + "rewards/margins": 1.820403814315796, + "rewards/rejected": -2.599904775619507, + "step": 6904 + }, + { + "epoch": 0.8, + "learning_rate": 6.212103476530493e-08, + "logits/chosen": -2.81252384185791, + "logits/rejected": -2.732680559158325, + "logps/chosen": -222.73190307617188, + "logps/rejected": -228.208740234375, + "loss": 0.1767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3307693302631378, + "rewards/margins": 2.942641258239746, + "rewards/rejected": -3.2734105587005615, + "step": 6905 + }, + { + "epoch": 0.8, + "learning_rate": 6.208591829568067e-08, + "logits/chosen": -2.750746488571167, + "logits/rejected": -2.5686373710632324, + "logps/chosen": -231.2586669921875, + "logps/rejected": -308.18402099609375, + "loss": 0.1959, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2147774696350098, + "rewards/margins": 3.927485466003418, + "rewards/rejected": -5.142262935638428, + "step": 6906 + }, + { + "epoch": 0.8, + "learning_rate": 6.205080182605641e-08, + "logits/chosen": -1.4332116842269897, + "logits/rejected": -1.2469463348388672, + "logps/chosen": -356.50732421875, + "logps/rejected": -371.2027587890625, + "loss": 1.1104, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2049329280853271, + "rewards/margins": 0.7520174980163574, + "rewards/rejected": -1.956950306892395, + "step": 6907 + }, + { + "epoch": 0.8, + "learning_rate": 6.201568535643217e-08, + "logits/chosen": -2.4809017181396484, + "logits/rejected": -2.5339531898498535, + "logps/chosen": -362.98114013671875, + "logps/rejected": -289.17938232421875, + "loss": 0.254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7892276644706726, + "rewards/margins": 2.1478471755981445, + "rewards/rejected": -2.937074661254883, + "step": 6908 + }, + { + "epoch": 0.8, + "learning_rate": 6.19805688868079e-08, + "logits/chosen": -1.9391052722930908, + "logits/rejected": -2.0673775672912598, + "logps/chosen": -514.816162109375, + "logps/rejected": -438.0986328125, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.052457720041275024, + "rewards/margins": 2.810418128967285, + "rewards/rejected": -2.757960557937622, + "step": 6909 + }, + { + "epoch": 0.8, + "learning_rate": 6.194545241718365e-08, + "logits/chosen": -1.9397354125976562, + "logits/rejected": -2.161259174346924, + "logps/chosen": -355.17401123046875, + "logps/rejected": -354.2441711425781, + "loss": 0.5214, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2706999778747559, + "rewards/margins": 1.7024410963058472, + "rewards/rejected": -2.9731409549713135, + "step": 6910 + }, + { + "epoch": 0.8, + "learning_rate": 6.19103359475594e-08, + "logits/chosen": -2.6370818614959717, + "logits/rejected": -2.5974090099334717, + "logps/chosen": -149.8142547607422, + "logps/rejected": -243.13449096679688, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4315264225006104, + "rewards/margins": 2.232046604156494, + "rewards/rejected": -3.6635732650756836, + "step": 6911 + }, + { + "epoch": 0.8, + "learning_rate": 6.187521947793514e-08, + "logits/chosen": -1.8795653581619263, + "logits/rejected": -2.0659220218658447, + "logps/chosen": -283.59375, + "logps/rejected": -205.70828247070312, + "loss": 0.4971, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44553858041763306, + "rewards/margins": 1.4951368570327759, + "rewards/rejected": -1.9406756162643433, + "step": 6912 + }, + { + "epoch": 0.8, + "learning_rate": 6.18401030083109e-08, + "logits/chosen": -2.295135021209717, + "logits/rejected": -2.5830321311950684, + "logps/chosen": -289.8972473144531, + "logps/rejected": -240.70155334472656, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6732576489448547, + "rewards/margins": 4.565822124481201, + "rewards/rejected": -5.23907995223999, + "step": 6913 + }, + { + "epoch": 0.8, + "learning_rate": 6.180498653868664e-08, + "logits/chosen": -2.2071213722229004, + "logits/rejected": -2.3928232192993164, + "logps/chosen": -307.36114501953125, + "logps/rejected": -277.12994384765625, + "loss": 1.0411, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3469408750534058, + "rewards/margins": 2.4193544387817383, + "rewards/rejected": -3.7662951946258545, + "step": 6914 + }, + { + "epoch": 0.8, + "learning_rate": 6.176987006906239e-08, + "logits/chosen": -1.6340184211730957, + "logits/rejected": -1.770059585571289, + "logps/chosen": -585.6217041015625, + "logps/rejected": -339.3521423339844, + "loss": 0.3098, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0330606698989868, + "rewards/margins": 2.615175247192383, + "rewards/rejected": -3.6482362747192383, + "step": 6915 + }, + { + "epoch": 0.8, + "learning_rate": 6.173475359943813e-08, + "logits/chosen": -2.2622032165527344, + "logits/rejected": -2.2112956047058105, + "logps/chosen": -242.49032592773438, + "logps/rejected": -328.9765625, + "loss": 0.9077, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1567256450653076, + "rewards/margins": 0.34902822971343994, + "rewards/rejected": -1.505753755569458, + "step": 6916 + }, + { + "epoch": 0.8, + "learning_rate": 6.169963712981388e-08, + "logits/chosen": -2.177788019180298, + "logits/rejected": -2.4888916015625, + "logps/chosen": -320.8072814941406, + "logps/rejected": -375.4753723144531, + "loss": 0.3036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.804212212562561, + "rewards/margins": 2.1717634201049805, + "rewards/rejected": -2.975975751876831, + "step": 6917 + }, + { + "epoch": 0.8, + "learning_rate": 6.166452066018962e-08, + "logits/chosen": -2.0718464851379395, + "logits/rejected": -2.094839096069336, + "logps/chosen": -229.2825164794922, + "logps/rejected": -245.36785888671875, + "loss": 0.6991, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2283542156219482, + "rewards/margins": 2.37996768951416, + "rewards/rejected": -3.6083216667175293, + "step": 6918 + }, + { + "epoch": 0.8, + "learning_rate": 6.162940419056538e-08, + "logits/chosen": -1.947159767150879, + "logits/rejected": -2.149557113647461, + "logps/chosen": -477.40386962890625, + "logps/rejected": -395.0809631347656, + "loss": 0.3162, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7816839218139648, + "rewards/margins": 2.383342981338501, + "rewards/rejected": -3.165027141571045, + "step": 6919 + }, + { + "epoch": 0.8, + "learning_rate": 6.159428772094112e-08, + "logits/chosen": -2.2991466522216797, + "logits/rejected": -2.6514012813568115, + "logps/chosen": -198.71743774414062, + "logps/rejected": -138.50247192382812, + "loss": 1.4338, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2873969078063965, + "rewards/margins": 0.010414361953735352, + "rewards/rejected": -2.297811269760132, + "step": 6920 + }, + { + "epoch": 0.8, + "learning_rate": 6.155917125131686e-08, + "logits/chosen": -2.1131410598754883, + "logits/rejected": -2.0989019870758057, + "logps/chosen": -128.87496948242188, + "logps/rejected": -174.8745880126953, + "loss": 0.4882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9523494243621826, + "rewards/margins": 2.356301784515381, + "rewards/rejected": -3.3086514472961426, + "step": 6921 + }, + { + "epoch": 0.8, + "learning_rate": 6.152405478169261e-08, + "logits/chosen": -2.5398151874542236, + "logits/rejected": -2.581143856048584, + "logps/chosen": -287.71063232421875, + "logps/rejected": -373.31011962890625, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22781600058078766, + "rewards/margins": 2.9053006172180176, + "rewards/rejected": -3.1331164836883545, + "step": 6922 + }, + { + "epoch": 0.8, + "learning_rate": 6.148893831206835e-08, + "logits/chosen": -2.4986319541931152, + "logits/rejected": -2.45233154296875, + "logps/chosen": -324.3903503417969, + "logps/rejected": -271.6710205078125, + "loss": 0.4471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9648712277412415, + "rewards/margins": 1.5554755926132202, + "rewards/rejected": -2.5203468799591064, + "step": 6923 + }, + { + "epoch": 0.8, + "learning_rate": 6.14538218424441e-08, + "logits/chosen": -1.8288460969924927, + "logits/rejected": -2.31166410446167, + "logps/chosen": -509.60260009765625, + "logps/rejected": -287.5552062988281, + "loss": 0.3875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.600105345249176, + "rewards/margins": 1.1866382360458374, + "rewards/rejected": -1.7867436408996582, + "step": 6924 + }, + { + "epoch": 0.8, + "learning_rate": 6.141870537281985e-08, + "logits/chosen": -1.5589568614959717, + "logits/rejected": -2.0900700092315674, + "logps/chosen": -516.272216796875, + "logps/rejected": -287.6631164550781, + "loss": 0.4876, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8903667330741882, + "rewards/margins": 0.9390519261360168, + "rewards/rejected": -1.829418659210205, + "step": 6925 + }, + { + "epoch": 0.8, + "learning_rate": 6.138358890319559e-08, + "logits/chosen": -2.333366870880127, + "logits/rejected": -2.1457903385162354, + "logps/chosen": -230.68844604492188, + "logps/rejected": -328.1650390625, + "loss": 0.4337, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0153745412826538, + "rewards/margins": 1.9280561208724976, + "rewards/rejected": -2.9434306621551514, + "step": 6926 + }, + { + "epoch": 0.8, + "learning_rate": 6.134847243357134e-08, + "logits/chosen": -2.0294339656829834, + "logits/rejected": -1.8348294496536255, + "logps/chosen": -191.8592529296875, + "logps/rejected": -293.72259521484375, + "loss": 0.745, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6864017248153687, + "rewards/margins": 2.8356544971466064, + "rewards/rejected": -3.5220561027526855, + "step": 6927 + }, + { + "epoch": 0.8, + "learning_rate": 6.131335596394708e-08, + "logits/chosen": -2.6163032054901123, + "logits/rejected": -2.6271183490753174, + "logps/chosen": -261.5018005371094, + "logps/rejected": -215.87477111816406, + "loss": 0.3442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.414683997631073, + "rewards/margins": 2.6266796588897705, + "rewards/rejected": -3.0413637161254883, + "step": 6928 + }, + { + "epoch": 0.8, + "learning_rate": 6.127823949432284e-08, + "logits/chosen": -2.016824960708618, + "logits/rejected": -2.1864514350891113, + "logps/chosen": -302.2618713378906, + "logps/rejected": -260.0023498535156, + "loss": 0.9709, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3320950269699097, + "rewards/margins": 0.32188063859939575, + "rewards/rejected": -1.6539756059646606, + "step": 6929 + }, + { + "epoch": 0.8, + "learning_rate": 6.124312302469858e-08, + "logits/chosen": -2.499617576599121, + "logits/rejected": -2.446164608001709, + "logps/chosen": -217.7705078125, + "logps/rejected": -264.0179443359375, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1605592668056488, + "rewards/margins": 3.6118452548980713, + "rewards/rejected": -3.4512863159179688, + "step": 6930 + }, + { + "epoch": 0.8, + "learning_rate": 6.120800655507433e-08, + "logits/chosen": -2.3527255058288574, + "logits/rejected": -2.234752655029297, + "logps/chosen": -340.6293029785156, + "logps/rejected": -338.70068359375, + "loss": 0.3479, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8433895111083984, + "rewards/margins": 1.9562323093414307, + "rewards/rejected": -2.799621820449829, + "step": 6931 + }, + { + "epoch": 0.8, + "learning_rate": 6.117289008545007e-08, + "logits/chosen": -2.5602705478668213, + "logits/rejected": -2.590083122253418, + "logps/chosen": -226.10203552246094, + "logps/rejected": -127.32720947265625, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9070965647697449, + "rewards/margins": 1.8822461366653442, + "rewards/rejected": -2.7893428802490234, + "step": 6932 + }, + { + "epoch": 0.8, + "learning_rate": 6.113777361582583e-08, + "logits/chosen": -1.700566053390503, + "logits/rejected": -2.038482427597046, + "logps/chosen": -424.6217956542969, + "logps/rejected": -414.8634033203125, + "loss": 0.146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19962969422340393, + "rewards/margins": 3.623889923095703, + "rewards/rejected": -3.823519706726074, + "step": 6933 + }, + { + "epoch": 0.8, + "learning_rate": 6.110265714620157e-08, + "logits/chosen": -2.9345781803131104, + "logits/rejected": -2.9142308235168457, + "logps/chosen": -232.21070861816406, + "logps/rejected": -228.97232055664062, + "loss": 0.3539, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5149754285812378, + "rewards/margins": 1.307349681854248, + "rewards/rejected": -1.8223252296447754, + "step": 6934 + }, + { + "epoch": 0.8, + "learning_rate": 6.106754067657731e-08, + "logits/chosen": -2.2944695949554443, + "logits/rejected": -2.3054680824279785, + "logps/chosen": -241.836669921875, + "logps/rejected": -199.56845092773438, + "loss": 0.7711, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2315001487731934, + "rewards/margins": 0.9690013527870178, + "rewards/rejected": -2.2005014419555664, + "step": 6935 + }, + { + "epoch": 0.8, + "learning_rate": 6.103242420695306e-08, + "logits/chosen": -2.171759843826294, + "logits/rejected": -2.074716806411743, + "logps/chosen": -200.28773498535156, + "logps/rejected": -261.76544189453125, + "loss": 0.4981, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35797807574272156, + "rewards/margins": 2.4606235027313232, + "rewards/rejected": -2.818601608276367, + "step": 6936 + }, + { + "epoch": 0.8, + "learning_rate": 6.09973077373288e-08, + "logits/chosen": -2.2881107330322266, + "logits/rejected": -1.8896279335021973, + "logps/chosen": -205.3148193359375, + "logps/rejected": -373.80120849609375, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.458731085062027, + "rewards/margins": 2.6989967823028564, + "rewards/rejected": -3.1577277183532715, + "step": 6937 + }, + { + "epoch": 0.8, + "learning_rate": 6.096219126770454e-08, + "logits/chosen": -2.4499213695526123, + "logits/rejected": -2.300262689590454, + "logps/chosen": -266.0334777832031, + "logps/rejected": -336.941162109375, + "loss": 0.2212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2614619731903076, + "rewards/margins": 1.8067854642868042, + "rewards/rejected": -3.0682473182678223, + "step": 6938 + }, + { + "epoch": 0.8, + "learning_rate": 6.09270747980803e-08, + "logits/chosen": -2.676755905151367, + "logits/rejected": -2.5950403213500977, + "logps/chosen": -186.6980438232422, + "logps/rejected": -129.6877899169922, + "loss": 0.5533, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6957677602767944, + "rewards/margins": 1.0247856378555298, + "rewards/rejected": -1.7205533981323242, + "step": 6939 + }, + { + "epoch": 0.8, + "learning_rate": 6.089195832845604e-08, + "logits/chosen": -2.040393829345703, + "logits/rejected": -2.2999260425567627, + "logps/chosen": -197.998046875, + "logps/rejected": -184.00985717773438, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2968785762786865, + "rewards/margins": 2.0891871452331543, + "rewards/rejected": -2.386065721511841, + "step": 6940 + }, + { + "epoch": 0.8, + "learning_rate": 6.085684185883179e-08, + "logits/chosen": -2.437563419342041, + "logits/rejected": -2.304877519607544, + "logps/chosen": -240.6792449951172, + "logps/rejected": -313.318115234375, + "loss": 0.2864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6596584916114807, + "rewards/margins": 2.3740785121917725, + "rewards/rejected": -3.0337371826171875, + "step": 6941 + }, + { + "epoch": 0.8, + "learning_rate": 6.082172538920753e-08, + "logits/chosen": -2.49041748046875, + "logits/rejected": -2.481534004211426, + "logps/chosen": -383.7710266113281, + "logps/rejected": -296.75677490234375, + "loss": 0.3613, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5257720947265625, + "rewards/margins": 1.649487018585205, + "rewards/rejected": -3.1752591133117676, + "step": 6942 + }, + { + "epoch": 0.8, + "learning_rate": 6.078660891958329e-08, + "logits/chosen": -2.7572972774505615, + "logits/rejected": -2.769415855407715, + "logps/chosen": -129.02841186523438, + "logps/rejected": -210.03724670410156, + "loss": 0.5986, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5941133499145508, + "rewards/margins": 1.2157213687896729, + "rewards/rejected": -2.8098344802856445, + "step": 6943 + }, + { + "epoch": 0.8, + "learning_rate": 6.075149244995903e-08, + "logits/chosen": -2.3869588375091553, + "logits/rejected": -2.4576404094696045, + "logps/chosen": -207.73895263671875, + "logps/rejected": -206.14376831054688, + "loss": 0.3395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9550517797470093, + "rewards/margins": 2.4950385093688965, + "rewards/rejected": -3.4500904083251953, + "step": 6944 + }, + { + "epoch": 0.8, + "learning_rate": 6.071637598033478e-08, + "logits/chosen": -2.1576321125030518, + "logits/rejected": -2.339926242828369, + "logps/chosen": -280.31964111328125, + "logps/rejected": -219.13922119140625, + "loss": 0.1761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7491805553436279, + "rewards/margins": 3.534120798110962, + "rewards/rejected": -4.28330135345459, + "step": 6945 + }, + { + "epoch": 0.8, + "learning_rate": 6.068125951071052e-08, + "logits/chosen": -2.569596290588379, + "logits/rejected": -2.688314199447632, + "logps/chosen": -262.8642883300781, + "logps/rejected": -240.20761108398438, + "loss": 0.2852, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9542795419692993, + "rewards/margins": 2.707235097885132, + "rewards/rejected": -3.6615147590637207, + "step": 6946 + }, + { + "epoch": 0.8, + "learning_rate": 6.064614304108627e-08, + "logits/chosen": -2.119539737701416, + "logits/rejected": -2.075045585632324, + "logps/chosen": -457.90423583984375, + "logps/rejected": -482.7684631347656, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3590071201324463, + "rewards/margins": 2.5434038639068604, + "rewards/rejected": -3.9024109840393066, + "step": 6947 + }, + { + "epoch": 0.8, + "learning_rate": 6.061102657146202e-08, + "logits/chosen": -2.9449543952941895, + "logits/rejected": -2.97501277923584, + "logps/chosen": -228.0864715576172, + "logps/rejected": -270.7157897949219, + "loss": 0.1297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7767482399940491, + "rewards/margins": 3.7780609130859375, + "rewards/rejected": -4.5548095703125, + "step": 6948 + }, + { + "epoch": 0.8, + "learning_rate": 6.057591010183777e-08, + "logits/chosen": -2.4862728118896484, + "logits/rejected": -2.5493502616882324, + "logps/chosen": -546.044189453125, + "logps/rejected": -278.0262451171875, + "loss": 0.1639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14970724284648895, + "rewards/margins": 4.29694938659668, + "rewards/rejected": -4.446656703948975, + "step": 6949 + }, + { + "epoch": 0.8, + "learning_rate": 6.054079363221351e-08, + "logits/chosen": -1.7170045375823975, + "logits/rejected": -1.887441873550415, + "logps/chosen": -422.39190673828125, + "logps/rejected": -380.083984375, + "loss": 0.1165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1870833784341812, + "rewards/margins": 2.8990659713745117, + "rewards/rejected": -3.0861494541168213, + "step": 6950 + }, + { + "epoch": 0.8, + "learning_rate": 6.050567716258925e-08, + "logits/chosen": -2.419288158416748, + "logits/rejected": -2.5846147537231445, + "logps/chosen": -495.87103271484375, + "logps/rejected": -315.5924377441406, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17511653900146484, + "rewards/margins": 2.1482503414154053, + "rewards/rejected": -2.323367118835449, + "step": 6951 + }, + { + "epoch": 0.8, + "learning_rate": 6.047056069296499e-08, + "logits/chosen": -2.34621000289917, + "logits/rejected": -2.3577628135681152, + "logps/chosen": -153.36624145507812, + "logps/rejected": -290.71160888671875, + "loss": 0.2452, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1696982383728027, + "rewards/margins": 2.320465564727783, + "rewards/rejected": -3.490163803100586, + "step": 6952 + }, + { + "epoch": 0.8, + "learning_rate": 6.043544422334074e-08, + "logits/chosen": -2.93353009223938, + "logits/rejected": -3.011643409729004, + "logps/chosen": -98.11834716796875, + "logps/rejected": -227.0652313232422, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05513609200716019, + "rewards/margins": 4.36834716796875, + "rewards/rejected": -4.423483371734619, + "step": 6953 + }, + { + "epoch": 0.8, + "learning_rate": 6.040032775371649e-08, + "logits/chosen": -2.7697231769561768, + "logits/rejected": -2.4906575679779053, + "logps/chosen": -254.8189697265625, + "logps/rejected": -250.33724975585938, + "loss": 0.3999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6609758138656616, + "rewards/margins": 2.3853845596313477, + "rewards/rejected": -3.0463602542877197, + "step": 6954 + }, + { + "epoch": 0.8, + "learning_rate": 6.036521128409224e-08, + "logits/chosen": -1.713281512260437, + "logits/rejected": -1.844252586364746, + "logps/chosen": -263.4537048339844, + "logps/rejected": -166.82086181640625, + "loss": 0.4057, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.521405041217804, + "rewards/margins": 1.6258068084716797, + "rewards/rejected": -2.147212028503418, + "step": 6955 + }, + { + "epoch": 0.8, + "learning_rate": 6.033009481446798e-08, + "logits/chosen": -2.301938056945801, + "logits/rejected": -2.0988800525665283, + "logps/chosen": -299.7094421386719, + "logps/rejected": -257.4898681640625, + "loss": 0.2133, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0115272998809814, + "rewards/margins": 2.011406421661377, + "rewards/rejected": -3.0229334831237793, + "step": 6956 + }, + { + "epoch": 0.8, + "learning_rate": 6.029497834484372e-08, + "logits/chosen": -2.3405094146728516, + "logits/rejected": -2.2374179363250732, + "logps/chosen": -287.2705383300781, + "logps/rejected": -301.6504211425781, + "loss": 0.3929, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0886428356170654, + "rewards/margins": 3.1382575035095215, + "rewards/rejected": -4.226900577545166, + "step": 6957 + }, + { + "epoch": 0.8, + "learning_rate": 6.025986187521947e-08, + "logits/chosen": -1.8204805850982666, + "logits/rejected": -2.0966053009033203, + "logps/chosen": -285.4175720214844, + "logps/rejected": -298.9682922363281, + "loss": 0.1507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7138189077377319, + "rewards/margins": 2.8128528594970703, + "rewards/rejected": -3.5266716480255127, + "step": 6958 + }, + { + "epoch": 0.8, + "learning_rate": 6.022474540559521e-08, + "logits/chosen": -1.8954386711120605, + "logits/rejected": -2.051157236099243, + "logps/chosen": -184.38963317871094, + "logps/rejected": -178.59646606445312, + "loss": 0.6355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9799470901489258, + "rewards/margins": 0.7649259567260742, + "rewards/rejected": -1.7448731660842896, + "step": 6959 + }, + { + "epoch": 0.8, + "learning_rate": 6.018962893597097e-08, + "logits/chosen": -2.7880146503448486, + "logits/rejected": -2.5839643478393555, + "logps/chosen": -127.83997344970703, + "logps/rejected": -165.33642578125, + "loss": 0.3978, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8312230110168457, + "rewards/margins": 2.1670989990234375, + "rewards/rejected": -3.998322010040283, + "step": 6960 + }, + { + "epoch": 0.8, + "learning_rate": 6.015451246634671e-08, + "logits/chosen": -1.6356256008148193, + "logits/rejected": -1.5774478912353516, + "logps/chosen": -332.08837890625, + "logps/rejected": -296.94097900390625, + "loss": 0.3244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28549444675445557, + "rewards/margins": 1.8145967721939087, + "rewards/rejected": -2.1000912189483643, + "step": 6961 + }, + { + "epoch": 0.8, + "learning_rate": 6.011939599672246e-08, + "logits/chosen": -2.3768608570098877, + "logits/rejected": -2.2780983448028564, + "logps/chosen": -259.97772216796875, + "logps/rejected": -324.95074462890625, + "loss": 0.2352, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1273918151855469, + "rewards/margins": 3.736426591873169, + "rewards/rejected": -4.863818645477295, + "step": 6962 + }, + { + "epoch": 0.8, + "learning_rate": 6.00842795270982e-08, + "logits/chosen": -2.35591721534729, + "logits/rejected": -2.5591683387756348, + "logps/chosen": -402.9870300292969, + "logps/rejected": -269.4527282714844, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3092663288116455, + "rewards/margins": 2.427839756011963, + "rewards/rejected": -3.7371060848236084, + "step": 6963 + }, + { + "epoch": 0.8, + "learning_rate": 6.004916305747396e-08, + "logits/chosen": -2.472193717956543, + "logits/rejected": -2.4856011867523193, + "logps/chosen": -283.8755798339844, + "logps/rejected": -269.9736633300781, + "loss": 0.453, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2009755373001099, + "rewards/margins": 2.553168773651123, + "rewards/rejected": -3.7541444301605225, + "step": 6964 + }, + { + "epoch": 0.8, + "learning_rate": 6.00140465878497e-08, + "logits/chosen": -2.435241222381592, + "logits/rejected": -2.2796950340270996, + "logps/chosen": -301.84002685546875, + "logps/rejected": -436.3761291503906, + "loss": 0.445, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4116874933242798, + "rewards/margins": 1.6669281721115112, + "rewards/rejected": -3.078615665435791, + "step": 6965 + }, + { + "epoch": 0.8, + "learning_rate": 5.997893011822545e-08, + "logits/chosen": -2.1820647716522217, + "logits/rejected": -2.5173873901367188, + "logps/chosen": -339.65826416015625, + "logps/rejected": -187.62200927734375, + "loss": 0.6568, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0778107643127441, + "rewards/margins": 0.4899839162826538, + "rewards/rejected": -1.5677945613861084, + "step": 6966 + }, + { + "epoch": 0.8, + "learning_rate": 5.994381364860119e-08, + "logits/chosen": -2.044416904449463, + "logits/rejected": -2.083660125732422, + "logps/chosen": -256.789306640625, + "logps/rejected": -275.2480163574219, + "loss": 1.322, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4816925525665283, + "rewards/margins": 0.8544447422027588, + "rewards/rejected": -2.336137294769287, + "step": 6967 + }, + { + "epoch": 0.8, + "learning_rate": 5.990869717897693e-08, + "logits/chosen": -2.070823907852173, + "logits/rejected": -2.205775499343872, + "logps/chosen": -462.669189453125, + "logps/rejected": -334.4673767089844, + "loss": 0.3646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7893566489219666, + "rewards/margins": 1.6139501333236694, + "rewards/rejected": -2.403306722640991, + "step": 6968 + }, + { + "epoch": 0.8, + "learning_rate": 5.987358070935267e-08, + "logits/chosen": -2.619964361190796, + "logits/rejected": -2.4807982444763184, + "logps/chosen": -169.80368041992188, + "logps/rejected": -252.877685546875, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1325138807296753, + "rewards/margins": 1.7426555156707764, + "rewards/rejected": -2.875169277191162, + "step": 6969 + }, + { + "epoch": 0.8, + "learning_rate": 5.983846423972843e-08, + "logits/chosen": -2.7264184951782227, + "logits/rejected": -2.7095909118652344, + "logps/chosen": -275.49493408203125, + "logps/rejected": -377.848388671875, + "loss": 0.4934, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.068034052848816, + "rewards/margins": 2.6480441093444824, + "rewards/rejected": -3.716078042984009, + "step": 6970 + }, + { + "epoch": 0.8, + "learning_rate": 5.980334777010417e-08, + "logits/chosen": -1.7277114391326904, + "logits/rejected": -2.1017303466796875, + "logps/chosen": -305.0952453613281, + "logps/rejected": -205.8179168701172, + "loss": 0.2695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27815890312194824, + "rewards/margins": 2.005232334136963, + "rewards/rejected": -2.2833914756774902, + "step": 6971 + }, + { + "epoch": 0.8, + "learning_rate": 5.976823130047992e-08, + "logits/chosen": -2.151367664337158, + "logits/rejected": -2.545849084854126, + "logps/chosen": -314.5142517089844, + "logps/rejected": -253.12347412109375, + "loss": 0.1141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32836633920669556, + "rewards/margins": 3.3070905208587646, + "rewards/rejected": -3.6354568004608154, + "step": 6972 + }, + { + "epoch": 0.8, + "learning_rate": 5.973311483085566e-08, + "logits/chosen": -2.516669750213623, + "logits/rejected": -2.6901257038116455, + "logps/chosen": -229.76780700683594, + "logps/rejected": -184.459716796875, + "loss": 0.2906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.456278920173645, + "rewards/margins": 3.0321741104125977, + "rewards/rejected": -3.488452911376953, + "step": 6973 + }, + { + "epoch": 0.8, + "learning_rate": 5.969799836123142e-08, + "logits/chosen": -2.6300899982452393, + "logits/rejected": -2.3983845710754395, + "logps/chosen": -243.14581298828125, + "logps/rejected": -451.3669738769531, + "loss": 0.2348, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1706745624542236, + "rewards/margins": 4.566593170166016, + "rewards/rejected": -5.73726749420166, + "step": 6974 + }, + { + "epoch": 0.8, + "learning_rate": 5.966288189160716e-08, + "logits/chosen": -2.652326822280884, + "logits/rejected": -2.3978140354156494, + "logps/chosen": -150.314697265625, + "logps/rejected": -174.57269287109375, + "loss": 0.4529, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7792613506317139, + "rewards/margins": 1.7270913124084473, + "rewards/rejected": -2.506352663040161, + "step": 6975 + }, + { + "epoch": 0.8, + "learning_rate": 5.962776542198291e-08, + "logits/chosen": -2.3355143070220947, + "logits/rejected": -2.329076051712036, + "logps/chosen": -446.8757019042969, + "logps/rejected": -433.47900390625, + "loss": 0.114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8223702907562256, + "rewards/margins": 3.365715742111206, + "rewards/rejected": -4.188086032867432, + "step": 6976 + }, + { + "epoch": 0.8, + "learning_rate": 5.959264895235865e-08, + "logits/chosen": -2.305067539215088, + "logits/rejected": -2.340056896209717, + "logps/chosen": -298.6597900390625, + "logps/rejected": -246.81178283691406, + "loss": 0.5493, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6848537921905518, + "rewards/margins": 1.8912943601608276, + "rewards/rejected": -3.576148509979248, + "step": 6977 + }, + { + "epoch": 0.8, + "learning_rate": 5.95575324827344e-08, + "logits/chosen": -2.9281129837036133, + "logits/rejected": -2.8035080432891846, + "logps/chosen": -180.09307861328125, + "logps/rejected": -162.90443420410156, + "loss": 0.4552, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0380831956863403, + "rewards/margins": 1.1404287815093994, + "rewards/rejected": -2.1785120964050293, + "step": 6978 + }, + { + "epoch": 0.8, + "learning_rate": 5.9522416013110146e-08, + "logits/chosen": -2.2158267498016357, + "logits/rejected": -2.4629709720611572, + "logps/chosen": -265.5051574707031, + "logps/rejected": -209.05120849609375, + "loss": 0.332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.494352251291275, + "rewards/margins": 2.069573163986206, + "rewards/rejected": -2.563925266265869, + "step": 6979 + }, + { + "epoch": 0.8, + "learning_rate": 5.9487299543485894e-08, + "logits/chosen": -2.710073232650757, + "logits/rejected": -2.785388231277466, + "logps/chosen": -243.6273651123047, + "logps/rejected": -314.0628662109375, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.664793848991394, + "rewards/margins": 4.0910234451293945, + "rewards/rejected": -4.755817413330078, + "step": 6980 + }, + { + "epoch": 0.8, + "learning_rate": 5.9452183073861634e-08, + "logits/chosen": -2.4350218772888184, + "logits/rejected": -2.491790294647217, + "logps/chosen": -222.51129150390625, + "logps/rejected": -272.2130126953125, + "loss": 0.2111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.998191237449646, + "rewards/margins": 4.32913875579834, + "rewards/rejected": -5.327329635620117, + "step": 6981 + }, + { + "epoch": 0.8, + "learning_rate": 5.941706660423739e-08, + "logits/chosen": -2.341859817504883, + "logits/rejected": -2.2519640922546387, + "logps/chosen": -348.0743713378906, + "logps/rejected": -301.7850341796875, + "loss": 0.187, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020786866545677185, + "rewards/margins": 3.0426814556121826, + "rewards/rejected": -3.0218944549560547, + "step": 6982 + }, + { + "epoch": 0.81, + "learning_rate": 5.938195013461313e-08, + "logits/chosen": -2.63105845451355, + "logits/rejected": -2.7323129177093506, + "logps/chosen": -325.7155456542969, + "logps/rejected": -365.500244140625, + "loss": 0.0589, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4391675889492035, + "rewards/margins": 4.817363739013672, + "rewards/rejected": -5.256531715393066, + "step": 6983 + }, + { + "epoch": 0.81, + "learning_rate": 5.934683366498888e-08, + "logits/chosen": -2.2970385551452637, + "logits/rejected": -1.8988230228424072, + "logps/chosen": -152.7099609375, + "logps/rejected": -328.6342468261719, + "loss": 0.5031, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6903191804885864, + "rewards/margins": 1.469922661781311, + "rewards/rejected": -2.1602420806884766, + "step": 6984 + }, + { + "epoch": 0.81, + "learning_rate": 5.931171719536462e-08, + "logits/chosen": -2.1121909618377686, + "logits/rejected": -2.195810317993164, + "logps/chosen": -352.8022766113281, + "logps/rejected": -368.618896484375, + "loss": 0.2329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6652179956436157, + "rewards/margins": 2.238710403442383, + "rewards/rejected": -2.903928279876709, + "step": 6985 + }, + { + "epoch": 0.81, + "learning_rate": 5.927660072574037e-08, + "logits/chosen": -2.5462429523468018, + "logits/rejected": -2.519068717956543, + "logps/chosen": -271.9087829589844, + "logps/rejected": -292.4693603515625, + "loss": 0.4418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.377349317073822, + "rewards/margins": 2.1905033588409424, + "rewards/rejected": -2.56785249710083, + "step": 6986 + }, + { + "epoch": 0.81, + "learning_rate": 5.924148425611611e-08, + "logits/chosen": -2.8899149894714355, + "logits/rejected": -2.9522223472595215, + "logps/chosen": -187.5408477783203, + "logps/rejected": -368.24725341796875, + "loss": 0.1973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5834145545959473, + "rewards/margins": 3.7379915714263916, + "rewards/rejected": -4.321406364440918, + "step": 6987 + }, + { + "epoch": 0.81, + "learning_rate": 5.9206367786491865e-08, + "logits/chosen": -2.6460185050964355, + "logits/rejected": -2.6402692794799805, + "logps/chosen": -259.3904113769531, + "logps/rejected": -236.14398193359375, + "loss": 0.3399, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4241176843643188, + "rewards/margins": 3.057199001312256, + "rewards/rejected": -4.481316566467285, + "step": 6988 + }, + { + "epoch": 0.81, + "learning_rate": 5.9171251316867606e-08, + "logits/chosen": -1.9991092681884766, + "logits/rejected": -1.9216277599334717, + "logps/chosen": -331.9601135253906, + "logps/rejected": -297.2203369140625, + "loss": 0.4671, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5741772651672363, + "rewards/margins": 2.7048087120056152, + "rewards/rejected": -4.278985977172852, + "step": 6989 + }, + { + "epoch": 0.81, + "learning_rate": 5.913613484724336e-08, + "logits/chosen": -2.1345386505126953, + "logits/rejected": -2.120213747024536, + "logps/chosen": -301.47601318359375, + "logps/rejected": -344.95465087890625, + "loss": 0.5157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7985648512840271, + "rewards/margins": 2.588291645050049, + "rewards/rejected": -3.3868565559387207, + "step": 6990 + }, + { + "epoch": 0.81, + "learning_rate": 5.91010183776191e-08, + "logits/chosen": -2.545732021331787, + "logits/rejected": -2.5450870990753174, + "logps/chosen": -204.23971557617188, + "logps/rejected": -253.1505126953125, + "loss": 0.7913, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4975452423095703, + "rewards/margins": 2.7886569499969482, + "rewards/rejected": -3.2862019538879395, + "step": 6991 + }, + { + "epoch": 0.81, + "learning_rate": 5.906590190799485e-08, + "logits/chosen": -1.7846496105194092, + "logits/rejected": -1.9351811408996582, + "logps/chosen": -317.984375, + "logps/rejected": -219.96490478515625, + "loss": 1.2235, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8123154640197754, + "rewards/margins": 0.7034961581230164, + "rewards/rejected": -2.5158119201660156, + "step": 6992 + }, + { + "epoch": 0.81, + "learning_rate": 5.9030785438370595e-08, + "logits/chosen": -2.485731363296509, + "logits/rejected": -2.220670461654663, + "logps/chosen": -181.32850646972656, + "logps/rejected": -336.76275634765625, + "loss": 0.1737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7549896836280823, + "rewards/margins": 4.793028831481934, + "rewards/rejected": -5.548018932342529, + "step": 6993 + }, + { + "epoch": 0.81, + "learning_rate": 5.899566896874634e-08, + "logits/chosen": -2.1449027061462402, + "logits/rejected": -2.2989554405212402, + "logps/chosen": -288.0290832519531, + "logps/rejected": -198.68670654296875, + "loss": 0.4355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1330098956823349, + "rewards/margins": 1.2618670463562012, + "rewards/rejected": -1.3948769569396973, + "step": 6994 + }, + { + "epoch": 0.81, + "learning_rate": 5.896055249912208e-08, + "logits/chosen": -2.3582923412323, + "logits/rejected": -2.293926239013672, + "logps/chosen": -432.909912109375, + "logps/rejected": -326.621337890625, + "loss": 0.3636, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0855170488357544, + "rewards/margins": 1.7309083938598633, + "rewards/rejected": -2.816425323486328, + "step": 6995 + }, + { + "epoch": 0.81, + "learning_rate": 5.8925436029497836e-08, + "logits/chosen": -2.5652992725372314, + "logits/rejected": -2.4000840187072754, + "logps/chosen": -249.1810302734375, + "logps/rejected": -186.2947998046875, + "loss": 0.1855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5497367978096008, + "rewards/margins": 2.0473930835723877, + "rewards/rejected": -2.5971298217773438, + "step": 6996 + }, + { + "epoch": 0.81, + "learning_rate": 5.889031955987358e-08, + "logits/chosen": -1.6344926357269287, + "logits/rejected": -1.6647526025772095, + "logps/chosen": -378.4595642089844, + "logps/rejected": -274.4512939453125, + "loss": 0.2374, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3165944218635559, + "rewards/margins": 2.2828683853149414, + "rewards/rejected": -1.9662740230560303, + "step": 6997 + }, + { + "epoch": 0.81, + "learning_rate": 5.885520309024933e-08, + "logits/chosen": -1.8646836280822754, + "logits/rejected": -2.155508041381836, + "logps/chosen": -361.900146484375, + "logps/rejected": -257.2957458496094, + "loss": 0.6546, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9408642053604126, + "rewards/margins": 1.0430392026901245, + "rewards/rejected": -1.9839036464691162, + "step": 6998 + }, + { + "epoch": 0.81, + "learning_rate": 5.882008662062507e-08, + "logits/chosen": -2.388432502746582, + "logits/rejected": -2.5018084049224854, + "logps/chosen": -95.59988403320312, + "logps/rejected": -139.3844451904297, + "loss": 1.145, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0572092533111572, + "rewards/margins": 0.6995482444763184, + "rewards/rejected": -2.7567572593688965, + "step": 6999 + }, + { + "epoch": 0.81, + "learning_rate": 5.878497015100082e-08, + "logits/chosen": -2.9532229900360107, + "logits/rejected": -2.9840826988220215, + "logps/chosen": -350.7537841796875, + "logps/rejected": -270.036376953125, + "loss": 0.2319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5788795351982117, + "rewards/margins": 1.8238173723220825, + "rewards/rejected": -2.4026968479156494, + "step": 7000 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -1.6331984996795654, + "eval_logits/rejected": -1.508997917175293, + "eval_logps/chosen": -300.5733947753906, + "eval_logps/rejected": -275.28289794921875, + "eval_loss": 0.32897961139678955, + "eval_rewards/accuracies": 0.8428571224212646, + "eval_rewards/chosen": -0.7893202304840088, + "eval_rewards/margins": 2.220229387283325, + "eval_rewards/rejected": -3.009549617767334, + "eval_runtime": 24.4147, + "eval_samples_per_second": 2.867, + "eval_steps_per_second": 1.434, + "step": 7000 + }, + { + "epoch": 0.81, + "learning_rate": 5.8749853681376566e-08, + "logits/chosen": -2.004009246826172, + "logits/rejected": -2.249382734298706, + "logps/chosen": -337.1738586425781, + "logps/rejected": -235.34832763671875, + "loss": 0.2944, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3604500889778137, + "rewards/margins": 2.7777247428894043, + "rewards/rejected": -3.1381750106811523, + "step": 7001 + }, + { + "epoch": 0.81, + "learning_rate": 5.871473721175231e-08, + "logits/chosen": -2.2800703048706055, + "logits/rejected": -2.2966482639312744, + "logps/chosen": -288.99102783203125, + "logps/rejected": -322.8977966308594, + "loss": 0.328, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9851782917976379, + "rewards/margins": 2.0361578464508057, + "rewards/rejected": -3.021336317062378, + "step": 7002 + }, + { + "epoch": 0.81, + "learning_rate": 5.8679620742128054e-08, + "logits/chosen": -2.166234254837036, + "logits/rejected": -2.232252836227417, + "logps/chosen": -180.31231689453125, + "logps/rejected": -248.35177612304688, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.212141990661621, + "rewards/margins": 1.5278992652893066, + "rewards/rejected": -2.7400412559509277, + "step": 7003 + }, + { + "epoch": 0.81, + "learning_rate": 5.8644504272503794e-08, + "logits/chosen": -2.763721466064453, + "logits/rejected": -2.4159131050109863, + "logps/chosen": -326.8785095214844, + "logps/rejected": -273.8802185058594, + "loss": 0.1711, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0254395008087158, + "rewards/margins": 2.271825075149536, + "rewards/rejected": -3.297264575958252, + "step": 7004 + }, + { + "epoch": 0.81, + "learning_rate": 5.860938780287955e-08, + "logits/chosen": -2.4173238277435303, + "logits/rejected": -2.418488025665283, + "logps/chosen": -399.8805236816406, + "logps/rejected": -358.32354736328125, + "loss": 0.5749, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0055242776870728, + "rewards/margins": 2.124933958053589, + "rewards/rejected": -3.130458354949951, + "step": 7005 + }, + { + "epoch": 0.81, + "learning_rate": 5.857427133325529e-08, + "logits/chosen": -2.5977413654327393, + "logits/rejected": -2.5643081665039062, + "logps/chosen": -376.0419006347656, + "logps/rejected": -322.3150329589844, + "loss": 0.3177, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5160046815872192, + "rewards/margins": 2.4517130851745605, + "rewards/rejected": -3.967717409133911, + "step": 7006 + }, + { + "epoch": 0.81, + "learning_rate": 5.853915486363104e-08, + "logits/chosen": -2.2231481075286865, + "logits/rejected": -2.328148365020752, + "logps/chosen": -253.1478729248047, + "logps/rejected": -238.024658203125, + "loss": 2.0512, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.583129405975342, + "rewards/margins": 0.9595119953155518, + "rewards/rejected": -3.5426414012908936, + "step": 7007 + }, + { + "epoch": 0.81, + "learning_rate": 5.850403839400678e-08, + "logits/chosen": -2.2356667518615723, + "logits/rejected": -1.9698785543441772, + "logps/chosen": -261.54974365234375, + "logps/rejected": -386.01605224609375, + "loss": 0.4652, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3279615640640259, + "rewards/margins": 2.524916172027588, + "rewards/rejected": -3.8528778553009033, + "step": 7008 + }, + { + "epoch": 0.81, + "learning_rate": 5.846892192438253e-08, + "logits/chosen": -2.4777350425720215, + "logits/rejected": -2.415152072906494, + "logps/chosen": -258.6273193359375, + "logps/rejected": -246.30215454101562, + "loss": 0.2507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5045626759529114, + "rewards/margins": 1.5441713333129883, + "rewards/rejected": -2.048733949661255, + "step": 7009 + }, + { + "epoch": 0.81, + "learning_rate": 5.843380545475828e-08, + "logits/chosen": -2.553553581237793, + "logits/rejected": -2.568685293197632, + "logps/chosen": -166.77395629882812, + "logps/rejected": -197.92250061035156, + "loss": 0.6293, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.165440320968628, + "rewards/margins": 0.9436530470848083, + "rewards/rejected": -2.109093427658081, + "step": 7010 + }, + { + "epoch": 0.81, + "learning_rate": 5.8398688985134025e-08, + "logits/chosen": -1.9179625511169434, + "logits/rejected": -2.1810126304626465, + "logps/chosen": -401.8837890625, + "logps/rejected": -299.1473388671875, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5955631136894226, + "rewards/margins": 1.042051076889038, + "rewards/rejected": -1.637614130973816, + "step": 7011 + }, + { + "epoch": 0.81, + "learning_rate": 5.8363572515509766e-08, + "logits/chosen": -2.5568878650665283, + "logits/rejected": -2.6173808574676514, + "logps/chosen": -413.1700439453125, + "logps/rejected": -233.42324829101562, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8629463911056519, + "rewards/margins": 1.8908220529556274, + "rewards/rejected": -2.7537684440612793, + "step": 7012 + }, + { + "epoch": 0.81, + "learning_rate": 5.832845604588552e-08, + "logits/chosen": -2.3933157920837402, + "logits/rejected": -2.4855363368988037, + "logps/chosen": -362.3068542480469, + "logps/rejected": -337.4557800292969, + "loss": 0.7258, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9411617517471313, + "rewards/margins": 2.033318042755127, + "rewards/rejected": -2.974480152130127, + "step": 7013 + }, + { + "epoch": 0.81, + "learning_rate": 5.829333957626126e-08, + "logits/chosen": -2.830779552459717, + "logits/rejected": -2.6627981662750244, + "logps/chosen": -108.31309509277344, + "logps/rejected": -239.21116638183594, + "loss": 0.2019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3398672342300415, + "rewards/margins": 2.568061351776123, + "rewards/rejected": -3.907928466796875, + "step": 7014 + }, + { + "epoch": 0.81, + "learning_rate": 5.8258223106637014e-08, + "logits/chosen": -2.2563321590423584, + "logits/rejected": -2.378906726837158, + "logps/chosen": -523.3800048828125, + "logps/rejected": -383.06072998046875, + "loss": 0.521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7976995706558228, + "rewards/margins": 3.074474334716797, + "rewards/rejected": -3.87217378616333, + "step": 7015 + }, + { + "epoch": 0.81, + "learning_rate": 5.8223106637012755e-08, + "logits/chosen": -2.79473876953125, + "logits/rejected": -2.683104991912842, + "logps/chosen": -294.76715087890625, + "logps/rejected": -308.6383056640625, + "loss": 0.0723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8917621970176697, + "rewards/margins": 4.68392276763916, + "rewards/rejected": -5.575685501098633, + "step": 7016 + }, + { + "epoch": 0.81, + "learning_rate": 5.81879901673885e-08, + "logits/chosen": -2.2294325828552246, + "logits/rejected": -2.476609468460083, + "logps/chosen": -336.1642150878906, + "logps/rejected": -218.7975311279297, + "loss": 0.2398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9802026152610779, + "rewards/margins": 2.3303539752960205, + "rewards/rejected": -3.310556411743164, + "step": 7017 + }, + { + "epoch": 0.81, + "learning_rate": 5.815287369776425e-08, + "logits/chosen": -1.9742802381515503, + "logits/rejected": -1.9482016563415527, + "logps/chosen": -156.38839721679688, + "logps/rejected": -238.44952392578125, + "loss": 0.6806, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.279839277267456, + "rewards/margins": 2.0963964462280273, + "rewards/rejected": -3.3762354850769043, + "step": 7018 + }, + { + "epoch": 0.81, + "learning_rate": 5.8117757228139996e-08, + "logits/chosen": -2.4469566345214844, + "logits/rejected": -2.704575538635254, + "logps/chosen": -435.99676513671875, + "logps/rejected": -323.3425598144531, + "loss": 0.4162, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9149874448776245, + "rewards/margins": 2.018821954727173, + "rewards/rejected": -3.933809280395508, + "step": 7019 + }, + { + "epoch": 0.81, + "learning_rate": 5.808264075851574e-08, + "logits/chosen": -2.209064483642578, + "logits/rejected": -2.214794635772705, + "logps/chosen": -400.3335876464844, + "logps/rejected": -436.7138366699219, + "loss": 0.2533, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6254260540008545, + "rewards/margins": 2.065342664718628, + "rewards/rejected": -3.6907689571380615, + "step": 7020 + }, + { + "epoch": 0.81, + "learning_rate": 5.804752428889149e-08, + "logits/chosen": -2.8314261436462402, + "logits/rejected": -2.4012417793273926, + "logps/chosen": -170.69064331054688, + "logps/rejected": -255.8707275390625, + "loss": 0.1392, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5601309537887573, + "rewards/margins": 4.486408710479736, + "rewards/rejected": -5.046539783477783, + "step": 7021 + }, + { + "epoch": 0.81, + "learning_rate": 5.801240781926723e-08, + "logits/chosen": -2.753450632095337, + "logits/rejected": -2.6847152709960938, + "logps/chosen": -243.33047485351562, + "logps/rejected": -195.1016845703125, + "loss": 0.1227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7934098243713379, + "rewards/margins": 2.611690044403076, + "rewards/rejected": -3.405100107192993, + "step": 7022 + }, + { + "epoch": 0.81, + "learning_rate": 5.797729134964298e-08, + "logits/chosen": -2.4107871055603027, + "logits/rejected": -2.6173744201660156, + "logps/chosen": -419.68115234375, + "logps/rejected": -265.9422607421875, + "loss": 0.2859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6876038908958435, + "rewards/margins": 2.0085277557373047, + "rewards/rejected": -2.696131467819214, + "step": 7023 + }, + { + "epoch": 0.81, + "learning_rate": 5.7942174880018726e-08, + "logits/chosen": -2.221273183822632, + "logits/rejected": -2.3024871349334717, + "logps/chosen": -412.50640869140625, + "logps/rejected": -378.9554443359375, + "loss": 0.3768, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8756352663040161, + "rewards/margins": 1.5022809505462646, + "rewards/rejected": -3.3779163360595703, + "step": 7024 + }, + { + "epoch": 0.81, + "learning_rate": 5.790705841039447e-08, + "logits/chosen": -1.9954384565353394, + "logits/rejected": -2.3577303886413574, + "logps/chosen": -518.5911254882812, + "logps/rejected": -294.9359436035156, + "loss": 0.564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.615550696849823, + "rewards/margins": 1.5462766885757446, + "rewards/rejected": -2.161827325820923, + "step": 7025 + }, + { + "epoch": 0.81, + "learning_rate": 5.7871941940770214e-08, + "logits/chosen": -2.3558855056762695, + "logits/rejected": -2.4514312744140625, + "logps/chosen": -311.6669006347656, + "logps/rejected": -282.67864990234375, + "loss": 0.4155, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1047344207763672, + "rewards/margins": 2.974092960357666, + "rewards/rejected": -4.078827381134033, + "step": 7026 + }, + { + "epoch": 0.81, + "learning_rate": 5.783682547114597e-08, + "logits/chosen": -2.61409330368042, + "logits/rejected": -2.779327630996704, + "logps/chosen": -459.1101989746094, + "logps/rejected": -279.38421630859375, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07397165894508362, + "rewards/margins": 2.3334128856658936, + "rewards/rejected": -2.4073843955993652, + "step": 7027 + }, + { + "epoch": 0.81, + "learning_rate": 5.780170900152171e-08, + "logits/chosen": -2.297487497329712, + "logits/rejected": -2.2555651664733887, + "logps/chosen": -238.39678955078125, + "logps/rejected": -313.2664489746094, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2064049243927002, + "rewards/margins": 1.6588999032974243, + "rewards/rejected": -2.865304708480835, + "step": 7028 + }, + { + "epoch": 0.81, + "learning_rate": 5.776659253189746e-08, + "logits/chosen": -2.3393750190734863, + "logits/rejected": -2.562971830368042, + "logps/chosen": -257.60601806640625, + "logps/rejected": -191.50823974609375, + "loss": 0.5204, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7650994062423706, + "rewards/margins": 1.4119833707809448, + "rewards/rejected": -2.1770827770233154, + "step": 7029 + }, + { + "epoch": 0.81, + "learning_rate": 5.77314760622732e-08, + "logits/chosen": -2.5435657501220703, + "logits/rejected": -2.6554579734802246, + "logps/chosen": -221.1560516357422, + "logps/rejected": -226.07345581054688, + "loss": 0.2634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3567548990249634, + "rewards/margins": 2.3854894638061523, + "rewards/rejected": -2.742244243621826, + "step": 7030 + }, + { + "epoch": 0.81, + "learning_rate": 5.769635959264895e-08, + "logits/chosen": -2.4440977573394775, + "logits/rejected": -2.381467342376709, + "logps/chosen": -345.2782897949219, + "logps/rejected": -294.3912048339844, + "loss": 0.3543, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6228408217430115, + "rewards/margins": 2.355459690093994, + "rewards/rejected": -2.9783005714416504, + "step": 7031 + }, + { + "epoch": 0.81, + "learning_rate": 5.76612431230247e-08, + "logits/chosen": -2.1454176902770996, + "logits/rejected": -2.67305326461792, + "logps/chosen": -395.7582702636719, + "logps/rejected": -387.0134582519531, + "loss": 0.6053, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.513746976852417, + "rewards/margins": 1.1781738996505737, + "rewards/rejected": -2.691920757293701, + "step": 7032 + }, + { + "epoch": 0.81, + "learning_rate": 5.7626126653400444e-08, + "logits/chosen": -2.6379334926605225, + "logits/rejected": -2.625582218170166, + "logps/chosen": -332.3638916015625, + "logps/rejected": -162.98568725585938, + "loss": 0.4319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6667803525924683, + "rewards/margins": 1.6794829368591309, + "rewards/rejected": -2.3462634086608887, + "step": 7033 + }, + { + "epoch": 0.81, + "learning_rate": 5.7591010183776185e-08, + "logits/chosen": -2.6148931980133057, + "logits/rejected": -2.553415298461914, + "logps/chosen": -194.74497985839844, + "logps/rejected": -184.74989318847656, + "loss": 0.5863, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6891196966171265, + "rewards/margins": 0.4482032060623169, + "rewards/rejected": -2.1373229026794434, + "step": 7034 + }, + { + "epoch": 0.81, + "learning_rate": 5.755589371415194e-08, + "logits/chosen": -2.528122663497925, + "logits/rejected": -2.4676804542541504, + "logps/chosen": -296.624267578125, + "logps/rejected": -169.0735321044922, + "loss": 0.982, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.392411231994629, + "rewards/margins": 1.0047049522399902, + "rewards/rejected": -2.39711594581604, + "step": 7035 + }, + { + "epoch": 0.81, + "learning_rate": 5.752077724452768e-08, + "logits/chosen": -2.2108471393585205, + "logits/rejected": -2.5361692905426025, + "logps/chosen": -391.3482360839844, + "logps/rejected": -252.47076416015625, + "loss": 0.4097, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.085898518562317, + "rewards/margins": 1.4112306833267212, + "rewards/rejected": -2.497128963470459, + "step": 7036 + }, + { + "epoch": 0.81, + "learning_rate": 5.7485660774903433e-08, + "logits/chosen": -2.7723963260650635, + "logits/rejected": -2.699674367904663, + "logps/chosen": -70.42849731445312, + "logps/rejected": -230.2744140625, + "loss": 0.1235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3172210156917572, + "rewards/margins": 3.9987173080444336, + "rewards/rejected": -4.3159379959106445, + "step": 7037 + }, + { + "epoch": 0.81, + "learning_rate": 5.7450544305279174e-08, + "logits/chosen": -2.8865108489990234, + "logits/rejected": -2.6413822174072266, + "logps/chosen": -301.2420654296875, + "logps/rejected": -232.08590698242188, + "loss": 0.493, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3230558633804321, + "rewards/margins": 1.1261857748031616, + "rewards/rejected": -2.4492416381835938, + "step": 7038 + }, + { + "epoch": 0.81, + "learning_rate": 5.741542783565492e-08, + "logits/chosen": -2.5918941497802734, + "logits/rejected": -2.441594362258911, + "logps/chosen": -166.78506469726562, + "logps/rejected": -214.417236328125, + "loss": 0.1433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8804888129234314, + "rewards/margins": 3.4401614665985107, + "rewards/rejected": -4.320650100708008, + "step": 7039 + }, + { + "epoch": 0.81, + "learning_rate": 5.738031136603066e-08, + "logits/chosen": -2.3698654174804688, + "logits/rejected": -2.435602903366089, + "logps/chosen": -282.1539001464844, + "logps/rejected": -393.0101318359375, + "loss": 0.602, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.012303352355957, + "rewards/margins": 1.2420620918273926, + "rewards/rejected": -3.2543654441833496, + "step": 7040 + }, + { + "epoch": 0.81, + "learning_rate": 5.7345194896406416e-08, + "logits/chosen": -2.60380220413208, + "logits/rejected": -2.8387415409088135, + "logps/chosen": -240.3653564453125, + "logps/rejected": -212.53985595703125, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26490914821624756, + "rewards/margins": 2.403752326965332, + "rewards/rejected": -2.66866135597229, + "step": 7041 + }, + { + "epoch": 0.81, + "learning_rate": 5.7310078426782156e-08, + "logits/chosen": -2.123818874359131, + "logits/rejected": -1.8854553699493408, + "logps/chosen": -321.37823486328125, + "logps/rejected": -323.38275146484375, + "loss": 0.3832, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4301697909832001, + "rewards/margins": 1.1149108409881592, + "rewards/rejected": -1.545080542564392, + "step": 7042 + }, + { + "epoch": 0.81, + "learning_rate": 5.727496195715791e-08, + "logits/chosen": -2.458160400390625, + "logits/rejected": -2.4013912677764893, + "logps/chosen": -143.17530822753906, + "logps/rejected": -231.94190979003906, + "loss": 0.4198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3465132713317871, + "rewards/margins": 1.7721927165985107, + "rewards/rejected": -2.118705987930298, + "step": 7043 + }, + { + "epoch": 0.81, + "learning_rate": 5.723984548753365e-08, + "logits/chosen": -1.9738671779632568, + "logits/rejected": -2.0073533058166504, + "logps/chosen": -289.0260009765625, + "logps/rejected": -289.427978515625, + "loss": 0.3373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5470874309539795, + "rewards/margins": 3.1198620796203613, + "rewards/rejected": -3.666949510574341, + "step": 7044 + }, + { + "epoch": 0.81, + "learning_rate": 5.72047290179094e-08, + "logits/chosen": -2.554351329803467, + "logits/rejected": -2.409162759780884, + "logps/chosen": -110.04646301269531, + "logps/rejected": -275.3749694824219, + "loss": 0.4338, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0463569164276123, + "rewards/margins": 2.1907358169555664, + "rewards/rejected": -3.2370927333831787, + "step": 7045 + }, + { + "epoch": 0.81, + "learning_rate": 5.7169612548285145e-08, + "logits/chosen": -2.3618783950805664, + "logits/rejected": -2.315739154815674, + "logps/chosen": -236.59161376953125, + "logps/rejected": -253.40298461914062, + "loss": 0.1494, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.81339430809021, + "rewards/margins": 3.5855660438537598, + "rewards/rejected": -4.398960590362549, + "step": 7046 + }, + { + "epoch": 0.81, + "learning_rate": 5.713449607866089e-08, + "logits/chosen": -2.5609662532806396, + "logits/rejected": -2.76776123046875, + "logps/chosen": -355.9845275878906, + "logps/rejected": -347.188720703125, + "loss": 0.1373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48167070746421814, + "rewards/margins": 2.9336647987365723, + "rewards/rejected": -3.4153354167938232, + "step": 7047 + }, + { + "epoch": 0.81, + "learning_rate": 5.709937960903663e-08, + "logits/chosen": -2.611384630203247, + "logits/rejected": -2.650163173675537, + "logps/chosen": -187.63157653808594, + "logps/rejected": -283.6307067871094, + "loss": 0.4541, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3047289848327637, + "rewards/margins": 1.6306947469711304, + "rewards/rejected": -2.9354236125946045, + "step": 7048 + }, + { + "epoch": 0.81, + "learning_rate": 5.706426313941239e-08, + "logits/chosen": -2.012169361114502, + "logits/rejected": -2.3593497276306152, + "logps/chosen": -437.9981994628906, + "logps/rejected": -269.57470703125, + "loss": 0.5434, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05786267668008804, + "rewards/margins": 0.8118298649787903, + "rewards/rejected": -0.7539671063423157, + "step": 7049 + }, + { + "epoch": 0.81, + "learning_rate": 5.702914666978813e-08, + "logits/chosen": -1.68194580078125, + "logits/rejected": -2.0931143760681152, + "logps/chosen": -270.4744873046875, + "logps/rejected": -201.0367431640625, + "loss": 0.587, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6852275729179382, + "rewards/margins": 1.3089568614959717, + "rewards/rejected": -1.9941844940185547, + "step": 7050 + }, + { + "epoch": 0.81, + "learning_rate": 5.699403020016388e-08, + "logits/chosen": -1.9948463439941406, + "logits/rejected": -1.9361990690231323, + "logps/chosen": -233.7935028076172, + "logps/rejected": -409.6302490234375, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5635000467300415, + "rewards/margins": 2.884984254837036, + "rewards/rejected": -3.448484182357788, + "step": 7051 + }, + { + "epoch": 0.81, + "learning_rate": 5.695891373053962e-08, + "logits/chosen": -2.425110340118408, + "logits/rejected": -2.5481138229370117, + "logps/chosen": -326.44036865234375, + "logps/rejected": -307.2161865234375, + "loss": 0.3221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46501630544662476, + "rewards/margins": 2.8275320529937744, + "rewards/rejected": -3.292548418045044, + "step": 7052 + }, + { + "epoch": 0.81, + "learning_rate": 5.692379726091536e-08, + "logits/chosen": -1.8442027568817139, + "logits/rejected": -1.700461506843567, + "logps/chosen": -249.5424041748047, + "logps/rejected": -305.7444152832031, + "loss": 0.8261, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8271390199661255, + "rewards/margins": 1.1111007928848267, + "rewards/rejected": -2.938239812850952, + "step": 7053 + }, + { + "epoch": 0.81, + "learning_rate": 5.6888680791291116e-08, + "logits/chosen": -2.1902639865875244, + "logits/rejected": -2.637840509414673, + "logps/chosen": -313.8163757324219, + "logps/rejected": -218.492919921875, + "loss": 1.001, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.907599925994873, + "rewards/margins": 0.32997143268585205, + "rewards/rejected": -2.2375712394714355, + "step": 7054 + }, + { + "epoch": 0.81, + "learning_rate": 5.685356432166686e-08, + "logits/chosen": -2.4859776496887207, + "logits/rejected": -2.5372750759124756, + "logps/chosen": -354.6621398925781, + "logps/rejected": -339.5750732421875, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4587794244289398, + "rewards/margins": 3.433932304382324, + "rewards/rejected": -3.892712116241455, + "step": 7055 + }, + { + "epoch": 0.81, + "learning_rate": 5.6818447852042604e-08, + "logits/chosen": -1.6903433799743652, + "logits/rejected": -1.820959210395813, + "logps/chosen": -238.31390380859375, + "logps/rejected": -210.59921264648438, + "loss": 0.3991, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0868330001831055, + "rewards/margins": 1.8039573431015015, + "rewards/rejected": -2.8907904624938965, + "step": 7056 + }, + { + "epoch": 0.81, + "learning_rate": 5.6783331382418345e-08, + "logits/chosen": -2.1951427459716797, + "logits/rejected": -2.0021183490753174, + "logps/chosen": -352.0788269042969, + "logps/rejected": -279.28570556640625, + "loss": 0.5871, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0362718105316162, + "rewards/margins": 1.8061951398849487, + "rewards/rejected": -2.8424668312072754, + "step": 7057 + }, + { + "epoch": 0.81, + "learning_rate": 5.67482149127941e-08, + "logits/chosen": -1.9039167165756226, + "logits/rejected": -1.8862395286560059, + "logps/chosen": -395.69647216796875, + "logps/rejected": -378.4107666015625, + "loss": 0.4904, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6532402634620667, + "rewards/margins": 1.288754940032959, + "rewards/rejected": -1.94199538230896, + "step": 7058 + }, + { + "epoch": 0.81, + "learning_rate": 5.671309844316984e-08, + "logits/chosen": -2.6776809692382812, + "logits/rejected": -2.6911933422088623, + "logps/chosen": -139.3343048095703, + "logps/rejected": -209.8145751953125, + "loss": 0.2297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6443739533424377, + "rewards/margins": 3.5650768280029297, + "rewards/rejected": -4.209450721740723, + "step": 7059 + }, + { + "epoch": 0.81, + "learning_rate": 5.667798197354559e-08, + "logits/chosen": -2.1190147399902344, + "logits/rejected": -2.0942459106445312, + "logps/chosen": -425.87506103515625, + "logps/rejected": -416.4208984375, + "loss": 0.2999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7610870599746704, + "rewards/margins": 2.9659459590911865, + "rewards/rejected": -3.7270328998565674, + "step": 7060 + }, + { + "epoch": 0.81, + "learning_rate": 5.6642865503921334e-08, + "logits/chosen": -2.1511123180389404, + "logits/rejected": -2.254532814025879, + "logps/chosen": -354.63995361328125, + "logps/rejected": -363.0119323730469, + "loss": 0.1796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4380391538143158, + "rewards/margins": 2.581371784210205, + "rewards/rejected": -3.019411087036133, + "step": 7061 + }, + { + "epoch": 0.81, + "learning_rate": 5.660774903429708e-08, + "logits/chosen": -1.8683836460113525, + "logits/rejected": -2.3932080268859863, + "logps/chosen": -288.3729553222656, + "logps/rejected": -173.4343719482422, + "loss": 0.6774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6839618682861328, + "rewards/margins": 1.5725998878479004, + "rewards/rejected": -2.256561517715454, + "step": 7062 + }, + { + "epoch": 0.81, + "learning_rate": 5.657263256467283e-08, + "logits/chosen": -2.4340457916259766, + "logits/rejected": -2.5623602867126465, + "logps/chosen": -339.0842590332031, + "logps/rejected": -263.3531799316406, + "loss": 0.5137, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.642436146736145, + "rewards/margins": 1.9179052114486694, + "rewards/rejected": -3.5603411197662354, + "step": 7063 + }, + { + "epoch": 0.81, + "learning_rate": 5.6537516095048576e-08, + "logits/chosen": -2.002734661102295, + "logits/rejected": -2.0782508850097656, + "logps/chosen": -284.87921142578125, + "logps/rejected": -246.2678680419922, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0923330783843994, + "rewards/margins": 1.7683831453323364, + "rewards/rejected": -2.8607163429260254, + "step": 7064 + }, + { + "epoch": 0.81, + "learning_rate": 5.6502399625424316e-08, + "logits/chosen": -2.1668028831481934, + "logits/rejected": -1.9860082864761353, + "logps/chosen": -332.266357421875, + "logps/rejected": -351.4826965332031, + "loss": 0.2425, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17626360058784485, + "rewards/margins": 3.448281764984131, + "rewards/rejected": -3.624545097351074, + "step": 7065 + }, + { + "epoch": 0.81, + "learning_rate": 5.646728315580007e-08, + "logits/chosen": -2.022064685821533, + "logits/rejected": -2.3054351806640625, + "logps/chosen": -378.5797424316406, + "logps/rejected": -251.1720733642578, + "loss": 0.1362, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007058333605527878, + "rewards/margins": 2.5430703163146973, + "rewards/rejected": -2.5360116958618164, + "step": 7066 + }, + { + "epoch": 0.81, + "learning_rate": 5.643216668617581e-08, + "logits/chosen": -2.4343316555023193, + "logits/rejected": -2.6257333755493164, + "logps/chosen": -213.930419921875, + "logps/rejected": -317.89459228515625, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9530110359191895, + "rewards/margins": 3.7353315353393555, + "rewards/rejected": -4.688343048095703, + "step": 7067 + }, + { + "epoch": 0.81, + "learning_rate": 5.6397050216551565e-08, + "logits/chosen": -2.9656848907470703, + "logits/rejected": -2.9845566749572754, + "logps/chosen": -162.81655883789062, + "logps/rejected": -271.1162109375, + "loss": 0.2422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.845746636390686, + "rewards/margins": 3.939739465713501, + "rewards/rejected": -4.785486221313477, + "step": 7068 + }, + { + "epoch": 0.81, + "learning_rate": 5.6361933746927305e-08, + "logits/chosen": -2.1793267726898193, + "logits/rejected": -1.7019636631011963, + "logps/chosen": -307.4990539550781, + "logps/rejected": -434.12921142578125, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7240476608276367, + "rewards/margins": 2.110060214996338, + "rewards/rejected": -2.8341076374053955, + "step": 7069 + }, + { + "epoch": 0.82, + "learning_rate": 5.632681727730305e-08, + "logits/chosen": -2.4600861072540283, + "logits/rejected": -2.675184726715088, + "logps/chosen": -293.225341796875, + "logps/rejected": -127.49349975585938, + "loss": 0.3007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7535724639892578, + "rewards/margins": 1.2201498746871948, + "rewards/rejected": -1.973722219467163, + "step": 7070 + }, + { + "epoch": 0.82, + "learning_rate": 5.629170080767879e-08, + "logits/chosen": -1.6361894607543945, + "logits/rejected": -1.7306393384933472, + "logps/chosen": -393.16900634765625, + "logps/rejected": -373.7480773925781, + "loss": 0.4269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7889639139175415, + "rewards/margins": 2.212904930114746, + "rewards/rejected": -3.001868724822998, + "step": 7071 + }, + { + "epoch": 0.82, + "learning_rate": 5.625658433805455e-08, + "logits/chosen": -2.0681357383728027, + "logits/rejected": -2.3945512771606445, + "logps/chosen": -323.90216064453125, + "logps/rejected": -225.08535766601562, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1559077650308609, + "rewards/margins": 1.3723703622817993, + "rewards/rejected": -1.528278112411499, + "step": 7072 + }, + { + "epoch": 0.82, + "learning_rate": 5.622146786843029e-08, + "logits/chosen": -2.756662130355835, + "logits/rejected": -2.7518208026885986, + "logps/chosen": -249.09127807617188, + "logps/rejected": -274.86083984375, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13360299170017242, + "rewards/margins": 3.3414437770843506, + "rewards/rejected": -3.4750471115112305, + "step": 7073 + }, + { + "epoch": 0.82, + "learning_rate": 5.618635139880604e-08, + "logits/chosen": -1.9766474962234497, + "logits/rejected": -1.8087236881256104, + "logps/chosen": -226.14059448242188, + "logps/rejected": -240.52432250976562, + "loss": 0.2948, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24027971923351288, + "rewards/margins": 1.9362664222717285, + "rewards/rejected": -2.176546096801758, + "step": 7074 + }, + { + "epoch": 0.82, + "learning_rate": 5.615123492918178e-08, + "logits/chosen": -2.4700207710266113, + "logits/rejected": -2.381406545639038, + "logps/chosen": -247.760498046875, + "logps/rejected": -206.59765625, + "loss": 0.7381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8324679136276245, + "rewards/margins": 1.61376953125, + "rewards/rejected": -2.446237564086914, + "step": 7075 + }, + { + "epoch": 0.82, + "learning_rate": 5.611611845955753e-08, + "logits/chosen": -2.221179485321045, + "logits/rejected": -2.1273860931396484, + "logps/chosen": -267.45697021484375, + "logps/rejected": -252.7991485595703, + "loss": 0.4094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.911900520324707, + "rewards/margins": 1.8553318977355957, + "rewards/rejected": -2.7672324180603027, + "step": 7076 + }, + { + "epoch": 0.82, + "learning_rate": 5.6081001989933276e-08, + "logits/chosen": -2.368739128112793, + "logits/rejected": -2.3284425735473633, + "logps/chosen": -230.2257080078125, + "logps/rejected": -289.39691162109375, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9554910659790039, + "rewards/margins": 3.350574254989624, + "rewards/rejected": -4.306065082550049, + "step": 7077 + }, + { + "epoch": 0.82, + "learning_rate": 5.6045885520309024e-08, + "logits/chosen": -2.5486695766448975, + "logits/rejected": -2.488175392150879, + "logps/chosen": -198.5979766845703, + "logps/rejected": -298.6623840332031, + "loss": 0.417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4348304569721222, + "rewards/margins": 2.7749946117401123, + "rewards/rejected": -3.209825277328491, + "step": 7078 + }, + { + "epoch": 0.82, + "learning_rate": 5.6010769050684764e-08, + "logits/chosen": -1.847132682800293, + "logits/rejected": -2.28273606300354, + "logps/chosen": -578.1279296875, + "logps/rejected": -381.345458984375, + "loss": 0.7635, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.02010178565979, + "rewards/margins": 0.4087883532047272, + "rewards/rejected": -1.4288901090621948, + "step": 7079 + }, + { + "epoch": 0.82, + "learning_rate": 5.597565258106052e-08, + "logits/chosen": -1.8890166282653809, + "logits/rejected": -1.869277000427246, + "logps/chosen": -296.23095703125, + "logps/rejected": -253.93576049804688, + "loss": 0.4038, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4069940447807312, + "rewards/margins": 1.4408960342407227, + "rewards/rejected": -1.847890019416809, + "step": 7080 + }, + { + "epoch": 0.82, + "learning_rate": 5.594053611143626e-08, + "logits/chosen": -1.9519158601760864, + "logits/rejected": -2.0063111782073975, + "logps/chosen": -351.65301513671875, + "logps/rejected": -307.64093017578125, + "loss": 0.2466, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.369608759880066, + "rewards/margins": 1.724587082862854, + "rewards/rejected": -3.094195604324341, + "step": 7081 + }, + { + "epoch": 0.82, + "learning_rate": 5.590541964181201e-08, + "logits/chosen": -2.2497870922088623, + "logits/rejected": -2.4866209030151367, + "logps/chosen": -283.71954345703125, + "logps/rejected": -247.00540161132812, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5448786616325378, + "rewards/margins": 3.682264804840088, + "rewards/rejected": -4.227143287658691, + "step": 7082 + }, + { + "epoch": 0.82, + "learning_rate": 5.587030317218775e-08, + "logits/chosen": -2.3605594635009766, + "logits/rejected": -2.520933151245117, + "logps/chosen": -643.1189575195312, + "logps/rejected": -373.3144226074219, + "loss": 0.3913, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.199467658996582, + "rewards/margins": 1.4346444606781006, + "rewards/rejected": -2.6341121196746826, + "step": 7083 + }, + { + "epoch": 0.82, + "learning_rate": 5.58351867025635e-08, + "logits/chosen": -2.4988741874694824, + "logits/rejected": -2.5163068771362305, + "logps/chosen": -334.6697998046875, + "logps/rejected": -249.5498504638672, + "loss": 0.9081, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0168565511703491, + "rewards/margins": 2.7495932579040527, + "rewards/rejected": -3.766449451446533, + "step": 7084 + }, + { + "epoch": 0.82, + "learning_rate": 5.580007023293925e-08, + "logits/chosen": -1.8639812469482422, + "logits/rejected": -1.7136328220367432, + "logps/chosen": -228.9183807373047, + "logps/rejected": -148.7114715576172, + "loss": 0.3358, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4598570466041565, + "rewards/margins": 1.375054121017456, + "rewards/rejected": -1.8349111080169678, + "step": 7085 + }, + { + "epoch": 0.82, + "learning_rate": 5.5764953763314995e-08, + "logits/chosen": -1.9961764812469482, + "logits/rejected": -1.9513719081878662, + "logps/chosen": -342.2557067871094, + "logps/rejected": -381.0055847167969, + "loss": 0.1529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010532528162002563, + "rewards/margins": 3.0735225677490234, + "rewards/rejected": -3.062990188598633, + "step": 7086 + }, + { + "epoch": 0.82, + "learning_rate": 5.5729837293690736e-08, + "logits/chosen": -2.7945477962493896, + "logits/rejected": -2.507558822631836, + "logps/chosen": -100.98140716552734, + "logps/rejected": -244.35986328125, + "loss": 0.3395, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2191559076309204, + "rewards/margins": 3.1308581829071045, + "rewards/rejected": -4.350013732910156, + "step": 7087 + }, + { + "epoch": 0.82, + "learning_rate": 5.569472082406649e-08, + "logits/chosen": -2.1768269538879395, + "logits/rejected": -2.6349973678588867, + "logps/chosen": -323.4734191894531, + "logps/rejected": -198.03564453125, + "loss": 0.3496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7357678413391113, + "rewards/margins": 2.8443922996520996, + "rewards/rejected": -3.580160140991211, + "step": 7088 + }, + { + "epoch": 0.82, + "learning_rate": 5.565960435444223e-08, + "logits/chosen": -2.502763032913208, + "logits/rejected": -2.7539479732513428, + "logps/chosen": -329.8480224609375, + "logps/rejected": -171.7523193359375, + "loss": 0.4094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9713897109031677, + "rewards/margins": 1.7579647302627563, + "rewards/rejected": -2.7293543815612793, + "step": 7089 + }, + { + "epoch": 0.82, + "learning_rate": 5.562448788481798e-08, + "logits/chosen": -2.4870243072509766, + "logits/rejected": -2.4867470264434814, + "logps/chosen": -184.96090698242188, + "logps/rejected": -197.78890991210938, + "loss": 0.3229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6342436075210571, + "rewards/margins": 2.0707507133483887, + "rewards/rejected": -2.7049944400787354, + "step": 7090 + }, + { + "epoch": 0.82, + "learning_rate": 5.5589371415193725e-08, + "logits/chosen": -2.520477533340454, + "logits/rejected": -2.3204259872436523, + "logps/chosen": -82.54086303710938, + "logps/rejected": -115.5868911743164, + "loss": 0.2726, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10767592489719391, + "rewards/margins": 2.270519733428955, + "rewards/rejected": -2.3781957626342773, + "step": 7091 + }, + { + "epoch": 0.82, + "learning_rate": 5.555425494556947e-08, + "logits/chosen": -2.254502773284912, + "logits/rejected": -2.5445556640625, + "logps/chosen": -356.0037536621094, + "logps/rejected": -378.17144775390625, + "loss": 0.3707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8089092969894409, + "rewards/margins": 3.1902389526367188, + "rewards/rejected": -3.999148368835449, + "step": 7092 + }, + { + "epoch": 0.82, + "learning_rate": 5.551913847594521e-08, + "logits/chosen": -1.8043854236602783, + "logits/rejected": -1.807718276977539, + "logps/chosen": -315.1115417480469, + "logps/rejected": -257.3134765625, + "loss": 0.4999, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2776379585266113, + "rewards/margins": 1.714411735534668, + "rewards/rejected": -2.9920494556427, + "step": 7093 + }, + { + "epoch": 0.82, + "learning_rate": 5.5484022006320966e-08, + "logits/chosen": -2.139056921005249, + "logits/rejected": -2.0399069786071777, + "logps/chosen": -416.5388488769531, + "logps/rejected": -409.2393493652344, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.258283019065857, + "rewards/margins": 2.9352192878723145, + "rewards/rejected": -4.193502426147461, + "step": 7094 + }, + { + "epoch": 0.82, + "learning_rate": 5.544890553669671e-08, + "logits/chosen": -2.3959968090057373, + "logits/rejected": -2.4459285736083984, + "logps/chosen": -345.0435485839844, + "logps/rejected": -188.8983154296875, + "loss": 0.3297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6617553234100342, + "rewards/margins": 1.9552850723266602, + "rewards/rejected": -2.6170403957366943, + "step": 7095 + }, + { + "epoch": 0.82, + "learning_rate": 5.541378906707246e-08, + "logits/chosen": -2.002683162689209, + "logits/rejected": -2.4040260314941406, + "logps/chosen": -478.4371337890625, + "logps/rejected": -293.310302734375, + "loss": 0.4564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5755845308303833, + "rewards/margins": 3.1446564197540283, + "rewards/rejected": -3.720241069793701, + "step": 7096 + }, + { + "epoch": 0.82, + "learning_rate": 5.53786725974482e-08, + "logits/chosen": -2.0415124893188477, + "logits/rejected": -2.268434524536133, + "logps/chosen": -417.34588623046875, + "logps/rejected": -349.2891845703125, + "loss": 0.2012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07308520376682281, + "rewards/margins": 3.661778450012207, + "rewards/rejected": -3.588693380355835, + "step": 7097 + }, + { + "epoch": 0.82, + "learning_rate": 5.534355612782395e-08, + "logits/chosen": -2.190587043762207, + "logits/rejected": -2.3623781204223633, + "logps/chosen": -377.876708984375, + "logps/rejected": -277.80242919921875, + "loss": 0.1772, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.061254799365997314, + "rewards/margins": 2.248896360397339, + "rewards/rejected": -2.3101511001586914, + "step": 7098 + }, + { + "epoch": 0.82, + "learning_rate": 5.5308439658199696e-08, + "logits/chosen": -2.824688673019409, + "logits/rejected": -2.804532766342163, + "logps/chosen": -115.56138610839844, + "logps/rejected": -166.14149475097656, + "loss": 0.6626, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9250946044921875, + "rewards/margins": 1.8712270259857178, + "rewards/rejected": -2.7963216304779053, + "step": 7099 + }, + { + "epoch": 0.82, + "learning_rate": 5.5273323188575436e-08, + "logits/chosen": -2.4780800342559814, + "logits/rejected": -2.5017223358154297, + "logps/chosen": -233.21786499023438, + "logps/rejected": -254.75436401367188, + "loss": 0.185, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8192859888076782, + "rewards/margins": 2.9701437950134277, + "rewards/rejected": -3.7894296646118164, + "step": 7100 + }, + { + "epoch": 0.82, + "learning_rate": 5.5238206718951184e-08, + "logits/chosen": -2.6500885486602783, + "logits/rejected": -2.8405168056488037, + "logps/chosen": -178.41505432128906, + "logps/rejected": -229.3037567138672, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8981398344039917, + "rewards/margins": 2.584699869155884, + "rewards/rejected": -3.482839822769165, + "step": 7101 + }, + { + "epoch": 0.82, + "learning_rate": 5.520309024932693e-08, + "logits/chosen": -2.052184581756592, + "logits/rejected": -2.0361380577087402, + "logps/chosen": -138.92640686035156, + "logps/rejected": -180.25811767578125, + "loss": 0.9836, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6044631004333496, + "rewards/margins": 0.010133028030395508, + "rewards/rejected": -1.6145962476730347, + "step": 7102 + }, + { + "epoch": 0.82, + "learning_rate": 5.516797377970268e-08, + "logits/chosen": -2.4655117988586426, + "logits/rejected": -2.583146810531616, + "logps/chosen": -221.93313598632812, + "logps/rejected": -238.71957397460938, + "loss": 0.5853, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9594799280166626, + "rewards/margins": 1.6028151512145996, + "rewards/rejected": -3.5622949600219727, + "step": 7103 + }, + { + "epoch": 0.82, + "learning_rate": 5.513285731007842e-08, + "logits/chosen": -2.2683348655700684, + "logits/rejected": -2.5244250297546387, + "logps/chosen": -262.03424072265625, + "logps/rejected": -302.1639099121094, + "loss": 0.3207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9306219816207886, + "rewards/margins": 1.9370700120925903, + "rewards/rejected": -2.867691993713379, + "step": 7104 + }, + { + "epoch": 0.82, + "learning_rate": 5.509774084045417e-08, + "logits/chosen": -2.6437578201293945, + "logits/rejected": -2.7072362899780273, + "logps/chosen": -120.18257141113281, + "logps/rejected": -119.51300048828125, + "loss": 0.6454, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48633110523223877, + "rewards/margins": 1.7849334478378296, + "rewards/rejected": -2.2712645530700684, + "step": 7105 + }, + { + "epoch": 0.82, + "learning_rate": 5.506262437082991e-08, + "logits/chosen": -1.8788599967956543, + "logits/rejected": -2.198746681213379, + "logps/chosen": -384.940673828125, + "logps/rejected": -169.3856201171875, + "loss": 0.2155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37599197030067444, + "rewards/margins": 2.2894346714019775, + "rewards/rejected": -2.665426731109619, + "step": 7106 + }, + { + "epoch": 0.82, + "learning_rate": 5.502750790120566e-08, + "logits/chosen": -2.443718433380127, + "logits/rejected": -2.4881539344787598, + "logps/chosen": -383.0528259277344, + "logps/rejected": -264.5816955566406, + "loss": 0.1968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7016263008117676, + "rewards/margins": 2.96829891204834, + "rewards/rejected": -3.6699252128601074, + "step": 7107 + }, + { + "epoch": 0.82, + "learning_rate": 5.499239143158141e-08, + "logits/chosen": -1.725977897644043, + "logits/rejected": -1.7824034690856934, + "logps/chosen": -256.24273681640625, + "logps/rejected": -286.2607421875, + "loss": 0.3142, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2722061276435852, + "rewards/margins": 2.6583032608032227, + "rewards/rejected": -2.930509328842163, + "step": 7108 + }, + { + "epoch": 0.82, + "learning_rate": 5.4957274961957155e-08, + "logits/chosen": -2.666970729827881, + "logits/rejected": -2.5622425079345703, + "logps/chosen": -154.3745880126953, + "logps/rejected": -268.9423828125, + "loss": 0.3994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7215744853019714, + "rewards/margins": 2.103933334350586, + "rewards/rejected": -2.825507640838623, + "step": 7109 + }, + { + "epoch": 0.82, + "learning_rate": 5.4922158492332896e-08, + "logits/chosen": -1.8569546937942505, + "logits/rejected": -1.81191885471344, + "logps/chosen": -437.6829833984375, + "logps/rejected": -338.07281494140625, + "loss": 0.3338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5185791254043579, + "rewards/margins": 2.376856803894043, + "rewards/rejected": -2.8954360485076904, + "step": 7110 + }, + { + "epoch": 0.82, + "learning_rate": 5.488704202270865e-08, + "logits/chosen": -2.187706470489502, + "logits/rejected": -2.2127444744110107, + "logps/chosen": -198.24261474609375, + "logps/rejected": -197.6126251220703, + "loss": 0.7385, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7203956842422485, + "rewards/margins": 0.8742040395736694, + "rewards/rejected": -2.594599723815918, + "step": 7111 + }, + { + "epoch": 0.82, + "learning_rate": 5.485192555308439e-08, + "logits/chosen": -2.5815770626068115, + "logits/rejected": -2.926347017288208, + "logps/chosen": -240.32501220703125, + "logps/rejected": -236.33499145507812, + "loss": 0.1162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2225843369960785, + "rewards/margins": 3.0416367053985596, + "rewards/rejected": -3.264220952987671, + "step": 7112 + }, + { + "epoch": 0.82, + "learning_rate": 5.4816809083460144e-08, + "logits/chosen": -2.5019514560699463, + "logits/rejected": -2.7279300689697266, + "logps/chosen": -248.61639404296875, + "logps/rejected": -219.82656860351562, + "loss": 0.2932, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1747623682022095, + "rewards/margins": 2.797123432159424, + "rewards/rejected": -3.971885919570923, + "step": 7113 + }, + { + "epoch": 0.82, + "learning_rate": 5.4781692613835885e-08, + "logits/chosen": -2.1194913387298584, + "logits/rejected": -2.0857326984405518, + "logps/chosen": -332.5823974609375, + "logps/rejected": -382.4829406738281, + "loss": 0.2468, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.338355302810669, + "rewards/margins": 3.780796766281128, + "rewards/rejected": -5.119152069091797, + "step": 7114 + }, + { + "epoch": 0.82, + "learning_rate": 5.474657614421163e-08, + "logits/chosen": -2.2649424076080322, + "logits/rejected": -2.6078217029571533, + "logps/chosen": -457.23529052734375, + "logps/rejected": -324.85205078125, + "loss": 0.1735, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0319961309432983, + "rewards/margins": 2.450456380844116, + "rewards/rejected": -3.482452869415283, + "step": 7115 + }, + { + "epoch": 0.82, + "learning_rate": 5.471145967458738e-08, + "logits/chosen": -2.8508191108703613, + "logits/rejected": -2.7100698947906494, + "logps/chosen": -151.71054077148438, + "logps/rejected": -300.2918395996094, + "loss": 0.2094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4502295255661011, + "rewards/margins": 4.576083660125732, + "rewards/rejected": -5.026313304901123, + "step": 7116 + }, + { + "epoch": 0.82, + "learning_rate": 5.4676343204963126e-08, + "logits/chosen": -2.419678211212158, + "logits/rejected": -2.2719879150390625, + "logps/chosen": -314.026611328125, + "logps/rejected": -323.697998046875, + "loss": 0.2342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8271456956863403, + "rewards/margins": 1.6029099225997925, + "rewards/rejected": -2.430055618286133, + "step": 7117 + }, + { + "epoch": 0.82, + "learning_rate": 5.464122673533887e-08, + "logits/chosen": -2.207477569580078, + "logits/rejected": -2.285003185272217, + "logps/chosen": -418.749267578125, + "logps/rejected": -308.58026123046875, + "loss": 0.3283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33260682225227356, + "rewards/margins": 1.7231792211532593, + "rewards/rejected": -2.0557861328125, + "step": 7118 + }, + { + "epoch": 0.82, + "learning_rate": 5.460611026571462e-08, + "logits/chosen": -1.9011023044586182, + "logits/rejected": -2.239638090133667, + "logps/chosen": -292.77313232421875, + "logps/rejected": -260.0099792480469, + "loss": 0.2441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9707956314086914, + "rewards/margins": 2.2535619735717773, + "rewards/rejected": -3.224357843399048, + "step": 7119 + }, + { + "epoch": 0.82, + "learning_rate": 5.457099379609036e-08, + "logits/chosen": -2.2131829261779785, + "logits/rejected": -2.305100202560425, + "logps/chosen": -219.64198303222656, + "logps/rejected": -290.04718017578125, + "loss": 0.6206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9155454635620117, + "rewards/margins": 0.6969982385635376, + "rewards/rejected": -1.6125438213348389, + "step": 7120 + }, + { + "epoch": 0.82, + "learning_rate": 5.4535877326466115e-08, + "logits/chosen": -2.541490316390991, + "logits/rejected": -2.635490894317627, + "logps/chosen": -195.52919006347656, + "logps/rejected": -250.01553344726562, + "loss": 0.321, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1470743417739868, + "rewards/margins": 3.9189629554748535, + "rewards/rejected": -5.066037654876709, + "step": 7121 + }, + { + "epoch": 0.82, + "learning_rate": 5.4500760856841856e-08, + "logits/chosen": -2.173649787902832, + "logits/rejected": -2.432514190673828, + "logps/chosen": -283.40576171875, + "logps/rejected": -322.4058532714844, + "loss": 0.104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5080714225769043, + "rewards/margins": 3.340003728866577, + "rewards/rejected": -3.8480751514434814, + "step": 7122 + }, + { + "epoch": 0.82, + "learning_rate": 5.44656443872176e-08, + "logits/chosen": -2.2733330726623535, + "logits/rejected": -2.484586715698242, + "logps/chosen": -656.6171875, + "logps/rejected": -511.5628662109375, + "loss": 0.3227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.923649787902832, + "rewards/margins": 2.2318201065063477, + "rewards/rejected": -3.1554698944091797, + "step": 7123 + }, + { + "epoch": 0.82, + "learning_rate": 5.4430527917593344e-08, + "logits/chosen": -2.421724557876587, + "logits/rejected": -2.49580979347229, + "logps/chosen": -325.9595031738281, + "logps/rejected": -254.79031372070312, + "loss": 1.4188, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.087810516357422, + "rewards/margins": -0.3735972046852112, + "rewards/rejected": -1.714213252067566, + "step": 7124 + }, + { + "epoch": 0.82, + "learning_rate": 5.43954114479691e-08, + "logits/chosen": -1.6093692779541016, + "logits/rejected": -2.0651626586914062, + "logps/chosen": -270.71807861328125, + "logps/rejected": -194.91119384765625, + "loss": 0.722, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.156537413597107, + "rewards/margins": 0.10467769205570221, + "rewards/rejected": -1.261215090751648, + "step": 7125 + }, + { + "epoch": 0.82, + "learning_rate": 5.436029497834484e-08, + "logits/chosen": -2.5239908695220947, + "logits/rejected": -2.423703670501709, + "logps/chosen": -220.93016052246094, + "logps/rejected": -244.96974182128906, + "loss": 0.118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.810957670211792, + "rewards/margins": 4.762067794799805, + "rewards/rejected": -5.573025226593018, + "step": 7126 + }, + { + "epoch": 0.82, + "learning_rate": 5.432517850872059e-08, + "logits/chosen": -2.473165273666382, + "logits/rejected": -2.3789265155792236, + "logps/chosen": -192.12689208984375, + "logps/rejected": -149.24905395507812, + "loss": 0.5522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6499751210212708, + "rewards/margins": 0.7605962157249451, + "rewards/rejected": -1.4105714559555054, + "step": 7127 + }, + { + "epoch": 0.82, + "learning_rate": 5.429006203909633e-08, + "logits/chosen": -2.20635986328125, + "logits/rejected": -2.3508808612823486, + "logps/chosen": -232.55813598632812, + "logps/rejected": -233.56381225585938, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.138871192932129, + "rewards/margins": 1.776058554649353, + "rewards/rejected": -2.9149293899536133, + "step": 7128 + }, + { + "epoch": 0.82, + "learning_rate": 5.425494556947208e-08, + "logits/chosen": -2.551095962524414, + "logits/rejected": -2.575206995010376, + "logps/chosen": -400.002197265625, + "logps/rejected": -333.4089660644531, + "loss": 0.308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6089664101600647, + "rewards/margins": 1.9910258054733276, + "rewards/rejected": -2.599992036819458, + "step": 7129 + }, + { + "epoch": 0.82, + "learning_rate": 5.421982909984783e-08, + "logits/chosen": -2.405886650085449, + "logits/rejected": -2.4057748317718506, + "logps/chosen": -208.29310607910156, + "logps/rejected": -309.6057434082031, + "loss": 0.1029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0777133703231812, + "rewards/margins": 3.807872772216797, + "rewards/rejected": -4.885586261749268, + "step": 7130 + }, + { + "epoch": 0.82, + "learning_rate": 5.4184712630223574e-08, + "logits/chosen": -2.2199456691741943, + "logits/rejected": -2.2114346027374268, + "logps/chosen": -507.07830810546875, + "logps/rejected": -352.2135314941406, + "loss": 0.4971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9912718534469604, + "rewards/margins": 0.9989370107650757, + "rewards/rejected": -1.9902088642120361, + "step": 7131 + }, + { + "epoch": 0.82, + "learning_rate": 5.4149596160599315e-08, + "logits/chosen": -2.064380168914795, + "logits/rejected": -2.248276710510254, + "logps/chosen": -171.64515686035156, + "logps/rejected": -177.49014282226562, + "loss": 0.3311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5463807582855225, + "rewards/margins": 1.624342918395996, + "rewards/rejected": -2.1707236766815186, + "step": 7132 + }, + { + "epoch": 0.82, + "learning_rate": 5.411447969097507e-08, + "logits/chosen": -2.938338279724121, + "logits/rejected": -2.996809959411621, + "logps/chosen": -279.6861572265625, + "logps/rejected": -216.34254455566406, + "loss": 0.3291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2770240306854248, + "rewards/margins": 2.031003713607788, + "rewards/rejected": -2.308027744293213, + "step": 7133 + }, + { + "epoch": 0.82, + "learning_rate": 5.407936322135081e-08, + "logits/chosen": -2.5718188285827637, + "logits/rejected": -2.3080334663391113, + "logps/chosen": -155.0849151611328, + "logps/rejected": -239.1304473876953, + "loss": 0.4987, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3493235111236572, + "rewards/margins": 2.2138633728027344, + "rewards/rejected": -3.5631868839263916, + "step": 7134 + }, + { + "epoch": 0.82, + "learning_rate": 5.4044246751726563e-08, + "logits/chosen": -2.539527177810669, + "logits/rejected": -2.1627068519592285, + "logps/chosen": -205.34193420410156, + "logps/rejected": -304.0393981933594, + "loss": 0.7439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.813044011592865, + "rewards/margins": 1.258903980255127, + "rewards/rejected": -2.0719480514526367, + "step": 7135 + }, + { + "epoch": 0.82, + "learning_rate": 5.4009130282102304e-08, + "logits/chosen": -2.2090468406677246, + "logits/rejected": -2.6248092651367188, + "logps/chosen": -313.5826110839844, + "logps/rejected": -196.03955078125, + "loss": 1.073, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2374930381774902, + "rewards/margins": 0.11688613891601562, + "rewards/rejected": -1.3543791770935059, + "step": 7136 + }, + { + "epoch": 0.82, + "learning_rate": 5.397401381247805e-08, + "logits/chosen": -2.0112805366516113, + "logits/rejected": -2.168398141860962, + "logps/chosen": -192.41427612304688, + "logps/rejected": -233.59637451171875, + "loss": 0.1179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19376814365386963, + "rewards/margins": 3.692593812942505, + "rewards/rejected": -3.886361598968506, + "step": 7137 + }, + { + "epoch": 0.82, + "learning_rate": 5.39388973428538e-08, + "logits/chosen": -1.8330087661743164, + "logits/rejected": -1.7969868183135986, + "logps/chosen": -353.9332580566406, + "logps/rejected": -423.40191650390625, + "loss": 0.4269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22886089980602264, + "rewards/margins": 3.105919361114502, + "rewards/rejected": -3.33478045463562, + "step": 7138 + }, + { + "epoch": 0.82, + "learning_rate": 5.3903780873229546e-08, + "logits/chosen": -2.530026435852051, + "logits/rejected": -2.3452391624450684, + "logps/chosen": -272.086181640625, + "logps/rejected": -307.4956970214844, + "loss": 0.375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3607080578804016, + "rewards/margins": 2.2484078407287598, + "rewards/rejected": -2.6091160774230957, + "step": 7139 + }, + { + "epoch": 0.82, + "learning_rate": 5.3868664403605286e-08, + "logits/chosen": -1.978267788887024, + "logits/rejected": -2.3172054290771484, + "logps/chosen": -187.3129425048828, + "logps/rejected": -155.6179656982422, + "loss": 0.2494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6923530697822571, + "rewards/margins": 2.142158269882202, + "rewards/rejected": -2.8345112800598145, + "step": 7140 + }, + { + "epoch": 0.82, + "learning_rate": 5.383354793398104e-08, + "logits/chosen": -2.529987335205078, + "logits/rejected": -2.679201126098633, + "logps/chosen": -133.15509033203125, + "logps/rejected": -488.490478515625, + "loss": 0.7197, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.392309546470642, + "rewards/margins": 4.803168296813965, + "rewards/rejected": -6.19547700881958, + "step": 7141 + }, + { + "epoch": 0.82, + "learning_rate": 5.379843146435678e-08, + "logits/chosen": -2.291019916534424, + "logits/rejected": -2.4129533767700195, + "logps/chosen": -331.640625, + "logps/rejected": -178.2600555419922, + "loss": 0.5716, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.031630516052246, + "rewards/margins": 0.8944353461265564, + "rewards/rejected": -1.9260659217834473, + "step": 7142 + }, + { + "epoch": 0.82, + "learning_rate": 5.376331499473253e-08, + "logits/chosen": -2.472857713699341, + "logits/rejected": -2.517756700515747, + "logps/chosen": -434.6249084472656, + "logps/rejected": -220.4656524658203, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9326352477073669, + "rewards/margins": 2.984132766723633, + "rewards/rejected": -3.9167675971984863, + "step": 7143 + }, + { + "epoch": 0.82, + "learning_rate": 5.3728198525108275e-08, + "logits/chosen": -2.108534812927246, + "logits/rejected": -2.390688896179199, + "logps/chosen": -406.3558349609375, + "logps/rejected": -259.78070068359375, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06906281411647797, + "rewards/margins": 2.869485855102539, + "rewards/rejected": -2.9385485649108887, + "step": 7144 + }, + { + "epoch": 0.82, + "learning_rate": 5.369308205548402e-08, + "logits/chosen": -2.2422702312469482, + "logits/rejected": -2.1623029708862305, + "logps/chosen": -341.83074951171875, + "logps/rejected": -351.32647705078125, + "loss": 0.4748, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7593275308609009, + "rewards/margins": 0.8168696761131287, + "rewards/rejected": -1.5761971473693848, + "step": 7145 + }, + { + "epoch": 0.82, + "learning_rate": 5.365796558585976e-08, + "logits/chosen": -2.547623872756958, + "logits/rejected": -2.327434539794922, + "logps/chosen": -243.5827178955078, + "logps/rejected": -278.4971008300781, + "loss": 0.3075, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3390579223632812, + "rewards/margins": 2.392305374145508, + "rewards/rejected": -3.731363296508789, + "step": 7146 + }, + { + "epoch": 0.82, + "learning_rate": 5.362284911623551e-08, + "logits/chosen": -1.9318971633911133, + "logits/rejected": -2.109036445617676, + "logps/chosen": -263.7661437988281, + "logps/rejected": -250.76341247558594, + "loss": 0.5802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09092403948307037, + "rewards/margins": 1.148384928703308, + "rewards/rejected": -1.2393088340759277, + "step": 7147 + }, + { + "epoch": 0.82, + "learning_rate": 5.358773264661126e-08, + "logits/chosen": -2.6596732139587402, + "logits/rejected": -2.464393377304077, + "logps/chosen": -346.9736328125, + "logps/rejected": -253.50523376464844, + "loss": 0.5052, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3993557691574097, + "rewards/margins": 2.54288387298584, + "rewards/rejected": -3.942239284515381, + "step": 7148 + }, + { + "epoch": 0.82, + "learning_rate": 5.3552616176987e-08, + "logits/chosen": -1.8716429471969604, + "logits/rejected": -2.034780502319336, + "logps/chosen": -331.22381591796875, + "logps/rejected": -284.052001953125, + "loss": 0.4166, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0944585800170898, + "rewards/margins": 0.9043800830841064, + "rewards/rejected": -1.9988386631011963, + "step": 7149 + }, + { + "epoch": 0.82, + "learning_rate": 5.351749970736275e-08, + "logits/chosen": -2.1611669063568115, + "logits/rejected": -2.358004331588745, + "logps/chosen": -250.18487548828125, + "logps/rejected": -179.97943115234375, + "loss": 0.7778, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.226472020149231, + "rewards/margins": 1.592092514038086, + "rewards/rejected": -2.8185644149780273, + "step": 7150 + }, + { + "epoch": 0.82, + "learning_rate": 5.348238323773849e-08, + "logits/chosen": -2.518217086791992, + "logits/rejected": -2.748173713684082, + "logps/chosen": -274.85174560546875, + "logps/rejected": -276.90771484375, + "loss": 0.6983, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0343819856643677, + "rewards/margins": 2.154916763305664, + "rewards/rejected": -3.189298629760742, + "step": 7151 + }, + { + "epoch": 0.82, + "learning_rate": 5.3447266768114247e-08, + "logits/chosen": -2.104149580001831, + "logits/rejected": -2.487414598464966, + "logps/chosen": -338.7984924316406, + "logps/rejected": -237.70005798339844, + "loss": 0.5996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8511249423027039, + "rewards/margins": 1.8328890800476074, + "rewards/rejected": -2.684014081954956, + "step": 7152 + }, + { + "epoch": 0.82, + "learning_rate": 5.341215029848999e-08, + "logits/chosen": -2.452911853790283, + "logits/rejected": -2.342311382293701, + "logps/chosen": -330.32501220703125, + "logps/rejected": -298.1870422363281, + "loss": 0.118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5341095924377441, + "rewards/margins": 2.956954002380371, + "rewards/rejected": -3.4910635948181152, + "step": 7153 + }, + { + "epoch": 0.82, + "learning_rate": 5.3377033828865734e-08, + "logits/chosen": -2.6528797149658203, + "logits/rejected": -2.4381303787231445, + "logps/chosen": -106.72041320800781, + "logps/rejected": -169.548828125, + "loss": 0.4723, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5310410261154175, + "rewards/margins": 1.3472378253936768, + "rewards/rejected": -1.8782788515090942, + "step": 7154 + }, + { + "epoch": 0.82, + "learning_rate": 5.334191735924148e-08, + "logits/chosen": -1.782837152481079, + "logits/rejected": -1.8789422512054443, + "logps/chosen": -309.36102294921875, + "logps/rejected": -243.76324462890625, + "loss": 0.1338, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5661271810531616, + "rewards/margins": 2.6485583782196045, + "rewards/rejected": -2.0824310779571533, + "step": 7155 + }, + { + "epoch": 0.82, + "learning_rate": 5.330680088961723e-08, + "logits/chosen": -2.4716925621032715, + "logits/rejected": -2.5101709365844727, + "logps/chosen": -253.6247100830078, + "logps/rejected": -311.02081298828125, + "loss": 0.169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15205445885658264, + "rewards/margins": 2.741499185562134, + "rewards/rejected": -2.8935537338256836, + "step": 7156 + }, + { + "epoch": 0.83, + "learning_rate": 5.327168441999297e-08, + "logits/chosen": -1.8688998222351074, + "logits/rejected": -1.8070323467254639, + "logps/chosen": -372.9170227050781, + "logps/rejected": -368.0705261230469, + "loss": 0.3097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5818227529525757, + "rewards/margins": 2.062072992324829, + "rewards/rejected": -2.6438956260681152, + "step": 7157 + }, + { + "epoch": 0.83, + "learning_rate": 5.323656795036872e-08, + "logits/chosen": -2.1660165786743164, + "logits/rejected": -2.413029909133911, + "logps/chosen": -358.3993835449219, + "logps/rejected": -328.1312255859375, + "loss": 0.1662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5808766484260559, + "rewards/margins": 3.45005202293396, + "rewards/rejected": -4.030928611755371, + "step": 7158 + }, + { + "epoch": 0.83, + "learning_rate": 5.3201451480744464e-08, + "logits/chosen": -2.9197092056274414, + "logits/rejected": -2.5791432857513428, + "logps/chosen": -340.397216796875, + "logps/rejected": -275.0778503417969, + "loss": 0.9418, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4212121963500977, + "rewards/margins": 0.9509994387626648, + "rewards/rejected": -3.3722119331359863, + "step": 7159 + }, + { + "epoch": 0.83, + "learning_rate": 5.316633501112021e-08, + "logits/chosen": -2.8117330074310303, + "logits/rejected": -2.7560458183288574, + "logps/chosen": -179.59205627441406, + "logps/rejected": -182.82098388671875, + "loss": 0.6189, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7462026476860046, + "rewards/margins": 3.8337860107421875, + "rewards/rejected": -4.579988479614258, + "step": 7160 + }, + { + "epoch": 0.83, + "learning_rate": 5.313121854149596e-08, + "logits/chosen": -2.811375379562378, + "logits/rejected": -2.673250436782837, + "logps/chosen": -253.61570739746094, + "logps/rejected": -321.8053894042969, + "loss": 0.2465, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4447867274284363, + "rewards/margins": 2.4794533252716064, + "rewards/rejected": -2.9242401123046875, + "step": 7161 + }, + { + "epoch": 0.83, + "learning_rate": 5.3096102071871706e-08, + "logits/chosen": -2.29842472076416, + "logits/rejected": -2.2165093421936035, + "logps/chosen": -164.83883666992188, + "logps/rejected": -241.24647521972656, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7053269147872925, + "rewards/margins": 2.420008420944214, + "rewards/rejected": -4.125335216522217, + "step": 7162 + }, + { + "epoch": 0.83, + "learning_rate": 5.3060985602247446e-08, + "logits/chosen": -1.9338903427124023, + "logits/rejected": -1.8196074962615967, + "logps/chosen": -380.88909912109375, + "logps/rejected": -359.86712646484375, + "loss": 0.3959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20373797416687012, + "rewards/margins": 3.6299219131469727, + "rewards/rejected": -3.8336598873138428, + "step": 7163 + }, + { + "epoch": 0.83, + "learning_rate": 5.30258691326232e-08, + "logits/chosen": -2.6522464752197266, + "logits/rejected": -2.5447278022766113, + "logps/chosen": -299.14013671875, + "logps/rejected": -276.921875, + "loss": 0.2672, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4213322401046753, + "rewards/margins": 2.952280044555664, + "rewards/rejected": -4.373612403869629, + "step": 7164 + }, + { + "epoch": 0.83, + "learning_rate": 5.299075266299894e-08, + "logits/chosen": -2.7561542987823486, + "logits/rejected": -2.532552480697632, + "logps/chosen": -274.38134765625, + "logps/rejected": -306.2381591796875, + "loss": 0.599, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2962292432785034, + "rewards/margins": 1.2711799144744873, + "rewards/rejected": -2.5674092769622803, + "step": 7165 + }, + { + "epoch": 0.83, + "learning_rate": 5.2955636193374695e-08, + "logits/chosen": -2.4146456718444824, + "logits/rejected": -2.491600275039673, + "logps/chosen": -259.4040832519531, + "logps/rejected": -320.65118408203125, + "loss": 0.7309, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1386644840240479, + "rewards/margins": 2.8314108848571777, + "rewards/rejected": -3.9700751304626465, + "step": 7166 + }, + { + "epoch": 0.83, + "learning_rate": 5.2920519723750435e-08, + "logits/chosen": -2.500852584838867, + "logits/rejected": -2.27298903465271, + "logps/chosen": -180.52188110351562, + "logps/rejected": -272.02813720703125, + "loss": 0.3766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2578537464141846, + "rewards/margins": 1.9975248575210571, + "rewards/rejected": -3.2553787231445312, + "step": 7167 + }, + { + "epoch": 0.83, + "learning_rate": 5.288540325412618e-08, + "logits/chosen": -1.7859941720962524, + "logits/rejected": -1.8496041297912598, + "logps/chosen": -331.417236328125, + "logps/rejected": -354.38067626953125, + "loss": 0.4609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7099527716636658, + "rewards/margins": 1.798471212387085, + "rewards/rejected": -2.5084238052368164, + "step": 7168 + }, + { + "epoch": 0.83, + "learning_rate": 5.285028678450193e-08, + "logits/chosen": -2.6190571784973145, + "logits/rejected": -2.552661895751953, + "logps/chosen": -196.26248168945312, + "logps/rejected": -267.36846923828125, + "loss": 0.1864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31080567836761475, + "rewards/margins": 2.462019681930542, + "rewards/rejected": -2.772825241088867, + "step": 7169 + }, + { + "epoch": 0.83, + "learning_rate": 5.281517031487768e-08, + "logits/chosen": -2.404233932495117, + "logits/rejected": -2.461151599884033, + "logps/chosen": -331.49371337890625, + "logps/rejected": -207.09060668945312, + "loss": 0.2426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4124845862388611, + "rewards/margins": 2.082721710205078, + "rewards/rejected": -2.495206356048584, + "step": 7170 + }, + { + "epoch": 0.83, + "learning_rate": 5.278005384525342e-08, + "logits/chosen": -2.7755441665649414, + "logits/rejected": -2.694096565246582, + "logps/chosen": -349.301025390625, + "logps/rejected": -206.1288299560547, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5819769501686096, + "rewards/margins": 2.202749490737915, + "rewards/rejected": -2.78472638130188, + "step": 7171 + }, + { + "epoch": 0.83, + "learning_rate": 5.274493737562917e-08, + "logits/chosen": -1.7631943225860596, + "logits/rejected": -1.7255676984786987, + "logps/chosen": -333.34722900390625, + "logps/rejected": -259.626708984375, + "loss": 0.6033, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.056079626083374, + "rewards/margins": 1.161682367324829, + "rewards/rejected": -2.2177622318267822, + "step": 7172 + }, + { + "epoch": 0.83, + "learning_rate": 5.270982090600491e-08, + "logits/chosen": -1.834728717803955, + "logits/rejected": -2.0313401222229004, + "logps/chosen": -228.4856414794922, + "logps/rejected": -250.0263671875, + "loss": 0.1728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7484372854232788, + "rewards/margins": 2.090771198272705, + "rewards/rejected": -2.8392083644866943, + "step": 7173 + }, + { + "epoch": 0.83, + "learning_rate": 5.267470443638066e-08, + "logits/chosen": -2.438368558883667, + "logits/rejected": -2.5409584045410156, + "logps/chosen": -212.87860107421875, + "logps/rejected": -189.8716278076172, + "loss": 0.3507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1764240562915802, + "rewards/margins": 2.0488743782043457, + "rewards/rejected": -2.2252984046936035, + "step": 7174 + }, + { + "epoch": 0.83, + "learning_rate": 5.2639587966756406e-08, + "logits/chosen": -2.3006227016448975, + "logits/rejected": -2.2397267818450928, + "logps/chosen": -317.05364990234375, + "logps/rejected": -271.06341552734375, + "loss": 0.1491, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5062642097473145, + "rewards/margins": 2.893266201019287, + "rewards/rejected": -3.3995304107666016, + "step": 7175 + }, + { + "epoch": 0.83, + "learning_rate": 5.2604471497132154e-08, + "logits/chosen": -2.611565589904785, + "logits/rejected": -2.6480720043182373, + "logps/chosen": -261.9912414550781, + "logps/rejected": -272.62847900390625, + "loss": 0.3811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0644336938858032, + "rewards/margins": 1.823170781135559, + "rewards/rejected": -2.887604236602783, + "step": 7176 + }, + { + "epoch": 0.83, + "learning_rate": 5.2569355027507894e-08, + "logits/chosen": -2.007169485092163, + "logits/rejected": -1.847978115081787, + "logps/chosen": -356.2672424316406, + "logps/rejected": -289.50799560546875, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5013904571533203, + "rewards/margins": 3.202864170074463, + "rewards/rejected": -4.704254627227783, + "step": 7177 + }, + { + "epoch": 0.83, + "learning_rate": 5.253423855788365e-08, + "logits/chosen": -2.156097650527954, + "logits/rejected": -2.252380132675171, + "logps/chosen": -151.66470336914062, + "logps/rejected": -95.53778076171875, + "loss": 0.7127, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.390446662902832, + "rewards/margins": 0.8397713899612427, + "rewards/rejected": -2.230217933654785, + "step": 7178 + }, + { + "epoch": 0.83, + "learning_rate": 5.249912208825939e-08, + "logits/chosen": -2.244907855987549, + "logits/rejected": -1.7504644393920898, + "logps/chosen": -207.45310974121094, + "logps/rejected": -356.0186462402344, + "loss": 0.3297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30278247594833374, + "rewards/margins": 2.4086453914642334, + "rewards/rejected": -2.711427927017212, + "step": 7179 + }, + { + "epoch": 0.83, + "learning_rate": 5.246400561863514e-08, + "logits/chosen": -1.6858093738555908, + "logits/rejected": -1.7487070560455322, + "logps/chosen": -317.52288818359375, + "logps/rejected": -272.92950439453125, + "loss": 0.3052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8068011403083801, + "rewards/margins": 2.283482074737549, + "rewards/rejected": -3.0902833938598633, + "step": 7180 + }, + { + "epoch": 0.83, + "learning_rate": 5.242888914901088e-08, + "logits/chosen": -2.3607654571533203, + "logits/rejected": -2.1072072982788086, + "logps/chosen": -262.0743103027344, + "logps/rejected": -315.94989013671875, + "loss": 0.1089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9620770215988159, + "rewards/margins": 4.126479625701904, + "rewards/rejected": -5.08855676651001, + "step": 7181 + }, + { + "epoch": 0.83, + "learning_rate": 5.239377267938663e-08, + "logits/chosen": -2.189471483230591, + "logits/rejected": -2.0819287300109863, + "logps/chosen": -325.4284973144531, + "logps/rejected": -317.8225402832031, + "loss": 0.9424, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.9958621263504028, + "rewards/margins": -0.07716190814971924, + "rewards/rejected": -1.918700098991394, + "step": 7182 + }, + { + "epoch": 0.83, + "learning_rate": 5.235865620976238e-08, + "logits/chosen": -2.5197529792785645, + "logits/rejected": -2.7422704696655273, + "logps/chosen": -681.2741088867188, + "logps/rejected": -382.67071533203125, + "loss": 1.0161, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3263136148452759, + "rewards/margins": 1.3588793277740479, + "rewards/rejected": -2.6851930618286133, + "step": 7183 + }, + { + "epoch": 0.83, + "learning_rate": 5.2323539740138125e-08, + "logits/chosen": -1.9750211238861084, + "logits/rejected": -2.198866367340088, + "logps/chosen": -363.58392333984375, + "logps/rejected": -254.80740356445312, + "loss": 0.486, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.140499472618103, + "rewards/margins": 0.9953306317329407, + "rewards/rejected": -2.1358299255371094, + "step": 7184 + }, + { + "epoch": 0.83, + "learning_rate": 5.2288423270513866e-08, + "logits/chosen": -2.3213112354278564, + "logits/rejected": -2.163964033126831, + "logps/chosen": -230.47962951660156, + "logps/rejected": -144.85072326660156, + "loss": 0.4854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6606873273849487, + "rewards/margins": 1.312245488166809, + "rewards/rejected": -1.9729328155517578, + "step": 7185 + }, + { + "epoch": 0.83, + "learning_rate": 5.225330680088962e-08, + "logits/chosen": -1.7757819890975952, + "logits/rejected": -2.177551031112671, + "logps/chosen": -279.2635192871094, + "logps/rejected": -204.81361389160156, + "loss": 0.6286, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2889131307601929, + "rewards/margins": 1.318432092666626, + "rewards/rejected": -2.6073451042175293, + "step": 7186 + }, + { + "epoch": 0.83, + "learning_rate": 5.221819033126536e-08, + "logits/chosen": -2.2514686584472656, + "logits/rejected": -2.3994784355163574, + "logps/chosen": -432.212646484375, + "logps/rejected": -468.0611572265625, + "loss": 0.5939, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33361655473709106, + "rewards/margins": 1.2009074687957764, + "rewards/rejected": -1.5345240831375122, + "step": 7187 + }, + { + "epoch": 0.83, + "learning_rate": 5.2183073861641114e-08, + "logits/chosen": -2.675234794616699, + "logits/rejected": -2.497999906539917, + "logps/chosen": -144.41177368164062, + "logps/rejected": -317.3686218261719, + "loss": 0.2749, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.486955165863037, + "rewards/margins": 2.1059696674346924, + "rewards/rejected": -3.5929248332977295, + "step": 7188 + }, + { + "epoch": 0.83, + "learning_rate": 5.2147957392016855e-08, + "logits/chosen": -2.518085479736328, + "logits/rejected": -2.457730293273926, + "logps/chosen": -368.1285705566406, + "logps/rejected": -378.1603698730469, + "loss": 0.3166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5468845367431641, + "rewards/margins": 3.497922897338867, + "rewards/rejected": -4.044807434082031, + "step": 7189 + }, + { + "epoch": 0.83, + "learning_rate": 5.21128409223926e-08, + "logits/chosen": -2.230729103088379, + "logits/rejected": -2.2873940467834473, + "logps/chosen": -436.5907287597656, + "logps/rejected": -338.6906433105469, + "loss": 0.3959, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2394778728485107, + "rewards/margins": 2.172034978866577, + "rewards/rejected": -3.411512851715088, + "step": 7190 + }, + { + "epoch": 0.83, + "learning_rate": 5.207772445276834e-08, + "logits/chosen": -2.1969099044799805, + "logits/rejected": -2.211914539337158, + "logps/chosen": -300.4556579589844, + "logps/rejected": -388.6889343261719, + "loss": 0.5299, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.281770944595337, + "rewards/margins": 1.5110708475112915, + "rewards/rejected": -2.792841911315918, + "step": 7191 + }, + { + "epoch": 0.83, + "learning_rate": 5.2042607983144096e-08, + "logits/chosen": -2.725314140319824, + "logits/rejected": -2.9392547607421875, + "logps/chosen": -303.039306640625, + "logps/rejected": -253.26092529296875, + "loss": 0.2405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17772209644317627, + "rewards/margins": 2.892256498336792, + "rewards/rejected": -3.069978713989258, + "step": 7192 + }, + { + "epoch": 0.83, + "learning_rate": 5.200749151351984e-08, + "logits/chosen": -2.2116432189941406, + "logits/rejected": -2.1367082595825195, + "logps/chosen": -190.89756774902344, + "logps/rejected": -246.3823699951172, + "loss": 0.2759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43269655108451843, + "rewards/margins": 1.6153066158294678, + "rewards/rejected": -2.0480029582977295, + "step": 7193 + }, + { + "epoch": 0.83, + "learning_rate": 5.197237504389559e-08, + "logits/chosen": -2.39811110496521, + "logits/rejected": -2.559391736984253, + "logps/chosen": -220.576904296875, + "logps/rejected": -283.75799560546875, + "loss": 0.1407, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.018326997756958, + "rewards/margins": 3.7774147987365723, + "rewards/rejected": -4.795741558074951, + "step": 7194 + }, + { + "epoch": 0.83, + "learning_rate": 5.193725857427133e-08, + "logits/chosen": -2.1506941318511963, + "logits/rejected": -2.3738977909088135, + "logps/chosen": -302.43682861328125, + "logps/rejected": -201.6325225830078, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.485377311706543, + "rewards/margins": 2.885329008102417, + "rewards/rejected": -4.370706558227539, + "step": 7195 + }, + { + "epoch": 0.83, + "learning_rate": 5.190214210464707e-08, + "logits/chosen": -2.254434823989868, + "logits/rejected": -2.795938730239868, + "logps/chosen": -344.4391784667969, + "logps/rejected": -259.1894226074219, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1975388526916504, + "rewards/margins": 3.6026368141174316, + "rewards/rejected": -3.4050979614257812, + "step": 7196 + }, + { + "epoch": 0.83, + "learning_rate": 5.1867025635022826e-08, + "logits/chosen": -2.426177978515625, + "logits/rejected": -2.3700850009918213, + "logps/chosen": -289.00286865234375, + "logps/rejected": -287.3529968261719, + "loss": 0.2957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6578540802001953, + "rewards/margins": 2.9179511070251465, + "rewards/rejected": -3.575805187225342, + "step": 7197 + }, + { + "epoch": 0.83, + "learning_rate": 5.1831909165398566e-08, + "logits/chosen": -2.0671873092651367, + "logits/rejected": -2.2005691528320312, + "logps/chosen": -229.76824951171875, + "logps/rejected": -283.3533935546875, + "loss": 0.2396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28792423009872437, + "rewards/margins": 3.1452064514160156, + "rewards/rejected": -3.433130979537964, + "step": 7198 + }, + { + "epoch": 0.83, + "learning_rate": 5.1796792695774314e-08, + "logits/chosen": -1.7950174808502197, + "logits/rejected": -1.8671622276306152, + "logps/chosen": -433.60638427734375, + "logps/rejected": -549.8947143554688, + "loss": 0.6983, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3714492321014404, + "rewards/margins": 2.2124547958374023, + "rewards/rejected": -3.5839037895202637, + "step": 7199 + }, + { + "epoch": 0.83, + "learning_rate": 5.176167622615006e-08, + "logits/chosen": -2.5540428161621094, + "logits/rejected": -2.812823534011841, + "logps/chosen": -337.2231140136719, + "logps/rejected": -296.0423583984375, + "loss": 0.7603, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.451378583908081, + "rewards/margins": 2.0122170448303223, + "rewards/rejected": -3.4635956287384033, + "step": 7200 + }, + { + "epoch": 0.83, + "learning_rate": 5.172655975652581e-08, + "logits/chosen": -2.0300369262695312, + "logits/rejected": -1.8469974994659424, + "logps/chosen": -183.1212921142578, + "logps/rejected": -231.2154998779297, + "loss": 0.35, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2699190378189087, + "rewards/margins": 1.1945980787277222, + "rewards/rejected": -1.4645169973373413, + "step": 7201 + }, + { + "epoch": 0.83, + "learning_rate": 5.169144328690155e-08, + "logits/chosen": -1.6902379989624023, + "logits/rejected": -1.8033596277236938, + "logps/chosen": -354.36761474609375, + "logps/rejected": -340.1617736816406, + "loss": 0.498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46899986267089844, + "rewards/margins": 2.4018666744232178, + "rewards/rejected": -2.870866537094116, + "step": 7202 + }, + { + "epoch": 0.83, + "learning_rate": 5.16563268172773e-08, + "logits/chosen": -2.268850326538086, + "logits/rejected": -2.458117723464966, + "logps/chosen": -212.01846313476562, + "logps/rejected": -191.74256896972656, + "loss": 0.7481, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.302077054977417, + "rewards/margins": 1.2280285358428955, + "rewards/rejected": -2.5301055908203125, + "step": 7203 + }, + { + "epoch": 0.83, + "learning_rate": 5.162121034765304e-08, + "logits/chosen": -2.3208861351013184, + "logits/rejected": -2.0214593410491943, + "logps/chosen": -119.21296691894531, + "logps/rejected": -276.5926818847656, + "loss": 0.2767, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2621908187866211, + "rewards/margins": 3.0067861080169678, + "rewards/rejected": -3.2689766883850098, + "step": 7204 + }, + { + "epoch": 0.83, + "learning_rate": 5.15860938780288e-08, + "logits/chosen": -2.680495023727417, + "logits/rejected": -2.6507272720336914, + "logps/chosen": -172.77685546875, + "logps/rejected": -247.81747436523438, + "loss": 0.2712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47115176916122437, + "rewards/margins": 1.9208496809005737, + "rewards/rejected": -2.3920013904571533, + "step": 7205 + }, + { + "epoch": 0.83, + "learning_rate": 5.155097740840454e-08, + "logits/chosen": -2.4801888465881348, + "logits/rejected": -2.5395703315734863, + "logps/chosen": -259.03265380859375, + "logps/rejected": -322.18212890625, + "loss": 0.452, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40521562099456787, + "rewards/margins": 2.3210067749023438, + "rewards/rejected": -2.726222515106201, + "step": 7206 + }, + { + "epoch": 0.83, + "learning_rate": 5.1515860938780285e-08, + "logits/chosen": -2.685375213623047, + "logits/rejected": -2.9085843563079834, + "logps/chosen": -284.42767333984375, + "logps/rejected": -231.23851013183594, + "loss": 1.2132, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9470069408416748, + "rewards/margins": 0.6922325491905212, + "rewards/rejected": -2.639239549636841, + "step": 7207 + }, + { + "epoch": 0.83, + "learning_rate": 5.1480744469156026e-08, + "logits/chosen": -2.3412044048309326, + "logits/rejected": -2.2293269634246826, + "logps/chosen": -140.95864868164062, + "logps/rejected": -219.87319946289062, + "loss": 0.4004, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5568244457244873, + "rewards/margins": 1.7326797246932983, + "rewards/rejected": -3.289504051208496, + "step": 7208 + }, + { + "epoch": 0.83, + "learning_rate": 5.144562799953178e-08, + "logits/chosen": -2.3712573051452637, + "logits/rejected": -2.216940402984619, + "logps/chosen": -143.93051147460938, + "logps/rejected": -201.6456298828125, + "loss": 0.3323, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.052189588546753, + "rewards/margins": 2.5931344032287598, + "rewards/rejected": -3.6453239917755127, + "step": 7209 + }, + { + "epoch": 0.83, + "learning_rate": 5.141051152990752e-08, + "logits/chosen": -2.5799460411071777, + "logits/rejected": -2.065592050552368, + "logps/chosen": -155.17092895507812, + "logps/rejected": -328.8164367675781, + "loss": 0.3507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9348251819610596, + "rewards/margins": 1.908057689666748, + "rewards/rejected": -2.8428831100463867, + "step": 7210 + }, + { + "epoch": 0.83, + "learning_rate": 5.1375395060283274e-08, + "logits/chosen": -1.8079086542129517, + "logits/rejected": -1.8885241746902466, + "logps/chosen": -427.7331237792969, + "logps/rejected": -397.05078125, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.834671139717102, + "rewards/margins": 2.0311293601989746, + "rewards/rejected": -2.865800142288208, + "step": 7211 + }, + { + "epoch": 0.83, + "learning_rate": 5.1340278590659015e-08, + "logits/chosen": -1.95228910446167, + "logits/rejected": -1.9460948705673218, + "logps/chosen": -327.64013671875, + "logps/rejected": -313.65850830078125, + "loss": 0.3439, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8842681646347046, + "rewards/margins": 2.096390962600708, + "rewards/rejected": -2.980659008026123, + "step": 7212 + }, + { + "epoch": 0.83, + "learning_rate": 5.130516212103476e-08, + "logits/chosen": -2.1580965518951416, + "logits/rejected": -2.3880627155303955, + "logps/chosen": -216.40350341796875, + "logps/rejected": -254.39376831054688, + "loss": 0.5281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.644020676612854, + "rewards/margins": 2.5423617362976074, + "rewards/rejected": -4.18638277053833, + "step": 7213 + }, + { + "epoch": 0.83, + "learning_rate": 5.127004565141051e-08, + "logits/chosen": -2.8138914108276367, + "logits/rejected": -2.8048555850982666, + "logps/chosen": -206.55044555664062, + "logps/rejected": -209.9591827392578, + "loss": 1.0076, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1515631675720215, + "rewards/margins": 1.1644757986068726, + "rewards/rejected": -3.3160388469696045, + "step": 7214 + }, + { + "epoch": 0.83, + "learning_rate": 5.1234929181786256e-08, + "logits/chosen": -2.5435898303985596, + "logits/rejected": -2.5735487937927246, + "logps/chosen": -460.4707336425781, + "logps/rejected": -306.26031494140625, + "loss": 0.4739, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9865454435348511, + "rewards/margins": 1.9258267879486084, + "rewards/rejected": -2.91237211227417, + "step": 7215 + }, + { + "epoch": 0.83, + "learning_rate": 5.1199812712162e-08, + "logits/chosen": -2.068819284439087, + "logits/rejected": -2.3339040279388428, + "logps/chosen": -274.9886474609375, + "logps/rejected": -188.88327026367188, + "loss": 0.2892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4977863132953644, + "rewards/margins": 2.262338399887085, + "rewards/rejected": -2.760124683380127, + "step": 7216 + }, + { + "epoch": 0.83, + "learning_rate": 5.116469624253775e-08, + "logits/chosen": -2.025839328765869, + "logits/rejected": -2.0561347007751465, + "logps/chosen": -169.16802978515625, + "logps/rejected": -200.64137268066406, + "loss": 0.3009, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.029705286026001, + "rewards/margins": 2.6089107990264893, + "rewards/rejected": -3.6386160850524902, + "step": 7217 + }, + { + "epoch": 0.83, + "learning_rate": 5.112957977291349e-08, + "logits/chosen": -2.5921072959899902, + "logits/rejected": -2.3659796714782715, + "logps/chosen": -162.15692138671875, + "logps/rejected": -209.24118041992188, + "loss": 0.6799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6285061836242676, + "rewards/margins": 1.30863356590271, + "rewards/rejected": -1.9371397495269775, + "step": 7218 + }, + { + "epoch": 0.83, + "learning_rate": 5.1094463303289245e-08, + "logits/chosen": -2.7249550819396973, + "logits/rejected": -2.7167863845825195, + "logps/chosen": -317.2860412597656, + "logps/rejected": -259.6837158203125, + "loss": 0.6835, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5386017560958862, + "rewards/margins": 1.7813482284545898, + "rewards/rejected": -3.3199496269226074, + "step": 7219 + }, + { + "epoch": 0.83, + "learning_rate": 5.1059346833664986e-08, + "logits/chosen": -2.022857189178467, + "logits/rejected": -2.286249876022339, + "logps/chosen": -217.19090270996094, + "logps/rejected": -210.1577911376953, + "loss": 0.7456, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8817850947380066, + "rewards/margins": 0.76344233751297, + "rewards/rejected": -1.6452274322509766, + "step": 7220 + }, + { + "epoch": 0.83, + "learning_rate": 5.102423036404073e-08, + "logits/chosen": -1.5514395236968994, + "logits/rejected": -1.7433217763900757, + "logps/chosen": -356.4668273925781, + "logps/rejected": -246.70591735839844, + "loss": 0.3213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.534023106098175, + "rewards/margins": 1.9015636444091797, + "rewards/rejected": -2.435586929321289, + "step": 7221 + }, + { + "epoch": 0.83, + "learning_rate": 5.098911389441648e-08, + "logits/chosen": -2.0925803184509277, + "logits/rejected": -2.1301467418670654, + "logps/chosen": -392.15081787109375, + "logps/rejected": -354.69970703125, + "loss": 0.6861, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4199366569519043, + "rewards/margins": 1.2417306900024414, + "rewards/rejected": -2.661667585372925, + "step": 7222 + }, + { + "epoch": 0.83, + "learning_rate": 5.095399742479223e-08, + "logits/chosen": -2.1928088665008545, + "logits/rejected": -1.872366189956665, + "logps/chosen": -211.82106018066406, + "logps/rejected": -281.49969482421875, + "loss": 0.2172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5761456489562988, + "rewards/margins": 2.5363783836364746, + "rewards/rejected": -3.1125242710113525, + "step": 7223 + }, + { + "epoch": 0.83, + "learning_rate": 5.091888095516797e-08, + "logits/chosen": -2.2244575023651123, + "logits/rejected": -2.117702007293701, + "logps/chosen": -328.3506164550781, + "logps/rejected": -209.4846649169922, + "loss": 0.2613, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1465390920639038, + "rewards/margins": 2.2716641426086426, + "rewards/rejected": -3.418203115463257, + "step": 7224 + }, + { + "epoch": 0.83, + "learning_rate": 5.088376448554372e-08, + "logits/chosen": -2.0217432975769043, + "logits/rejected": -2.0496668815612793, + "logps/chosen": -323.8356628417969, + "logps/rejected": -315.7806091308594, + "loss": 0.295, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0515860319137573, + "rewards/margins": 2.690314769744873, + "rewards/rejected": -3.74190092086792, + "step": 7225 + }, + { + "epoch": 0.83, + "learning_rate": 5.084864801591946e-08, + "logits/chosen": -2.1337413787841797, + "logits/rejected": -2.333298683166504, + "logps/chosen": -490.38177490234375, + "logps/rejected": -259.8387451171875, + "loss": 0.1331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1633823662996292, + "rewards/margins": 2.9308531284332275, + "rewards/rejected": -3.09423565864563, + "step": 7226 + }, + { + "epoch": 0.83, + "learning_rate": 5.081353154629521e-08, + "logits/chosen": -1.960641622543335, + "logits/rejected": -2.1846866607666016, + "logps/chosen": -261.9680480957031, + "logps/rejected": -319.083740234375, + "loss": 0.7898, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6537787914276123, + "rewards/margins": 2.014585018157959, + "rewards/rejected": -3.668363571166992, + "step": 7227 + }, + { + "epoch": 0.83, + "learning_rate": 5.077841507667096e-08, + "logits/chosen": -2.476327896118164, + "logits/rejected": -2.4719414710998535, + "logps/chosen": -153.59764099121094, + "logps/rejected": -235.60760498046875, + "loss": 0.2134, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.016585271805524826, + "rewards/margins": 2.0656943321228027, + "rewards/rejected": -2.0491089820861816, + "step": 7228 + }, + { + "epoch": 0.83, + "learning_rate": 5.0743298607046704e-08, + "logits/chosen": -2.7169997692108154, + "logits/rejected": -2.506805658340454, + "logps/chosen": -328.0690002441406, + "logps/rejected": -320.72894287109375, + "loss": 0.38, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8370413780212402, + "rewards/margins": 2.009852409362793, + "rewards/rejected": -2.846893787384033, + "step": 7229 + }, + { + "epoch": 0.83, + "learning_rate": 5.0708182137422445e-08, + "logits/chosen": -2.9397664070129395, + "logits/rejected": -2.9874629974365234, + "logps/chosen": -209.56524658203125, + "logps/rejected": -373.86846923828125, + "loss": 0.3632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7312964200973511, + "rewards/margins": 2.000731945037842, + "rewards/rejected": -2.7320284843444824, + "step": 7230 + }, + { + "epoch": 0.83, + "learning_rate": 5.06730656677982e-08, + "logits/chosen": -2.2690577507019043, + "logits/rejected": -2.5477023124694824, + "logps/chosen": -266.8725891113281, + "logps/rejected": -265.1025085449219, + "loss": 0.4805, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1616480350494385, + "rewards/margins": 2.1353516578674316, + "rewards/rejected": -3.29699969291687, + "step": 7231 + }, + { + "epoch": 0.83, + "learning_rate": 5.063794919817394e-08, + "logits/chosen": -2.5995259284973145, + "logits/rejected": -2.58968448638916, + "logps/chosen": -222.8599090576172, + "logps/rejected": -246.40829467773438, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.283008337020874, + "rewards/margins": 3.8046557903289795, + "rewards/rejected": -4.0876641273498535, + "step": 7232 + }, + { + "epoch": 0.83, + "learning_rate": 5.0602832728549693e-08, + "logits/chosen": -1.9926097393035889, + "logits/rejected": -2.3807790279388428, + "logps/chosen": -591.778076171875, + "logps/rejected": -284.485595703125, + "loss": 0.4015, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7227982878684998, + "rewards/margins": 1.745593786239624, + "rewards/rejected": -2.4683921337127686, + "step": 7233 + }, + { + "epoch": 0.83, + "learning_rate": 5.0567716258925434e-08, + "logits/chosen": -2.4045310020446777, + "logits/rejected": -2.5347633361816406, + "logps/chosen": -351.5169372558594, + "logps/rejected": -344.2236022949219, + "loss": 0.3372, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6622905135154724, + "rewards/margins": 1.7580046653747559, + "rewards/rejected": -2.420295238494873, + "step": 7234 + }, + { + "epoch": 0.83, + "learning_rate": 5.053259978930118e-08, + "logits/chosen": -2.0782880783081055, + "logits/rejected": -2.337372303009033, + "logps/chosen": -410.94146728515625, + "logps/rejected": -234.59542846679688, + "loss": 0.2801, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7010893225669861, + "rewards/margins": 2.256122589111328, + "rewards/rejected": -2.95721173286438, + "step": 7235 + }, + { + "epoch": 0.83, + "learning_rate": 5.049748331967693e-08, + "logits/chosen": -2.26426100730896, + "logits/rejected": -2.278412342071533, + "logps/chosen": -297.26409912109375, + "logps/rejected": -227.9554901123047, + "loss": 0.2067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6094452142715454, + "rewards/margins": 3.651373863220215, + "rewards/rejected": -4.260818958282471, + "step": 7236 + }, + { + "epoch": 0.83, + "learning_rate": 5.0462366850052676e-08, + "logits/chosen": -2.6916139125823975, + "logits/rejected": -2.6954431533813477, + "logps/chosen": -81.18074798583984, + "logps/rejected": -212.9363250732422, + "loss": 0.2643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10459963977336884, + "rewards/margins": 3.3063931465148926, + "rewards/rejected": -3.4109928607940674, + "step": 7237 + }, + { + "epoch": 0.83, + "learning_rate": 5.0427250380428416e-08, + "logits/chosen": -2.2148823738098145, + "logits/rejected": -2.1314098834991455, + "logps/chosen": -422.39910888671875, + "logps/rejected": -390.92620849609375, + "loss": 0.2733, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08946369588375092, + "rewards/margins": 1.2677793502807617, + "rewards/rejected": -1.1783156394958496, + "step": 7238 + }, + { + "epoch": 0.83, + "learning_rate": 5.039213391080417e-08, + "logits/chosen": -2.3931853771209717, + "logits/rejected": -2.466517448425293, + "logps/chosen": -372.9898986816406, + "logps/rejected": -343.1514892578125, + "loss": 0.7426, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.252133846282959, + "rewards/margins": 2.088609218597412, + "rewards/rejected": -3.34074330329895, + "step": 7239 + }, + { + "epoch": 0.83, + "learning_rate": 5.035701744117991e-08, + "logits/chosen": -2.0660829544067383, + "logits/rejected": -2.0942013263702393, + "logps/chosen": -309.3421325683594, + "logps/rejected": -326.18792724609375, + "loss": 0.6117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.944472074508667, + "rewards/margins": 2.829592227935791, + "rewards/rejected": -3.774064302444458, + "step": 7240 + }, + { + "epoch": 0.83, + "learning_rate": 5.0321900971555665e-08, + "logits/chosen": -2.2965810298919678, + "logits/rejected": -2.0075149536132812, + "logps/chosen": -232.9829864501953, + "logps/rejected": -368.4931335449219, + "loss": 0.162, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5190089344978333, + "rewards/margins": 5.346595764160156, + "rewards/rejected": -5.865604877471924, + "step": 7241 + }, + { + "epoch": 0.83, + "learning_rate": 5.0286784501931405e-08, + "logits/chosen": -2.3293588161468506, + "logits/rejected": -2.4744608402252197, + "logps/chosen": -215.79229736328125, + "logps/rejected": -243.5232391357422, + "loss": 0.3525, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1272411346435547, + "rewards/margins": 1.8889186382293701, + "rewards/rejected": -3.016160011291504, + "step": 7242 + }, + { + "epoch": 0.83, + "learning_rate": 5.0251668032307146e-08, + "logits/chosen": -2.35441255569458, + "logits/rejected": -2.3995237350463867, + "logps/chosen": -175.72804260253906, + "logps/rejected": -332.59613037109375, + "loss": 0.4963, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4506717920303345, + "rewards/margins": 2.1933908462524414, + "rewards/rejected": -3.6440627574920654, + "step": 7243 + }, + { + "epoch": 0.84, + "learning_rate": 5.021655156268289e-08, + "logits/chosen": -2.3383593559265137, + "logits/rejected": -2.319610595703125, + "logps/chosen": -200.57949829101562, + "logps/rejected": -253.17141723632812, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7205010652542114, + "rewards/margins": 3.458714723587036, + "rewards/rejected": -4.179215431213379, + "step": 7244 + }, + { + "epoch": 0.84, + "learning_rate": 5.018143509305864e-08, + "logits/chosen": -2.715360164642334, + "logits/rejected": -2.4984588623046875, + "logps/chosen": -179.3698272705078, + "logps/rejected": -252.07232666015625, + "loss": 0.578, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.759719967842102, + "rewards/margins": 1.8751875162124634, + "rewards/rejected": -3.6349072456359863, + "step": 7245 + }, + { + "epoch": 0.84, + "learning_rate": 5.014631862343439e-08, + "logits/chosen": -2.245981454849243, + "logits/rejected": -2.258273124694824, + "logps/chosen": -217.67469787597656, + "logps/rejected": -262.87579345703125, + "loss": 0.5496, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3787639141082764, + "rewards/margins": 1.671628475189209, + "rewards/rejected": -3.0503923892974854, + "step": 7246 + }, + { + "epoch": 0.84, + "learning_rate": 5.011120215381013e-08, + "logits/chosen": -2.1571900844573975, + "logits/rejected": -2.150886058807373, + "logps/chosen": -235.77688598632812, + "logps/rejected": -254.27529907226562, + "loss": 0.3262, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.150801420211792, + "rewards/margins": 2.523118495941162, + "rewards/rejected": -3.673920154571533, + "step": 7247 + }, + { + "epoch": 0.84, + "learning_rate": 5.007608568418588e-08, + "logits/chosen": -1.929816722869873, + "logits/rejected": -1.6857976913452148, + "logps/chosen": -324.1819763183594, + "logps/rejected": -532.3345336914062, + "loss": 0.3879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47571253776550293, + "rewards/margins": 1.995173692703247, + "rewards/rejected": -2.47088623046875, + "step": 7248 + }, + { + "epoch": 0.84, + "learning_rate": 5.004096921456162e-08, + "logits/chosen": -1.9494678974151611, + "logits/rejected": -2.0375802516937256, + "logps/chosen": -545.190673828125, + "logps/rejected": -475.78094482421875, + "loss": 0.365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6076822280883789, + "rewards/margins": 1.5344182252883911, + "rewards/rejected": -2.1421005725860596, + "step": 7249 + }, + { + "epoch": 0.84, + "learning_rate": 5.0005852744937377e-08, + "logits/chosen": -2.2016849517822266, + "logits/rejected": -2.4715962409973145, + "logps/chosen": -320.6661071777344, + "logps/rejected": -184.9225616455078, + "loss": 0.1162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18723779916763306, + "rewards/margins": 3.5623297691345215, + "rewards/rejected": -3.7495675086975098, + "step": 7250 + }, + { + "epoch": 0.84, + "learning_rate": 4.997073627531312e-08, + "logits/chosen": -1.9853973388671875, + "logits/rejected": -2.078855514526367, + "logps/chosen": -213.9098358154297, + "logps/rejected": -188.1865692138672, + "loss": 0.3154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5540071129798889, + "rewards/margins": 2.8730757236480713, + "rewards/rejected": -3.4270825386047363, + "step": 7251 + }, + { + "epoch": 0.84, + "learning_rate": 4.9935619805688864e-08, + "logits/chosen": -2.297330141067505, + "logits/rejected": -2.2123711109161377, + "logps/chosen": -379.5542297363281, + "logps/rejected": -484.07666015625, + "loss": 0.3788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7103514671325684, + "rewards/margins": 2.6862282752990723, + "rewards/rejected": -3.3965797424316406, + "step": 7252 + }, + { + "epoch": 0.84, + "learning_rate": 4.990050333606461e-08, + "logits/chosen": -2.305027723312378, + "logits/rejected": -2.235659122467041, + "logps/chosen": -209.98789978027344, + "logps/rejected": -234.232177734375, + "loss": 1.0669, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0811305046081543, + "rewards/margins": 1.0115444660186768, + "rewards/rejected": -3.09267520904541, + "step": 7253 + }, + { + "epoch": 0.84, + "learning_rate": 4.986538686644036e-08, + "logits/chosen": -2.3650448322296143, + "logits/rejected": -2.184328556060791, + "logps/chosen": -156.45401000976562, + "logps/rejected": -284.5594177246094, + "loss": 0.2345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7144745588302612, + "rewards/margins": 2.558382511138916, + "rewards/rejected": -3.272857189178467, + "step": 7254 + }, + { + "epoch": 0.84, + "learning_rate": 4.98302703968161e-08, + "logits/chosen": -2.072157144546509, + "logits/rejected": -2.537466049194336, + "logps/chosen": -390.54937744140625, + "logps/rejected": -213.3436279296875, + "loss": 0.5868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.612413763999939, + "rewards/margins": 1.776480793952942, + "rewards/rejected": -2.388894557952881, + "step": 7255 + }, + { + "epoch": 0.84, + "learning_rate": 4.9795153927191853e-08, + "logits/chosen": -2.6305923461914062, + "logits/rejected": -2.6383583545684814, + "logps/chosen": -237.12210083007812, + "logps/rejected": -208.11585998535156, + "loss": 0.2292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5740627646446228, + "rewards/margins": 2.9658169746398926, + "rewards/rejected": -3.53987979888916, + "step": 7256 + }, + { + "epoch": 0.84, + "learning_rate": 4.9760037457567594e-08, + "logits/chosen": -2.434138774871826, + "logits/rejected": -2.469329357147217, + "logps/chosen": -383.02264404296875, + "logps/rejected": -293.258544921875, + "loss": 0.117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.552263617515564, + "rewards/margins": 2.995818614959717, + "rewards/rejected": -3.5480823516845703, + "step": 7257 + }, + { + "epoch": 0.84, + "learning_rate": 4.972492098794335e-08, + "logits/chosen": -2.630955934524536, + "logits/rejected": -2.4941115379333496, + "logps/chosen": -301.7010803222656, + "logps/rejected": -308.50970458984375, + "loss": 0.1507, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.652472198009491, + "rewards/margins": 3.0132246017456055, + "rewards/rejected": -3.6656970977783203, + "step": 7258 + }, + { + "epoch": 0.84, + "learning_rate": 4.968980451831909e-08, + "logits/chosen": -2.3742265701293945, + "logits/rejected": -2.383502960205078, + "logps/chosen": -201.0978546142578, + "logps/rejected": -149.95220947265625, + "loss": 0.4435, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6671159267425537, + "rewards/margins": 1.6674373149871826, + "rewards/rejected": -2.3345532417297363, + "step": 7259 + }, + { + "epoch": 0.84, + "learning_rate": 4.9654688048694836e-08, + "logits/chosen": -1.931520700454712, + "logits/rejected": -2.2091856002807617, + "logps/chosen": -403.2753601074219, + "logps/rejected": -281.98590087890625, + "loss": 0.5054, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6242712140083313, + "rewards/margins": 1.4706047773361206, + "rewards/rejected": -2.0948760509490967, + "step": 7260 + }, + { + "epoch": 0.84, + "learning_rate": 4.9619571579070576e-08, + "logits/chosen": -1.817068099975586, + "logits/rejected": -1.918440341949463, + "logps/chosen": -302.2149658203125, + "logps/rejected": -322.07598876953125, + "loss": 0.4687, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8772796392440796, + "rewards/margins": 2.990447521209717, + "rewards/rejected": -3.867727279663086, + "step": 7261 + }, + { + "epoch": 0.84, + "learning_rate": 4.958445510944633e-08, + "logits/chosen": -2.530324935913086, + "logits/rejected": -2.487194061279297, + "logps/chosen": -188.61111450195312, + "logps/rejected": -148.5440216064453, + "loss": 0.2841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9164831042289734, + "rewards/margins": 2.0274441242218018, + "rewards/rejected": -2.94392728805542, + "step": 7262 + }, + { + "epoch": 0.84, + "learning_rate": 4.954933863982207e-08, + "logits/chosen": -1.959425687789917, + "logits/rejected": -2.3483290672302246, + "logps/chosen": -368.928955078125, + "logps/rejected": -283.2363586425781, + "loss": 0.1715, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6996102333068848, + "rewards/margins": 3.3556129932403564, + "rewards/rejected": -4.055222988128662, + "step": 7263 + }, + { + "epoch": 0.84, + "learning_rate": 4.9514222170197825e-08, + "logits/chosen": -2.2813339233398438, + "logits/rejected": -2.5693154335021973, + "logps/chosen": -258.5870361328125, + "logps/rejected": -190.39227294921875, + "loss": 0.5828, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.38421630859375, + "rewards/margins": 1.3170967102050781, + "rewards/rejected": -2.701313018798828, + "step": 7264 + }, + { + "epoch": 0.84, + "learning_rate": 4.9479105700573565e-08, + "logits/chosen": -1.9620590209960938, + "logits/rejected": -2.076664686203003, + "logps/chosen": -491.17529296875, + "logps/rejected": -450.7002258300781, + "loss": 0.5857, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0522947311401367, + "rewards/margins": 2.505197286605835, + "rewards/rejected": -4.557492256164551, + "step": 7265 + }, + { + "epoch": 0.84, + "learning_rate": 4.944398923094931e-08, + "logits/chosen": -2.1146090030670166, + "logits/rejected": -2.3280603885650635, + "logps/chosen": -221.52365112304688, + "logps/rejected": -227.35813903808594, + "loss": 0.6268, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0544127225875854, + "rewards/margins": 1.321507215499878, + "rewards/rejected": -2.375920057296753, + "step": 7266 + }, + { + "epoch": 0.84, + "learning_rate": 4.940887276132506e-08, + "logits/chosen": -2.1966753005981445, + "logits/rejected": -2.667703151702881, + "logps/chosen": -247.0874786376953, + "logps/rejected": -228.92926025390625, + "loss": 0.4752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7298566102981567, + "rewards/margins": 2.3661305904388428, + "rewards/rejected": -3.09598708152771, + "step": 7267 + }, + { + "epoch": 0.84, + "learning_rate": 4.937375629170081e-08, + "logits/chosen": -2.1529312133789062, + "logits/rejected": -2.277385711669922, + "logps/chosen": -288.535888671875, + "logps/rejected": -256.9233093261719, + "loss": 0.2327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5505092144012451, + "rewards/margins": 1.9768550395965576, + "rewards/rejected": -2.527364492416382, + "step": 7268 + }, + { + "epoch": 0.84, + "learning_rate": 4.933863982207655e-08, + "logits/chosen": -2.7843284606933594, + "logits/rejected": -2.8416123390197754, + "logps/chosen": -270.36822509765625, + "logps/rejected": -213.76869201660156, + "loss": 0.3166, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1481480598449707, + "rewards/margins": 2.8038203716278076, + "rewards/rejected": -3.9519686698913574, + "step": 7269 + }, + { + "epoch": 0.84, + "learning_rate": 4.93035233524523e-08, + "logits/chosen": -2.160677194595337, + "logits/rejected": -1.774748682975769, + "logps/chosen": -208.67306518554688, + "logps/rejected": -329.67864990234375, + "loss": 0.5586, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.825364112854004, + "rewards/margins": 1.240642786026001, + "rewards/rejected": -4.066006660461426, + "step": 7270 + }, + { + "epoch": 0.84, + "learning_rate": 4.926840688282804e-08, + "logits/chosen": -2.4147090911865234, + "logits/rejected": -2.4321448802948, + "logps/chosen": -398.826416015625, + "logps/rejected": -364.7746276855469, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3086100816726685, + "rewards/margins": 2.062462091445923, + "rewards/rejected": -3.3710720539093018, + "step": 7271 + }, + { + "epoch": 0.84, + "learning_rate": 4.9233290413203796e-08, + "logits/chosen": -2.178495168685913, + "logits/rejected": -2.0501248836517334, + "logps/chosen": -270.69158935546875, + "logps/rejected": -275.7129211425781, + "loss": 0.2984, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.104563593864441, + "rewards/margins": 2.4890739917755127, + "rewards/rejected": -3.593637704849243, + "step": 7272 + }, + { + "epoch": 0.84, + "learning_rate": 4.9198173943579536e-08, + "logits/chosen": -2.3790717124938965, + "logits/rejected": -2.41184401512146, + "logps/chosen": -233.1165313720703, + "logps/rejected": -171.5457305908203, + "loss": 0.3996, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8021599650382996, + "rewards/margins": 1.4499127864837646, + "rewards/rejected": -2.252072811126709, + "step": 7273 + }, + { + "epoch": 0.84, + "learning_rate": 4.9163057473955284e-08, + "logits/chosen": -1.997494101524353, + "logits/rejected": -2.3650550842285156, + "logps/chosen": -381.7954406738281, + "logps/rejected": -231.65536499023438, + "loss": 0.7555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.11872136592865, + "rewards/margins": 0.5717488527297974, + "rewards/rejected": -1.6904702186584473, + "step": 7274 + }, + { + "epoch": 0.84, + "learning_rate": 4.912794100433103e-08, + "logits/chosen": -2.3862342834472656, + "logits/rejected": -2.3883631229400635, + "logps/chosen": -311.0007019042969, + "logps/rejected": -294.7148132324219, + "loss": 0.3845, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.341214656829834, + "rewards/margins": 2.4103260040283203, + "rewards/rejected": -3.751540422439575, + "step": 7275 + }, + { + "epoch": 0.84, + "learning_rate": 4.909282453470678e-08, + "logits/chosen": -2.229391574859619, + "logits/rejected": -2.1864638328552246, + "logps/chosen": -407.21197509765625, + "logps/rejected": -351.0513916015625, + "loss": 0.5083, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5279223918914795, + "rewards/margins": 1.922231912612915, + "rewards/rejected": -3.4501543045043945, + "step": 7276 + }, + { + "epoch": 0.84, + "learning_rate": 4.905770806508252e-08, + "logits/chosen": -2.070378065109253, + "logits/rejected": -2.158921003341675, + "logps/chosen": -185.04296875, + "logps/rejected": -237.64598083496094, + "loss": 0.5511, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7511017322540283, + "rewards/margins": 1.3499760627746582, + "rewards/rejected": -2.1010777950286865, + "step": 7277 + }, + { + "epoch": 0.84, + "learning_rate": 4.902259159545827e-08, + "logits/chosen": -2.7711286544799805, + "logits/rejected": -2.724498748779297, + "logps/chosen": -240.59085083007812, + "logps/rejected": -412.13641357421875, + "loss": 0.2806, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.101622462272644, + "rewards/margins": 2.160853862762451, + "rewards/rejected": -3.2624764442443848, + "step": 7278 + }, + { + "epoch": 0.84, + "learning_rate": 4.898747512583401e-08, + "logits/chosen": -2.3306283950805664, + "logits/rejected": -2.2617979049682617, + "logps/chosen": -243.30311584472656, + "logps/rejected": -215.8109893798828, + "loss": 0.2049, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7628984451293945, + "rewards/margins": 2.550781488418579, + "rewards/rejected": -3.3136796951293945, + "step": 7279 + }, + { + "epoch": 0.84, + "learning_rate": 4.895235865620976e-08, + "logits/chosen": -1.8446046113967896, + "logits/rejected": -1.916938066482544, + "logps/chosen": -439.167236328125, + "logps/rejected": -315.48980712890625, + "loss": 0.2229, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12754496932029724, + "rewards/margins": 1.8154469728469849, + "rewards/rejected": -1.6879019737243652, + "step": 7280 + }, + { + "epoch": 0.84, + "learning_rate": 4.891724218658551e-08, + "logits/chosen": -2.9159696102142334, + "logits/rejected": -2.8371167182922363, + "logps/chosen": -321.82659912109375, + "logps/rejected": -284.08184814453125, + "loss": 0.428, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5901708006858826, + "rewards/margins": 2.6365318298339844, + "rewards/rejected": -3.2267026901245117, + "step": 7281 + }, + { + "epoch": 0.84, + "learning_rate": 4.8882125716961255e-08, + "logits/chosen": -2.425327777862549, + "logits/rejected": -2.1929054260253906, + "logps/chosen": -120.5645523071289, + "logps/rejected": -171.08865356445312, + "loss": 0.3583, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1917372941970825, + "rewards/margins": 1.5973973274230957, + "rewards/rejected": -2.7891347408294678, + "step": 7282 + }, + { + "epoch": 0.84, + "learning_rate": 4.8847009247336996e-08, + "logits/chosen": -2.3176190853118896, + "logits/rejected": -2.1720335483551025, + "logps/chosen": -321.5439758300781, + "logps/rejected": -271.946044921875, + "loss": 0.8357, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2777925729751587, + "rewards/margins": 1.4037256240844727, + "rewards/rejected": -2.681518077850342, + "step": 7283 + }, + { + "epoch": 0.84, + "learning_rate": 4.881189277771275e-08, + "logits/chosen": -1.8713210821151733, + "logits/rejected": -1.966714859008789, + "logps/chosen": -182.93115234375, + "logps/rejected": -217.39385986328125, + "loss": 0.1355, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.73880136013031, + "rewards/margins": 3.615701675415039, + "rewards/rejected": -5.354503631591797, + "step": 7284 + }, + { + "epoch": 0.84, + "learning_rate": 4.877677630808849e-08, + "logits/chosen": -2.432600736618042, + "logits/rejected": -2.762861728668213, + "logps/chosen": -226.95701599121094, + "logps/rejected": -287.908935546875, + "loss": 0.1311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4562530219554901, + "rewards/margins": 3.789872407913208, + "rewards/rejected": -4.246125221252441, + "step": 7285 + }, + { + "epoch": 0.84, + "learning_rate": 4.8741659838464244e-08, + "logits/chosen": -1.963269591331482, + "logits/rejected": -1.797040343284607, + "logps/chosen": -234.1232452392578, + "logps/rejected": -326.4383544921875, + "loss": 0.2434, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3226543068885803, + "rewards/margins": 2.1336703300476074, + "rewards/rejected": -2.456324815750122, + "step": 7286 + }, + { + "epoch": 0.84, + "learning_rate": 4.8706543368839985e-08, + "logits/chosen": -2.4677631855010986, + "logits/rejected": -2.501086950302124, + "logps/chosen": -349.0850830078125, + "logps/rejected": -331.70404052734375, + "loss": 0.6119, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9841102361679077, + "rewards/margins": 0.6531931161880493, + "rewards/rejected": -2.637303590774536, + "step": 7287 + }, + { + "epoch": 0.84, + "learning_rate": 4.867142689921573e-08, + "logits/chosen": -3.008638620376587, + "logits/rejected": -2.9822371006011963, + "logps/chosen": -260.9247741699219, + "logps/rejected": -228.87269592285156, + "loss": 0.1744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32814478874206543, + "rewards/margins": 2.4160542488098145, + "rewards/rejected": -2.74419903755188, + "step": 7288 + }, + { + "epoch": 0.84, + "learning_rate": 4.863631042959148e-08, + "logits/chosen": -2.106825590133667, + "logits/rejected": -2.3360869884490967, + "logps/chosen": -457.4708251953125, + "logps/rejected": -306.2024841308594, + "loss": 0.1455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32419466972351074, + "rewards/margins": 2.598379135131836, + "rewards/rejected": -2.922574043273926, + "step": 7289 + }, + { + "epoch": 0.84, + "learning_rate": 4.860119395996722e-08, + "logits/chosen": -2.6961710453033447, + "logits/rejected": -2.7128384113311768, + "logps/chosen": -300.36883544921875, + "logps/rejected": -364.462158203125, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.543982744216919, + "rewards/margins": 2.413564682006836, + "rewards/rejected": -2.957547664642334, + "step": 7290 + }, + { + "epoch": 0.84, + "learning_rate": 4.856607749034297e-08, + "logits/chosen": -1.8901011943817139, + "logits/rejected": -1.8794264793395996, + "logps/chosen": -216.88327026367188, + "logps/rejected": -230.08724975585938, + "loss": 0.6834, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4406869411468506, + "rewards/margins": 1.0806382894515991, + "rewards/rejected": -2.52132511138916, + "step": 7291 + }, + { + "epoch": 0.84, + "learning_rate": 4.8530961020718714e-08, + "logits/chosen": -1.6082268953323364, + "logits/rejected": -2.0535430908203125, + "logps/chosen": -389.29962158203125, + "logps/rejected": -401.9963073730469, + "loss": 0.4125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6371233463287354, + "rewards/margins": 2.967576503753662, + "rewards/rejected": -3.6046993732452393, + "step": 7292 + }, + { + "epoch": 0.84, + "learning_rate": 4.849584455109446e-08, + "logits/chosen": -2.670384168624878, + "logits/rejected": -2.506037712097168, + "logps/chosen": -229.10675048828125, + "logps/rejected": -333.47265625, + "loss": 0.2106, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0305969715118408, + "rewards/margins": 2.8201117515563965, + "rewards/rejected": -3.850708484649658, + "step": 7293 + }, + { + "epoch": 0.84, + "learning_rate": 4.84607280814702e-08, + "logits/chosen": -2.4680778980255127, + "logits/rejected": -2.6033294200897217, + "logps/chosen": -235.27438354492188, + "logps/rejected": -203.2781982421875, + "loss": 0.289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.029720045626163483, + "rewards/margins": 2.880570888519287, + "rewards/rejected": -2.9102907180786133, + "step": 7294 + }, + { + "epoch": 0.84, + "learning_rate": 4.8425611611845956e-08, + "logits/chosen": -2.7800521850585938, + "logits/rejected": -2.765082836151123, + "logps/chosen": -329.00152587890625, + "logps/rejected": -200.01239013671875, + "loss": 0.2931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32153424620628357, + "rewards/margins": 1.7584521770477295, + "rewards/rejected": -2.079986572265625, + "step": 7295 + }, + { + "epoch": 0.84, + "learning_rate": 4.8390495142221696e-08, + "logits/chosen": -1.8561712503433228, + "logits/rejected": -1.5852630138397217, + "logps/chosen": -349.568603515625, + "logps/rejected": -467.7016296386719, + "loss": 0.2194, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08483996987342834, + "rewards/margins": 2.8235023021698, + "rewards/rejected": -2.7386622428894043, + "step": 7296 + }, + { + "epoch": 0.84, + "learning_rate": 4.8355378672597444e-08, + "logits/chosen": -2.356311559677124, + "logits/rejected": -2.2604169845581055, + "logps/chosen": -226.0631103515625, + "logps/rejected": -270.93292236328125, + "loss": 0.1525, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0002542734146118, + "rewards/margins": 3.0246949195861816, + "rewards/rejected": -4.024949073791504, + "step": 7297 + }, + { + "epoch": 0.84, + "learning_rate": 4.832026220297319e-08, + "logits/chosen": -2.219972610473633, + "logits/rejected": -2.4756057262420654, + "logps/chosen": -326.0572814941406, + "logps/rejected": -225.983642578125, + "loss": 0.3105, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1120400428771973, + "rewards/margins": 2.110990524291992, + "rewards/rejected": -3.2230305671691895, + "step": 7298 + }, + { + "epoch": 0.84, + "learning_rate": 4.828514573334894e-08, + "logits/chosen": -1.9699304103851318, + "logits/rejected": -1.6088011264801025, + "logps/chosen": -128.69012451171875, + "logps/rejected": -279.07470703125, + "loss": 0.249, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8654165267944336, + "rewards/margins": 4.745375633239746, + "rewards/rejected": -6.61079216003418, + "step": 7299 + }, + { + "epoch": 0.84, + "learning_rate": 4.825002926372468e-08, + "logits/chosen": -2.3980484008789062, + "logits/rejected": -2.7335386276245117, + "logps/chosen": -300.6334228515625, + "logps/rejected": -146.91970825195312, + "loss": 0.4162, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6760942935943604, + "rewards/margins": 1.5479154586791992, + "rewards/rejected": -2.2240097522735596, + "step": 7300 + }, + { + "epoch": 0.84, + "learning_rate": 4.821491279410043e-08, + "logits/chosen": -2.782285451889038, + "logits/rejected": -2.8202414512634277, + "logps/chosen": -153.24285888671875, + "logps/rejected": -158.77330017089844, + "loss": 0.5024, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1738152503967285, + "rewards/margins": 1.399414300918579, + "rewards/rejected": -2.5732295513153076, + "step": 7301 + }, + { + "epoch": 0.84, + "learning_rate": 4.817979632447617e-08, + "logits/chosen": -2.3012306690216064, + "logits/rejected": -2.4136085510253906, + "logps/chosen": -161.0286102294922, + "logps/rejected": -191.16558837890625, + "loss": 0.3567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7214571237564087, + "rewards/margins": 2.7786026000976562, + "rewards/rejected": -3.5000598430633545, + "step": 7302 + }, + { + "epoch": 0.84, + "learning_rate": 4.814467985485193e-08, + "logits/chosen": -1.7633355855941772, + "logits/rejected": -1.7752476930618286, + "logps/chosen": -396.28302001953125, + "logps/rejected": -407.8428955078125, + "loss": 0.2028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.091335415840149, + "rewards/margins": 2.7751331329345703, + "rewards/rejected": -3.866468667984009, + "step": 7303 + }, + { + "epoch": 0.84, + "learning_rate": 4.810956338522767e-08, + "logits/chosen": -2.4988319873809814, + "logits/rejected": -2.406043767929077, + "logps/chosen": -129.95703125, + "logps/rejected": -181.47921752929688, + "loss": 0.562, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0613905191421509, + "rewards/margins": 1.9226629734039307, + "rewards/rejected": -2.984053611755371, + "step": 7304 + }, + { + "epoch": 0.84, + "learning_rate": 4.8074446915603415e-08, + "logits/chosen": -2.266131639480591, + "logits/rejected": -2.232513666152954, + "logps/chosen": -266.8689270019531, + "logps/rejected": -333.6603088378906, + "loss": 0.3676, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0535262823104858, + "rewards/margins": 2.1334586143493652, + "rewards/rejected": -3.1869850158691406, + "step": 7305 + }, + { + "epoch": 0.84, + "learning_rate": 4.803933044597916e-08, + "logits/chosen": -1.9615838527679443, + "logits/rejected": -1.946822166442871, + "logps/chosen": -303.2620849609375, + "logps/rejected": -250.34219360351562, + "loss": 0.1951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6513579487800598, + "rewards/margins": 2.6541497707366943, + "rewards/rejected": -3.3055078983306885, + "step": 7306 + }, + { + "epoch": 0.84, + "learning_rate": 4.800421397635491e-08, + "logits/chosen": -2.0211827754974365, + "logits/rejected": -2.449619770050049, + "logps/chosen": -375.2823791503906, + "logps/rejected": -329.2829895019531, + "loss": 0.3934, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5246737003326416, + "rewards/margins": 4.088783264160156, + "rewards/rejected": -5.613456726074219, + "step": 7307 + }, + { + "epoch": 0.84, + "learning_rate": 4.796909750673065e-08, + "logits/chosen": -2.3027145862579346, + "logits/rejected": -2.4098169803619385, + "logps/chosen": -285.78302001953125, + "logps/rejected": -277.8759765625, + "loss": 1.2074, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.826462984085083, + "rewards/margins": 0.12456482648849487, + "rewards/rejected": -1.951027750968933, + "step": 7308 + }, + { + "epoch": 0.84, + "learning_rate": 4.7933981037106404e-08, + "logits/chosen": -2.3030409812927246, + "logits/rejected": -2.3086605072021484, + "logps/chosen": -240.4357147216797, + "logps/rejected": -385.1460266113281, + "loss": 0.3, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32092201709747314, + "rewards/margins": 2.600882053375244, + "rewards/rejected": -2.921804189682007, + "step": 7309 + }, + { + "epoch": 0.84, + "learning_rate": 4.7898864567482145e-08, + "logits/chosen": -1.960214376449585, + "logits/rejected": -2.2600045204162598, + "logps/chosen": -344.99517822265625, + "logps/rejected": -164.7296142578125, + "loss": 0.6403, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7914957404136658, + "rewards/margins": 0.9151383638381958, + "rewards/rejected": -1.7066341638565063, + "step": 7310 + }, + { + "epoch": 0.84, + "learning_rate": 4.786374809785789e-08, + "logits/chosen": -2.4303555488586426, + "logits/rejected": -2.3362534046173096, + "logps/chosen": -269.1781921386719, + "logps/rejected": -361.89019775390625, + "loss": 0.2873, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8123782873153687, + "rewards/margins": 2.6737425327301025, + "rewards/rejected": -3.4861209392547607, + "step": 7311 + }, + { + "epoch": 0.84, + "learning_rate": 4.782863162823364e-08, + "logits/chosen": -2.1915993690490723, + "logits/rejected": -2.3595898151397705, + "logps/chosen": -296.211181640625, + "logps/rejected": -214.77578735351562, + "loss": 0.5773, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9393360614776611, + "rewards/margins": 1.4613068103790283, + "rewards/rejected": -2.4006428718566895, + "step": 7312 + }, + { + "epoch": 0.84, + "learning_rate": 4.7793515158609386e-08, + "logits/chosen": -2.2860467433929443, + "logits/rejected": -2.420706272125244, + "logps/chosen": -460.8298645019531, + "logps/rejected": -287.64208984375, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7474324703216553, + "rewards/margins": 2.8284013271331787, + "rewards/rejected": -3.575833797454834, + "step": 7313 + }, + { + "epoch": 0.84, + "learning_rate": 4.775839868898513e-08, + "logits/chosen": -2.0516626834869385, + "logits/rejected": -2.2836105823516846, + "logps/chosen": -398.1666259765625, + "logps/rejected": -287.06903076171875, + "loss": 0.246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7805768847465515, + "rewards/margins": 2.5093071460723877, + "rewards/rejected": -3.289884090423584, + "step": 7314 + }, + { + "epoch": 0.84, + "learning_rate": 4.772328221936088e-08, + "logits/chosen": -2.5145325660705566, + "logits/rejected": -2.463958263397217, + "logps/chosen": -214.547607421875, + "logps/rejected": -239.49624633789062, + "loss": 0.3397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8469486236572266, + "rewards/margins": 2.0946178436279297, + "rewards/rejected": -2.9415664672851562, + "step": 7315 + }, + { + "epoch": 0.84, + "learning_rate": 4.768816574973662e-08, + "logits/chosen": -2.721909999847412, + "logits/rejected": -2.808121919631958, + "logps/chosen": -425.7216796875, + "logps/rejected": -379.1286926269531, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6900840997695923, + "rewards/margins": 2.610539197921753, + "rewards/rejected": -3.3006234169006348, + "step": 7316 + }, + { + "epoch": 0.84, + "learning_rate": 4.7653049280112375e-08, + "logits/chosen": -2.1834890842437744, + "logits/rejected": -2.3012478351593018, + "logps/chosen": -320.6518249511719, + "logps/rejected": -335.3675537109375, + "loss": 0.4511, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3142260313034058, + "rewards/margins": 1.9494850635528564, + "rewards/rejected": -3.2637112140655518, + "step": 7317 + }, + { + "epoch": 0.84, + "learning_rate": 4.7617932810488116e-08, + "logits/chosen": -2.5275161266326904, + "logits/rejected": -2.756046772003174, + "logps/chosen": -254.2745361328125, + "logps/rejected": -222.26060485839844, + "loss": 0.3464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.391987681388855, + "rewards/margins": 2.82307767868042, + "rewards/rejected": -3.2150652408599854, + "step": 7318 + }, + { + "epoch": 0.84, + "learning_rate": 4.758281634086386e-08, + "logits/chosen": -1.8320703506469727, + "logits/rejected": -1.9057093858718872, + "logps/chosen": -356.4985046386719, + "logps/rejected": -551.4948120117188, + "loss": 0.7013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8842523097991943, + "rewards/margins": 2.015127182006836, + "rewards/rejected": -2.8993794918060303, + "step": 7319 + }, + { + "epoch": 0.84, + "learning_rate": 4.754769987123961e-08, + "logits/chosen": -2.3368966579437256, + "logits/rejected": -2.4342589378356934, + "logps/chosen": -290.69970703125, + "logps/rejected": -218.5282745361328, + "loss": 0.6612, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0609315633773804, + "rewards/margins": 1.7782846689224243, + "rewards/rejected": -2.8392162322998047, + "step": 7320 + }, + { + "epoch": 0.84, + "learning_rate": 4.751258340161536e-08, + "logits/chosen": -2.712198495864868, + "logits/rejected": -2.6265006065368652, + "logps/chosen": -245.93023681640625, + "logps/rejected": -233.96139526367188, + "loss": 0.3081, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5572612285614014, + "rewards/margins": 1.731010913848877, + "rewards/rejected": -3.2882723808288574, + "step": 7321 + }, + { + "epoch": 0.84, + "learning_rate": 4.74774669319911e-08, + "logits/chosen": -1.8425078392028809, + "logits/rejected": -2.049987316131592, + "logps/chosen": -266.4859924316406, + "logps/rejected": -338.5353698730469, + "loss": 0.5178, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9267857670783997, + "rewards/margins": 1.6000885963439941, + "rewards/rejected": -2.5268747806549072, + "step": 7322 + }, + { + "epoch": 0.84, + "learning_rate": 4.744235046236685e-08, + "logits/chosen": -2.1844019889831543, + "logits/rejected": -2.35815167427063, + "logps/chosen": -273.868896484375, + "logps/rejected": -287.2154541015625, + "loss": 0.5586, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8161287307739258, + "rewards/margins": 2.2177720069885254, + "rewards/rejected": -3.033900737762451, + "step": 7323 + }, + { + "epoch": 0.84, + "learning_rate": 4.740723399274259e-08, + "logits/chosen": -2.505429267883301, + "logits/rejected": -2.069732427597046, + "logps/chosen": -183.54721069335938, + "logps/rejected": -310.20556640625, + "loss": 0.8631, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.371473789215088, + "rewards/margins": 0.9757555723190308, + "rewards/rejected": -2.347229242324829, + "step": 7324 + }, + { + "epoch": 0.84, + "learning_rate": 4.7372117523118347e-08, + "logits/chosen": -1.8088325262069702, + "logits/rejected": -2.2028937339782715, + "logps/chosen": -393.3784484863281, + "logps/rejected": -285.0054931640625, + "loss": 0.5525, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6838184595108032, + "rewards/margins": 2.0804266929626465, + "rewards/rejected": -3.764244794845581, + "step": 7325 + }, + { + "epoch": 0.84, + "learning_rate": 4.733700105349409e-08, + "logits/chosen": -2.8390767574310303, + "logits/rejected": -2.7798893451690674, + "logps/chosen": -287.4049987792969, + "logps/rejected": -237.73431396484375, + "loss": 0.1262, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0790375471115112, + "rewards/margins": 2.919384002685547, + "rewards/rejected": -3.9984211921691895, + "step": 7326 + }, + { + "epoch": 0.84, + "learning_rate": 4.7301884583869834e-08, + "logits/chosen": -2.320281982421875, + "logits/rejected": -2.065781593322754, + "logps/chosen": -179.0843505859375, + "logps/rejected": -260.58642578125, + "loss": 0.6638, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2579402923583984, + "rewards/margins": 0.9102254509925842, + "rewards/rejected": -2.168165683746338, + "step": 7327 + }, + { + "epoch": 0.84, + "learning_rate": 4.7266768114245575e-08, + "logits/chosen": -2.4306859970092773, + "logits/rejected": -2.7086009979248047, + "logps/chosen": -287.30340576171875, + "logps/rejected": -133.31544494628906, + "loss": 0.7278, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2722715139389038, + "rewards/margins": 0.9634913206100464, + "rewards/rejected": -2.235762596130371, + "step": 7328 + }, + { + "epoch": 0.84, + "learning_rate": 4.723165164462133e-08, + "logits/chosen": -2.323219060897827, + "logits/rejected": -2.457118511199951, + "logps/chosen": -309.797607421875, + "logps/rejected": -150.08663940429688, + "loss": 0.1915, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0835773944854736, + "rewards/margins": 2.0308618545532227, + "rewards/rejected": -3.1144392490386963, + "step": 7329 + }, + { + "epoch": 0.85, + "learning_rate": 4.719653517499707e-08, + "logits/chosen": -2.3200273513793945, + "logits/rejected": -2.184493064880371, + "logps/chosen": -224.3203887939453, + "logps/rejected": -323.3388977050781, + "loss": 0.6975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9422980546951294, + "rewards/margins": 0.8998094797134399, + "rewards/rejected": -1.8421077728271484, + "step": 7330 + }, + { + "epoch": 0.85, + "learning_rate": 4.7161418705372823e-08, + "logits/chosen": -2.979684352874756, + "logits/rejected": -2.980836868286133, + "logps/chosen": -400.2698974609375, + "logps/rejected": -248.96861267089844, + "loss": 0.3622, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1700990200042725, + "rewards/margins": 2.710136890411377, + "rewards/rejected": -3.8802359104156494, + "step": 7331 + }, + { + "epoch": 0.85, + "learning_rate": 4.7126302235748564e-08, + "logits/chosen": -2.7952017784118652, + "logits/rejected": -2.6203784942626953, + "logps/chosen": -198.9842071533203, + "logps/rejected": -322.23193359375, + "loss": 0.4878, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4939923286437988, + "rewards/margins": 5.081569194793701, + "rewards/rejected": -6.5755615234375, + "step": 7332 + }, + { + "epoch": 0.85, + "learning_rate": 4.709118576612431e-08, + "logits/chosen": -2.081382989883423, + "logits/rejected": -2.1831417083740234, + "logps/chosen": -402.9104919433594, + "logps/rejected": -278.4792175292969, + "loss": 0.6204, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8787335157394409, + "rewards/margins": 1.1007673740386963, + "rewards/rejected": -1.9795008897781372, + "step": 7333 + }, + { + "epoch": 0.85, + "learning_rate": 4.705606929650006e-08, + "logits/chosen": -2.4893250465393066, + "logits/rejected": -2.338268280029297, + "logps/chosen": -187.31214904785156, + "logps/rejected": -268.58282470703125, + "loss": 0.3107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9671935439109802, + "rewards/margins": 2.256805181503296, + "rewards/rejected": -3.223998785018921, + "step": 7334 + }, + { + "epoch": 0.85, + "learning_rate": 4.7020952826875806e-08, + "logits/chosen": -1.9515790939331055, + "logits/rejected": -2.272784471511841, + "logps/chosen": -387.48626708984375, + "logps/rejected": -284.8462219238281, + "loss": 0.2595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20210716128349304, + "rewards/margins": 2.3945815563201904, + "rewards/rejected": -2.596688747406006, + "step": 7335 + }, + { + "epoch": 0.85, + "learning_rate": 4.6985836357251546e-08, + "logits/chosen": -2.0563743114471436, + "logits/rejected": -1.9288034439086914, + "logps/chosen": -145.31192016601562, + "logps/rejected": -199.50111389160156, + "loss": 0.4587, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1098664999008179, + "rewards/margins": 0.9790169596672058, + "rewards/rejected": -2.088883399963379, + "step": 7336 + }, + { + "epoch": 0.85, + "learning_rate": 4.69507198876273e-08, + "logits/chosen": -2.4690206050872803, + "logits/rejected": -2.5584425926208496, + "logps/chosen": -154.42498779296875, + "logps/rejected": -192.40057373046875, + "loss": 0.4498, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6103676557540894, + "rewards/margins": 1.7108219861984253, + "rewards/rejected": -3.3211894035339355, + "step": 7337 + }, + { + "epoch": 0.85, + "learning_rate": 4.691560341800304e-08, + "logits/chosen": -2.1140191555023193, + "logits/rejected": -2.115891695022583, + "logps/chosen": -401.87109375, + "logps/rejected": -386.97308349609375, + "loss": 0.2381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8049266934394836, + "rewards/margins": 3.4868173599243164, + "rewards/rejected": -4.291743755340576, + "step": 7338 + }, + { + "epoch": 0.85, + "learning_rate": 4.688048694837878e-08, + "logits/chosen": -2.182647466659546, + "logits/rejected": -2.506237506866455, + "logps/chosen": -499.0415344238281, + "logps/rejected": -380.6024169921875, + "loss": 0.1872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26699838042259216, + "rewards/margins": 2.011826276779175, + "rewards/rejected": -2.278824806213379, + "step": 7339 + }, + { + "epoch": 0.85, + "learning_rate": 4.6845370478754535e-08, + "logits/chosen": -2.6984450817108154, + "logits/rejected": -2.830286979675293, + "logps/chosen": -229.9659881591797, + "logps/rejected": -259.6852722167969, + "loss": 0.2142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9938878417015076, + "rewards/margins": 2.3159406185150146, + "rewards/rejected": -3.309828281402588, + "step": 7340 + }, + { + "epoch": 0.85, + "learning_rate": 4.6810254009130276e-08, + "logits/chosen": -2.5804405212402344, + "logits/rejected": -2.5668692588806152, + "logps/chosen": -160.1992950439453, + "logps/rejected": -347.4171142578125, + "loss": 0.9932, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6751238107681274, + "rewards/margins": 2.191007137298584, + "rewards/rejected": -3.866130828857422, + "step": 7341 + }, + { + "epoch": 0.85, + "learning_rate": 4.677513753950603e-08, + "logits/chosen": -2.066754102706909, + "logits/rejected": -2.110133647918701, + "logps/chosen": -349.7358093261719, + "logps/rejected": -331.6476745605469, + "loss": 0.4826, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8572406768798828, + "rewards/margins": 0.8725746870040894, + "rewards/rejected": -1.7298153638839722, + "step": 7342 + }, + { + "epoch": 0.85, + "learning_rate": 4.674002106988177e-08, + "logits/chosen": -2.626276731491089, + "logits/rejected": -2.6903271675109863, + "logps/chosen": -79.93378448486328, + "logps/rejected": -164.29396057128906, + "loss": 0.2346, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.341261863708496, + "rewards/margins": 3.421672821044922, + "rewards/rejected": -4.762934684753418, + "step": 7343 + }, + { + "epoch": 0.85, + "learning_rate": 4.670490460025752e-08, + "logits/chosen": -2.0217652320861816, + "logits/rejected": -1.9413948059082031, + "logps/chosen": -363.3044738769531, + "logps/rejected": -386.1546936035156, + "loss": 0.3478, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6540080904960632, + "rewards/margins": 2.1154584884643555, + "rewards/rejected": -2.7694666385650635, + "step": 7344 + }, + { + "epoch": 0.85, + "learning_rate": 4.666978813063326e-08, + "logits/chosen": -2.6911368370056152, + "logits/rejected": -2.5132508277893066, + "logps/chosen": -210.79293823242188, + "logps/rejected": -300.5208740234375, + "loss": 0.1832, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3136868476867676, + "rewards/margins": 3.717278003692627, + "rewards/rejected": -5.0309648513793945, + "step": 7345 + }, + { + "epoch": 0.85, + "learning_rate": 4.663467166100901e-08, + "logits/chosen": -2.779339551925659, + "logits/rejected": -2.814368963241577, + "logps/chosen": -203.60768127441406, + "logps/rejected": -330.09698486328125, + "loss": 0.1845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7377355098724365, + "rewards/margins": 4.011395454406738, + "rewards/rejected": -4.749131202697754, + "step": 7346 + }, + { + "epoch": 0.85, + "learning_rate": 4.659955519138475e-08, + "logits/chosen": -2.782984733581543, + "logits/rejected": -2.6004319190979004, + "logps/chosen": -249.690673828125, + "logps/rejected": -152.55197143554688, + "loss": 0.3974, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5258853435516357, + "rewards/margins": 2.2234482765197754, + "rewards/rejected": -3.749333620071411, + "step": 7347 + }, + { + "epoch": 0.85, + "learning_rate": 4.6564438721760507e-08, + "logits/chosen": -2.0962719917297363, + "logits/rejected": -1.9737699031829834, + "logps/chosen": -208.99368286132812, + "logps/rejected": -213.7407989501953, + "loss": 0.3769, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04072170704603195, + "rewards/margins": 1.4590963125228882, + "rewards/rejected": -1.4183745384216309, + "step": 7348 + }, + { + "epoch": 0.85, + "learning_rate": 4.652932225213625e-08, + "logits/chosen": -2.273282051086426, + "logits/rejected": -2.4412598609924316, + "logps/chosen": -282.4058532714844, + "logps/rejected": -175.71133422851562, + "loss": 0.6585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7225437164306641, + "rewards/margins": 1.2003979682922363, + "rewards/rejected": -1.9229415655136108, + "step": 7349 + }, + { + "epoch": 0.85, + "learning_rate": 4.6494205782511994e-08, + "logits/chosen": -2.1206798553466797, + "logits/rejected": -2.3557045459747314, + "logps/chosen": -317.0135498046875, + "logps/rejected": -377.1123046875, + "loss": 0.1403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7578454613685608, + "rewards/margins": 3.480640172958374, + "rewards/rejected": -4.238485336303711, + "step": 7350 + }, + { + "epoch": 0.85, + "learning_rate": 4.645908931288774e-08, + "logits/chosen": -2.199305534362793, + "logits/rejected": -2.018768787384033, + "logps/chosen": -198.877197265625, + "logps/rejected": -301.0191345214844, + "loss": 0.5138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9771105647087097, + "rewards/margins": 1.2366995811462402, + "rewards/rejected": -2.2138102054595947, + "step": 7351 + }, + { + "epoch": 0.85, + "learning_rate": 4.642397284326349e-08, + "logits/chosen": -2.1886441707611084, + "logits/rejected": -2.376877784729004, + "logps/chosen": -282.84332275390625, + "logps/rejected": -253.092041015625, + "loss": 0.1627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7287342548370361, + "rewards/margins": 3.3264079093933105, + "rewards/rejected": -4.055141925811768, + "step": 7352 + }, + { + "epoch": 0.85, + "learning_rate": 4.638885637363923e-08, + "logits/chosen": -2.8143906593322754, + "logits/rejected": -2.744460105895996, + "logps/chosen": -170.47129821777344, + "logps/rejected": -263.8246765136719, + "loss": 0.3804, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2617719173431396, + "rewards/margins": 1.8498235940933228, + "rewards/rejected": -3.1115951538085938, + "step": 7353 + }, + { + "epoch": 0.85, + "learning_rate": 4.6353739904014983e-08, + "logits/chosen": -2.3225791454315186, + "logits/rejected": -2.3210065364837646, + "logps/chosen": -521.3934936523438, + "logps/rejected": -341.50567626953125, + "loss": 0.2371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3489990234375, + "rewards/margins": 1.7862681150436401, + "rewards/rejected": -2.1352672576904297, + "step": 7354 + }, + { + "epoch": 0.85, + "learning_rate": 4.6318623434390724e-08, + "logits/chosen": -2.2286570072174072, + "logits/rejected": -2.1909966468811035, + "logps/chosen": -214.48272705078125, + "logps/rejected": -355.8939208984375, + "loss": 0.2937, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8539903163909912, + "rewards/margins": 4.200383186340332, + "rewards/rejected": -5.054373741149902, + "step": 7355 + }, + { + "epoch": 0.85, + "learning_rate": 4.628350696476648e-08, + "logits/chosen": -2.537149667739868, + "logits/rejected": -2.390428304672241, + "logps/chosen": -237.5272674560547, + "logps/rejected": -501.5705871582031, + "loss": 0.7431, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2378441095352173, + "rewards/margins": 1.480475902557373, + "rewards/rejected": -2.718319892883301, + "step": 7356 + }, + { + "epoch": 0.85, + "learning_rate": 4.624839049514222e-08, + "logits/chosen": -2.486976385116577, + "logits/rejected": -2.406822919845581, + "logps/chosen": -228.2180938720703, + "logps/rejected": -383.493896484375, + "loss": 0.1931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.716055154800415, + "rewards/margins": 2.6523025035858154, + "rewards/rejected": -3.3683576583862305, + "step": 7357 + }, + { + "epoch": 0.85, + "learning_rate": 4.6213274025517966e-08, + "logits/chosen": -2.3625271320343018, + "logits/rejected": -2.61523175239563, + "logps/chosen": -350.3035888671875, + "logps/rejected": -298.187744140625, + "loss": 0.4518, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4036709070205688, + "rewards/margins": 1.8661209344863892, + "rewards/rejected": -3.269791841506958, + "step": 7358 + }, + { + "epoch": 0.85, + "learning_rate": 4.617815755589371e-08, + "logits/chosen": -2.303706169128418, + "logits/rejected": -2.566631317138672, + "logps/chosen": -158.440185546875, + "logps/rejected": -145.99102783203125, + "loss": 1.0982, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1214438676834106, + "rewards/margins": 0.1482553482055664, + "rewards/rejected": -1.2696990966796875, + "step": 7359 + }, + { + "epoch": 0.85, + "learning_rate": 4.614304108626946e-08, + "logits/chosen": -2.245620012283325, + "logits/rejected": -2.3588004112243652, + "logps/chosen": -227.82760620117188, + "logps/rejected": -250.8612060546875, + "loss": 0.3199, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39040374755859375, + "rewards/margins": 2.836153268814087, + "rewards/rejected": -3.2265572547912598, + "step": 7360 + }, + { + "epoch": 0.85, + "learning_rate": 4.61079246166452e-08, + "logits/chosen": -2.568685531616211, + "logits/rejected": -2.4254724979400635, + "logps/chosen": -171.14039611816406, + "logps/rejected": -237.40426635742188, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1081583499908447, + "rewards/margins": 1.9214646816253662, + "rewards/rejected": -4.029623031616211, + "step": 7361 + }, + { + "epoch": 0.85, + "learning_rate": 4.6072808147020955e-08, + "logits/chosen": -2.8439998626708984, + "logits/rejected": -2.7325448989868164, + "logps/chosen": -198.8822479248047, + "logps/rejected": -260.8642578125, + "loss": 0.1714, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3452911376953125, + "rewards/margins": 2.4716501235961914, + "rewards/rejected": -3.816941261291504, + "step": 7362 + }, + { + "epoch": 0.85, + "learning_rate": 4.6037691677396695e-08, + "logits/chosen": -2.145946979522705, + "logits/rejected": -2.3821945190429688, + "logps/chosen": -258.6530456542969, + "logps/rejected": -261.3774719238281, + "loss": 0.5088, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2001867294311523, + "rewards/margins": 1.3506054878234863, + "rewards/rejected": -2.5507922172546387, + "step": 7363 + }, + { + "epoch": 0.85, + "learning_rate": 4.600257520777244e-08, + "logits/chosen": -2.5895156860351562, + "logits/rejected": -2.4709951877593994, + "logps/chosen": -197.38644409179688, + "logps/rejected": -207.5634307861328, + "loss": 0.731, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4965643882751465, + "rewards/margins": 1.3818206787109375, + "rewards/rejected": -2.878385066986084, + "step": 7364 + }, + { + "epoch": 0.85, + "learning_rate": 4.596745873814819e-08, + "logits/chosen": -2.2680187225341797, + "logits/rejected": -2.1284170150756836, + "logps/chosen": -252.53895568847656, + "logps/rejected": -225.2239227294922, + "loss": 0.651, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4693708419799805, + "rewards/margins": 0.973408579826355, + "rewards/rejected": -2.442779541015625, + "step": 7365 + }, + { + "epoch": 0.85, + "learning_rate": 4.593234226852394e-08, + "logits/chosen": -2.5022201538085938, + "logits/rejected": -2.6172056198120117, + "logps/chosen": -191.92950439453125, + "logps/rejected": -156.96849060058594, + "loss": 0.2552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47999876737594604, + "rewards/margins": 1.9244474172592163, + "rewards/rejected": -2.4044463634490967, + "step": 7366 + }, + { + "epoch": 0.85, + "learning_rate": 4.589722579889968e-08, + "logits/chosen": -1.9648613929748535, + "logits/rejected": -1.9982712268829346, + "logps/chosen": -437.4859619140625, + "logps/rejected": -386.387451171875, + "loss": 0.4948, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2526462078094482, + "rewards/margins": 1.7954411506652832, + "rewards/rejected": -3.0480873584747314, + "step": 7367 + }, + { + "epoch": 0.85, + "learning_rate": 4.586210932927543e-08, + "logits/chosen": -1.861454725265503, + "logits/rejected": -2.3330888748168945, + "logps/chosen": -181.38856506347656, + "logps/rejected": -197.6517333984375, + "loss": 1.3378, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9889634847640991, + "rewards/margins": 0.18425673246383667, + "rewards/rejected": -2.173220157623291, + "step": 7368 + }, + { + "epoch": 0.85, + "learning_rate": 4.582699285965117e-08, + "logits/chosen": -2.5482890605926514, + "logits/rejected": -2.54837703704834, + "logps/chosen": -175.46170043945312, + "logps/rejected": -229.77182006835938, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7519211769104004, + "rewards/margins": 3.935912609100342, + "rewards/rejected": -4.687833786010742, + "step": 7369 + }, + { + "epoch": 0.85, + "learning_rate": 4.5791876390026926e-08, + "logits/chosen": -2.518636703491211, + "logits/rejected": -2.506042718887329, + "logps/chosen": -97.09517669677734, + "logps/rejected": -156.22560119628906, + "loss": 0.4377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.674875020980835, + "rewards/margins": 1.6276566982269287, + "rewards/rejected": -2.3025317192077637, + "step": 7370 + }, + { + "epoch": 0.85, + "learning_rate": 4.5756759920402667e-08, + "logits/chosen": -2.4703166484832764, + "logits/rejected": -2.6621315479278564, + "logps/chosen": -483.91534423828125, + "logps/rejected": -325.303466796875, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5195743441581726, + "rewards/margins": 4.9678874015808105, + "rewards/rejected": -5.487462043762207, + "step": 7371 + }, + { + "epoch": 0.85, + "learning_rate": 4.5721643450778414e-08, + "logits/chosen": -1.451171875, + "logits/rejected": -1.8969917297363281, + "logps/chosen": -560.59130859375, + "logps/rejected": -356.7517395019531, + "loss": 0.0472, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18196427822113037, + "rewards/margins": 3.9272990226745605, + "rewards/rejected": -3.7453346252441406, + "step": 7372 + }, + { + "epoch": 0.85, + "learning_rate": 4.568652698115416e-08, + "logits/chosen": -2.4862658977508545, + "logits/rejected": -2.595750331878662, + "logps/chosen": -275.9267578125, + "logps/rejected": -176.8460235595703, + "loss": 0.2313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9748167991638184, + "rewards/margins": 2.5803449153900146, + "rewards/rejected": -3.555161714553833, + "step": 7373 + }, + { + "epoch": 0.85, + "learning_rate": 4.565141051152991e-08, + "logits/chosen": -2.531768798828125, + "logits/rejected": -2.3789548873901367, + "logps/chosen": -179.43539428710938, + "logps/rejected": -242.91818237304688, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6260957717895508, + "rewards/margins": 3.578594923019409, + "rewards/rejected": -4.204690933227539, + "step": 7374 + }, + { + "epoch": 0.85, + "learning_rate": 4.561629404190565e-08, + "logits/chosen": -2.2896742820739746, + "logits/rejected": -2.2634758949279785, + "logps/chosen": -384.76043701171875, + "logps/rejected": -240.40859985351562, + "loss": 0.4608, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2605373859405518, + "rewards/margins": 1.982020616531372, + "rewards/rejected": -3.242558002471924, + "step": 7375 + }, + { + "epoch": 0.85, + "learning_rate": 4.55811775722814e-08, + "logits/chosen": -2.5039727687835693, + "logits/rejected": -2.7070536613464355, + "logps/chosen": -296.219482421875, + "logps/rejected": -202.08103942871094, + "loss": 0.6116, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0626018047332764, + "rewards/margins": 0.8590136766433716, + "rewards/rejected": -1.921615481376648, + "step": 7376 + }, + { + "epoch": 0.85, + "learning_rate": 4.5546061102657143e-08, + "logits/chosen": -2.1589746475219727, + "logits/rejected": -2.5468788146972656, + "logps/chosen": -404.7319641113281, + "logps/rejected": -249.4989776611328, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8204982876777649, + "rewards/margins": 2.2364683151245117, + "rewards/rejected": -3.056966543197632, + "step": 7377 + }, + { + "epoch": 0.85, + "learning_rate": 4.55109446330329e-08, + "logits/chosen": -1.9586091041564941, + "logits/rejected": -2.0063772201538086, + "logps/chosen": -390.5653381347656, + "logps/rejected": -329.5457458496094, + "loss": 0.8006, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2715144157409668, + "rewards/margins": 1.0807580947875977, + "rewards/rejected": -2.3522722721099854, + "step": 7378 + }, + { + "epoch": 0.85, + "learning_rate": 4.547582816340864e-08, + "logits/chosen": -2.9876601696014404, + "logits/rejected": -3.0032553672790527, + "logps/chosen": -190.66802978515625, + "logps/rejected": -187.4329833984375, + "loss": 0.4532, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1204335689544678, + "rewards/margins": 1.572859525680542, + "rewards/rejected": -2.6932930946350098, + "step": 7379 + }, + { + "epoch": 0.85, + "learning_rate": 4.5440711693784385e-08, + "logits/chosen": -2.5150701999664307, + "logits/rejected": -2.476348876953125, + "logps/chosen": -229.83096313476562, + "logps/rejected": -383.25592041015625, + "loss": 0.0989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6109079122543335, + "rewards/margins": 5.043882846832275, + "rewards/rejected": -5.654790878295898, + "step": 7380 + }, + { + "epoch": 0.85, + "learning_rate": 4.5405595224160126e-08, + "logits/chosen": -2.2968838214874268, + "logits/rejected": -2.185492515563965, + "logps/chosen": -190.82476806640625, + "logps/rejected": -202.5922393798828, + "loss": 0.7261, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7866220474243164, + "rewards/margins": 1.2816216945648193, + "rewards/rejected": -3.0682437419891357, + "step": 7381 + }, + { + "epoch": 0.85, + "learning_rate": 4.537047875453588e-08, + "logits/chosen": -2.7142088413238525, + "logits/rejected": -2.622023105621338, + "logps/chosen": -274.409912109375, + "logps/rejected": -436.5790100097656, + "loss": 0.7439, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6485236883163452, + "rewards/margins": 1.8846359252929688, + "rewards/rejected": -3.5331597328186035, + "step": 7382 + }, + { + "epoch": 0.85, + "learning_rate": 4.533536228491162e-08, + "logits/chosen": -2.634042501449585, + "logits/rejected": -2.5832791328430176, + "logps/chosen": -341.2247314453125, + "logps/rejected": -346.5732421875, + "loss": 0.2834, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0900461673736572, + "rewards/margins": 2.6126246452331543, + "rewards/rejected": -3.7026710510253906, + "step": 7383 + }, + { + "epoch": 0.85, + "learning_rate": 4.5300245815287374e-08, + "logits/chosen": -2.176316022872925, + "logits/rejected": -2.334441661834717, + "logps/chosen": -252.56570434570312, + "logps/rejected": -191.998779296875, + "loss": 0.4778, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0922589302062988, + "rewards/margins": 0.8892672061920166, + "rewards/rejected": -1.9815261363983154, + "step": 7384 + }, + { + "epoch": 0.85, + "learning_rate": 4.5265129345663115e-08, + "logits/chosen": -2.6649065017700195, + "logits/rejected": -2.506429433822632, + "logps/chosen": -276.54437255859375, + "logps/rejected": -271.62255859375, + "loss": 0.2131, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0021019279956817627, + "rewards/margins": 2.8158764839172363, + "rewards/rejected": -2.813774585723877, + "step": 7385 + }, + { + "epoch": 0.85, + "learning_rate": 4.5230012876038855e-08, + "logits/chosen": -1.8098015785217285, + "logits/rejected": -2.23355770111084, + "logps/chosen": -578.3128662109375, + "logps/rejected": -377.5687561035156, + "loss": 0.1704, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.058961108326911926, + "rewards/margins": 3.1559553146362305, + "rewards/rejected": -3.096993923187256, + "step": 7386 + }, + { + "epoch": 0.85, + "learning_rate": 4.519489640641461e-08, + "logits/chosen": -2.6399106979370117, + "logits/rejected": -2.3453428745269775, + "logps/chosen": -265.43389892578125, + "logps/rejected": -373.181396484375, + "loss": 0.9388, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2168469429016113, + "rewards/margins": 1.0266607999801636, + "rewards/rejected": -2.2435076236724854, + "step": 7387 + }, + { + "epoch": 0.85, + "learning_rate": 4.515977993679035e-08, + "logits/chosen": -1.8723375797271729, + "logits/rejected": -2.0270888805389404, + "logps/chosen": -285.54547119140625, + "logps/rejected": -274.7577819824219, + "loss": 0.1688, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8339341878890991, + "rewards/margins": 3.040254592895508, + "rewards/rejected": -3.8741886615753174, + "step": 7388 + }, + { + "epoch": 0.85, + "learning_rate": 4.51246634671661e-08, + "logits/chosen": -1.9493401050567627, + "logits/rejected": -2.1283280849456787, + "logps/chosen": -485.62109375, + "logps/rejected": -311.7141418457031, + "loss": 1.0304, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6012643575668335, + "rewards/margins": 0.7051357626914978, + "rewards/rejected": -2.3064000606536865, + "step": 7389 + }, + { + "epoch": 0.85, + "learning_rate": 4.5089546997541844e-08, + "logits/chosen": -2.9167561531066895, + "logits/rejected": -2.641068935394287, + "logps/chosen": -268.8375244140625, + "logps/rejected": -302.04461669921875, + "loss": 0.6986, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1066226959228516, + "rewards/margins": 1.940929889678955, + "rewards/rejected": -4.047552585601807, + "step": 7390 + }, + { + "epoch": 0.85, + "learning_rate": 4.505443052791759e-08, + "logits/chosen": -1.9490737915039062, + "logits/rejected": -2.2306649684906006, + "logps/chosen": -589.6546630859375, + "logps/rejected": -255.03199768066406, + "loss": 0.2227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7301672101020813, + "rewards/margins": 2.036431074142456, + "rewards/rejected": -2.7665982246398926, + "step": 7391 + }, + { + "epoch": 0.85, + "learning_rate": 4.501931405829333e-08, + "logits/chosen": -2.179072618484497, + "logits/rejected": -2.2032570838928223, + "logps/chosen": -261.5062255859375, + "logps/rejected": -157.56529235839844, + "loss": 1.1179, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.042496681213379, + "rewards/margins": 0.40971341729164124, + "rewards/rejected": -2.452209949493408, + "step": 7392 + }, + { + "epoch": 0.85, + "learning_rate": 4.4984197588669086e-08, + "logits/chosen": -2.232572555541992, + "logits/rejected": -2.144324779510498, + "logps/chosen": -350.55242919921875, + "logps/rejected": -324.6617431640625, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4654722511768341, + "rewards/margins": 3.5676841735839844, + "rewards/rejected": -4.033156394958496, + "step": 7393 + }, + { + "epoch": 0.85, + "learning_rate": 4.4949081119044826e-08, + "logits/chosen": -1.8958989381790161, + "logits/rejected": -2.1752772331237793, + "logps/chosen": -279.81622314453125, + "logps/rejected": -182.52980041503906, + "loss": 0.7575, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9792165160179138, + "rewards/margins": 0.5440546274185181, + "rewards/rejected": -1.5232712030410767, + "step": 7394 + }, + { + "epoch": 0.85, + "learning_rate": 4.491396464942058e-08, + "logits/chosen": -2.109734296798706, + "logits/rejected": -2.3997347354888916, + "logps/chosen": -482.4043884277344, + "logps/rejected": -327.7159423828125, + "loss": 0.2026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5063114166259766, + "rewards/margins": 2.255519151687622, + "rewards/rejected": -2.7618303298950195, + "step": 7395 + }, + { + "epoch": 0.85, + "learning_rate": 4.487884817979632e-08, + "logits/chosen": -2.09839129447937, + "logits/rejected": -2.1940572261810303, + "logps/chosen": -449.2315368652344, + "logps/rejected": -380.22149658203125, + "loss": 0.42, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3357468843460083, + "rewards/margins": 2.2085657119750977, + "rewards/rejected": -3.5443129539489746, + "step": 7396 + }, + { + "epoch": 0.85, + "learning_rate": 4.484373171017207e-08, + "logits/chosen": -2.326345443725586, + "logits/rejected": -2.3276400566101074, + "logps/chosen": -219.04977416992188, + "logps/rejected": -287.55389404296875, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7755824327468872, + "rewards/margins": 0.8877670764923096, + "rewards/rejected": -1.6633495092391968, + "step": 7397 + }, + { + "epoch": 0.85, + "learning_rate": 4.480861524054781e-08, + "logits/chosen": -2.2928884029388428, + "logits/rejected": -2.395087718963623, + "logps/chosen": -489.06109619140625, + "logps/rejected": -364.6448669433594, + "loss": 0.2556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5218384861946106, + "rewards/margins": 1.821781873703003, + "rewards/rejected": -2.3436203002929688, + "step": 7398 + }, + { + "epoch": 0.85, + "learning_rate": 4.477349877092356e-08, + "logits/chosen": -2.741809129714966, + "logits/rejected": -2.6927616596221924, + "logps/chosen": -137.96963500976562, + "logps/rejected": -234.47283935546875, + "loss": 0.2839, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11734944581985474, + "rewards/margins": 2.4295852184295654, + "rewards/rejected": -2.3122355937957764, + "step": 7399 + }, + { + "epoch": 0.85, + "learning_rate": 4.47383823012993e-08, + "logits/chosen": -1.9918667078018188, + "logits/rejected": -2.1162450313568115, + "logps/chosen": -210.716796875, + "logps/rejected": -181.57791137695312, + "loss": 0.4389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8065052628517151, + "rewards/margins": 1.4932416677474976, + "rewards/rejected": -2.2997469902038574, + "step": 7400 + }, + { + "epoch": 0.85, + "learning_rate": 4.470326583167506e-08, + "logits/chosen": -2.1317691802978516, + "logits/rejected": -2.3473081588745117, + "logps/chosen": -353.0464172363281, + "logps/rejected": -184.32537841796875, + "loss": 0.698, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5876554250717163, + "rewards/margins": 0.4898996949195862, + "rewards/rejected": -2.0775551795959473, + "step": 7401 + }, + { + "epoch": 0.85, + "learning_rate": 4.46681493620508e-08, + "logits/chosen": -2.3216912746429443, + "logits/rejected": -2.526937961578369, + "logps/chosen": -341.1524658203125, + "logps/rejected": -266.25006103515625, + "loss": 0.1866, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2110936641693115, + "rewards/margins": 2.112553358078003, + "rewards/rejected": -3.3236470222473145, + "step": 7402 + }, + { + "epoch": 0.85, + "learning_rate": 4.4633032892426545e-08, + "logits/chosen": -1.6644803285598755, + "logits/rejected": -1.9826099872589111, + "logps/chosen": -363.97723388671875, + "logps/rejected": -297.8166198730469, + "loss": 0.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1602991223335266, + "rewards/margins": 1.5192598104476929, + "rewards/rejected": -1.6795589923858643, + "step": 7403 + }, + { + "epoch": 0.85, + "learning_rate": 4.459791642280229e-08, + "logits/chosen": -2.807807445526123, + "logits/rejected": -2.9258174896240234, + "logps/chosen": -321.53460693359375, + "logps/rejected": -276.27325439453125, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.719008207321167, + "rewards/margins": 4.355127334594727, + "rewards/rejected": -6.074135780334473, + "step": 7404 + }, + { + "epoch": 0.85, + "learning_rate": 4.456279995317804e-08, + "logits/chosen": -2.2895302772521973, + "logits/rejected": -2.0814907550811768, + "logps/chosen": -337.15667724609375, + "logps/rejected": -257.2089538574219, + "loss": 0.2618, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.523491382598877, + "rewards/margins": 2.574216842651367, + "rewards/rejected": -4.097708225250244, + "step": 7405 + }, + { + "epoch": 0.85, + "learning_rate": 4.452768348355378e-08, + "logits/chosen": -2.438518524169922, + "logits/rejected": -2.5979645252227783, + "logps/chosen": -203.59835815429688, + "logps/rejected": -193.14569091796875, + "loss": 0.5022, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5971495509147644, + "rewards/margins": 1.5425304174423218, + "rewards/rejected": -2.1396801471710205, + "step": 7406 + }, + { + "epoch": 0.85, + "learning_rate": 4.4492567013929534e-08, + "logits/chosen": -1.8132691383361816, + "logits/rejected": -1.6804763078689575, + "logps/chosen": -384.73785400390625, + "logps/rejected": -285.72052001953125, + "loss": 0.1619, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04306548833847046, + "rewards/margins": 2.5854241847991943, + "rewards/rejected": -2.542358636856079, + "step": 7407 + }, + { + "epoch": 0.85, + "learning_rate": 4.4457450544305275e-08, + "logits/chosen": -2.1266748905181885, + "logits/rejected": -2.1621580123901367, + "logps/chosen": -248.78713989257812, + "logps/rejected": -226.697021484375, + "loss": 0.4751, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0329198837280273, + "rewards/margins": 1.6879171133041382, + "rewards/rejected": -2.720837116241455, + "step": 7408 + }, + { + "epoch": 0.85, + "learning_rate": 4.442233407468103e-08, + "logits/chosen": -2.0729854106903076, + "logits/rejected": -2.378262996673584, + "logps/chosen": -459.76495361328125, + "logps/rejected": -391.6922607421875, + "loss": 0.2161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.938096284866333, + "rewards/margins": 3.4874649047851562, + "rewards/rejected": -4.425561428070068, + "step": 7409 + }, + { + "epoch": 0.85, + "learning_rate": 4.438721760505677e-08, + "logits/chosen": -2.071388006210327, + "logits/rejected": -1.9589684009552002, + "logps/chosen": -150.6940460205078, + "logps/rejected": -219.40733337402344, + "loss": 0.3961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8232029676437378, + "rewards/margins": 0.9192464351654053, + "rewards/rejected": -1.7424492835998535, + "step": 7410 + }, + { + "epoch": 0.85, + "learning_rate": 4.4352101135432516e-08, + "logits/chosen": -2.175917387008667, + "logits/rejected": -2.1329588890075684, + "logps/chosen": -226.20223999023438, + "logps/rejected": -280.3760070800781, + "loss": 0.359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6756523847579956, + "rewards/margins": 2.1090877056121826, + "rewards/rejected": -2.7847402095794678, + "step": 7411 + }, + { + "epoch": 0.85, + "learning_rate": 4.4316984665808264e-08, + "logits/chosen": -2.1825129985809326, + "logits/rejected": -2.400993824005127, + "logps/chosen": -429.58154296875, + "logps/rejected": -296.39697265625, + "loss": 0.3917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5775346159934998, + "rewards/margins": 2.1420536041259766, + "rewards/rejected": -2.719588279724121, + "step": 7412 + }, + { + "epoch": 0.85, + "learning_rate": 4.428186819618401e-08, + "logits/chosen": -2.7672977447509766, + "logits/rejected": -2.6685643196105957, + "logps/chosen": -220.31448364257812, + "logps/rejected": -197.93553161621094, + "loss": 0.3523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6600639820098877, + "rewards/margins": 1.8007986545562744, + "rewards/rejected": -2.460862398147583, + "step": 7413 + }, + { + "epoch": 0.85, + "learning_rate": 4.424675172655975e-08, + "logits/chosen": -2.2652482986450195, + "logits/rejected": -2.335005044937134, + "logps/chosen": -214.79722595214844, + "logps/rejected": -306.58837890625, + "loss": 0.4786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5824177265167236, + "rewards/margins": 0.9495453834533691, + "rewards/rejected": -1.5319631099700928, + "step": 7414 + }, + { + "epoch": 0.85, + "learning_rate": 4.4211635256935505e-08, + "logits/chosen": -1.659095048904419, + "logits/rejected": -1.9365657567977905, + "logps/chosen": -399.21234130859375, + "logps/rejected": -280.1260070800781, + "loss": 0.5218, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1844228506088257, + "rewards/margins": 1.2074984312057495, + "rewards/rejected": -2.391921281814575, + "step": 7415 + }, + { + "epoch": 0.85, + "learning_rate": 4.4176518787311246e-08, + "logits/chosen": -1.881102442741394, + "logits/rejected": -2.0366194248199463, + "logps/chosen": -532.5531005859375, + "logps/rejected": -359.7048645019531, + "loss": 0.3786, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.790534257888794, + "rewards/margins": 1.6200798749923706, + "rewards/rejected": -2.410614252090454, + "step": 7416 + }, + { + "epoch": 0.86, + "learning_rate": 4.414140231768699e-08, + "logits/chosen": -2.5884742736816406, + "logits/rejected": -2.5323028564453125, + "logps/chosen": -203.90982055664062, + "logps/rejected": -356.2254638671875, + "loss": 0.1565, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3009432554244995, + "rewards/margins": 4.775388717651367, + "rewards/rejected": -6.076332092285156, + "step": 7417 + }, + { + "epoch": 0.86, + "learning_rate": 4.410628584806274e-08, + "logits/chosen": -1.9619522094726562, + "logits/rejected": -2.2848520278930664, + "logps/chosen": -425.37152099609375, + "logps/rejected": -407.6894226074219, + "loss": 0.2428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49090248346328735, + "rewards/margins": 2.158677816390991, + "rewards/rejected": -2.6495800018310547, + "step": 7418 + }, + { + "epoch": 0.86, + "learning_rate": 4.407116937843849e-08, + "logits/chosen": -2.853447914123535, + "logits/rejected": -2.790773391723633, + "logps/chosen": -231.89601135253906, + "logps/rejected": -287.97369384765625, + "loss": 0.3428, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3027158975601196, + "rewards/margins": 2.4594154357910156, + "rewards/rejected": -3.762131452560425, + "step": 7419 + }, + { + "epoch": 0.86, + "learning_rate": 4.403605290881423e-08, + "logits/chosen": -2.4921090602874756, + "logits/rejected": -2.485990285873413, + "logps/chosen": -274.7544860839844, + "logps/rejected": -224.8227081298828, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.691592812538147, + "rewards/margins": 1.7348557710647583, + "rewards/rejected": -2.4264485836029053, + "step": 7420 + }, + { + "epoch": 0.86, + "learning_rate": 4.400093643918998e-08, + "logits/chosen": -2.558352470397949, + "logits/rejected": -2.3345627784729004, + "logps/chosen": -163.74636840820312, + "logps/rejected": -233.4769287109375, + "loss": 0.4697, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41483253240585327, + "rewards/margins": 3.386021614074707, + "rewards/rejected": -3.800853729248047, + "step": 7421 + }, + { + "epoch": 0.86, + "learning_rate": 4.396581996956572e-08, + "logits/chosen": -2.157345771789551, + "logits/rejected": -2.3440680503845215, + "logps/chosen": -248.76683044433594, + "logps/rejected": -249.0823516845703, + "loss": 0.3031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3297642469406128, + "rewards/margins": 3.248934745788574, + "rewards/rejected": -4.578699111938477, + "step": 7422 + }, + { + "epoch": 0.86, + "learning_rate": 4.3930703499941477e-08, + "logits/chosen": -2.4552242755889893, + "logits/rejected": -2.5504660606384277, + "logps/chosen": -307.13311767578125, + "logps/rejected": -275.07696533203125, + "loss": 0.6584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6783178448677063, + "rewards/margins": 1.8819890022277832, + "rewards/rejected": -2.560307025909424, + "step": 7423 + }, + { + "epoch": 0.86, + "learning_rate": 4.389558703031722e-08, + "logits/chosen": -2.2303550243377686, + "logits/rejected": -2.302067279815674, + "logps/chosen": -156.11517333984375, + "logps/rejected": -193.0673828125, + "loss": 0.5183, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4537349939346313, + "rewards/margins": 1.7981752157211304, + "rewards/rejected": -3.2519102096557617, + "step": 7424 + }, + { + "epoch": 0.86, + "learning_rate": 4.3860470560692964e-08, + "logits/chosen": -2.706059217453003, + "logits/rejected": -2.844766616821289, + "logps/chosen": -350.58734130859375, + "logps/rejected": -218.78318786621094, + "loss": 0.3761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8804764151573181, + "rewards/margins": 2.0639660358428955, + "rewards/rejected": -2.9444425106048584, + "step": 7425 + }, + { + "epoch": 0.86, + "learning_rate": 4.382535409106871e-08, + "logits/chosen": -1.713974952697754, + "logits/rejected": -1.8821617364883423, + "logps/chosen": -357.95440673828125, + "logps/rejected": -313.55914306640625, + "loss": 0.35, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2801162004470825, + "rewards/margins": 1.8493115901947021, + "rewards/rejected": -3.129427909851074, + "step": 7426 + }, + { + "epoch": 0.86, + "learning_rate": 4.379023762144446e-08, + "logits/chosen": -2.0263004302978516, + "logits/rejected": -2.362539291381836, + "logps/chosen": -331.5515441894531, + "logps/rejected": -280.1369323730469, + "loss": 0.3865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7323320508003235, + "rewards/margins": 1.6369524002075195, + "rewards/rejected": -2.3692846298217773, + "step": 7427 + }, + { + "epoch": 0.86, + "learning_rate": 4.37551211518202e-08, + "logits/chosen": -2.2460291385650635, + "logits/rejected": -2.330732822418213, + "logps/chosen": -304.064208984375, + "logps/rejected": -186.72645568847656, + "loss": 0.7823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.663807213306427, + "rewards/margins": 0.7997848391532898, + "rewards/rejected": -1.4635921716690063, + "step": 7428 + }, + { + "epoch": 0.86, + "learning_rate": 4.3720004682195953e-08, + "logits/chosen": -2.53085994720459, + "logits/rejected": -2.3026647567749023, + "logps/chosen": -106.99110412597656, + "logps/rejected": -236.2189178466797, + "loss": 0.2988, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4526205062866211, + "rewards/margins": 2.5999560356140137, + "rewards/rejected": -3.052577018737793, + "step": 7429 + }, + { + "epoch": 0.86, + "learning_rate": 4.3684888212571694e-08, + "logits/chosen": -2.2865138053894043, + "logits/rejected": -2.16333270072937, + "logps/chosen": -220.83822631835938, + "logps/rejected": -310.80865478515625, + "loss": 0.3192, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.839601755142212, + "rewards/margins": 1.425471305847168, + "rewards/rejected": -3.265073299407959, + "step": 7430 + }, + { + "epoch": 0.86, + "learning_rate": 4.364977174294744e-08, + "logits/chosen": -2.0317132472991943, + "logits/rejected": -2.081476926803589, + "logps/chosen": -399.7029113769531, + "logps/rejected": -367.17486572265625, + "loss": 0.4221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.422279953956604, + "rewards/margins": 1.5719629526138306, + "rewards/rejected": -2.9942429065704346, + "step": 7431 + }, + { + "epoch": 0.86, + "learning_rate": 4.361465527332319e-08, + "logits/chosen": -2.337174892425537, + "logits/rejected": -2.386417865753174, + "logps/chosen": -184.50242614746094, + "logps/rejected": -265.27117919921875, + "loss": 0.734, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.75364089012146, + "rewards/margins": 1.7079036235809326, + "rewards/rejected": -3.4615447521209717, + "step": 7432 + }, + { + "epoch": 0.86, + "learning_rate": 4.3579538803698936e-08, + "logits/chosen": -2.0243780612945557, + "logits/rejected": -2.055367946624756, + "logps/chosen": -313.669677734375, + "logps/rejected": -272.46429443359375, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5029652714729309, + "rewards/margins": 1.8461456298828125, + "rewards/rejected": -2.3491110801696777, + "step": 7433 + }, + { + "epoch": 0.86, + "learning_rate": 4.3544422334074676e-08, + "logits/chosen": -1.978485345840454, + "logits/rejected": -2.0264461040496826, + "logps/chosen": -286.7430114746094, + "logps/rejected": -242.92913818359375, + "loss": 0.5374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13977591693401337, + "rewards/margins": 2.2109639644622803, + "rewards/rejected": -2.3507397174835205, + "step": 7434 + }, + { + "epoch": 0.86, + "learning_rate": 4.3509305864450424e-08, + "logits/chosen": -2.659950017929077, + "logits/rejected": -2.906338691711426, + "logps/chosen": -170.6924285888672, + "logps/rejected": -246.09249877929688, + "loss": 0.3155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1937263011932373, + "rewards/margins": 2.737025499343872, + "rewards/rejected": -2.9307518005371094, + "step": 7435 + }, + { + "epoch": 0.86, + "learning_rate": 4.347418939482617e-08, + "logits/chosen": -2.604332685470581, + "logits/rejected": -2.663775682449341, + "logps/chosen": -338.0754699707031, + "logps/rejected": -312.74298095703125, + "loss": 0.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5592730641365051, + "rewards/margins": 3.643566370010376, + "rewards/rejected": -4.202839374542236, + "step": 7436 + }, + { + "epoch": 0.86, + "learning_rate": 4.343907292520191e-08, + "logits/chosen": -2.1131303310394287, + "logits/rejected": -2.1810050010681152, + "logps/chosen": -505.82183837890625, + "logps/rejected": -491.3038330078125, + "loss": 0.1378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7242613434791565, + "rewards/margins": 3.8325581550598145, + "rewards/rejected": -4.556819915771484, + "step": 7437 + }, + { + "epoch": 0.86, + "learning_rate": 4.3403956455577665e-08, + "logits/chosen": -1.7086243629455566, + "logits/rejected": -2.0247721672058105, + "logps/chosen": -470.3686218261719, + "logps/rejected": -309.64923095703125, + "loss": 0.2768, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8166908025741577, + "rewards/margins": 3.0296006202697754, + "rewards/rejected": -3.8462915420532227, + "step": 7438 + }, + { + "epoch": 0.86, + "learning_rate": 4.3368839985953406e-08, + "logits/chosen": -1.9433573484420776, + "logits/rejected": -2.136730432510376, + "logps/chosen": -512.1484375, + "logps/rejected": -380.3797302246094, + "loss": 0.0694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.043488115072250366, + "rewards/margins": 3.1396801471710205, + "rewards/rejected": -3.183168411254883, + "step": 7439 + }, + { + "epoch": 0.86, + "learning_rate": 4.333372351632916e-08, + "logits/chosen": -2.085420846939087, + "logits/rejected": -2.072044849395752, + "logps/chosen": -280.26959228515625, + "logps/rejected": -242.2276611328125, + "loss": 0.2467, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05652652680873871, + "rewards/margins": 1.9702247381210327, + "rewards/rejected": -1.9136983156204224, + "step": 7440 + }, + { + "epoch": 0.86, + "learning_rate": 4.32986070467049e-08, + "logits/chosen": -2.24910831451416, + "logits/rejected": -2.102224588394165, + "logps/chosen": -183.86863708496094, + "logps/rejected": -305.7812805175781, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5528884530067444, + "rewards/margins": 3.1780643463134766, + "rewards/rejected": -3.7309529781341553, + "step": 7441 + }, + { + "epoch": 0.86, + "learning_rate": 4.326349057708065e-08, + "logits/chosen": -2.6978721618652344, + "logits/rejected": -2.72873592376709, + "logps/chosen": -232.96267700195312, + "logps/rejected": -259.0627136230469, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9784818291664124, + "rewards/margins": 1.9933815002441406, + "rewards/rejected": -2.9718635082244873, + "step": 7442 + }, + { + "epoch": 0.86, + "learning_rate": 4.3228374107456395e-08, + "logits/chosen": -2.0610952377319336, + "logits/rejected": -1.9227511882781982, + "logps/chosen": -226.53274536132812, + "logps/rejected": -325.72894287109375, + "loss": 0.3358, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1252495050430298, + "rewards/margins": 2.412069797515869, + "rewards/rejected": -3.5373191833496094, + "step": 7443 + }, + { + "epoch": 0.86, + "learning_rate": 4.319325763783214e-08, + "logits/chosen": -2.987138032913208, + "logits/rejected": -2.9821767807006836, + "logps/chosen": -285.69232177734375, + "logps/rejected": -396.85443115234375, + "loss": 0.1087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7393222451210022, + "rewards/margins": 4.984231472015381, + "rewards/rejected": -5.723553657531738, + "step": 7444 + }, + { + "epoch": 0.86, + "learning_rate": 4.315814116820788e-08, + "logits/chosen": -2.3210818767547607, + "logits/rejected": -2.125767230987549, + "logps/chosen": -275.5994567871094, + "logps/rejected": -246.52757263183594, + "loss": 0.2525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9387637376785278, + "rewards/margins": 2.0375289916992188, + "rewards/rejected": -2.976292848587036, + "step": 7445 + }, + { + "epoch": 0.86, + "learning_rate": 4.3123024698583637e-08, + "logits/chosen": -2.6042673587799072, + "logits/rejected": -2.297902822494507, + "logps/chosen": -131.06507873535156, + "logps/rejected": -251.87062072753906, + "loss": 0.1754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7403950691223145, + "rewards/margins": 2.4702858924865723, + "rewards/rejected": -3.2106807231903076, + "step": 7446 + }, + { + "epoch": 0.86, + "learning_rate": 4.308790822895938e-08, + "logits/chosen": -2.024714946746826, + "logits/rejected": -2.4725561141967773, + "logps/chosen": -249.38720703125, + "logps/rejected": -183.4595947265625, + "loss": 1.0027, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7594895362854004, + "rewards/margins": 0.05625259876251221, + "rewards/rejected": -1.815742015838623, + "step": 7447 + }, + { + "epoch": 0.86, + "learning_rate": 4.3052791759335124e-08, + "logits/chosen": -1.8797751665115356, + "logits/rejected": -2.049220085144043, + "logps/chosen": -412.44329833984375, + "logps/rejected": -228.8227996826172, + "loss": 0.5419, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8161269426345825, + "rewards/margins": 0.9367996454238892, + "rewards/rejected": -1.7529265880584717, + "step": 7448 + }, + { + "epoch": 0.86, + "learning_rate": 4.301767528971087e-08, + "logits/chosen": -1.8215161561965942, + "logits/rejected": -1.9123318195343018, + "logps/chosen": -282.0595703125, + "logps/rejected": -288.69268798828125, + "loss": 0.5091, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.319616675376892, + "rewards/margins": 1.9542500972747803, + "rewards/rejected": -3.273866891860962, + "step": 7449 + }, + { + "epoch": 0.86, + "learning_rate": 4.298255882008662e-08, + "logits/chosen": -2.6649045944213867, + "logits/rejected": -2.578676462173462, + "logps/chosen": -265.76593017578125, + "logps/rejected": -293.56298828125, + "loss": 0.2955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7689720988273621, + "rewards/margins": 1.4938256740570068, + "rewards/rejected": -2.2627978324890137, + "step": 7450 + }, + { + "epoch": 0.86, + "learning_rate": 4.294744235046236e-08, + "logits/chosen": -2.2741501331329346, + "logits/rejected": -2.274785280227661, + "logps/chosen": -264.5917053222656, + "logps/rejected": -330.68548583984375, + "loss": 0.908, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.363509178161621, + "rewards/margins": 2.998049020767212, + "rewards/rejected": -5.361557960510254, + "step": 7451 + }, + { + "epoch": 0.86, + "learning_rate": 4.2912325880838113e-08, + "logits/chosen": -2.3355870246887207, + "logits/rejected": -2.421881914138794, + "logps/chosen": -121.90846252441406, + "logps/rejected": -164.748779296875, + "loss": 0.2998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47456130385398865, + "rewards/margins": 1.7894978523254395, + "rewards/rejected": -2.264059066772461, + "step": 7452 + }, + { + "epoch": 0.86, + "learning_rate": 4.2877209411213854e-08, + "logits/chosen": -2.476320743560791, + "logits/rejected": -2.794128894805908, + "logps/chosen": -413.2742614746094, + "logps/rejected": -272.76031494140625, + "loss": 1.0768, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9584975242614746, + "rewards/margins": 1.8863873481750488, + "rewards/rejected": -3.8448848724365234, + "step": 7453 + }, + { + "epoch": 0.86, + "learning_rate": 4.284209294158961e-08, + "logits/chosen": -1.6976286172866821, + "logits/rejected": -2.028202533721924, + "logps/chosen": -410.1387939453125, + "logps/rejected": -329.0910949707031, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9719187617301941, + "rewards/margins": 3.004127264022827, + "rewards/rejected": -3.976045846939087, + "step": 7454 + }, + { + "epoch": 0.86, + "learning_rate": 4.280697647196535e-08, + "logits/chosen": -1.851546049118042, + "logits/rejected": -2.0967025756835938, + "logps/chosen": -298.8568115234375, + "logps/rejected": -223.21798706054688, + "loss": 0.6766, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.434117078781128, + "rewards/margins": 1.677457571029663, + "rewards/rejected": -3.111574649810791, + "step": 7455 + }, + { + "epoch": 0.86, + "learning_rate": 4.2771860002341096e-08, + "logits/chosen": -1.886885643005371, + "logits/rejected": -2.1038894653320312, + "logps/chosen": -277.044677734375, + "logps/rejected": -233.49801635742188, + "loss": 0.4483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8859398365020752, + "rewards/margins": 0.8691480159759521, + "rewards/rejected": -1.755087971687317, + "step": 7456 + }, + { + "epoch": 0.86, + "learning_rate": 4.273674353271684e-08, + "logits/chosen": -2.3416616916656494, + "logits/rejected": -2.4718308448791504, + "logps/chosen": -577.15966796875, + "logps/rejected": -402.67657470703125, + "loss": 0.4281, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.608887255191803, + "rewards/margins": 0.9860118627548218, + "rewards/rejected": -1.59489905834198, + "step": 7457 + }, + { + "epoch": 0.86, + "learning_rate": 4.270162706309259e-08, + "logits/chosen": -2.391200304031372, + "logits/rejected": -2.371710777282715, + "logps/chosen": -380.8275146484375, + "logps/rejected": -379.1246032714844, + "loss": 0.1566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4903583526611328, + "rewards/margins": 2.577174663543701, + "rewards/rejected": -3.067533016204834, + "step": 7458 + }, + { + "epoch": 0.86, + "learning_rate": 4.266651059346833e-08, + "logits/chosen": -2.3995304107666016, + "logits/rejected": -2.3660783767700195, + "logps/chosen": -340.17132568359375, + "logps/rejected": -261.7540283203125, + "loss": 0.2394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6598501205444336, + "rewards/margins": 2.333737850189209, + "rewards/rejected": -2.9935879707336426, + "step": 7459 + }, + { + "epoch": 0.86, + "learning_rate": 4.2631394123844085e-08, + "logits/chosen": -2.8603575229644775, + "logits/rejected": -2.7975025177001953, + "logps/chosen": -198.6427001953125, + "logps/rejected": -294.1598205566406, + "loss": 0.6395, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3900195360183716, + "rewards/margins": 1.7241451740264893, + "rewards/rejected": -3.1141648292541504, + "step": 7460 + }, + { + "epoch": 0.86, + "learning_rate": 4.2596277654219825e-08, + "logits/chosen": -2.6532886028289795, + "logits/rejected": -2.717475175857544, + "logps/chosen": -210.23016357421875, + "logps/rejected": -242.25262451171875, + "loss": 0.6243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6128408908843994, + "rewards/margins": 4.096386909484863, + "rewards/rejected": -4.709227561950684, + "step": 7461 + }, + { + "epoch": 0.86, + "learning_rate": 4.256116118459558e-08, + "logits/chosen": -2.1740827560424805, + "logits/rejected": -2.6213157176971436, + "logps/chosen": -374.8035583496094, + "logps/rejected": -208.0325164794922, + "loss": 0.2397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023208528757095337, + "rewards/margins": 1.99513578414917, + "rewards/rejected": -1.971927285194397, + "step": 7462 + }, + { + "epoch": 0.86, + "learning_rate": 4.252604471497132e-08, + "logits/chosen": -2.3127636909484863, + "logits/rejected": -2.413865089416504, + "logps/chosen": -326.67718505859375, + "logps/rejected": -255.36495971679688, + "loss": 0.4859, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35008060932159424, + "rewards/margins": 1.467564582824707, + "rewards/rejected": -1.8176451921463013, + "step": 7463 + }, + { + "epoch": 0.86, + "learning_rate": 4.249092824534707e-08, + "logits/chosen": -2.294793128967285, + "logits/rejected": -2.3757777214050293, + "logps/chosen": -167.74911499023438, + "logps/rejected": -145.938720703125, + "loss": 0.559, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.194220781326294, + "rewards/margins": 0.8626735210418701, + "rewards/rejected": -2.056894302368164, + "step": 7464 + }, + { + "epoch": 0.86, + "learning_rate": 4.245581177572281e-08, + "logits/chosen": -2.539025068283081, + "logits/rejected": -2.3619205951690674, + "logps/chosen": -255.9796142578125, + "logps/rejected": -291.4810791015625, + "loss": 0.1897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7241159081459045, + "rewards/margins": 2.827911138534546, + "rewards/rejected": -3.5520272254943848, + "step": 7465 + }, + { + "epoch": 0.86, + "learning_rate": 4.242069530609856e-08, + "logits/chosen": -1.748805284500122, + "logits/rejected": -1.9884674549102783, + "logps/chosen": -547.159423828125, + "logps/rejected": -389.881103515625, + "loss": 1.0407, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2570686340332031, + "rewards/margins": 0.460968554019928, + "rewards/rejected": -1.7180371284484863, + "step": 7466 + }, + { + "epoch": 0.86, + "learning_rate": 4.23855788364743e-08, + "logits/chosen": -2.8498973846435547, + "logits/rejected": -2.750659465789795, + "logps/chosen": -268.2320556640625, + "logps/rejected": -183.88436889648438, + "loss": 0.5146, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6841235160827637, + "rewards/margins": 0.7594789862632751, + "rewards/rejected": -2.4436025619506836, + "step": 7467 + }, + { + "epoch": 0.86, + "learning_rate": 4.2350462366850056e-08, + "logits/chosen": -2.679567813873291, + "logits/rejected": -2.534468650817871, + "logps/chosen": -150.13973999023438, + "logps/rejected": -169.88075256347656, + "loss": 0.7966, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.157470703125, + "rewards/margins": 0.7844750285148621, + "rewards/rejected": -1.9419457912445068, + "step": 7468 + }, + { + "epoch": 0.86, + "learning_rate": 4.2315345897225797e-08, + "logits/chosen": -2.1812939643859863, + "logits/rejected": -2.2468156814575195, + "logps/chosen": -192.39114379882812, + "logps/rejected": -177.3850555419922, + "loss": 1.8862, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0442419052124023, + "rewards/margins": -0.48685622215270996, + "rewards/rejected": -1.557385802268982, + "step": 7469 + }, + { + "epoch": 0.86, + "learning_rate": 4.2280229427601544e-08, + "logits/chosen": -2.3746769428253174, + "logits/rejected": -2.548194169998169, + "logps/chosen": -226.38621520996094, + "logps/rejected": -239.93124389648438, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9337755441665649, + "rewards/margins": 3.186070203781128, + "rewards/rejected": -4.119845867156982, + "step": 7470 + }, + { + "epoch": 0.86, + "learning_rate": 4.224511295797729e-08, + "logits/chosen": -2.8442296981811523, + "logits/rejected": -2.7490346431732178, + "logps/chosen": -241.84622192382812, + "logps/rejected": -158.74365234375, + "loss": 0.2569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0908689796924591, + "rewards/margins": 2.074324607849121, + "rewards/rejected": -2.165193796157837, + "step": 7471 + }, + { + "epoch": 0.86, + "learning_rate": 4.220999648835304e-08, + "logits/chosen": -2.8070526123046875, + "logits/rejected": -2.715545415878296, + "logps/chosen": -306.23089599609375, + "logps/rejected": -327.9366760253906, + "loss": 0.3443, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4176549911499023, + "rewards/margins": 2.243144989013672, + "rewards/rejected": -3.6608002185821533, + "step": 7472 + }, + { + "epoch": 0.86, + "learning_rate": 4.217488001872878e-08, + "logits/chosen": -2.6248128414154053, + "logits/rejected": -2.8444981575012207, + "logps/chosen": -237.771240234375, + "logps/rejected": -186.41690063476562, + "loss": 0.381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6656134724617004, + "rewards/margins": 3.6443896293640137, + "rewards/rejected": -4.31000280380249, + "step": 7473 + }, + { + "epoch": 0.86, + "learning_rate": 4.213976354910453e-08, + "logits/chosen": -2.504709243774414, + "logits/rejected": -2.3882761001586914, + "logps/chosen": -230.8861083984375, + "logps/rejected": -284.23126220703125, + "loss": 0.5857, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8332610726356506, + "rewards/margins": 1.3221452236175537, + "rewards/rejected": -2.1554064750671387, + "step": 7474 + }, + { + "epoch": 0.86, + "learning_rate": 4.2104647079480273e-08, + "logits/chosen": -2.4355735778808594, + "logits/rejected": -2.350598096847534, + "logps/chosen": -186.42459106445312, + "logps/rejected": -361.8608093261719, + "loss": 0.657, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.557528018951416, + "rewards/margins": 2.3513331413269043, + "rewards/rejected": -3.908860921859741, + "step": 7475 + }, + { + "epoch": 0.86, + "learning_rate": 4.206953060985603e-08, + "logits/chosen": -2.3544063568115234, + "logits/rejected": -2.487353801727295, + "logps/chosen": -260.052734375, + "logps/rejected": -198.10494995117188, + "loss": 0.4206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46662649512290955, + "rewards/margins": 1.8815114498138428, + "rewards/rejected": -2.348137855529785, + "step": 7476 + }, + { + "epoch": 0.86, + "learning_rate": 4.203441414023177e-08, + "logits/chosen": -3.022876501083374, + "logits/rejected": -3.020024538040161, + "logps/chosen": -159.88507080078125, + "logps/rejected": -180.21908569335938, + "loss": 0.2104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4659155309200287, + "rewards/margins": 3.4903454780578613, + "rewards/rejected": -3.956261157989502, + "step": 7477 + }, + { + "epoch": 0.86, + "learning_rate": 4.1999297670607515e-08, + "logits/chosen": -1.8859246969223022, + "logits/rejected": -2.045184850692749, + "logps/chosen": -350.952392578125, + "logps/rejected": -252.35641479492188, + "loss": 0.6442, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5165131688117981, + "rewards/margins": 1.0537543296813965, + "rewards/rejected": -1.5702674388885498, + "step": 7478 + }, + { + "epoch": 0.86, + "learning_rate": 4.196418120098326e-08, + "logits/chosen": -2.232231616973877, + "logits/rejected": -2.633094072341919, + "logps/chosen": -406.80889892578125, + "logps/rejected": -294.6513671875, + "loss": 0.234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7894206643104553, + "rewards/margins": 2.8567283153533936, + "rewards/rejected": -3.646148920059204, + "step": 7479 + }, + { + "epoch": 0.86, + "learning_rate": 4.192906473135901e-08, + "logits/chosen": -2.4213578701019287, + "logits/rejected": -2.4989736080169678, + "logps/chosen": -282.13665771484375, + "logps/rejected": -273.6277770996094, + "loss": 0.8126, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.502234697341919, + "rewards/margins": 0.4509034752845764, + "rewards/rejected": -1.9531382322311401, + "step": 7480 + }, + { + "epoch": 0.86, + "learning_rate": 4.189394826173475e-08, + "logits/chosen": -2.926534652709961, + "logits/rejected": -2.943114757537842, + "logps/chosen": -158.57801818847656, + "logps/rejected": -201.6519775390625, + "loss": 0.2581, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2671055793762207, + "rewards/margins": 3.170275926589966, + "rewards/rejected": -3.4373812675476074, + "step": 7481 + }, + { + "epoch": 0.86, + "learning_rate": 4.185883179211049e-08, + "logits/chosen": -1.9633768796920776, + "logits/rejected": -1.8020273447036743, + "logps/chosen": -287.7214050292969, + "logps/rejected": -266.09246826171875, + "loss": 0.1914, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2234469652175903, + "rewards/margins": 2.4525394439697266, + "rewards/rejected": -3.6759862899780273, + "step": 7482 + }, + { + "epoch": 0.86, + "learning_rate": 4.1823715322486245e-08, + "logits/chosen": -2.596146583557129, + "logits/rejected": -2.796454906463623, + "logps/chosen": -378.31121826171875, + "logps/rejected": -325.807861328125, + "loss": 0.3547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7134103775024414, + "rewards/margins": 2.0523793697357178, + "rewards/rejected": -2.7657899856567383, + "step": 7483 + }, + { + "epoch": 0.86, + "learning_rate": 4.1788598852861985e-08, + "logits/chosen": -1.7476987838745117, + "logits/rejected": -2.1250133514404297, + "logps/chosen": -466.0035400390625, + "logps/rejected": -336.5951232910156, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27531227469444275, + "rewards/margins": 2.4159553050994873, + "rewards/rejected": -2.691267490386963, + "step": 7484 + }, + { + "epoch": 0.86, + "learning_rate": 4.175348238323774e-08, + "logits/chosen": -2.8208353519439697, + "logits/rejected": -2.8272671699523926, + "logps/chosen": -356.6827392578125, + "logps/rejected": -255.71148681640625, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7494503855705261, + "rewards/margins": 2.363878011703491, + "rewards/rejected": -3.113328456878662, + "step": 7485 + }, + { + "epoch": 0.86, + "learning_rate": 4.171836591361348e-08, + "logits/chosen": -2.3949296474456787, + "logits/rejected": -2.353266716003418, + "logps/chosen": -445.608642578125, + "logps/rejected": -335.0173034667969, + "loss": 0.5153, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9614450931549072, + "rewards/margins": 1.9081840515136719, + "rewards/rejected": -2.869629383087158, + "step": 7486 + }, + { + "epoch": 0.86, + "learning_rate": 4.168324944398923e-08, + "logits/chosen": -1.6578588485717773, + "logits/rejected": -2.205876350402832, + "logps/chosen": -342.8338623046875, + "logps/rejected": -301.457763671875, + "loss": 0.4397, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.372291922569275, + "rewards/margins": 2.395200490951538, + "rewards/rejected": -3.7674925327301025, + "step": 7487 + }, + { + "epoch": 0.86, + "learning_rate": 4.1648132974364974e-08, + "logits/chosen": -1.9005407094955444, + "logits/rejected": -2.020331859588623, + "logps/chosen": -378.144287109375, + "logps/rejected": -287.0534973144531, + "loss": 0.6462, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0650299787521362, + "rewards/margins": 1.4762245416641235, + "rewards/rejected": -2.5412545204162598, + "step": 7488 + }, + { + "epoch": 0.86, + "learning_rate": 4.161301650474072e-08, + "logits/chosen": -2.0083796977996826, + "logits/rejected": -2.065070390701294, + "logps/chosen": -213.40992736816406, + "logps/rejected": -196.4884033203125, + "loss": 0.303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2848341464996338, + "rewards/margins": 1.7034051418304443, + "rewards/rejected": -1.9882392883300781, + "step": 7489 + }, + { + "epoch": 0.86, + "learning_rate": 4.157790003511646e-08, + "logits/chosen": -2.1465110778808594, + "logits/rejected": -2.215989112854004, + "logps/chosen": -168.363037109375, + "logps/rejected": -215.23068237304688, + "loss": 0.5291, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.161086082458496, + "rewards/margins": 2.4795281887054443, + "rewards/rejected": -3.6406142711639404, + "step": 7490 + }, + { + "epoch": 0.86, + "learning_rate": 4.1542783565492216e-08, + "logits/chosen": -2.350146532058716, + "logits/rejected": -2.233332395553589, + "logps/chosen": -154.01116943359375, + "logps/rejected": -261.0755310058594, + "loss": 0.2359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5756946802139282, + "rewards/margins": 2.7197978496551514, + "rewards/rejected": -3.295492172241211, + "step": 7491 + }, + { + "epoch": 0.86, + "learning_rate": 4.1507667095867957e-08, + "logits/chosen": -2.783395767211914, + "logits/rejected": -2.836336612701416, + "logps/chosen": -165.8664093017578, + "logps/rejected": -194.21527099609375, + "loss": 0.2394, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0431623458862305, + "rewards/margins": 2.1451311111450195, + "rewards/rejected": -3.188293695449829, + "step": 7492 + }, + { + "epoch": 0.86, + "learning_rate": 4.147255062624371e-08, + "logits/chosen": -2.414186954498291, + "logits/rejected": -2.331249952316284, + "logps/chosen": -187.3816680908203, + "logps/rejected": -280.74749755859375, + "loss": 0.1371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2066311091184616, + "rewards/margins": 2.8901665210723877, + "rewards/rejected": -3.096797466278076, + "step": 7493 + }, + { + "epoch": 0.86, + "learning_rate": 4.143743415661945e-08, + "logits/chosen": -1.6206676959991455, + "logits/rejected": -1.5441694259643555, + "logps/chosen": -402.8094177246094, + "logps/rejected": -433.8321838378906, + "loss": 0.5822, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8185575008392334, + "rewards/margins": 1.1413655281066895, + "rewards/rejected": -1.9599229097366333, + "step": 7494 + }, + { + "epoch": 0.86, + "learning_rate": 4.14023176869952e-08, + "logits/chosen": -2.807008981704712, + "logits/rejected": -2.858397960662842, + "logps/chosen": -164.58627319335938, + "logps/rejected": -204.79397583007812, + "loss": 0.188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2735743522644043, + "rewards/margins": 2.382197618484497, + "rewards/rejected": -2.6557719707489014, + "step": 7495 + }, + { + "epoch": 0.86, + "learning_rate": 4.1367201217370945e-08, + "logits/chosen": -1.7390151023864746, + "logits/rejected": -2.039620876312256, + "logps/chosen": -363.0972900390625, + "logps/rejected": -163.7686004638672, + "loss": 0.6799, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8990013599395752, + "rewards/margins": 0.42109447717666626, + "rewards/rejected": -1.3200957775115967, + "step": 7496 + }, + { + "epoch": 0.86, + "learning_rate": 4.133208474774669e-08, + "logits/chosen": -1.8649652004241943, + "logits/rejected": -2.1058058738708496, + "logps/chosen": -293.58349609375, + "logps/rejected": -285.627197265625, + "loss": 0.8953, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7188546657562256, + "rewards/margins": 1.3167859315872192, + "rewards/rejected": -3.0356407165527344, + "step": 7497 + }, + { + "epoch": 0.86, + "learning_rate": 4.129696827812243e-08, + "logits/chosen": -2.8384084701538086, + "logits/rejected": -2.791487693786621, + "logps/chosen": -130.44561767578125, + "logps/rejected": -167.1997833251953, + "loss": 0.3065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7268600463867188, + "rewards/margins": 1.6767876148223877, + "rewards/rejected": -2.4036478996276855, + "step": 7498 + }, + { + "epoch": 0.86, + "learning_rate": 4.126185180849819e-08, + "logits/chosen": -1.793875813484192, + "logits/rejected": -2.1201627254486084, + "logps/chosen": -337.8648681640625, + "logps/rejected": -276.68267822265625, + "loss": 0.487, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3823294639587402, + "rewards/margins": 1.4309145212173462, + "rewards/rejected": -2.813244104385376, + "step": 7499 + }, + { + "epoch": 0.86, + "learning_rate": 4.122673533887393e-08, + "logits/chosen": -2.2914321422576904, + "logits/rejected": -2.381237268447876, + "logps/chosen": -335.2586669921875, + "logps/rejected": -415.6112060546875, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5394319295883179, + "rewards/margins": 2.2542355060577393, + "rewards/rejected": -3.7936673164367676, + "step": 7500 + }, + { + "epoch": 0.86, + "learning_rate": 4.1191618869249675e-08, + "logits/chosen": -2.424907922744751, + "logits/rejected": -2.564448356628418, + "logps/chosen": -322.3095397949219, + "logps/rejected": -353.70745849609375, + "loss": 0.3101, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3376273810863495, + "rewards/margins": 2.2963647842407227, + "rewards/rejected": -2.6339919567108154, + "step": 7501 + }, + { + "epoch": 0.86, + "learning_rate": 4.115650239962542e-08, + "logits/chosen": -2.541184902191162, + "logits/rejected": -2.2925355434417725, + "logps/chosen": -223.6905975341797, + "logps/rejected": -227.10330200195312, + "loss": 0.4311, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1420444250106812, + "rewards/margins": 1.7700960636138916, + "rewards/rejected": -2.9121406078338623, + "step": 7502 + }, + { + "epoch": 0.86, + "learning_rate": 4.112138593000117e-08, + "logits/chosen": -2.4408304691314697, + "logits/rejected": -2.3419291973114014, + "logps/chosen": -201.30694580078125, + "logps/rejected": -263.1689453125, + "loss": 0.6496, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8176108598709106, + "rewards/margins": 0.8527778387069702, + "rewards/rejected": -1.6703886985778809, + "step": 7503 + }, + { + "epoch": 0.87, + "learning_rate": 4.108626946037691e-08, + "logits/chosen": -2.6647634506225586, + "logits/rejected": -2.3238372802734375, + "logps/chosen": -200.30126953125, + "logps/rejected": -249.2705078125, + "loss": 0.6586, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.264829158782959, + "rewards/margins": 0.9703664183616638, + "rewards/rejected": -3.2351956367492676, + "step": 7504 + }, + { + "epoch": 0.87, + "learning_rate": 4.1051152990752664e-08, + "logits/chosen": -1.9796150922775269, + "logits/rejected": -2.3760640621185303, + "logps/chosen": -353.9818115234375, + "logps/rejected": -287.3360595703125, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20126347243785858, + "rewards/margins": 3.6639132499694824, + "rewards/rejected": -3.8651769161224365, + "step": 7505 + }, + { + "epoch": 0.87, + "learning_rate": 4.1016036521128405e-08, + "logits/chosen": -2.4663097858428955, + "logits/rejected": -2.637627124786377, + "logps/chosen": -239.00192260742188, + "logps/rejected": -295.7079772949219, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1759679615497589, + "rewards/margins": 4.2910356521606445, + "rewards/rejected": -4.467003345489502, + "step": 7506 + }, + { + "epoch": 0.87, + "learning_rate": 4.098092005150416e-08, + "logits/chosen": -2.9242026805877686, + "logits/rejected": -2.9764256477355957, + "logps/chosen": -260.2054138183594, + "logps/rejected": -239.4066925048828, + "loss": 0.2866, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4237958788871765, + "rewards/margins": 3.5138440132141113, + "rewards/rejected": -3.9376399517059326, + "step": 7507 + }, + { + "epoch": 0.87, + "learning_rate": 4.09458035818799e-08, + "logits/chosen": -2.4235496520996094, + "logits/rejected": -2.528323173522949, + "logps/chosen": -261.6034240722656, + "logps/rejected": -251.8378143310547, + "loss": 0.3156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45368051528930664, + "rewards/margins": 2.032405376434326, + "rewards/rejected": -2.486085891723633, + "step": 7508 + }, + { + "epoch": 0.87, + "learning_rate": 4.0910687112255646e-08, + "logits/chosen": -2.618927240371704, + "logits/rejected": -2.750270366668701, + "logps/chosen": -233.75906372070312, + "logps/rejected": -133.0506134033203, + "loss": 0.8015, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3712669610977173, + "rewards/margins": 1.250117301940918, + "rewards/rejected": -2.6213841438293457, + "step": 7509 + }, + { + "epoch": 0.87, + "learning_rate": 4.0875570642631394e-08, + "logits/chosen": -2.5869550704956055, + "logits/rejected": -2.548818826675415, + "logps/chosen": -292.52801513671875, + "logps/rejected": -231.97097778320312, + "loss": 0.2837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3905150294303894, + "rewards/margins": 1.461206316947937, + "rewards/rejected": -1.8517215251922607, + "step": 7510 + }, + { + "epoch": 0.87, + "learning_rate": 4.084045417300714e-08, + "logits/chosen": -2.102616310119629, + "logits/rejected": -2.4358763694763184, + "logps/chosen": -370.6359558105469, + "logps/rejected": -300.03759765625, + "loss": 0.2006, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0874216556549072, + "rewards/margins": 3.3393044471740723, + "rewards/rejected": -4.426726341247559, + "step": 7511 + }, + { + "epoch": 0.87, + "learning_rate": 4.080533770338288e-08, + "logits/chosen": -2.563328266143799, + "logits/rejected": -2.701646327972412, + "logps/chosen": -336.7755126953125, + "logps/rejected": -222.9385223388672, + "loss": 0.3066, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2916334867477417, + "rewards/margins": 1.9822051525115967, + "rewards/rejected": -2.273838520050049, + "step": 7512 + }, + { + "epoch": 0.87, + "learning_rate": 4.0770221233758635e-08, + "logits/chosen": -2.3984975814819336, + "logits/rejected": -2.4663496017456055, + "logps/chosen": -358.2239990234375, + "logps/rejected": -261.54107666015625, + "loss": 0.6966, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9139443635940552, + "rewards/margins": 1.532222032546997, + "rewards/rejected": -3.4461662769317627, + "step": 7513 + }, + { + "epoch": 0.87, + "learning_rate": 4.0735104764134376e-08, + "logits/chosen": -2.241840124130249, + "logits/rejected": -2.0505874156951904, + "logps/chosen": -250.1867218017578, + "logps/rejected": -202.7620391845703, + "loss": 0.3473, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8126185536384583, + "rewards/margins": 1.822922945022583, + "rewards/rejected": -2.6355414390563965, + "step": 7514 + }, + { + "epoch": 0.87, + "learning_rate": 4.069998829451012e-08, + "logits/chosen": -2.314237356185913, + "logits/rejected": -2.4580187797546387, + "logps/chosen": -366.966064453125, + "logps/rejected": -364.1372375488281, + "loss": 0.1612, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46622592210769653, + "rewards/margins": 3.3581039905548096, + "rewards/rejected": -3.8243303298950195, + "step": 7515 + }, + { + "epoch": 0.87, + "learning_rate": 4.066487182488587e-08, + "logits/chosen": -2.8547396659851074, + "logits/rejected": -2.697167158126831, + "logps/chosen": -243.09609985351562, + "logps/rejected": -241.2445831298828, + "loss": 0.189, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0107483863830566, + "rewards/margins": 2.3552334308624268, + "rewards/rejected": -3.3659820556640625, + "step": 7516 + }, + { + "epoch": 0.87, + "learning_rate": 4.062975535526162e-08, + "logits/chosen": -2.729201316833496, + "logits/rejected": -2.7970685958862305, + "logps/chosen": -311.0781555175781, + "logps/rejected": -275.69146728515625, + "loss": 0.1553, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.912248969078064, + "rewards/margins": 2.317959785461426, + "rewards/rejected": -3.2302088737487793, + "step": 7517 + }, + { + "epoch": 0.87, + "learning_rate": 4.059463888563736e-08, + "logits/chosen": -2.379669666290283, + "logits/rejected": -2.445725917816162, + "logps/chosen": -209.94052124023438, + "logps/rejected": -258.93121337890625, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.779931902885437, + "rewards/margins": 2.5424351692199707, + "rewards/rejected": -3.322366714477539, + "step": 7518 + }, + { + "epoch": 0.87, + "learning_rate": 4.055952241601311e-08, + "logits/chosen": -2.6602935791015625, + "logits/rejected": -2.433422565460205, + "logps/chosen": -250.99044799804688, + "logps/rejected": -305.8680419921875, + "loss": 0.1915, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8186811208724976, + "rewards/margins": 3.4214792251586914, + "rewards/rejected": -4.2401604652404785, + "step": 7519 + }, + { + "epoch": 0.87, + "learning_rate": 4.052440594638885e-08, + "logits/chosen": -1.9913411140441895, + "logits/rejected": -2.094290256500244, + "logps/chosen": -388.0360107421875, + "logps/rejected": -239.3806610107422, + "loss": 0.2584, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.119163990020752, + "rewards/margins": 2.364924669265747, + "rewards/rejected": -3.484088659286499, + "step": 7520 + }, + { + "epoch": 0.87, + "learning_rate": 4.0489289476764607e-08, + "logits/chosen": -2.4352734088897705, + "logits/rejected": -2.346705198287964, + "logps/chosen": -203.79498291015625, + "logps/rejected": -202.0792236328125, + "loss": 1.0357, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9491441249847412, + "rewards/margins": 0.6300934553146362, + "rewards/rejected": -2.579237461090088, + "step": 7521 + }, + { + "epoch": 0.87, + "learning_rate": 4.045417300714035e-08, + "logits/chosen": -2.719390630722046, + "logits/rejected": -2.709427833557129, + "logps/chosen": -337.4186096191406, + "logps/rejected": -242.1144256591797, + "loss": 0.2412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6519408822059631, + "rewards/margins": 1.5896891355514526, + "rewards/rejected": -2.2416298389434814, + "step": 7522 + }, + { + "epoch": 0.87, + "learning_rate": 4.0419056537516094e-08, + "logits/chosen": -2.8115625381469727, + "logits/rejected": -2.6325314044952393, + "logps/chosen": -160.04522705078125, + "logps/rejected": -204.9154052734375, + "loss": 0.3193, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40140652656555176, + "rewards/margins": 2.7427916526794434, + "rewards/rejected": -3.144198179244995, + "step": 7523 + }, + { + "epoch": 0.87, + "learning_rate": 4.038394006789184e-08, + "logits/chosen": -2.3726420402526855, + "logits/rejected": -2.360283374786377, + "logps/chosen": -71.20538330078125, + "logps/rejected": -148.68621826171875, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37499210238456726, + "rewards/margins": 2.7490732669830322, + "rewards/rejected": -3.124065399169922, + "step": 7524 + }, + { + "epoch": 0.87, + "learning_rate": 4.034882359826759e-08, + "logits/chosen": -1.9176112413406372, + "logits/rejected": -2.1262168884277344, + "logps/chosen": -317.58624267578125, + "logps/rejected": -184.47825622558594, + "loss": 0.2989, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0018503665924072, + "rewards/margins": 2.7567479610443115, + "rewards/rejected": -3.7585983276367188, + "step": 7525 + }, + { + "epoch": 0.87, + "learning_rate": 4.031370712864333e-08, + "logits/chosen": -2.5442988872528076, + "logits/rejected": -2.617154359817505, + "logps/chosen": -236.14608764648438, + "logps/rejected": -253.9678192138672, + "loss": 0.4828, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3300795555114746, + "rewards/margins": 1.3579845428466797, + "rewards/rejected": -2.6880640983581543, + "step": 7526 + }, + { + "epoch": 0.87, + "learning_rate": 4.0278590659019083e-08, + "logits/chosen": -2.2587192058563232, + "logits/rejected": -2.31790828704834, + "logps/chosen": -229.33071899414062, + "logps/rejected": -140.41720581054688, + "loss": 0.4555, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.215193510055542, + "rewards/margins": 1.1618530750274658, + "rewards/rejected": -1.3770465850830078, + "step": 7527 + }, + { + "epoch": 0.87, + "learning_rate": 4.0243474189394824e-08, + "logits/chosen": -2.234761953353882, + "logits/rejected": -2.3226914405822754, + "logps/chosen": -155.66769409179688, + "logps/rejected": -185.65365600585938, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.065538763999939, + "rewards/margins": 3.22977352142334, + "rewards/rejected": -4.295312404632568, + "step": 7528 + }, + { + "epoch": 0.87, + "learning_rate": 4.0208357719770565e-08, + "logits/chosen": -1.7613496780395508, + "logits/rejected": -2.1993746757507324, + "logps/chosen": -394.0764465332031, + "logps/rejected": -356.50799560546875, + "loss": 0.4086, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3739781379699707, + "rewards/margins": 1.30168879032135, + "rewards/rejected": -2.6756668090820312, + "step": 7529 + }, + { + "epoch": 0.87, + "learning_rate": 4.017324125014632e-08, + "logits/chosen": -2.661224126815796, + "logits/rejected": -2.681973695755005, + "logps/chosen": -275.219482421875, + "logps/rejected": -254.4276123046875, + "loss": 0.3146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36622798442840576, + "rewards/margins": 2.8606464862823486, + "rewards/rejected": -3.226874589920044, + "step": 7530 + }, + { + "epoch": 0.87, + "learning_rate": 4.013812478052206e-08, + "logits/chosen": -2.264545440673828, + "logits/rejected": -2.454760789871216, + "logps/chosen": -245.18365478515625, + "logps/rejected": -194.8502655029297, + "loss": 0.3037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8981022834777832, + "rewards/margins": 1.3970304727554321, + "rewards/rejected": -2.295132875442505, + "step": 7531 + }, + { + "epoch": 0.87, + "learning_rate": 4.0103008310897806e-08, + "logits/chosen": -2.3789925575256348, + "logits/rejected": -2.609006881713867, + "logps/chosen": -194.16207885742188, + "logps/rejected": -189.72427368164062, + "loss": 0.423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4925520122051239, + "rewards/margins": 1.9560290575027466, + "rewards/rejected": -2.4485809803009033, + "step": 7532 + }, + { + "epoch": 0.87, + "learning_rate": 4.0067891841273554e-08, + "logits/chosen": -2.70192813873291, + "logits/rejected": -2.6087169647216797, + "logps/chosen": -248.53189086914062, + "logps/rejected": -348.90826416015625, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6886446475982666, + "rewards/margins": 3.488880157470703, + "rewards/rejected": -4.177524566650391, + "step": 7533 + }, + { + "epoch": 0.87, + "learning_rate": 4.00327753716493e-08, + "logits/chosen": -2.315213918685913, + "logits/rejected": -2.465031147003174, + "logps/chosen": -393.60333251953125, + "logps/rejected": -463.9503173828125, + "loss": 0.5679, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3655800819396973, + "rewards/margins": 1.4065046310424805, + "rewards/rejected": -2.772084951400757, + "step": 7534 + }, + { + "epoch": 0.87, + "learning_rate": 3.999765890202504e-08, + "logits/chosen": -2.637594223022461, + "logits/rejected": -2.5930404663085938, + "logps/chosen": -294.009521484375, + "logps/rejected": -265.9351501464844, + "loss": 0.5254, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5565595626831055, + "rewards/margins": 1.0750296115875244, + "rewards/rejected": -2.631588935852051, + "step": 7535 + }, + { + "epoch": 0.87, + "learning_rate": 3.9962542432400795e-08, + "logits/chosen": -2.248750686645508, + "logits/rejected": -2.3279969692230225, + "logps/chosen": -402.39996337890625, + "logps/rejected": -330.3780822753906, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7374509572982788, + "rewards/margins": 2.860165596008301, + "rewards/rejected": -3.597616672515869, + "step": 7536 + }, + { + "epoch": 0.87, + "learning_rate": 3.9927425962776536e-08, + "logits/chosen": -2.90905499458313, + "logits/rejected": -2.953458786010742, + "logps/chosen": -342.9341125488281, + "logps/rejected": -312.940673828125, + "loss": 0.2283, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.036321997642517, + "rewards/margins": 3.8503317832946777, + "rewards/rejected": -4.886653900146484, + "step": 7537 + }, + { + "epoch": 0.87, + "learning_rate": 3.989230949315229e-08, + "logits/chosen": -3.0758986473083496, + "logits/rejected": -3.000636577606201, + "logps/chosen": -329.55462646484375, + "logps/rejected": -208.89804077148438, + "loss": 0.2458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8870946764945984, + "rewards/margins": 2.7557690143585205, + "rewards/rejected": -3.6428637504577637, + "step": 7538 + }, + { + "epoch": 0.87, + "learning_rate": 3.985719302352803e-08, + "logits/chosen": -1.715355396270752, + "logits/rejected": -1.9713284969329834, + "logps/chosen": -432.6436767578125, + "logps/rejected": -280.99151611328125, + "loss": 0.483, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9243981242179871, + "rewards/margins": 1.5974667072296143, + "rewards/rejected": -2.521864891052246, + "step": 7539 + }, + { + "epoch": 0.87, + "learning_rate": 3.982207655390378e-08, + "logits/chosen": -2.2022705078125, + "logits/rejected": -2.323035955429077, + "logps/chosen": -155.5452880859375, + "logps/rejected": -188.96170043945312, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3363412022590637, + "rewards/margins": 1.9329942464828491, + "rewards/rejected": -2.2693355083465576, + "step": 7540 + }, + { + "epoch": 0.87, + "learning_rate": 3.9786960084279525e-08, + "logits/chosen": -2.166306257247925, + "logits/rejected": -2.158281087875366, + "logps/chosen": -281.3714599609375, + "logps/rejected": -298.8066101074219, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9264020919799805, + "rewards/margins": 3.0562384128570557, + "rewards/rejected": -3.982640266418457, + "step": 7541 + }, + { + "epoch": 0.87, + "learning_rate": 3.975184361465527e-08, + "logits/chosen": -2.0385122299194336, + "logits/rejected": -2.1324801445007324, + "logps/chosen": -369.99261474609375, + "logps/rejected": -345.5143127441406, + "loss": 0.4947, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0742484331130981, + "rewards/margins": 1.356101632118225, + "rewards/rejected": -2.4303500652313232, + "step": 7542 + }, + { + "epoch": 0.87, + "learning_rate": 3.971672714503101e-08, + "logits/chosen": -1.7924877405166626, + "logits/rejected": -2.301332950592041, + "logps/chosen": -423.908203125, + "logps/rejected": -217.57388305664062, + "loss": 0.2164, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4299887418746948, + "rewards/margins": 1.9769560098648071, + "rewards/rejected": -2.406944751739502, + "step": 7543 + }, + { + "epoch": 0.87, + "learning_rate": 3.9681610675406767e-08, + "logits/chosen": -2.7183899879455566, + "logits/rejected": -2.590620994567871, + "logps/chosen": -98.57495880126953, + "logps/rejected": -304.0135803222656, + "loss": 0.31, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7602600455284119, + "rewards/margins": 3.621117353439331, + "rewards/rejected": -4.381377220153809, + "step": 7544 + }, + { + "epoch": 0.87, + "learning_rate": 3.964649420578251e-08, + "logits/chosen": -2.1747498512268066, + "logits/rejected": -2.1420092582702637, + "logps/chosen": -359.351318359375, + "logps/rejected": -310.3707275390625, + "loss": 0.5706, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0415724515914917, + "rewards/margins": 1.1413862705230713, + "rewards/rejected": -2.1829586029052734, + "step": 7545 + }, + { + "epoch": 0.87, + "learning_rate": 3.961137773615826e-08, + "logits/chosen": -2.097243309020996, + "logits/rejected": -2.351807117462158, + "logps/chosen": -261.9786682128906, + "logps/rejected": -228.93307495117188, + "loss": 0.4342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9904984831809998, + "rewards/margins": 2.0506412982940674, + "rewards/rejected": -3.041139841079712, + "step": 7546 + }, + { + "epoch": 0.87, + "learning_rate": 3.9576261266534e-08, + "logits/chosen": -2.6173813343048096, + "logits/rejected": -2.3987603187561035, + "logps/chosen": -240.67904663085938, + "logps/rejected": -362.9930725097656, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2374383211135864, + "rewards/margins": 2.816534996032715, + "rewards/rejected": -4.053973197937012, + "step": 7547 + }, + { + "epoch": 0.87, + "learning_rate": 3.954114479690975e-08, + "logits/chosen": -2.084681272506714, + "logits/rejected": -2.147388458251953, + "logps/chosen": -175.54293823242188, + "logps/rejected": -222.44479370117188, + "loss": 0.7755, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7286593914031982, + "rewards/margins": 2.6505022048950195, + "rewards/rejected": -4.379161834716797, + "step": 7548 + }, + { + "epoch": 0.87, + "learning_rate": 3.950602832728549e-08, + "logits/chosen": -1.863872766494751, + "logits/rejected": -1.5728349685668945, + "logps/chosen": -240.07144165039062, + "logps/rejected": -299.8373107910156, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5597153902053833, + "rewards/margins": 2.827357053756714, + "rewards/rejected": -3.3870723247528076, + "step": 7549 + }, + { + "epoch": 0.87, + "learning_rate": 3.9470911857661243e-08, + "logits/chosen": -2.434000015258789, + "logits/rejected": -2.5398874282836914, + "logps/chosen": -435.833251953125, + "logps/rejected": -363.2742614746094, + "loss": 0.2964, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4592422246932983, + "rewards/margins": 2.6399426460266113, + "rewards/rejected": -4.099184989929199, + "step": 7550 + }, + { + "epoch": 0.87, + "learning_rate": 3.9435795388036984e-08, + "logits/chosen": -2.1052515506744385, + "logits/rejected": -2.159970998764038, + "logps/chosen": -351.5837707519531, + "logps/rejected": -306.22906494140625, + "loss": 0.3295, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.324763536453247, + "rewards/margins": 1.868759036064148, + "rewards/rejected": -3.1935226917266846, + "step": 7551 + }, + { + "epoch": 0.87, + "learning_rate": 3.940067891841274e-08, + "logits/chosen": -2.354217052459717, + "logits/rejected": -2.2274014949798584, + "logps/chosen": -223.59542846679688, + "logps/rejected": -246.10028076171875, + "loss": 0.5174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2646274268627167, + "rewards/margins": 1.3750152587890625, + "rewards/rejected": -1.6396427154541016, + "step": 7552 + }, + { + "epoch": 0.87, + "learning_rate": 3.936556244878848e-08, + "logits/chosen": -2.417807102203369, + "logits/rejected": -2.3245816230773926, + "logps/chosen": -167.568603515625, + "logps/rejected": -279.58416748046875, + "loss": 0.1451, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3330408334732056, + "rewards/margins": 3.752023935317993, + "rewards/rejected": -5.085064888000488, + "step": 7553 + }, + { + "epoch": 0.87, + "learning_rate": 3.9330445979164226e-08, + "logits/chosen": -2.3962574005126953, + "logits/rejected": -2.226170301437378, + "logps/chosen": -426.444091796875, + "logps/rejected": -432.75811767578125, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5808284282684326, + "rewards/margins": 3.3455164432525635, + "rewards/rejected": -3.926345109939575, + "step": 7554 + }, + { + "epoch": 0.87, + "learning_rate": 3.929532950953997e-08, + "logits/chosen": -2.7967631816864014, + "logits/rejected": -2.651817560195923, + "logps/chosen": -208.7089385986328, + "logps/rejected": -211.56561279296875, + "loss": 0.3193, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6945184469223022, + "rewards/margins": 3.3156073093414307, + "rewards/rejected": -4.010125637054443, + "step": 7555 + }, + { + "epoch": 0.87, + "learning_rate": 3.926021303991572e-08, + "logits/chosen": -2.5062777996063232, + "logits/rejected": -2.483285665512085, + "logps/chosen": -218.7024688720703, + "logps/rejected": -198.64471435546875, + "loss": 0.2083, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8440978527069092, + "rewards/margins": 3.1184048652648926, + "rewards/rejected": -3.9625027179718018, + "step": 7556 + }, + { + "epoch": 0.87, + "learning_rate": 3.922509657029146e-08, + "logits/chosen": -2.3957252502441406, + "logits/rejected": -2.4865262508392334, + "logps/chosen": -321.997802734375, + "logps/rejected": -303.7561340332031, + "loss": 0.1485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8731811046600342, + "rewards/margins": 3.410677671432495, + "rewards/rejected": -4.283858776092529, + "step": 7557 + }, + { + "epoch": 0.87, + "learning_rate": 3.9189980100667215e-08, + "logits/chosen": -2.136732339859009, + "logits/rejected": -2.2526450157165527, + "logps/chosen": -263.60699462890625, + "logps/rejected": -266.573486328125, + "loss": 0.246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.631232500076294, + "rewards/margins": 2.5644524097442627, + "rewards/rejected": -3.1956849098205566, + "step": 7558 + }, + { + "epoch": 0.87, + "learning_rate": 3.9154863631042955e-08, + "logits/chosen": -2.478757858276367, + "logits/rejected": -2.542677402496338, + "logps/chosen": -341.0230407714844, + "logps/rejected": -274.6773376464844, + "loss": 0.589, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4226865768432617, + "rewards/margins": 1.5971177816390991, + "rewards/rejected": -3.0198044776916504, + "step": 7559 + }, + { + "epoch": 0.87, + "learning_rate": 3.911974716141871e-08, + "logits/chosen": -2.2002620697021484, + "logits/rejected": -2.2286300659179688, + "logps/chosen": -198.65701293945312, + "logps/rejected": -204.58157348632812, + "loss": 0.615, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0901488065719604, + "rewards/margins": 1.1594581604003906, + "rewards/rejected": -2.2496070861816406, + "step": 7560 + }, + { + "epoch": 0.87, + "learning_rate": 3.908463069179445e-08, + "logits/chosen": -2.479179620742798, + "logits/rejected": -2.679821491241455, + "logps/chosen": -226.02613830566406, + "logps/rejected": -176.0762939453125, + "loss": 0.19, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31847622990608215, + "rewards/margins": 3.416290044784546, + "rewards/rejected": -3.7347660064697266, + "step": 7561 + }, + { + "epoch": 0.87, + "learning_rate": 3.90495142221702e-08, + "logits/chosen": -1.8925652503967285, + "logits/rejected": -1.9360384941101074, + "logps/chosen": -215.83322143554688, + "logps/rejected": -254.88438415527344, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.674406111240387, + "rewards/margins": 3.629361391067505, + "rewards/rejected": -4.303767681121826, + "step": 7562 + }, + { + "epoch": 0.87, + "learning_rate": 3.9014397752545944e-08, + "logits/chosen": -2.840471029281616, + "logits/rejected": -2.779991865158081, + "logps/chosen": -281.0681457519531, + "logps/rejected": -248.24021911621094, + "loss": 0.2313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8361169099807739, + "rewards/margins": 2.4479265213012695, + "rewards/rejected": -3.284043550491333, + "step": 7563 + }, + { + "epoch": 0.87, + "learning_rate": 3.897928128292169e-08, + "logits/chosen": -2.610849380493164, + "logits/rejected": -2.424318313598633, + "logps/chosen": -112.3808364868164, + "logps/rejected": -195.7122802734375, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9224683046340942, + "rewards/margins": 1.5665699243545532, + "rewards/rejected": -3.4890384674072266, + "step": 7564 + }, + { + "epoch": 0.87, + "learning_rate": 3.894416481329743e-08, + "logits/chosen": -2.7990548610687256, + "logits/rejected": -2.550705909729004, + "logps/chosen": -228.29791259765625, + "logps/rejected": -333.25494384765625, + "loss": 0.9407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4352778196334839, + "rewards/margins": 1.780655860900879, + "rewards/rejected": -3.2159335613250732, + "step": 7565 + }, + { + "epoch": 0.87, + "learning_rate": 3.8909048343673186e-08, + "logits/chosen": -2.2471694946289062, + "logits/rejected": -2.3788323402404785, + "logps/chosen": -438.6754455566406, + "logps/rejected": -411.74322509765625, + "loss": 0.2306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9712814092636108, + "rewards/margins": 3.168523073196411, + "rewards/rejected": -4.139804363250732, + "step": 7566 + }, + { + "epoch": 0.87, + "learning_rate": 3.8873931874048927e-08, + "logits/chosen": -2.2636685371398926, + "logits/rejected": -2.402958869934082, + "logps/chosen": -288.006591796875, + "logps/rejected": -271.5134582519531, + "loss": 0.7556, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4155006408691406, + "rewards/margins": 1.7116987705230713, + "rewards/rejected": -4.127199649810791, + "step": 7567 + }, + { + "epoch": 0.87, + "learning_rate": 3.8838815404424674e-08, + "logits/chosen": -1.916447401046753, + "logits/rejected": -1.8759442567825317, + "logps/chosen": -286.0819091796875, + "logps/rejected": -273.30975341796875, + "loss": 0.2267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5353466272354126, + "rewards/margins": 2.8344125747680664, + "rewards/rejected": -3.3697593212127686, + "step": 7568 + }, + { + "epoch": 0.87, + "learning_rate": 3.880369893480042e-08, + "logits/chosen": -2.3117804527282715, + "logits/rejected": -2.40217924118042, + "logps/chosen": -102.16349792480469, + "logps/rejected": -217.80641174316406, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1976301074028015, + "rewards/margins": 4.547174453735352, + "rewards/rejected": -4.744804382324219, + "step": 7569 + }, + { + "epoch": 0.87, + "learning_rate": 3.876858246517617e-08, + "logits/chosen": -2.344174385070801, + "logits/rejected": -2.6585772037506104, + "logps/chosen": -322.94091796875, + "logps/rejected": -232.6300811767578, + "loss": 0.3785, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7049229145050049, + "rewards/margins": 2.6299924850463867, + "rewards/rejected": -4.334915637969971, + "step": 7570 + }, + { + "epoch": 0.87, + "learning_rate": 3.873346599555191e-08, + "logits/chosen": -2.0238518714904785, + "logits/rejected": -2.22352933883667, + "logps/chosen": -439.46453857421875, + "logps/rejected": -458.3946228027344, + "loss": 0.3985, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9989187717437744, + "rewards/margins": 1.6780037879943848, + "rewards/rejected": -2.676922559738159, + "step": 7571 + }, + { + "epoch": 0.87, + "learning_rate": 3.869834952592766e-08, + "logits/chosen": -1.7033915519714355, + "logits/rejected": -1.8442928791046143, + "logps/chosen": -311.1752624511719, + "logps/rejected": -344.08319091796875, + "loss": 0.8796, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5092178583145142, + "rewards/margins": 1.0118064880371094, + "rewards/rejected": -2.521024227142334, + "step": 7572 + }, + { + "epoch": 0.87, + "learning_rate": 3.8663233056303403e-08, + "logits/chosen": -1.4178073406219482, + "logits/rejected": -1.3605231046676636, + "logps/chosen": -385.3553466796875, + "logps/rejected": -368.298583984375, + "loss": 0.6622, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3767895698547363, + "rewards/margins": 0.34568509459495544, + "rewards/rejected": -1.7224745750427246, + "step": 7573 + }, + { + "epoch": 0.87, + "learning_rate": 3.862811658667916e-08, + "logits/chosen": -2.457643508911133, + "logits/rejected": -2.515566110610962, + "logps/chosen": -369.76593017578125, + "logps/rejected": -378.76739501953125, + "loss": 0.3694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9113942384719849, + "rewards/margins": 1.7871763706207275, + "rewards/rejected": -2.698570489883423, + "step": 7574 + }, + { + "epoch": 0.87, + "learning_rate": 3.85930001170549e-08, + "logits/chosen": -2.2132139205932617, + "logits/rejected": -2.483563184738159, + "logps/chosen": -384.8194274902344, + "logps/rejected": -218.84054565429688, + "loss": 0.3322, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.212713599205017, + "rewards/margins": 1.3745758533477783, + "rewards/rejected": -2.587289333343506, + "step": 7575 + }, + { + "epoch": 0.87, + "learning_rate": 3.8557883647430645e-08, + "logits/chosen": -2.447251319885254, + "logits/rejected": -2.4890899658203125, + "logps/chosen": -353.8602600097656, + "logps/rejected": -237.3513946533203, + "loss": 0.9973, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.3536765575408936, + "rewards/margins": 0.06694018840789795, + "rewards/rejected": -2.420616626739502, + "step": 7576 + }, + { + "epoch": 0.87, + "learning_rate": 3.852276717780639e-08, + "logits/chosen": -2.514781951904297, + "logits/rejected": -2.7234480381011963, + "logps/chosen": -111.59180450439453, + "logps/rejected": -153.18026733398438, + "loss": 0.8096, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3559272289276123, + "rewards/margins": 1.2821589708328247, + "rewards/rejected": -2.6380863189697266, + "step": 7577 + }, + { + "epoch": 0.87, + "learning_rate": 3.848765070818213e-08, + "logits/chosen": -2.231362819671631, + "logits/rejected": -2.3762760162353516, + "logps/chosen": -218.24307250976562, + "logps/rejected": -216.6553955078125, + "loss": 0.1988, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.157924622297287, + "rewards/margins": 2.5012166500091553, + "rewards/rejected": -2.6591415405273438, + "step": 7578 + }, + { + "epoch": 0.87, + "learning_rate": 3.845253423855788e-08, + "logits/chosen": -2.391177177429199, + "logits/rejected": -2.105705976486206, + "logps/chosen": -263.9400634765625, + "logps/rejected": -323.7769470214844, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36674371361732483, + "rewards/margins": 3.380344867706299, + "rewards/rejected": -3.7470884323120117, + "step": 7579 + }, + { + "epoch": 0.87, + "learning_rate": 3.841741776893363e-08, + "logits/chosen": -2.6238882541656494, + "logits/rejected": -2.6592824459075928, + "logps/chosen": -295.6209716796875, + "logps/rejected": -318.3763427734375, + "loss": 0.1749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6643638014793396, + "rewards/margins": 4.113394737243652, + "rewards/rejected": -4.777758598327637, + "step": 7580 + }, + { + "epoch": 0.87, + "learning_rate": 3.8382301299309375e-08, + "logits/chosen": -2.234147787094116, + "logits/rejected": -2.4516258239746094, + "logps/chosen": -335.5015869140625, + "logps/rejected": -241.4929656982422, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4940994381904602, + "rewards/margins": 2.1642277240753174, + "rewards/rejected": -2.658327102661133, + "step": 7581 + }, + { + "epoch": 0.87, + "learning_rate": 3.8347184829685115e-08, + "logits/chosen": -1.761262059211731, + "logits/rejected": -1.840355396270752, + "logps/chosen": -433.2630615234375, + "logps/rejected": -288.1517639160156, + "loss": 0.702, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.387540578842163, + "rewards/margins": 1.2375441789627075, + "rewards/rejected": -2.62508487701416, + "step": 7582 + }, + { + "epoch": 0.87, + "learning_rate": 3.831206836006087e-08, + "logits/chosen": -2.6377856731414795, + "logits/rejected": -2.5670037269592285, + "logps/chosen": -274.51873779296875, + "logps/rejected": -204.5861358642578, + "loss": 0.5604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9361938238143921, + "rewards/margins": 0.7640535831451416, + "rewards/rejected": -1.7002472877502441, + "step": 7583 + }, + { + "epoch": 0.87, + "learning_rate": 3.827695189043661e-08, + "logits/chosen": -2.3704800605773926, + "logits/rejected": -1.8122014999389648, + "logps/chosen": -297.3657531738281, + "logps/rejected": -359.18359375, + "loss": 0.2475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8320540189743042, + "rewards/margins": 1.7036641836166382, + "rewards/rejected": -2.5357182025909424, + "step": 7584 + }, + { + "epoch": 0.87, + "learning_rate": 3.824183542081236e-08, + "logits/chosen": -2.2943179607391357, + "logits/rejected": -2.4403393268585205, + "logps/chosen": -277.45770263671875, + "logps/rejected": -262.6260681152344, + "loss": 0.671, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8952635526657104, + "rewards/margins": 0.8945561647415161, + "rewards/rejected": -1.789819598197937, + "step": 7585 + }, + { + "epoch": 0.87, + "learning_rate": 3.8206718951188104e-08, + "logits/chosen": -2.3926308155059814, + "logits/rejected": -2.6411452293395996, + "logps/chosen": -310.4696350097656, + "logps/rejected": -191.87725830078125, + "loss": 0.4787, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3582119941711426, + "rewards/margins": 2.3750839233398438, + "rewards/rejected": -3.7332956790924072, + "step": 7586 + }, + { + "epoch": 0.87, + "learning_rate": 3.817160248156385e-08, + "logits/chosen": -2.423198699951172, + "logits/rejected": -2.4489402770996094, + "logps/chosen": -376.89056396484375, + "logps/rejected": -399.7763366699219, + "loss": 0.1597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8943209052085876, + "rewards/margins": 3.464130163192749, + "rewards/rejected": -4.358450889587402, + "step": 7587 + }, + { + "epoch": 0.87, + "learning_rate": 3.813648601193959e-08, + "logits/chosen": -2.2467758655548096, + "logits/rejected": -2.221707582473755, + "logps/chosen": -160.34130859375, + "logps/rejected": -316.0932922363281, + "loss": 0.353, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1376258134841919, + "rewards/margins": 1.734709620475769, + "rewards/rejected": -1.872335433959961, + "step": 7588 + }, + { + "epoch": 0.87, + "learning_rate": 3.8101369542315346e-08, + "logits/chosen": -2.06607985496521, + "logits/rejected": -2.245748281478882, + "logps/chosen": -318.489013671875, + "logps/rejected": -309.62432861328125, + "loss": 0.496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5557911396026611, + "rewards/margins": 1.1467632055282593, + "rewards/rejected": -1.7025543451309204, + "step": 7589 + }, + { + "epoch": 0.87, + "learning_rate": 3.8066253072691087e-08, + "logits/chosen": -2.2497050762176514, + "logits/rejected": -2.760563850402832, + "logps/chosen": -402.3710632324219, + "logps/rejected": -202.63723754882812, + "loss": 0.3878, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5424169898033142, + "rewards/margins": 2.248213529586792, + "rewards/rejected": -2.790630340576172, + "step": 7590 + }, + { + "epoch": 0.88, + "learning_rate": 3.803113660306684e-08, + "logits/chosen": -2.611191511154175, + "logits/rejected": -2.400834321975708, + "logps/chosen": -265.66143798828125, + "logps/rejected": -336.4165344238281, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41461172699928284, + "rewards/margins": 3.060845375061035, + "rewards/rejected": -2.6462337970733643, + "step": 7591 + }, + { + "epoch": 0.88, + "learning_rate": 3.799602013344258e-08, + "logits/chosen": -2.0967507362365723, + "logits/rejected": -2.0535805225372314, + "logps/chosen": -245.67755126953125, + "logps/rejected": -266.02685546875, + "loss": 0.639, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5199843645095825, + "rewards/margins": 1.3994218111038208, + "rewards/rejected": -2.9194064140319824, + "step": 7592 + }, + { + "epoch": 0.88, + "learning_rate": 3.796090366381833e-08, + "logits/chosen": -2.790924310684204, + "logits/rejected": -2.830070972442627, + "logps/chosen": -318.023193359375, + "logps/rejected": -347.7693786621094, + "loss": 0.2355, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.076363205909729, + "rewards/margins": 2.7447195053100586, + "rewards/rejected": -3.821082592010498, + "step": 7593 + }, + { + "epoch": 0.88, + "learning_rate": 3.7925787194194075e-08, + "logits/chosen": -2.2200231552124023, + "logits/rejected": -2.1876964569091797, + "logps/chosen": -369.31268310546875, + "logps/rejected": -268.8595275878906, + "loss": 0.5266, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19622114300727844, + "rewards/margins": 1.2837793827056885, + "rewards/rejected": -1.0875582695007324, + "step": 7594 + }, + { + "epoch": 0.88, + "learning_rate": 3.789067072456982e-08, + "logits/chosen": -2.298361301422119, + "logits/rejected": -2.564056396484375, + "logps/chosen": -366.12286376953125, + "logps/rejected": -256.2095642089844, + "loss": 0.5174, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7134386897087097, + "rewards/margins": 2.3086352348327637, + "rewards/rejected": -3.022073984146118, + "step": 7595 + }, + { + "epoch": 0.88, + "learning_rate": 3.7855554254945563e-08, + "logits/chosen": -2.6363275051116943, + "logits/rejected": -2.7121798992156982, + "logps/chosen": -153.76829528808594, + "logps/rejected": -145.18954467773438, + "loss": 0.4083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8111209869384766, + "rewards/margins": 1.2863900661468506, + "rewards/rejected": -2.097510814666748, + "step": 7596 + }, + { + "epoch": 0.88, + "learning_rate": 3.782043778532132e-08, + "logits/chosen": -1.7692501544952393, + "logits/rejected": -2.293520927429199, + "logps/chosen": -357.5631103515625, + "logps/rejected": -242.7952880859375, + "loss": 0.3237, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6769450306892395, + "rewards/margins": 1.5293500423431396, + "rewards/rejected": -2.2062950134277344, + "step": 7597 + }, + { + "epoch": 0.88, + "learning_rate": 3.778532131569706e-08, + "logits/chosen": -2.326474666595459, + "logits/rejected": -2.2334718704223633, + "logps/chosen": -211.11386108398438, + "logps/rejected": -217.61834716796875, + "loss": 0.2037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41463765501976013, + "rewards/margins": 2.3273634910583496, + "rewards/rejected": -2.7420010566711426, + "step": 7598 + }, + { + "epoch": 0.88, + "learning_rate": 3.775020484607281e-08, + "logits/chosen": -2.5009169578552246, + "logits/rejected": -2.497222423553467, + "logps/chosen": -150.82107543945312, + "logps/rejected": -122.76165771484375, + "loss": 0.4611, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4836559295654297, + "rewards/margins": 1.973900318145752, + "rewards/rejected": -3.4575562477111816, + "step": 7599 + }, + { + "epoch": 0.88, + "learning_rate": 3.771508837644855e-08, + "logits/chosen": -2.2696197032928467, + "logits/rejected": -1.915759801864624, + "logps/chosen": -544.6593017578125, + "logps/rejected": -502.37249755859375, + "loss": 1.035, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.284263014793396, + "rewards/margins": 0.9640071392059326, + "rewards/rejected": -2.248270273208618, + "step": 7600 + }, + { + "epoch": 0.88, + "learning_rate": 3.76799719068243e-08, + "logits/chosen": -1.8168089389801025, + "logits/rejected": -2.018188238143921, + "logps/chosen": -464.15216064453125, + "logps/rejected": -365.32623291015625, + "loss": 0.3603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6629183888435364, + "rewards/margins": 1.9765257835388184, + "rewards/rejected": -2.63944411277771, + "step": 7601 + }, + { + "epoch": 0.88, + "learning_rate": 3.764485543720004e-08, + "logits/chosen": -2.110297679901123, + "logits/rejected": -2.104180335998535, + "logps/chosen": -226.90919494628906, + "logps/rejected": -261.2591247558594, + "loss": 0.1778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3866872191429138, + "rewards/margins": 2.8482353687286377, + "rewards/rejected": -3.234922409057617, + "step": 7602 + }, + { + "epoch": 0.88, + "learning_rate": 3.7609738967575794e-08, + "logits/chosen": -2.5437521934509277, + "logits/rejected": -2.6660571098327637, + "logps/chosen": -255.0076141357422, + "logps/rejected": -228.81723022460938, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2185173034667969, + "rewards/margins": 1.9950311183929443, + "rewards/rejected": -3.213548183441162, + "step": 7603 + }, + { + "epoch": 0.88, + "learning_rate": 3.7574622497951535e-08, + "logits/chosen": -2.2687230110168457, + "logits/rejected": -2.611802101135254, + "logps/chosen": -320.4962463378906, + "logps/rejected": -191.3428955078125, + "loss": 0.5876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7698574662208557, + "rewards/margins": 1.9359993934631348, + "rewards/rejected": -2.7058568000793457, + "step": 7604 + }, + { + "epoch": 0.88, + "learning_rate": 3.753950602832729e-08, + "logits/chosen": -2.3091983795166016, + "logits/rejected": -2.0873517990112305, + "logps/chosen": -152.04234313964844, + "logps/rejected": -232.86917114257812, + "loss": 0.1514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9141587615013123, + "rewards/margins": 3.2055556774139404, + "rewards/rejected": -4.119714736938477, + "step": 7605 + }, + { + "epoch": 0.88, + "learning_rate": 3.750438955870303e-08, + "logits/chosen": -2.213381290435791, + "logits/rejected": -2.338050603866577, + "logps/chosen": -202.42083740234375, + "logps/rejected": -188.90896606445312, + "loss": 0.6102, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2932491302490234, + "rewards/margins": 0.6104132533073425, + "rewards/rejected": -2.9036622047424316, + "step": 7606 + }, + { + "epoch": 0.88, + "learning_rate": 3.7469273089078776e-08, + "logits/chosen": -2.412971019744873, + "logits/rejected": -2.2068400382995605, + "logps/chosen": -213.8238983154297, + "logps/rejected": -288.2431335449219, + "loss": 0.524, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6482973098754883, + "rewards/margins": 1.472902774810791, + "rewards/rejected": -2.1212000846862793, + "step": 7607 + }, + { + "epoch": 0.88, + "learning_rate": 3.7434156619454524e-08, + "logits/chosen": -2.5956857204437256, + "logits/rejected": -2.7295186519622803, + "logps/chosen": -135.80929565429688, + "logps/rejected": -224.96177673339844, + "loss": 0.3219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47712159156799316, + "rewards/margins": 3.6609458923339844, + "rewards/rejected": -4.138067722320557, + "step": 7608 + }, + { + "epoch": 0.88, + "learning_rate": 3.7399040149830264e-08, + "logits/chosen": -2.5198307037353516, + "logits/rejected": -2.3081862926483154, + "logps/chosen": -219.67037963867188, + "logps/rejected": -310.84356689453125, + "loss": 0.2342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6815521717071533, + "rewards/margins": 4.351569652557373, + "rewards/rejected": -5.0331220626831055, + "step": 7609 + }, + { + "epoch": 0.88, + "learning_rate": 3.736392368020601e-08, + "logits/chosen": -2.574465274810791, + "logits/rejected": -2.6421470642089844, + "logps/chosen": -300.6468200683594, + "logps/rejected": -271.7974853515625, + "loss": 0.7187, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.164821743965149, + "rewards/margins": 2.7273669242858887, + "rewards/rejected": -3.892188787460327, + "step": 7610 + }, + { + "epoch": 0.88, + "learning_rate": 3.732880721058176e-08, + "logits/chosen": -2.5672850608825684, + "logits/rejected": -2.470646858215332, + "logps/chosen": -144.57118225097656, + "logps/rejected": -223.0200958251953, + "loss": 0.7987, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1840938329696655, + "rewards/margins": 1.3591073751449585, + "rewards/rejected": -2.543201208114624, + "step": 7611 + }, + { + "epoch": 0.88, + "learning_rate": 3.7293690740957506e-08, + "logits/chosen": -2.0790042877197266, + "logits/rejected": -2.391023635864258, + "logps/chosen": -336.16998291015625, + "logps/rejected": -351.1151428222656, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3086414635181427, + "rewards/margins": 4.0718536376953125, + "rewards/rejected": -4.380495071411133, + "step": 7612 + }, + { + "epoch": 0.88, + "learning_rate": 3.725857427133325e-08, + "logits/chosen": -2.7889997959136963, + "logits/rejected": -2.8704066276550293, + "logps/chosen": -147.91317749023438, + "logps/rejected": -215.3875732421875, + "loss": 0.308, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0195238590240479, + "rewards/margins": 2.4544105529785156, + "rewards/rejected": -3.4739341735839844, + "step": 7613 + }, + { + "epoch": 0.88, + "learning_rate": 3.7223457801709e-08, + "logits/chosen": -2.554138422012329, + "logits/rejected": -2.366821765899658, + "logps/chosen": -206.76475524902344, + "logps/rejected": -287.65570068359375, + "loss": 0.617, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3316506147384644, + "rewards/margins": 0.7609056830406189, + "rewards/rejected": -2.0925562381744385, + "step": 7614 + }, + { + "epoch": 0.88, + "learning_rate": 3.718834133208475e-08, + "logits/chosen": -2.24306583404541, + "logits/rejected": -2.3737263679504395, + "logps/chosen": -340.44122314453125, + "logps/rejected": -243.3203125, + "loss": 0.3964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49788445234298706, + "rewards/margins": 2.4672741889953613, + "rewards/rejected": -2.9651589393615723, + "step": 7615 + }, + { + "epoch": 0.88, + "learning_rate": 3.7153224862460495e-08, + "logits/chosen": -2.535917282104492, + "logits/rejected": -2.3684442043304443, + "logps/chosen": -121.02259063720703, + "logps/rejected": -240.55178833007812, + "loss": 0.2205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29045093059539795, + "rewards/margins": 2.632274627685547, + "rewards/rejected": -2.9227256774902344, + "step": 7616 + }, + { + "epoch": 0.88, + "learning_rate": 3.7118108392836235e-08, + "logits/chosen": -1.816133737564087, + "logits/rejected": -2.121682643890381, + "logps/chosen": -324.0577697753906, + "logps/rejected": -241.56378173828125, + "loss": 0.5124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9316275715827942, + "rewards/margins": 2.0875449180603027, + "rewards/rejected": -3.019172430038452, + "step": 7617 + }, + { + "epoch": 0.88, + "learning_rate": 3.708299192321198e-08, + "logits/chosen": -2.1340603828430176, + "logits/rejected": -2.3059380054473877, + "logps/chosen": -282.2361145019531, + "logps/rejected": -269.09765625, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0715084075927734, + "rewards/margins": 3.7027251720428467, + "rewards/rejected": -4.774233341217041, + "step": 7618 + }, + { + "epoch": 0.88, + "learning_rate": 3.704787545358773e-08, + "logits/chosen": -1.8381123542785645, + "logits/rejected": -1.8967840671539307, + "logps/chosen": -342.41583251953125, + "logps/rejected": -291.07305908203125, + "loss": 0.3967, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4344272017478943, + "rewards/margins": 1.478322982788086, + "rewards/rejected": -1.912750244140625, + "step": 7619 + }, + { + "epoch": 0.88, + "learning_rate": 3.701275898396348e-08, + "logits/chosen": -2.9696996212005615, + "logits/rejected": -3.00032114982605, + "logps/chosen": -287.9123229980469, + "logps/rejected": -320.58612060546875, + "loss": 0.2964, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9019024968147278, + "rewards/margins": 2.562774181365967, + "rewards/rejected": -3.46467661857605, + "step": 7620 + }, + { + "epoch": 0.88, + "learning_rate": 3.6977642514339224e-08, + "logits/chosen": -2.05057954788208, + "logits/rejected": -2.2364606857299805, + "logps/chosen": -460.4440002441406, + "logps/rejected": -354.68157958984375, + "loss": 0.1605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7960618138313293, + "rewards/margins": 3.218963384628296, + "rewards/rejected": -4.0150251388549805, + "step": 7621 + }, + { + "epoch": 0.88, + "learning_rate": 3.694252604471497e-08, + "logits/chosen": -2.369102954864502, + "logits/rejected": -2.1293792724609375, + "logps/chosen": -215.52981567382812, + "logps/rejected": -284.6708679199219, + "loss": 0.2959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6570247411727905, + "rewards/margins": 1.9379452466964722, + "rewards/rejected": -2.5949699878692627, + "step": 7622 + }, + { + "epoch": 0.88, + "learning_rate": 3.690740957509072e-08, + "logits/chosen": -2.306387186050415, + "logits/rejected": -2.5195717811584473, + "logps/chosen": -342.00872802734375, + "logps/rejected": -240.43154907226562, + "loss": 0.8593, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.567335307598114, + "rewards/margins": 1.751168966293335, + "rewards/rejected": -2.3185043334960938, + "step": 7623 + }, + { + "epoch": 0.88, + "learning_rate": 3.687229310546646e-08, + "logits/chosen": -2.4011125564575195, + "logits/rejected": -2.226346969604492, + "logps/chosen": -245.65634155273438, + "logps/rejected": -328.53729248046875, + "loss": 0.3691, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.277052879333496, + "rewards/margins": 1.755968689918518, + "rewards/rejected": -3.0330216884613037, + "step": 7624 + }, + { + "epoch": 0.88, + "learning_rate": 3.683717663584221e-08, + "logits/chosen": -2.8983306884765625, + "logits/rejected": -2.936523199081421, + "logps/chosen": -139.89517211914062, + "logps/rejected": -135.12225341796875, + "loss": 0.3134, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.514391303062439, + "rewards/margins": 2.047098159790039, + "rewards/rejected": -2.5614898204803467, + "step": 7625 + }, + { + "epoch": 0.88, + "learning_rate": 3.6802060166217954e-08, + "logits/chosen": -2.251530170440674, + "logits/rejected": -2.325709581375122, + "logps/chosen": -185.07679748535156, + "logps/rejected": -241.65501403808594, + "loss": 0.3827, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1687101125717163, + "rewards/margins": 2.35092830657959, + "rewards/rejected": -3.5196385383605957, + "step": 7626 + }, + { + "epoch": 0.88, + "learning_rate": 3.67669436965937e-08, + "logits/chosen": -2.517899751663208, + "logits/rejected": -2.657790184020996, + "logps/chosen": -463.03472900390625, + "logps/rejected": -285.5252990722656, + "loss": 0.1284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.455763578414917, + "rewards/margins": 2.6972789764404297, + "rewards/rejected": -3.153042793273926, + "step": 7627 + }, + { + "epoch": 0.88, + "learning_rate": 3.673182722696945e-08, + "logits/chosen": -2.036123752593994, + "logits/rejected": -2.4560182094573975, + "logps/chosen": -516.357177734375, + "logps/rejected": -259.60009765625, + "loss": 0.2135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15285740792751312, + "rewards/margins": 2.346633195877075, + "rewards/rejected": -2.499490737915039, + "step": 7628 + }, + { + "epoch": 0.88, + "learning_rate": 3.6696710757345196e-08, + "logits/chosen": -2.397996664047241, + "logits/rejected": -2.351597785949707, + "logps/chosen": -309.767333984375, + "logps/rejected": -305.6370849609375, + "loss": 0.7302, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6956058740615845, + "rewards/margins": 3.248697280883789, + "rewards/rejected": -4.944303035736084, + "step": 7629 + }, + { + "epoch": 0.88, + "learning_rate": 3.666159428772094e-08, + "logits/chosen": -1.8792271614074707, + "logits/rejected": -2.1552324295043945, + "logps/chosen": -422.0877685546875, + "logps/rejected": -246.3463592529297, + "loss": 1.3404, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5022215843200684, + "rewards/margins": 0.35583919286727905, + "rewards/rejected": -2.858060836791992, + "step": 7630 + }, + { + "epoch": 0.88, + "learning_rate": 3.6626477818096684e-08, + "logits/chosen": -2.518083333969116, + "logits/rejected": -2.1180076599121094, + "logps/chosen": -244.14105224609375, + "logps/rejected": -263.4066162109375, + "loss": 0.1679, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3674881160259247, + "rewards/margins": 2.5561699867248535, + "rewards/rejected": -2.9236578941345215, + "step": 7631 + }, + { + "epoch": 0.88, + "learning_rate": 3.659136134847243e-08, + "logits/chosen": -2.3760385513305664, + "logits/rejected": -2.6306376457214355, + "logps/chosen": -87.7342529296875, + "logps/rejected": -153.00877380371094, + "loss": 0.5464, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2406471967697144, + "rewards/margins": 2.2922098636627197, + "rewards/rejected": -3.5328569412231445, + "step": 7632 + }, + { + "epoch": 0.88, + "learning_rate": 3.655624487884818e-08, + "logits/chosen": -2.546008825302124, + "logits/rejected": -2.542081832885742, + "logps/chosen": -254.76669311523438, + "logps/rejected": -235.71951293945312, + "loss": 0.4484, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.672581672668457, + "rewards/margins": 2.2298853397369385, + "rewards/rejected": -3.9024670124053955, + "step": 7633 + }, + { + "epoch": 0.88, + "learning_rate": 3.6521128409223925e-08, + "logits/chosen": -2.1725401878356934, + "logits/rejected": -2.2664358615875244, + "logps/chosen": -239.4859619140625, + "logps/rejected": -205.58444213867188, + "loss": 0.2122, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0798816680908203, + "rewards/margins": 2.1151492595672607, + "rewards/rejected": -3.195030927658081, + "step": 7634 + }, + { + "epoch": 0.88, + "learning_rate": 3.648601193959967e-08, + "logits/chosen": -2.7167625427246094, + "logits/rejected": -2.596041202545166, + "logps/chosen": -107.46363830566406, + "logps/rejected": -151.67214965820312, + "loss": 0.4662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0779465436935425, + "rewards/margins": 1.442287802696228, + "rewards/rejected": -2.5202341079711914, + "step": 7635 + }, + { + "epoch": 0.88, + "learning_rate": 3.645089546997542e-08, + "logits/chosen": -1.8701915740966797, + "logits/rejected": -2.5224246978759766, + "logps/chosen": -411.1097412109375, + "logps/rejected": -285.65185546875, + "loss": 0.144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1277473121881485, + "rewards/margins": 3.5440077781677246, + "rewards/rejected": -3.671755075454712, + "step": 7636 + }, + { + "epoch": 0.88, + "learning_rate": 3.641577900035117e-08, + "logits/chosen": -1.957030177116394, + "logits/rejected": -2.1735100746154785, + "logps/chosen": -381.7926330566406, + "logps/rejected": -299.6653747558594, + "loss": 0.553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9712985754013062, + "rewards/margins": 1.312341332435608, + "rewards/rejected": -2.283639907836914, + "step": 7637 + }, + { + "epoch": 0.88, + "learning_rate": 3.638066253072691e-08, + "logits/chosen": -2.1870203018188477, + "logits/rejected": -2.4979805946350098, + "logps/chosen": -326.92474365234375, + "logps/rejected": -310.567626953125, + "loss": 0.3834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6806592345237732, + "rewards/margins": 2.376617431640625, + "rewards/rejected": -3.057276487350464, + "step": 7638 + }, + { + "epoch": 0.88, + "learning_rate": 3.6345546061102655e-08, + "logits/chosen": -1.315168857574463, + "logits/rejected": -1.9217079877853394, + "logps/chosen": -532.4680786132812, + "logps/rejected": -313.51806640625, + "loss": 0.6361, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3494558334350586, + "rewards/margins": 1.0063464641571045, + "rewards/rejected": -1.355802297592163, + "step": 7639 + }, + { + "epoch": 0.88, + "learning_rate": 3.63104295914784e-08, + "logits/chosen": -2.1987226009368896, + "logits/rejected": -2.3495471477508545, + "logps/chosen": -234.27865600585938, + "logps/rejected": -282.8988037109375, + "loss": 1.2173, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8428293466567993, + "rewards/margins": 0.19882401823997498, + "rewards/rejected": -2.0416533946990967, + "step": 7640 + }, + { + "epoch": 0.88, + "learning_rate": 3.627531312185415e-08, + "logits/chosen": -2.16135835647583, + "logits/rejected": -2.2725889682769775, + "logps/chosen": -339.17987060546875, + "logps/rejected": -281.27911376953125, + "loss": 0.4854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5599008798599243, + "rewards/margins": 1.85972261428833, + "rewards/rejected": -2.419623613357544, + "step": 7641 + }, + { + "epoch": 0.88, + "learning_rate": 3.6240196652229897e-08, + "logits/chosen": -1.9190900325775146, + "logits/rejected": -2.196112871170044, + "logps/chosen": -391.73126220703125, + "logps/rejected": -411.88397216796875, + "loss": 0.3439, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0294833183288574, + "rewards/margins": 1.5182551145553589, + "rewards/rejected": -2.547738552093506, + "step": 7642 + }, + { + "epoch": 0.88, + "learning_rate": 3.6205080182605644e-08, + "logits/chosen": -2.3411865234375, + "logits/rejected": -2.4146571159362793, + "logps/chosen": -268.1297302246094, + "logps/rejected": -262.19268798828125, + "loss": 0.2743, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7934829592704773, + "rewards/margins": 1.772361397743225, + "rewards/rejected": -2.5658445358276367, + "step": 7643 + }, + { + "epoch": 0.88, + "learning_rate": 3.616996371298139e-08, + "logits/chosen": -1.8894768953323364, + "logits/rejected": -1.871047019958496, + "logps/chosen": -201.58984375, + "logps/rejected": -278.2076416015625, + "loss": 0.6576, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8301854133605957, + "rewards/margins": 0.7942512035369873, + "rewards/rejected": -1.624436378479004, + "step": 7644 + }, + { + "epoch": 0.88, + "learning_rate": 3.613484724335713e-08, + "logits/chosen": -2.3971405029296875, + "logits/rejected": -2.4892516136169434, + "logps/chosen": -324.4115905761719, + "logps/rejected": -486.62677001953125, + "loss": 0.3826, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9376389384269714, + "rewards/margins": 1.5543683767318726, + "rewards/rejected": -2.492007255554199, + "step": 7645 + }, + { + "epoch": 0.88, + "learning_rate": 3.609973077373288e-08, + "logits/chosen": -2.2361679077148438, + "logits/rejected": -1.9648921489715576, + "logps/chosen": -255.9193115234375, + "logps/rejected": -348.984130859375, + "loss": 0.0937, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5614056587219238, + "rewards/margins": 5.066359519958496, + "rewards/rejected": -6.627764701843262, + "step": 7646 + }, + { + "epoch": 0.88, + "learning_rate": 3.6064614304108626e-08, + "logits/chosen": -2.3607125282287598, + "logits/rejected": -2.360319137573242, + "logps/chosen": -235.31703186035156, + "logps/rejected": -269.70233154296875, + "loss": 1.3728, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4844701290130615, + "rewards/margins": -0.0728369951248169, + "rewards/rejected": -1.411633014678955, + "step": 7647 + }, + { + "epoch": 0.88, + "learning_rate": 3.602949783448437e-08, + "logits/chosen": -2.4825825691223145, + "logits/rejected": -2.382988691329956, + "logps/chosen": -243.44277954101562, + "logps/rejected": -194.50741577148438, + "loss": 0.4585, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.142404556274414, + "rewards/margins": 0.7458131909370422, + "rewards/rejected": -1.8882179260253906, + "step": 7648 + }, + { + "epoch": 0.88, + "learning_rate": 3.5994381364860114e-08, + "logits/chosen": -2.6230669021606445, + "logits/rejected": -2.5938446521759033, + "logps/chosen": -282.06597900390625, + "logps/rejected": -259.9070129394531, + "loss": 0.3528, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1031594276428223, + "rewards/margins": 1.8995213508605957, + "rewards/rejected": -4.002680778503418, + "step": 7649 + }, + { + "epoch": 0.88, + "learning_rate": 3.595926489523586e-08, + "logits/chosen": -1.8734415769577026, + "logits/rejected": -1.6792871952056885, + "logps/chosen": -289.23089599609375, + "logps/rejected": -305.80340576171875, + "loss": 0.2862, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49450063705444336, + "rewards/margins": 2.111537218093872, + "rewards/rejected": -2.6060376167297363, + "step": 7650 + }, + { + "epoch": 0.88, + "learning_rate": 3.592414842561161e-08, + "logits/chosen": -2.1359872817993164, + "logits/rejected": -2.4251315593719482, + "logps/chosen": -349.60626220703125, + "logps/rejected": -360.6485900878906, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0068488121032715, + "rewards/margins": 1.1401435136795044, + "rewards/rejected": -2.1469924449920654, + "step": 7651 + }, + { + "epoch": 0.88, + "learning_rate": 3.5889031955987356e-08, + "logits/chosen": -2.6894404888153076, + "logits/rejected": -2.822608232498169, + "logps/chosen": -266.65985107421875, + "logps/rejected": -264.83099365234375, + "loss": 0.4858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7015536427497864, + "rewards/margins": 1.511680245399475, + "rewards/rejected": -2.2132339477539062, + "step": 7652 + }, + { + "epoch": 0.88, + "learning_rate": 3.58539154863631e-08, + "logits/chosen": -1.9303874969482422, + "logits/rejected": -2.4172441959381104, + "logps/chosen": -310.4579162597656, + "logps/rejected": -332.2309875488281, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2793598771095276, + "rewards/margins": 1.7261765003204346, + "rewards/rejected": -2.0055360794067383, + "step": 7653 + }, + { + "epoch": 0.88, + "learning_rate": 3.581879901673885e-08, + "logits/chosen": -2.0367050170898438, + "logits/rejected": -2.009347677230835, + "logps/chosen": -308.9525146484375, + "logps/rejected": -287.2828674316406, + "loss": 0.381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7367438673973083, + "rewards/margins": 1.6765090227127075, + "rewards/rejected": -2.413252830505371, + "step": 7654 + }, + { + "epoch": 0.88, + "learning_rate": 3.578368254711459e-08, + "logits/chosen": -2.1706879138946533, + "logits/rejected": -2.33687686920166, + "logps/chosen": -251.94796752929688, + "logps/rejected": -306.6009826660156, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9019744992256165, + "rewards/margins": 3.528773069381714, + "rewards/rejected": -4.430747985839844, + "step": 7655 + }, + { + "epoch": 0.88, + "learning_rate": 3.574856607749034e-08, + "logits/chosen": -2.2455506324768066, + "logits/rejected": -2.016961097717285, + "logps/chosen": -222.58204650878906, + "logps/rejected": -321.8441162109375, + "loss": 0.2589, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2461462020874023, + "rewards/margins": 2.6042733192443848, + "rewards/rejected": -3.8504199981689453, + "step": 7656 + }, + { + "epoch": 0.88, + "learning_rate": 3.5713449607866085e-08, + "logits/chosen": -2.7106754779815674, + "logits/rejected": -2.9082953929901123, + "logps/chosen": -388.475341796875, + "logps/rejected": -303.8260498046875, + "loss": 0.247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43714460730552673, + "rewards/margins": 2.528367280960083, + "rewards/rejected": -2.9655117988586426, + "step": 7657 + }, + { + "epoch": 0.88, + "learning_rate": 3.567833313824183e-08, + "logits/chosen": -2.328787088394165, + "logits/rejected": -2.3466796875, + "logps/chosen": -170.07644653320312, + "logps/rejected": -213.994384765625, + "loss": 0.2677, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8228338956832886, + "rewards/margins": 2.2984375953674316, + "rewards/rejected": -3.1212713718414307, + "step": 7658 + }, + { + "epoch": 0.88, + "learning_rate": 3.564321666861758e-08, + "logits/chosen": -2.128739356994629, + "logits/rejected": -2.423116683959961, + "logps/chosen": -256.5826416015625, + "logps/rejected": -139.09425354003906, + "loss": 0.2264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2197335660457611, + "rewards/margins": 2.3784451484680176, + "rewards/rejected": -2.5981788635253906, + "step": 7659 + }, + { + "epoch": 0.88, + "learning_rate": 3.560810019899333e-08, + "logits/chosen": -2.4460983276367188, + "logits/rejected": -2.6357386112213135, + "logps/chosen": -274.1183776855469, + "logps/rejected": -232.5332794189453, + "loss": 0.3836, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1439675092697144, + "rewards/margins": 2.0147812366485596, + "rewards/rejected": -3.1587486267089844, + "step": 7660 + }, + { + "epoch": 0.88, + "learning_rate": 3.5572983729369074e-08, + "logits/chosen": -2.18436336517334, + "logits/rejected": -2.3753700256347656, + "logps/chosen": -371.6943359375, + "logps/rejected": -190.6923370361328, + "loss": 0.36, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3921122550964355, + "rewards/margins": 1.7094831466674805, + "rewards/rejected": -3.101595401763916, + "step": 7661 + }, + { + "epoch": 0.88, + "learning_rate": 3.5537867259744815e-08, + "logits/chosen": -2.4731829166412354, + "logits/rejected": -2.587034225463867, + "logps/chosen": -246.12551879882812, + "logps/rejected": -263.6348876953125, + "loss": 0.2035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16244837641716003, + "rewards/margins": 2.951547145843506, + "rewards/rejected": -3.1139955520629883, + "step": 7662 + }, + { + "epoch": 0.88, + "learning_rate": 3.550275079012056e-08, + "logits/chosen": -2.4781503677368164, + "logits/rejected": -2.6346521377563477, + "logps/chosen": -274.7044982910156, + "logps/rejected": -232.7277069091797, + "loss": 0.3966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.78547203540802, + "rewards/margins": 1.7934417724609375, + "rewards/rejected": -2.578913927078247, + "step": 7663 + }, + { + "epoch": 0.88, + "learning_rate": 3.546763432049631e-08, + "logits/chosen": -2.1284079551696777, + "logits/rejected": -2.255500316619873, + "logps/chosen": -315.6387939453125, + "logps/rejected": -179.48582458496094, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.198010802268982, + "rewards/margins": 0.958947479724884, + "rewards/rejected": -2.1569583415985107, + "step": 7664 + }, + { + "epoch": 0.88, + "learning_rate": 3.5432517850872057e-08, + "logits/chosen": -2.669585704803467, + "logits/rejected": -2.419811725616455, + "logps/chosen": -168.63848876953125, + "logps/rejected": -181.93116760253906, + "loss": 0.4621, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.212699294090271, + "rewards/margins": 2.371127128601074, + "rewards/rejected": -3.5838265419006348, + "step": 7665 + }, + { + "epoch": 0.88, + "learning_rate": 3.5397401381247804e-08, + "logits/chosen": -2.656625986099243, + "logits/rejected": -2.706429958343506, + "logps/chosen": -242.05914306640625, + "logps/rejected": -219.478515625, + "loss": 0.307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8480664491653442, + "rewards/margins": 2.327561378479004, + "rewards/rejected": -3.1756279468536377, + "step": 7666 + }, + { + "epoch": 0.88, + "learning_rate": 3.536228491162355e-08, + "logits/chosen": -2.255711078643799, + "logits/rejected": -2.5622527599334717, + "logps/chosen": -336.4443664550781, + "logps/rejected": -207.42298889160156, + "loss": 0.7418, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6150546073913574, + "rewards/margins": 0.6354745030403137, + "rewards/rejected": -1.2505290508270264, + "step": 7667 + }, + { + "epoch": 0.88, + "learning_rate": 3.53271684419993e-08, + "logits/chosen": -2.69334077835083, + "logits/rejected": -2.702910900115967, + "logps/chosen": -207.30508422851562, + "logps/rejected": -281.2126770019531, + "loss": 0.2377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45038920640945435, + "rewards/margins": 2.5339245796203613, + "rewards/rejected": -2.98431396484375, + "step": 7668 + }, + { + "epoch": 0.88, + "learning_rate": 3.529205197237504e-08, + "logits/chosen": -2.308938503265381, + "logits/rejected": -2.50313401222229, + "logps/chosen": -268.8461608886719, + "logps/rejected": -295.4465026855469, + "loss": 0.2601, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20993784070014954, + "rewards/margins": 2.776597023010254, + "rewards/rejected": -2.986534833908081, + "step": 7669 + }, + { + "epoch": 0.88, + "learning_rate": 3.5256935502750786e-08, + "logits/chosen": -1.6740827560424805, + "logits/rejected": -2.231977939605713, + "logps/chosen": -540.766357421875, + "logps/rejected": -242.0211181640625, + "loss": 0.1845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4685783386230469, + "rewards/margins": 2.1850104331970215, + "rewards/rejected": -2.6535885334014893, + "step": 7670 + }, + { + "epoch": 0.88, + "learning_rate": 3.5221819033126533e-08, + "logits/chosen": -1.7859537601470947, + "logits/rejected": -2.126965284347534, + "logps/chosen": -299.497314453125, + "logps/rejected": -243.0687713623047, + "loss": 0.3023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49298980832099915, + "rewards/margins": 2.5874481201171875, + "rewards/rejected": -3.0804381370544434, + "step": 7671 + }, + { + "epoch": 0.88, + "learning_rate": 3.518670256350228e-08, + "logits/chosen": -2.834700584411621, + "logits/rejected": -2.732513189315796, + "logps/chosen": -207.63978576660156, + "logps/rejected": -394.83306884765625, + "loss": 0.1924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3270772099494934, + "rewards/margins": 3.0009262561798096, + "rewards/rejected": -3.328003406524658, + "step": 7672 + }, + { + "epoch": 0.88, + "learning_rate": 3.515158609387803e-08, + "logits/chosen": -2.0258522033691406, + "logits/rejected": -2.124094247817993, + "logps/chosen": -338.9714660644531, + "logps/rejected": -250.68482971191406, + "loss": 0.4231, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6678339838981628, + "rewards/margins": 1.548957109451294, + "rewards/rejected": -2.2167911529541016, + "step": 7673 + }, + { + "epoch": 0.88, + "learning_rate": 3.5116469624253775e-08, + "logits/chosen": -1.8853163719177246, + "logits/rejected": -2.122370958328247, + "logps/chosen": -470.3941650390625, + "logps/rejected": -381.4261474609375, + "loss": 0.421, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02705879509449005, + "rewards/margins": 2.780996084213257, + "rewards/rejected": -2.8080549240112305, + "step": 7674 + }, + { + "epoch": 0.88, + "learning_rate": 3.508135315462952e-08, + "logits/chosen": -2.3904552459716797, + "logits/rejected": -2.4282150268554688, + "logps/chosen": -378.7237548828125, + "logps/rejected": -265.2531433105469, + "loss": 0.6104, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9716200232505798, + "rewards/margins": 0.9817904829978943, + "rewards/rejected": -1.9534105062484741, + "step": 7675 + }, + { + "epoch": 0.88, + "learning_rate": 3.504623668500527e-08, + "logits/chosen": -2.1094751358032227, + "logits/rejected": -2.2373785972595215, + "logps/chosen": -314.8578186035156, + "logps/rejected": -317.5130310058594, + "loss": 0.3838, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2882606983184814, + "rewards/margins": 2.068084716796875, + "rewards/rejected": -3.3563456535339355, + "step": 7676 + }, + { + "epoch": 0.89, + "learning_rate": 3.501112021538101e-08, + "logits/chosen": -2.4764556884765625, + "logits/rejected": -2.531290054321289, + "logps/chosen": -458.5775146484375, + "logps/rejected": -217.2240447998047, + "loss": 0.2932, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6712689995765686, + "rewards/margins": 1.9117660522460938, + "rewards/rejected": -2.5830349922180176, + "step": 7677 + }, + { + "epoch": 0.89, + "learning_rate": 3.497600374575676e-08, + "logits/chosen": -2.276031017303467, + "logits/rejected": -2.4337222576141357, + "logps/chosen": -254.1245880126953, + "logps/rejected": -132.2064971923828, + "loss": 0.5523, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9094932079315186, + "rewards/margins": 0.8025199770927429, + "rewards/rejected": -1.7120131254196167, + "step": 7678 + }, + { + "epoch": 0.89, + "learning_rate": 3.4940887276132505e-08, + "logits/chosen": -2.5685272216796875, + "logits/rejected": -2.421717643737793, + "logps/chosen": -129.11563110351562, + "logps/rejected": -172.61985778808594, + "loss": 0.4339, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0580546855926514, + "rewards/margins": 1.3002249002456665, + "rewards/rejected": -2.3582797050476074, + "step": 7679 + }, + { + "epoch": 0.89, + "learning_rate": 3.490577080650825e-08, + "logits/chosen": -2.6627707481384277, + "logits/rejected": -2.723033905029297, + "logps/chosen": -328.5127258300781, + "logps/rejected": -307.375, + "loss": 0.8403, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4634556770324707, + "rewards/margins": 1.845894694328308, + "rewards/rejected": -3.3093504905700684, + "step": 7680 + }, + { + "epoch": 0.89, + "learning_rate": 3.4870654336884e-08, + "logits/chosen": -2.1690757274627686, + "logits/rejected": -2.2740631103515625, + "logps/chosen": -235.9796142578125, + "logps/rejected": -201.190185546875, + "loss": 0.8947, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4504756927490234, + "rewards/margins": 0.6948909759521484, + "rewards/rejected": -2.145366668701172, + "step": 7681 + }, + { + "epoch": 0.89, + "learning_rate": 3.4835537867259746e-08, + "logits/chosen": -2.008413553237915, + "logits/rejected": -2.2767932415008545, + "logps/chosen": -590.4195556640625, + "logps/rejected": -407.90557861328125, + "loss": 0.7101, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.187876582145691, + "rewards/margins": 0.7277486324310303, + "rewards/rejected": -1.9156250953674316, + "step": 7682 + }, + { + "epoch": 0.89, + "learning_rate": 3.4800421397635494e-08, + "logits/chosen": -1.3911703824996948, + "logits/rejected": -2.014355421066284, + "logps/chosen": -429.91986083984375, + "logps/rejected": -228.23060607910156, + "loss": 0.3519, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43580788373947144, + "rewards/margins": 1.5024839639663696, + "rewards/rejected": -1.9382917881011963, + "step": 7683 + }, + { + "epoch": 0.89, + "learning_rate": 3.4765304928011234e-08, + "logits/chosen": -2.5406575202941895, + "logits/rejected": -2.6221420764923096, + "logps/chosen": -359.62860107421875, + "logps/rejected": -382.4454650878906, + "loss": 0.5647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35041648149490356, + "rewards/margins": 1.809815526008606, + "rewards/rejected": -2.160231828689575, + "step": 7684 + }, + { + "epoch": 0.89, + "learning_rate": 3.473018845838698e-08, + "logits/chosen": -2.5634779930114746, + "logits/rejected": -2.651944637298584, + "logps/chosen": -257.21038818359375, + "logps/rejected": -292.8565673828125, + "loss": 0.3725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2962853908538818, + "rewards/margins": 2.215205669403076, + "rewards/rejected": -3.511490821838379, + "step": 7685 + }, + { + "epoch": 0.89, + "learning_rate": 3.469507198876273e-08, + "logits/chosen": -1.9232065677642822, + "logits/rejected": -2.0166378021240234, + "logps/chosen": -336.1746520996094, + "logps/rejected": -332.3335266113281, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0493605136871338, + "rewards/margins": 0.49245262145996094, + "rewards/rejected": -1.5418131351470947, + "step": 7686 + }, + { + "epoch": 0.89, + "learning_rate": 3.4659955519138476e-08, + "logits/chosen": -2.5011048316955566, + "logits/rejected": -2.648940324783325, + "logps/chosen": -368.1035461425781, + "logps/rejected": -252.93235778808594, + "loss": 0.6098, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.269737720489502, + "rewards/margins": 0.9992527961730957, + "rewards/rejected": -2.2689905166625977, + "step": 7687 + }, + { + "epoch": 0.89, + "learning_rate": 3.462483904951422e-08, + "logits/chosen": -2.5897884368896484, + "logits/rejected": -2.7001051902770996, + "logps/chosen": -557.380126953125, + "logps/rejected": -305.0050354003906, + "loss": 0.1256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2431885302066803, + "rewards/margins": 3.0542638301849365, + "rewards/rejected": -3.297452211380005, + "step": 7688 + }, + { + "epoch": 0.89, + "learning_rate": 3.458972257988997e-08, + "logits/chosen": -1.6825684309005737, + "logits/rejected": -2.1921818256378174, + "logps/chosen": -317.08245849609375, + "logps/rejected": -209.4751739501953, + "loss": 0.2204, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17619775235652924, + "rewards/margins": 2.747352123260498, + "rewards/rejected": -2.5711543560028076, + "step": 7689 + }, + { + "epoch": 0.89, + "learning_rate": 3.455460611026572e-08, + "logits/chosen": -2.317370891571045, + "logits/rejected": -2.3082053661346436, + "logps/chosen": -303.9096984863281, + "logps/rejected": -310.05499267578125, + "loss": 0.2479, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.597853422164917, + "rewards/margins": 2.38346004486084, + "rewards/rejected": -2.9813132286071777, + "step": 7690 + }, + { + "epoch": 0.89, + "learning_rate": 3.451948964064146e-08, + "logits/chosen": -1.9971892833709717, + "logits/rejected": -1.8831970691680908, + "logps/chosen": -299.331787109375, + "logps/rejected": -283.55120849609375, + "loss": 0.9673, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0332856178283691, + "rewards/margins": 0.3862651288509369, + "rewards/rejected": -1.4195507764816284, + "step": 7691 + }, + { + "epoch": 0.89, + "learning_rate": 3.4484373171017205e-08, + "logits/chosen": -2.1585800647735596, + "logits/rejected": -2.1952123641967773, + "logps/chosen": -249.69146728515625, + "logps/rejected": -202.83152770996094, + "loss": 0.3144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14795465767383575, + "rewards/margins": 2.4162540435791016, + "rewards/rejected": -2.564208745956421, + "step": 7692 + }, + { + "epoch": 0.89, + "learning_rate": 3.444925670139295e-08, + "logits/chosen": -2.4637129306793213, + "logits/rejected": -2.400831699371338, + "logps/chosen": -276.20611572265625, + "logps/rejected": -401.423583984375, + "loss": 0.4417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9625006318092346, + "rewards/margins": 2.850466728210449, + "rewards/rejected": -3.812967300415039, + "step": 7693 + }, + { + "epoch": 0.89, + "learning_rate": 3.44141402317687e-08, + "logits/chosen": -2.2919111251831055, + "logits/rejected": -2.394190549850464, + "logps/chosen": -300.20458984375, + "logps/rejected": -296.2492370605469, + "loss": 0.2219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6197214126586914, + "rewards/margins": 3.0372073650360107, + "rewards/rejected": -3.656928539276123, + "step": 7694 + }, + { + "epoch": 0.89, + "learning_rate": 3.437902376214445e-08, + "logits/chosen": -2.0587637424468994, + "logits/rejected": -2.084031343460083, + "logps/chosen": -149.17428588867188, + "logps/rejected": -209.0660400390625, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5089694261550903, + "rewards/margins": 2.75356125831604, + "rewards/rejected": -3.26253080368042, + "step": 7695 + }, + { + "epoch": 0.89, + "learning_rate": 3.434390729252019e-08, + "logits/chosen": -2.6698880195617676, + "logits/rejected": -2.5368635654449463, + "logps/chosen": -110.40037536621094, + "logps/rejected": -158.10386657714844, + "loss": 0.4647, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.850436806678772, + "rewards/margins": 2.102870464324951, + "rewards/rejected": -2.9533073902130127, + "step": 7696 + }, + { + "epoch": 0.89, + "learning_rate": 3.4308790822895935e-08, + "logits/chosen": -2.516723155975342, + "logits/rejected": -2.6093857288360596, + "logps/chosen": -168.3937530517578, + "logps/rejected": -211.00521850585938, + "loss": 1.1784, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5318776369094849, + "rewards/margins": 0.6846703886985779, + "rewards/rejected": -2.216547966003418, + "step": 7697 + }, + { + "epoch": 0.89, + "learning_rate": 3.427367435327168e-08, + "logits/chosen": -2.285191297531128, + "logits/rejected": -2.252102851867676, + "logps/chosen": -147.36692810058594, + "logps/rejected": -240.16006469726562, + "loss": 0.7564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6192715167999268, + "rewards/margins": 0.9118064641952515, + "rewards/rejected": -1.5310779809951782, + "step": 7698 + }, + { + "epoch": 0.89, + "learning_rate": 3.423855788364743e-08, + "logits/chosen": -2.0279672145843506, + "logits/rejected": -2.1023826599121094, + "logps/chosen": -274.4266052246094, + "logps/rejected": -235.86245727539062, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5651324391365051, + "rewards/margins": 1.07767653465271, + "rewards/rejected": -1.6428089141845703, + "step": 7699 + }, + { + "epoch": 0.89, + "learning_rate": 3.420344141402318e-08, + "logits/chosen": -2.370246410369873, + "logits/rejected": -2.5675606727600098, + "logps/chosen": -418.8695068359375, + "logps/rejected": -235.29994201660156, + "loss": 0.7819, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7753409147262573, + "rewards/margins": 0.9380996823310852, + "rewards/rejected": -1.7134405374526978, + "step": 7700 + }, + { + "epoch": 0.89, + "learning_rate": 3.416832494439892e-08, + "logits/chosen": -1.7012174129486084, + "logits/rejected": -1.993349313735962, + "logps/chosen": -367.7757873535156, + "logps/rejected": -229.60215759277344, + "loss": 0.2445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012983381748199463, + "rewards/margins": 2.453892469406128, + "rewards/rejected": -2.4668760299682617, + "step": 7701 + }, + { + "epoch": 0.89, + "learning_rate": 3.4133208474774665e-08, + "logits/chosen": -2.5794014930725098, + "logits/rejected": -2.63238525390625, + "logps/chosen": -178.0714569091797, + "logps/rejected": -233.79396057128906, + "loss": 0.1414, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1294524669647217, + "rewards/margins": 2.976274251937866, + "rewards/rejected": -4.105726718902588, + "step": 7702 + }, + { + "epoch": 0.89, + "learning_rate": 3.409809200515041e-08, + "logits/chosen": -2.313204765319824, + "logits/rejected": -2.4116480350494385, + "logps/chosen": -434.841552734375, + "logps/rejected": -460.95660400390625, + "loss": 0.3077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44623351097106934, + "rewards/margins": 2.4482641220092773, + "rewards/rejected": -2.8944973945617676, + "step": 7703 + }, + { + "epoch": 0.89, + "learning_rate": 3.406297553552616e-08, + "logits/chosen": -2.152040719985962, + "logits/rejected": -2.1907436847686768, + "logps/chosen": -308.7745361328125, + "logps/rejected": -273.82794189453125, + "loss": 0.5955, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5848753452301025, + "rewards/margins": 0.5045797228813171, + "rewards/rejected": -2.0894551277160645, + "step": 7704 + }, + { + "epoch": 0.89, + "learning_rate": 3.4027859065901906e-08, + "logits/chosen": -2.8790524005889893, + "logits/rejected": -2.891763925552368, + "logps/chosen": -127.2575454711914, + "logps/rejected": -329.5953674316406, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1890193223953247, + "rewards/margins": 3.9832963943481445, + "rewards/rejected": -4.17231559753418, + "step": 7705 + }, + { + "epoch": 0.89, + "learning_rate": 3.3992742596277654e-08, + "logits/chosen": -2.199113368988037, + "logits/rejected": -1.9829490184783936, + "logps/chosen": -153.1700439453125, + "logps/rejected": -211.8437957763672, + "loss": 0.6378, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5239874720573425, + "rewards/margins": 1.29427170753479, + "rewards/rejected": -1.8182592391967773, + "step": 7706 + }, + { + "epoch": 0.89, + "learning_rate": 3.39576261266534e-08, + "logits/chosen": -1.9719116687774658, + "logits/rejected": -1.8408949375152588, + "logps/chosen": -222.666748046875, + "logps/rejected": -271.96649169921875, + "loss": 0.3718, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.967483401298523, + "rewards/margins": 1.594865322113037, + "rewards/rejected": -2.5623486042022705, + "step": 7707 + }, + { + "epoch": 0.89, + "learning_rate": 3.392250965702914e-08, + "logits/chosen": -2.3984224796295166, + "logits/rejected": -2.656397581100464, + "logps/chosen": -311.6865234375, + "logps/rejected": -208.355712890625, + "loss": 0.6541, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2915862798690796, + "rewards/margins": 1.0772355794906616, + "rewards/rejected": -2.368821859359741, + "step": 7708 + }, + { + "epoch": 0.89, + "learning_rate": 3.388739318740489e-08, + "logits/chosen": -2.4026787281036377, + "logits/rejected": -2.2649452686309814, + "logps/chosen": -209.80947875976562, + "logps/rejected": -271.5369873046875, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8190467953681946, + "rewards/margins": 2.4968314170837402, + "rewards/rejected": -3.315878391265869, + "step": 7709 + }, + { + "epoch": 0.89, + "learning_rate": 3.3852276717780636e-08, + "logits/chosen": -2.287297010421753, + "logits/rejected": -2.252338409423828, + "logps/chosen": -129.02345275878906, + "logps/rejected": -229.91497802734375, + "loss": 0.1102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7598488330841064, + "rewards/margins": 3.247920513153076, + "rewards/rejected": -4.0077691078186035, + "step": 7710 + }, + { + "epoch": 0.89, + "learning_rate": 3.381716024815638e-08, + "logits/chosen": -2.9230844974517822, + "logits/rejected": -2.924823760986328, + "logps/chosen": -138.87924194335938, + "logps/rejected": -205.4293975830078, + "loss": 0.1511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.050357699394226074, + "rewards/margins": 4.747934341430664, + "rewards/rejected": -4.7982916831970215, + "step": 7711 + }, + { + "epoch": 0.89, + "learning_rate": 3.378204377853213e-08, + "logits/chosen": -2.9629430770874023, + "logits/rejected": -2.8529162406921387, + "logps/chosen": -379.9742431640625, + "logps/rejected": -351.1012268066406, + "loss": 0.3219, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5136414766311646, + "rewards/margins": 1.9243948459625244, + "rewards/rejected": -3.4380362033843994, + "step": 7712 + }, + { + "epoch": 0.89, + "learning_rate": 3.374692730890788e-08, + "logits/chosen": -2.240494728088379, + "logits/rejected": -2.098135471343994, + "logps/chosen": -269.1435546875, + "logps/rejected": -210.62026977539062, + "loss": 0.3063, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4206077754497528, + "rewards/margins": 1.7435803413391113, + "rewards/rejected": -2.1641881465911865, + "step": 7713 + }, + { + "epoch": 0.89, + "learning_rate": 3.3711810839283625e-08, + "logits/chosen": -1.9595187902450562, + "logits/rejected": -2.5592446327209473, + "logps/chosen": -249.41598510742188, + "logps/rejected": -201.84925842285156, + "loss": 0.266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0638620853424072, + "rewards/margins": 1.9330099821090698, + "rewards/rejected": -2.9968719482421875, + "step": 7714 + }, + { + "epoch": 0.89, + "learning_rate": 3.3676694369659365e-08, + "logits/chosen": -1.9838345050811768, + "logits/rejected": -2.289902687072754, + "logps/chosen": -351.151611328125, + "logps/rejected": -275.84405517578125, + "loss": 0.3248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9869866967201233, + "rewards/margins": 1.3617656230926514, + "rewards/rejected": -2.34875226020813, + "step": 7715 + }, + { + "epoch": 0.89, + "learning_rate": 3.364157790003511e-08, + "logits/chosen": -2.4442012310028076, + "logits/rejected": -2.3965089321136475, + "logps/chosen": -231.9322509765625, + "logps/rejected": -215.55043029785156, + "loss": 0.2758, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16577228903770447, + "rewards/margins": 2.2073440551757812, + "rewards/rejected": -2.3731164932250977, + "step": 7716 + }, + { + "epoch": 0.89, + "learning_rate": 3.360646143041086e-08, + "logits/chosen": -2.5612940788269043, + "logits/rejected": -2.542301654815674, + "logps/chosen": -207.21385192871094, + "logps/rejected": -160.53207397460938, + "loss": 0.1921, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32207465171813965, + "rewards/margins": 2.7675223350524902, + "rewards/rejected": -3.08959698677063, + "step": 7717 + }, + { + "epoch": 0.89, + "learning_rate": 3.357134496078661e-08, + "logits/chosen": -1.5562989711761475, + "logits/rejected": -1.6994856595993042, + "logps/chosen": -298.48736572265625, + "logps/rejected": -260.163330078125, + "loss": 0.245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40806686878204346, + "rewards/margins": 2.5809226036071777, + "rewards/rejected": -2.9889895915985107, + "step": 7718 + }, + { + "epoch": 0.89, + "learning_rate": 3.3536228491162354e-08, + "logits/chosen": -2.368241310119629, + "logits/rejected": -2.2603142261505127, + "logps/chosen": -144.07940673828125, + "logps/rejected": -198.371337890625, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1694271564483643, + "rewards/margins": 2.140177011489868, + "rewards/rejected": -3.3096044063568115, + "step": 7719 + }, + { + "epoch": 0.89, + "learning_rate": 3.35011120215381e-08, + "logits/chosen": -2.5783305168151855, + "logits/rejected": -2.743602991104126, + "logps/chosen": -283.2409973144531, + "logps/rejected": -312.7596130371094, + "loss": 0.0924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8904169201850891, + "rewards/margins": 4.37298583984375, + "rewards/rejected": -5.263402938842773, + "step": 7720 + }, + { + "epoch": 0.89, + "learning_rate": 3.346599555191385e-08, + "logits/chosen": -2.326948642730713, + "logits/rejected": -2.2149157524108887, + "logps/chosen": -243.05870056152344, + "logps/rejected": -267.78240966796875, + "loss": 0.6566, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2033333778381348, + "rewards/margins": 0.9848553538322449, + "rewards/rejected": -2.1881890296936035, + "step": 7721 + }, + { + "epoch": 0.89, + "learning_rate": 3.343087908228959e-08, + "logits/chosen": -2.1927695274353027, + "logits/rejected": -2.0581259727478027, + "logps/chosen": -193.00616455078125, + "logps/rejected": -229.74795532226562, + "loss": 0.4215, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36048540472984314, + "rewards/margins": 3.1347875595092773, + "rewards/rejected": -3.4952731132507324, + "step": 7722 + }, + { + "epoch": 0.89, + "learning_rate": 3.339576261266534e-08, + "logits/chosen": -2.2859485149383545, + "logits/rejected": -2.2888731956481934, + "logps/chosen": -150.078369140625, + "logps/rejected": -220.9678192138672, + "loss": 0.4003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.846217691898346, + "rewards/margins": 3.4274165630340576, + "rewards/rejected": -4.273634433746338, + "step": 7723 + }, + { + "epoch": 0.89, + "learning_rate": 3.3360646143041084e-08, + "logits/chosen": -2.3326358795166016, + "logits/rejected": -2.278925657272339, + "logps/chosen": -256.3759765625, + "logps/rejected": -430.4447021484375, + "loss": 0.4039, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.865236222743988, + "rewards/margins": 2.2884349822998047, + "rewards/rejected": -3.1536712646484375, + "step": 7724 + }, + { + "epoch": 0.89, + "learning_rate": 3.332552967341683e-08, + "logits/chosen": -2.3243930339813232, + "logits/rejected": -1.9423987865447998, + "logps/chosen": -167.1228485107422, + "logps/rejected": -286.625244140625, + "loss": 0.3749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.569049060344696, + "rewards/margins": 3.236964225769043, + "rewards/rejected": -3.806013584136963, + "step": 7725 + }, + { + "epoch": 0.89, + "learning_rate": 3.329041320379258e-08, + "logits/chosen": -1.9611401557922363, + "logits/rejected": -2.1511223316192627, + "logps/chosen": -314.2109375, + "logps/rejected": -236.42477416992188, + "loss": 0.4264, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0847291946411133, + "rewards/margins": 2.718377113342285, + "rewards/rejected": -3.8031063079833984, + "step": 7726 + }, + { + "epoch": 0.89, + "learning_rate": 3.3255296734168326e-08, + "logits/chosen": -2.186511516571045, + "logits/rejected": -2.2881627082824707, + "logps/chosen": -143.94483947753906, + "logps/rejected": -230.45196533203125, + "loss": 0.2843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6710127592086792, + "rewards/margins": 2.728794813156128, + "rewards/rejected": -3.3998074531555176, + "step": 7727 + }, + { + "epoch": 0.89, + "learning_rate": 3.322018026454407e-08, + "logits/chosen": -3.084226608276367, + "logits/rejected": -2.964677572250366, + "logps/chosen": -257.337158203125, + "logps/rejected": -186.50665283203125, + "loss": 0.0917, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013799265027046204, + "rewards/margins": 3.8501641750335693, + "rewards/rejected": -3.836364984512329, + "step": 7728 + }, + { + "epoch": 0.89, + "learning_rate": 3.3185063794919814e-08, + "logits/chosen": -1.830977201461792, + "logits/rejected": -2.187405824661255, + "logps/chosen": -139.32431030273438, + "logps/rejected": -182.01861572265625, + "loss": 0.5483, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4898453950881958, + "rewards/margins": 0.7712026238441467, + "rewards/rejected": -2.2610480785369873, + "step": 7729 + }, + { + "epoch": 0.89, + "learning_rate": 3.314994732529556e-08, + "logits/chosen": -2.1425275802612305, + "logits/rejected": -2.404850482940674, + "logps/chosen": -472.3903503417969, + "logps/rejected": -218.79225158691406, + "loss": 1.2425, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8446526527404785, + "rewards/margins": -0.1943172812461853, + "rewards/rejected": -1.6503353118896484, + "step": 7730 + }, + { + "epoch": 0.89, + "learning_rate": 3.311483085567131e-08, + "logits/chosen": -2.09987211227417, + "logits/rejected": -2.0498032569885254, + "logps/chosen": -253.1121826171875, + "logps/rejected": -302.0452880859375, + "loss": 0.3878, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7232459783554077, + "rewards/margins": 2.5576093196868896, + "rewards/rejected": -3.280855417251587, + "step": 7731 + }, + { + "epoch": 0.89, + "learning_rate": 3.3079714386047055e-08, + "logits/chosen": -1.9961273670196533, + "logits/rejected": -2.176591634750366, + "logps/chosen": -262.44482421875, + "logps/rejected": -273.97833251953125, + "loss": 0.4829, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6331450939178467, + "rewards/margins": 2.037581205368042, + "rewards/rejected": -2.6707262992858887, + "step": 7732 + }, + { + "epoch": 0.89, + "learning_rate": 3.30445979164228e-08, + "logits/chosen": -1.799683690071106, + "logits/rejected": -1.6862168312072754, + "logps/chosen": -300.705322265625, + "logps/rejected": -320.76983642578125, + "loss": 0.2518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7959109544754028, + "rewards/margins": 1.7192158699035645, + "rewards/rejected": -2.515126943588257, + "step": 7733 + }, + { + "epoch": 0.89, + "learning_rate": 3.300948144679855e-08, + "logits/chosen": -2.518460750579834, + "logits/rejected": -2.697282075881958, + "logps/chosen": -249.45504760742188, + "logps/rejected": -332.825439453125, + "loss": 0.3533, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26817673444747925, + "rewards/margins": 2.9980459213256836, + "rewards/rejected": -3.2662229537963867, + "step": 7734 + }, + { + "epoch": 0.89, + "learning_rate": 3.29743649771743e-08, + "logits/chosen": -2.9512686729431152, + "logits/rejected": -3.0133256912231445, + "logps/chosen": -171.240234375, + "logps/rejected": -322.688232421875, + "loss": 0.2092, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3522701263427734, + "rewards/margins": 2.436440944671631, + "rewards/rejected": -3.7887110710144043, + "step": 7735 + }, + { + "epoch": 0.89, + "learning_rate": 3.2939248507550044e-08, + "logits/chosen": -2.124912738800049, + "logits/rejected": -2.072199821472168, + "logps/chosen": -185.34356689453125, + "logps/rejected": -247.97244262695312, + "loss": 0.7087, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.538901448249817, + "rewards/margins": 1.872093915939331, + "rewards/rejected": -3.4109952449798584, + "step": 7736 + }, + { + "epoch": 0.89, + "learning_rate": 3.2904132037925785e-08, + "logits/chosen": -2.603991985321045, + "logits/rejected": -2.7173566818237305, + "logps/chosen": -382.07159423828125, + "logps/rejected": -297.4794921875, + "loss": 0.3205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06581413745880127, + "rewards/margins": 2.829267978668213, + "rewards/rejected": -2.8950822353363037, + "step": 7737 + }, + { + "epoch": 0.89, + "learning_rate": 3.286901556830153e-08, + "logits/chosen": -2.252377986907959, + "logits/rejected": -2.299586296081543, + "logps/chosen": -336.9881591796875, + "logps/rejected": -349.07098388671875, + "loss": 0.5141, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5231863856315613, + "rewards/margins": 1.8218573331832886, + "rewards/rejected": -2.345043659210205, + "step": 7738 + }, + { + "epoch": 0.89, + "learning_rate": 3.283389909867728e-08, + "logits/chosen": -1.8107081651687622, + "logits/rejected": -2.202208995819092, + "logps/chosen": -333.9664306640625, + "logps/rejected": -241.58633422851562, + "loss": 0.961, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.42695951461792, + "rewards/margins": 0.6075176000595093, + "rewards/rejected": -2.0344769954681396, + "step": 7739 + }, + { + "epoch": 0.89, + "learning_rate": 3.2798782629053027e-08, + "logits/chosen": -2.263324737548828, + "logits/rejected": -2.0216193199157715, + "logps/chosen": -355.46612548828125, + "logps/rejected": -439.6827392578125, + "loss": 0.2672, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5651949644088745, + "rewards/margins": 3.4785306453704834, + "rewards/rejected": -4.043725490570068, + "step": 7740 + }, + { + "epoch": 0.89, + "learning_rate": 3.2763666159428774e-08, + "logits/chosen": -2.548990249633789, + "logits/rejected": -2.6619744300842285, + "logps/chosen": -343.2739562988281, + "logps/rejected": -277.1781921386719, + "loss": 0.5961, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.302166223526001, + "rewards/margins": 1.7737139463424683, + "rewards/rejected": -3.0758800506591797, + "step": 7741 + }, + { + "epoch": 0.89, + "learning_rate": 3.272854968980452e-08, + "logits/chosen": -2.747896909713745, + "logits/rejected": -2.48871111869812, + "logps/chosen": -204.14309692382812, + "logps/rejected": -287.70709228515625, + "loss": 0.2165, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6988859176635742, + "rewards/margins": 2.277465581893921, + "rewards/rejected": -2.976351261138916, + "step": 7742 + }, + { + "epoch": 0.89, + "learning_rate": 3.269343322018027e-08, + "logits/chosen": -1.966975450515747, + "logits/rejected": -1.8846971988677979, + "logps/chosen": -229.77044677734375, + "logps/rejected": -255.89849853515625, + "loss": 0.2937, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9470267295837402, + "rewards/margins": 2.1690938472747803, + "rewards/rejected": -3.1161205768585205, + "step": 7743 + }, + { + "epoch": 0.89, + "learning_rate": 3.265831675055601e-08, + "logits/chosen": -1.9962986707687378, + "logits/rejected": -2.088124990463257, + "logps/chosen": -322.7370910644531, + "logps/rejected": -307.5950012207031, + "loss": 0.1478, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0593178272247314, + "rewards/margins": 3.3310554027557373, + "rewards/rejected": -4.390373229980469, + "step": 7744 + }, + { + "epoch": 0.89, + "learning_rate": 3.2623200280931756e-08, + "logits/chosen": -2.0811052322387695, + "logits/rejected": -2.219136953353882, + "logps/chosen": -226.4538116455078, + "logps/rejected": -322.26824951171875, + "loss": 0.2255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18113026022911072, + "rewards/margins": 2.532777786254883, + "rewards/rejected": -2.7139079570770264, + "step": 7745 + }, + { + "epoch": 0.89, + "learning_rate": 3.25880838113075e-08, + "logits/chosen": -2.7677547931671143, + "logits/rejected": -2.7158756256103516, + "logps/chosen": -104.23442077636719, + "logps/rejected": -187.16384887695312, + "loss": 0.0838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5896985530853271, + "rewards/margins": 3.0544188022613525, + "rewards/rejected": -3.6441173553466797, + "step": 7746 + }, + { + "epoch": 0.89, + "learning_rate": 3.2552967341683244e-08, + "logits/chosen": -2.0530660152435303, + "logits/rejected": -2.154198408126831, + "logps/chosen": -424.9596862792969, + "logps/rejected": -314.5118408203125, + "loss": 0.3354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6192426085472107, + "rewards/margins": 2.5848464965820312, + "rewards/rejected": -3.2040889263153076, + "step": 7747 + }, + { + "epoch": 0.89, + "learning_rate": 3.251785087205899e-08, + "logits/chosen": -2.4322304725646973, + "logits/rejected": -2.4451401233673096, + "logps/chosen": -275.4624938964844, + "logps/rejected": -243.75515747070312, + "loss": 0.2805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02950447052717209, + "rewards/margins": 1.5647940635681152, + "rewards/rejected": -1.5942983627319336, + "step": 7748 + }, + { + "epoch": 0.89, + "learning_rate": 3.248273440243474e-08, + "logits/chosen": -2.6123709678649902, + "logits/rejected": -2.313234329223633, + "logps/chosen": -280.3338623046875, + "logps/rejected": -273.18896484375, + "loss": 0.2634, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.98757004737854, + "rewards/margins": 2.6663522720336914, + "rewards/rejected": -3.6539223194122314, + "step": 7749 + }, + { + "epoch": 0.89, + "learning_rate": 3.2447617932810486e-08, + "logits/chosen": -2.6038818359375, + "logits/rejected": -2.495328903198242, + "logps/chosen": -228.30548095703125, + "logps/rejected": -483.71246337890625, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2140063047409058, + "rewards/margins": 5.180537700653076, + "rewards/rejected": -6.3945441246032715, + "step": 7750 + }, + { + "epoch": 0.89, + "learning_rate": 3.241250146318623e-08, + "logits/chosen": -2.4154601097106934, + "logits/rejected": -2.5722076892852783, + "logps/chosen": -308.36700439453125, + "logps/rejected": -358.2683410644531, + "loss": 1.0602, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0255343914031982, + "rewards/margins": 1.2050687074661255, + "rewards/rejected": -3.230602979660034, + "step": 7751 + }, + { + "epoch": 0.89, + "learning_rate": 3.237738499356198e-08, + "logits/chosen": -2.543015718460083, + "logits/rejected": -2.6500048637390137, + "logps/chosen": -161.1113739013672, + "logps/rejected": -181.050048828125, + "loss": 0.4188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6071715354919434, + "rewards/margins": 1.930715799331665, + "rewards/rejected": -2.5378870964050293, + "step": 7752 + }, + { + "epoch": 0.89, + "learning_rate": 3.234226852393773e-08, + "logits/chosen": -2.1313719749450684, + "logits/rejected": -2.165459156036377, + "logps/chosen": -351.11688232421875, + "logps/rejected": -394.4210205078125, + "loss": 0.0945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1089693158864975, + "rewards/margins": 4.167938232421875, + "rewards/rejected": -4.058969020843506, + "step": 7753 + }, + { + "epoch": 0.89, + "learning_rate": 3.230715205431347e-08, + "logits/chosen": -2.5504045486450195, + "logits/rejected": -2.359442710876465, + "logps/chosen": -236.010498046875, + "logps/rejected": -266.6126708984375, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5201554298400879, + "rewards/margins": 1.540719985961914, + "rewards/rejected": -2.060875415802002, + "step": 7754 + }, + { + "epoch": 0.89, + "learning_rate": 3.2272035584689215e-08, + "logits/chosen": -2.2819700241088867, + "logits/rejected": -2.087265968322754, + "logps/chosen": -334.5153503417969, + "logps/rejected": -275.8519287109375, + "loss": 0.5106, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1885727643966675, + "rewards/margins": 2.1080799102783203, + "rewards/rejected": -3.2966527938842773, + "step": 7755 + }, + { + "epoch": 0.89, + "learning_rate": 3.223691911506496e-08, + "logits/chosen": -2.508798360824585, + "logits/rejected": -2.4639902114868164, + "logps/chosen": -411.9925537109375, + "logps/rejected": -294.4930114746094, + "loss": 0.3518, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.057823896408081, + "rewards/margins": 2.3190078735351562, + "rewards/rejected": -3.376831531524658, + "step": 7756 + }, + { + "epoch": 0.89, + "learning_rate": 3.220180264544071e-08, + "logits/chosen": -2.224127769470215, + "logits/rejected": -1.971278429031372, + "logps/chosen": -173.33242797851562, + "logps/rejected": -295.1640625, + "loss": 0.4549, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3937039375305176, + "rewards/margins": 2.251711845397949, + "rewards/rejected": -2.645415782928467, + "step": 7757 + }, + { + "epoch": 0.89, + "learning_rate": 3.216668617581646e-08, + "logits/chosen": -2.285060405731201, + "logits/rejected": -2.347956895828247, + "logps/chosen": -442.49969482421875, + "logps/rejected": -432.93408203125, + "loss": 0.5179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7350695133209229, + "rewards/margins": 1.0984224081039429, + "rewards/rejected": -1.8334919214248657, + "step": 7758 + }, + { + "epoch": 0.89, + "learning_rate": 3.2131569706192204e-08, + "logits/chosen": -2.1475989818573, + "logits/rejected": -2.4209847450256348, + "logps/chosen": -340.4014892578125, + "logps/rejected": -289.6851806640625, + "loss": 0.3649, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6641992926597595, + "rewards/margins": 2.8197975158691406, + "rewards/rejected": -3.483996629714966, + "step": 7759 + }, + { + "epoch": 0.89, + "learning_rate": 3.209645323656795e-08, + "logits/chosen": -2.2453129291534424, + "logits/rejected": -2.7549242973327637, + "logps/chosen": -487.5395202636719, + "logps/rejected": -347.4922180175781, + "loss": 0.4463, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4547771215438843, + "rewards/margins": 4.042242527008057, + "rewards/rejected": -4.4970197677612305, + "step": 7760 + }, + { + "epoch": 0.89, + "learning_rate": 3.206133676694369e-08, + "logits/chosen": -2.187450647354126, + "logits/rejected": -1.9313061237335205, + "logps/chosen": -132.9918670654297, + "logps/rejected": -280.1961975097656, + "loss": 0.6408, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.41056489944458, + "rewards/margins": 1.2950234413146973, + "rewards/rejected": -2.7055883407592773, + "step": 7761 + }, + { + "epoch": 0.89, + "learning_rate": 3.202622029731944e-08, + "logits/chosen": -2.1219654083251953, + "logits/rejected": -2.1014959812164307, + "logps/chosen": -325.26104736328125, + "logps/rejected": -308.1058044433594, + "loss": 0.1271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7653735876083374, + "rewards/margins": 3.0139031410217285, + "rewards/rejected": -3.7792773246765137, + "step": 7762 + }, + { + "epoch": 0.89, + "learning_rate": 3.1991103827695187e-08, + "logits/chosen": -2.397500514984131, + "logits/rejected": -2.525139570236206, + "logps/chosen": -336.96856689453125, + "logps/rejected": -404.4151306152344, + "loss": 0.1493, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1076405048370361, + "rewards/margins": 3.7542176246643066, + "rewards/rejected": -4.861857891082764, + "step": 7763 + }, + { + "epoch": 0.9, + "learning_rate": 3.1955987358070934e-08, + "logits/chosen": -2.578953504562378, + "logits/rejected": -2.502467155456543, + "logps/chosen": -192.2303466796875, + "logps/rejected": -217.60789489746094, + "loss": 0.2602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5186965465545654, + "rewards/margins": 1.459625244140625, + "rewards/rejected": -1.9783217906951904, + "step": 7764 + }, + { + "epoch": 0.9, + "learning_rate": 3.192087088844668e-08, + "logits/chosen": -2.426961898803711, + "logits/rejected": -2.601515293121338, + "logps/chosen": -374.6986389160156, + "logps/rejected": -163.15850830078125, + "loss": 0.5043, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7077993154525757, + "rewards/margins": 2.1536941528320312, + "rewards/rejected": -3.8614935874938965, + "step": 7765 + }, + { + "epoch": 0.9, + "learning_rate": 3.188575441882243e-08, + "logits/chosen": -2.3310935497283936, + "logits/rejected": -2.2450833320617676, + "logps/chosen": -105.08920288085938, + "logps/rejected": -208.11073303222656, + "loss": 0.8555, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.071372628211975, + "rewards/margins": 1.5059012174606323, + "rewards/rejected": -2.5772738456726074, + "step": 7766 + }, + { + "epoch": 0.9, + "learning_rate": 3.1850637949198175e-08, + "logits/chosen": -2.1022727489471436, + "logits/rejected": -2.0405924320220947, + "logps/chosen": -281.078857421875, + "logps/rejected": -361.3533935546875, + "loss": 0.2874, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5915601253509521, + "rewards/margins": 3.969989538192749, + "rewards/rejected": -4.561549663543701, + "step": 7767 + }, + { + "epoch": 0.9, + "learning_rate": 3.1815521479573916e-08, + "logits/chosen": -2.629926919937134, + "logits/rejected": -2.532914638519287, + "logps/chosen": -258.6230163574219, + "logps/rejected": -385.47344970703125, + "loss": 0.9862, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8543758392333984, + "rewards/margins": 0.5252612829208374, + "rewards/rejected": -2.3796372413635254, + "step": 7768 + }, + { + "epoch": 0.9, + "learning_rate": 3.1780405009949663e-08, + "logits/chosen": -2.487126588821411, + "logits/rejected": -2.5545263290405273, + "logps/chosen": -332.4251403808594, + "logps/rejected": -157.8769989013672, + "loss": 0.2378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8402407169342041, + "rewards/margins": 1.8422471284866333, + "rewards/rejected": -2.682487726211548, + "step": 7769 + }, + { + "epoch": 0.9, + "learning_rate": 3.174528854032541e-08, + "logits/chosen": -2.2373037338256836, + "logits/rejected": -2.2716064453125, + "logps/chosen": -434.1160888671875, + "logps/rejected": -359.14154052734375, + "loss": 0.4235, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0993608236312866, + "rewards/margins": 1.7002687454223633, + "rewards/rejected": -2.7996296882629395, + "step": 7770 + }, + { + "epoch": 0.9, + "learning_rate": 3.171017207070116e-08, + "logits/chosen": -2.002115488052368, + "logits/rejected": -2.087672710418701, + "logps/chosen": -238.25009155273438, + "logps/rejected": -295.51995849609375, + "loss": 0.1509, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30523765087127686, + "rewards/margins": 3.653996706008911, + "rewards/rejected": -3.3487589359283447, + "step": 7771 + }, + { + "epoch": 0.9, + "learning_rate": 3.1675055601076905e-08, + "logits/chosen": -2.629565954208374, + "logits/rejected": -2.5598199367523193, + "logps/chosen": -218.02427673339844, + "logps/rejected": -355.326171875, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4533228874206543, + "rewards/margins": 4.261801719665527, + "rewards/rejected": -5.71512508392334, + "step": 7772 + }, + { + "epoch": 0.9, + "learning_rate": 3.163993913145265e-08, + "logits/chosen": -2.6964526176452637, + "logits/rejected": -2.7621726989746094, + "logps/chosen": -330.0565185546875, + "logps/rejected": -246.92007446289062, + "loss": 0.289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8890577554702759, + "rewards/margins": 2.471423864364624, + "rewards/rejected": -3.3604817390441895, + "step": 7773 + }, + { + "epoch": 0.9, + "learning_rate": 3.16048226618284e-08, + "logits/chosen": -2.376642942428589, + "logits/rejected": -2.3509509563446045, + "logps/chosen": -420.1831359863281, + "logps/rejected": -318.1520080566406, + "loss": 0.4157, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3422211408615112, + "rewards/margins": 1.6447314023971558, + "rewards/rejected": -2.986952543258667, + "step": 7774 + }, + { + "epoch": 0.9, + "learning_rate": 3.156970619220414e-08, + "logits/chosen": -2.5532329082489014, + "logits/rejected": -2.566293478012085, + "logps/chosen": -278.35888671875, + "logps/rejected": -284.6294250488281, + "loss": 0.3822, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1648955345153809, + "rewards/margins": 3.350891590118408, + "rewards/rejected": -4.515787124633789, + "step": 7775 + }, + { + "epoch": 0.9, + "learning_rate": 3.153458972257989e-08, + "logits/chosen": -1.9072871208190918, + "logits/rejected": -1.9228417873382568, + "logps/chosen": -244.8494873046875, + "logps/rejected": -300.0744323730469, + "loss": 0.1225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2642182409763336, + "rewards/margins": 3.8407561779022217, + "rewards/rejected": -4.104974269866943, + "step": 7776 + }, + { + "epoch": 0.9, + "learning_rate": 3.1499473252955635e-08, + "logits/chosen": -2.452558755874634, + "logits/rejected": -2.5038838386535645, + "logps/chosen": -379.39013671875, + "logps/rejected": -322.1480407714844, + "loss": 0.7113, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.288236379623413, + "rewards/margins": 0.477772057056427, + "rewards/rejected": -1.7660084962844849, + "step": 7777 + }, + { + "epoch": 0.9, + "learning_rate": 3.146435678333138e-08, + "logits/chosen": -2.550894260406494, + "logits/rejected": -2.5706443786621094, + "logps/chosen": -191.433349609375, + "logps/rejected": -176.70462036132812, + "loss": 0.4115, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0338395833969116, + "rewards/margins": 1.8496990203857422, + "rewards/rejected": -2.8835384845733643, + "step": 7778 + }, + { + "epoch": 0.9, + "learning_rate": 3.142924031370713e-08, + "logits/chosen": -2.1405017375946045, + "logits/rejected": -2.060760021209717, + "logps/chosen": -211.12245178222656, + "logps/rejected": -234.83839416503906, + "loss": 0.5118, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8802003860473633, + "rewards/margins": 0.799628496170044, + "rewards/rejected": -2.6798288822174072, + "step": 7779 + }, + { + "epoch": 0.9, + "learning_rate": 3.1394123844082876e-08, + "logits/chosen": -2.2091104984283447, + "logits/rejected": -2.2864465713500977, + "logps/chosen": -341.756591796875, + "logps/rejected": -200.90536499023438, + "loss": 0.3293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25639039278030396, + "rewards/margins": 2.0032708644866943, + "rewards/rejected": -2.2596611976623535, + "step": 7780 + }, + { + "epoch": 0.9, + "learning_rate": 3.1359007374458624e-08, + "logits/chosen": -2.727565050125122, + "logits/rejected": -2.3346498012542725, + "logps/chosen": -330.0169982910156, + "logps/rejected": -336.78253173828125, + "loss": 0.5474, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0789263248443604, + "rewards/margins": 0.8852009773254395, + "rewards/rejected": -1.9641271829605103, + "step": 7781 + }, + { + "epoch": 0.9, + "learning_rate": 3.1323890904834364e-08, + "logits/chosen": -2.280982255935669, + "logits/rejected": -2.1240129470825195, + "logps/chosen": -282.5441589355469, + "logps/rejected": -216.00868225097656, + "loss": 0.4201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8011537790298462, + "rewards/margins": 1.0181472301483154, + "rewards/rejected": -1.819300889968872, + "step": 7782 + }, + { + "epoch": 0.9, + "learning_rate": 3.128877443521011e-08, + "logits/chosen": -2.611847400665283, + "logits/rejected": -2.7618069648742676, + "logps/chosen": -193.86331176757812, + "logps/rejected": -210.12936401367188, + "loss": 0.293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8858484625816345, + "rewards/margins": 2.350532293319702, + "rewards/rejected": -3.2363805770874023, + "step": 7783 + }, + { + "epoch": 0.9, + "learning_rate": 3.125365796558586e-08, + "logits/chosen": -2.438551425933838, + "logits/rejected": -2.239175796508789, + "logps/chosen": -216.66053771972656, + "logps/rejected": -327.38885498046875, + "loss": 0.63, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5215566158294678, + "rewards/margins": 2.8829095363616943, + "rewards/rejected": -4.404466152191162, + "step": 7784 + }, + { + "epoch": 0.9, + "learning_rate": 3.1218541495961606e-08, + "logits/chosen": -2.634230852127075, + "logits/rejected": -2.705084800720215, + "logps/chosen": -402.6549072265625, + "logps/rejected": -267.1183166503906, + "loss": 0.1307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3616647720336914, + "rewards/margins": 2.829927921295166, + "rewards/rejected": -3.1915926933288574, + "step": 7785 + }, + { + "epoch": 0.9, + "learning_rate": 3.118342502633735e-08, + "logits/chosen": -2.3530640602111816, + "logits/rejected": -2.2961254119873047, + "logps/chosen": -318.0668029785156, + "logps/rejected": -459.8592529296875, + "loss": 0.5723, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4223897457122803, + "rewards/margins": 1.2273802757263184, + "rewards/rejected": -1.6497700214385986, + "step": 7786 + }, + { + "epoch": 0.9, + "learning_rate": 3.11483085567131e-08, + "logits/chosen": -1.9521081447601318, + "logits/rejected": -2.30452823638916, + "logps/chosen": -508.573974609375, + "logps/rejected": -316.1066589355469, + "loss": 1.0907, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.157536029815674, + "rewards/margins": 0.4950718879699707, + "rewards/rejected": -2.6526081562042236, + "step": 7787 + }, + { + "epoch": 0.9, + "learning_rate": 3.111319208708885e-08, + "logits/chosen": -2.0856075286865234, + "logits/rejected": -2.417038917541504, + "logps/chosen": -306.369140625, + "logps/rejected": -373.4622497558594, + "loss": 0.2875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6579126715660095, + "rewards/margins": 2.3737106323242188, + "rewards/rejected": -3.031623125076294, + "step": 7788 + }, + { + "epoch": 0.9, + "learning_rate": 3.107807561746459e-08, + "logits/chosen": -2.4752423763275146, + "logits/rejected": -1.870532751083374, + "logps/chosen": -194.5684051513672, + "logps/rejected": -367.83795166015625, + "loss": 0.4033, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0880169868469238, + "rewards/margins": 1.3346867561340332, + "rewards/rejected": -2.422703742980957, + "step": 7789 + }, + { + "epoch": 0.9, + "learning_rate": 3.1042959147840335e-08, + "logits/chosen": -2.582197427749634, + "logits/rejected": -2.665958881378174, + "logps/chosen": -296.4146423339844, + "logps/rejected": -262.8143615722656, + "loss": 0.0955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6048218607902527, + "rewards/margins": 3.9760491847991943, + "rewards/rejected": -4.580871105194092, + "step": 7790 + }, + { + "epoch": 0.9, + "learning_rate": 3.100784267821608e-08, + "logits/chosen": -2.7052202224731445, + "logits/rejected": -2.8574154376983643, + "logps/chosen": -340.2071533203125, + "logps/rejected": -265.6077880859375, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2383109331130981, + "rewards/margins": 2.1690471172332764, + "rewards/rejected": -3.407357692718506, + "step": 7791 + }, + { + "epoch": 0.9, + "learning_rate": 3.0972726208591823e-08, + "logits/chosen": -2.4364800453186035, + "logits/rejected": -2.4661507606506348, + "logps/chosen": -213.91551208496094, + "logps/rejected": -193.0897979736328, + "loss": 0.3914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1993294358253479, + "rewards/margins": 2.296081066131592, + "rewards/rejected": -2.495410442352295, + "step": 7792 + }, + { + "epoch": 0.9, + "learning_rate": 3.093760973896757e-08, + "logits/chosen": -1.7939965724945068, + "logits/rejected": -1.6910228729248047, + "logps/chosen": -289.55926513671875, + "logps/rejected": -361.628662109375, + "loss": 0.3355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9797838926315308, + "rewards/margins": 3.868645668029785, + "rewards/rejected": -4.848429203033447, + "step": 7793 + }, + { + "epoch": 0.9, + "learning_rate": 3.090249326934332e-08, + "logits/chosen": -2.182715892791748, + "logits/rejected": -2.2860522270202637, + "logps/chosen": -263.1430969238281, + "logps/rejected": -217.00640869140625, + "loss": 0.414, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8084133267402649, + "rewards/margins": 2.134078025817871, + "rewards/rejected": -2.942491292953491, + "step": 7794 + }, + { + "epoch": 0.9, + "learning_rate": 3.0867376799719065e-08, + "logits/chosen": -2.554781913757324, + "logits/rejected": -2.3610916137695312, + "logps/chosen": -166.81283569335938, + "logps/rejected": -279.2550964355469, + "loss": 0.619, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2542524337768555, + "rewards/margins": 1.6425656080245972, + "rewards/rejected": -2.896818161010742, + "step": 7795 + }, + { + "epoch": 0.9, + "learning_rate": 3.083226033009481e-08, + "logits/chosen": -2.3763909339904785, + "logits/rejected": -2.5308380126953125, + "logps/chosen": -235.40872192382812, + "logps/rejected": -177.53448486328125, + "loss": 0.7911, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.412224531173706, + "rewards/margins": 1.820230484008789, + "rewards/rejected": -3.232455015182495, + "step": 7796 + }, + { + "epoch": 0.9, + "learning_rate": 3.079714386047056e-08, + "logits/chosen": -1.7120732069015503, + "logits/rejected": -1.969780445098877, + "logps/chosen": -419.9483337402344, + "logps/rejected": -304.21295166015625, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3473223447799683, + "rewards/margins": 2.035573720932007, + "rewards/rejected": -3.3828959465026855, + "step": 7797 + }, + { + "epoch": 0.9, + "learning_rate": 3.076202739084631e-08, + "logits/chosen": -1.7464556694030762, + "logits/rejected": -1.7187163829803467, + "logps/chosen": -423.1590270996094, + "logps/rejected": -468.7716979980469, + "loss": 0.423, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1322236061096191, + "rewards/margins": 0.9397507905960083, + "rewards/rejected": -2.071974515914917, + "step": 7798 + }, + { + "epoch": 0.9, + "learning_rate": 3.072691092122205e-08, + "logits/chosen": -2.7088658809661865, + "logits/rejected": -2.3454856872558594, + "logps/chosen": -297.9410400390625, + "logps/rejected": -456.4759826660156, + "loss": 0.1653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2268062829971313, + "rewards/margins": 2.6993210315704346, + "rewards/rejected": -3.9261274337768555, + "step": 7799 + }, + { + "epoch": 0.9, + "learning_rate": 3.0691794451597795e-08, + "logits/chosen": -1.507920742034912, + "logits/rejected": -1.8162438869476318, + "logps/chosen": -413.73089599609375, + "logps/rejected": -320.82855224609375, + "loss": 0.3479, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33466795086860657, + "rewards/margins": 1.2817670106887817, + "rewards/rejected": -1.616434931755066, + "step": 7800 + }, + { + "epoch": 0.9, + "learning_rate": 3.065667798197354e-08, + "logits/chosen": -2.5648887157440186, + "logits/rejected": -2.6806721687316895, + "logps/chosen": -291.9832458496094, + "logps/rejected": -255.92315673828125, + "loss": 0.2378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7362032532691956, + "rewards/margins": 3.6562488079071045, + "rewards/rejected": -4.392451763153076, + "step": 7801 + }, + { + "epoch": 0.9, + "learning_rate": 3.062156151234929e-08, + "logits/chosen": -1.5648744106292725, + "logits/rejected": -1.365364909172058, + "logps/chosen": -414.5032043457031, + "logps/rejected": -455.41571044921875, + "loss": 0.5628, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8465229272842407, + "rewards/margins": 1.0834910869598389, + "rewards/rejected": -1.9300140142440796, + "step": 7802 + }, + { + "epoch": 0.9, + "learning_rate": 3.0586445042725036e-08, + "logits/chosen": -2.047071695327759, + "logits/rejected": -2.1955389976501465, + "logps/chosen": -329.52484130859375, + "logps/rejected": -307.8638000488281, + "loss": 0.1334, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.218333899974823, + "rewards/margins": 2.6998379230499268, + "rewards/rejected": -2.481503963470459, + "step": 7803 + }, + { + "epoch": 0.9, + "learning_rate": 3.0551328573100784e-08, + "logits/chosen": -2.183506965637207, + "logits/rejected": -2.5473742485046387, + "logps/chosen": -289.09405517578125, + "logps/rejected": -172.39447021484375, + "loss": 0.3855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20103585720062256, + "rewards/margins": 2.950186252593994, + "rewards/rejected": -3.151221990585327, + "step": 7804 + }, + { + "epoch": 0.9, + "learning_rate": 3.051621210347653e-08, + "logits/chosen": -1.4534153938293457, + "logits/rejected": -1.14898681640625, + "logps/chosen": -295.1035461425781, + "logps/rejected": -390.8168029785156, + "loss": 0.6087, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5071995258331299, + "rewards/margins": 0.6224125623703003, + "rewards/rejected": -2.1296122074127197, + "step": 7805 + }, + { + "epoch": 0.9, + "learning_rate": 3.048109563385227e-08, + "logits/chosen": -2.443800926208496, + "logits/rejected": -2.5755677223205566, + "logps/chosen": -283.05126953125, + "logps/rejected": -231.8121337890625, + "loss": 0.3115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5054517388343811, + "rewards/margins": 2.382434844970703, + "rewards/rejected": -2.8878865242004395, + "step": 7806 + }, + { + "epoch": 0.9, + "learning_rate": 3.044597916422802e-08, + "logits/chosen": -2.3252687454223633, + "logits/rejected": -2.305295944213867, + "logps/chosen": -109.73515319824219, + "logps/rejected": -123.11703491210938, + "loss": 0.7634, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9885817170143127, + "rewards/margins": 0.8287754058837891, + "rewards/rejected": -1.8173571825027466, + "step": 7807 + }, + { + "epoch": 0.9, + "learning_rate": 3.0410862694603766e-08, + "logits/chosen": -1.9996016025543213, + "logits/rejected": -2.1535747051239014, + "logps/chosen": -652.5499267578125, + "logps/rejected": -495.08978271484375, + "loss": 0.1458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13884106278419495, + "rewards/margins": 2.455857753753662, + "rewards/rejected": -2.594698905944824, + "step": 7808 + }, + { + "epoch": 0.9, + "learning_rate": 3.037574622497951e-08, + "logits/chosen": -2.229231595993042, + "logits/rejected": -2.139097213745117, + "logps/chosen": -275.2127990722656, + "logps/rejected": -285.0658874511719, + "loss": 0.2146, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49773961305618286, + "rewards/margins": 2.6398215293884277, + "rewards/rejected": -3.137561321258545, + "step": 7809 + }, + { + "epoch": 0.9, + "learning_rate": 3.034062975535526e-08, + "logits/chosen": -2.669858932495117, + "logits/rejected": -2.524514675140381, + "logps/chosen": -220.32748413085938, + "logps/rejected": -240.64810180664062, + "loss": 0.1944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3565008044242859, + "rewards/margins": 2.8311405181884766, + "rewards/rejected": -3.1876413822174072, + "step": 7810 + }, + { + "epoch": 0.9, + "learning_rate": 3.030551328573101e-08, + "logits/chosen": -1.9021483659744263, + "logits/rejected": -1.8970377445220947, + "logps/chosen": -363.2001037597656, + "logps/rejected": -400.2444152832031, + "loss": 0.4769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6321268081665039, + "rewards/margins": 2.763423204421997, + "rewards/rejected": -3.395549774169922, + "step": 7811 + }, + { + "epoch": 0.9, + "learning_rate": 3.0270396816106755e-08, + "logits/chosen": -2.115290880203247, + "logits/rejected": -1.8719067573547363, + "logps/chosen": -147.27976989746094, + "logps/rejected": -312.0670471191406, + "loss": 0.7443, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1722807884216309, + "rewards/margins": 1.5292608737945557, + "rewards/rejected": -2.7015419006347656, + "step": 7812 + }, + { + "epoch": 0.9, + "learning_rate": 3.0235280346482495e-08, + "logits/chosen": -2.171464681625366, + "logits/rejected": -2.192656993865967, + "logps/chosen": -314.113037109375, + "logps/rejected": -223.01129150390625, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7714571356773376, + "rewards/margins": 1.9436366558074951, + "rewards/rejected": -2.7150938510894775, + "step": 7813 + }, + { + "epoch": 0.9, + "learning_rate": 3.020016387685824e-08, + "logits/chosen": -2.4287211894989014, + "logits/rejected": -2.4628632068634033, + "logps/chosen": -278.2745361328125, + "logps/rejected": -191.72544860839844, + "loss": 0.1398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09307196736335754, + "rewards/margins": 3.703491449356079, + "rewards/rejected": -3.610419750213623, + "step": 7814 + }, + { + "epoch": 0.9, + "learning_rate": 3.016504740723399e-08, + "logits/chosen": -2.8216521739959717, + "logits/rejected": -2.8200855255126953, + "logps/chosen": -299.74615478515625, + "logps/rejected": -331.3973083496094, + "loss": 0.301, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6103441119194031, + "rewards/margins": 2.3404452800750732, + "rewards/rejected": -2.950789451599121, + "step": 7815 + }, + { + "epoch": 0.9, + "learning_rate": 3.012993093760974e-08, + "logits/chosen": -2.646479368209839, + "logits/rejected": -2.534156084060669, + "logps/chosen": -119.87173461914062, + "logps/rejected": -203.85565185546875, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1931735873222351, + "rewards/margins": 2.4635331630706787, + "rewards/rejected": -2.6567068099975586, + "step": 7816 + }, + { + "epoch": 0.9, + "learning_rate": 3.0094814467985484e-08, + "logits/chosen": -2.27001953125, + "logits/rejected": -2.4568557739257812, + "logps/chosen": -473.3355712890625, + "logps/rejected": -322.6025085449219, + "loss": 0.6414, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.404711961746216, + "rewards/margins": 1.5442450046539307, + "rewards/rejected": -4.9489569664001465, + "step": 7817 + }, + { + "epoch": 0.9, + "learning_rate": 3.005969799836123e-08, + "logits/chosen": -2.6045541763305664, + "logits/rejected": -2.6652793884277344, + "logps/chosen": -299.11822509765625, + "logps/rejected": -264.7386169433594, + "loss": 0.1291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2418922483921051, + "rewards/margins": 3.8544530868530273, + "rewards/rejected": -4.096344947814941, + "step": 7818 + }, + { + "epoch": 0.9, + "learning_rate": 3.002458152873698e-08, + "logits/chosen": -2.053269147872925, + "logits/rejected": -1.712350606918335, + "logps/chosen": -272.300048828125, + "logps/rejected": -284.70159912109375, + "loss": 0.8002, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2194459438323975, + "rewards/margins": 0.872439980506897, + "rewards/rejected": -2.091885805130005, + "step": 7819 + }, + { + "epoch": 0.9, + "learning_rate": 2.9989465059112726e-08, + "logits/chosen": -2.2350566387176514, + "logits/rejected": -2.389604091644287, + "logps/chosen": -394.4754638671875, + "logps/rejected": -307.9250793457031, + "loss": 0.6078, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.334620475769043, + "rewards/margins": 1.5053985118865967, + "rewards/rejected": -2.8400192260742188, + "step": 7820 + }, + { + "epoch": 0.9, + "learning_rate": 2.995434858948847e-08, + "logits/chosen": -1.791680932044983, + "logits/rejected": -1.8963921070098877, + "logps/chosen": -169.24851989746094, + "logps/rejected": -208.86190795898438, + "loss": 0.2096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20897822082042694, + "rewards/margins": 2.014077663421631, + "rewards/rejected": -2.223055839538574, + "step": 7821 + }, + { + "epoch": 0.9, + "learning_rate": 2.9919232119864214e-08, + "logits/chosen": -2.4243922233581543, + "logits/rejected": -2.490581750869751, + "logps/chosen": -202.53167724609375, + "logps/rejected": -308.45550537109375, + "loss": 0.3183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38128605484962463, + "rewards/margins": 3.1219162940979004, + "rewards/rejected": -3.5032026767730713, + "step": 7822 + }, + { + "epoch": 0.9, + "learning_rate": 2.988411565023996e-08, + "logits/chosen": -2.2597763538360596, + "logits/rejected": -2.5087010860443115, + "logps/chosen": -398.4971618652344, + "logps/rejected": -316.57720947265625, + "loss": 0.2795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9495426416397095, + "rewards/margins": 2.1709370613098145, + "rewards/rejected": -3.1204795837402344, + "step": 7823 + }, + { + "epoch": 0.9, + "learning_rate": 2.984899918061571e-08, + "logits/chosen": -2.689176082611084, + "logits/rejected": -2.5018444061279297, + "logps/chosen": -79.74661254882812, + "logps/rejected": -407.9849853515625, + "loss": 0.258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2763814628124237, + "rewards/margins": 4.591627597808838, + "rewards/rejected": -4.868008613586426, + "step": 7824 + }, + { + "epoch": 0.9, + "learning_rate": 2.9813882710991456e-08, + "logits/chosen": -2.632117986679077, + "logits/rejected": -2.705517292022705, + "logps/chosen": -156.12716674804688, + "logps/rejected": -193.8616485595703, + "loss": 0.337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3943396210670471, + "rewards/margins": 1.455801248550415, + "rewards/rejected": -1.8501408100128174, + "step": 7825 + }, + { + "epoch": 0.9, + "learning_rate": 2.97787662413672e-08, + "logits/chosen": -2.040623188018799, + "logits/rejected": -2.325932025909424, + "logps/chosen": -260.33819580078125, + "logps/rejected": -231.10604858398438, + "loss": 0.5446, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.212012767791748, + "rewards/margins": 2.274451732635498, + "rewards/rejected": -3.486464500427246, + "step": 7826 + }, + { + "epoch": 0.9, + "learning_rate": 2.9743649771742947e-08, + "logits/chosen": -2.6034255027770996, + "logits/rejected": -2.2943875789642334, + "logps/chosen": -136.12576293945312, + "logps/rejected": -263.41351318359375, + "loss": 0.2423, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4723179340362549, + "rewards/margins": 1.8738491535186768, + "rewards/rejected": -3.3461670875549316, + "step": 7827 + }, + { + "epoch": 0.9, + "learning_rate": 2.9708533302118694e-08, + "logits/chosen": -2.411874294281006, + "logits/rejected": -2.0416173934936523, + "logps/chosen": -331.6393737792969, + "logps/rejected": -303.662841796875, + "loss": 0.4744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8066532611846924, + "rewards/margins": 2.549976110458374, + "rewards/rejected": -3.3566291332244873, + "step": 7828 + }, + { + "epoch": 0.9, + "learning_rate": 2.967341683249444e-08, + "logits/chosen": -2.108121633529663, + "logits/rejected": -2.1941800117492676, + "logps/chosen": -239.42745971679688, + "logps/rejected": -254.42144775390625, + "loss": 0.3619, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2854317426681519, + "rewards/margins": 1.7613493204116821, + "rewards/rejected": -3.046781063079834, + "step": 7829 + }, + { + "epoch": 0.9, + "learning_rate": 2.9638300362870185e-08, + "logits/chosen": -2.3079652786254883, + "logits/rejected": -2.32143497467041, + "logps/chosen": -255.05517578125, + "logps/rejected": -341.39056396484375, + "loss": 0.42, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2284395694732666, + "rewards/margins": 2.8419506549835205, + "rewards/rejected": -4.070389747619629, + "step": 7830 + }, + { + "epoch": 0.9, + "learning_rate": 2.9603183893245932e-08, + "logits/chosen": -2.174834728240967, + "logits/rejected": -2.0799508094787598, + "logps/chosen": -368.1812438964844, + "logps/rejected": -421.82843017578125, + "loss": 0.5292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26333343982696533, + "rewards/margins": 1.1822445392608643, + "rewards/rejected": -1.4455780982971191, + "step": 7831 + }, + { + "epoch": 0.9, + "learning_rate": 2.956806742362168e-08, + "logits/chosen": -2.3146181106567383, + "logits/rejected": -2.2737996578216553, + "logps/chosen": -255.1608123779297, + "logps/rejected": -316.06182861328125, + "loss": 0.5721, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7390213012695312, + "rewards/margins": 1.5259342193603516, + "rewards/rejected": -3.264955520629883, + "step": 7832 + }, + { + "epoch": 0.9, + "learning_rate": 2.9532950953997424e-08, + "logits/chosen": -2.0112504959106445, + "logits/rejected": -2.0153379440307617, + "logps/chosen": -297.5968017578125, + "logps/rejected": -312.0605773925781, + "loss": 0.2415, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5977417826652527, + "rewards/margins": 2.206505298614502, + "rewards/rejected": -2.8042473793029785, + "step": 7833 + }, + { + "epoch": 0.9, + "learning_rate": 2.949783448437317e-08, + "logits/chosen": -2.3405203819274902, + "logits/rejected": -2.201765775680542, + "logps/chosen": -247.38763427734375, + "logps/rejected": -364.97442626953125, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8441212177276611, + "rewards/margins": 4.5331621170043945, + "rewards/rejected": -5.377283096313477, + "step": 7834 + }, + { + "epoch": 0.9, + "learning_rate": 2.9462718014748918e-08, + "logits/chosen": -2.8152706623077393, + "logits/rejected": -2.838977575302124, + "logps/chosen": -386.3665771484375, + "logps/rejected": -233.84359741210938, + "loss": 0.729, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7552316188812256, + "rewards/margins": 1.991896629333496, + "rewards/rejected": -3.7471280097961426, + "step": 7835 + }, + { + "epoch": 0.9, + "learning_rate": 2.9427601545124665e-08, + "logits/chosen": -1.9159830808639526, + "logits/rejected": -2.0551445484161377, + "logps/chosen": -521.6961059570312, + "logps/rejected": -375.7913513183594, + "loss": 0.4275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27315986156463623, + "rewards/margins": 1.4468343257904053, + "rewards/rejected": -1.719994068145752, + "step": 7836 + }, + { + "epoch": 0.9, + "learning_rate": 2.939248507550041e-08, + "logits/chosen": -2.525613307952881, + "logits/rejected": -2.535604953765869, + "logps/chosen": -292.1324157714844, + "logps/rejected": -278.1415710449219, + "loss": 0.156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4178040623664856, + "rewards/margins": 3.467650890350342, + "rewards/rejected": -3.8854548931121826, + "step": 7837 + }, + { + "epoch": 0.9, + "learning_rate": 2.9357368605876157e-08, + "logits/chosen": -2.725311756134033, + "logits/rejected": -2.6884446144104004, + "logps/chosen": -276.5940856933594, + "logps/rejected": -293.69573974609375, + "loss": 0.3687, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6136242151260376, + "rewards/margins": 2.564065456390381, + "rewards/rejected": -4.177689552307129, + "step": 7838 + }, + { + "epoch": 0.9, + "learning_rate": 2.9322252136251897e-08, + "logits/chosen": -1.9399254322052002, + "logits/rejected": -1.9505587816238403, + "logps/chosen": -347.7784423828125, + "logps/rejected": -301.96295166015625, + "loss": 0.4339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8179692625999451, + "rewards/margins": 2.3594350814819336, + "rewards/rejected": -3.1774044036865234, + "step": 7839 + }, + { + "epoch": 0.9, + "learning_rate": 2.9287135666627644e-08, + "logits/chosen": -2.159005641937256, + "logits/rejected": -2.176974058151245, + "logps/chosen": -211.04415893554688, + "logps/rejected": -286.5129699707031, + "loss": 0.6629, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1113959550857544, + "rewards/margins": 1.8426814079284668, + "rewards/rejected": -2.9540774822235107, + "step": 7840 + }, + { + "epoch": 0.9, + "learning_rate": 2.925201919700339e-08, + "logits/chosen": -2.740962266921997, + "logits/rejected": -2.8250904083251953, + "logps/chosen": -258.8719482421875, + "logps/rejected": -206.56103515625, + "loss": 0.707, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6633591651916504, + "rewards/margins": 2.783994197845459, + "rewards/rejected": -4.447353363037109, + "step": 7841 + }, + { + "epoch": 0.9, + "learning_rate": 2.921690272737914e-08, + "logits/chosen": -2.6476807594299316, + "logits/rejected": -2.7273342609405518, + "logps/chosen": -143.66836547851562, + "logps/rejected": -151.08035278320312, + "loss": 0.2424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5763943791389465, + "rewards/margins": 2.6553730964660645, + "rewards/rejected": -3.2317676544189453, + "step": 7842 + }, + { + "epoch": 0.9, + "learning_rate": 2.9181786257754883e-08, + "logits/chosen": -2.6416311264038086, + "logits/rejected": -2.7174124717712402, + "logps/chosen": -355.9296875, + "logps/rejected": -201.6541748046875, + "loss": 0.3312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3323820233345032, + "rewards/margins": 1.8370152711868286, + "rewards/rejected": -2.1693973541259766, + "step": 7843 + }, + { + "epoch": 0.9, + "learning_rate": 2.914666978813063e-08, + "logits/chosen": -2.219940423965454, + "logits/rejected": -2.285609722137451, + "logps/chosen": -205.7432098388672, + "logps/rejected": -285.0948181152344, + "loss": 0.1835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4212843179702759, + "rewards/margins": 2.589872121810913, + "rewards/rejected": -3.0111565589904785, + "step": 7844 + }, + { + "epoch": 0.9, + "learning_rate": 2.9111553318506377e-08, + "logits/chosen": -2.021092414855957, + "logits/rejected": -1.9893817901611328, + "logps/chosen": -250.2208251953125, + "logps/rejected": -266.6905517578125, + "loss": 0.5263, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9873904585838318, + "rewards/margins": 1.6404173374176025, + "rewards/rejected": -2.6278076171875, + "step": 7845 + }, + { + "epoch": 0.9, + "learning_rate": 2.9076436848882125e-08, + "logits/chosen": -2.519212245941162, + "logits/rejected": -2.4597506523132324, + "logps/chosen": -338.5380554199219, + "logps/rejected": -313.0321044921875, + "loss": 0.2126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.611980676651001, + "rewards/margins": 2.580061197280884, + "rewards/rejected": -3.1920418739318848, + "step": 7846 + }, + { + "epoch": 0.9, + "learning_rate": 2.904132037925787e-08, + "logits/chosen": -1.9678834676742554, + "logits/rejected": -1.9932615756988525, + "logps/chosen": -419.37591552734375, + "logps/rejected": -389.83502197265625, + "loss": 0.4218, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1192610263824463, + "rewards/margins": 2.534130573272705, + "rewards/rejected": -3.6533915996551514, + "step": 7847 + }, + { + "epoch": 0.9, + "learning_rate": 2.9006203909633616e-08, + "logits/chosen": -2.8915979862213135, + "logits/rejected": -2.751833438873291, + "logps/chosen": -173.86712646484375, + "logps/rejected": -155.37889099121094, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06362435221672058, + "rewards/margins": 2.922840118408203, + "rewards/rejected": -2.986464500427246, + "step": 7848 + }, + { + "epoch": 0.9, + "learning_rate": 2.8971087440009363e-08, + "logits/chosen": -2.4268476963043213, + "logits/rejected": -2.861466884613037, + "logps/chosen": -371.21307373046875, + "logps/rejected": -195.56292724609375, + "loss": 0.7076, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1649627685546875, + "rewards/margins": 1.6564035415649414, + "rewards/rejected": -2.821366548538208, + "step": 7849 + }, + { + "epoch": 0.9, + "learning_rate": 2.8935970970385107e-08, + "logits/chosen": -2.7110674381256104, + "logits/rejected": -2.742645740509033, + "logps/chosen": -324.718017578125, + "logps/rejected": -325.20928955078125, + "loss": 0.4928, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0769376754760742, + "rewards/margins": 2.3513057231903076, + "rewards/rejected": -3.428243398666382, + "step": 7850 + }, + { + "epoch": 0.91, + "learning_rate": 2.8900854500760854e-08, + "logits/chosen": -2.469632148742676, + "logits/rejected": -2.398460865020752, + "logps/chosen": -87.74954986572266, + "logps/rejected": -259.39593505859375, + "loss": 0.3854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7573103904724121, + "rewards/margins": 2.2786741256713867, + "rewards/rejected": -3.035984516143799, + "step": 7851 + }, + { + "epoch": 0.91, + "learning_rate": 2.88657380311366e-08, + "logits/chosen": -2.6747989654541016, + "logits/rejected": -2.4539682865142822, + "logps/chosen": -254.0884246826172, + "logps/rejected": -300.2509460449219, + "loss": 0.2262, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1935410499572754, + "rewards/margins": 2.96374773979187, + "rewards/rejected": -4.157289028167725, + "step": 7852 + }, + { + "epoch": 0.91, + "learning_rate": 2.883062156151235e-08, + "logits/chosen": -2.4985036849975586, + "logits/rejected": -2.470470428466797, + "logps/chosen": -202.23809814453125, + "logps/rejected": -210.07394409179688, + "loss": 0.8869, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7230794429779053, + "rewards/margins": 0.5175111293792725, + "rewards/rejected": -2.2405905723571777, + "step": 7853 + }, + { + "epoch": 0.91, + "learning_rate": 2.8795505091888092e-08, + "logits/chosen": -2.947355270385742, + "logits/rejected": -2.948686122894287, + "logps/chosen": -282.93695068359375, + "logps/rejected": -215.79611206054688, + "loss": 0.2524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7467034459114075, + "rewards/margins": 2.155587911605835, + "rewards/rejected": -2.902291774749756, + "step": 7854 + }, + { + "epoch": 0.91, + "learning_rate": 2.876038862226384e-08, + "logits/chosen": -2.0271124839782715, + "logits/rejected": -1.8504524230957031, + "logps/chosen": -400.46600341796875, + "logps/rejected": -429.71417236328125, + "loss": 0.314, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.628656029701233, + "rewards/margins": 3.785752773284912, + "rewards/rejected": -5.414409160614014, + "step": 7855 + }, + { + "epoch": 0.91, + "learning_rate": 2.8725272152639587e-08, + "logits/chosen": -2.5979526042938232, + "logits/rejected": -2.5816726684570312, + "logps/chosen": -113.69511413574219, + "logps/rejected": -201.48193359375, + "loss": 0.2679, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8856635689735413, + "rewards/margins": 2.851900100708008, + "rewards/rejected": -3.7375636100769043, + "step": 7856 + }, + { + "epoch": 0.91, + "learning_rate": 2.869015568301533e-08, + "logits/chosen": -2.545318841934204, + "logits/rejected": -2.3769404888153076, + "logps/chosen": -338.064208984375, + "logps/rejected": -370.29595947265625, + "loss": 0.2017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1712854504585266, + "rewards/margins": 3.0149974822998047, + "rewards/rejected": -3.1862826347351074, + "step": 7857 + }, + { + "epoch": 0.91, + "learning_rate": 2.8655039213391078e-08, + "logits/chosen": -2.4742584228515625, + "logits/rejected": -2.624135732650757, + "logps/chosen": -320.96624755859375, + "logps/rejected": -368.4248046875, + "loss": 0.4662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8006337881088257, + "rewards/margins": 2.3504109382629395, + "rewards/rejected": -4.151044845581055, + "step": 7858 + }, + { + "epoch": 0.91, + "learning_rate": 2.8619922743766825e-08, + "logits/chosen": -2.3680789470672607, + "logits/rejected": -1.999261498451233, + "logps/chosen": -212.7048797607422, + "logps/rejected": -341.7615661621094, + "loss": 0.312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8566781282424927, + "rewards/margins": 1.4314956665039062, + "rewards/rejected": -2.2881736755371094, + "step": 7859 + }, + { + "epoch": 0.91, + "learning_rate": 2.8584806274142573e-08, + "logits/chosen": -1.6808809041976929, + "logits/rejected": -2.039175510406494, + "logps/chosen": -345.2652893066406, + "logps/rejected": -299.86163330078125, + "loss": 0.6353, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4074530601501465, + "rewards/margins": 0.5011134147644043, + "rewards/rejected": -1.9085663557052612, + "step": 7860 + }, + { + "epoch": 0.91, + "learning_rate": 2.8549689804518317e-08, + "logits/chosen": -2.4827351570129395, + "logits/rejected": -2.2474234104156494, + "logps/chosen": -163.96641540527344, + "logps/rejected": -188.02188110351562, + "loss": 0.416, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28533607721328735, + "rewards/margins": 2.03159761428833, + "rewards/rejected": -2.3169338703155518, + "step": 7861 + }, + { + "epoch": 0.91, + "learning_rate": 2.8514573334894064e-08, + "logits/chosen": -2.724707841873169, + "logits/rejected": -2.3808491230010986, + "logps/chosen": -293.755126953125, + "logps/rejected": -490.5436706542969, + "loss": 0.1951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5109676122665405, + "rewards/margins": 3.3742551803588867, + "rewards/rejected": -3.885222911834717, + "step": 7862 + }, + { + "epoch": 0.91, + "learning_rate": 2.847945686526981e-08, + "logits/chosen": -2.291688919067383, + "logits/rejected": -2.1925294399261475, + "logps/chosen": -139.6977081298828, + "logps/rejected": -223.02767944335938, + "loss": 0.1169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4159156084060669, + "rewards/margins": 3.700437068939209, + "rewards/rejected": -4.116352558135986, + "step": 7863 + }, + { + "epoch": 0.91, + "learning_rate": 2.8444340395645558e-08, + "logits/chosen": -2.655106544494629, + "logits/rejected": -2.5618252754211426, + "logps/chosen": -175.7471923828125, + "logps/rejected": -176.45887756347656, + "loss": 0.2544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.911042332649231, + "rewards/margins": 1.7194764614105225, + "rewards/rejected": -2.630518913269043, + "step": 7864 + }, + { + "epoch": 0.91, + "learning_rate": 2.8409223926021302e-08, + "logits/chosen": -2.0870518684387207, + "logits/rejected": -2.275094747543335, + "logps/chosen": -199.842041015625, + "logps/rejected": -310.765625, + "loss": 0.9037, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1544463634490967, + "rewards/margins": 0.6263965368270874, + "rewards/rejected": -1.7808427810668945, + "step": 7865 + }, + { + "epoch": 0.91, + "learning_rate": 2.837410745639705e-08, + "logits/chosen": -2.219531297683716, + "logits/rejected": -2.5686347484588623, + "logps/chosen": -280.79095458984375, + "logps/rejected": -225.68846130371094, + "loss": 0.6705, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2777676582336426, + "rewards/margins": 0.5287134647369385, + "rewards/rejected": -1.8064810037612915, + "step": 7866 + }, + { + "epoch": 0.91, + "learning_rate": 2.8338990986772797e-08, + "logits/chosen": -1.9266750812530518, + "logits/rejected": -2.1281251907348633, + "logps/chosen": -231.8021240234375, + "logps/rejected": -213.84616088867188, + "loss": 0.2324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.780190110206604, + "rewards/margins": 2.5213534832000732, + "rewards/rejected": -3.3015434741973877, + "step": 7867 + }, + { + "epoch": 0.91, + "learning_rate": 2.830387451714854e-08, + "logits/chosen": -2.4611868858337402, + "logits/rejected": -2.1779847145080566, + "logps/chosen": -244.83595275878906, + "logps/rejected": -338.8970947265625, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1864977478981018, + "rewards/margins": 3.0665154457092285, + "rewards/rejected": -3.2530133724212646, + "step": 7868 + }, + { + "epoch": 0.91, + "learning_rate": 2.8268758047524288e-08, + "logits/chosen": -2.9609808921813965, + "logits/rejected": -2.9824986457824707, + "logps/chosen": -166.99790954589844, + "logps/rejected": -179.33029174804688, + "loss": 0.4268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.557857096195221, + "rewards/margins": 2.274674415588379, + "rewards/rejected": -2.832531690597534, + "step": 7869 + }, + { + "epoch": 0.91, + "learning_rate": 2.8233641577900035e-08, + "logits/chosen": -2.9317266941070557, + "logits/rejected": -2.930508852005005, + "logps/chosen": -311.19036865234375, + "logps/rejected": -304.1529846191406, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9642273783683777, + "rewards/margins": 2.666348934173584, + "rewards/rejected": -3.6305763721466064, + "step": 7870 + }, + { + "epoch": 0.91, + "learning_rate": 2.8198525108275782e-08, + "logits/chosen": -2.69722843170166, + "logits/rejected": -2.6374523639678955, + "logps/chosen": -252.79513549804688, + "logps/rejected": -214.32415771484375, + "loss": 0.3715, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9559469223022461, + "rewards/margins": 2.529980421066284, + "rewards/rejected": -3.4859275817871094, + "step": 7871 + }, + { + "epoch": 0.91, + "learning_rate": 2.8163408638651526e-08, + "logits/chosen": -2.5882575511932373, + "logits/rejected": -2.849398612976074, + "logps/chosen": -255.12405395507812, + "logps/rejected": -220.69642639160156, + "loss": 0.2388, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6609883308410645, + "rewards/margins": 3.1001505851745605, + "rewards/rejected": -3.761139154434204, + "step": 7872 + }, + { + "epoch": 0.91, + "learning_rate": 2.8128292169027273e-08, + "logits/chosen": -1.908644437789917, + "logits/rejected": -1.8487716913223267, + "logps/chosen": -201.3468780517578, + "logps/rejected": -233.73724365234375, + "loss": 0.2095, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1679613590240479, + "rewards/margins": 2.9382948875427246, + "rewards/rejected": -4.106256484985352, + "step": 7873 + }, + { + "epoch": 0.91, + "learning_rate": 2.809317569940302e-08, + "logits/chosen": -2.480494499206543, + "logits/rejected": -2.1367199420928955, + "logps/chosen": -184.50759887695312, + "logps/rejected": -254.856689453125, + "loss": 0.5042, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5322829484939575, + "rewards/margins": 1.75816011428833, + "rewards/rejected": -3.290442705154419, + "step": 7874 + }, + { + "epoch": 0.91, + "learning_rate": 2.8058059229778765e-08, + "logits/chosen": -2.4071438312530518, + "logits/rejected": -2.4902236461639404, + "logps/chosen": -217.30996704101562, + "logps/rejected": -210.87889099121094, + "loss": 0.3834, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2998597621917725, + "rewards/margins": 1.049420952796936, + "rewards/rejected": -2.349280834197998, + "step": 7875 + }, + { + "epoch": 0.91, + "learning_rate": 2.8022942760154512e-08, + "logits/chosen": -1.7572383880615234, + "logits/rejected": -1.7163586616516113, + "logps/chosen": -250.0083465576172, + "logps/rejected": -385.672119140625, + "loss": 0.3671, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9939137697219849, + "rewards/margins": 2.5144243240356445, + "rewards/rejected": -3.50833797454834, + "step": 7876 + }, + { + "epoch": 0.91, + "learning_rate": 2.798782629053026e-08, + "logits/chosen": -2.4219772815704346, + "logits/rejected": -2.495617628097534, + "logps/chosen": -301.75238037109375, + "logps/rejected": -258.08746337890625, + "loss": 0.2222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8638376593589783, + "rewards/margins": 1.998578429222107, + "rewards/rejected": -2.8624160289764404, + "step": 7877 + }, + { + "epoch": 0.91, + "learning_rate": 2.7952709820906006e-08, + "logits/chosen": -1.8919785022735596, + "logits/rejected": -2.3548924922943115, + "logps/chosen": -412.10260009765625, + "logps/rejected": -302.4402160644531, + "loss": 0.3168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5497013926506042, + "rewards/margins": 2.4671761989593506, + "rewards/rejected": -3.0168776512145996, + "step": 7878 + }, + { + "epoch": 0.91, + "learning_rate": 2.791759335128175e-08, + "logits/chosen": -2.178861141204834, + "logits/rejected": -2.136131525039673, + "logps/chosen": -235.32838439941406, + "logps/rejected": -169.66696166992188, + "loss": 0.3625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1266330480575562, + "rewards/margins": 1.8395273685455322, + "rewards/rejected": -2.966160297393799, + "step": 7879 + }, + { + "epoch": 0.91, + "learning_rate": 2.7882476881657498e-08, + "logits/chosen": -2.298980236053467, + "logits/rejected": -2.2366323471069336, + "logps/chosen": -426.3937072753906, + "logps/rejected": -319.04248046875, + "loss": 0.431, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5285546779632568, + "rewards/margins": 1.6263375282287598, + "rewards/rejected": -3.1548922061920166, + "step": 7880 + }, + { + "epoch": 0.91, + "learning_rate": 2.7847360412033245e-08, + "logits/chosen": -2.349588394165039, + "logits/rejected": -2.361445188522339, + "logps/chosen": -216.35113525390625, + "logps/rejected": -230.49266052246094, + "loss": 0.2305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7296566367149353, + "rewards/margins": 1.8930583000183105, + "rewards/rejected": -2.6227149963378906, + "step": 7881 + }, + { + "epoch": 0.91, + "learning_rate": 2.781224394240899e-08, + "logits/chosen": -2.202083110809326, + "logits/rejected": -2.620872974395752, + "logps/chosen": -367.0317687988281, + "logps/rejected": -257.3352355957031, + "loss": 0.1592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2126089334487915, + "rewards/margins": 3.81614351272583, + "rewards/rejected": -4.028752326965332, + "step": 7882 + }, + { + "epoch": 0.91, + "learning_rate": 2.7777127472784736e-08, + "logits/chosen": -2.0030415058135986, + "logits/rejected": -2.0143909454345703, + "logps/chosen": -361.0922546386719, + "logps/rejected": -290.2274475097656, + "loss": 0.588, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3261258602142334, + "rewards/margins": 1.376261591911316, + "rewards/rejected": -2.7023873329162598, + "step": 7883 + }, + { + "epoch": 0.91, + "learning_rate": 2.7742011003160483e-08, + "logits/chosen": -2.6278042793273926, + "logits/rejected": -2.789889335632324, + "logps/chosen": -379.7264099121094, + "logps/rejected": -415.92462158203125, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3738631010055542, + "rewards/margins": 4.848926544189453, + "rewards/rejected": -6.222789764404297, + "step": 7884 + }, + { + "epoch": 0.91, + "learning_rate": 2.770689453353623e-08, + "logits/chosen": -2.5731940269470215, + "logits/rejected": -2.3016703128814697, + "logps/chosen": -140.15518188476562, + "logps/rejected": -302.266845703125, + "loss": 0.5544, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.129391074180603, + "rewards/margins": 1.9457746744155884, + "rewards/rejected": -3.0751657485961914, + "step": 7885 + }, + { + "epoch": 0.91, + "learning_rate": 2.7671778063911974e-08, + "logits/chosen": -1.8645521402359009, + "logits/rejected": -1.9202004671096802, + "logps/chosen": -290.4646911621094, + "logps/rejected": -282.6368408203125, + "loss": 0.3737, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15749123692512512, + "rewards/margins": 2.140650987625122, + "rewards/rejected": -1.9831597805023193, + "step": 7886 + }, + { + "epoch": 0.91, + "learning_rate": 2.7636661594287718e-08, + "logits/chosen": -2.686659812927246, + "logits/rejected": -2.268165349960327, + "logps/chosen": -331.06854248046875, + "logps/rejected": -424.2467956542969, + "loss": 0.353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3579627275466919, + "rewards/margins": 2.0797054767608643, + "rewards/rejected": -2.4376680850982666, + "step": 7887 + }, + { + "epoch": 0.91, + "learning_rate": 2.7601545124663465e-08, + "logits/chosen": -2.6133923530578613, + "logits/rejected": -2.5652778148651123, + "logps/chosen": -243.8087615966797, + "logps/rejected": -275.141357421875, + "loss": 0.166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7043401002883911, + "rewards/margins": 3.6885344982147217, + "rewards/rejected": -4.392874717712402, + "step": 7888 + }, + { + "epoch": 0.91, + "learning_rate": 2.756642865503921e-08, + "logits/chosen": -1.8927730321884155, + "logits/rejected": -2.0016067028045654, + "logps/chosen": -271.4025573730469, + "logps/rejected": -223.58843994140625, + "loss": 0.3791, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2578846216201782, + "rewards/margins": 2.003251314163208, + "rewards/rejected": -3.261136054992676, + "step": 7889 + }, + { + "epoch": 0.91, + "learning_rate": 2.7531312185414957e-08, + "logits/chosen": -2.436270236968994, + "logits/rejected": -2.368621826171875, + "logps/chosen": -478.5572509765625, + "logps/rejected": -325.9060974121094, + "loss": 0.1901, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34044399857521057, + "rewards/margins": 2.8540964126586914, + "rewards/rejected": -3.194540500640869, + "step": 7890 + }, + { + "epoch": 0.91, + "learning_rate": 2.7496195715790704e-08, + "logits/chosen": -2.1476755142211914, + "logits/rejected": -2.6367499828338623, + "logps/chosen": -357.45843505859375, + "logps/rejected": -230.6267852783203, + "loss": 0.7056, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6487197875976562, + "rewards/margins": 0.852898359298706, + "rewards/rejected": -1.5016182661056519, + "step": 7891 + }, + { + "epoch": 0.91, + "learning_rate": 2.7461079246166448e-08, + "logits/chosen": -2.287351131439209, + "logits/rejected": -2.1816515922546387, + "logps/chosen": -207.721435546875, + "logps/rejected": -224.85394287109375, + "loss": 0.4362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8428857326507568, + "rewards/margins": 1.4311418533325195, + "rewards/rejected": -2.2740275859832764, + "step": 7892 + }, + { + "epoch": 0.91, + "learning_rate": 2.7425962776542195e-08, + "logits/chosen": -2.193406343460083, + "logits/rejected": -2.4694371223449707, + "logps/chosen": -488.705078125, + "logps/rejected": -275.6812438964844, + "loss": 0.3058, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0255508422851562, + "rewards/margins": 1.9035043716430664, + "rewards/rejected": -2.9290552139282227, + "step": 7893 + }, + { + "epoch": 0.91, + "learning_rate": 2.7390846306917942e-08, + "logits/chosen": -2.7273430824279785, + "logits/rejected": -2.6328678131103516, + "logps/chosen": -231.26593017578125, + "logps/rejected": -172.87452697753906, + "loss": 0.2797, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43583691120147705, + "rewards/margins": 2.3536221981048584, + "rewards/rejected": -2.789459228515625, + "step": 7894 + }, + { + "epoch": 0.91, + "learning_rate": 2.735572983729369e-08, + "logits/chosen": -2.1558899879455566, + "logits/rejected": -2.432643175125122, + "logps/chosen": -340.94964599609375, + "logps/rejected": -209.436767578125, + "loss": 0.5961, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0540640354156494, + "rewards/margins": 1.0044114589691162, + "rewards/rejected": -2.0584757328033447, + "step": 7895 + }, + { + "epoch": 0.91, + "learning_rate": 2.7320613367669433e-08, + "logits/chosen": -1.7320096492767334, + "logits/rejected": -1.5047173500061035, + "logps/chosen": -215.64292907714844, + "logps/rejected": -309.4922180175781, + "loss": 0.6701, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4520454406738281, + "rewards/margins": 0.9097317457199097, + "rewards/rejected": -2.3617770671844482, + "step": 7896 + }, + { + "epoch": 0.91, + "learning_rate": 2.728549689804518e-08, + "logits/chosen": -2.555753469467163, + "logits/rejected": -2.5577423572540283, + "logps/chosen": -236.6990509033203, + "logps/rejected": -234.62266540527344, + "loss": 0.6388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6378649473190308, + "rewards/margins": 1.265794277191162, + "rewards/rejected": -1.9036591053009033, + "step": 7897 + }, + { + "epoch": 0.91, + "learning_rate": 2.7250380428420928e-08, + "logits/chosen": -2.6694488525390625, + "logits/rejected": -2.7107694149017334, + "logps/chosen": -168.1529998779297, + "logps/rejected": -146.51760864257812, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8995202779769897, + "rewards/margins": 2.2221405506134033, + "rewards/rejected": -3.1216607093811035, + "step": 7898 + }, + { + "epoch": 0.91, + "learning_rate": 2.7215263958796672e-08, + "logits/chosen": -2.3562331199645996, + "logits/rejected": -2.435901641845703, + "logps/chosen": -122.61396789550781, + "logps/rejected": -146.184814453125, + "loss": 0.4092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.589576780796051, + "rewards/margins": 2.3897640705108643, + "rewards/rejected": -2.9793407917022705, + "step": 7899 + }, + { + "epoch": 0.91, + "learning_rate": 2.718014748917242e-08, + "logits/chosen": -2.168180227279663, + "logits/rejected": -2.0314595699310303, + "logps/chosen": -257.6254577636719, + "logps/rejected": -300.8764953613281, + "loss": 0.9084, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5340254306793213, + "rewards/margins": 1.17645263671875, + "rewards/rejected": -3.7104780673980713, + "step": 7900 + }, + { + "epoch": 0.91, + "learning_rate": 2.7145031019548166e-08, + "logits/chosen": -2.269465923309326, + "logits/rejected": -2.2100985050201416, + "logps/chosen": -178.5724639892578, + "logps/rejected": -260.63909912109375, + "loss": 0.4627, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45511242747306824, + "rewards/margins": 2.7561192512512207, + "rewards/rejected": -3.2112317085266113, + "step": 7901 + }, + { + "epoch": 0.91, + "learning_rate": 2.7109914549923914e-08, + "logits/chosen": -2.4752166271209717, + "logits/rejected": -2.427276611328125, + "logps/chosen": -227.4945068359375, + "logps/rejected": -197.8341827392578, + "loss": 0.2999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5350973606109619, + "rewards/margins": 1.7736585140228271, + "rewards/rejected": -2.308755874633789, + "step": 7902 + }, + { + "epoch": 0.91, + "learning_rate": 2.7074798080299657e-08, + "logits/chosen": -2.415501594543457, + "logits/rejected": -2.5985107421875, + "logps/chosen": -346.3138732910156, + "logps/rejected": -315.5746154785156, + "loss": 0.3151, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9118140339851379, + "rewards/margins": 2.5882413387298584, + "rewards/rejected": -3.5000555515289307, + "step": 7903 + }, + { + "epoch": 0.91, + "learning_rate": 2.7039681610675405e-08, + "logits/chosen": -2.612701177597046, + "logits/rejected": -2.542186737060547, + "logps/chosen": -222.89166259765625, + "logps/rejected": -264.91058349609375, + "loss": 0.4329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37292030453681946, + "rewards/margins": 2.811491012573242, + "rewards/rejected": -3.1844115257263184, + "step": 7904 + }, + { + "epoch": 0.91, + "learning_rate": 2.7004565141051152e-08, + "logits/chosen": -2.121709108352661, + "logits/rejected": -2.35532546043396, + "logps/chosen": -326.2662353515625, + "logps/rejected": -234.0961151123047, + "loss": 0.4836, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6787885427474976, + "rewards/margins": 1.8046655654907227, + "rewards/rejected": -2.4834542274475098, + "step": 7905 + }, + { + "epoch": 0.91, + "learning_rate": 2.69694486714269e-08, + "logits/chosen": -2.8975820541381836, + "logits/rejected": -2.9109959602355957, + "logps/chosen": -106.19837188720703, + "logps/rejected": -311.75701904296875, + "loss": 0.4404, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.488909363746643, + "rewards/margins": 1.3380744457244873, + "rewards/rejected": -2.82698392868042, + "step": 7906 + }, + { + "epoch": 0.91, + "learning_rate": 2.6934332201802643e-08, + "logits/chosen": -1.5731983184814453, + "logits/rejected": -1.8547786474227905, + "logps/chosen": -503.28143310546875, + "logps/rejected": -382.94879150390625, + "loss": 0.7451, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.846464991569519, + "rewards/margins": 1.39413583278656, + "rewards/rejected": -2.2406005859375, + "step": 7907 + }, + { + "epoch": 0.91, + "learning_rate": 2.689921573217839e-08, + "logits/chosen": -1.910660982131958, + "logits/rejected": -1.9629290103912354, + "logps/chosen": -424.7901306152344, + "logps/rejected": -303.16485595703125, + "loss": 13.5737, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.996828079223633, + "rewards/margins": -11.516777038574219, + "rewards/rejected": -2.480051040649414, + "step": 7908 + }, + { + "epoch": 0.91, + "learning_rate": 2.6864099262554138e-08, + "logits/chosen": -2.345705509185791, + "logits/rejected": -2.487189531326294, + "logps/chosen": -119.86653137207031, + "logps/rejected": -164.32113647460938, + "loss": 0.6898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4151384830474854, + "rewards/margins": 1.300445795059204, + "rewards/rejected": -2.7155842781066895, + "step": 7909 + }, + { + "epoch": 0.91, + "learning_rate": 2.682898279292988e-08, + "logits/chosen": -2.4573752880096436, + "logits/rejected": -2.696132183074951, + "logps/chosen": -303.72344970703125, + "logps/rejected": -152.7987060546875, + "loss": 0.4243, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6950236558914185, + "rewards/margins": 1.6709285974502563, + "rewards/rejected": -3.365952491760254, + "step": 7910 + }, + { + "epoch": 0.91, + "learning_rate": 2.679386632330563e-08, + "logits/chosen": -2.1079111099243164, + "logits/rejected": -2.407094717025757, + "logps/chosen": -299.0151062011719, + "logps/rejected": -292.00244140625, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06071844696998596, + "rewards/margins": 4.3516998291015625, + "rewards/rejected": -4.412417888641357, + "step": 7911 + }, + { + "epoch": 0.91, + "learning_rate": 2.6758749853681376e-08, + "logits/chosen": -2.455517292022705, + "logits/rejected": -2.549466609954834, + "logps/chosen": -242.267333984375, + "logps/rejected": -136.1950225830078, + "loss": 0.2466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9485938549041748, + "rewards/margins": 2.0519845485687256, + "rewards/rejected": -3.0005784034729004, + "step": 7912 + }, + { + "epoch": 0.91, + "learning_rate": 2.6723633384057123e-08, + "logits/chosen": -2.334010601043701, + "logits/rejected": -2.308318853378296, + "logps/chosen": -219.37002563476562, + "logps/rejected": -202.9193878173828, + "loss": 0.2849, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30558496713638306, + "rewards/margins": 2.3593719005584717, + "rewards/rejected": -2.664957046508789, + "step": 7913 + }, + { + "epoch": 0.91, + "learning_rate": 2.6688516914432867e-08, + "logits/chosen": -2.2834348678588867, + "logits/rejected": -2.371680736541748, + "logps/chosen": -493.6947021484375, + "logps/rejected": -333.503662109375, + "loss": 0.3009, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3982277512550354, + "rewards/margins": 1.85195791721344, + "rewards/rejected": -1.4537302255630493, + "step": 7914 + }, + { + "epoch": 0.91, + "learning_rate": 2.6653400444808614e-08, + "logits/chosen": -2.3223578929901123, + "logits/rejected": -2.060149908065796, + "logps/chosen": -315.5164794921875, + "logps/rejected": -370.55828857421875, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5897472500801086, + "rewards/margins": 2.1900367736816406, + "rewards/rejected": -2.7797839641571045, + "step": 7915 + }, + { + "epoch": 0.91, + "learning_rate": 2.661828397518436e-08, + "logits/chosen": -2.3132598400115967, + "logits/rejected": -2.252614736557007, + "logps/chosen": -165.99998474121094, + "logps/rejected": -212.15859985351562, + "loss": 0.4795, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3185781240463257, + "rewards/margins": 1.4304397106170654, + "rewards/rejected": -2.7490177154541016, + "step": 7916 + }, + { + "epoch": 0.91, + "learning_rate": 2.6583167505560106e-08, + "logits/chosen": -2.003272533416748, + "logits/rejected": -1.8240543603897095, + "logps/chosen": -283.2339782714844, + "logps/rejected": -360.0828857421875, + "loss": 1.6414, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4437209367752075, + "rewards/margins": -0.8432519435882568, + "rewards/rejected": -0.6004689335823059, + "step": 7917 + }, + { + "epoch": 0.91, + "learning_rate": 2.6548051035935853e-08, + "logits/chosen": -2.2840089797973633, + "logits/rejected": -2.41945219039917, + "logps/chosen": -299.0297546386719, + "logps/rejected": -207.6694793701172, + "loss": 0.3756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4284057021141052, + "rewards/margins": 1.9043974876403809, + "rewards/rejected": -2.332803249359131, + "step": 7918 + }, + { + "epoch": 0.91, + "learning_rate": 2.65129345663116e-08, + "logits/chosen": -2.018362045288086, + "logits/rejected": -2.1882858276367188, + "logps/chosen": -470.7598571777344, + "logps/rejected": -432.47723388671875, + "loss": 0.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.336517572402954, + "rewards/margins": 2.7813901901245117, + "rewards/rejected": -4.117907524108887, + "step": 7919 + }, + { + "epoch": 0.91, + "learning_rate": 2.6477818096687347e-08, + "logits/chosen": -2.4348912239074707, + "logits/rejected": -2.3319289684295654, + "logps/chosen": -271.45025634765625, + "logps/rejected": -285.6554260253906, + "loss": 0.28, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14889755845069885, + "rewards/margins": 2.5068488121032715, + "rewards/rejected": -2.6557464599609375, + "step": 7920 + }, + { + "epoch": 0.91, + "learning_rate": 2.644270162706309e-08, + "logits/chosen": -2.178983449935913, + "logits/rejected": -2.0044517517089844, + "logps/chosen": -102.91868591308594, + "logps/rejected": -184.43382263183594, + "loss": 0.9059, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5111470222473145, + "rewards/margins": 2.6697402000427246, + "rewards/rejected": -4.180887222290039, + "step": 7921 + }, + { + "epoch": 0.91, + "learning_rate": 2.640758515743884e-08, + "logits/chosen": -2.6182587146759033, + "logits/rejected": -2.8526554107666016, + "logps/chosen": -190.76907348632812, + "logps/rejected": -178.7761993408203, + "loss": 0.2504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8242783546447754, + "rewards/margins": 2.5161614418029785, + "rewards/rejected": -3.340439558029175, + "step": 7922 + }, + { + "epoch": 0.91, + "learning_rate": 2.6372468687814586e-08, + "logits/chosen": -2.4635190963745117, + "logits/rejected": -2.4676308631896973, + "logps/chosen": -343.40264892578125, + "logps/rejected": -265.278076171875, + "loss": 0.3017, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1543282270431519, + "rewards/margins": 2.490509033203125, + "rewards/rejected": -3.6448373794555664, + "step": 7923 + }, + { + "epoch": 0.91, + "learning_rate": 2.633735221819033e-08, + "logits/chosen": -1.9988335371017456, + "logits/rejected": -2.0696964263916016, + "logps/chosen": -278.5461730957031, + "logps/rejected": -265.822998046875, + "loss": 0.8177, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1573535203933716, + "rewards/margins": 0.29818010330200195, + "rewards/rejected": -1.455533504486084, + "step": 7924 + }, + { + "epoch": 0.91, + "learning_rate": 2.6302235748566077e-08, + "logits/chosen": -2.4634604454040527, + "logits/rejected": -2.214754581451416, + "logps/chosen": -349.13665771484375, + "logps/rejected": -359.8349304199219, + "loss": 0.3868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40541309118270874, + "rewards/margins": 1.2292563915252686, + "rewards/rejected": -1.6346694231033325, + "step": 7925 + }, + { + "epoch": 0.91, + "learning_rate": 2.6267119278941824e-08, + "logits/chosen": -2.789246082305908, + "logits/rejected": -2.6346826553344727, + "logps/chosen": -59.08598327636719, + "logps/rejected": -118.89693450927734, + "loss": 0.4218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.919471025466919, + "rewards/margins": 1.0392208099365234, + "rewards/rejected": -1.9586918354034424, + "step": 7926 + }, + { + "epoch": 0.91, + "learning_rate": 2.623200280931757e-08, + "logits/chosen": -2.5190954208374023, + "logits/rejected": -2.6807937622070312, + "logps/chosen": -227.6588592529297, + "logps/rejected": -158.4053192138672, + "loss": 0.2786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.529279351234436, + "rewards/margins": 1.7110544443130493, + "rewards/rejected": -2.2403335571289062, + "step": 7927 + }, + { + "epoch": 0.91, + "learning_rate": 2.6196886339693315e-08, + "logits/chosen": -2.214243173599243, + "logits/rejected": -2.3479583263397217, + "logps/chosen": -265.7738342285156, + "logps/rejected": -203.70761108398438, + "loss": 0.4649, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4448554515838623, + "rewards/margins": 2.7158989906311035, + "rewards/rejected": -4.160754680633545, + "step": 7928 + }, + { + "epoch": 0.91, + "learning_rate": 2.6161769870069063e-08, + "logits/chosen": -2.5484094619750977, + "logits/rejected": -2.368006706237793, + "logps/chosen": -162.62387084960938, + "logps/rejected": -192.23675537109375, + "loss": 0.353, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1321300268173218, + "rewards/margins": 1.6132688522338867, + "rewards/rejected": -2.745398998260498, + "step": 7929 + }, + { + "epoch": 0.91, + "learning_rate": 2.612665340044481e-08, + "logits/chosen": -1.6721910238265991, + "logits/rejected": -1.8826470375061035, + "logps/chosen": -625.6976928710938, + "logps/rejected": -318.296630859375, + "loss": 0.4196, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.823427140712738, + "rewards/margins": 2.209343194961548, + "rewards/rejected": -3.0327703952789307, + "step": 7930 + }, + { + "epoch": 0.91, + "learning_rate": 2.6091536930820557e-08, + "logits/chosen": -2.0239362716674805, + "logits/rejected": -1.8545031547546387, + "logps/chosen": -173.038818359375, + "logps/rejected": -252.42742919921875, + "loss": 0.4565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8482315540313721, + "rewards/margins": 1.7384318113327026, + "rewards/rejected": -2.586663246154785, + "step": 7931 + }, + { + "epoch": 0.91, + "learning_rate": 2.60564204611963e-08, + "logits/chosen": -2.4117443561553955, + "logits/rejected": -2.5951216220855713, + "logps/chosen": -136.74191284179688, + "logps/rejected": -185.44813537597656, + "loss": 0.544, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2164782285690308, + "rewards/margins": 2.054809093475342, + "rewards/rejected": -3.271287202835083, + "step": 7932 + }, + { + "epoch": 0.91, + "learning_rate": 2.6021303991572048e-08, + "logits/chosen": -2.4254584312438965, + "logits/rejected": -2.4631006717681885, + "logps/chosen": -383.7232666015625, + "logps/rejected": -361.01416015625, + "loss": 0.184, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8235704898834229, + "rewards/margins": 3.213639736175537, + "rewards/rejected": -4.037210464477539, + "step": 7933 + }, + { + "epoch": 0.91, + "learning_rate": 2.5986187521947795e-08, + "logits/chosen": -2.140418767929077, + "logits/rejected": -2.433166027069092, + "logps/chosen": -195.97169494628906, + "logps/rejected": -239.20933532714844, + "loss": 0.8501, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9341049194335938, + "rewards/margins": 0.8049024343490601, + "rewards/rejected": -2.7390074729919434, + "step": 7934 + }, + { + "epoch": 0.91, + "learning_rate": 2.5951071052323536e-08, + "logits/chosen": -2.461272716522217, + "logits/rejected": -2.3971548080444336, + "logps/chosen": -173.41708374023438, + "logps/rejected": -222.59848022460938, + "loss": 0.3255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7493229508399963, + "rewards/margins": 2.6332149505615234, + "rewards/rejected": -3.382537841796875, + "step": 7935 + }, + { + "epoch": 0.91, + "learning_rate": 2.5915954582699283e-08, + "logits/chosen": -2.2529735565185547, + "logits/rejected": -1.961387038230896, + "logps/chosen": -179.29318237304688, + "logps/rejected": -260.40496826171875, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19050875306129456, + "rewards/margins": 1.7810827493667603, + "rewards/rejected": -1.971591591835022, + "step": 7936 + }, + { + "epoch": 0.91, + "learning_rate": 2.588083811307503e-08, + "logits/chosen": -2.383786678314209, + "logits/rejected": -2.437634229660034, + "logps/chosen": -227.972412109375, + "logps/rejected": -261.08233642578125, + "loss": 0.2357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6941680908203125, + "rewards/margins": 2.5438079833984375, + "rewards/rejected": -3.237975835800171, + "step": 7937 + }, + { + "epoch": 0.92, + "learning_rate": 2.5845721643450774e-08, + "logits/chosen": -2.1194143295288086, + "logits/rejected": -2.2397806644439697, + "logps/chosen": -395.91015625, + "logps/rejected": -339.5650634765625, + "loss": 0.2112, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5438458919525146, + "rewards/margins": 2.819409132003784, + "rewards/rejected": -3.363255023956299, + "step": 7938 + }, + { + "epoch": 0.92, + "learning_rate": 2.581060517382652e-08, + "logits/chosen": -1.8771910667419434, + "logits/rejected": -1.6854463815689087, + "logps/chosen": -340.8329772949219, + "logps/rejected": -594.0023193359375, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.211423635482788, + "rewards/margins": 14.907511711120605, + "rewards/rejected": -16.118934631347656, + "step": 7939 + }, + { + "epoch": 0.92, + "learning_rate": 2.577548870420227e-08, + "logits/chosen": -2.384984016418457, + "logits/rejected": -2.41628360748291, + "logps/chosen": -347.45245361328125, + "logps/rejected": -301.0610046386719, + "loss": 0.3207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7761733531951904, + "rewards/margins": 1.6995614767074585, + "rewards/rejected": -2.4757344722747803, + "step": 7940 + }, + { + "epoch": 0.92, + "learning_rate": 2.5740372234578013e-08, + "logits/chosen": -1.8425207138061523, + "logits/rejected": -2.1831307411193848, + "logps/chosen": -338.968994140625, + "logps/rejected": -331.0360412597656, + "loss": 0.4932, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4209365844726562, + "rewards/margins": 3.139718532562256, + "rewards/rejected": -4.560654640197754, + "step": 7941 + }, + { + "epoch": 0.92, + "learning_rate": 2.570525576495376e-08, + "logits/chosen": -2.2301082611083984, + "logits/rejected": -1.9759471416473389, + "logps/chosen": -287.1338195800781, + "logps/rejected": -298.2421875, + "loss": 0.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40211182832717896, + "rewards/margins": 1.376604676246643, + "rewards/rejected": -1.7787165641784668, + "step": 7942 + }, + { + "epoch": 0.92, + "learning_rate": 2.5670139295329507e-08, + "logits/chosen": -1.9329414367675781, + "logits/rejected": -2.3484137058258057, + "logps/chosen": -490.6224365234375, + "logps/rejected": -423.9716796875, + "loss": 0.4015, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4685306549072266, + "rewards/margins": 2.728022575378418, + "rewards/rejected": -4.1965532302856445, + "step": 7943 + }, + { + "epoch": 0.92, + "learning_rate": 2.5635022825705255e-08, + "logits/chosen": -2.5953941345214844, + "logits/rejected": -2.33013916015625, + "logps/chosen": -281.2446594238281, + "logps/rejected": -262.09112548828125, + "loss": 0.2088, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6306992769241333, + "rewards/margins": 4.468068599700928, + "rewards/rejected": -6.09876823425293, + "step": 7944 + }, + { + "epoch": 0.92, + "learning_rate": 2.5599906356081e-08, + "logits/chosen": -2.1875452995300293, + "logits/rejected": -1.9482173919677734, + "logps/chosen": -300.9415283203125, + "logps/rejected": -400.718017578125, + "loss": 0.3679, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7652021646499634, + "rewards/margins": 2.5914535522460938, + "rewards/rejected": -3.3566555976867676, + "step": 7945 + }, + { + "epoch": 0.92, + "learning_rate": 2.5564789886456746e-08, + "logits/chosen": -2.9216277599334717, + "logits/rejected": -2.644627332687378, + "logps/chosen": -246.27365112304688, + "logps/rejected": -291.2127990722656, + "loss": 0.1722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17529571056365967, + "rewards/margins": 3.2669758796691895, + "rewards/rejected": -3.4422712326049805, + "step": 7946 + }, + { + "epoch": 0.92, + "learning_rate": 2.5529673416832493e-08, + "logits/chosen": -1.8689746856689453, + "logits/rejected": -2.1523914337158203, + "logps/chosen": -242.00189208984375, + "logps/rejected": -168.36215209960938, + "loss": 0.8964, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3728996515274048, + "rewards/margins": 1.4787664413452148, + "rewards/rejected": -2.851666212081909, + "step": 7947 + }, + { + "epoch": 0.92, + "learning_rate": 2.549455694720824e-08, + "logits/chosen": -2.1064488887786865, + "logits/rejected": -2.086669683456421, + "logps/chosen": -284.23809814453125, + "logps/rejected": -332.189208984375, + "loss": 0.3496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2948110103607178, + "rewards/margins": 3.1547279357910156, + "rewards/rejected": -3.4495394229888916, + "step": 7948 + }, + { + "epoch": 0.92, + "learning_rate": 2.5459440477583984e-08, + "logits/chosen": -2.1445200443267822, + "logits/rejected": -2.260432481765747, + "logps/chosen": -275.6441650390625, + "logps/rejected": -258.1583251953125, + "loss": 0.1611, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.010674141347408295, + "rewards/margins": 3.046226978302002, + "rewards/rejected": -3.035552978515625, + "step": 7949 + }, + { + "epoch": 0.92, + "learning_rate": 2.542432400795973e-08, + "logits/chosen": -2.3027186393737793, + "logits/rejected": -2.308840751647949, + "logps/chosen": -318.65130615234375, + "logps/rejected": -269.3037109375, + "loss": 0.3872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34152960777282715, + "rewards/margins": 1.5411145687103271, + "rewards/rejected": -1.8826440572738647, + "step": 7950 + }, + { + "epoch": 0.92, + "learning_rate": 2.538920753833548e-08, + "logits/chosen": -2.1946935653686523, + "logits/rejected": -2.2204785346984863, + "logps/chosen": -302.30389404296875, + "logps/rejected": -307.34381103515625, + "loss": 0.3916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4629209041595459, + "rewards/margins": 2.7410671710968018, + "rewards/rejected": -3.2039883136749268, + "step": 7951 + }, + { + "epoch": 0.92, + "learning_rate": 2.5354091068711222e-08, + "logits/chosen": -2.7997429370880127, + "logits/rejected": -2.7111737728118896, + "logps/chosen": -187.79705810546875, + "logps/rejected": -285.216796875, + "loss": 0.5125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1643644571304321, + "rewards/margins": 1.8327627182006836, + "rewards/rejected": -2.9971275329589844, + "step": 7952 + }, + { + "epoch": 0.92, + "learning_rate": 2.531897459908697e-08, + "logits/chosen": -2.275604724884033, + "logits/rejected": -2.170408010482788, + "logps/chosen": -162.2772979736328, + "logps/rejected": -240.9927215576172, + "loss": 0.1868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5381966829299927, + "rewards/margins": 2.4241862297058105, + "rewards/rejected": -2.9623830318450928, + "step": 7953 + }, + { + "epoch": 0.92, + "learning_rate": 2.5283858129462717e-08, + "logits/chosen": -1.884989619255066, + "logits/rejected": -1.6629589796066284, + "logps/chosen": -283.96490478515625, + "logps/rejected": -338.5690612792969, + "loss": 0.2445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7201260924339294, + "rewards/margins": 3.3323168754577637, + "rewards/rejected": -4.052443027496338, + "step": 7954 + }, + { + "epoch": 0.92, + "learning_rate": 2.5248741659838464e-08, + "logits/chosen": -2.624089241027832, + "logits/rejected": -2.5906567573547363, + "logps/chosen": -254.6405487060547, + "logps/rejected": -246.08705139160156, + "loss": 0.2701, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2043540477752686, + "rewards/margins": 2.5407679080963135, + "rewards/rejected": -3.745121717453003, + "step": 7955 + }, + { + "epoch": 0.92, + "learning_rate": 2.5213625190214208e-08, + "logits/chosen": -1.9026548862457275, + "logits/rejected": -1.9817763566970825, + "logps/chosen": -387.6874694824219, + "logps/rejected": -266.6608581542969, + "loss": 0.2438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7027819156646729, + "rewards/margins": 2.622652530670166, + "rewards/rejected": -3.325434446334839, + "step": 7956 + }, + { + "epoch": 0.92, + "learning_rate": 2.5178508720589955e-08, + "logits/chosen": -2.3209643363952637, + "logits/rejected": -2.2427775859832764, + "logps/chosen": -259.4503479003906, + "logps/rejected": -199.68443298339844, + "loss": 0.1973, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1345204859972, + "rewards/margins": 2.7132296562194824, + "rewards/rejected": -2.578709125518799, + "step": 7957 + }, + { + "epoch": 0.92, + "learning_rate": 2.5143392250965703e-08, + "logits/chosen": -2.186829090118408, + "logits/rejected": -2.055520534515381, + "logps/chosen": -310.9998779296875, + "logps/rejected": -223.76026916503906, + "loss": 0.2481, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3977583348751068, + "rewards/margins": 3.0789241790771484, + "rewards/rejected": -3.476682424545288, + "step": 7958 + }, + { + "epoch": 0.92, + "learning_rate": 2.5108275781341447e-08, + "logits/chosen": -2.157151937484741, + "logits/rejected": -2.2185189723968506, + "logps/chosen": -322.1352844238281, + "logps/rejected": -337.7017822265625, + "loss": 0.1465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26287004351615906, + "rewards/margins": 3.0998849868774414, + "rewards/rejected": -3.362755060195923, + "step": 7959 + }, + { + "epoch": 0.92, + "learning_rate": 2.5073159311717194e-08, + "logits/chosen": -2.283550500869751, + "logits/rejected": -2.501621961593628, + "logps/chosen": -275.48687744140625, + "logps/rejected": -209.3139190673828, + "loss": 0.6017, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9186394214630127, + "rewards/margins": 1.634263277053833, + "rewards/rejected": -2.5529024600982666, + "step": 7960 + }, + { + "epoch": 0.92, + "learning_rate": 2.503804284209294e-08, + "logits/chosen": -2.0567309856414795, + "logits/rejected": -2.057215690612793, + "logps/chosen": -260.655029296875, + "logps/rejected": -391.56182861328125, + "loss": 0.2841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.742200493812561, + "rewards/margins": 2.5645506381988525, + "rewards/rejected": -3.306751251220703, + "step": 7961 + }, + { + "epoch": 0.92, + "learning_rate": 2.5002926372468688e-08, + "logits/chosen": -2.879340887069702, + "logits/rejected": -2.709871768951416, + "logps/chosen": -249.3451385498047, + "logps/rejected": -296.47003173828125, + "loss": 0.0676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2764851450920105, + "rewards/margins": 3.201364755630493, + "rewards/rejected": -3.4778499603271484, + "step": 7962 + }, + { + "epoch": 0.92, + "learning_rate": 2.4967809902844432e-08, + "logits/chosen": -2.4194018840789795, + "logits/rejected": -2.3675832748413086, + "logps/chosen": -317.4794921875, + "logps/rejected": -202.4337921142578, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1880510151386261, + "rewards/margins": 2.448050022125244, + "rewards/rejected": -2.636101007461548, + "step": 7963 + }, + { + "epoch": 0.92, + "learning_rate": 2.493269343322018e-08, + "logits/chosen": -2.596465587615967, + "logits/rejected": -2.7641937732696533, + "logps/chosen": -202.74334716796875, + "logps/rejected": -194.50704956054688, + "loss": 1.161, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6283352375030518, + "rewards/margins": 1.1944266557693481, + "rewards/rejected": -2.8227617740631104, + "step": 7964 + }, + { + "epoch": 0.92, + "learning_rate": 2.4897576963595927e-08, + "logits/chosen": -2.2125988006591797, + "logits/rejected": -2.5537092685699463, + "logps/chosen": -270.644775390625, + "logps/rejected": -252.64541625976562, + "loss": 0.5453, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8752261996269226, + "rewards/margins": 0.9476624727249146, + "rewards/rejected": -1.8228886127471924, + "step": 7965 + }, + { + "epoch": 0.92, + "learning_rate": 2.4862460493971674e-08, + "logits/chosen": -2.282735824584961, + "logits/rejected": -2.299506664276123, + "logps/chosen": -235.76446533203125, + "logps/rejected": -223.64266967773438, + "loss": 0.4453, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2483576536178589, + "rewards/margins": 2.5425667762756348, + "rewards/rejected": -3.7909247875213623, + "step": 7966 + }, + { + "epoch": 0.92, + "learning_rate": 2.4827344024347418e-08, + "logits/chosen": -2.704442024230957, + "logits/rejected": -2.7465097904205322, + "logps/chosen": -392.4009094238281, + "logps/rejected": -370.87652587890625, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7589830160140991, + "rewards/margins": 0.44415295124053955, + "rewards/rejected": -2.2031359672546387, + "step": 7967 + }, + { + "epoch": 0.92, + "learning_rate": 2.4792227554723165e-08, + "logits/chosen": -2.300156593322754, + "logits/rejected": -2.618299722671509, + "logps/chosen": -378.92108154296875, + "logps/rejected": -239.39828491210938, + "loss": 0.9922, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5367978811264038, + "rewards/margins": 0.020688682794570923, + "rewards/rejected": -1.5574865341186523, + "step": 7968 + }, + { + "epoch": 0.92, + "learning_rate": 2.4757111085098912e-08, + "logits/chosen": -2.112884283065796, + "logits/rejected": -2.230515480041504, + "logps/chosen": -405.89453125, + "logps/rejected": -254.05191040039062, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.004725694656372, + "rewards/margins": 1.2999248504638672, + "rewards/rejected": -2.30465030670166, + "step": 7969 + }, + { + "epoch": 0.92, + "learning_rate": 2.4721994615474656e-08, + "logits/chosen": -2.0453460216522217, + "logits/rejected": -2.0984721183776855, + "logps/chosen": -197.6361846923828, + "logps/rejected": -219.46485900878906, + "loss": 0.7246, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7032006978988647, + "rewards/margins": 1.8483047485351562, + "rewards/rejected": -3.5515055656433105, + "step": 7970 + }, + { + "epoch": 0.92, + "learning_rate": 2.4686878145850403e-08, + "logits/chosen": -2.28348708152771, + "logits/rejected": -2.341890811920166, + "logps/chosen": -255.81820678710938, + "logps/rejected": -254.7948455810547, + "loss": 0.4962, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4245321750640869, + "rewards/margins": 1.4827511310577393, + "rewards/rejected": -1.9072835445404053, + "step": 7971 + }, + { + "epoch": 0.92, + "learning_rate": 2.465176167622615e-08, + "logits/chosen": -2.4236204624176025, + "logits/rejected": -2.406270980834961, + "logps/chosen": -326.50286865234375, + "logps/rejected": -241.43031311035156, + "loss": 0.325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.036383435130119324, + "rewards/margins": 2.515599012374878, + "rewards/rejected": -2.5519824028015137, + "step": 7972 + }, + { + "epoch": 0.92, + "learning_rate": 2.4616645206601898e-08, + "logits/chosen": -2.619987964630127, + "logits/rejected": -2.761524200439453, + "logps/chosen": -317.42877197265625, + "logps/rejected": -326.02288818359375, + "loss": 0.7384, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5977512001991272, + "rewards/margins": 2.4540319442749023, + "rewards/rejected": -3.0517830848693848, + "step": 7973 + }, + { + "epoch": 0.92, + "learning_rate": 2.4581528736977642e-08, + "logits/chosen": -1.7255373001098633, + "logits/rejected": -1.9925191402435303, + "logps/chosen": -355.96551513671875, + "logps/rejected": -256.57781982421875, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4353444576263428, + "rewards/margins": 1.0862152576446533, + "rewards/rejected": -2.521559715270996, + "step": 7974 + }, + { + "epoch": 0.92, + "learning_rate": 2.454641226735339e-08, + "logits/chosen": -2.2333333492279053, + "logits/rejected": -2.6716959476470947, + "logps/chosen": -297.9132385253906, + "logps/rejected": -295.19500732421875, + "loss": 0.59, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8271734714508057, + "rewards/margins": 3.528221368789673, + "rewards/rejected": -4.35539436340332, + "step": 7975 + }, + { + "epoch": 0.92, + "learning_rate": 2.4511295797729136e-08, + "logits/chosen": -2.292989492416382, + "logits/rejected": -2.444507598876953, + "logps/chosen": -196.35650634765625, + "logps/rejected": -238.32208251953125, + "loss": 0.6735, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.139883041381836, + "rewards/margins": 1.209906816482544, + "rewards/rejected": -2.349789619445801, + "step": 7976 + }, + { + "epoch": 0.92, + "learning_rate": 2.447617932810488e-08, + "logits/chosen": -1.9035946130752563, + "logits/rejected": -1.877413272857666, + "logps/chosen": -256.125, + "logps/rejected": -262.0096435546875, + "loss": 1.197, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.342403769493103, + "rewards/margins": 0.19593873620033264, + "rewards/rejected": -1.5383424758911133, + "step": 7977 + }, + { + "epoch": 0.92, + "learning_rate": 2.4441062858480628e-08, + "logits/chosen": -2.579969644546509, + "logits/rejected": -2.5049846172332764, + "logps/chosen": -230.22581481933594, + "logps/rejected": -316.4233703613281, + "loss": 0.367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3679075241088867, + "rewards/margins": 1.6811161041259766, + "rewards/rejected": -2.0490236282348633, + "step": 7978 + }, + { + "epoch": 0.92, + "learning_rate": 2.4405946388856375e-08, + "logits/chosen": -1.7023515701293945, + "logits/rejected": -1.8329591751098633, + "logps/chosen": -390.2530212402344, + "logps/rejected": -209.48297119140625, + "loss": 0.5604, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8583643436431885, + "rewards/margins": 1.9075846672058105, + "rewards/rejected": -2.76594877243042, + "step": 7979 + }, + { + "epoch": 0.92, + "learning_rate": 2.4370829919232122e-08, + "logits/chosen": -2.1890316009521484, + "logits/rejected": -2.392117500305176, + "logps/chosen": -150.7616729736328, + "logps/rejected": -184.7576141357422, + "loss": 0.4306, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2377855777740479, + "rewards/margins": 2.1653151512145996, + "rewards/rejected": -3.4031007289886475, + "step": 7980 + }, + { + "epoch": 0.92, + "learning_rate": 2.4335713449607866e-08, + "logits/chosen": -2.1879711151123047, + "logits/rejected": -2.52791428565979, + "logps/chosen": -271.1788330078125, + "logps/rejected": -205.53952026367188, + "loss": 0.5761, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4141037464141846, + "rewards/margins": 0.8576000332832336, + "rewards/rejected": -2.2717037200927734, + "step": 7981 + }, + { + "epoch": 0.92, + "learning_rate": 2.430059697998361e-08, + "logits/chosen": -2.0124831199645996, + "logits/rejected": -2.206648349761963, + "logps/chosen": -416.4339904785156, + "logps/rejected": -281.56683349609375, + "loss": 1.0432, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9984142780303955, + "rewards/margins": 1.4513847827911377, + "rewards/rejected": -2.449799060821533, + "step": 7982 + }, + { + "epoch": 0.92, + "learning_rate": 2.4265480510359357e-08, + "logits/chosen": -2.2459805011749268, + "logits/rejected": -2.2001824378967285, + "logps/chosen": -254.1149444580078, + "logps/rejected": -254.16146850585938, + "loss": 0.2952, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6748419404029846, + "rewards/margins": 1.8520872592926025, + "rewards/rejected": -2.5269293785095215, + "step": 7983 + }, + { + "epoch": 0.92, + "learning_rate": 2.42303640407351e-08, + "logits/chosen": -2.243138074874878, + "logits/rejected": -2.322699546813965, + "logps/chosen": -415.220947265625, + "logps/rejected": -320.42755126953125, + "loss": 0.4226, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6283180713653564, + "rewards/margins": 3.229635000228882, + "rewards/rejected": -4.857953071594238, + "step": 7984 + }, + { + "epoch": 0.92, + "learning_rate": 2.4195247571110848e-08, + "logits/chosen": -2.051774501800537, + "logits/rejected": -2.040580987930298, + "logps/chosen": -366.4346618652344, + "logps/rejected": -391.1112976074219, + "loss": 0.2122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5587220191955566, + "rewards/margins": 2.6427206993103027, + "rewards/rejected": -3.2014427185058594, + "step": 7985 + }, + { + "epoch": 0.92, + "learning_rate": 2.4160131101486595e-08, + "logits/chosen": -2.5001060962677, + "logits/rejected": -2.5435738563537598, + "logps/chosen": -310.6750793457031, + "logps/rejected": -271.450439453125, + "loss": 0.6817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7711272239685059, + "rewards/margins": 1.674680471420288, + "rewards/rejected": -2.445807456970215, + "step": 7986 + }, + { + "epoch": 0.92, + "learning_rate": 2.412501463186234e-08, + "logits/chosen": -2.47019362449646, + "logits/rejected": -2.4218692779541016, + "logps/chosen": -243.45303344726562, + "logps/rejected": -292.1478271484375, + "loss": 0.6933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8055421710014343, + "rewards/margins": 1.6013330221176147, + "rewards/rejected": -2.4068751335144043, + "step": 7987 + }, + { + "epoch": 0.92, + "learning_rate": 2.4089898162238087e-08, + "logits/chosen": -2.0759572982788086, + "logits/rejected": -2.0378317832946777, + "logps/chosen": -185.98800659179688, + "logps/rejected": -223.36732482910156, + "loss": 0.613, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3433224856853485, + "rewards/margins": 1.00516939163208, + "rewards/rejected": -1.348491907119751, + "step": 7988 + }, + { + "epoch": 0.92, + "learning_rate": 2.4054781692613834e-08, + "logits/chosen": -2.0998573303222656, + "logits/rejected": -2.3632519245147705, + "logps/chosen": -313.5267028808594, + "logps/rejected": -251.0155029296875, + "loss": 0.4354, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.126469612121582, + "rewards/margins": 1.9325957298278809, + "rewards/rejected": -3.059065341949463, + "step": 7989 + }, + { + "epoch": 0.92, + "learning_rate": 2.401966522298958e-08, + "logits/chosen": -2.467822790145874, + "logits/rejected": -2.2740914821624756, + "logps/chosen": -204.64015197753906, + "logps/rejected": -271.980712890625, + "loss": 0.7758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9970902800559998, + "rewards/margins": 0.5758788585662842, + "rewards/rejected": -1.5729691982269287, + "step": 7990 + }, + { + "epoch": 0.92, + "learning_rate": 2.3984548753365325e-08, + "logits/chosen": -2.1266536712646484, + "logits/rejected": -2.0652976036071777, + "logps/chosen": -278.6253967285156, + "logps/rejected": -347.6343994140625, + "loss": 0.3174, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0103261470794678, + "rewards/margins": 3.1827075481414795, + "rewards/rejected": -4.193033695220947, + "step": 7991 + }, + { + "epoch": 0.92, + "learning_rate": 2.3949432283741072e-08, + "logits/chosen": -2.4808101654052734, + "logits/rejected": -2.3082432746887207, + "logps/chosen": -134.5464324951172, + "logps/rejected": -209.2574462890625, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2227286845445633, + "rewards/margins": 3.44327449798584, + "rewards/rejected": -3.220545768737793, + "step": 7992 + }, + { + "epoch": 0.92, + "learning_rate": 2.391431581411682e-08, + "logits/chosen": -2.610947608947754, + "logits/rejected": -2.428813934326172, + "logps/chosen": -432.90045166015625, + "logps/rejected": -325.150390625, + "loss": 0.5724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1244633197784424, + "rewards/margins": 1.763071894645691, + "rewards/rejected": -2.887535333633423, + "step": 7993 + }, + { + "epoch": 0.92, + "learning_rate": 2.3879199344492563e-08, + "logits/chosen": -2.027118682861328, + "logits/rejected": -2.0337979793548584, + "logps/chosen": -286.8166809082031, + "logps/rejected": -252.29684448242188, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7377948760986328, + "rewards/margins": 2.2767531871795654, + "rewards/rejected": -3.0145483016967773, + "step": 7994 + }, + { + "epoch": 0.92, + "learning_rate": 2.384408287486831e-08, + "logits/chosen": -2.2422051429748535, + "logits/rejected": -2.2678146362304688, + "logps/chosen": -241.46380615234375, + "logps/rejected": -231.72438049316406, + "loss": 0.6274, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015571504831314087, + "rewards/margins": 1.099024772644043, + "rewards/rejected": -1.0834531784057617, + "step": 7995 + }, + { + "epoch": 0.92, + "learning_rate": 2.3808966405244058e-08, + "logits/chosen": -1.7849007844924927, + "logits/rejected": -1.9037796258926392, + "logps/chosen": -382.7829284667969, + "logps/rejected": -266.2860107421875, + "loss": 0.1904, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06041821837425232, + "rewards/margins": 3.781071186065674, + "rewards/rejected": -3.84148907661438, + "step": 7996 + }, + { + "epoch": 0.92, + "learning_rate": 2.3773849935619805e-08, + "logits/chosen": -2.65255069732666, + "logits/rejected": -2.5772480964660645, + "logps/chosen": -302.5143127441406, + "logps/rejected": -289.709716796875, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4356674551963806, + "rewards/margins": 3.669508457183838, + "rewards/rejected": -4.105175971984863, + "step": 7997 + }, + { + "epoch": 0.92, + "learning_rate": 2.373873346599555e-08, + "logits/chosen": -2.3596601486206055, + "logits/rejected": -2.1404800415039062, + "logps/chosen": -277.390625, + "logps/rejected": -560.5557861328125, + "loss": 0.1903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48323917388916016, + "rewards/margins": 3.0429110527038574, + "rewards/rejected": -3.5261502265930176, + "step": 7998 + }, + { + "epoch": 0.92, + "learning_rate": 2.3703616996371296e-08, + "logits/chosen": -2.8508636951446533, + "logits/rejected": -2.8412904739379883, + "logps/chosen": -203.67747497558594, + "logps/rejected": -172.1082763671875, + "loss": 0.77, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6707607507705688, + "rewards/margins": 0.12187343090772629, + "rewards/rejected": -0.7926342487335205, + "step": 7999 + }, + { + "epoch": 0.92, + "learning_rate": 2.3668500526747044e-08, + "logits/chosen": -2.163043737411499, + "logits/rejected": -2.1286308765411377, + "logps/chosen": -383.9332275390625, + "logps/rejected": -463.80670166015625, + "loss": 0.3575, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3382974863052368, + "rewards/margins": 2.317645788192749, + "rewards/rejected": -2.6559433937072754, + "step": 8000 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -1.640053153038025, + "eval_logits/rejected": -1.516674280166626, + "eval_logps/chosen": -300.4789123535156, + "eval_logps/rejected": -275.2611389160156, + "eval_loss": 0.325385719537735, + "eval_rewards/accuracies": 0.8428571224212646, + "eval_rewards/chosen": -0.7798688411712646, + "eval_rewards/margins": 2.227503776550293, + "eval_rewards/rejected": -3.0073728561401367, + "eval_runtime": 24.4363, + "eval_samples_per_second": 2.865, + "eval_steps_per_second": 1.432, + "step": 8000 + }, + { + "epoch": 0.92, + "learning_rate": 2.3633384057122787e-08, + "logits/chosen": -1.727211356163025, + "logits/rejected": -2.0669326782226562, + "logps/chosen": -294.05462646484375, + "logps/rejected": -271.88385009765625, + "loss": 1.088, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.3197848796844482, + "rewards/margins": 1.4044333696365356, + "rewards/rejected": -4.724218368530273, + "step": 8001 + }, + { + "epoch": 0.92, + "learning_rate": 2.3598267587498535e-08, + "logits/chosen": -2.7395362854003906, + "logits/rejected": -2.725303888320923, + "logps/chosen": -348.44647216796875, + "logps/rejected": -301.9689025878906, + "loss": 0.4267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23994404077529907, + "rewards/margins": 2.335481882095337, + "rewards/rejected": -2.5754261016845703, + "step": 8002 + }, + { + "epoch": 0.92, + "learning_rate": 2.3563151117874282e-08, + "logits/chosen": -2.4385030269622803, + "logits/rejected": -2.5888333320617676, + "logps/chosen": -194.12388610839844, + "logps/rejected": -187.42337036132812, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3142239451408386, + "rewards/margins": 1.0447605848312378, + "rewards/rejected": -1.3589844703674316, + "step": 8003 + }, + { + "epoch": 0.92, + "learning_rate": 2.352803464825003e-08, + "logits/chosen": -2.0804443359375, + "logits/rejected": -2.2046918869018555, + "logps/chosen": -446.28558349609375, + "logps/rejected": -433.6258544921875, + "loss": 0.2474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0712069645524025, + "rewards/margins": 3.1128790378570557, + "rewards/rejected": -3.1840858459472656, + "step": 8004 + }, + { + "epoch": 0.92, + "learning_rate": 2.3492918178625773e-08, + "logits/chosen": -2.4699134826660156, + "logits/rejected": -2.7598936557769775, + "logps/chosen": -240.19021606445312, + "logps/rejected": -180.3365478515625, + "loss": 1.0861, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.124800205230713, + "rewards/margins": 0.9764668941497803, + "rewards/rejected": -3.101267099380493, + "step": 8005 + }, + { + "epoch": 0.92, + "learning_rate": 2.345780170900152e-08, + "logits/chosen": -2.6472973823547363, + "logits/rejected": -2.3627712726593018, + "logps/chosen": -241.35162353515625, + "logps/rejected": -289.7120666503906, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3711957931518555, + "rewards/margins": 3.117471933364868, + "rewards/rejected": -4.488668441772461, + "step": 8006 + }, + { + "epoch": 0.92, + "learning_rate": 2.3422685239377268e-08, + "logits/chosen": -2.163382053375244, + "logits/rejected": -2.517810106277466, + "logps/chosen": -390.4927062988281, + "logps/rejected": -299.3760986328125, + "loss": 0.5335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6440669894218445, + "rewards/margins": 1.071441411972046, + "rewards/rejected": -1.7155084609985352, + "step": 8007 + }, + { + "epoch": 0.92, + "learning_rate": 2.3387568769753015e-08, + "logits/chosen": -2.7821602821350098, + "logits/rejected": -2.6666059494018555, + "logps/chosen": -245.72325134277344, + "logps/rejected": -127.10804748535156, + "loss": 0.3445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8432515263557434, + "rewards/margins": 1.8702940940856934, + "rewards/rejected": -2.713545799255371, + "step": 8008 + }, + { + "epoch": 0.92, + "learning_rate": 2.335245230012876e-08, + "logits/chosen": -1.88661527633667, + "logits/rejected": -1.8927161693572998, + "logps/chosen": -308.1923828125, + "logps/rejected": -289.492919921875, + "loss": 0.3321, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5269761085510254, + "rewards/margins": 1.6604819297790527, + "rewards/rejected": -2.187458038330078, + "step": 8009 + }, + { + "epoch": 0.92, + "learning_rate": 2.3317335830504506e-08, + "logits/chosen": -2.4831485748291016, + "logits/rejected": -2.2009918689727783, + "logps/chosen": -114.97100830078125, + "logps/rejected": -206.8245849609375, + "loss": 0.6502, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.343667984008789, + "rewards/margins": 2.1097264289855957, + "rewards/rejected": -3.4533944129943848, + "step": 8010 + }, + { + "epoch": 0.92, + "learning_rate": 2.3282219360880253e-08, + "logits/chosen": -2.31575870513916, + "logits/rejected": -2.1218888759613037, + "logps/chosen": -218.78271484375, + "logps/rejected": -310.2528076171875, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4712557792663574, + "rewards/margins": 4.6177287101745605, + "rewards/rejected": -5.088984489440918, + "step": 8011 + }, + { + "epoch": 0.92, + "learning_rate": 2.3247102891255997e-08, + "logits/chosen": -2.7192981243133545, + "logits/rejected": -2.5992722511291504, + "logps/chosen": -287.84130859375, + "logps/rejected": -269.2106628417969, + "loss": 0.3391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9499363303184509, + "rewards/margins": 1.923132061958313, + "rewards/rejected": -2.873068332672119, + "step": 8012 + }, + { + "epoch": 0.92, + "learning_rate": 2.3211986421631744e-08, + "logits/chosen": -2.477170705795288, + "logits/rejected": -2.190561056137085, + "logps/chosen": -131.58871459960938, + "logps/rejected": -244.78155517578125, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.551000952720642, + "rewards/margins": 3.24194598197937, + "rewards/rejected": -4.792946815490723, + "step": 8013 + }, + { + "epoch": 0.92, + "learning_rate": 2.3176869952007492e-08, + "logits/chosen": -2.588256597518921, + "logits/rejected": -2.8125662803649902, + "logps/chosen": -340.9859619140625, + "logps/rejected": -183.24667358398438, + "loss": 0.8817, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5597469806671143, + "rewards/margins": 0.370612233877182, + "rewards/rejected": -0.9303592443466187, + "step": 8014 + }, + { + "epoch": 0.92, + "learning_rate": 2.314175348238324e-08, + "logits/chosen": -1.9166234731674194, + "logits/rejected": -2.3260443210601807, + "logps/chosen": -413.12750244140625, + "logps/rejected": -354.782958984375, + "loss": 1.3851, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.041051149368286, + "rewards/margins": 0.5164900422096252, + "rewards/rejected": -2.5575413703918457, + "step": 8015 + }, + { + "epoch": 0.92, + "learning_rate": 2.3106637012758983e-08, + "logits/chosen": -2.500204563140869, + "logits/rejected": -2.3705828189849854, + "logps/chosen": -266.44091796875, + "logps/rejected": -349.49560546875, + "loss": 0.4118, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2819578647613525, + "rewards/margins": 2.247570037841797, + "rewards/rejected": -3.5295281410217285, + "step": 8016 + }, + { + "epoch": 0.92, + "learning_rate": 2.307152054313473e-08, + "logits/chosen": -1.926844835281372, + "logits/rejected": -1.89159095287323, + "logps/chosen": -338.97467041015625, + "logps/rejected": -315.851318359375, + "loss": 0.2919, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0047550201416016, + "rewards/margins": 1.827989935874939, + "rewards/rejected": -2.83274507522583, + "step": 8017 + }, + { + "epoch": 0.92, + "learning_rate": 2.3036404073510477e-08, + "logits/chosen": -2.2141470909118652, + "logits/rejected": -2.5589869022369385, + "logps/chosen": -337.39276123046875, + "logps/rejected": -248.5738983154297, + "loss": 0.3093, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4035731554031372, + "rewards/margins": 1.8142328262329102, + "rewards/rejected": -3.217805862426758, + "step": 8018 + }, + { + "epoch": 0.92, + "learning_rate": 2.300128760388622e-08, + "logits/chosen": -2.215358018875122, + "logits/rejected": -2.4610724449157715, + "logps/chosen": -221.86241149902344, + "logps/rejected": -272.8912353515625, + "loss": 0.4026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9760676026344299, + "rewards/margins": 1.4020559787750244, + "rewards/rejected": -2.3781235218048096, + "step": 8019 + }, + { + "epoch": 0.92, + "learning_rate": 2.296617113426197e-08, + "logits/chosen": -2.4768128395080566, + "logits/rejected": -2.4153151512145996, + "logps/chosen": -422.77349853515625, + "logps/rejected": -395.3193664550781, + "loss": 0.3395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8190810680389404, + "rewards/margins": 2.8413681983947754, + "rewards/rejected": -3.660449504852295, + "step": 8020 + }, + { + "epoch": 0.92, + "learning_rate": 2.2931054664637716e-08, + "logits/chosen": -1.9208064079284668, + "logits/rejected": -2.2290852069854736, + "logps/chosen": -552.4647827148438, + "logps/rejected": -314.0313720703125, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04554721713066101, + "rewards/margins": 2.9588541984558105, + "rewards/rejected": -2.913306951522827, + "step": 8021 + }, + { + "epoch": 0.92, + "learning_rate": 2.2895938195013463e-08, + "logits/chosen": -2.460216999053955, + "logits/rejected": -2.3924832344055176, + "logps/chosen": -233.3521728515625, + "logps/rejected": -269.45721435546875, + "loss": 0.1221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5381984114646912, + "rewards/margins": 4.595158576965332, + "rewards/rejected": -5.133357048034668, + "step": 8022 + }, + { + "epoch": 0.92, + "learning_rate": 2.2860821725389207e-08, + "logits/chosen": -2.3799757957458496, + "logits/rejected": -2.606182813644409, + "logps/chosen": -285.5190734863281, + "logps/rejected": -307.9370422363281, + "loss": 0.1822, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1849431991577148, + "rewards/margins": 3.4819560050964355, + "rewards/rejected": -4.666899681091309, + "step": 8023 + }, + { + "epoch": 0.93, + "learning_rate": 2.2825705255764954e-08, + "logits/chosen": -2.034059762954712, + "logits/rejected": -2.0152504444122314, + "logps/chosen": -675.531494140625, + "logps/rejected": -499.2812805175781, + "loss": 0.5663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3552458882331848, + "rewards/margins": 1.4822595119476318, + "rewards/rejected": -1.8375053405761719, + "step": 8024 + }, + { + "epoch": 0.93, + "learning_rate": 2.27905887861407e-08, + "logits/chosen": -2.559624195098877, + "logits/rejected": -2.550332546234131, + "logps/chosen": -291.66650390625, + "logps/rejected": -379.0856018066406, + "loss": 0.1817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8571634292602539, + "rewards/margins": 3.9580836296081543, + "rewards/rejected": -4.815247058868408, + "step": 8025 + }, + { + "epoch": 0.93, + "learning_rate": 2.275547231651645e-08, + "logits/chosen": -1.4868243932724, + "logits/rejected": -1.4415820837020874, + "logps/chosen": -176.86566162109375, + "logps/rejected": -184.55641174316406, + "loss": 0.532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12481652200222015, + "rewards/margins": 1.0970962047576904, + "rewards/rejected": -1.2219127416610718, + "step": 8026 + }, + { + "epoch": 0.93, + "learning_rate": 2.2720355846892193e-08, + "logits/chosen": -2.803255319595337, + "logits/rejected": -2.8224802017211914, + "logps/chosen": -152.8406982421875, + "logps/rejected": -157.11053466796875, + "loss": 0.4168, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9498602747917175, + "rewards/margins": 1.767000675201416, + "rewards/rejected": -2.716860771179199, + "step": 8027 + }, + { + "epoch": 0.93, + "learning_rate": 2.268523937726794e-08, + "logits/chosen": -2.314789056777954, + "logits/rejected": -2.362908363342285, + "logps/chosen": -244.14321899414062, + "logps/rejected": -221.86309814453125, + "loss": 0.3945, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7744793891906738, + "rewards/margins": 2.079401731491089, + "rewards/rejected": -2.8538811206817627, + "step": 8028 + }, + { + "epoch": 0.93, + "learning_rate": 2.2650122907643687e-08, + "logits/chosen": -2.234675407409668, + "logits/rejected": -2.5412240028381348, + "logps/chosen": -462.1874084472656, + "logps/rejected": -252.2324981689453, + "loss": 0.2896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5653849840164185, + "rewards/margins": 2.390014171600342, + "rewards/rejected": -2.9553990364074707, + "step": 8029 + }, + { + "epoch": 0.93, + "learning_rate": 2.2615006438019428e-08, + "logits/chosen": -2.86687970161438, + "logits/rejected": -2.8236544132232666, + "logps/chosen": -282.0748291015625, + "logps/rejected": -336.0312194824219, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7125049233436584, + "rewards/margins": 2.994574546813965, + "rewards/rejected": -3.7070796489715576, + "step": 8030 + }, + { + "epoch": 0.93, + "learning_rate": 2.2579889968395175e-08, + "logits/chosen": -2.3407578468322754, + "logits/rejected": -2.2612578868865967, + "logps/chosen": -374.48101806640625, + "logps/rejected": -312.0296630859375, + "loss": 0.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8881957530975342, + "rewards/margins": 2.040003776550293, + "rewards/rejected": -2.928199291229248, + "step": 8031 + }, + { + "epoch": 0.93, + "learning_rate": 2.2544773498770922e-08, + "logits/chosen": -2.4454245567321777, + "logits/rejected": -2.4936065673828125, + "logps/chosen": -210.84996032714844, + "logps/rejected": -247.21685791015625, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7587481737136841, + "rewards/margins": 2.8205552101135254, + "rewards/rejected": -3.579303503036499, + "step": 8032 + }, + { + "epoch": 0.93, + "learning_rate": 2.2509657029146666e-08, + "logits/chosen": -2.092776298522949, + "logits/rejected": -2.4064321517944336, + "logps/chosen": -351.5941162109375, + "logps/rejected": -282.9680480957031, + "loss": 0.3042, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9990458488464355, + "rewards/margins": 1.6931586265563965, + "rewards/rejected": -2.692204475402832, + "step": 8033 + }, + { + "epoch": 0.93, + "learning_rate": 2.2474540559522413e-08, + "logits/chosen": -2.3127365112304688, + "logits/rejected": -1.93901789188385, + "logps/chosen": -112.932373046875, + "logps/rejected": -271.2450866699219, + "loss": 0.2214, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1971675157546997, + "rewards/margins": 3.8987200260162354, + "rewards/rejected": -5.095887184143066, + "step": 8034 + }, + { + "epoch": 0.93, + "learning_rate": 2.243942408989816e-08, + "logits/chosen": -2.1176199913024902, + "logits/rejected": -2.16212797164917, + "logps/chosen": -383.971435546875, + "logps/rejected": -401.117919921875, + "loss": 0.6642, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1816565990447998, + "rewards/margins": 2.0282604694366455, + "rewards/rejected": -3.2099170684814453, + "step": 8035 + }, + { + "epoch": 0.93, + "learning_rate": 2.2404307620273904e-08, + "logits/chosen": -2.6315064430236816, + "logits/rejected": -3.0080220699310303, + "logps/chosen": -254.18472290039062, + "logps/rejected": -203.9610595703125, + "loss": 0.6073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8916068077087402, + "rewards/margins": 1.6405951976776123, + "rewards/rejected": -2.5322020053863525, + "step": 8036 + }, + { + "epoch": 0.93, + "learning_rate": 2.236919115064965e-08, + "logits/chosen": -2.9139833450317383, + "logits/rejected": -2.925522565841675, + "logps/chosen": -125.62236785888672, + "logps/rejected": -139.19960021972656, + "loss": 0.3658, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.530962347984314, + "rewards/margins": 2.6651670932769775, + "rewards/rejected": -3.196129560470581, + "step": 8037 + }, + { + "epoch": 0.93, + "learning_rate": 2.23340746810254e-08, + "logits/chosen": -2.0718331336975098, + "logits/rejected": -1.992037057876587, + "logps/chosen": -288.14813232421875, + "logps/rejected": -261.36175537109375, + "loss": 0.8783, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9949958920478821, + "rewards/margins": 0.9714283347129822, + "rewards/rejected": -1.9664242267608643, + "step": 8038 + }, + { + "epoch": 0.93, + "learning_rate": 2.2298958211401146e-08, + "logits/chosen": -2.001793622970581, + "logits/rejected": -2.0495264530181885, + "logps/chosen": -394.26611328125, + "logps/rejected": -359.25091552734375, + "loss": 0.3853, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0583388805389404, + "rewards/margins": 2.5483763217926025, + "rewards/rejected": -3.606715202331543, + "step": 8039 + }, + { + "epoch": 0.93, + "learning_rate": 2.226384174177689e-08, + "logits/chosen": -2.414942502975464, + "logits/rejected": -2.3972887992858887, + "logps/chosen": -196.10137939453125, + "logps/rejected": -212.43124389648438, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2638244032859802, + "rewards/margins": 3.2913384437561035, + "rewards/rejected": -3.0275137424468994, + "step": 8040 + }, + { + "epoch": 0.93, + "learning_rate": 2.2228725272152637e-08, + "logits/chosen": -2.3400583267211914, + "logits/rejected": -2.4855797290802, + "logps/chosen": -426.43463134765625, + "logps/rejected": -284.4242248535156, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2535637617111206, + "rewards/margins": 2.233203887939453, + "rewards/rejected": -3.4867677688598633, + "step": 8041 + }, + { + "epoch": 0.93, + "learning_rate": 2.2193608802528385e-08, + "logits/chosen": -2.484945058822632, + "logits/rejected": -2.559516668319702, + "logps/chosen": -213.8190155029297, + "logps/rejected": -164.20587158203125, + "loss": 0.1303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37957948446273804, + "rewards/margins": 2.36370587348938, + "rewards/rejected": -2.7432854175567627, + "step": 8042 + }, + { + "epoch": 0.93, + "learning_rate": 2.2158492332904132e-08, + "logits/chosen": -1.882157564163208, + "logits/rejected": -2.081753730773926, + "logps/chosen": -354.047607421875, + "logps/rejected": -341.27801513671875, + "loss": 0.1765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6901981234550476, + "rewards/margins": 2.620514154434204, + "rewards/rejected": -3.3107123374938965, + "step": 8043 + }, + { + "epoch": 0.93, + "learning_rate": 2.2123375863279876e-08, + "logits/chosen": -2.6991090774536133, + "logits/rejected": -2.744072437286377, + "logps/chosen": -427.2034606933594, + "logps/rejected": -368.8812255859375, + "loss": 0.4297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.571763038635254, + "rewards/margins": 2.188021421432495, + "rewards/rejected": -3.759784460067749, + "step": 8044 + }, + { + "epoch": 0.93, + "learning_rate": 2.2088259393655623e-08, + "logits/chosen": -2.6490492820739746, + "logits/rejected": -2.7385873794555664, + "logps/chosen": -193.2989959716797, + "logps/rejected": -197.6710205078125, + "loss": 0.4834, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.203658938407898, + "rewards/margins": 2.095623016357422, + "rewards/rejected": -3.2992818355560303, + "step": 8045 + }, + { + "epoch": 0.93, + "learning_rate": 2.205314292403137e-08, + "logits/chosen": -2.0773563385009766, + "logits/rejected": -1.988635540008545, + "logps/chosen": -237.49069213867188, + "logps/rejected": -207.83311462402344, + "loss": 0.5458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7556308507919312, + "rewards/margins": 1.1622142791748047, + "rewards/rejected": -1.9178452491760254, + "step": 8046 + }, + { + "epoch": 0.93, + "learning_rate": 2.2018026454407114e-08, + "logits/chosen": -2.389038562774658, + "logits/rejected": -2.1082701683044434, + "logps/chosen": -296.8581848144531, + "logps/rejected": -314.30047607421875, + "loss": 0.2646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.555383563041687, + "rewards/margins": 3.2236342430114746, + "rewards/rejected": -3.779017925262451, + "step": 8047 + }, + { + "epoch": 0.93, + "learning_rate": 2.198290998478286e-08, + "logits/chosen": -1.884583830833435, + "logits/rejected": -2.0225348472595215, + "logps/chosen": -403.97613525390625, + "logps/rejected": -345.47882080078125, + "loss": 0.4132, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17545121908187866, + "rewards/margins": 1.5633718967437744, + "rewards/rejected": -1.3879207372665405, + "step": 8048 + }, + { + "epoch": 0.93, + "learning_rate": 2.194779351515861e-08, + "logits/chosen": -1.8191460371017456, + "logits/rejected": -1.6300086975097656, + "logps/chosen": -252.63555908203125, + "logps/rejected": -278.56988525390625, + "loss": 0.4764, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2282061576843262, + "rewards/margins": 1.2577382326126099, + "rewards/rejected": -2.4859445095062256, + "step": 8049 + }, + { + "epoch": 0.93, + "learning_rate": 2.1912677045534356e-08, + "logits/chosen": -1.6136863231658936, + "logits/rejected": -1.9428294897079468, + "logps/chosen": -716.8534545898438, + "logps/rejected": -628.8558959960938, + "loss": 0.2293, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3059358596801758, + "rewards/margins": 2.2746922969818115, + "rewards/rejected": -3.580627918243408, + "step": 8050 + }, + { + "epoch": 0.93, + "learning_rate": 2.18775605759101e-08, + "logits/chosen": -2.5423707962036133, + "logits/rejected": -2.4410624504089355, + "logps/chosen": -130.41712951660156, + "logps/rejected": -205.40316772460938, + "loss": 0.6496, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0189788341522217, + "rewards/margins": 1.2047953605651855, + "rewards/rejected": -2.2237741947174072, + "step": 8051 + }, + { + "epoch": 0.93, + "learning_rate": 2.1842444106285847e-08, + "logits/chosen": -2.328059196472168, + "logits/rejected": -2.3724663257598877, + "logps/chosen": -177.95616149902344, + "logps/rejected": -621.1369018554688, + "loss": 0.4182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7964255213737488, + "rewards/margins": 3.947221040725708, + "rewards/rejected": -4.743646621704102, + "step": 8052 + }, + { + "epoch": 0.93, + "learning_rate": 2.1807327636661594e-08, + "logits/chosen": -2.154226064682007, + "logits/rejected": -2.041048765182495, + "logps/chosen": -376.0736083984375, + "logps/rejected": -420.39251708984375, + "loss": 0.3729, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.160469651222229, + "rewards/margins": 2.70066499710083, + "rewards/rejected": -3.8611347675323486, + "step": 8053 + }, + { + "epoch": 0.93, + "learning_rate": 2.1772211167037338e-08, + "logits/chosen": -2.8919692039489746, + "logits/rejected": -2.7526605129241943, + "logps/chosen": -203.51918029785156, + "logps/rejected": -240.5071258544922, + "loss": 0.2761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7306939959526062, + "rewards/margins": 1.5878710746765137, + "rewards/rejected": -2.3185651302337646, + "step": 8054 + }, + { + "epoch": 0.93, + "learning_rate": 2.1737094697413085e-08, + "logits/chosen": -2.81131911277771, + "logits/rejected": -2.681817054748535, + "logps/chosen": -256.5846252441406, + "logps/rejected": -202.3837890625, + "loss": 0.475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9550582766532898, + "rewards/margins": 2.5237913131713867, + "rewards/rejected": -3.478849411010742, + "step": 8055 + }, + { + "epoch": 0.93, + "learning_rate": 2.1701978227788833e-08, + "logits/chosen": -2.972468376159668, + "logits/rejected": -2.833733558654785, + "logps/chosen": -487.737060546875, + "logps/rejected": -364.95562744140625, + "loss": 0.4915, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3167140483856201, + "rewards/margins": 1.9386610984802246, + "rewards/rejected": -3.2553751468658447, + "step": 8056 + }, + { + "epoch": 0.93, + "learning_rate": 2.166686175816458e-08, + "logits/chosen": -2.3145089149475098, + "logits/rejected": -2.233398675918579, + "logps/chosen": -207.550048828125, + "logps/rejected": -305.73797607421875, + "loss": 0.4168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4056170582771301, + "rewards/margins": 2.181347370147705, + "rewards/rejected": -2.5869643688201904, + "step": 8057 + }, + { + "epoch": 0.93, + "learning_rate": 2.1631745288540324e-08, + "logits/chosen": -2.680022954940796, + "logits/rejected": -2.6437878608703613, + "logps/chosen": -462.6702880859375, + "logps/rejected": -313.98236083984375, + "loss": 0.3473, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3672430515289307, + "rewards/margins": 3.466883659362793, + "rewards/rejected": -4.8341264724731445, + "step": 8058 + }, + { + "epoch": 0.93, + "learning_rate": 2.159662881891607e-08, + "logits/chosen": -2.746063709259033, + "logits/rejected": -2.710871696472168, + "logps/chosen": -273.6298522949219, + "logps/rejected": -231.34129333496094, + "loss": 0.1918, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40894290804862976, + "rewards/margins": 3.7252378463745117, + "rewards/rejected": -4.134181022644043, + "step": 8059 + }, + { + "epoch": 0.93, + "learning_rate": 2.1561512349291818e-08, + "logits/chosen": -2.3712267875671387, + "logits/rejected": -2.2425036430358887, + "logps/chosen": -322.791015625, + "logps/rejected": -284.43182373046875, + "loss": 0.3536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7194404006004333, + "rewards/margins": 1.1971780061721802, + "rewards/rejected": -1.9166184663772583, + "step": 8060 + }, + { + "epoch": 0.93, + "learning_rate": 2.1526395879667562e-08, + "logits/chosen": -1.9884893894195557, + "logits/rejected": -2.269681692123413, + "logps/chosen": -440.521728515625, + "logps/rejected": -222.27218627929688, + "loss": 0.4086, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2748217582702637, + "rewards/margins": 1.6841208934783936, + "rewards/rejected": -2.9589426517486572, + "step": 8061 + }, + { + "epoch": 0.93, + "learning_rate": 2.149127941004331e-08, + "logits/chosen": -1.609856367111206, + "logits/rejected": -1.681458592414856, + "logps/chosen": -240.2979736328125, + "logps/rejected": -228.07565307617188, + "loss": 0.2887, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4940998554229736, + "rewards/margins": 1.4926273822784424, + "rewards/rejected": -2.986727237701416, + "step": 8062 + }, + { + "epoch": 0.93, + "learning_rate": 2.1456162940419057e-08, + "logits/chosen": -2.479255437850952, + "logits/rejected": -2.380574941635132, + "logps/chosen": -250.5419158935547, + "logps/rejected": -297.61199951171875, + "loss": 0.5539, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5070276260375977, + "rewards/margins": 1.934414267539978, + "rewards/rejected": -3.4414422512054443, + "step": 8063 + }, + { + "epoch": 0.93, + "learning_rate": 2.1421046470794804e-08, + "logits/chosen": -3.0077662467956543, + "logits/rejected": -2.927079439163208, + "logps/chosen": -270.6216125488281, + "logps/rejected": -214.67498779296875, + "loss": 0.1198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6355464458465576, + "rewards/margins": 3.755626678466797, + "rewards/rejected": -4.391173362731934, + "step": 8064 + }, + { + "epoch": 0.93, + "learning_rate": 2.1385930001170548e-08, + "logits/chosen": -2.0152359008789062, + "logits/rejected": -2.1505253314971924, + "logps/chosen": -550.4222412109375, + "logps/rejected": -529.86962890625, + "loss": 0.1359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.66627436876297, + "rewards/margins": 3.3316187858581543, + "rewards/rejected": -3.9978930950164795, + "step": 8065 + }, + { + "epoch": 0.93, + "learning_rate": 2.1350813531546295e-08, + "logits/chosen": -2.2665247917175293, + "logits/rejected": -1.9935152530670166, + "logps/chosen": -260.3659362792969, + "logps/rejected": -253.87255859375, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.877005398273468, + "rewards/margins": 2.9879140853881836, + "rewards/rejected": -3.864919662475586, + "step": 8066 + }, + { + "epoch": 0.93, + "learning_rate": 2.1315697061922042e-08, + "logits/chosen": -1.5434281826019287, + "logits/rejected": -1.9740060567855835, + "logps/chosen": -400.4876403808594, + "logps/rejected": -373.4506530761719, + "loss": 1.0116, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1919230222702026, + "rewards/margins": 1.0025911331176758, + "rewards/rejected": -2.194514036178589, + "step": 8067 + }, + { + "epoch": 0.93, + "learning_rate": 2.128058059229779e-08, + "logits/chosen": -2.102769374847412, + "logits/rejected": -2.105457067489624, + "logps/chosen": -243.809814453125, + "logps/rejected": -276.90533447265625, + "loss": 0.5818, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6462103128433228, + "rewards/margins": 1.441040277481079, + "rewards/rejected": -3.0872507095336914, + "step": 8068 + }, + { + "epoch": 0.93, + "learning_rate": 2.1245464122673533e-08, + "logits/chosen": -2.734963893890381, + "logits/rejected": -2.653597593307495, + "logps/chosen": -244.30760192871094, + "logps/rejected": -281.0797119140625, + "loss": 0.1595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.660015881061554, + "rewards/margins": 2.4097836017608643, + "rewards/rejected": -3.0697994232177734, + "step": 8069 + }, + { + "epoch": 0.93, + "learning_rate": 2.121034765304928e-08, + "logits/chosen": -2.28670072555542, + "logits/rejected": -2.4381861686706543, + "logps/chosen": -482.25799560546875, + "logps/rejected": -501.3110046386719, + "loss": 0.2651, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.627545177936554, + "rewards/margins": 1.758797287940979, + "rewards/rejected": -2.3863425254821777, + "step": 8070 + }, + { + "epoch": 0.93, + "learning_rate": 2.1175231183425028e-08, + "logits/chosen": -2.820034980773926, + "logits/rejected": -2.7260613441467285, + "logps/chosen": -242.08897399902344, + "logps/rejected": -245.12722778320312, + "loss": 0.386, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4009783267974854, + "rewards/margins": 2.7594261169433594, + "rewards/rejected": -4.160404205322266, + "step": 8071 + }, + { + "epoch": 0.93, + "learning_rate": 2.1140114713800772e-08, + "logits/chosen": -2.007713794708252, + "logits/rejected": -2.09729266166687, + "logps/chosen": -287.074462890625, + "logps/rejected": -229.90562438964844, + "loss": 0.3795, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.031831979751587, + "rewards/margins": 1.870666742324829, + "rewards/rejected": -2.902498722076416, + "step": 8072 + }, + { + "epoch": 0.93, + "learning_rate": 2.110499824417652e-08, + "logits/chosen": -1.3121700286865234, + "logits/rejected": -1.7124099731445312, + "logps/chosen": -469.056396484375, + "logps/rejected": -307.59967041015625, + "loss": 0.3823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4380505681037903, + "rewards/margins": 1.76896333694458, + "rewards/rejected": -2.2070138454437256, + "step": 8073 + }, + { + "epoch": 0.93, + "learning_rate": 2.1069881774552266e-08, + "logits/chosen": -2.1536691188812256, + "logits/rejected": -2.550309896469116, + "logps/chosen": -415.66961669921875, + "logps/rejected": -181.7409210205078, + "loss": 1.0523, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8653243780136108, + "rewards/margins": 0.5866624116897583, + "rewards/rejected": -1.4519867897033691, + "step": 8074 + }, + { + "epoch": 0.93, + "learning_rate": 2.1034765304928014e-08, + "logits/chosen": -1.7016233205795288, + "logits/rejected": -2.0704338550567627, + "logps/chosen": -292.2049865722656, + "logps/rejected": -243.2834014892578, + "loss": 0.4861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2336004227399826, + "rewards/margins": 0.9238179326057434, + "rewards/rejected": -1.1574184894561768, + "step": 8075 + }, + { + "epoch": 0.93, + "learning_rate": 2.0999648835303758e-08, + "logits/chosen": -2.3975284099578857, + "logits/rejected": -2.42624568939209, + "logps/chosen": -371.28314208984375, + "logps/rejected": -307.4358215332031, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2839590609073639, + "rewards/margins": 5.253778457641602, + "rewards/rejected": -5.537736892700195, + "step": 8076 + }, + { + "epoch": 0.93, + "learning_rate": 2.0964532365679505e-08, + "logits/chosen": -2.666745662689209, + "logits/rejected": -2.524859666824341, + "logps/chosen": -258.08465576171875, + "logps/rejected": -170.0246124267578, + "loss": 0.2021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4041402339935303, + "rewards/margins": 2.0326151847839355, + "rewards/rejected": -3.4367551803588867, + "step": 8077 + }, + { + "epoch": 0.93, + "learning_rate": 2.0929415896055245e-08, + "logits/chosen": -2.645129442214966, + "logits/rejected": -2.587233781814575, + "logps/chosen": -236.84938049316406, + "logps/rejected": -292.9193420410156, + "loss": 0.2759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.840434730052948, + "rewards/margins": 2.151762008666992, + "rewards/rejected": -2.992196559906006, + "step": 8078 + }, + { + "epoch": 0.93, + "learning_rate": 2.0894299426430993e-08, + "logits/chosen": -2.5167882442474365, + "logits/rejected": -2.5039596557617188, + "logps/chosen": -407.3101806640625, + "logps/rejected": -352.7266845703125, + "loss": 0.7808, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.528185486793518, + "rewards/margins": 0.9134079217910767, + "rewards/rejected": -2.441593647003174, + "step": 8079 + }, + { + "epoch": 0.93, + "learning_rate": 2.085918295680674e-08, + "logits/chosen": -2.160578727722168, + "logits/rejected": -2.3333749771118164, + "logps/chosen": -215.38003540039062, + "logps/rejected": -131.12542724609375, + "loss": 1.1227, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0718060731887817, + "rewards/margins": 0.7409054040908813, + "rewards/rejected": -1.8127115964889526, + "step": 8080 + }, + { + "epoch": 0.93, + "learning_rate": 2.0824066487182487e-08, + "logits/chosen": -2.4746174812316895, + "logits/rejected": -2.381664752960205, + "logps/chosen": -222.2865753173828, + "logps/rejected": -253.0215606689453, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2554761469364166, + "rewards/margins": 1.899051547050476, + "rewards/rejected": -2.1545276641845703, + "step": 8081 + }, + { + "epoch": 0.93, + "learning_rate": 2.078895001755823e-08, + "logits/chosen": -2.6063108444213867, + "logits/rejected": -2.732450485229492, + "logps/chosen": -300.7806701660156, + "logps/rejected": -303.85821533203125, + "loss": 0.2678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9179660677909851, + "rewards/margins": 2.499382972717285, + "rewards/rejected": -3.417349338531494, + "step": 8082 + }, + { + "epoch": 0.93, + "learning_rate": 2.0753833547933978e-08, + "logits/chosen": -2.207839012145996, + "logits/rejected": -2.1748733520507812, + "logps/chosen": -199.78018188476562, + "logps/rejected": -210.58180236816406, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.84557044506073, + "rewards/margins": 2.1545140743255615, + "rewards/rejected": -3.000084400177002, + "step": 8083 + }, + { + "epoch": 0.93, + "learning_rate": 2.0718717078309725e-08, + "logits/chosen": -2.1222310066223145, + "logits/rejected": -2.104604482650757, + "logps/chosen": -222.49090576171875, + "logps/rejected": -243.8779754638672, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46622729301452637, + "rewards/margins": 1.8904234170913696, + "rewards/rejected": -2.3566508293151855, + "step": 8084 + }, + { + "epoch": 0.93, + "learning_rate": 2.0683600608685473e-08, + "logits/chosen": -2.538891315460205, + "logits/rejected": -2.2966291904449463, + "logps/chosen": -378.77850341796875, + "logps/rejected": -455.6708984375, + "loss": 0.7638, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9497359991073608, + "rewards/margins": 1.2523490190505981, + "rewards/rejected": -2.202085018157959, + "step": 8085 + }, + { + "epoch": 0.93, + "learning_rate": 2.0648484139061217e-08, + "logits/chosen": -2.532944679260254, + "logits/rejected": -2.5076184272766113, + "logps/chosen": -218.25723266601562, + "logps/rejected": -184.46160888671875, + "loss": 0.5054, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8103346824645996, + "rewards/margins": 1.1364216804504395, + "rewards/rejected": -1.9467566013336182, + "step": 8086 + }, + { + "epoch": 0.93, + "learning_rate": 2.0613367669436964e-08, + "logits/chosen": -1.7146018743515015, + "logits/rejected": -1.9794139862060547, + "logps/chosen": -340.31219482421875, + "logps/rejected": -438.26544189453125, + "loss": 0.2903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.672619104385376, + "rewards/margins": 1.6638097763061523, + "rewards/rejected": -2.336428642272949, + "step": 8087 + }, + { + "epoch": 0.93, + "learning_rate": 2.057825119981271e-08, + "logits/chosen": -2.4784953594207764, + "logits/rejected": -2.334138870239258, + "logps/chosen": -331.834716796875, + "logps/rejected": -322.4658203125, + "loss": 0.6889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6242340803146362, + "rewards/margins": 2.4455466270446777, + "rewards/rejected": -3.0697808265686035, + "step": 8088 + }, + { + "epoch": 0.93, + "learning_rate": 2.0543134730188455e-08, + "logits/chosen": -1.906294822692871, + "logits/rejected": -2.3768417835235596, + "logps/chosen": -231.09661865234375, + "logps/rejected": -217.25648498535156, + "loss": 0.3761, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8508915305137634, + "rewards/margins": 2.0427627563476562, + "rewards/rejected": -2.8936541080474854, + "step": 8089 + }, + { + "epoch": 0.93, + "learning_rate": 2.0508018260564202e-08, + "logits/chosen": -2.636890172958374, + "logits/rejected": -2.4415411949157715, + "logps/chosen": -322.4605712890625, + "logps/rejected": -352.596435546875, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08544294536113739, + "rewards/margins": 4.0906291007995605, + "rewards/rejected": -4.176072120666504, + "step": 8090 + }, + { + "epoch": 0.93, + "learning_rate": 2.047290179093995e-08, + "logits/chosen": -2.473681688308716, + "logits/rejected": -2.6036882400512695, + "logps/chosen": -444.648681640625, + "logps/rejected": -299.16180419921875, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4225046634674072, + "rewards/margins": 3.1243205070495605, + "rewards/rejected": -4.546824932098389, + "step": 8091 + }, + { + "epoch": 0.93, + "learning_rate": 2.0437785321315697e-08, + "logits/chosen": -2.186833143234253, + "logits/rejected": -2.389738082885742, + "logps/chosen": -501.9102783203125, + "logps/rejected": -262.55535888671875, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3426625430583954, + "rewards/margins": 1.414920449256897, + "rewards/rejected": -1.7575830221176147, + "step": 8092 + }, + { + "epoch": 0.93, + "learning_rate": 2.040266885169144e-08, + "logits/chosen": -2.3475708961486816, + "logits/rejected": -2.224820137023926, + "logps/chosen": -306.4709777832031, + "logps/rejected": -311.8168029785156, + "loss": 0.3781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7770971059799194, + "rewards/margins": 2.408881664276123, + "rewards/rejected": -3.185978889465332, + "step": 8093 + }, + { + "epoch": 0.93, + "learning_rate": 2.0367552382067188e-08, + "logits/chosen": -2.2966299057006836, + "logits/rejected": -2.5389914512634277, + "logps/chosen": -256.01495361328125, + "logps/rejected": -257.27996826171875, + "loss": 0.2158, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4351927638053894, + "rewards/margins": 2.9444143772125244, + "rewards/rejected": -2.5092215538024902, + "step": 8094 + }, + { + "epoch": 0.93, + "learning_rate": 2.0332435912442935e-08, + "logits/chosen": -2.9518613815307617, + "logits/rejected": -2.962223529815674, + "logps/chosen": -191.11630249023438, + "logps/rejected": -240.0181884765625, + "loss": 0.8926, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7866244316101074, + "rewards/margins": 2.2709925174713135, + "rewards/rejected": -4.057616710662842, + "step": 8095 + }, + { + "epoch": 0.93, + "learning_rate": 2.029731944281868e-08, + "logits/chosen": -2.533609628677368, + "logits/rejected": -2.790781021118164, + "logps/chosen": -271.4576721191406, + "logps/rejected": -513.1048583984375, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1460769176483154, + "rewards/margins": 3.367175817489624, + "rewards/rejected": -4.513252258300781, + "step": 8096 + }, + { + "epoch": 0.93, + "learning_rate": 2.0262202973194426e-08, + "logits/chosen": -2.5465469360351562, + "logits/rejected": -2.4404311180114746, + "logps/chosen": -296.77532958984375, + "logps/rejected": -341.95599365234375, + "loss": 0.7276, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9453568458557129, + "rewards/margins": 1.867914080619812, + "rewards/rejected": -2.8132710456848145, + "step": 8097 + }, + { + "epoch": 0.93, + "learning_rate": 2.0227086503570174e-08, + "logits/chosen": -2.9351441860198975, + "logits/rejected": -2.8625500202178955, + "logps/chosen": -237.6062774658203, + "logps/rejected": -133.6382598876953, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0420663356781006, + "rewards/margins": 1.7318322658538818, + "rewards/rejected": -2.7738986015319824, + "step": 8098 + }, + { + "epoch": 0.93, + "learning_rate": 2.019197003394592e-08, + "logits/chosen": -2.37322998046875, + "logits/rejected": -2.3019556999206543, + "logps/chosen": -216.26683044433594, + "logps/rejected": -275.05877685546875, + "loss": 0.415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7815064191818237, + "rewards/margins": 1.5053508281707764, + "rewards/rejected": -2.2868573665618896, + "step": 8099 + }, + { + "epoch": 0.93, + "learning_rate": 2.0156853564321665e-08, + "logits/chosen": -1.9320027828216553, + "logits/rejected": -2.3278377056121826, + "logps/chosen": -315.202880859375, + "logps/rejected": -259.7108154296875, + "loss": 0.332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.530734658241272, + "rewards/margins": 1.579380989074707, + "rewards/rejected": -2.1101155281066895, + "step": 8100 + }, + { + "epoch": 0.93, + "learning_rate": 2.0121737094697412e-08, + "logits/chosen": -2.711944103240967, + "logits/rejected": -2.712198495864868, + "logps/chosen": -199.160400390625, + "logps/rejected": -241.42774963378906, + "loss": 0.3231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17167577147483826, + "rewards/margins": 2.717149257659912, + "rewards/rejected": -2.8888251781463623, + "step": 8101 + }, + { + "epoch": 0.93, + "learning_rate": 2.008662062507316e-08, + "logits/chosen": -2.5369956493377686, + "logits/rejected": -2.5615334510803223, + "logps/chosen": -284.3658447265625, + "logps/rejected": -176.5325469970703, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.438169002532959, + "rewards/margins": 1.5559782981872559, + "rewards/rejected": -1.9941474199295044, + "step": 8102 + }, + { + "epoch": 0.93, + "learning_rate": 2.0051504155448903e-08, + "logits/chosen": -2.621553659439087, + "logits/rejected": -2.5922038555145264, + "logps/chosen": -201.1980743408203, + "logps/rejected": -271.534912109375, + "loss": 0.1621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6491113305091858, + "rewards/margins": 2.1675379276275635, + "rewards/rejected": -2.8166491985321045, + "step": 8103 + }, + { + "epoch": 0.93, + "learning_rate": 2.001638768582465e-08, + "logits/chosen": -2.672778606414795, + "logits/rejected": -2.8296005725860596, + "logps/chosen": -434.8971862792969, + "logps/rejected": -249.6883544921875, + "loss": 0.1509, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0445482730865479, + "rewards/margins": 2.686720371246338, + "rewards/rejected": -3.7312686443328857, + "step": 8104 + }, + { + "epoch": 0.93, + "learning_rate": 1.9981271216200398e-08, + "logits/chosen": -2.3564393520355225, + "logits/rejected": -2.2387521266937256, + "logps/chosen": -207.4256591796875, + "logps/rejected": -345.7054443359375, + "loss": 0.4081, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1369152069091797, + "rewards/margins": 1.965397596359253, + "rewards/rejected": -3.1023128032684326, + "step": 8105 + }, + { + "epoch": 0.93, + "learning_rate": 1.9946154746576145e-08, + "logits/chosen": -2.746163845062256, + "logits/rejected": -2.615163803100586, + "logps/chosen": -175.35995483398438, + "logps/rejected": -204.22528076171875, + "loss": 0.2579, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40976211428642273, + "rewards/margins": 2.916468620300293, + "rewards/rejected": -3.326230764389038, + "step": 8106 + }, + { + "epoch": 0.93, + "learning_rate": 1.991103827695189e-08, + "logits/chosen": -2.2764968872070312, + "logits/rejected": -2.260000228881836, + "logps/chosen": -350.6235656738281, + "logps/rejected": -324.25244140625, + "loss": 0.1798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49070417881011963, + "rewards/margins": 3.278726577758789, + "rewards/rejected": -3.769430637359619, + "step": 8107 + }, + { + "epoch": 0.93, + "learning_rate": 1.9875921807327636e-08, + "logits/chosen": -2.5449092388153076, + "logits/rejected": -2.5334219932556152, + "logps/chosen": -159.14385986328125, + "logps/rejected": -220.74652099609375, + "loss": 0.1857, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5376614928245544, + "rewards/margins": 3.856485605239868, + "rewards/rejected": -4.394146919250488, + "step": 8108 + }, + { + "epoch": 0.93, + "learning_rate": 1.9840805337703383e-08, + "logits/chosen": -2.3497042655944824, + "logits/rejected": -2.464545249938965, + "logps/chosen": -327.0340881347656, + "logps/rejected": -311.6513671875, + "loss": 0.2961, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2008581161499023, + "rewards/margins": 3.551166296005249, + "rewards/rejected": -4.7520246505737305, + "step": 8109 + }, + { + "epoch": 0.93, + "learning_rate": 1.980568886807913e-08, + "logits/chosen": -2.3762545585632324, + "logits/rejected": -2.2434005737304688, + "logps/chosen": -217.98184204101562, + "logps/rejected": -272.71649169921875, + "loss": 0.7318, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.982007384300232, + "rewards/margins": 1.1921448707580566, + "rewards/rejected": -3.174152374267578, + "step": 8110 + }, + { + "epoch": 0.94, + "learning_rate": 1.9770572398454874e-08, + "logits/chosen": -1.8532942533493042, + "logits/rejected": -1.6394548416137695, + "logps/chosen": -139.44815063476562, + "logps/rejected": -213.7678985595703, + "loss": 0.3243, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.701310396194458, + "rewards/margins": 2.485291004180908, + "rewards/rejected": -3.186601400375366, + "step": 8111 + }, + { + "epoch": 0.94, + "learning_rate": 1.9735455928830622e-08, + "logits/chosen": -2.842682361602783, + "logits/rejected": -2.8688626289367676, + "logps/chosen": -232.5930633544922, + "logps/rejected": -304.4053955078125, + "loss": 0.218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5826524496078491, + "rewards/margins": 3.0461483001708984, + "rewards/rejected": -3.628800868988037, + "step": 8112 + }, + { + "epoch": 0.94, + "learning_rate": 1.970033945920637e-08, + "logits/chosen": -1.9972021579742432, + "logits/rejected": -2.0315356254577637, + "logps/chosen": -294.98968505859375, + "logps/rejected": -312.4488220214844, + "loss": 0.5556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8911386728286743, + "rewards/margins": 1.5952855348587036, + "rewards/rejected": -2.486424446105957, + "step": 8113 + }, + { + "epoch": 0.94, + "learning_rate": 1.9665222989582113e-08, + "logits/chosen": -2.4440438747406006, + "logits/rejected": -2.349111795425415, + "logps/chosen": -183.18093872070312, + "logps/rejected": -245.42535400390625, + "loss": 0.648, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7627634406089783, + "rewards/margins": 0.35878780484199524, + "rewards/rejected": -1.1215511560440063, + "step": 8114 + }, + { + "epoch": 0.94, + "learning_rate": 1.963010651995786e-08, + "logits/chosen": -2.978451728820801, + "logits/rejected": -2.9752368927001953, + "logps/chosen": -108.08931732177734, + "logps/rejected": -163.5369110107422, + "loss": 0.4653, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3261418342590332, + "rewards/margins": 2.2510693073272705, + "rewards/rejected": -3.577211380004883, + "step": 8115 + }, + { + "epoch": 0.94, + "learning_rate": 1.9594990050333607e-08, + "logits/chosen": -2.48939847946167, + "logits/rejected": -2.6847243309020996, + "logps/chosen": -204.51283264160156, + "logps/rejected": -185.689697265625, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47890204191207886, + "rewards/margins": 1.6688411235809326, + "rewards/rejected": -2.147742986679077, + "step": 8116 + }, + { + "epoch": 0.94, + "learning_rate": 1.9559873580709355e-08, + "logits/chosen": -2.5085806846618652, + "logits/rejected": -2.5138015747070312, + "logps/chosen": -197.87655639648438, + "logps/rejected": -185.9969024658203, + "loss": 0.2439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9238815903663635, + "rewards/margins": 2.290247678756714, + "rewards/rejected": -3.2141289710998535, + "step": 8117 + }, + { + "epoch": 0.94, + "learning_rate": 1.95247571110851e-08, + "logits/chosen": -2.639984130859375, + "logits/rejected": -2.543231964111328, + "logps/chosen": -118.21876525878906, + "logps/rejected": -318.6590576171875, + "loss": 0.445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6682771444320679, + "rewards/margins": 2.524174690246582, + "rewards/rejected": -3.1924517154693604, + "step": 8118 + }, + { + "epoch": 0.94, + "learning_rate": 1.9489640641460846e-08, + "logits/chosen": -2.0941128730773926, + "logits/rejected": -2.1762382984161377, + "logps/chosen": -276.4208068847656, + "logps/rejected": -239.35214233398438, + "loss": 0.545, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1166574954986572, + "rewards/margins": 1.031604290008545, + "rewards/rejected": -2.1482620239257812, + "step": 8119 + }, + { + "epoch": 0.94, + "learning_rate": 1.9454524171836593e-08, + "logits/chosen": -2.607720375061035, + "logits/rejected": -2.7542572021484375, + "logps/chosen": -252.21929931640625, + "logps/rejected": -261.56243896484375, + "loss": 0.2268, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6566469669342041, + "rewards/margins": 3.4182732105255127, + "rewards/rejected": -4.074920177459717, + "step": 8120 + }, + { + "epoch": 0.94, + "learning_rate": 1.9419407702212337e-08, + "logits/chosen": -2.652019500732422, + "logits/rejected": -2.572399616241455, + "logps/chosen": -206.6992950439453, + "logps/rejected": -349.39373779296875, + "loss": 0.2543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.419711709022522, + "rewards/margins": 2.4652950763702393, + "rewards/rejected": -3.885006904602051, + "step": 8121 + }, + { + "epoch": 0.94, + "learning_rate": 1.9384291232588084e-08, + "logits/chosen": -1.8756651878356934, + "logits/rejected": -2.106210708618164, + "logps/chosen": -382.41131591796875, + "logps/rejected": -292.2900695800781, + "loss": 0.337, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0802979469299316, + "rewards/margins": 2.0542712211608887, + "rewards/rejected": -3.1345694065093994, + "step": 8122 + }, + { + "epoch": 0.94, + "learning_rate": 1.934917476296383e-08, + "logits/chosen": -2.550036907196045, + "logits/rejected": -2.8363943099975586, + "logps/chosen": -251.1975555419922, + "logps/rejected": -178.04351806640625, + "loss": 0.2494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8518477082252502, + "rewards/margins": 2.2917118072509766, + "rewards/rejected": -3.143559455871582, + "step": 8123 + }, + { + "epoch": 0.94, + "learning_rate": 1.931405829333958e-08, + "logits/chosen": -2.372332811355591, + "logits/rejected": -2.534424066543579, + "logps/chosen": -259.30242919921875, + "logps/rejected": -174.19403076171875, + "loss": 0.2767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8753980994224548, + "rewards/margins": 1.8114076852798462, + "rewards/rejected": -2.6868057250976562, + "step": 8124 + }, + { + "epoch": 0.94, + "learning_rate": 1.9278941823715323e-08, + "logits/chosen": -2.238434076309204, + "logits/rejected": -2.2519938945770264, + "logps/chosen": -368.1033935546875, + "logps/rejected": -409.7409362792969, + "loss": 0.9798, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9164296388626099, + "rewards/margins": 0.5685678720474243, + "rewards/rejected": -1.4849975109100342, + "step": 8125 + }, + { + "epoch": 0.94, + "learning_rate": 1.9243825354091066e-08, + "logits/chosen": -2.491692543029785, + "logits/rejected": -2.3127083778381348, + "logps/chosen": -222.25302124023438, + "logps/rejected": -297.8294677734375, + "loss": 0.3204, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0088751316070557, + "rewards/margins": 3.2860984802246094, + "rewards/rejected": -5.294973373413086, + "step": 8126 + }, + { + "epoch": 0.94, + "learning_rate": 1.9208708884466814e-08, + "logits/chosen": -2.393009901046753, + "logits/rejected": -2.4488565921783447, + "logps/chosen": -263.55828857421875, + "logps/rejected": -204.20440673828125, + "loss": 0.1307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9929935932159424, + "rewards/margins": 2.8161935806274414, + "rewards/rejected": -3.8091869354248047, + "step": 8127 + }, + { + "epoch": 0.94, + "learning_rate": 1.9173592414842558e-08, + "logits/chosen": -2.6539793014526367, + "logits/rejected": -2.519970655441284, + "logps/chosen": -183.77418518066406, + "logps/rejected": -308.0970458984375, + "loss": 0.3039, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37829989194869995, + "rewards/margins": 1.9982059001922607, + "rewards/rejected": -2.3765056133270264, + "step": 8128 + }, + { + "epoch": 0.94, + "learning_rate": 1.9138475945218305e-08, + "logits/chosen": -2.072500228881836, + "logits/rejected": -2.026235818862915, + "logps/chosen": -291.1907043457031, + "logps/rejected": -396.7208557128906, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3417267799377441, + "rewards/margins": 3.0002312660217285, + "rewards/rejected": -4.341958522796631, + "step": 8129 + }, + { + "epoch": 0.94, + "learning_rate": 1.9103359475594052e-08, + "logits/chosen": -2.2559258937835693, + "logits/rejected": -2.3180482387542725, + "logps/chosen": -222.89111328125, + "logps/rejected": -279.9412536621094, + "loss": 0.7873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.78950035572052, + "rewards/margins": 0.23658472299575806, + "rewards/rejected": -1.0260850191116333, + "step": 8130 + }, + { + "epoch": 0.94, + "learning_rate": 1.9068243005969796e-08, + "logits/chosen": -2.016721725463867, + "logits/rejected": -2.081981897354126, + "logps/chosen": -492.0289001464844, + "logps/rejected": -412.5901794433594, + "loss": 0.3247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7306250929832458, + "rewards/margins": 1.342524528503418, + "rewards/rejected": -2.0731499195098877, + "step": 8131 + }, + { + "epoch": 0.94, + "learning_rate": 1.9033126536345543e-08, + "logits/chosen": -2.009392738342285, + "logits/rejected": -2.02174973487854, + "logps/chosen": -281.39544677734375, + "logps/rejected": -304.932373046875, + "loss": 0.1726, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9168058633804321, + "rewards/margins": 2.1233038902282715, + "rewards/rejected": -3.040109634399414, + "step": 8132 + }, + { + "epoch": 0.94, + "learning_rate": 1.899801006672129e-08, + "logits/chosen": -1.592451810836792, + "logits/rejected": -2.0027754306793213, + "logps/chosen": -451.9306640625, + "logps/rejected": -249.4476318359375, + "loss": 0.3476, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24177920818328857, + "rewards/margins": 1.9440956115722656, + "rewards/rejected": -2.1858747005462646, + "step": 8133 + }, + { + "epoch": 0.94, + "learning_rate": 1.8962893597097038e-08, + "logits/chosen": -2.293309450149536, + "logits/rejected": -2.5079641342163086, + "logps/chosen": -320.20703125, + "logps/rejected": -273.35821533203125, + "loss": 0.4244, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0210518836975098, + "rewards/margins": 1.3344807624816895, + "rewards/rejected": -2.3555328845977783, + "step": 8134 + }, + { + "epoch": 0.94, + "learning_rate": 1.8927777127472782e-08, + "logits/chosen": -2.1958417892456055, + "logits/rejected": -1.980018138885498, + "logps/chosen": -211.34063720703125, + "logps/rejected": -321.3881530761719, + "loss": 0.2529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5377745032310486, + "rewards/margins": 1.9858922958374023, + "rewards/rejected": -2.5236666202545166, + "step": 8135 + }, + { + "epoch": 0.94, + "learning_rate": 1.889266065784853e-08, + "logits/chosen": -2.3023438453674316, + "logits/rejected": -2.135390043258667, + "logps/chosen": -425.58929443359375, + "logps/rejected": -463.8940124511719, + "loss": 0.4098, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47292980551719666, + "rewards/margins": 1.7638658285140991, + "rewards/rejected": -2.236795663833618, + "step": 8136 + }, + { + "epoch": 0.94, + "learning_rate": 1.8857544188224276e-08, + "logits/chosen": -2.5262160301208496, + "logits/rejected": -2.336660385131836, + "logps/chosen": -169.91632080078125, + "logps/rejected": -214.24270629882812, + "loss": 0.2419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17407353222370148, + "rewards/margins": 2.232255220413208, + "rewards/rejected": -2.4063289165496826, + "step": 8137 + }, + { + "epoch": 0.94, + "learning_rate": 1.882242771860002e-08, + "logits/chosen": -2.5283608436584473, + "logits/rejected": -2.6750950813293457, + "logps/chosen": -376.40350341796875, + "logps/rejected": -262.7095642089844, + "loss": 0.3351, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.032286286354065, + "rewards/margins": 2.9640772342681885, + "rewards/rejected": -3.9963631629943848, + "step": 8138 + }, + { + "epoch": 0.94, + "learning_rate": 1.8787311248975767e-08, + "logits/chosen": -2.3486740589141846, + "logits/rejected": -2.174960136413574, + "logps/chosen": -225.68389892578125, + "logps/rejected": -354.6002197265625, + "loss": 0.3328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2413952350616455, + "rewards/margins": 2.9358339309692383, + "rewards/rejected": -4.177229404449463, + "step": 8139 + }, + { + "epoch": 0.94, + "learning_rate": 1.8752194779351515e-08, + "logits/chosen": -2.117156744003296, + "logits/rejected": -2.2235708236694336, + "logps/chosen": -269.7586669921875, + "logps/rejected": -333.2239990234375, + "loss": 0.2082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5229411125183105, + "rewards/margins": 2.8172314167022705, + "rewards/rejected": -3.34017276763916, + "step": 8140 + }, + { + "epoch": 0.94, + "learning_rate": 1.8717078309727262e-08, + "logits/chosen": -2.1865925788879395, + "logits/rejected": -2.529515027999878, + "logps/chosen": -346.7727966308594, + "logps/rejected": -201.08143615722656, + "loss": 0.255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.779011607170105, + "rewards/margins": 3.278463840484619, + "rewards/rejected": -4.057475566864014, + "step": 8141 + }, + { + "epoch": 0.94, + "learning_rate": 1.8681961840103006e-08, + "logits/chosen": -2.377535581588745, + "logits/rejected": -2.2578048706054688, + "logps/chosen": -233.55001831054688, + "logps/rejected": -292.73516845703125, + "loss": 0.3683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6014212369918823, + "rewards/margins": 3.0836286544799805, + "rewards/rejected": -3.6850500106811523, + "step": 8142 + }, + { + "epoch": 0.94, + "learning_rate": 1.8646845370478753e-08, + "logits/chosen": -1.9825491905212402, + "logits/rejected": -2.423421859741211, + "logps/chosen": -293.56298828125, + "logps/rejected": -254.56900024414062, + "loss": 0.9085, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.045905590057373, + "rewards/margins": 1.8218262195587158, + "rewards/rejected": -3.867731809616089, + "step": 8143 + }, + { + "epoch": 0.94, + "learning_rate": 1.86117289008545e-08, + "logits/chosen": -1.946230411529541, + "logits/rejected": -2.034101963043213, + "logps/chosen": -238.82528686523438, + "logps/rejected": -311.2569580078125, + "loss": 0.3394, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5382232666015625, + "rewards/margins": 3.202604293823242, + "rewards/rejected": -4.740827560424805, + "step": 8144 + }, + { + "epoch": 0.94, + "learning_rate": 1.8576612431230247e-08, + "logits/chosen": -1.717896819114685, + "logits/rejected": -1.8481788635253906, + "logps/chosen": -376.6169128417969, + "logps/rejected": -403.3065490722656, + "loss": 0.6468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7882087230682373, + "rewards/margins": 2.489161491394043, + "rewards/rejected": -3.2773702144622803, + "step": 8145 + }, + { + "epoch": 0.94, + "learning_rate": 1.854149596160599e-08, + "logits/chosen": -2.452314853668213, + "logits/rejected": -2.4924638271331787, + "logps/chosen": -218.17633056640625, + "logps/rejected": -189.92889404296875, + "loss": 0.6216, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7745981216430664, + "rewards/margins": 1.6655194759368896, + "rewards/rejected": -2.440117359161377, + "step": 8146 + }, + { + "epoch": 0.94, + "learning_rate": 1.850637949198174e-08, + "logits/chosen": -2.263920783996582, + "logits/rejected": -2.421557664871216, + "logps/chosen": -274.28106689453125, + "logps/rejected": -180.4593048095703, + "loss": 0.3657, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5523489713668823, + "rewards/margins": 1.2422863245010376, + "rewards/rejected": -1.79463529586792, + "step": 8147 + }, + { + "epoch": 0.94, + "learning_rate": 1.8471263022357486e-08, + "logits/chosen": -2.45158052444458, + "logits/rejected": -2.5493595600128174, + "logps/chosen": -109.45873260498047, + "logps/rejected": -152.40122985839844, + "loss": 0.4534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2507826089859009, + "rewards/margins": 1.9168219566345215, + "rewards/rejected": -2.167604446411133, + "step": 8148 + }, + { + "epoch": 0.94, + "learning_rate": 1.843614655273323e-08, + "logits/chosen": -2.394573211669922, + "logits/rejected": -2.209852695465088, + "logps/chosen": -233.06875610351562, + "logps/rejected": -371.17022705078125, + "loss": 0.2847, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1913212537765503, + "rewards/margins": 4.39200496673584, + "rewards/rejected": -4.5833258628845215, + "step": 8149 + }, + { + "epoch": 0.94, + "learning_rate": 1.8401030083108977e-08, + "logits/chosen": -2.404933214187622, + "logits/rejected": -2.5047688484191895, + "logps/chosen": -200.28297424316406, + "logps/rejected": -229.76776123046875, + "loss": 0.3426, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9667690396308899, + "rewards/margins": 2.2775797843933105, + "rewards/rejected": -3.2443485260009766, + "step": 8150 + }, + { + "epoch": 0.94, + "learning_rate": 1.8365913613484724e-08, + "logits/chosen": -2.2152836322784424, + "logits/rejected": -2.2597434520721436, + "logps/chosen": -402.3509826660156, + "logps/rejected": -412.9957275390625, + "loss": 0.289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6950278282165527, + "rewards/margins": 2.6096842288970947, + "rewards/rejected": -3.3047120571136475, + "step": 8151 + }, + { + "epoch": 0.94, + "learning_rate": 1.833079714386047e-08, + "logits/chosen": -2.3099238872528076, + "logits/rejected": -2.458723783493042, + "logps/chosen": -420.80517578125, + "logps/rejected": -271.6904602050781, + "loss": 0.234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4275329113006592, + "rewards/margins": 2.979870557785034, + "rewards/rejected": -4.407403469085693, + "step": 8152 + }, + { + "epoch": 0.94, + "learning_rate": 1.8295680674236215e-08, + "logits/chosen": -2.4029781818389893, + "logits/rejected": -2.3585710525512695, + "logps/chosen": -164.19992065429688, + "logps/rejected": -309.216552734375, + "loss": 0.2142, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3954885005950928, + "rewards/margins": 2.449732780456543, + "rewards/rejected": -2.8452212810516357, + "step": 8153 + }, + { + "epoch": 0.94, + "learning_rate": 1.8260564204611963e-08, + "logits/chosen": -2.0979769229888916, + "logits/rejected": -2.028346300125122, + "logps/chosen": -341.53076171875, + "logps/rejected": -202.17758178710938, + "loss": 0.7221, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.136964201927185, + "rewards/margins": 1.6106061935424805, + "rewards/rejected": -2.747570514678955, + "step": 8154 + }, + { + "epoch": 0.94, + "learning_rate": 1.822544773498771e-08, + "logits/chosen": -2.426248073577881, + "logits/rejected": -2.11586856842041, + "logps/chosen": -322.2477111816406, + "logps/rejected": -449.3094177246094, + "loss": 0.1895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.798880934715271, + "rewards/margins": 2.601071357727051, + "rewards/rejected": -3.3999524116516113, + "step": 8155 + }, + { + "epoch": 0.94, + "learning_rate": 1.8190331265363454e-08, + "logits/chosen": -2.720050096511841, + "logits/rejected": -2.5063087940216064, + "logps/chosen": -204.860107421875, + "logps/rejected": -287.1957702636719, + "loss": 1.1978, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1547751426696777, + "rewards/margins": 1.8868929147720337, + "rewards/rejected": -4.041667938232422, + "step": 8156 + }, + { + "epoch": 0.94, + "learning_rate": 1.81552147957392e-08, + "logits/chosen": -2.0370497703552246, + "logits/rejected": -2.1650426387786865, + "logps/chosen": -280.40753173828125, + "logps/rejected": -246.46630859375, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20203453302383423, + "rewards/margins": 2.099140167236328, + "rewards/rejected": -2.3011748790740967, + "step": 8157 + }, + { + "epoch": 0.94, + "learning_rate": 1.8120098326114948e-08, + "logits/chosen": -2.2073678970336914, + "logits/rejected": -2.4967048168182373, + "logps/chosen": -299.67767333984375, + "logps/rejected": -210.34088134765625, + "loss": 0.2448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.044482797384262085, + "rewards/margins": 2.3133649826049805, + "rewards/rejected": -2.3578479290008545, + "step": 8158 + }, + { + "epoch": 0.94, + "learning_rate": 1.8084981856490696e-08, + "logits/chosen": -2.144329071044922, + "logits/rejected": -2.043788433074951, + "logps/chosen": -227.84744262695312, + "logps/rejected": -317.3116455078125, + "loss": 0.5008, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1369023323059082, + "rewards/margins": 3.2728116512298584, + "rewards/rejected": -4.4097137451171875, + "step": 8159 + }, + { + "epoch": 0.94, + "learning_rate": 1.804986538686644e-08, + "logits/chosen": -1.7260644435882568, + "logits/rejected": -2.030290365219116, + "logps/chosen": -602.107666015625, + "logps/rejected": -484.1224365234375, + "loss": 0.7391, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3287100791931152, + "rewards/margins": 2.1301143169403076, + "rewards/rejected": -3.458824634552002, + "step": 8160 + }, + { + "epoch": 0.94, + "learning_rate": 1.8014748917242183e-08, + "logits/chosen": -2.1366219520568848, + "logits/rejected": -2.425872325897217, + "logps/chosen": -244.5740966796875, + "logps/rejected": -240.87583923339844, + "loss": 0.5516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39935895800590515, + "rewards/margins": 1.6087499856948853, + "rewards/rejected": -2.0081090927124023, + "step": 8161 + }, + { + "epoch": 0.94, + "learning_rate": 1.797963244761793e-08, + "logits/chosen": -2.55888295173645, + "logits/rejected": -2.466442584991455, + "logps/chosen": -275.3948059082031, + "logps/rejected": -361.0814208984375, + "loss": 0.5494, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7599387168884277, + "rewards/margins": 2.3416988849639893, + "rewards/rejected": -3.101637363433838, + "step": 8162 + }, + { + "epoch": 0.94, + "learning_rate": 1.7944515977993678e-08, + "logits/chosen": -1.6659010648727417, + "logits/rejected": -1.9412996768951416, + "logps/chosen": -728.2576293945312, + "logps/rejected": -514.31005859375, + "loss": 0.1954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7403205633163452, + "rewards/margins": 2.3953564167022705, + "rewards/rejected": -3.1356770992279053, + "step": 8163 + }, + { + "epoch": 0.94, + "learning_rate": 1.7909399508369425e-08, + "logits/chosen": -2.2777180671691895, + "logits/rejected": -2.5278050899505615, + "logps/chosen": -351.7295837402344, + "logps/rejected": -249.47727966308594, + "loss": 0.4494, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.624904751777649, + "rewards/margins": 2.1528408527374268, + "rewards/rejected": -3.777745485305786, + "step": 8164 + }, + { + "epoch": 0.94, + "learning_rate": 1.787428303874517e-08, + "logits/chosen": -2.324885845184326, + "logits/rejected": -2.3119118213653564, + "logps/chosen": -307.2552490234375, + "logps/rejected": -167.0502166748047, + "loss": 0.3492, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4973376989364624, + "rewards/margins": 1.4707942008972168, + "rewards/rejected": -1.9681317806243896, + "step": 8165 + }, + { + "epoch": 0.94, + "learning_rate": 1.7839166569120916e-08, + "logits/chosen": -2.7784740924835205, + "logits/rejected": -2.5356380939483643, + "logps/chosen": -219.6847686767578, + "logps/rejected": -234.53866577148438, + "loss": 0.1992, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5861449241638184, + "rewards/margins": 3.2679028511047363, + "rewards/rejected": -3.8540477752685547, + "step": 8166 + }, + { + "epoch": 0.94, + "learning_rate": 1.7804050099496663e-08, + "logits/chosen": -2.497760534286499, + "logits/rejected": -2.330015182495117, + "logps/chosen": -247.51132202148438, + "logps/rejected": -373.7793273925781, + "loss": 0.6523, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9584033489227295, + "rewards/margins": 2.236299514770508, + "rewards/rejected": -4.194703102111816, + "step": 8167 + }, + { + "epoch": 0.94, + "learning_rate": 1.7768933629872407e-08, + "logits/chosen": -1.602071762084961, + "logits/rejected": -1.6658133268356323, + "logps/chosen": -378.3448791503906, + "logps/rejected": -279.374267578125, + "loss": 0.6062, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7903972268104553, + "rewards/margins": 0.6803631782531738, + "rewards/rejected": -1.4707603454589844, + "step": 8168 + }, + { + "epoch": 0.94, + "learning_rate": 1.7733817160248155e-08, + "logits/chosen": -2.1665120124816895, + "logits/rejected": -2.498502492904663, + "logps/chosen": -459.28717041015625, + "logps/rejected": -188.31964111328125, + "loss": 0.4296, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1927913427352905, + "rewards/margins": 1.7540433406829834, + "rewards/rejected": -2.9468345642089844, + "step": 8169 + }, + { + "epoch": 0.94, + "learning_rate": 1.7698700690623902e-08, + "logits/chosen": -2.446507692337036, + "logits/rejected": -2.6770541667938232, + "logps/chosen": -415.47589111328125, + "logps/rejected": -261.4321594238281, + "loss": 0.8275, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.072735071182251, + "rewards/margins": 0.6676521301269531, + "rewards/rejected": -1.740387201309204, + "step": 8170 + }, + { + "epoch": 0.94, + "learning_rate": 1.766358422099965e-08, + "logits/chosen": -2.82211971282959, + "logits/rejected": -2.859978675842285, + "logps/chosen": -84.7589111328125, + "logps/rejected": -166.81008911132812, + "loss": 0.245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8863639831542969, + "rewards/margins": 2.639151096343994, + "rewards/rejected": -3.525515079498291, + "step": 8171 + }, + { + "epoch": 0.94, + "learning_rate": 1.7628467751375393e-08, + "logits/chosen": -2.518660068511963, + "logits/rejected": -2.444251537322998, + "logps/chosen": -131.01699829101562, + "logps/rejected": -260.0523681640625, + "loss": 0.804, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4685804843902588, + "rewards/margins": 2.1397857666015625, + "rewards/rejected": -3.608366012573242, + "step": 8172 + }, + { + "epoch": 0.94, + "learning_rate": 1.759335128175114e-08, + "logits/chosen": -1.8968803882598877, + "logits/rejected": -1.8559350967407227, + "logps/chosen": -305.83282470703125, + "logps/rejected": -433.79052734375, + "loss": 0.2847, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3333754539489746, + "rewards/margins": 4.094691276550293, + "rewards/rejected": -5.428066253662109, + "step": 8173 + }, + { + "epoch": 0.94, + "learning_rate": 1.7558234812126888e-08, + "logits/chosen": -2.713555097579956, + "logits/rejected": -2.733992576599121, + "logps/chosen": -303.2196960449219, + "logps/rejected": -312.6822814941406, + "loss": 0.3112, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1272270679473877, + "rewards/margins": 3.5570502281188965, + "rewards/rejected": -4.684277057647705, + "step": 8174 + }, + { + "epoch": 0.94, + "learning_rate": 1.7523118342502635e-08, + "logits/chosen": -2.809509754180908, + "logits/rejected": -2.869765043258667, + "logps/chosen": -180.4478302001953, + "logps/rejected": -306.34466552734375, + "loss": 0.7516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9061955213546753, + "rewards/margins": 0.8909664154052734, + "rewards/rejected": -1.7971619367599487, + "step": 8175 + }, + { + "epoch": 0.94, + "learning_rate": 1.748800187287838e-08, + "logits/chosen": -2.4571220874786377, + "logits/rejected": -2.1540284156799316, + "logps/chosen": -141.29586791992188, + "logps/rejected": -172.46588134765625, + "loss": 0.6283, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2003452777862549, + "rewards/margins": 1.0517287254333496, + "rewards/rejected": -2.2520740032196045, + "step": 8176 + }, + { + "epoch": 0.94, + "learning_rate": 1.7452885403254126e-08, + "logits/chosen": -2.4331822395324707, + "logits/rejected": -2.6066837310791016, + "logps/chosen": -317.2767639160156, + "logps/rejected": -185.41357421875, + "loss": 0.718, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9218087196350098, + "rewards/margins": 0.5755246877670288, + "rewards/rejected": -1.4973334074020386, + "step": 8177 + }, + { + "epoch": 0.94, + "learning_rate": 1.7417768933629873e-08, + "logits/chosen": -2.3700063228607178, + "logits/rejected": -2.4833719730377197, + "logps/chosen": -179.44239807128906, + "logps/rejected": -246.79397583007812, + "loss": 0.5015, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0645594596862793, + "rewards/margins": 1.3356292247772217, + "rewards/rejected": -2.400188684463501, + "step": 8178 + }, + { + "epoch": 0.94, + "learning_rate": 1.7382652464005617e-08, + "logits/chosen": -2.2674059867858887, + "logits/rejected": -2.0828773975372314, + "logps/chosen": -315.78167724609375, + "logps/rejected": -372.3127136230469, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4680521488189697, + "rewards/margins": 1.9939452409744263, + "rewards/rejected": -3.4619972705841064, + "step": 8179 + }, + { + "epoch": 0.94, + "learning_rate": 1.7347535994381364e-08, + "logits/chosen": -2.3098535537719727, + "logits/rejected": -2.586071014404297, + "logps/chosen": -239.424560546875, + "logps/rejected": -212.87237548828125, + "loss": 0.1887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6618673801422119, + "rewards/margins": 2.0979366302490234, + "rewards/rejected": -2.7598042488098145, + "step": 8180 + }, + { + "epoch": 0.94, + "learning_rate": 1.731241952475711e-08, + "logits/chosen": -2.760016441345215, + "logits/rejected": -2.702655792236328, + "logps/chosen": -279.0628662109375, + "logps/rejected": -291.2651672363281, + "loss": 0.2898, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6345263123512268, + "rewards/margins": 2.1446378231048584, + "rewards/rejected": -2.7791640758514404, + "step": 8181 + }, + { + "epoch": 0.94, + "learning_rate": 1.727730305513286e-08, + "logits/chosen": -1.7902262210845947, + "logits/rejected": -2.174769878387451, + "logps/chosen": -512.635498046875, + "logps/rejected": -363.5780334472656, + "loss": 0.6062, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0390602350234985, + "rewards/margins": 1.082381248474121, + "rewards/rejected": -2.121441602706909, + "step": 8182 + }, + { + "epoch": 0.94, + "learning_rate": 1.7242186585508603e-08, + "logits/chosen": -2.295875072479248, + "logits/rejected": -2.378467082977295, + "logps/chosen": -299.13409423828125, + "logps/rejected": -226.79067993164062, + "loss": 0.3006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00865219533443451, + "rewards/margins": 2.146746873855591, + "rewards/rejected": -2.1553988456726074, + "step": 8183 + }, + { + "epoch": 0.94, + "learning_rate": 1.720707011588435e-08, + "logits/chosen": -1.9958667755126953, + "logits/rejected": -2.270721435546875, + "logps/chosen": -322.41583251953125, + "logps/rejected": -343.321533203125, + "loss": 0.2715, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4313373565673828, + "rewards/margins": 2.4305531978607178, + "rewards/rejected": -3.8618907928466797, + "step": 8184 + }, + { + "epoch": 0.94, + "learning_rate": 1.7171953646260094e-08, + "logits/chosen": -2.8657469749450684, + "logits/rejected": -2.48457407951355, + "logps/chosen": -302.8947448730469, + "logps/rejected": -296.0092468261719, + "loss": 0.3865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04086145758628845, + "rewards/margins": 1.7642102241516113, + "rewards/rejected": -1.8050715923309326, + "step": 8185 + }, + { + "epoch": 0.94, + "learning_rate": 1.713683717663584e-08, + "logits/chosen": -3.0193088054656982, + "logits/rejected": -2.9574198722839355, + "logps/chosen": -493.1130065917969, + "logps/rejected": -381.75897216796875, + "loss": 0.3138, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1921759843826294, + "rewards/margins": 1.8497570753097534, + "rewards/rejected": -3.041933059692383, + "step": 8186 + }, + { + "epoch": 0.94, + "learning_rate": 1.710172070701159e-08, + "logits/chosen": -1.8846335411071777, + "logits/rejected": -1.9460599422454834, + "logps/chosen": -258.0340881347656, + "logps/rejected": -292.7242126464844, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8579805493354797, + "rewards/margins": 2.545137643814087, + "rewards/rejected": -3.403118133544922, + "step": 8187 + }, + { + "epoch": 0.94, + "learning_rate": 1.7066604237387332e-08, + "logits/chosen": -2.5236294269561768, + "logits/rejected": -2.580681324005127, + "logps/chosen": -352.84356689453125, + "logps/rejected": -217.56983947753906, + "loss": 1.0681, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.099961996078491, + "rewards/margins": 0.3872624635696411, + "rewards/rejected": -2.4872243404388428, + "step": 8188 + }, + { + "epoch": 0.94, + "learning_rate": 1.703148776776308e-08, + "logits/chosen": -2.2302629947662354, + "logits/rejected": -2.3786473274230957, + "logps/chosen": -222.1090850830078, + "logps/rejected": -247.5248565673828, + "loss": 0.5815, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5269153118133545, + "rewards/margins": 3.309379816055298, + "rewards/rejected": -4.836295127868652, + "step": 8189 + }, + { + "epoch": 0.94, + "learning_rate": 1.6996371298138827e-08, + "logits/chosen": -2.584146738052368, + "logits/rejected": -2.7179343700408936, + "logps/chosen": -267.829833984375, + "logps/rejected": -316.5887451171875, + "loss": 0.4288, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7239404916763306, + "rewards/margins": 1.613139271736145, + "rewards/rejected": -2.3370800018310547, + "step": 8190 + }, + { + "epoch": 0.94, + "learning_rate": 1.696125482851457e-08, + "logits/chosen": -2.608283281326294, + "logits/rejected": -2.664529800415039, + "logps/chosen": -256.19476318359375, + "logps/rejected": -205.29608154296875, + "loss": 0.4968, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0382856130599976, + "rewards/margins": 1.9353828430175781, + "rewards/rejected": -2.9736685752868652, + "step": 8191 + }, + { + "epoch": 0.94, + "learning_rate": 1.6926138358890318e-08, + "logits/chosen": -1.8154821395874023, + "logits/rejected": -1.8645521402359009, + "logps/chosen": -423.34503173828125, + "logps/rejected": -340.8004150390625, + "loss": 0.6708, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5375185012817383, + "rewards/margins": 1.3737825155258179, + "rewards/rejected": -2.9113011360168457, + "step": 8192 + }, + { + "epoch": 0.94, + "learning_rate": 1.6891021889266065e-08, + "logits/chosen": -2.4700751304626465, + "logits/rejected": -2.385730743408203, + "logps/chosen": -461.7625732421875, + "logps/rejected": -542.2485961914062, + "loss": 0.211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43330496549606323, + "rewards/margins": 4.091864109039307, + "rewards/rejected": -4.5251688957214355, + "step": 8193 + }, + { + "epoch": 0.94, + "learning_rate": 1.6855905419641812e-08, + "logits/chosen": -1.855569839477539, + "logits/rejected": -2.1301751136779785, + "logps/chosen": -420.329833984375, + "logps/rejected": -345.2410888671875, + "loss": 0.704, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.939949095249176, + "rewards/margins": 1.7802691459655762, + "rewards/rejected": -2.7202181816101074, + "step": 8194 + }, + { + "epoch": 0.94, + "learning_rate": 1.6820788950017556e-08, + "logits/chosen": -2.862884521484375, + "logits/rejected": -2.739243984222412, + "logps/chosen": -288.0238037109375, + "logps/rejected": -132.3043670654297, + "loss": 0.5506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7543685436248779, + "rewards/margins": 1.1747944355010986, + "rewards/rejected": -1.9291629791259766, + "step": 8195 + }, + { + "epoch": 0.94, + "learning_rate": 1.6785672480393304e-08, + "logits/chosen": -2.771317481994629, + "logits/rejected": -2.7990431785583496, + "logps/chosen": -213.4417724609375, + "logps/rejected": -375.68792724609375, + "loss": 0.3317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11982503533363342, + "rewards/margins": 4.241664886474609, + "rewards/rejected": -4.361489772796631, + "step": 8196 + }, + { + "epoch": 0.94, + "learning_rate": 1.675055601076905e-08, + "logits/chosen": -2.4944674968719482, + "logits/rejected": -2.62619948387146, + "logps/chosen": -275.557373046875, + "logps/rejected": -193.25328063964844, + "loss": 0.6042, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.574531078338623, + "rewards/margins": 0.5062745213508606, + "rewards/rejected": -2.080805540084839, + "step": 8197 + }, + { + "epoch": 0.95, + "learning_rate": 1.6715439541144795e-08, + "logits/chosen": -2.4456677436828613, + "logits/rejected": -2.332033634185791, + "logps/chosen": -226.994384765625, + "logps/rejected": -246.35189819335938, + "loss": 0.4299, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8503901958465576, + "rewards/margins": 1.6149919033050537, + "rewards/rejected": -3.4653818607330322, + "step": 8198 + }, + { + "epoch": 0.95, + "learning_rate": 1.6680323071520542e-08, + "logits/chosen": -2.1658315658569336, + "logits/rejected": -2.1744847297668457, + "logps/chosen": -368.84918212890625, + "logps/rejected": -352.6676025390625, + "loss": 0.4266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.738531768321991, + "rewards/margins": 1.7000806331634521, + "rewards/rejected": -2.438612461090088, + "step": 8199 + }, + { + "epoch": 0.95, + "learning_rate": 1.664520660189629e-08, + "logits/chosen": -1.6906189918518066, + "logits/rejected": -2.1500256061553955, + "logps/chosen": -281.77215576171875, + "logps/rejected": -327.18182373046875, + "loss": 1.0224, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.795760154724121, + "rewards/margins": 1.1204688549041748, + "rewards/rejected": -3.916228771209717, + "step": 8200 + }, + { + "epoch": 0.95, + "learning_rate": 1.6610090132272036e-08, + "logits/chosen": -1.8201966285705566, + "logits/rejected": -1.8776257038116455, + "logps/chosen": -280.2467956542969, + "logps/rejected": -307.6736755371094, + "loss": 0.2467, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42889752984046936, + "rewards/margins": 2.7011399269104004, + "rewards/rejected": -3.130037546157837, + "step": 8201 + }, + { + "epoch": 0.95, + "learning_rate": 1.657497366264778e-08, + "logits/chosen": -2.70828914642334, + "logits/rejected": -2.840585708618164, + "logps/chosen": -393.4550476074219, + "logps/rejected": -369.01068115234375, + "loss": 0.1672, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5851802825927734, + "rewards/margins": 4.002715587615967, + "rewards/rejected": -4.58789587020874, + "step": 8202 + }, + { + "epoch": 0.95, + "learning_rate": 1.6539857193023528e-08, + "logits/chosen": -1.7927446365356445, + "logits/rejected": -2.0363922119140625, + "logps/chosen": -496.46868896484375, + "logps/rejected": -370.66259765625, + "loss": 0.0854, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27017414569854736, + "rewards/margins": 3.4532222747802734, + "rewards/rejected": -3.1830482482910156, + "step": 8203 + }, + { + "epoch": 0.95, + "learning_rate": 1.6504740723399275e-08, + "logits/chosen": -2.9254908561706543, + "logits/rejected": -2.9328722953796387, + "logps/chosen": -191.5768585205078, + "logps/rejected": -208.43807983398438, + "loss": 0.2551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4687057137489319, + "rewards/margins": 2.02437686920166, + "rewards/rejected": -2.4930825233459473, + "step": 8204 + }, + { + "epoch": 0.95, + "learning_rate": 1.6469624253775022e-08, + "logits/chosen": -2.8879494667053223, + "logits/rejected": -2.926835775375366, + "logps/chosen": -265.4115295410156, + "logps/rejected": -249.80389404296875, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37318164110183716, + "rewards/margins": 3.107426643371582, + "rewards/rejected": -3.4806082248687744, + "step": 8205 + }, + { + "epoch": 0.95, + "learning_rate": 1.6434507784150766e-08, + "logits/chosen": -1.622157335281372, + "logits/rejected": -1.7287001609802246, + "logps/chosen": -210.0323944091797, + "logps/rejected": -270.10308837890625, + "loss": 0.4719, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.531172037124634, + "rewards/margins": 1.560816764831543, + "rewards/rejected": -4.091988563537598, + "step": 8206 + }, + { + "epoch": 0.95, + "learning_rate": 1.6399391314526513e-08, + "logits/chosen": -2.6388707160949707, + "logits/rejected": -2.439101457595825, + "logps/chosen": -416.63592529296875, + "logps/rejected": -286.23931884765625, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6867427229881287, + "rewards/margins": 4.614508152008057, + "rewards/rejected": -5.30125093460083, + "step": 8207 + }, + { + "epoch": 0.95, + "learning_rate": 1.636427484490226e-08, + "logits/chosen": -2.18853497505188, + "logits/rejected": -2.287935733795166, + "logps/chosen": -212.99168395996094, + "logps/rejected": -285.8011474609375, + "loss": 0.1168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4588041305541992, + "rewards/margins": 3.3347809314727783, + "rewards/rejected": -3.7935848236083984, + "step": 8208 + }, + { + "epoch": 0.95, + "learning_rate": 1.6329158375278004e-08, + "logits/chosen": -2.498514175415039, + "logits/rejected": -2.565370798110962, + "logps/chosen": -211.82144165039062, + "logps/rejected": -197.08810424804688, + "loss": 0.8321, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.625975489616394, + "rewards/margins": 0.20730775594711304, + "rewards/rejected": -0.8332832455635071, + "step": 8209 + }, + { + "epoch": 0.95, + "learning_rate": 1.629404190565375e-08, + "logits/chosen": -2.0376484394073486, + "logits/rejected": -2.0786025524139404, + "logps/chosen": -187.56394958496094, + "logps/rejected": -233.8419189453125, + "loss": 0.4121, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.142170786857605, + "rewards/margins": 1.0035699605941772, + "rewards/rejected": -2.1457407474517822, + "step": 8210 + }, + { + "epoch": 0.95, + "learning_rate": 1.6258925436029496e-08, + "logits/chosen": -2.523833990097046, + "logits/rejected": -2.538939952850342, + "logps/chosen": -337.9959716796875, + "logps/rejected": -194.1157684326172, + "loss": 0.1378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8995602130889893, + "rewards/margins": 2.601954936981201, + "rewards/rejected": -3.5015149116516113, + "step": 8211 + }, + { + "epoch": 0.95, + "learning_rate": 1.6223808966405243e-08, + "logits/chosen": -2.4620113372802734, + "logits/rejected": -2.7022454738616943, + "logps/chosen": -324.7372741699219, + "logps/rejected": -236.843994140625, + "loss": 0.4739, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2222168445587158, + "rewards/margins": 1.7802751064300537, + "rewards/rejected": -3.0024921894073486, + "step": 8212 + }, + { + "epoch": 0.95, + "learning_rate": 1.618869249678099e-08, + "logits/chosen": -2.4587042331695557, + "logits/rejected": -2.2859785556793213, + "logps/chosen": -211.43780517578125, + "logps/rejected": -203.7998046875, + "loss": 0.4432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8418912887573242, + "rewards/margins": 1.5411264896392822, + "rewards/rejected": -2.3830180168151855, + "step": 8213 + }, + { + "epoch": 0.95, + "learning_rate": 1.6153576027156734e-08, + "logits/chosen": -2.2295024394989014, + "logits/rejected": -2.489387273788452, + "logps/chosen": -438.202880859375, + "logps/rejected": -278.6086730957031, + "loss": 0.5593, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8279708027839661, + "rewards/margins": 2.3538756370544434, + "rewards/rejected": -3.1818466186523438, + "step": 8214 + }, + { + "epoch": 0.95, + "learning_rate": 1.611845955753248e-08, + "logits/chosen": -2.0290493965148926, + "logits/rejected": -1.3608362674713135, + "logps/chosen": -86.40227508544922, + "logps/rejected": -285.88623046875, + "loss": 0.292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1940462589263916, + "rewards/margins": 1.9054410457611084, + "rewards/rejected": -2.0994873046875, + "step": 8215 + }, + { + "epoch": 0.95, + "learning_rate": 1.608334308790823e-08, + "logits/chosen": -2.816053867340088, + "logits/rejected": -2.6682522296905518, + "logps/chosen": -125.1980972290039, + "logps/rejected": -266.6331481933594, + "loss": 0.4526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19784721732139587, + "rewards/margins": 4.227978706359863, + "rewards/rejected": -4.425826072692871, + "step": 8216 + }, + { + "epoch": 0.95, + "learning_rate": 1.6048226618283976e-08, + "logits/chosen": -2.5747103691101074, + "logits/rejected": -2.6068766117095947, + "logps/chosen": -187.69766235351562, + "logps/rejected": -193.7725372314453, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1610240936279297, + "rewards/margins": 1.0796091556549072, + "rewards/rejected": -2.240633249282837, + "step": 8217 + }, + { + "epoch": 0.95, + "learning_rate": 1.601311014865972e-08, + "logits/chosen": -1.9534863233566284, + "logits/rejected": -2.4427616596221924, + "logps/chosen": -417.46710205078125, + "logps/rejected": -188.46994018554688, + "loss": 0.6294, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4457488059997559, + "rewards/margins": 1.6276757717132568, + "rewards/rejected": -3.0734243392944336, + "step": 8218 + }, + { + "epoch": 0.95, + "learning_rate": 1.5977993679035467e-08, + "logits/chosen": -2.5642786026000977, + "logits/rejected": -2.5183029174804688, + "logps/chosen": -361.74896240234375, + "logps/rejected": -344.7005615234375, + "loss": 0.4601, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.098114252090454, + "rewards/margins": 1.1188938617706299, + "rewards/rejected": -2.217008113861084, + "step": 8219 + }, + { + "epoch": 0.95, + "learning_rate": 1.5942877209411214e-08, + "logits/chosen": -2.8011789321899414, + "logits/rejected": -2.817629814147949, + "logps/chosen": -175.84642028808594, + "logps/rejected": -193.57666015625, + "loss": 0.362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29534780979156494, + "rewards/margins": 1.6376304626464844, + "rewards/rejected": -1.9329782724380493, + "step": 8220 + }, + { + "epoch": 0.95, + "learning_rate": 1.5907760739786958e-08, + "logits/chosen": -2.1089046001434326, + "logits/rejected": -2.0645246505737305, + "logps/chosen": -229.26502990722656, + "logps/rejected": -284.0816955566406, + "loss": 0.6712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.033544898033142, + "rewards/margins": 1.173237681388855, + "rewards/rejected": -2.206782579421997, + "step": 8221 + }, + { + "epoch": 0.95, + "learning_rate": 1.5872644270162705e-08, + "logits/chosen": -2.884287118911743, + "logits/rejected": -2.809424877166748, + "logps/chosen": -172.59078979492188, + "logps/rejected": -213.135009765625, + "loss": 0.1743, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02587127685546875, + "rewards/margins": 3.5372891426086426, + "rewards/rejected": -3.5631604194641113, + "step": 8222 + }, + { + "epoch": 0.95, + "learning_rate": 1.5837527800538453e-08, + "logits/chosen": -2.107008218765259, + "logits/rejected": -2.1737654209136963, + "logps/chosen": -244.62570190429688, + "logps/rejected": -296.54217529296875, + "loss": 0.2686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2894548177719116, + "rewards/margins": 2.0945942401885986, + "rewards/rejected": -2.3840491771698, + "step": 8223 + }, + { + "epoch": 0.95, + "learning_rate": 1.58024113309142e-08, + "logits/chosen": -1.8714792728424072, + "logits/rejected": -1.765846848487854, + "logps/chosen": -498.1934509277344, + "logps/rejected": -396.21832275390625, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2628223896026611, + "rewards/margins": 2.1048974990844727, + "rewards/rejected": -3.367719888687134, + "step": 8224 + }, + { + "epoch": 0.95, + "learning_rate": 1.5767294861289944e-08, + "logits/chosen": -2.169072389602661, + "logits/rejected": -2.2308976650238037, + "logps/chosen": -336.8297119140625, + "logps/rejected": -369.13775634765625, + "loss": 0.0859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48563235998153687, + "rewards/margins": 3.016700506210327, + "rewards/rejected": -3.502333164215088, + "step": 8225 + }, + { + "epoch": 0.95, + "learning_rate": 1.573217839166569e-08, + "logits/chosen": -2.634875774383545, + "logits/rejected": -2.555824041366577, + "logps/chosen": -256.6591796875, + "logps/rejected": -232.9036865234375, + "loss": 0.3416, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4669743776321411, + "rewards/margins": 2.3485300540924072, + "rewards/rejected": -2.8155040740966797, + "step": 8226 + }, + { + "epoch": 0.95, + "learning_rate": 1.5697061922041438e-08, + "logits/chosen": -2.454815149307251, + "logits/rejected": -2.52728009223938, + "logps/chosen": -256.1822204589844, + "logps/rejected": -279.79388427734375, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4380068778991699, + "rewards/margins": 4.091358661651611, + "rewards/rejected": -4.529365539550781, + "step": 8227 + }, + { + "epoch": 0.95, + "learning_rate": 1.5661945452417182e-08, + "logits/chosen": -2.513610363006592, + "logits/rejected": -2.577791452407837, + "logps/chosen": -233.11141967773438, + "logps/rejected": -269.00592041015625, + "loss": 0.4925, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6388925313949585, + "rewards/margins": 1.8703453540802002, + "rewards/rejected": -3.5092380046844482, + "step": 8228 + }, + { + "epoch": 0.95, + "learning_rate": 1.562682898279293e-08, + "logits/chosen": -1.9687576293945312, + "logits/rejected": -1.9327940940856934, + "logps/chosen": -442.5373840332031, + "logps/rejected": -398.97607421875, + "loss": 0.3742, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4559400081634521, + "rewards/margins": 2.040642261505127, + "rewards/rejected": -3.496582508087158, + "step": 8229 + }, + { + "epoch": 0.95, + "learning_rate": 1.5591712513168677e-08, + "logits/chosen": -2.401973247528076, + "logits/rejected": -2.3751590251922607, + "logps/chosen": -389.6524353027344, + "logps/rejected": -302.4639587402344, + "loss": 0.7419, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4333403408527374, + "rewards/margins": 1.428665280342102, + "rewards/rejected": -1.8620057106018066, + "step": 8230 + }, + { + "epoch": 0.95, + "learning_rate": 1.5556596043544424e-08, + "logits/chosen": -2.2603259086608887, + "logits/rejected": -2.3683745861053467, + "logps/chosen": -531.7265625, + "logps/rejected": -356.43292236328125, + "loss": 0.2136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9334300756454468, + "rewards/margins": 2.877683639526367, + "rewards/rejected": -3.8111135959625244, + "step": 8231 + }, + { + "epoch": 0.95, + "learning_rate": 1.5521479573920168e-08, + "logits/chosen": -2.143435001373291, + "logits/rejected": -2.394838333129883, + "logps/chosen": -262.005615234375, + "logps/rejected": -234.4247589111328, + "loss": 0.9698, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7956013679504395, + "rewards/margins": 1.3967947959899902, + "rewards/rejected": -4.1923956871032715, + "step": 8232 + }, + { + "epoch": 0.95, + "learning_rate": 1.5486363104295912e-08, + "logits/chosen": -2.86734676361084, + "logits/rejected": -2.66302752494812, + "logps/chosen": -234.0819091796875, + "logps/rejected": -236.04405212402344, + "loss": 0.4475, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8684165477752686, + "rewards/margins": 2.7250876426696777, + "rewards/rejected": -4.593503952026367, + "step": 8233 + }, + { + "epoch": 0.95, + "learning_rate": 1.545124663467166e-08, + "logits/chosen": -2.621300220489502, + "logits/rejected": -2.388458251953125, + "logps/chosen": -307.4089660644531, + "logps/rejected": -261.4266357421875, + "loss": 0.7851, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.806519627571106, + "rewards/margins": 2.096412181854248, + "rewards/rejected": -3.9029316902160645, + "step": 8234 + }, + { + "epoch": 0.95, + "learning_rate": 1.5416130165047406e-08, + "logits/chosen": -1.9723665714263916, + "logits/rejected": -2.2552828788757324, + "logps/chosen": -255.02734375, + "logps/rejected": -290.4146728515625, + "loss": 0.6616, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0572690963745117, + "rewards/margins": 2.3401753902435303, + "rewards/rejected": -3.397444725036621, + "step": 8235 + }, + { + "epoch": 0.95, + "learning_rate": 1.5381013695423153e-08, + "logits/chosen": -2.572010040283203, + "logits/rejected": -2.459028720855713, + "logps/chosen": -121.62605285644531, + "logps/rejected": -237.59063720703125, + "loss": 0.1606, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3555238246917725, + "rewards/margins": 3.9534902572631836, + "rewards/rejected": -5.309013843536377, + "step": 8236 + }, + { + "epoch": 0.95, + "learning_rate": 1.5345897225798897e-08, + "logits/chosen": -2.2141237258911133, + "logits/rejected": -2.030273199081421, + "logps/chosen": -195.24932861328125, + "logps/rejected": -301.466064453125, + "loss": 0.3221, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4247231483459473, + "rewards/margins": 1.6057147979736328, + "rewards/rejected": -3.03043794631958, + "step": 8237 + }, + { + "epoch": 0.95, + "learning_rate": 1.5310780756174645e-08, + "logits/chosen": -1.966910481452942, + "logits/rejected": -2.3881940841674805, + "logps/chosen": -366.10400390625, + "logps/rejected": -284.4371643066406, + "loss": 0.292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04185466468334198, + "rewards/margins": 1.9368526935577393, + "rewards/rejected": -1.9787073135375977, + "step": 8238 + }, + { + "epoch": 0.95, + "learning_rate": 1.5275664286550392e-08, + "logits/chosen": -2.148286819458008, + "logits/rejected": -2.2799129486083984, + "logps/chosen": -263.22314453125, + "logps/rejected": -280.1767883300781, + "loss": 0.5132, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4436075687408447, + "rewards/margins": 3.6960599422454834, + "rewards/rejected": -5.139667510986328, + "step": 8239 + }, + { + "epoch": 0.95, + "learning_rate": 1.5240547816926136e-08, + "logits/chosen": -2.6712136268615723, + "logits/rejected": -2.6024484634399414, + "logps/chosen": -290.7613220214844, + "logps/rejected": -304.1287536621094, + "loss": 0.2398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6359004974365234, + "rewards/margins": 2.2557363510131836, + "rewards/rejected": -2.891636848449707, + "step": 8240 + }, + { + "epoch": 0.95, + "learning_rate": 1.5205431347301883e-08, + "logits/chosen": -2.364969253540039, + "logits/rejected": -2.1837048530578613, + "logps/chosen": -352.7098693847656, + "logps/rejected": -352.56072998046875, + "loss": 0.2007, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0288026332855225, + "rewards/margins": 2.70569109916687, + "rewards/rejected": -3.7344934940338135, + "step": 8241 + }, + { + "epoch": 0.95, + "learning_rate": 1.517031487767763e-08, + "logits/chosen": -2.4664523601531982, + "logits/rejected": -2.358767032623291, + "logps/chosen": -141.3353271484375, + "logps/rejected": -355.6557312011719, + "loss": 0.8921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.503057599067688, + "rewards/margins": 3.1054506301879883, + "rewards/rejected": -4.608508110046387, + "step": 8242 + }, + { + "epoch": 0.95, + "learning_rate": 1.5135198408053377e-08, + "logits/chosen": -1.965165615081787, + "logits/rejected": -1.8724510669708252, + "logps/chosen": -328.6653137207031, + "logps/rejected": -332.27349853515625, + "loss": 0.5506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9438848495483398, + "rewards/margins": 1.7462775707244873, + "rewards/rejected": -2.690162420272827, + "step": 8243 + }, + { + "epoch": 0.95, + "learning_rate": 1.510008193842912e-08, + "logits/chosen": -2.628300666809082, + "logits/rejected": -2.5096161365509033, + "logps/chosen": -196.82150268554688, + "logps/rejected": -252.96627807617188, + "loss": 0.3269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05504554510116577, + "rewards/margins": 1.7126929759979248, + "rewards/rejected": -1.7677384614944458, + "step": 8244 + }, + { + "epoch": 0.95, + "learning_rate": 1.506496546880487e-08, + "logits/chosen": -1.7013025283813477, + "logits/rejected": -1.8580608367919922, + "logps/chosen": -305.80975341796875, + "logps/rejected": -332.0093994140625, + "loss": 0.2645, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1714463233947754, + "rewards/margins": 2.3482580184936523, + "rewards/rejected": -3.5197043418884277, + "step": 8245 + }, + { + "epoch": 0.95, + "learning_rate": 1.5029848999180616e-08, + "logits/chosen": -2.723078727722168, + "logits/rejected": -2.827258348464966, + "logps/chosen": -307.30413818359375, + "logps/rejected": -341.986083984375, + "loss": 0.6002, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8744150400161743, + "rewards/margins": 1.9179627895355225, + "rewards/rejected": -3.7923779487609863, + "step": 8246 + }, + { + "epoch": 0.95, + "learning_rate": 1.4994732529556363e-08, + "logits/chosen": -1.4323316812515259, + "logits/rejected": -1.397562026977539, + "logps/chosen": -345.17022705078125, + "logps/rejected": -358.51654052734375, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3762288093566895, + "rewards/margins": 0.7336118221282959, + "rewards/rejected": -2.1098408699035645, + "step": 8247 + }, + { + "epoch": 0.95, + "learning_rate": 1.4959616059932107e-08, + "logits/chosen": -2.167080879211426, + "logits/rejected": -2.019486904144287, + "logps/chosen": -135.76815795898438, + "logps/rejected": -159.17315673828125, + "loss": 0.2523, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6041639447212219, + "rewards/margins": 2.044163703918457, + "rewards/rejected": -2.6483278274536133, + "step": 8248 + }, + { + "epoch": 0.95, + "learning_rate": 1.4924499590307854e-08, + "logits/chosen": -1.7477171421051025, + "logits/rejected": -1.9014275074005127, + "logps/chosen": -236.1300048828125, + "logps/rejected": -200.15380859375, + "loss": 1.0353, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.165822982788086, + "rewards/margins": 0.39269596338272095, + "rewards/rejected": -1.558518886566162, + "step": 8249 + }, + { + "epoch": 0.95, + "learning_rate": 1.48893831206836e-08, + "logits/chosen": -2.3506147861480713, + "logits/rejected": -2.5523416996002197, + "logps/chosen": -219.05801391601562, + "logps/rejected": -254.22509765625, + "loss": 0.4711, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3129764795303345, + "rewards/margins": 3.3325858116149902, + "rewards/rejected": -4.645562171936035, + "step": 8250 + }, + { + "epoch": 0.95, + "learning_rate": 1.4854266651059347e-08, + "logits/chosen": -2.3689956665039062, + "logits/rejected": -2.2617290019989014, + "logps/chosen": -118.78758239746094, + "logps/rejected": -118.387939453125, + "loss": 0.8193, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4409905672073364, + "rewards/margins": 0.2985152304172516, + "rewards/rejected": -1.7395058870315552, + "step": 8251 + }, + { + "epoch": 0.95, + "learning_rate": 1.4819150181435093e-08, + "logits/chosen": -1.9632024765014648, + "logits/rejected": -1.903199553489685, + "logps/chosen": -392.3169250488281, + "logps/rejected": -343.50506591796875, + "loss": 0.5067, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5512116551399231, + "rewards/margins": 2.214841842651367, + "rewards/rejected": -2.7660536766052246, + "step": 8252 + }, + { + "epoch": 0.95, + "learning_rate": 1.478403371181084e-08, + "logits/chosen": -1.479838252067566, + "logits/rejected": -2.05582857131958, + "logps/chosen": -440.92242431640625, + "logps/rejected": -294.76165771484375, + "loss": 0.224, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7981674671173096, + "rewards/margins": 2.340716600418091, + "rewards/rejected": -3.1388840675354004, + "step": 8253 + }, + { + "epoch": 0.95, + "learning_rate": 1.4748917242186585e-08, + "logits/chosen": -2.218390941619873, + "logits/rejected": -1.9446861743927002, + "logps/chosen": -189.9794158935547, + "logps/rejected": -252.72122192382812, + "loss": 0.1945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3049435019493103, + "rewards/margins": 2.5821781158447266, + "rewards/rejected": -2.8871216773986816, + "step": 8254 + }, + { + "epoch": 0.95, + "learning_rate": 1.4713800772562333e-08, + "logits/chosen": -2.098259687423706, + "logits/rejected": -1.952418327331543, + "logps/chosen": -264.3125305175781, + "logps/rejected": -293.2491149902344, + "loss": 0.2286, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4137636423110962, + "rewards/margins": 2.2075295448303223, + "rewards/rejected": -2.621293067932129, + "step": 8255 + }, + { + "epoch": 0.95, + "learning_rate": 1.4678684302938078e-08, + "logits/chosen": -2.0404295921325684, + "logits/rejected": -2.0525307655334473, + "logps/chosen": -240.16970825195312, + "logps/rejected": -303.0284423828125, + "loss": 0.3952, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49049675464630127, + "rewards/margins": 1.403594970703125, + "rewards/rejected": -1.8940916061401367, + "step": 8256 + }, + { + "epoch": 0.95, + "learning_rate": 1.4643567833313822e-08, + "logits/chosen": -2.646061420440674, + "logits/rejected": -2.4989523887634277, + "logps/chosen": -246.05908203125, + "logps/rejected": -396.67938232421875, + "loss": 0.4155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8750490546226501, + "rewards/margins": 2.630784749984741, + "rewards/rejected": -3.505833864212036, + "step": 8257 + }, + { + "epoch": 0.95, + "learning_rate": 1.460845136368957e-08, + "logits/chosen": -2.160069465637207, + "logits/rejected": -2.0073602199554443, + "logps/chosen": -232.36801147460938, + "logps/rejected": -246.19619750976562, + "loss": 0.7523, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0552514791488647, + "rewards/margins": 0.3573099374771118, + "rewards/rejected": -1.4125614166259766, + "step": 8258 + }, + { + "epoch": 0.95, + "learning_rate": 1.4573334894065315e-08, + "logits/chosen": -2.3211896419525146, + "logits/rejected": -2.3896095752716064, + "logps/chosen": -155.46127319335938, + "logps/rejected": -310.55194091796875, + "loss": 0.4189, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6645941734313965, + "rewards/margins": 2.8528265953063965, + "rewards/rejected": -3.517420768737793, + "step": 8259 + }, + { + "epoch": 0.95, + "learning_rate": 1.4538218424441062e-08, + "logits/chosen": -1.8535430431365967, + "logits/rejected": -1.8440980911254883, + "logps/chosen": -354.426025390625, + "logps/rejected": -353.385498046875, + "loss": 0.5944, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5027316808700562, + "rewards/margins": 0.9534693360328674, + "rewards/rejected": -2.4562010765075684, + "step": 8260 + }, + { + "epoch": 0.95, + "learning_rate": 1.4503101954816808e-08, + "logits/chosen": -1.9669585227966309, + "logits/rejected": -1.9647859334945679, + "logps/chosen": -187.64813232421875, + "logps/rejected": -249.197021484375, + "loss": 0.5093, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6276511549949646, + "rewards/margins": 0.8348908424377441, + "rewards/rejected": -1.462541937828064, + "step": 8261 + }, + { + "epoch": 0.95, + "learning_rate": 1.4467985485192553e-08, + "logits/chosen": -2.4270570278167725, + "logits/rejected": -1.853094220161438, + "logps/chosen": -254.35533142089844, + "logps/rejected": -351.2856140136719, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7937659621238708, + "rewards/margins": 1.9420839548110962, + "rewards/rejected": -2.7358498573303223, + "step": 8262 + }, + { + "epoch": 0.95, + "learning_rate": 1.44328690155683e-08, + "logits/chosen": -2.4770724773406982, + "logits/rejected": -2.60392427444458, + "logps/chosen": -282.8480529785156, + "logps/rejected": -330.65911865234375, + "loss": 1.0436, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0564939975738525, + "rewards/margins": 2.9303104877471924, + "rewards/rejected": -4.986804485321045, + "step": 8263 + }, + { + "epoch": 0.95, + "learning_rate": 1.4397752545944046e-08, + "logits/chosen": -2.5933139324188232, + "logits/rejected": -2.620811700820923, + "logps/chosen": -342.4826354980469, + "logps/rejected": -181.97262573242188, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6550470590591431, + "rewards/margins": 1.526991844177246, + "rewards/rejected": -2.1820390224456787, + "step": 8264 + }, + { + "epoch": 0.95, + "learning_rate": 1.4362636076319793e-08, + "logits/chosen": -2.009542226791382, + "logits/rejected": -2.3255162239074707, + "logps/chosen": -361.3996887207031, + "logps/rejected": -361.939208984375, + "loss": 0.1003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21725411713123322, + "rewards/margins": 2.820631265640259, + "rewards/rejected": -3.0378854274749756, + "step": 8265 + }, + { + "epoch": 0.95, + "learning_rate": 1.4327519606695539e-08, + "logits/chosen": -1.8696386814117432, + "logits/rejected": -2.2370352745056152, + "logps/chosen": -329.7279357910156, + "logps/rejected": -222.33663940429688, + "loss": 0.6158, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6406790018081665, + "rewards/margins": 0.854901134967804, + "rewards/rejected": -1.4955799579620361, + "step": 8266 + }, + { + "epoch": 0.95, + "learning_rate": 1.4292403137071286e-08, + "logits/chosen": -2.126434803009033, + "logits/rejected": -1.8886560201644897, + "logps/chosen": -402.7474365234375, + "logps/rejected": -460.1066589355469, + "loss": 0.3395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7668194770812988, + "rewards/margins": 1.6816372871398926, + "rewards/rejected": -2.4484570026397705, + "step": 8267 + }, + { + "epoch": 0.95, + "learning_rate": 1.4257286667447032e-08, + "logits/chosen": -2.2287673950195312, + "logits/rejected": -2.5332276821136475, + "logps/chosen": -356.6734924316406, + "logps/rejected": -219.8162841796875, + "loss": 0.279, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7882445454597473, + "rewards/margins": 1.5328540802001953, + "rewards/rejected": -2.321098566055298, + "step": 8268 + }, + { + "epoch": 0.95, + "learning_rate": 1.4222170197822779e-08, + "logits/chosen": -2.016425609588623, + "logits/rejected": -2.0506887435913086, + "logps/chosen": -241.19195556640625, + "logps/rejected": -182.08282470703125, + "loss": 0.6087, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3056004047393799, + "rewards/margins": 1.283187747001648, + "rewards/rejected": -2.5887882709503174, + "step": 8269 + }, + { + "epoch": 0.95, + "learning_rate": 1.4187053728198525e-08, + "logits/chosen": -2.2754054069519043, + "logits/rejected": -2.2868995666503906, + "logps/chosen": -264.8188171386719, + "logps/rejected": -368.41998291015625, + "loss": 0.7867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8417835235595703, + "rewards/margins": 1.264532446861267, + "rewards/rejected": -2.106316089630127, + "step": 8270 + }, + { + "epoch": 0.95, + "learning_rate": 1.415193725857427e-08, + "logits/chosen": -2.4832239151000977, + "logits/rejected": -2.6688332557678223, + "logps/chosen": -491.25653076171875, + "logps/rejected": -277.1683044433594, + "loss": 0.3781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7949410080909729, + "rewards/margins": 1.8627746105194092, + "rewards/rejected": -2.6577157974243164, + "step": 8271 + }, + { + "epoch": 0.95, + "learning_rate": 1.4116820788950018e-08, + "logits/chosen": -1.8937352895736694, + "logits/rejected": -2.0666918754577637, + "logps/chosen": -274.9108581542969, + "logps/rejected": -201.6082763671875, + "loss": 0.2893, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8092123866081238, + "rewards/margins": 1.8088558912277222, + "rewards/rejected": -2.6180684566497803, + "step": 8272 + }, + { + "epoch": 0.95, + "learning_rate": 1.4081704319325763e-08, + "logits/chosen": -2.014192819595337, + "logits/rejected": -2.167184352874756, + "logps/chosen": -271.2029724121094, + "logps/rejected": -300.60308837890625, + "loss": 0.6789, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.972693681716919, + "rewards/margins": 2.377427101135254, + "rewards/rejected": -3.3501205444335938, + "step": 8273 + }, + { + "epoch": 0.95, + "learning_rate": 1.404658784970151e-08, + "logits/chosen": -2.182666301727295, + "logits/rejected": -2.028249502182007, + "logps/chosen": -254.47665405273438, + "logps/rejected": -359.631103515625, + "loss": 0.2226, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0379501581192017, + "rewards/margins": 2.69347882270813, + "rewards/rejected": -3.731429100036621, + "step": 8274 + }, + { + "epoch": 0.95, + "learning_rate": 1.4011471380077256e-08, + "logits/chosen": -2.41398286819458, + "logits/rejected": -2.1091766357421875, + "logps/chosen": -266.19110107421875, + "logps/rejected": -320.46453857421875, + "loss": 0.3577, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.916825532913208, + "rewards/margins": 1.3636507987976074, + "rewards/rejected": -2.2804763317108154, + "step": 8275 + }, + { + "epoch": 0.95, + "learning_rate": 1.3976354910453003e-08, + "logits/chosen": -2.2793538570404053, + "logits/rejected": -2.464970588684082, + "logps/chosen": -296.716552734375, + "logps/rejected": -197.0606231689453, + "loss": 0.3853, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.430849552154541, + "rewards/margins": 1.3943884372711182, + "rewards/rejected": -1.8252379894256592, + "step": 8276 + }, + { + "epoch": 0.95, + "learning_rate": 1.3941238440828749e-08, + "logits/chosen": -2.6014444828033447, + "logits/rejected": -2.644308090209961, + "logps/chosen": -395.38970947265625, + "logps/rejected": -274.37884521484375, + "loss": 0.9315, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7527880668640137, + "rewards/margins": 1.3874837160110474, + "rewards/rejected": -3.1402716636657715, + "step": 8277 + }, + { + "epoch": 0.95, + "learning_rate": 1.3906121971204494e-08, + "logits/chosen": -2.1856250762939453, + "logits/rejected": -2.4860968589782715, + "logps/chosen": -436.31201171875, + "logps/rejected": -290.99285888671875, + "loss": 0.3027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7600710391998291, + "rewards/margins": 2.0688252449035645, + "rewards/rejected": -2.8288960456848145, + "step": 8278 + }, + { + "epoch": 0.95, + "learning_rate": 1.3871005501580242e-08, + "logits/chosen": -2.2752315998077393, + "logits/rejected": -2.4241459369659424, + "logps/chosen": -347.80926513671875, + "logps/rejected": -281.23065185546875, + "loss": 0.2008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31274324655532837, + "rewards/margins": 3.225252866744995, + "rewards/rejected": -3.5379960536956787, + "step": 8279 + }, + { + "epoch": 0.95, + "learning_rate": 1.3835889031955987e-08, + "logits/chosen": -2.2316653728485107, + "logits/rejected": -2.078423500061035, + "logps/chosen": -209.36602783203125, + "logps/rejected": -236.28411865234375, + "loss": 0.3123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.943693995475769, + "rewards/margins": 1.5914027690887451, + "rewards/rejected": -2.5350968837738037, + "step": 8280 + }, + { + "epoch": 0.95, + "learning_rate": 1.3800772562331733e-08, + "logits/chosen": -1.867465615272522, + "logits/rejected": -2.0226948261260986, + "logps/chosen": -212.71426391601562, + "logps/rejected": -293.33001708984375, + "loss": 0.4407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6688316464424133, + "rewards/margins": 1.8256241083145142, + "rewards/rejected": -2.4944558143615723, + "step": 8281 + }, + { + "epoch": 0.95, + "learning_rate": 1.3765656092707478e-08, + "logits/chosen": -2.7119414806365967, + "logits/rejected": -2.636848211288452, + "logps/chosen": -259.87652587890625, + "logps/rejected": -250.5284881591797, + "loss": 0.2677, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8691458106040955, + "rewards/margins": 3.2916018962860107, + "rewards/rejected": -4.160747528076172, + "step": 8282 + }, + { + "epoch": 0.95, + "learning_rate": 1.3730539623083224e-08, + "logits/chosen": -2.266608953475952, + "logits/rejected": -1.931445837020874, + "logps/chosen": -268.96917724609375, + "logps/rejected": -479.87872314453125, + "loss": 0.0974, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0745906829833984, + "rewards/margins": 4.163519859313965, + "rewards/rejected": -5.238110542297363, + "step": 8283 + }, + { + "epoch": 0.95, + "learning_rate": 1.3695423153458971e-08, + "logits/chosen": -2.5415749549865723, + "logits/rejected": -2.382261276245117, + "logps/chosen": -195.99969482421875, + "logps/rejected": -240.52716064453125, + "loss": 0.6959, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.045905113220215, + "rewards/margins": 1.9108092784881592, + "rewards/rejected": -3.956713914871216, + "step": 8284 + }, + { + "epoch": 0.96, + "learning_rate": 1.3660306683834717e-08, + "logits/chosen": -2.3314874172210693, + "logits/rejected": -2.303858518600464, + "logps/chosen": -192.1435089111328, + "logps/rejected": -282.242919921875, + "loss": 0.4581, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1718794107437134, + "rewards/margins": 3.334355354309082, + "rewards/rejected": -4.506234645843506, + "step": 8285 + }, + { + "epoch": 0.96, + "learning_rate": 1.3625190214210464e-08, + "logits/chosen": -2.642104148864746, + "logits/rejected": -2.64909029006958, + "logps/chosen": -148.199951171875, + "logps/rejected": -194.25291442871094, + "loss": 0.8069, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.109351634979248, + "rewards/margins": 0.3912842571735382, + "rewards/rejected": -1.5006358623504639, + "step": 8286 + }, + { + "epoch": 0.96, + "learning_rate": 1.359007374458621e-08, + "logits/chosen": -1.7473870515823364, + "logits/rejected": -1.6132011413574219, + "logps/chosen": -246.00296020507812, + "logps/rejected": -250.97964477539062, + "loss": 0.4049, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1095666885375977, + "rewards/margins": 1.4861431121826172, + "rewards/rejected": -2.595709800720215, + "step": 8287 + }, + { + "epoch": 0.96, + "learning_rate": 1.3554957274961957e-08, + "logits/chosen": -2.3946666717529297, + "logits/rejected": -2.1484169960021973, + "logps/chosen": -341.1733093261719, + "logps/rejected": -316.71417236328125, + "loss": 0.4006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7947309017181396, + "rewards/margins": 1.8691835403442383, + "rewards/rejected": -2.663914442062378, + "step": 8288 + }, + { + "epoch": 0.96, + "learning_rate": 1.3519840805337702e-08, + "logits/chosen": -2.184041976928711, + "logits/rejected": -2.406175136566162, + "logps/chosen": -352.58502197265625, + "logps/rejected": -231.67877197265625, + "loss": 0.2433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3094976544380188, + "rewards/margins": 1.7849819660186768, + "rewards/rejected": -2.09447979927063, + "step": 8289 + }, + { + "epoch": 0.96, + "learning_rate": 1.348472433571345e-08, + "logits/chosen": -3.0867319107055664, + "logits/rejected": -2.905766248703003, + "logps/chosen": -351.31103515625, + "logps/rejected": -249.64791870117188, + "loss": 0.3659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.791554868221283, + "rewards/margins": 2.4145970344543457, + "rewards/rejected": -3.2061517238616943, + "step": 8290 + }, + { + "epoch": 0.96, + "learning_rate": 1.3449607866089195e-08, + "logits/chosen": -1.882153868675232, + "logits/rejected": -1.9802619218826294, + "logps/chosen": -437.22650146484375, + "logps/rejected": -288.1961975097656, + "loss": 0.4811, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6404318809509277, + "rewards/margins": 0.7211199998855591, + "rewards/rejected": -1.3615517616271973, + "step": 8291 + }, + { + "epoch": 0.96, + "learning_rate": 1.341449139646494e-08, + "logits/chosen": -2.733654022216797, + "logits/rejected": -2.5760626792907715, + "logps/chosen": -89.0242691040039, + "logps/rejected": -116.89396667480469, + "loss": 0.2682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.489016592502594, + "rewards/margins": 1.429884672164917, + "rewards/rejected": -1.9189014434814453, + "step": 8292 + }, + { + "epoch": 0.96, + "learning_rate": 1.3379374926840688e-08, + "logits/chosen": -2.7596194744110107, + "logits/rejected": -2.4150569438934326, + "logps/chosen": -227.75491333007812, + "logps/rejected": -211.98941040039062, + "loss": 0.3876, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.698533296585083, + "rewards/margins": 1.4590494632720947, + "rewards/rejected": -2.1575827598571777, + "step": 8293 + }, + { + "epoch": 0.96, + "learning_rate": 1.3344258457216434e-08, + "logits/chosen": -2.516569137573242, + "logits/rejected": -2.3535304069519043, + "logps/chosen": -169.99005126953125, + "logps/rejected": -123.03577423095703, + "loss": 0.7062, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1712150573730469, + "rewards/margins": 0.36245197057724, + "rewards/rejected": -1.5336668491363525, + "step": 8294 + }, + { + "epoch": 0.96, + "learning_rate": 1.330914198759218e-08, + "logits/chosen": -1.80202054977417, + "logits/rejected": -1.947664737701416, + "logps/chosen": -401.722412109375, + "logps/rejected": -315.82769775390625, + "loss": 0.5492, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3292683362960815, + "rewards/margins": 0.452523797750473, + "rewards/rejected": -1.781792163848877, + "step": 8295 + }, + { + "epoch": 0.96, + "learning_rate": 1.3274025517967926e-08, + "logits/chosen": -2.299027681350708, + "logits/rejected": -2.2782106399536133, + "logps/chosen": -270.0833435058594, + "logps/rejected": -246.8682861328125, + "loss": 0.522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8363924026489258, + "rewards/margins": 1.6614110469818115, + "rewards/rejected": -2.497803211212158, + "step": 8296 + }, + { + "epoch": 0.96, + "learning_rate": 1.3238909048343674e-08, + "logits/chosen": -2.483064651489258, + "logits/rejected": -2.2416744232177734, + "logps/chosen": -222.86912536621094, + "logps/rejected": -333.1945495605469, + "loss": 0.7814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4272649586200714, + "rewards/margins": 1.0058245658874512, + "rewards/rejected": -1.4330894947052002, + "step": 8297 + }, + { + "epoch": 0.96, + "learning_rate": 1.320379257871942e-08, + "logits/chosen": -2.869241714477539, + "logits/rejected": -2.7254879474639893, + "logps/chosen": -407.4052734375, + "logps/rejected": -301.5380554199219, + "loss": 0.165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30546730756759644, + "rewards/margins": 2.424659013748169, + "rewards/rejected": -2.73012638092041, + "step": 8298 + }, + { + "epoch": 0.96, + "learning_rate": 1.3168676109095165e-08, + "logits/chosen": -2.561314105987549, + "logits/rejected": -2.567498207092285, + "logps/chosen": -232.40536499023438, + "logps/rejected": -219.89523315429688, + "loss": 0.3178, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7321026921272278, + "rewards/margins": 2.317187786102295, + "rewards/rejected": -3.049290418624878, + "step": 8299 + }, + { + "epoch": 0.96, + "learning_rate": 1.3133559639470912e-08, + "logits/chosen": -2.373229503631592, + "logits/rejected": -2.182745933532715, + "logps/chosen": -211.27923583984375, + "logps/rejected": -285.32562255859375, + "loss": 0.5853, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0045630931854248, + "rewards/margins": 2.177781581878662, + "rewards/rejected": -3.182344436645508, + "step": 8300 + }, + { + "epoch": 0.96, + "learning_rate": 1.3098443169846658e-08, + "logits/chosen": -2.4689865112304688, + "logits/rejected": -2.5584490299224854, + "logps/chosen": -133.2224578857422, + "logps/rejected": -245.5079803466797, + "loss": 0.3709, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004721328616142273, + "rewards/margins": 2.6925065517425537, + "rewards/rejected": -2.6877851486206055, + "step": 8301 + }, + { + "epoch": 0.96, + "learning_rate": 1.3063326700222405e-08, + "logits/chosen": -2.1465210914611816, + "logits/rejected": -2.409426689147949, + "logps/chosen": -310.6603088378906, + "logps/rejected": -216.81036376953125, + "loss": 0.9781, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9175841808319092, + "rewards/margins": 0.3360099792480469, + "rewards/rejected": -1.2535940408706665, + "step": 8302 + }, + { + "epoch": 0.96, + "learning_rate": 1.302821023059815e-08, + "logits/chosen": -1.995065450668335, + "logits/rejected": -2.0032949447631836, + "logps/chosen": -263.4268493652344, + "logps/rejected": -202.44847106933594, + "loss": 0.6664, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.074084758758545, + "rewards/margins": 0.6471154689788818, + "rewards/rejected": -1.7212002277374268, + "step": 8303 + }, + { + "epoch": 0.96, + "learning_rate": 1.2993093760973898e-08, + "logits/chosen": -1.8358466625213623, + "logits/rejected": -1.7912236452102661, + "logps/chosen": -529.1168823242188, + "logps/rejected": -430.79974365234375, + "loss": 0.312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5544756054878235, + "rewards/margins": 2.937216281890869, + "rewards/rejected": -3.491692066192627, + "step": 8304 + }, + { + "epoch": 0.96, + "learning_rate": 1.2957977291349642e-08, + "logits/chosen": -1.7502506971359253, + "logits/rejected": -1.9026598930358887, + "logps/chosen": -313.86602783203125, + "logps/rejected": -198.50137329101562, + "loss": 0.5531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6114131808280945, + "rewards/margins": 0.66356360912323, + "rewards/rejected": -1.2749768495559692, + "step": 8305 + }, + { + "epoch": 0.96, + "learning_rate": 1.2922860821725387e-08, + "logits/chosen": -2.182704448699951, + "logits/rejected": -2.589506149291992, + "logps/chosen": -346.578125, + "logps/rejected": -167.0293731689453, + "loss": 0.4267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8164417147636414, + "rewards/margins": 1.8456387519836426, + "rewards/rejected": -2.6620802879333496, + "step": 8306 + }, + { + "epoch": 0.96, + "learning_rate": 1.2887744352101134e-08, + "logits/chosen": -2.203845500946045, + "logits/rejected": -2.3202567100524902, + "logps/chosen": -234.7093505859375, + "logps/rejected": -173.7859649658203, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4492347538471222, + "rewards/margins": 3.841684579849243, + "rewards/rejected": -3.3924498558044434, + "step": 8307 + }, + { + "epoch": 0.96, + "learning_rate": 1.285262788247688e-08, + "logits/chosen": -2.741480588912964, + "logits/rejected": -2.530499219894409, + "logps/chosen": -132.44192504882812, + "logps/rejected": -165.7700653076172, + "loss": 0.3554, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42083218693733215, + "rewards/margins": 1.3639628887176514, + "rewards/rejected": -1.7847950458526611, + "step": 8308 + }, + { + "epoch": 0.96, + "learning_rate": 1.2817511412852627e-08, + "logits/chosen": -2.3506808280944824, + "logits/rejected": -2.2855687141418457, + "logps/chosen": -255.98614501953125, + "logps/rejected": -295.5982666015625, + "loss": 0.1818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9876425266265869, + "rewards/margins": 2.7364578247070312, + "rewards/rejected": -3.724100351333618, + "step": 8309 + }, + { + "epoch": 0.96, + "learning_rate": 1.2782394943228373e-08, + "logits/chosen": -2.314842939376831, + "logits/rejected": -2.171907901763916, + "logps/chosen": -309.693603515625, + "logps/rejected": -316.6365051269531, + "loss": 0.6404, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.601499319076538, + "rewards/margins": 2.4783599376678467, + "rewards/rejected": -4.079859256744385, + "step": 8310 + }, + { + "epoch": 0.96, + "learning_rate": 1.274727847360412e-08, + "logits/chosen": -2.2239649295806885, + "logits/rejected": -2.5767478942871094, + "logps/chosen": -184.0691375732422, + "logps/rejected": -167.1852264404297, + "loss": 0.2578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4596540927886963, + "rewards/margins": 2.3791871070861816, + "rewards/rejected": -2.838840961456299, + "step": 8311 + }, + { + "epoch": 0.96, + "learning_rate": 1.2712162003979866e-08, + "logits/chosen": -2.283226490020752, + "logits/rejected": -2.0641965866088867, + "logps/chosen": -229.63143920898438, + "logps/rejected": -239.77825927734375, + "loss": 0.6753, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7967970967292786, + "rewards/margins": 1.634911060333252, + "rewards/rejected": -2.4317078590393066, + "step": 8312 + }, + { + "epoch": 0.96, + "learning_rate": 1.2677045534355611e-08, + "logits/chosen": -2.3818631172180176, + "logits/rejected": -2.399522304534912, + "logps/chosen": -194.3130645751953, + "logps/rejected": -254.11880493164062, + "loss": 0.3959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7192416191101074, + "rewards/margins": 1.7099722623825073, + "rewards/rejected": -2.429213762283325, + "step": 8313 + }, + { + "epoch": 0.96, + "learning_rate": 1.2641929064731358e-08, + "logits/chosen": -2.9684715270996094, + "logits/rejected": -2.964405059814453, + "logps/chosen": -232.5870361328125, + "logps/rejected": -286.07867431640625, + "loss": 0.2561, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6005650758743286, + "rewards/margins": 2.231799840927124, + "rewards/rejected": -2.832365036010742, + "step": 8314 + }, + { + "epoch": 0.96, + "learning_rate": 1.2606812595107104e-08, + "logits/chosen": -2.4707913398742676, + "logits/rejected": -2.651073455810547, + "logps/chosen": -672.8053588867188, + "logps/rejected": -576.0823364257812, + "loss": 0.6994, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1168054342269897, + "rewards/margins": 1.3123998641967773, + "rewards/rejected": -2.4292054176330566, + "step": 8315 + }, + { + "epoch": 0.96, + "learning_rate": 1.2571696125482851e-08, + "logits/chosen": -1.8781647682189941, + "logits/rejected": -2.075632095336914, + "logps/chosen": -393.7124938964844, + "logps/rejected": -268.91162109375, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.604841947555542, + "rewards/margins": 2.7693192958831787, + "rewards/rejected": -3.3741610050201416, + "step": 8316 + }, + { + "epoch": 0.96, + "learning_rate": 1.2536579655858597e-08, + "logits/chosen": -2.2544960975646973, + "logits/rejected": -2.503030776977539, + "logps/chosen": -245.5410919189453, + "logps/rejected": -242.69638061523438, + "loss": 0.8396, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3987938165664673, + "rewards/margins": 1.1991896629333496, + "rewards/rejected": -2.5979838371276855, + "step": 8317 + }, + { + "epoch": 0.96, + "learning_rate": 1.2501463186234344e-08, + "logits/chosen": -2.422173500061035, + "logits/rejected": -2.6213648319244385, + "logps/chosen": -214.62802124023438, + "logps/rejected": -249.0385284423828, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.097285270690918, + "rewards/margins": 3.554110050201416, + "rewards/rejected": -4.651395320892334, + "step": 8318 + }, + { + "epoch": 0.96, + "learning_rate": 1.246634671661009e-08, + "logits/chosen": -2.674834728240967, + "logits/rejected": -2.488154888153076, + "logps/chosen": -208.8865966796875, + "logps/rejected": -233.4732208251953, + "loss": 0.1696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33576852083206177, + "rewards/margins": 2.4751651287078857, + "rewards/rejected": -2.8109335899353027, + "step": 8319 + }, + { + "epoch": 0.96, + "learning_rate": 1.2431230246985837e-08, + "logits/chosen": -2.4127469062805176, + "logits/rejected": -2.4286203384399414, + "logps/chosen": -150.64634704589844, + "logps/rejected": -160.19589233398438, + "loss": 0.2545, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7533305883407593, + "rewards/margins": 2.091662645339966, + "rewards/rejected": -2.8449933528900146, + "step": 8320 + }, + { + "epoch": 0.96, + "learning_rate": 1.2396113777361583e-08, + "logits/chosen": -2.3117785453796387, + "logits/rejected": -2.3148465156555176, + "logps/chosen": -147.966064453125, + "logps/rejected": -292.65576171875, + "loss": 0.4317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11068554222583771, + "rewards/margins": 1.6150825023651123, + "rewards/rejected": -1.7257680892944336, + "step": 8321 + }, + { + "epoch": 0.96, + "learning_rate": 1.2360997307737328e-08, + "logits/chosen": -2.0014476776123047, + "logits/rejected": -2.1686179637908936, + "logps/chosen": -347.53106689453125, + "logps/rejected": -273.9087219238281, + "loss": 0.2929, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1336725950241089, + "rewards/margins": 1.9572792053222656, + "rewards/rejected": -3.090951919555664, + "step": 8322 + }, + { + "epoch": 0.96, + "learning_rate": 1.2325880838113075e-08, + "logits/chosen": -1.845354676246643, + "logits/rejected": -2.0995259284973145, + "logps/chosen": -234.32553100585938, + "logps/rejected": -239.40492248535156, + "loss": 0.9494, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0456719398498535, + "rewards/margins": 3.152944326400757, + "rewards/rejected": -5.198616027832031, + "step": 8323 + }, + { + "epoch": 0.96, + "learning_rate": 1.2290764368488821e-08, + "logits/chosen": -2.7591135501861572, + "logits/rejected": -2.5578761100769043, + "logps/chosen": -168.6669158935547, + "logps/rejected": -214.67758178710938, + "loss": 0.3543, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9702253937721252, + "rewards/margins": 1.551332950592041, + "rewards/rejected": -2.5215582847595215, + "step": 8324 + }, + { + "epoch": 0.96, + "learning_rate": 1.2255647898864568e-08, + "logits/chosen": -2.6209511756896973, + "logits/rejected": -2.8848488330841064, + "logps/chosen": -342.75006103515625, + "logps/rejected": -261.943603515625, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9924281239509583, + "rewards/margins": 3.194971799850464, + "rewards/rejected": -4.187399864196777, + "step": 8325 + }, + { + "epoch": 0.96, + "learning_rate": 1.2220531429240314e-08, + "logits/chosen": -2.1353514194488525, + "logits/rejected": -2.087111473083496, + "logps/chosen": -163.5817108154297, + "logps/rejected": -223.1149444580078, + "loss": 0.1112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23247739672660828, + "rewards/margins": 2.353888988494873, + "rewards/rejected": -2.586366653442383, + "step": 8326 + }, + { + "epoch": 0.96, + "learning_rate": 1.2185414959616061e-08, + "logits/chosen": -2.4908699989318848, + "logits/rejected": -2.39754056930542, + "logps/chosen": -366.8436279296875, + "logps/rejected": -268.8402099609375, + "loss": 0.5818, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.297037124633789, + "rewards/margins": 1.5794470310211182, + "rewards/rejected": -2.876483917236328, + "step": 8327 + }, + { + "epoch": 0.96, + "learning_rate": 1.2150298489991805e-08, + "logits/chosen": -2.616123676300049, + "logits/rejected": -2.5066447257995605, + "logps/chosen": -164.83224487304688, + "logps/rejected": -263.5330810546875, + "loss": 0.3196, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1744861602783203, + "rewards/margins": 3.100475549697876, + "rewards/rejected": -4.274961471557617, + "step": 8328 + }, + { + "epoch": 0.96, + "learning_rate": 1.211518202036755e-08, + "logits/chosen": -2.797945261001587, + "logits/rejected": -2.8271989822387695, + "logps/chosen": -328.6097717285156, + "logps/rejected": -286.74365234375, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2515861988067627, + "rewards/margins": 4.3691864013671875, + "rewards/rejected": -4.620772361755371, + "step": 8329 + }, + { + "epoch": 0.96, + "learning_rate": 1.2080065550743298e-08, + "logits/chosen": -2.183475971221924, + "logits/rejected": -2.027939558029175, + "logps/chosen": -345.29193115234375, + "logps/rejected": -426.15179443359375, + "loss": 0.2061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1226952075958252, + "rewards/margins": 2.5388548374176025, + "rewards/rejected": -3.6615500450134277, + "step": 8330 + }, + { + "epoch": 0.96, + "learning_rate": 1.2044949081119043e-08, + "logits/chosen": -2.0780155658721924, + "logits/rejected": -2.2834160327911377, + "logps/chosen": -388.6920471191406, + "logps/rejected": -275.80780029296875, + "loss": 0.3909, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4854518175125122, + "rewards/margins": 1.5190620422363281, + "rewards/rejected": -2.004513740539551, + "step": 8331 + }, + { + "epoch": 0.96, + "learning_rate": 1.200983261149479e-08, + "logits/chosen": -2.009094476699829, + "logits/rejected": -2.1120615005493164, + "logps/chosen": -209.4219970703125, + "logps/rejected": -154.80186462402344, + "loss": 1.1734, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.44200599193573, + "rewards/margins": 1.0006000995635986, + "rewards/rejected": -2.442605972290039, + "step": 8332 + }, + { + "epoch": 0.96, + "learning_rate": 1.1974716141870536e-08, + "logits/chosen": -2.6403942108154297, + "logits/rejected": -2.5122833251953125, + "logps/chosen": -260.5957946777344, + "logps/rejected": -308.1060791015625, + "loss": 0.2414, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0806761980056763, + "rewards/margins": 2.0823841094970703, + "rewards/rejected": -3.163060188293457, + "step": 8333 + }, + { + "epoch": 0.96, + "learning_rate": 1.1939599672246282e-08, + "logits/chosen": -2.613759756088257, + "logits/rejected": -2.7317590713500977, + "logps/chosen": -378.3291015625, + "logps/rejected": -272.3094482421875, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3641070127487183, + "rewards/margins": 4.291399002075195, + "rewards/rejected": -5.655506134033203, + "step": 8334 + }, + { + "epoch": 0.96, + "learning_rate": 1.1904483202622029e-08, + "logits/chosen": -2.510594367980957, + "logits/rejected": -2.5940353870391846, + "logps/chosen": -363.4239501953125, + "logps/rejected": -275.9864501953125, + "loss": 0.5416, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5530037879943848, + "rewards/margins": 0.8638337850570679, + "rewards/rejected": -2.416837692260742, + "step": 8335 + }, + { + "epoch": 0.96, + "learning_rate": 1.1869366732997775e-08, + "logits/chosen": -2.0395989418029785, + "logits/rejected": -1.6583694219589233, + "logps/chosen": -93.79373931884766, + "logps/rejected": -196.49053955078125, + "loss": 0.3318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18197748064994812, + "rewards/margins": 2.343773603439331, + "rewards/rejected": -2.5257506370544434, + "step": 8336 + }, + { + "epoch": 0.96, + "learning_rate": 1.1834250263373522e-08, + "logits/chosen": -2.744706153869629, + "logits/rejected": -2.73565411567688, + "logps/chosen": -277.739013671875, + "logps/rejected": -219.75411987304688, + "loss": 0.8703, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4350608587265015, + "rewards/margins": 0.9087343215942383, + "rewards/rejected": -2.3437952995300293, + "step": 8337 + }, + { + "epoch": 0.96, + "learning_rate": 1.1799133793749267e-08, + "logits/chosen": -2.650865077972412, + "logits/rejected": -2.704742431640625, + "logps/chosen": -187.36856079101562, + "logps/rejected": -362.14019775390625, + "loss": 0.6334, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5672621726989746, + "rewards/margins": 1.4485169649124146, + "rewards/rejected": -3.0157790184020996, + "step": 8338 + }, + { + "epoch": 0.96, + "learning_rate": 1.1764017324125015e-08, + "logits/chosen": -2.1804003715515137, + "logits/rejected": -2.5430922508239746, + "logps/chosen": -311.6703186035156, + "logps/rejected": -227.91119384765625, + "loss": 0.185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6618409156799316, + "rewards/margins": 1.875669002532959, + "rewards/rejected": -2.5375099182128906, + "step": 8339 + }, + { + "epoch": 0.96, + "learning_rate": 1.172890085450076e-08, + "logits/chosen": -2.3114020824432373, + "logits/rejected": -2.440953016281128, + "logps/chosen": -295.8179626464844, + "logps/rejected": -225.33917236328125, + "loss": 0.241, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8443284630775452, + "rewards/margins": 2.608386516571045, + "rewards/rejected": -3.4527149200439453, + "step": 8340 + }, + { + "epoch": 0.96, + "learning_rate": 1.1693784384876507e-08, + "logits/chosen": -2.8399760723114014, + "logits/rejected": -2.83090877532959, + "logps/chosen": -347.62249755859375, + "logps/rejected": -203.3816680908203, + "loss": 0.2144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0266105979681015, + "rewards/margins": 2.6882643699645996, + "rewards/rejected": -2.7148752212524414, + "step": 8341 + }, + { + "epoch": 0.96, + "learning_rate": 1.1658667915252253e-08, + "logits/chosen": -2.800057888031006, + "logits/rejected": -2.837852954864502, + "logps/chosen": -90.20486450195312, + "logps/rejected": -216.0350341796875, + "loss": 0.2234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6288833618164062, + "rewards/margins": 2.125999689102173, + "rewards/rejected": -2.754883050918579, + "step": 8342 + }, + { + "epoch": 0.96, + "learning_rate": 1.1623551445627999e-08, + "logits/chosen": -2.6464991569519043, + "logits/rejected": -2.7831966876983643, + "logps/chosen": -382.4117431640625, + "logps/rejected": -267.1006774902344, + "loss": 0.168, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2462035417556763, + "rewards/margins": 2.375673532485962, + "rewards/rejected": -3.6218767166137695, + "step": 8343 + }, + { + "epoch": 0.96, + "learning_rate": 1.1588434976003746e-08, + "logits/chosen": -2.0379257202148438, + "logits/rejected": -1.929629921913147, + "logps/chosen": -277.3254699707031, + "logps/rejected": -366.7955017089844, + "loss": 0.5455, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6427661180496216, + "rewards/margins": 1.944831371307373, + "rewards/rejected": -2.587597370147705, + "step": 8344 + }, + { + "epoch": 0.96, + "learning_rate": 1.1553318506379491e-08, + "logits/chosen": -2.6318931579589844, + "logits/rejected": -2.662583827972412, + "logps/chosen": -433.4043884277344, + "logps/rejected": -360.694091796875, + "loss": 0.245, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8979843258857727, + "rewards/margins": 3.1341025829315186, + "rewards/rejected": -4.0320868492126465, + "step": 8345 + }, + { + "epoch": 0.96, + "learning_rate": 1.1518202036755239e-08, + "logits/chosen": -1.9129647016525269, + "logits/rejected": -2.1180474758148193, + "logps/chosen": -261.7285461425781, + "logps/rejected": -205.52598571777344, + "loss": 0.5119, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.307497262954712, + "rewards/margins": 0.769997775554657, + "rewards/rejected": -2.0774948596954346, + "step": 8346 + }, + { + "epoch": 0.96, + "learning_rate": 1.1483085567130984e-08, + "logits/chosen": -2.5308220386505127, + "logits/rejected": -2.749809980392456, + "logps/chosen": -285.37762451171875, + "logps/rejected": -209.07908630371094, + "loss": 0.2636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6827643513679504, + "rewards/margins": 2.4410688877105713, + "rewards/rejected": -3.123833179473877, + "step": 8347 + }, + { + "epoch": 0.96, + "learning_rate": 1.1447969097506731e-08, + "logits/chosen": -1.7427754402160645, + "logits/rejected": -2.1333024501800537, + "logps/chosen": -496.4306335449219, + "logps/rejected": -384.4765625, + "loss": 0.6181, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0662267208099365, + "rewards/margins": 1.6618634462356567, + "rewards/rejected": -3.728090286254883, + "step": 8348 + }, + { + "epoch": 0.96, + "learning_rate": 1.1412852627882477e-08, + "logits/chosen": -2.139800786972046, + "logits/rejected": -2.3158645629882812, + "logps/chosen": -426.8211669921875, + "logps/rejected": -310.1700439453125, + "loss": 0.1968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5164937376976013, + "rewards/margins": 2.9736170768737793, + "rewards/rejected": -3.4901108741760254, + "step": 8349 + }, + { + "epoch": 0.96, + "learning_rate": 1.1377736158258224e-08, + "logits/chosen": -2.008643627166748, + "logits/rejected": -2.1921370029449463, + "logps/chosen": -517.6875, + "logps/rejected": -387.7219543457031, + "loss": 0.3592, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9780626893043518, + "rewards/margins": 1.2510390281677246, + "rewards/rejected": -2.2291016578674316, + "step": 8350 + }, + { + "epoch": 0.96, + "learning_rate": 1.134261968863397e-08, + "logits/chosen": -2.5303196907043457, + "logits/rejected": -2.4674668312072754, + "logps/chosen": -191.76254272460938, + "logps/rejected": -262.684814453125, + "loss": 0.181, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.183746799826622, + "rewards/margins": 3.1275417804718018, + "rewards/rejected": -3.311288356781006, + "step": 8351 + }, + { + "epoch": 0.96, + "learning_rate": 1.1307503219009714e-08, + "logits/chosen": -1.5036811828613281, + "logits/rejected": -1.650536298751831, + "logps/chosen": -615.1025390625, + "logps/rejected": -595.3621826171875, + "loss": 0.2583, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15967635810375214, + "rewards/margins": 1.787734031677246, + "rewards/rejected": -1.9474103450775146, + "step": 8352 + }, + { + "epoch": 0.96, + "learning_rate": 1.1272386749385461e-08, + "logits/chosen": -2.129429340362549, + "logits/rejected": -2.0648295879364014, + "logps/chosen": -223.14907836914062, + "logps/rejected": -287.7755432128906, + "loss": 0.3631, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3212404251098633, + "rewards/margins": 3.13417911529541, + "rewards/rejected": -4.455419540405273, + "step": 8353 + }, + { + "epoch": 0.96, + "learning_rate": 1.1237270279761207e-08, + "logits/chosen": -2.349257707595825, + "logits/rejected": -2.0856149196624756, + "logps/chosen": -326.10546875, + "logps/rejected": -282.08428955078125, + "loss": 0.1942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8466626405715942, + "rewards/margins": 3.523059844970703, + "rewards/rejected": -4.369722366333008, + "step": 8354 + }, + { + "epoch": 0.96, + "learning_rate": 1.1202153810136952e-08, + "logits/chosen": -2.606567621231079, + "logits/rejected": -2.5063748359680176, + "logps/chosen": -305.0150451660156, + "logps/rejected": -362.9066162109375, + "loss": 0.2884, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19293349981307983, + "rewards/margins": 2.0173916816711426, + "rewards/rejected": -2.210325241088867, + "step": 8355 + }, + { + "epoch": 0.96, + "learning_rate": 1.11670373405127e-08, + "logits/chosen": -2.3936777114868164, + "logits/rejected": -2.2557549476623535, + "logps/chosen": -261.9889221191406, + "logps/rejected": -324.365478515625, + "loss": 0.8145, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4995136260986328, + "rewards/margins": 0.410325288772583, + "rewards/rejected": -1.909839153289795, + "step": 8356 + }, + { + "epoch": 0.96, + "learning_rate": 1.1131920870888445e-08, + "logits/chosen": -2.5067572593688965, + "logits/rejected": -2.739018440246582, + "logps/chosen": -208.34677124023438, + "logps/rejected": -212.5944366455078, + "loss": 0.1982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7004114985466003, + "rewards/margins": 2.7531867027282715, + "rewards/rejected": -3.4535982608795166, + "step": 8357 + }, + { + "epoch": 0.96, + "learning_rate": 1.1096804401264192e-08, + "logits/chosen": -2.702263832092285, + "logits/rejected": -2.740178346633911, + "logps/chosen": -160.24822998046875, + "logps/rejected": -175.5379180908203, + "loss": 0.3439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5023748874664307, + "rewards/margins": 2.0374419689178467, + "rewards/rejected": -2.5398166179656982, + "step": 8358 + }, + { + "epoch": 0.96, + "learning_rate": 1.1061687931639938e-08, + "logits/chosen": -2.314253568649292, + "logits/rejected": -2.3762874603271484, + "logps/chosen": -331.922119140625, + "logps/rejected": -369.65045166015625, + "loss": 0.3011, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8392266631126404, + "rewards/margins": 2.367802381515503, + "rewards/rejected": -3.207029104232788, + "step": 8359 + }, + { + "epoch": 0.96, + "learning_rate": 1.1026571462015685e-08, + "logits/chosen": -2.1273655891418457, + "logits/rejected": -2.3007333278656006, + "logps/chosen": -222.2478790283203, + "logps/rejected": -214.18377685546875, + "loss": 0.327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8562527894973755, + "rewards/margins": 1.806591272354126, + "rewards/rejected": -2.662843942642212, + "step": 8360 + }, + { + "epoch": 0.96, + "learning_rate": 1.099145499239143e-08, + "logits/chosen": -1.9120498895645142, + "logits/rejected": -2.1091065406799316, + "logps/chosen": -486.4223327636719, + "logps/rejected": -511.06787109375, + "loss": 0.4268, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.295891523361206, + "rewards/margins": 0.9330209493637085, + "rewards/rejected": -2.228912591934204, + "step": 8361 + }, + { + "epoch": 0.96, + "learning_rate": 1.0956338522767178e-08, + "logits/chosen": -2.9467382431030273, + "logits/rejected": -2.9587178230285645, + "logps/chosen": -203.23277282714844, + "logps/rejected": -207.21951293945312, + "loss": 0.5154, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6276135444641113, + "rewards/margins": 0.9154582023620605, + "rewards/rejected": -2.543071746826172, + "step": 8362 + }, + { + "epoch": 0.96, + "learning_rate": 1.0921222053142923e-08, + "logits/chosen": -2.495922803878784, + "logits/rejected": -2.298585891723633, + "logps/chosen": -116.84510803222656, + "logps/rejected": -155.1190185546875, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.667948842048645, + "rewards/margins": 1.3089797496795654, + "rewards/rejected": -1.9769287109375, + "step": 8363 + }, + { + "epoch": 0.96, + "learning_rate": 1.0886105583518669e-08, + "logits/chosen": -2.4220097064971924, + "logits/rejected": -2.3403759002685547, + "logps/chosen": -182.02761840820312, + "logps/rejected": -341.67779541015625, + "loss": 0.1731, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8480395078659058, + "rewards/margins": 3.0251736640930176, + "rewards/rejected": -3.873213052749634, + "step": 8364 + }, + { + "epoch": 0.96, + "learning_rate": 1.0850989113894416e-08, + "logits/chosen": -2.388019561767578, + "logits/rejected": -2.4484024047851562, + "logps/chosen": -287.01947021484375, + "logps/rejected": -198.30892944335938, + "loss": 0.6342, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8785523176193237, + "rewards/margins": 1.2589061260223389, + "rewards/rejected": -2.137458324432373, + "step": 8365 + }, + { + "epoch": 0.96, + "learning_rate": 1.0815872644270162e-08, + "logits/chosen": -2.0814003944396973, + "logits/rejected": -2.551509380340576, + "logps/chosen": -396.2338562011719, + "logps/rejected": -310.4056396484375, + "loss": 0.3369, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1779236793518066, + "rewards/margins": 1.3112752437591553, + "rewards/rejected": -2.489198923110962, + "step": 8366 + }, + { + "epoch": 0.96, + "learning_rate": 1.0780756174645909e-08, + "logits/chosen": -2.182602882385254, + "logits/rejected": -2.5371525287628174, + "logps/chosen": -283.15557861328125, + "logps/rejected": -296.3240661621094, + "loss": 0.145, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17658302187919617, + "rewards/margins": 3.1400084495544434, + "rewards/rejected": -2.963425636291504, + "step": 8367 + }, + { + "epoch": 0.96, + "learning_rate": 1.0745639705021655e-08, + "logits/chosen": -1.6723763942718506, + "logits/rejected": -2.0123720169067383, + "logps/chosen": -527.8401489257812, + "logps/rejected": -339.003662109375, + "loss": 0.2695, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21728341281414032, + "rewards/margins": 1.7566845417022705, + "rewards/rejected": -1.9739679098129272, + "step": 8368 + }, + { + "epoch": 0.96, + "learning_rate": 1.0710523235397402e-08, + "logits/chosen": -2.4372353553771973, + "logits/rejected": -2.2919459342956543, + "logps/chosen": -359.54241943359375, + "logps/rejected": -314.7723388671875, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2879011631011963, + "rewards/margins": 2.1402170658111572, + "rewards/rejected": -2.4281182289123535, + "step": 8369 + }, + { + "epoch": 0.96, + "learning_rate": 1.0675406765773148e-08, + "logits/chosen": -1.6858837604522705, + "logits/rejected": -2.123563051223755, + "logps/chosen": -199.5888214111328, + "logps/rejected": -180.88050842285156, + "loss": 0.9634, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3971197605133057, + "rewards/margins": 0.1946083903312683, + "rewards/rejected": -2.5917282104492188, + "step": 8370 + }, + { + "epoch": 0.97, + "learning_rate": 1.0640290296148895e-08, + "logits/chosen": -2.4023284912109375, + "logits/rejected": -2.480525255203247, + "logps/chosen": -260.797119140625, + "logps/rejected": -175.55218505859375, + "loss": 0.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6283969879150391, + "rewards/margins": 0.6727646589279175, + "rewards/rejected": -1.3011616468429565, + "step": 8371 + }, + { + "epoch": 0.97, + "learning_rate": 1.060517382652464e-08, + "logits/chosen": -2.7160072326660156, + "logits/rejected": -2.6111092567443848, + "logps/chosen": -416.96954345703125, + "logps/rejected": -380.8495178222656, + "loss": 0.8074, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.103729486465454, + "rewards/margins": 2.214080333709717, + "rewards/rejected": -4.31781005859375, + "step": 8372 + }, + { + "epoch": 0.97, + "learning_rate": 1.0570057356900386e-08, + "logits/chosen": -2.4761133193969727, + "logits/rejected": -2.5608744621276855, + "logps/chosen": -310.0489807128906, + "logps/rejected": -343.32574462890625, + "loss": 0.7043, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6957098245620728, + "rewards/margins": 2.204440116882324, + "rewards/rejected": -2.9001498222351074, + "step": 8373 + }, + { + "epoch": 0.97, + "learning_rate": 1.0534940887276133e-08, + "logits/chosen": -2.8245413303375244, + "logits/rejected": -3.0089709758758545, + "logps/chosen": -215.49038696289062, + "logps/rejected": -283.6546936035156, + "loss": 0.1386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6938375234603882, + "rewards/margins": 3.643580198287964, + "rewards/rejected": -4.337418079376221, + "step": 8374 + }, + { + "epoch": 0.97, + "learning_rate": 1.0499824417651879e-08, + "logits/chosen": -2.2214760780334473, + "logits/rejected": -2.4063704013824463, + "logps/chosen": -205.7017059326172, + "logps/rejected": -181.4359130859375, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36798667907714844, + "rewards/margins": 1.850818157196045, + "rewards/rejected": -2.2188045978546143, + "step": 8375 + }, + { + "epoch": 0.97, + "learning_rate": 1.0464707948027623e-08, + "logits/chosen": -2.189925193786621, + "logits/rejected": -2.05820631980896, + "logps/chosen": -170.58558654785156, + "logps/rejected": -275.1644287109375, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16998320817947388, + "rewards/margins": 3.7354393005371094, + "rewards/rejected": -3.9054229259490967, + "step": 8376 + }, + { + "epoch": 0.97, + "learning_rate": 1.042959147840337e-08, + "logits/chosen": -2.870516777038574, + "logits/rejected": -2.78706693649292, + "logps/chosen": -182.8394775390625, + "logps/rejected": -203.48080444335938, + "loss": 0.337, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.926659107208252, + "rewards/margins": 2.695619821548462, + "rewards/rejected": -3.622279167175293, + "step": 8377 + }, + { + "epoch": 0.97, + "learning_rate": 1.0394475008779116e-08, + "logits/chosen": -1.9190280437469482, + "logits/rejected": -2.1181323528289795, + "logps/chosen": -331.53533935546875, + "logps/rejected": -351.9927978515625, + "loss": 0.2822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27062880992889404, + "rewards/margins": 1.9045546054840088, + "rewards/rejected": -2.1751835346221924, + "step": 8378 + }, + { + "epoch": 0.97, + "learning_rate": 1.0359358539154863e-08, + "logits/chosen": -2.4606943130493164, + "logits/rejected": -2.5108375549316406, + "logps/chosen": -171.96475219726562, + "logps/rejected": -271.4129638671875, + "loss": 0.2928, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.20001220703125, + "rewards/margins": 2.9910120964050293, + "rewards/rejected": -5.191024303436279, + "step": 8379 + }, + { + "epoch": 0.97, + "learning_rate": 1.0324242069530608e-08, + "logits/chosen": -2.567128896713257, + "logits/rejected": -2.2686984539031982, + "logps/chosen": -359.4850158691406, + "logps/rejected": -412.60601806640625, + "loss": 0.1675, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2938648760318756, + "rewards/margins": 2.7479419708251953, + "rewards/rejected": -3.041806936264038, + "step": 8380 + }, + { + "epoch": 0.97, + "learning_rate": 1.0289125599906356e-08, + "logits/chosen": -2.3116774559020996, + "logits/rejected": -2.5535573959350586, + "logps/chosen": -316.4712219238281, + "logps/rejected": -236.83799743652344, + "loss": 0.3917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7769836187362671, + "rewards/margins": 1.8503042459487915, + "rewards/rejected": -2.6272876262664795, + "step": 8381 + }, + { + "epoch": 0.97, + "learning_rate": 1.0254009130282101e-08, + "logits/chosen": -2.3520944118499756, + "logits/rejected": -2.3014025688171387, + "logps/chosen": -178.96823120117188, + "logps/rejected": -177.26138305664062, + "loss": 0.6147, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5672270059585571, + "rewards/margins": 0.6370965242385864, + "rewards/rejected": -1.2043235301971436, + "step": 8382 + }, + { + "epoch": 0.97, + "learning_rate": 1.0218892660657848e-08, + "logits/chosen": -1.9172190427780151, + "logits/rejected": -1.8786840438842773, + "logps/chosen": -169.0308380126953, + "logps/rejected": -204.85377502441406, + "loss": 0.4554, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2656364440917969, + "rewards/margins": 1.5155607461929321, + "rewards/rejected": -2.7811970710754395, + "step": 8383 + }, + { + "epoch": 0.97, + "learning_rate": 1.0183776191033594e-08, + "logits/chosen": -2.4761786460876465, + "logits/rejected": -2.497262954711914, + "logps/chosen": -222.07118225097656, + "logps/rejected": -186.08819580078125, + "loss": 0.4851, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7391010522842407, + "rewards/margins": 1.1740089654922485, + "rewards/rejected": -1.9131100177764893, + "step": 8384 + }, + { + "epoch": 0.97, + "learning_rate": 1.014865972140934e-08, + "logits/chosen": -2.993076801300049, + "logits/rejected": -2.908433198928833, + "logps/chosen": -208.86965942382812, + "logps/rejected": -143.27818298339844, + "loss": 0.209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8966814875602722, + "rewards/margins": 1.6802152395248413, + "rewards/rejected": -2.576896905899048, + "step": 8385 + }, + { + "epoch": 0.97, + "learning_rate": 1.0113543251785087e-08, + "logits/chosen": -1.8301589488983154, + "logits/rejected": -2.1155362129211426, + "logps/chosen": -408.30120849609375, + "logps/rejected": -291.9586486816406, + "loss": 0.2549, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21162846684455872, + "rewards/margins": 2.105375051498413, + "rewards/rejected": -2.3170034885406494, + "step": 8386 + }, + { + "epoch": 0.97, + "learning_rate": 1.0078426782160832e-08, + "logits/chosen": -2.553605079650879, + "logits/rejected": -2.7967727184295654, + "logps/chosen": -265.89263916015625, + "logps/rejected": -245.9368896484375, + "loss": 0.5222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6838734149932861, + "rewards/margins": 1.3829152584075928, + "rewards/rejected": -2.066788911819458, + "step": 8387 + }, + { + "epoch": 0.97, + "learning_rate": 1.004331031253658e-08, + "logits/chosen": -2.32712459564209, + "logits/rejected": -2.3606061935424805, + "logps/chosen": -199.00662231445312, + "logps/rejected": -223.4898681640625, + "loss": 0.3669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1587296724319458, + "rewards/margins": 2.156916379928589, + "rewards/rejected": -2.315646171569824, + "step": 8388 + }, + { + "epoch": 0.97, + "learning_rate": 1.0008193842912325e-08, + "logits/chosen": -2.3676209449768066, + "logits/rejected": -2.5795581340789795, + "logps/chosen": -221.90245056152344, + "logps/rejected": -320.09942626953125, + "loss": 0.2581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17607629299163818, + "rewards/margins": 3.6530799865722656, + "rewards/rejected": -3.8291563987731934, + "step": 8389 + }, + { + "epoch": 0.97, + "learning_rate": 9.973077373288072e-09, + "logits/chosen": -2.948756456375122, + "logits/rejected": -2.9246022701263428, + "logps/chosen": -229.9978485107422, + "logps/rejected": -216.25189208984375, + "loss": 0.1896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9101969599723816, + "rewards/margins": 3.4040255546569824, + "rewards/rejected": -4.31422233581543, + "step": 8390 + }, + { + "epoch": 0.97, + "learning_rate": 9.937960903663818e-09, + "logits/chosen": -1.9687762260437012, + "logits/rejected": -2.217067241668701, + "logps/chosen": -412.5858459472656, + "logps/rejected": -278.7555236816406, + "loss": 0.3331, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6938618421554565, + "rewards/margins": 3.107849359512329, + "rewards/rejected": -3.8017115592956543, + "step": 8391 + }, + { + "epoch": 0.97, + "learning_rate": 9.902844434039565e-09, + "logits/chosen": -2.413455009460449, + "logits/rejected": -2.712930202484131, + "logps/chosen": -344.479248046875, + "logps/rejected": -138.1405487060547, + "loss": 0.6373, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0567580461502075, + "rewards/margins": 0.7029148936271667, + "rewards/rejected": -1.759672999382019, + "step": 8392 + }, + { + "epoch": 0.97, + "learning_rate": 9.867727964415311e-09, + "logits/chosen": -2.4147820472717285, + "logits/rejected": -2.5075461864471436, + "logps/chosen": -228.2041473388672, + "logps/rejected": -250.17330932617188, + "loss": 0.2644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5766911506652832, + "rewards/margins": 2.0213782787323, + "rewards/rejected": -2.598069429397583, + "step": 8393 + }, + { + "epoch": 0.97, + "learning_rate": 9.832611494791056e-09, + "logits/chosen": -2.4402339458465576, + "logits/rejected": -2.1104846000671387, + "logps/chosen": -129.6728515625, + "logps/rejected": -367.72003173828125, + "loss": 0.1861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6151391267776489, + "rewards/margins": 3.3012380599975586, + "rewards/rejected": -3.916377067565918, + "step": 8394 + }, + { + "epoch": 0.97, + "learning_rate": 9.797495025166804e-09, + "logits/chosen": -2.630643129348755, + "logits/rejected": -2.747492790222168, + "logps/chosen": -318.7740173339844, + "logps/rejected": -458.446044921875, + "loss": 0.3499, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0934139490127563, + "rewards/margins": 1.9393370151519775, + "rewards/rejected": -3.0327510833740234, + "step": 8395 + }, + { + "epoch": 0.97, + "learning_rate": 9.76237855554255e-09, + "logits/chosen": -2.0203745365142822, + "logits/rejected": -2.2958216667175293, + "logps/chosen": -267.0394592285156, + "logps/rejected": -206.6710205078125, + "loss": 0.3368, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4207903742790222, + "rewards/margins": 1.819718360900879, + "rewards/rejected": -2.240508556365967, + "step": 8396 + }, + { + "epoch": 0.97, + "learning_rate": 9.727262085918296e-09, + "logits/chosen": -2.4318740367889404, + "logits/rejected": -2.4358489513397217, + "logps/chosen": -175.37762451171875, + "logps/rejected": -365.2696838378906, + "loss": 0.1148, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9505590200424194, + "rewards/margins": 4.034226417541504, + "rewards/rejected": -4.984785556793213, + "step": 8397 + }, + { + "epoch": 0.97, + "learning_rate": 9.692145616294042e-09, + "logits/chosen": -2.0274646282196045, + "logits/rejected": -1.964007019996643, + "logps/chosen": -264.40301513671875, + "logps/rejected": -295.22711181640625, + "loss": 0.5561, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9495329856872559, + "rewards/margins": 1.4474470615386963, + "rewards/rejected": -2.396979808807373, + "step": 8398 + }, + { + "epoch": 0.97, + "learning_rate": 9.65702914666979e-09, + "logits/chosen": -1.9673595428466797, + "logits/rejected": -2.028980016708374, + "logps/chosen": -324.910888671875, + "logps/rejected": -379.6582946777344, + "loss": 0.4766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8268479108810425, + "rewards/margins": 1.4966554641723633, + "rewards/rejected": -2.323503255844116, + "step": 8399 + }, + { + "epoch": 0.97, + "learning_rate": 9.621912677045533e-09, + "logits/chosen": -2.2759084701538086, + "logits/rejected": -2.0854763984680176, + "logps/chosen": -216.49288940429688, + "logps/rejected": -298.20721435546875, + "loss": 0.3991, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9102282524108887, + "rewards/margins": 2.9069297313690186, + "rewards/rejected": -3.8171579837799072, + "step": 8400 + }, + { + "epoch": 0.97, + "learning_rate": 9.586796207421279e-09, + "logits/chosen": -2.4856977462768555, + "logits/rejected": -2.3366668224334717, + "logps/chosen": -627.30126953125, + "logps/rejected": -235.92022705078125, + "loss": 0.2106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9008491039276123, + "rewards/margins": 2.4839982986450195, + "rewards/rejected": -3.384847402572632, + "step": 8401 + }, + { + "epoch": 0.97, + "learning_rate": 9.551679737797026e-09, + "logits/chosen": -2.1556365489959717, + "logits/rejected": -2.4564969539642334, + "logps/chosen": -330.7919616699219, + "logps/rejected": -247.71498107910156, + "loss": 0.3122, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3052625060081482, + "rewards/margins": 1.7459367513656616, + "rewards/rejected": -2.051199197769165, + "step": 8402 + }, + { + "epoch": 0.97, + "learning_rate": 9.516563268172772e-09, + "logits/chosen": -2.299556016921997, + "logits/rejected": -2.6164138317108154, + "logps/chosen": -254.48617553710938, + "logps/rejected": -266.404052734375, + "loss": 0.1161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8405889272689819, + "rewards/margins": 3.4196653366088867, + "rewards/rejected": -4.26025390625, + "step": 8403 + }, + { + "epoch": 0.97, + "learning_rate": 9.481446798548519e-09, + "logits/chosen": -2.028167486190796, + "logits/rejected": -2.0373454093933105, + "logps/chosen": -138.8681640625, + "logps/rejected": -180.28134155273438, + "loss": 0.5544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48787590861320496, + "rewards/margins": 0.6757506132125854, + "rewards/rejected": -1.1636265516281128, + "step": 8404 + }, + { + "epoch": 0.97, + "learning_rate": 9.446330328924264e-09, + "logits/chosen": -2.51798677444458, + "logits/rejected": -2.563838005065918, + "logps/chosen": -259.8948974609375, + "logps/rejected": -201.4373779296875, + "loss": 1.0176, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7226051092147827, + "rewards/margins": 1.8653123378753662, + "rewards/rejected": -3.5879173278808594, + "step": 8405 + }, + { + "epoch": 0.97, + "learning_rate": 9.41121385930001e-09, + "logits/chosen": -2.3128793239593506, + "logits/rejected": -2.285437822341919, + "logps/chosen": -348.2469787597656, + "logps/rejected": -329.6056213378906, + "loss": 0.232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5434321761131287, + "rewards/margins": 3.336115837097168, + "rewards/rejected": -3.8795483112335205, + "step": 8406 + }, + { + "epoch": 0.97, + "learning_rate": 9.376097389675757e-09, + "logits/chosen": -2.339718818664551, + "logits/rejected": -2.5531773567199707, + "logps/chosen": -182.07159423828125, + "logps/rejected": -176.2247314453125, + "loss": 0.5255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.706298291683197, + "rewards/margins": 1.5179007053375244, + "rewards/rejected": -2.224199056625366, + "step": 8407 + }, + { + "epoch": 0.97, + "learning_rate": 9.340980920051503e-09, + "logits/chosen": -2.2047243118286133, + "logits/rejected": -2.4734063148498535, + "logps/chosen": -373.60125732421875, + "logps/rejected": -340.7923889160156, + "loss": 0.6506, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0271782875061035, + "rewards/margins": 1.4973087310791016, + "rewards/rejected": -2.524487018585205, + "step": 8408 + }, + { + "epoch": 0.97, + "learning_rate": 9.30586445042725e-09, + "logits/chosen": -1.5984361171722412, + "logits/rejected": -1.8253536224365234, + "logps/chosen": -664.356689453125, + "logps/rejected": -407.9173583984375, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2449456453323364, + "rewards/margins": 1.3275699615478516, + "rewards/rejected": -2.5725157260894775, + "step": 8409 + }, + { + "epoch": 0.97, + "learning_rate": 9.270747980802996e-09, + "logits/chosen": -2.409397602081299, + "logits/rejected": -2.634779930114746, + "logps/chosen": -213.42935180664062, + "logps/rejected": -275.1302795410156, + "loss": 0.5803, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9373065829277039, + "rewards/margins": 1.116811990737915, + "rewards/rejected": -2.0541186332702637, + "step": 8410 + }, + { + "epoch": 0.97, + "learning_rate": 9.235631511178743e-09, + "logits/chosen": -2.589153528213501, + "logits/rejected": -2.6448936462402344, + "logps/chosen": -206.9823455810547, + "logps/rejected": -246.4625244140625, + "loss": 0.5031, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0592098236083984, + "rewards/margins": 2.0644991397857666, + "rewards/rejected": -3.123708963394165, + "step": 8411 + }, + { + "epoch": 0.97, + "learning_rate": 9.200515041554488e-09, + "logits/chosen": -2.246551036834717, + "logits/rejected": -2.416977643966675, + "logps/chosen": -157.06478881835938, + "logps/rejected": -255.98048400878906, + "loss": 0.1729, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2859539985656738, + "rewards/margins": 2.885085105895996, + "rewards/rejected": -4.171039581298828, + "step": 8412 + }, + { + "epoch": 0.97, + "learning_rate": 9.165398571930236e-09, + "logits/chosen": -2.1479861736297607, + "logits/rejected": -2.2481179237365723, + "logps/chosen": -229.3449249267578, + "logps/rejected": -253.6527099609375, + "loss": 0.1834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6046233177185059, + "rewards/margins": 5.002126693725586, + "rewards/rejected": -5.606749534606934, + "step": 8413 + }, + { + "epoch": 0.97, + "learning_rate": 9.130282102305981e-09, + "logits/chosen": -2.287985324859619, + "logits/rejected": -2.3611249923706055, + "logps/chosen": -402.48876953125, + "logps/rejected": -259.831787109375, + "loss": 0.3544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5902811288833618, + "rewards/margins": 1.9375516176223755, + "rewards/rejected": -2.5278327465057373, + "step": 8414 + }, + { + "epoch": 0.97, + "learning_rate": 9.095165632681727e-09, + "logits/chosen": -2.4897353649139404, + "logits/rejected": -2.6036157608032227, + "logps/chosen": -155.35748291015625, + "logps/rejected": -154.2424774169922, + "loss": 0.5213, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6645352840423584, + "rewards/margins": 1.402674674987793, + "rewards/rejected": -2.0672099590301514, + "step": 8415 + }, + { + "epoch": 0.97, + "learning_rate": 9.060049163057474e-09, + "logits/chosen": -1.909859299659729, + "logits/rejected": -2.067521572113037, + "logps/chosen": -436.4457092285156, + "logps/rejected": -215.24057006835938, + "loss": 0.234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044885069131851196, + "rewards/margins": 1.5276838541030884, + "rewards/rejected": -1.5725688934326172, + "step": 8416 + }, + { + "epoch": 0.97, + "learning_rate": 9.02493269343322e-09, + "logits/chosen": -1.978961706161499, + "logits/rejected": -1.8502230644226074, + "logps/chosen": -241.63818359375, + "logps/rejected": -264.1119384765625, + "loss": 0.282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8807603120803833, + "rewards/margins": 2.3985047340393066, + "rewards/rejected": -3.2792649269104004, + "step": 8417 + }, + { + "epoch": 0.97, + "learning_rate": 8.989816223808965e-09, + "logits/chosen": -3.0196008682250977, + "logits/rejected": -2.9454689025878906, + "logps/chosen": -373.75543212890625, + "logps/rejected": -203.13394165039062, + "loss": 0.1179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0270015001296997, + "rewards/margins": 4.20919132232666, + "rewards/rejected": -5.23619270324707, + "step": 8418 + }, + { + "epoch": 0.97, + "learning_rate": 8.954699754184713e-09, + "logits/chosen": -2.8854613304138184, + "logits/rejected": -2.934691905975342, + "logps/chosen": -219.82176208496094, + "logps/rejected": -282.32415771484375, + "loss": 0.3162, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36583125591278076, + "rewards/margins": 2.231091022491455, + "rewards/rejected": -2.5969223976135254, + "step": 8419 + }, + { + "epoch": 0.97, + "learning_rate": 8.919583284560458e-09, + "logits/chosen": -2.639030933380127, + "logits/rejected": -2.7142393589019775, + "logps/chosen": -203.0560302734375, + "logps/rejected": -268.38787841796875, + "loss": 0.2854, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4809105396270752, + "rewards/margins": 2.2100799083709717, + "rewards/rejected": -3.690990686416626, + "step": 8420 + }, + { + "epoch": 0.97, + "learning_rate": 8.884466814936204e-09, + "logits/chosen": -2.655822277069092, + "logits/rejected": -2.4218966960906982, + "logps/chosen": -206.80197143554688, + "logps/rejected": -325.5673828125, + "loss": 0.1672, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6886341571807861, + "rewards/margins": 6.644310474395752, + "rewards/rejected": -7.332944869995117, + "step": 8421 + }, + { + "epoch": 0.97, + "learning_rate": 8.849350345311951e-09, + "logits/chosen": -2.163048267364502, + "logits/rejected": -2.18239688873291, + "logps/chosen": -221.98056030273438, + "logps/rejected": -257.9514465332031, + "loss": 0.8945, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3112821578979492, + "rewards/margins": 1.2023851871490479, + "rewards/rejected": -2.513667345046997, + "step": 8422 + }, + { + "epoch": 0.97, + "learning_rate": 8.814233875687697e-09, + "logits/chosen": -2.2217307090759277, + "logits/rejected": -2.412492513656616, + "logps/chosen": -380.6215515136719, + "logps/rejected": -372.36810302734375, + "loss": 0.3261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9262643456459045, + "rewards/margins": 1.637362003326416, + "rewards/rejected": -2.563626289367676, + "step": 8423 + }, + { + "epoch": 0.97, + "learning_rate": 8.779117406063444e-09, + "logits/chosen": -2.445065975189209, + "logits/rejected": -2.487485408782959, + "logps/chosen": -253.18890380859375, + "logps/rejected": -243.03173828125, + "loss": 0.2458, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20670637488365173, + "rewards/margins": 2.946176290512085, + "rewards/rejected": -3.1528828144073486, + "step": 8424 + }, + { + "epoch": 0.97, + "learning_rate": 8.74400093643919e-09, + "logits/chosen": -2.676379680633545, + "logits/rejected": -2.437950372695923, + "logps/chosen": -142.74575805664062, + "logps/rejected": -296.29443359375, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28637588024139404, + "rewards/margins": 4.49939489364624, + "rewards/rejected": -4.785770893096924, + "step": 8425 + }, + { + "epoch": 0.97, + "learning_rate": 8.708884466814937e-09, + "logits/chosen": -1.8102874755859375, + "logits/rejected": -1.8177192211151123, + "logps/chosen": -255.17330932617188, + "logps/rejected": -274.2316589355469, + "loss": 0.6289, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0164135694503784, + "rewards/margins": 0.9111753702163696, + "rewards/rejected": -1.9275890588760376, + "step": 8426 + }, + { + "epoch": 0.97, + "learning_rate": 8.673767997190682e-09, + "logits/chosen": -2.4315032958984375, + "logits/rejected": -2.2815349102020264, + "logps/chosen": -183.38619995117188, + "logps/rejected": -284.7322692871094, + "loss": 0.1927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8901017308235168, + "rewards/margins": 3.3233911991119385, + "rewards/rejected": -4.213493347167969, + "step": 8427 + }, + { + "epoch": 0.97, + "learning_rate": 8.63865152756643e-09, + "logits/chosen": -2.6783084869384766, + "logits/rejected": -2.859133243560791, + "logps/chosen": -370.22039794921875, + "logps/rejected": -194.05516052246094, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2504768371582031, + "rewards/margins": 1.9177899360656738, + "rewards/rejected": -3.168267011642456, + "step": 8428 + }, + { + "epoch": 0.97, + "learning_rate": 8.603535057942175e-09, + "logits/chosen": -2.206371784210205, + "logits/rejected": -2.1961545944213867, + "logps/chosen": -321.0281982421875, + "logps/rejected": -361.3043212890625, + "loss": 0.3832, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3232915699481964, + "rewards/margins": 1.9583040475845337, + "rewards/rejected": -2.2815959453582764, + "step": 8429 + }, + { + "epoch": 0.97, + "learning_rate": 8.56841858831792e-09, + "logits/chosen": -2.118619203567505, + "logits/rejected": -2.3430416584014893, + "logps/chosen": -164.27108764648438, + "logps/rejected": -158.57290649414062, + "loss": 0.6987, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0910154581069946, + "rewards/margins": 0.7776462435722351, + "rewards/rejected": -1.868661880493164, + "step": 8430 + }, + { + "epoch": 0.97, + "learning_rate": 8.533302118693666e-09, + "logits/chosen": -2.2668521404266357, + "logits/rejected": -2.7218658924102783, + "logps/chosen": -298.9278564453125, + "logps/rejected": -172.4051055908203, + "loss": 0.2774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5913408994674683, + "rewards/margins": 2.0442304611206055, + "rewards/rejected": -2.635571241378784, + "step": 8431 + }, + { + "epoch": 0.97, + "learning_rate": 8.498185649069413e-09, + "logits/chosen": -2.696138858795166, + "logits/rejected": -2.371289014816284, + "logps/chosen": -148.36148071289062, + "logps/rejected": -295.6851806640625, + "loss": 0.2495, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7903006672859192, + "rewards/margins": 2.7723388671875, + "rewards/rejected": -3.5626394748687744, + "step": 8432 + }, + { + "epoch": 0.97, + "learning_rate": 8.463069179445159e-09, + "logits/chosen": -2.6563878059387207, + "logits/rejected": -2.562042236328125, + "logps/chosen": -214.13916015625, + "logps/rejected": -301.710693359375, + "loss": 0.16, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22401773929595947, + "rewards/margins": 3.359710216522217, + "rewards/rejected": -3.5837278366088867, + "step": 8433 + }, + { + "epoch": 0.97, + "learning_rate": 8.427952709820906e-09, + "logits/chosen": -2.559955358505249, + "logits/rejected": -2.3921244144439697, + "logps/chosen": -169.23150634765625, + "logps/rejected": -239.02145385742188, + "loss": 0.394, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3260271549224854, + "rewards/margins": 1.7006765604019165, + "rewards/rejected": -3.0267038345336914, + "step": 8434 + }, + { + "epoch": 0.97, + "learning_rate": 8.392836240196652e-09, + "logits/chosen": -2.4688239097595215, + "logits/rejected": -2.323859214782715, + "logps/chosen": -340.4674377441406, + "logps/rejected": -398.9156494140625, + "loss": 0.4213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5898779034614563, + "rewards/margins": 4.1423468589782715, + "rewards/rejected": -4.732224464416504, + "step": 8435 + }, + { + "epoch": 0.97, + "learning_rate": 8.357719770572397e-09, + "logits/chosen": -2.108008623123169, + "logits/rejected": -2.1916425228118896, + "logps/chosen": -180.27439880371094, + "logps/rejected": -247.47738647460938, + "loss": 0.2035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8181644678115845, + "rewards/margins": 2.2078418731689453, + "rewards/rejected": -3.0260062217712402, + "step": 8436 + }, + { + "epoch": 0.97, + "learning_rate": 8.322603300948145e-09, + "logits/chosen": -2.053110361099243, + "logits/rejected": -1.9350277185440063, + "logps/chosen": -333.744140625, + "logps/rejected": -257.29803466796875, + "loss": 0.2417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7816123962402344, + "rewards/margins": 3.2292702198028564, + "rewards/rejected": -4.010882377624512, + "step": 8437 + }, + { + "epoch": 0.97, + "learning_rate": 8.28748683132389e-09, + "logits/chosen": -2.612433433532715, + "logits/rejected": -2.0896010398864746, + "logps/chosen": -102.53045654296875, + "logps/rejected": -217.12452697753906, + "loss": 0.2855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09041046351194382, + "rewards/margins": 2.1633105278015137, + "rewards/rejected": -2.0729000568389893, + "step": 8438 + }, + { + "epoch": 0.97, + "learning_rate": 8.252370361699637e-09, + "logits/chosen": -2.4467837810516357, + "logits/rejected": -2.574544906616211, + "logps/chosen": -303.5771179199219, + "logps/rejected": -288.9832763671875, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41929447650909424, + "rewards/margins": 2.2972071170806885, + "rewards/rejected": -2.716501474380493, + "step": 8439 + }, + { + "epoch": 0.97, + "learning_rate": 8.217253892075383e-09, + "logits/chosen": -2.5233187675476074, + "logits/rejected": -2.3599650859832764, + "logps/chosen": -392.84991455078125, + "logps/rejected": -298.3934326171875, + "loss": 0.7364, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4866716861724854, + "rewards/margins": 1.0514823198318481, + "rewards/rejected": -2.538154125213623, + "step": 8440 + }, + { + "epoch": 0.97, + "learning_rate": 8.18213742245113e-09, + "logits/chosen": -2.0399293899536133, + "logits/rejected": -2.518890380859375, + "logps/chosen": -366.0811767578125, + "logps/rejected": -366.5260314941406, + "loss": 0.3213, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0305548906326294, + "rewards/margins": 2.899721145629883, + "rewards/rejected": -3.9302761554718018, + "step": 8441 + }, + { + "epoch": 0.97, + "learning_rate": 8.147020952826874e-09, + "logits/chosen": -2.5006184577941895, + "logits/rejected": -2.4994943141937256, + "logps/chosen": -159.21337890625, + "logps/rejected": -211.19351196289062, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5631324648857117, + "rewards/margins": 1.516002893447876, + "rewards/rejected": -2.0791354179382324, + "step": 8442 + }, + { + "epoch": 0.97, + "learning_rate": 8.111904483202621e-09, + "logits/chosen": -1.8470720052719116, + "logits/rejected": -2.1139330863952637, + "logps/chosen": -207.09913635253906, + "logps/rejected": -158.02163696289062, + "loss": 0.1949, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0920605659484863, + "rewards/margins": 2.1227970123291016, + "rewards/rejected": -3.214857578277588, + "step": 8443 + }, + { + "epoch": 0.97, + "learning_rate": 8.076788013578367e-09, + "logits/chosen": -1.8499335050582886, + "logits/rejected": -1.9008562564849854, + "logps/chosen": -294.2664794921875, + "logps/rejected": -315.2945556640625, + "loss": 0.1116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8120604753494263, + "rewards/margins": 3.4194204807281494, + "rewards/rejected": -4.231481075286865, + "step": 8444 + }, + { + "epoch": 0.97, + "learning_rate": 8.041671543954114e-09, + "logits/chosen": -2.773874521255493, + "logits/rejected": -2.85134220123291, + "logps/chosen": -322.83056640625, + "logps/rejected": -204.3278350830078, + "loss": 0.3247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8521467447280884, + "rewards/margins": 1.825416088104248, + "rewards/rejected": -2.677562713623047, + "step": 8445 + }, + { + "epoch": 0.97, + "learning_rate": 8.00655507432986e-09, + "logits/chosen": -2.0128047466278076, + "logits/rejected": -2.0048131942749023, + "logps/chosen": -225.31585693359375, + "logps/rejected": -223.79904174804688, + "loss": 0.2037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7173347473144531, + "rewards/margins": 2.6230626106262207, + "rewards/rejected": -3.340397357940674, + "step": 8446 + }, + { + "epoch": 0.97, + "learning_rate": 7.971438604705607e-09, + "logits/chosen": -2.151486873626709, + "logits/rejected": -2.251532554626465, + "logps/chosen": -422.81597900390625, + "logps/rejected": -410.079345703125, + "loss": 0.4775, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1074963808059692, + "rewards/margins": 3.2716031074523926, + "rewards/rejected": -4.379099369049072, + "step": 8447 + }, + { + "epoch": 0.97, + "learning_rate": 7.936322135081353e-09, + "logits/chosen": -2.7269506454467773, + "logits/rejected": -2.7312510013580322, + "logps/chosen": -122.10626220703125, + "logps/rejected": -154.63555908203125, + "loss": 0.6169, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5635226368904114, + "rewards/margins": 1.727094292640686, + "rewards/rejected": -2.290616989135742, + "step": 8448 + }, + { + "epoch": 0.97, + "learning_rate": 7.9012056654571e-09, + "logits/chosen": -2.327188491821289, + "logits/rejected": -2.1768832206726074, + "logps/chosen": -229.79515075683594, + "logps/rejected": -299.3763427734375, + "loss": 0.5187, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.941375494003296, + "rewards/margins": 1.9541521072387695, + "rewards/rejected": -3.8955276012420654, + "step": 8449 + }, + { + "epoch": 0.97, + "learning_rate": 7.866089195832845e-09, + "logits/chosen": -2.3564236164093018, + "logits/rejected": -2.475240707397461, + "logps/chosen": -215.36032104492188, + "logps/rejected": -374.1624755859375, + "loss": 0.2707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7595632672309875, + "rewards/margins": 3.4790289402008057, + "rewards/rejected": -4.238592147827148, + "step": 8450 + }, + { + "epoch": 0.97, + "learning_rate": 7.830972726208591e-09, + "logits/chosen": -2.5557243824005127, + "logits/rejected": -2.7531206607818604, + "logps/chosen": -350.0534973144531, + "logps/rejected": -270.44012451171875, + "loss": 0.2549, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9070001244544983, + "rewards/margins": 3.1894826889038086, + "rewards/rejected": -4.096482753753662, + "step": 8451 + }, + { + "epoch": 0.97, + "learning_rate": 7.795856256584338e-09, + "logits/chosen": -2.39145827293396, + "logits/rejected": -2.3992197513580322, + "logps/chosen": -311.3741455078125, + "logps/rejected": -308.7923278808594, + "loss": 0.2371, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0654470920562744, + "rewards/margins": 1.9509546756744385, + "rewards/rejected": -3.016401767730713, + "step": 8452 + }, + { + "epoch": 0.97, + "learning_rate": 7.760739786960084e-09, + "logits/chosen": -1.8805304765701294, + "logits/rejected": -2.1488897800445557, + "logps/chosen": -330.45843505859375, + "logps/rejected": -292.6246337890625, + "loss": 0.202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6373904347419739, + "rewards/margins": 2.306143283843994, + "rewards/rejected": -2.9435338973999023, + "step": 8453 + }, + { + "epoch": 0.97, + "learning_rate": 7.72562331733583e-09, + "logits/chosen": -2.2977209091186523, + "logits/rejected": -2.2298743724823, + "logps/chosen": -189.55801391601562, + "logps/rejected": -203.00070190429688, + "loss": 0.9245, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2205562591552734, + "rewards/margins": 1.218540906906128, + "rewards/rejected": -4.4390974044799805, + "step": 8454 + }, + { + "epoch": 0.97, + "learning_rate": 7.690506847711577e-09, + "logits/chosen": -2.3518152236938477, + "logits/rejected": -2.3523049354553223, + "logps/chosen": -335.3187561035156, + "logps/rejected": -294.3480529785156, + "loss": 0.51, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2504191398620605, + "rewards/margins": 0.6971675157546997, + "rewards/rejected": -2.9475865364074707, + "step": 8455 + }, + { + "epoch": 0.97, + "learning_rate": 7.655390378087322e-09, + "logits/chosen": -1.9327547550201416, + "logits/rejected": -2.2921504974365234, + "logps/chosen": -183.25210571289062, + "logps/rejected": -287.7257080078125, + "loss": 1.1323, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2553977966308594, + "rewards/margins": 1.2120604515075684, + "rewards/rejected": -3.4674582481384277, + "step": 8456 + }, + { + "epoch": 0.97, + "learning_rate": 7.620273908463068e-09, + "logits/chosen": -2.774350166320801, + "logits/rejected": -2.708225727081299, + "logps/chosen": -204.94412231445312, + "logps/rejected": -281.1272888183594, + "loss": 0.3392, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9145235419273376, + "rewards/margins": 2.293826103210449, + "rewards/rejected": -3.2083499431610107, + "step": 8457 + }, + { + "epoch": 0.98, + "learning_rate": 7.585157438838815e-09, + "logits/chosen": -2.35969614982605, + "logits/rejected": -2.3000917434692383, + "logps/chosen": -434.26531982421875, + "logps/rejected": -186.80047607421875, + "loss": 0.5042, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2804967164993286, + "rewards/margins": 2.2580714225769043, + "rewards/rejected": -3.5385677814483643, + "step": 8458 + }, + { + "epoch": 0.98, + "learning_rate": 7.55004096921456e-09, + "logits/chosen": -2.2147278785705566, + "logits/rejected": -2.1098904609680176, + "logps/chosen": -250.34591674804688, + "logps/rejected": -414.33941650390625, + "loss": 0.3068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8472565412521362, + "rewards/margins": 2.7441601753234863, + "rewards/rejected": -3.591416835784912, + "step": 8459 + }, + { + "epoch": 0.98, + "learning_rate": 7.514924499590308e-09, + "logits/chosen": -2.2401275634765625, + "logits/rejected": -2.3988518714904785, + "logps/chosen": -601.376953125, + "logps/rejected": -461.1140441894531, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0365780591964722, + "rewards/margins": 1.6847600936889648, + "rewards/rejected": -2.7213382720947266, + "step": 8460 + }, + { + "epoch": 0.98, + "learning_rate": 7.479808029966053e-09, + "logits/chosen": -2.53128719329834, + "logits/rejected": -2.517385482788086, + "logps/chosen": -277.9274597167969, + "logps/rejected": -187.48593139648438, + "loss": 0.4543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7334717512130737, + "rewards/margins": 2.384124279022217, + "rewards/rejected": -4.117595672607422, + "step": 8461 + }, + { + "epoch": 0.98, + "learning_rate": 7.4446915603418e-09, + "logits/chosen": -2.4843227863311768, + "logits/rejected": -2.370933771133423, + "logps/chosen": -296.039306640625, + "logps/rejected": -202.974365234375, + "loss": 0.4919, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0356693267822266, + "rewards/margins": 1.760483741760254, + "rewards/rejected": -2.7961530685424805, + "step": 8462 + }, + { + "epoch": 0.98, + "learning_rate": 7.409575090717546e-09, + "logits/chosen": -2.021712303161621, + "logits/rejected": -1.8931679725646973, + "logps/chosen": -196.9453582763672, + "logps/rejected": -270.7169494628906, + "loss": 0.3731, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4046558141708374, + "rewards/margins": 1.4026098251342773, + "rewards/rejected": -2.8072657585144043, + "step": 8463 + }, + { + "epoch": 0.98, + "learning_rate": 7.374458621093293e-09, + "logits/chosen": -2.3469386100769043, + "logits/rejected": -2.346116304397583, + "logps/chosen": -285.6854248046875, + "logps/rejected": -260.259521484375, + "loss": 0.8481, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.573390483856201, + "rewards/margins": 1.1955279111862183, + "rewards/rejected": -3.76891827583313, + "step": 8464 + }, + { + "epoch": 0.98, + "learning_rate": 7.339342151469039e-09, + "logits/chosen": -1.9180872440338135, + "logits/rejected": -1.8970829248428345, + "logps/chosen": -392.4430847167969, + "logps/rejected": -473.2355651855469, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.625613272190094, + "rewards/margins": 2.0374536514282227, + "rewards/rejected": -2.663067102432251, + "step": 8465 + }, + { + "epoch": 0.98, + "learning_rate": 7.304225681844785e-09, + "logits/chosen": -2.0925774574279785, + "logits/rejected": -2.4617092609405518, + "logps/chosen": -309.6151123046875, + "logps/rejected": -303.0201416015625, + "loss": 0.1372, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6062551736831665, + "rewards/margins": 3.427872657775879, + "rewards/rejected": -4.034128189086914, + "step": 8466 + }, + { + "epoch": 0.98, + "learning_rate": 7.269109212220531e-09, + "logits/chosen": -2.794356346130371, + "logits/rejected": -2.859220504760742, + "logps/chosen": -321.44879150390625, + "logps/rejected": -372.75115966796875, + "loss": 0.223, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3593827486038208, + "rewards/margins": 3.003735303878784, + "rewards/rejected": -3.3631181716918945, + "step": 8467 + }, + { + "epoch": 0.98, + "learning_rate": 7.233992742596277e-09, + "logits/chosen": -1.8485530614852905, + "logits/rejected": -2.374054431915283, + "logps/chosen": -396.03668212890625, + "logps/rejected": -234.1451873779297, + "loss": 0.4235, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8647726774215698, + "rewards/margins": 1.0368945598602295, + "rewards/rejected": -1.9016672372817993, + "step": 8468 + }, + { + "epoch": 0.98, + "learning_rate": 7.198876272972023e-09, + "logits/chosen": -1.888779640197754, + "logits/rejected": -1.9226495027542114, + "logps/chosen": -250.9748992919922, + "logps/rejected": -260.90899658203125, + "loss": 0.4635, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4386930465698242, + "rewards/margins": 2.029904842376709, + "rewards/rejected": -3.468597888946533, + "step": 8469 + }, + { + "epoch": 0.98, + "learning_rate": 7.1637598033477695e-09, + "logits/chosen": -1.7851676940917969, + "logits/rejected": -1.5483484268188477, + "logps/chosen": -187.0677490234375, + "logps/rejected": -331.21954345703125, + "loss": 0.5257, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.404583215713501, + "rewards/margins": 1.0482404232025146, + "rewards/rejected": -2.4528234004974365, + "step": 8470 + }, + { + "epoch": 0.98, + "learning_rate": 7.128643333723516e-09, + "logits/chosen": -2.340640068054199, + "logits/rejected": -2.118523120880127, + "logps/chosen": -377.1434020996094, + "logps/rejected": -405.0869140625, + "loss": 0.6342, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7218585014343262, + "rewards/margins": 0.2536790668964386, + "rewards/rejected": -0.9755375981330872, + "step": 8471 + }, + { + "epoch": 0.98, + "learning_rate": 7.093526864099262e-09, + "logits/chosen": -1.7049620151519775, + "logits/rejected": -1.8794035911560059, + "logps/chosen": -418.2523193359375, + "logps/rejected": -268.1214294433594, + "loss": 0.3905, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27639317512512207, + "rewards/margins": 1.5497424602508545, + "rewards/rejected": -1.8261356353759766, + "step": 8472 + }, + { + "epoch": 0.98, + "learning_rate": 7.058410394475009e-09, + "logits/chosen": -2.357656717300415, + "logits/rejected": -2.4500880241394043, + "logps/chosen": -242.92967224121094, + "logps/rejected": -212.33749389648438, + "loss": 0.5014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.623845100402832, + "rewards/margins": 1.15473210811615, + "rewards/rejected": -1.778577208518982, + "step": 8473 + }, + { + "epoch": 0.98, + "learning_rate": 7.023293924850755e-09, + "logits/chosen": -2.1115357875823975, + "logits/rejected": -2.1977157592773438, + "logps/chosen": -250.7509765625, + "logps/rejected": -317.7409362792969, + "loss": 0.4713, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2584967613220215, + "rewards/margins": 1.7262907028198242, + "rewards/rejected": -2.984787702560425, + "step": 8474 + }, + { + "epoch": 0.98, + "learning_rate": 6.9881774552265016e-09, + "logits/chosen": -1.9817159175872803, + "logits/rejected": -2.327359676361084, + "logps/chosen": -467.133056640625, + "logps/rejected": -440.22027587890625, + "loss": 0.8097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36333322525024414, + "rewards/margins": 1.559122920036316, + "rewards/rejected": -1.9224562644958496, + "step": 8475 + }, + { + "epoch": 0.98, + "learning_rate": 6.953060985602247e-09, + "logits/chosen": -2.5577423572540283, + "logits/rejected": -2.533324956893921, + "logps/chosen": -81.59384155273438, + "logps/rejected": -177.96197509765625, + "loss": 0.2627, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.007173709571361542, + "rewards/margins": 2.6655609607696533, + "rewards/rejected": -2.6583871841430664, + "step": 8476 + }, + { + "epoch": 0.98, + "learning_rate": 6.9179445159779936e-09, + "logits/chosen": -1.9206476211547852, + "logits/rejected": -2.3174118995666504, + "logps/chosen": -259.77093505859375, + "logps/rejected": -141.95111083984375, + "loss": 0.3673, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4696621894836426, + "rewards/margins": 1.037964105606079, + "rewards/rejected": -1.5076262950897217, + "step": 8477 + }, + { + "epoch": 0.98, + "learning_rate": 6.882828046353739e-09, + "logits/chosen": -2.5760674476623535, + "logits/rejected": -2.596973180770874, + "logps/chosen": -308.33514404296875, + "logps/rejected": -211.84490966796875, + "loss": 0.265, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1953165978193283, + "rewards/margins": 2.3088505268096924, + "rewards/rejected": -2.1135339736938477, + "step": 8478 + }, + { + "epoch": 0.98, + "learning_rate": 6.8477115767294856e-09, + "logits/chosen": -2.514979362487793, + "logits/rejected": -2.2379837036132812, + "logps/chosen": -176.68826293945312, + "logps/rejected": -185.00308227539062, + "loss": 0.4525, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4733874797821045, + "rewards/margins": 1.502231240272522, + "rewards/rejected": -2.975618839263916, + "step": 8479 + }, + { + "epoch": 0.98, + "learning_rate": 6.812595107105232e-09, + "logits/chosen": -2.422226905822754, + "logits/rejected": -2.5960888862609863, + "logps/chosen": -420.156005859375, + "logps/rejected": -354.58087158203125, + "loss": 0.3354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7392680048942566, + "rewards/margins": 4.886551380157471, + "rewards/rejected": -5.625819206237793, + "step": 8480 + }, + { + "epoch": 0.98, + "learning_rate": 6.777478637480978e-09, + "logits/chosen": -2.777900457382202, + "logits/rejected": -2.8080880641937256, + "logps/chosen": -295.37469482421875, + "logps/rejected": -284.02874755859375, + "loss": 0.7571, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.7650753259658813, + "rewards/margins": 0.5031784176826477, + "rewards/rejected": -2.2682535648345947, + "step": 8481 + }, + { + "epoch": 0.98, + "learning_rate": 6.742362167856725e-09, + "logits/chosen": -2.6968226432800293, + "logits/rejected": -2.5184292793273926, + "logps/chosen": -211.6857452392578, + "logps/rejected": -262.84674072265625, + "loss": 0.1532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6243649125099182, + "rewards/margins": 3.4682109355926514, + "rewards/rejected": -4.092576026916504, + "step": 8482 + }, + { + "epoch": 0.98, + "learning_rate": 6.70724569823247e-09, + "logits/chosen": -2.948683977127075, + "logits/rejected": -3.0015039443969727, + "logps/chosen": -177.06317138671875, + "logps/rejected": -356.3477783203125, + "loss": 0.1867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2477782964706421, + "rewards/margins": 3.039097785949707, + "rewards/rejected": -3.2868759632110596, + "step": 8483 + }, + { + "epoch": 0.98, + "learning_rate": 6.672129228608217e-09, + "logits/chosen": -1.7611205577850342, + "logits/rejected": -1.8529596328735352, + "logps/chosen": -340.00885009765625, + "logps/rejected": -232.8406219482422, + "loss": 0.3419, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8714094161987305, + "rewards/margins": 1.6812615394592285, + "rewards/rejected": -2.552670955657959, + "step": 8484 + }, + { + "epoch": 0.98, + "learning_rate": 6.637012758983963e-09, + "logits/chosen": -2.6390457153320312, + "logits/rejected": -2.72440767288208, + "logps/chosen": -148.60923767089844, + "logps/rejected": -254.42076110839844, + "loss": 0.3306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13457870483398438, + "rewards/margins": 1.592367172241211, + "rewards/rejected": -1.7269458770751953, + "step": 8485 + }, + { + "epoch": 0.98, + "learning_rate": 6.60189628935971e-09, + "logits/chosen": -2.6434357166290283, + "logits/rejected": -2.7125282287597656, + "logps/chosen": -210.27261352539062, + "logps/rejected": -220.42337036132812, + "loss": 0.2473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3298322558403015, + "rewards/margins": 2.685760498046875, + "rewards/rejected": -3.0155928134918213, + "step": 8486 + }, + { + "epoch": 0.98, + "learning_rate": 6.566779819735456e-09, + "logits/chosen": -2.274538040161133, + "logits/rejected": -2.41640305519104, + "logps/chosen": -443.8210754394531, + "logps/rejected": -276.3377685546875, + "loss": 0.199, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0636454820632935, + "rewards/margins": 2.703766345977783, + "rewards/rejected": -3.767411947250366, + "step": 8487 + }, + { + "epoch": 0.98, + "learning_rate": 6.5316633501112024e-09, + "logits/chosen": -2.9260764122009277, + "logits/rejected": -2.907597541809082, + "logps/chosen": -207.48597717285156, + "logps/rejected": -171.93212890625, + "loss": 0.6123, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.688122272491455, + "rewards/margins": 1.2313318252563477, + "rewards/rejected": -2.9194538593292236, + "step": 8488 + }, + { + "epoch": 0.98, + "learning_rate": 6.496546880486949e-09, + "logits/chosen": -2.52170467376709, + "logits/rejected": -2.298595428466797, + "logps/chosen": -402.81622314453125, + "logps/rejected": -443.9622802734375, + "loss": 0.1544, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.024604082107544, + "rewards/margins": 2.4087886810302734, + "rewards/rejected": -3.4333930015563965, + "step": 8489 + }, + { + "epoch": 0.98, + "learning_rate": 6.461430410862694e-09, + "logits/chosen": -2.706122398376465, + "logits/rejected": -2.6484553813934326, + "logps/chosen": -150.76556396484375, + "logps/rejected": -213.15994262695312, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.566582441329956, + "rewards/margins": 2.8353819847106934, + "rewards/rejected": -3.4019644260406494, + "step": 8490 + }, + { + "epoch": 0.98, + "learning_rate": 6.42631394123844e-09, + "logits/chosen": -2.3881940841674805, + "logits/rejected": -2.2961409091949463, + "logps/chosen": -448.29449462890625, + "logps/rejected": -549.4191284179688, + "loss": 0.1351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6636947989463806, + "rewards/margins": 3.9868624210357666, + "rewards/rejected": -4.650557041168213, + "step": 8491 + }, + { + "epoch": 0.98, + "learning_rate": 6.3911974716141864e-09, + "logits/chosen": -2.7703592777252197, + "logits/rejected": -2.629903554916382, + "logps/chosen": -203.15036010742188, + "logps/rejected": -272.80657958984375, + "loss": 0.3661, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.344296932220459, + "rewards/margins": 1.7558144330978394, + "rewards/rejected": -4.100111484527588, + "step": 8492 + }, + { + "epoch": 0.98, + "learning_rate": 6.356081001989933e-09, + "logits/chosen": -2.4355063438415527, + "logits/rejected": -2.3113856315612793, + "logps/chosen": -272.4638671875, + "logps/rejected": -304.50665283203125, + "loss": 0.3163, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5562509298324585, + "rewards/margins": 2.7043566703796387, + "rewards/rejected": -4.260607719421387, + "step": 8493 + }, + { + "epoch": 0.98, + "learning_rate": 6.320964532365679e-09, + "logits/chosen": -2.581007957458496, + "logits/rejected": -2.520467519760132, + "logps/chosen": -257.6852722167969, + "logps/rejected": -309.59716796875, + "loss": 0.2099, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1051421165466309, + "rewards/margins": 2.7887587547302246, + "rewards/rejected": -3.8939008712768555, + "step": 8494 + }, + { + "epoch": 0.98, + "learning_rate": 6.285848062741426e-09, + "logits/chosen": -2.198974370956421, + "logits/rejected": -2.3636386394500732, + "logps/chosen": -208.6853790283203, + "logps/rejected": -202.41114807128906, + "loss": 0.3292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12969352304935455, + "rewards/margins": 2.903615713119507, + "rewards/rejected": -3.0333094596862793, + "step": 8495 + }, + { + "epoch": 0.98, + "learning_rate": 6.250731593117172e-09, + "logits/chosen": -2.872110366821289, + "logits/rejected": -2.806889533996582, + "logps/chosen": -145.74301147460938, + "logps/rejected": -203.60739135742188, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.407367080450058, + "rewards/margins": 3.2024621963500977, + "rewards/rejected": -3.6098294258117676, + "step": 8496 + }, + { + "epoch": 0.98, + "learning_rate": 6.2156151234929185e-09, + "logits/chosen": -2.274214506149292, + "logits/rejected": -2.1042048931121826, + "logps/chosen": -199.17156982421875, + "logps/rejected": -259.7486267089844, + "loss": 0.1213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3836100399494171, + "rewards/margins": 3.120122194290161, + "rewards/rejected": -3.5037319660186768, + "step": 8497 + }, + { + "epoch": 0.98, + "learning_rate": 6.180498653868664e-09, + "logits/chosen": -2.9312455654144287, + "logits/rejected": -2.652904510498047, + "logps/chosen": -333.82720947265625, + "logps/rejected": -292.65618896484375, + "loss": 0.3402, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6703366637229919, + "rewards/margins": 2.1785459518432617, + "rewards/rejected": -2.8488826751708984, + "step": 8498 + }, + { + "epoch": 0.98, + "learning_rate": 6.1453821842444105e-09, + "logits/chosen": -2.4762117862701416, + "logits/rejected": -2.5071158409118652, + "logps/chosen": -315.080078125, + "logps/rejected": -318.1513977050781, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.610782265663147, + "rewards/margins": 2.97161865234375, + "rewards/rejected": -3.5824012756347656, + "step": 8499 + }, + { + "epoch": 0.98, + "learning_rate": 6.110265714620157e-09, + "logits/chosen": -1.987937569618225, + "logits/rejected": -2.2189435958862305, + "logps/chosen": -372.2132873535156, + "logps/rejected": -236.9523162841797, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3515934944152832, + "rewards/margins": 2.570495128631592, + "rewards/rejected": -3.922088623046875, + "step": 8500 + } + ], + "logging_steps": 1, + "max_steps": 8674, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}