diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4118 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998286203941731, + "eval_steps": 500, + "global_step": 2917, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.7123287671232876e-08, + "logits/chosen": -2.21498966217041, + "logits/rejected": -1.5619134902954102, + "logps/chosen": -448.18634033203125, + "logps/rejected": -230.1645965576172, + "loss": 0.1703, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.7123287671232878e-07, + "logits/chosen": -1.9158155918121338, + "logits/rejected": -1.947864055633545, + "logps/chosen": -236.8881072998047, + "logps/rejected": -271.3336181640625, + "loss": 0.1345, + "rewards/accuracies": 0.3888888955116272, + "rewards/chosen": 1.0350075172027573e-05, + "rewards/margins": -1.4042092516319826e-05, + "rewards/rejected": 2.4392174964305013e-05, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 3.4246575342465755e-07, + "logits/chosen": -1.9916515350341797, + "logits/rejected": -1.7161877155303955, + "logps/chosen": -181.11163330078125, + "logps/rejected": -196.61138916015625, + "loss": 0.1196, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 4.240129783283919e-06, + "rewards/margins": 0.00010243832366541028, + "rewards/rejected": -9.819817205425352e-05, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 5.136986301369864e-07, + "logits/chosen": -1.9248673915863037, + "logits/rejected": -1.8731294870376587, + "logps/chosen": -227.29898071289062, + "logps/rejected": -272.0036315917969, + "loss": 0.1104, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00023775253794156015, + "rewards/margins": 0.00018605976947583258, + "rewards/rejected": -0.00042381230741739273, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 6.849315068493151e-07, + "logits/chosen": -1.9346742630004883, + "logits/rejected": -1.6983258724212646, + "logps/chosen": -284.8092346191406, + "logps/rejected": -235.57955932617188, + "loss": 0.103, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00043622878729365766, + "rewards/margins": 0.0012934322003275156, + "rewards/rejected": -0.001729660900309682, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 8.561643835616439e-07, + "logits/chosen": -1.997926950454712, + "logits/rejected": -1.5733534097671509, + "logps/chosen": -234.2779998779297, + "logps/rejected": -186.01576232910156, + "loss": 0.1645, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.004240738693624735, + "rewards/margins": 0.0029823766089975834, + "rewards/rejected": -0.007223114371299744, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 1.0273972602739727e-06, + "logits/chosen": -1.8750112056732178, + "logits/rejected": -1.5471408367156982, + "logps/chosen": -226.1807098388672, + "logps/rejected": -231.8741455078125, + "loss": 0.1159, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.026738092303276062, + "rewards/margins": 0.011812428012490273, + "rewards/rejected": -0.038550518453121185, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 1.1986301369863014e-06, + "logits/chosen": -1.8785406351089478, + "logits/rejected": -1.6199915409088135, + "logps/chosen": -309.3338623046875, + "logps/rejected": -310.3027038574219, + "loss": 0.1245, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06837661564350128, + "rewards/margins": 0.023886824026703835, + "rewards/rejected": -0.09226343780755997, + "step": 70 + }, + { + "epoch": 0.03, + "learning_rate": 1.3698630136986302e-06, + "logits/chosen": -1.8794485330581665, + "logits/rejected": -1.588679552078247, + "logps/chosen": -331.4400939941406, + "logps/rejected": -316.6481018066406, + "loss": 0.1391, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09261558204889297, + "rewards/margins": 0.05595500394701958, + "rewards/rejected": -0.14857056736946106, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 1.541095890410959e-06, + "logits/chosen": -1.73800790309906, + "logits/rejected": -1.5066587924957275, + "logps/chosen": -349.1307678222656, + "logps/rejected": -445.1373596191406, + "loss": 0.1099, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1359734833240509, + "rewards/margins": 0.0638502761721611, + "rewards/rejected": -0.1998237520456314, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.7123287671232877e-06, + "logits/chosen": -1.9925429821014404, + "logits/rejected": -1.5489239692687988, + "logps/chosen": -431.8665466308594, + "logps/rejected": -449.634521484375, + "loss": 0.124, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14928413927555084, + "rewards/margins": 0.0672868937253952, + "rewards/rejected": -0.21657104790210724, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 1.8835616438356166e-06, + "logits/chosen": -1.938245177268982, + "logits/rejected": -1.5735212564468384, + "logps/chosen": -445.81585693359375, + "logps/rejected": -423.18975830078125, + "loss": 0.1018, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15499410033226013, + "rewards/margins": 0.051616422832012177, + "rewards/rejected": -0.2066105306148529, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 2.0547945205479454e-06, + "logits/chosen": -1.8546117544174194, + "logits/rejected": -1.6232588291168213, + "logps/chosen": -401.47039794921875, + "logps/rejected": -448.80926513671875, + "loss": 0.0749, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13759753108024597, + "rewards/margins": 0.0838770940899849, + "rewards/rejected": -0.22147460281848907, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 2.2260273972602743e-06, + "logits/chosen": -1.7494646310806274, + "logits/rejected": -1.6120729446411133, + "logps/chosen": -393.23095703125, + "logps/rejected": -414.7491760253906, + "loss": 0.067, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1571560651063919, + "rewards/margins": 0.04211575910449028, + "rewards/rejected": -0.19927182793617249, + "step": 130 + }, + { + "epoch": 0.05, + "learning_rate": 2.3972602739726027e-06, + "logits/chosen": -1.8314697742462158, + "logits/rejected": -1.4660162925720215, + "logps/chosen": -343.4485778808594, + "logps/rejected": -441.86578369140625, + "loss": 0.1026, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11567596346139908, + "rewards/margins": 0.09308116137981415, + "rewards/rejected": -0.20875711739063263, + "step": 140 + }, + { + "epoch": 0.05, + "learning_rate": 2.568493150684932e-06, + "logits/chosen": -1.769061803817749, + "logits/rejected": -1.5675675868988037, + "logps/chosen": -377.51739501953125, + "logps/rejected": -394.83465576171875, + "loss": 0.0902, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.15446873009204865, + "rewards/margins": 0.04394759237766266, + "rewards/rejected": -0.1984163224697113, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 2.7397260273972604e-06, + "logits/chosen": -1.7472827434539795, + "logits/rejected": -1.5711588859558105, + "logps/chosen": -377.2109069824219, + "logps/rejected": -495.3035583496094, + "loss": 0.0971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1695915013551712, + "rewards/margins": 0.08974287658929825, + "rewards/rejected": -0.25933438539505005, + "step": 160 + }, + { + "epoch": 0.06, + "learning_rate": 2.9109589041095893e-06, + "logits/chosen": -1.8495628833770752, + "logits/rejected": -1.5801998376846313, + "logps/chosen": -324.21337890625, + "logps/rejected": -401.52374267578125, + "loss": 0.1013, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06990089267492294, + "rewards/margins": 0.1042117103934288, + "rewards/rejected": -0.17411258816719055, + "step": 170 + }, + { + "epoch": 0.06, + "learning_rate": 3.082191780821918e-06, + "logits/chosen": -1.929395079612732, + "logits/rejected": -1.5674490928649902, + "logps/chosen": -332.16339111328125, + "logps/rejected": -346.80450439453125, + "loss": 0.087, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08584436774253845, + "rewards/margins": 0.055128611624240875, + "rewards/rejected": -0.14097298681735992, + "step": 180 + }, + { + "epoch": 0.07, + "learning_rate": 3.253424657534247e-06, + "logits/chosen": -2.0753486156463623, + "logits/rejected": -1.6974204778671265, + "logps/chosen": -361.5826721191406, + "logps/rejected": -427.80426025390625, + "loss": 0.0783, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09311284124851227, + "rewards/margins": 0.08312083035707474, + "rewards/rejected": -0.1762336641550064, + "step": 190 + }, + { + "epoch": 0.07, + "learning_rate": 3.4246575342465754e-06, + "logits/chosen": -2.206291675567627, + "logits/rejected": -1.9346578121185303, + "logps/chosen": -351.0352783203125, + "logps/rejected": -328.7702941894531, + "loss": 0.0932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.09371860325336456, + "rewards/margins": 0.030456852167844772, + "rewards/rejected": -0.12417546659708023, + "step": 200 + }, + { + "epoch": 0.07, + "learning_rate": 3.5958904109589043e-06, + "logits/chosen": -1.9507122039794922, + "logits/rejected": -1.7119086980819702, + "logps/chosen": -292.4835205078125, + "logps/rejected": -361.983154296875, + "loss": 0.0902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.10735298693180084, + "rewards/margins": 0.070334292948246, + "rewards/rejected": -0.17768728733062744, + "step": 210 + }, + { + "epoch": 0.08, + "learning_rate": 3.767123287671233e-06, + "logits/chosen": -1.8516258001327515, + "logits/rejected": -1.7747443914413452, + "logps/chosen": -324.3973388671875, + "logps/rejected": -373.698486328125, + "loss": 0.1124, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.1862211674451828, + "rewards/margins": 0.040039997547864914, + "rewards/rejected": -0.226261168718338, + "step": 220 + }, + { + "epoch": 0.08, + "learning_rate": 3.938356164383562e-06, + "logits/chosen": -2.0984139442443848, + "logits/rejected": -1.8558800220489502, + "logps/chosen": -437.14459228515625, + "logps/rejected": -443.85089111328125, + "loss": 0.0886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15925905108451843, + "rewards/margins": 0.06432916969060898, + "rewards/rejected": -0.223588228225708, + "step": 230 + }, + { + "epoch": 0.08, + "learning_rate": 4.109589041095891e-06, + "logits/chosen": -1.8379871845245361, + "logits/rejected": -1.6939672231674194, + "logps/chosen": -351.00531005859375, + "logps/rejected": -395.9615783691406, + "loss": 0.0919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1394365429878235, + "rewards/margins": 0.06487082690000534, + "rewards/rejected": -0.20430736243724823, + "step": 240 + }, + { + "epoch": 0.09, + "learning_rate": 4.28082191780822e-06, + "logits/chosen": -2.0115137100219727, + "logits/rejected": -1.811408281326294, + "logps/chosen": -411.9342346191406, + "logps/rejected": -407.1709289550781, + "loss": 0.085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.115334153175354, + "rewards/margins": 0.04877592995762825, + "rewards/rejected": -0.16411006450653076, + "step": 250 + }, + { + "epoch": 0.09, + "learning_rate": 4.4520547945205486e-06, + "logits/chosen": -1.917514443397522, + "logits/rejected": -1.6930125951766968, + "logps/chosen": -409.4349365234375, + "logps/rejected": -449.8536071777344, + "loss": 0.0983, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15566658973693848, + "rewards/margins": 0.08868524432182312, + "rewards/rejected": -0.24435186386108398, + "step": 260 + }, + { + "epoch": 0.09, + "learning_rate": 4.6232876712328774e-06, + "logits/chosen": -1.9329277276992798, + "logits/rejected": -1.6485719680786133, + "logps/chosen": -373.9496765136719, + "logps/rejected": -461.1018981933594, + "loss": 0.0747, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17361843585968018, + "rewards/margins": 0.12690795958042145, + "rewards/rejected": -0.30052638053894043, + "step": 270 + }, + { + "epoch": 0.1, + "learning_rate": 4.7945205479452054e-06, + "logits/chosen": -2.2051990032196045, + "logits/rejected": -1.7319234609603882, + "logps/chosen": -330.0890197753906, + "logps/rejected": -398.8971252441406, + "loss": 0.1105, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09975247085094452, + "rewards/margins": 0.12828990817070007, + "rewards/rejected": -0.2280423939228058, + "step": 280 + }, + { + "epoch": 0.1, + "learning_rate": 4.965753424657534e-06, + "logits/chosen": -2.15606427192688, + "logits/rejected": -2.102695941925049, + "logps/chosen": -421.451171875, + "logps/rejected": -486.7090759277344, + "loss": 0.0951, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15245939791202545, + "rewards/margins": 0.05438145995140076, + "rewards/rejected": -0.20684084296226501, + "step": 290 + }, + { + "epoch": 0.1, + "learning_rate": 4.99988541499203e-06, + "logits/chosen": -1.9494987726211548, + "logits/rejected": -1.7490851879119873, + "logps/chosen": -476.6026916503906, + "logps/rejected": -605.9563598632812, + "loss": 0.1238, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25119680166244507, + "rewards/margins": 0.1370239108800888, + "rewards/rejected": -0.38822072744369507, + "step": 300 + }, + { + "epoch": 0.11, + "learning_rate": 4.999419931399174e-06, + "logits/chosen": -1.973769187927246, + "logits/rejected": -2.039909839630127, + "logps/chosen": -482.8154296875, + "logps/rejected": -553.8546142578125, + "loss": 0.0801, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.27585893869400024, + "rewards/margins": 0.04923711344599724, + "rewards/rejected": -0.325096070766449, + "step": 310 + }, + { + "epoch": 0.11, + "learning_rate": 4.998596454278661e-06, + "logits/chosen": -2.062340021133423, + "logits/rejected": -1.94599187374115, + "logps/chosen": -504.60546875, + "logps/rejected": -554.1131591796875, + "loss": 0.0724, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24708731472492218, + "rewards/margins": 0.06553421169519424, + "rewards/rejected": -0.31262150406837463, + "step": 320 + }, + { + "epoch": 0.11, + "learning_rate": 4.99741510157765e-06, + "logits/chosen": -1.9648933410644531, + "logits/rejected": -1.780860185623169, + "logps/chosen": -620.1107788085938, + "logps/rejected": -667.7522583007812, + "loss": 0.1007, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.313930869102478, + "rewards/margins": 0.06759389489889145, + "rewards/rejected": -0.3815247416496277, + "step": 330 + }, + { + "epoch": 0.12, + "learning_rate": 4.995876042502048e-06, + "logits/chosen": -2.1744627952575684, + "logits/rejected": -1.8270336389541626, + "logps/chosen": -484.35101318359375, + "logps/rejected": -553.4263305664062, + "loss": 0.1147, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2436065375804901, + "rewards/margins": 0.08232472836971283, + "rewards/rejected": -0.32593125104904175, + "step": 340 + }, + { + "epoch": 0.12, + "learning_rate": 4.993979497492282e-06, + "logits/chosen": -1.914698600769043, + "logits/rejected": -1.7681747674942017, + "logps/chosen": -472.70794677734375, + "logps/rejected": -629.3675537109375, + "loss": 0.1157, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2542678713798523, + "rewards/margins": 0.09860087186098099, + "rewards/rejected": -0.3528687059879303, + "step": 350 + }, + { + "epoch": 0.12, + "learning_rate": 4.9917257381917115e-06, + "logits/chosen": -2.0197696685791016, + "logits/rejected": -1.7743446826934814, + "logps/chosen": -639.1318359375, + "logps/rejected": -686.443603515625, + "loss": 0.1203, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3869578242301941, + "rewards/margins": 0.06751660257577896, + "rewards/rejected": -0.45447444915771484, + "step": 360 + }, + { + "epoch": 0.13, + "learning_rate": 4.989115087407737e-06, + "logits/chosen": -1.904762864112854, + "logits/rejected": -1.8776410818099976, + "logps/chosen": -539.9891357421875, + "logps/rejected": -601.7970581054688, + "loss": 0.0625, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3085237145423889, + "rewards/margins": 0.08412410318851471, + "rewards/rejected": -0.3926478326320648, + "step": 370 + }, + { + "epoch": 0.13, + "learning_rate": 4.986147919065551e-06, + "logits/chosen": -1.876813530921936, + "logits/rejected": -1.7952611446380615, + "logps/chosen": -488.16107177734375, + "logps/rejected": -583.83056640625, + "loss": 0.0983, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25909119844436646, + "rewards/margins": 0.04833118990063667, + "rewards/rejected": -0.30742236971855164, + "step": 380 + }, + { + "epoch": 0.13, + "learning_rate": 4.982824658154589e-06, + "logits/chosen": -2.1078734397888184, + "logits/rejected": -1.9783456325531006, + "logps/chosen": -396.71630859375, + "logps/rejected": -487.6923828125, + "loss": 0.0596, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23540160059928894, + "rewards/margins": 0.09165269136428833, + "rewards/rejected": -0.32705432176589966, + "step": 390 + }, + { + "epoch": 0.14, + "learning_rate": 4.979145780667652e-06, + "logits/chosen": -2.0578982830047607, + "logits/rejected": -1.7134244441986084, + "logps/chosen": -535.1932983398438, + "logps/rejected": -577.6029663085938, + "loss": 0.0977, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24049289524555206, + "rewards/margins": 0.0864410474896431, + "rewards/rejected": -0.32693392038345337, + "step": 400 + }, + { + "epoch": 0.14, + "learning_rate": 4.975111813532733e-06, + "logits/chosen": -1.844091773033142, + "logits/rejected": -1.5848596096038818, + "logps/chosen": -569.6875, + "logps/rejected": -624.45849609375, + "loss": 0.0983, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30232542753219604, + "rewards/margins": 0.08498513698577881, + "rewards/rejected": -0.38731056451797485, + "step": 410 + }, + { + "epoch": 0.14, + "learning_rate": 4.970723334537547e-06, + "logits/chosen": -1.7672590017318726, + "logits/rejected": -1.5241564512252808, + "logps/chosen": -493.00628662109375, + "logps/rejected": -542.5633544921875, + "loss": 0.0683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2543202340602875, + "rewards/margins": 0.0733615979552269, + "rewards/rejected": -0.32768186926841736, + "step": 420 + }, + { + "epoch": 0.15, + "learning_rate": 4.965980972246767e-06, + "logits/chosen": -1.9794318675994873, + "logits/rejected": -1.9301669597625732, + "logps/chosen": -460.9219665527344, + "logps/rejected": -573.4908447265625, + "loss": 0.1151, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.26725929975509644, + "rewards/margins": 0.08439052850008011, + "rewards/rejected": -0.3516498804092407, + "step": 430 + }, + { + "epoch": 0.15, + "learning_rate": 4.960885405912001e-06, + "logits/chosen": -1.9738355875015259, + "logits/rejected": -1.8872768878936768, + "logps/chosen": -490.4803771972656, + "logps/rejected": -552.944091796875, + "loss": 0.1051, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22316575050354004, + "rewards/margins": 0.07666633278131485, + "rewards/rejected": -0.2998320460319519, + "step": 440 + }, + { + "epoch": 0.15, + "learning_rate": 4.955437365374499e-06, + "logits/chosen": -2.042649507522583, + "logits/rejected": -1.833367943763733, + "logps/chosen": -427.7418518066406, + "logps/rejected": -428.240966796875, + "loss": 0.0845, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.15850776433944702, + "rewards/margins": 0.04146740958094597, + "rewards/rejected": -0.1999751627445221, + "step": 450 + }, + { + "epoch": 0.16, + "learning_rate": 4.949637630960618e-06, + "logits/chosen": -2.2080771923065186, + "logits/rejected": -1.9125267267227173, + "logps/chosen": -363.9242858886719, + "logps/rejected": -413.38140869140625, + "loss": 0.1212, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.151325061917305, + "rewards/margins": 0.08935161679983139, + "rewards/rejected": -0.2406766712665558, + "step": 460 + }, + { + "epoch": 0.16, + "learning_rate": 4.943487033370056e-06, + "logits/chosen": -2.1318681240081787, + "logits/rejected": -1.7986198663711548, + "logps/chosen": -613.80712890625, + "logps/rejected": -721.6338500976562, + "loss": 0.0967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3193725049495697, + "rewards/margins": 0.10743044316768646, + "rewards/rejected": -0.42680296301841736, + "step": 470 + }, + { + "epoch": 0.16, + "learning_rate": 4.936986453556871e-06, + "logits/chosen": -2.0002779960632324, + "logits/rejected": -1.736196756362915, + "logps/chosen": -580.8267211914062, + "logps/rejected": -735.5391845703125, + "loss": 0.0949, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3693394362926483, + "rewards/margins": 0.12292595952749252, + "rewards/rejected": -0.49226540327072144, + "step": 480 + }, + { + "epoch": 0.17, + "learning_rate": 4.930136822603299e-06, + "logits/chosen": -1.894122838973999, + "logits/rejected": -1.7605243921279907, + "logps/chosen": -661.7432250976562, + "logps/rejected": -752.2081909179688, + "loss": 0.0547, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.40788716077804565, + "rewards/margins": 0.11583086103200912, + "rewards/rejected": -0.5237180590629578, + "step": 490 + }, + { + "epoch": 0.17, + "learning_rate": 4.922939121586396e-06, + "logits/chosen": -1.8540306091308594, + "logits/rejected": -1.6888984441757202, + "logps/chosen": -703.7432861328125, + "logps/rejected": -763.1231689453125, + "loss": 0.1083, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4497678875923157, + "rewards/margins": 0.08937375247478485, + "rewards/rejected": -0.5391416549682617, + "step": 500 + }, + { + "epoch": 0.17, + "learning_rate": 4.915394381437517e-06, + "logits/chosen": -2.064244508743286, + "logits/rejected": -1.787021279335022, + "logps/chosen": -432.42877197265625, + "logps/rejected": -491.94720458984375, + "loss": 0.1268, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.30147069692611694, + "rewards/margins": 0.08128456771373749, + "rewards/rejected": -0.3827553391456604, + "step": 510 + }, + { + "epoch": 0.18, + "learning_rate": 4.907503682794656e-06, + "logits/chosen": -2.1678388118743896, + "logits/rejected": -1.9495502710342407, + "logps/chosen": -532.3627319335938, + "logps/rejected": -569.6402587890625, + "loss": 0.106, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.314094603061676, + "rewards/margins": 0.050387926399707794, + "rewards/rejected": -0.36448249220848083, + "step": 520 + }, + { + "epoch": 0.18, + "learning_rate": 4.899268155847667e-06, + "logits/chosen": -1.9529002904891968, + "logits/rejected": -1.7975317239761353, + "logps/chosen": -348.3830871582031, + "logps/rejected": -417.27947998046875, + "loss": 0.0803, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20450131595134735, + "rewards/margins": 0.07301940768957138, + "rewards/rejected": -0.2775207459926605, + "step": 530 + }, + { + "epoch": 0.19, + "learning_rate": 4.890688980176381e-06, + "logits/chosen": -2.286426067352295, + "logits/rejected": -1.9330307245254517, + "logps/chosen": -511.94647216796875, + "logps/rejected": -562.6546630859375, + "loss": 0.0752, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24677078425884247, + "rewards/margins": 0.09590072929859161, + "rewards/rejected": -0.34267157316207886, + "step": 540 + }, + { + "epoch": 0.19, + "learning_rate": 4.881767384581658e-06, + "logits/chosen": -2.3378746509552, + "logits/rejected": -2.0311076641082764, + "logps/chosen": -499.9852600097656, + "logps/rejected": -580.5631103515625, + "loss": 0.0586, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24438512325286865, + "rewards/margins": 0.08834482729434967, + "rewards/rejected": -0.3327299654483795, + "step": 550 + }, + { + "epoch": 0.19, + "learning_rate": 4.872504646909387e-06, + "logits/chosen": -2.234276056289673, + "logits/rejected": -1.9043071269989014, + "logps/chosen": -550.0889282226562, + "logps/rejected": -577.0806884765625, + "loss": 0.0994, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.29700541496276855, + "rewards/margins": 0.07648201286792755, + "rewards/rejected": -0.3734873831272125, + "step": 560 + }, + { + "epoch": 0.2, + "learning_rate": 4.8629020938674536e-06, + "logits/chosen": -2.295952558517456, + "logits/rejected": -1.7963495254516602, + "logps/chosen": -415.9605407714844, + "logps/rejected": -460.894287109375, + "loss": 0.0775, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14038066565990448, + "rewards/margins": 0.0807698667049408, + "rewards/rejected": -0.22115054726600647, + "step": 570 + }, + { + "epoch": 0.2, + "learning_rate": 4.852961100835717e-06, + "logits/chosen": -2.271327257156372, + "logits/rejected": -1.7637627124786377, + "logps/chosen": -500.87481689453125, + "logps/rejected": -481.28228759765625, + "loss": 0.1147, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21432165801525116, + "rewards/margins": 0.062083542346954346, + "rewards/rejected": -0.2764051854610443, + "step": 580 + }, + { + "epoch": 0.2, + "learning_rate": 4.84268309166902e-06, + "logits/chosen": -1.9748462438583374, + "logits/rejected": -1.959495186805725, + "logps/chosen": -416.51531982421875, + "logps/rejected": -488.2625427246094, + "loss": 0.0904, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.24128298461437225, + "rewards/margins": 0.043503545224666595, + "rewards/rejected": -0.28478652238845825, + "step": 590 + }, + { + "epoch": 0.21, + "learning_rate": 4.832069538493237e-06, + "logits/chosen": -2.1981587409973145, + "logits/rejected": -1.8468116521835327, + "logps/chosen": -520.8303833007812, + "logps/rejected": -545.0802001953125, + "loss": 0.0926, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.259421169757843, + "rewards/margins": 0.06593780219554901, + "rewards/rejected": -0.3253589868545532, + "step": 600 + }, + { + "epoch": 0.21, + "learning_rate": 4.821121961494431e-06, + "logits/chosen": -2.008756160736084, + "logits/rejected": -1.8565582036972046, + "logps/chosen": -487.271240234375, + "logps/rejected": -622.4865112304688, + "loss": 0.1233, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.26701903343200684, + "rewards/margins": 0.1124114841222763, + "rewards/rejected": -0.37943053245544434, + "step": 610 + }, + { + "epoch": 0.21, + "learning_rate": 4.80984192870111e-06, + "logits/chosen": -2.2143161296844482, + "logits/rejected": -2.0251948833465576, + "logps/chosen": -475.0022888183594, + "logps/rejected": -536.0418090820312, + "loss": 0.1049, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23833520710468292, + "rewards/margins": 0.08123020827770233, + "rewards/rejected": -0.31956541538238525, + "step": 620 + }, + { + "epoch": 0.22, + "learning_rate": 4.798231055759643e-06, + "logits/chosen": -2.1074166297912598, + "logits/rejected": -1.8376226425170898, + "logps/chosen": -575.6253662109375, + "logps/rejected": -653.4076538085938, + "loss": 0.0738, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31133222579956055, + "rewards/margins": 0.125440776348114, + "rewards/rejected": -0.43677300214767456, + "step": 630 + }, + { + "epoch": 0.22, + "learning_rate": 4.786291005702841e-06, + "logits/chosen": -1.996763825416565, + "logits/rejected": -1.8434244394302368, + "logps/chosen": -673.5606689453125, + "logps/rejected": -735.9039306640625, + "loss": 0.0861, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4302713871002197, + "rewards/margins": 0.07868895679712296, + "rewards/rejected": -0.5089603662490845, + "step": 640 + }, + { + "epoch": 0.22, + "learning_rate": 4.7740234887117745e-06, + "logits/chosen": -2.1286087036132812, + "logits/rejected": -2.0892319679260254, + "logps/chosen": -660.74267578125, + "logps/rejected": -708.2584838867188, + "loss": 0.0776, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3790927827358246, + "rewards/margins": 0.09251677989959717, + "rewards/rejected": -0.47160959243774414, + "step": 650 + }, + { + "epoch": 0.23, + "learning_rate": 4.761430261870804e-06, + "logits/chosen": -2.271576404571533, + "logits/rejected": -2.0834946632385254, + "logps/chosen": -636.5614624023438, + "logps/rejected": -729.883544921875, + "loss": 0.0955, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3770856559276581, + "rewards/margins": 0.0929422676563263, + "rewards/rejected": -0.4700279235839844, + "step": 660 + }, + { + "epoch": 0.23, + "learning_rate": 4.748513128915928e-06, + "logits/chosen": -2.1836562156677246, + "logits/rejected": -1.9008392095565796, + "logps/chosen": -611.7904663085938, + "logps/rejected": -666.9830932617188, + "loss": 0.0629, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.35665163397789, + "rewards/margins": 0.07555453479290009, + "rewards/rejected": -0.4322062134742737, + "step": 670 + }, + { + "epoch": 0.23, + "learning_rate": 4.735273939976425e-06, + "logits/chosen": -2.0491878986358643, + "logits/rejected": -1.8870747089385986, + "logps/chosen": -617.8563842773438, + "logps/rejected": -739.245361328125, + "loss": 0.0922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3806142210960388, + "rewards/margins": 0.11178413778543472, + "rewards/rejected": -0.49239835143089294, + "step": 680 + }, + { + "epoch": 0.24, + "learning_rate": 4.721714591309859e-06, + "logits/chosen": -2.241105079650879, + "logits/rejected": -1.764789342880249, + "logps/chosen": -493.39361572265625, + "logps/rejected": -587.4572143554688, + "loss": 0.0869, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25177472829818726, + "rewards/margins": 0.11156761646270752, + "rewards/rejected": -0.3633423447608948, + "step": 690 + }, + { + "epoch": 0.24, + "learning_rate": 4.707837025030478e-06, + "logits/chosen": -2.0533032417297363, + "logits/rejected": -1.9060271978378296, + "logps/chosen": -480.1116638183594, + "logps/rejected": -561.89599609375, + "loss": 0.0799, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.291046142578125, + "rewards/margins": 0.0875721424818039, + "rewards/rejected": -0.3786182999610901, + "step": 700 + }, + { + "epoch": 0.24, + "learning_rate": 4.693643228831046e-06, + "logits/chosen": -2.1423745155334473, + "logits/rejected": -1.8586080074310303, + "logps/chosen": -486.69598388671875, + "logps/rejected": -578.1256713867188, + "loss": 0.0801, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2516762316226959, + "rewards/margins": 0.10918694734573364, + "rewards/rejected": -0.3608631491661072, + "step": 710 + }, + { + "epoch": 0.25, + "learning_rate": 4.67913523569814e-06, + "logits/chosen": -2.124239444732666, + "logits/rejected": -1.8211021423339844, + "logps/chosen": -505.65789794921875, + "logps/rejected": -655.1524658203125, + "loss": 0.0577, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2613288164138794, + "rewards/margins": 0.16436012089252472, + "rewards/rejected": -0.4256889224052429, + "step": 720 + }, + { + "epoch": 0.25, + "learning_rate": 4.664315123620965e-06, + "logits/chosen": -2.059915542602539, + "logits/rejected": -1.8637025356292725, + "logps/chosen": -627.88232421875, + "logps/rejected": -771.10595703125, + "loss": 0.0728, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3556235432624817, + "rewards/margins": 0.15885277092456818, + "rewards/rejected": -0.5144763588905334, + "step": 730 + }, + { + "epoch": 0.25, + "learning_rate": 4.649185015293728e-06, + "logits/chosen": -2.202380895614624, + "logits/rejected": -1.7330690622329712, + "logps/chosen": -586.8130493164062, + "logps/rejected": -704.4319458007812, + "loss": 0.0514, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3131484091281891, + "rewards/margins": 0.13473856449127197, + "rewards/rejected": -0.44788694381713867, + "step": 740 + }, + { + "epoch": 0.26, + "learning_rate": 4.6337470778115946e-06, + "logits/chosen": -2.2767772674560547, + "logits/rejected": -1.9689744710922241, + "logps/chosen": -588.1864013671875, + "logps/rejected": -624.829345703125, + "loss": 0.0587, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2952669858932495, + "rewards/margins": 0.10641799122095108, + "rewards/rejected": -0.4016849398612976, + "step": 750 + }, + { + "epoch": 0.26, + "learning_rate": 4.6180035223603e-06, + "logits/chosen": -2.1548593044281006, + "logits/rejected": -1.7463791370391846, + "logps/chosen": -608.1296997070312, + "logps/rejected": -624.1170043945312, + "loss": 0.0462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3367452323436737, + "rewards/margins": 0.07480922341346741, + "rewards/rejected": -0.4115544855594635, + "step": 760 + }, + { + "epoch": 0.26, + "learning_rate": 4.60195660389944e-06, + "logits/chosen": -1.9999465942382812, + "logits/rejected": -1.7375587224960327, + "logps/chosen": -535.9517822265625, + "logps/rejected": -687.1197509765625, + "loss": 0.1019, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3230445384979248, + "rewards/margins": 0.11864666640758514, + "rewards/rejected": -0.44169121980667114, + "step": 770 + }, + { + "epoch": 0.27, + "learning_rate": 4.585608620839487e-06, + "logits/chosen": -2.0938560962677, + "logits/rejected": -1.678571343421936, + "logps/chosen": -540.8283081054688, + "logps/rejected": -611.6381225585938, + "loss": 0.0954, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3455389142036438, + "rewards/margins": 0.125118687748909, + "rewards/rejected": -0.4706575870513916, + "step": 780 + }, + { + "epoch": 0.27, + "learning_rate": 4.56896191471259e-06, + "logits/chosen": -2.2690582275390625, + "logits/rejected": -1.8676296472549438, + "logps/chosen": -589.1588745117188, + "logps/rejected": -705.4854736328125, + "loss": 0.0739, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3118017613887787, + "rewards/margins": 0.1364874541759491, + "rewards/rejected": -0.4482892155647278, + "step": 790 + }, + { + "epoch": 0.27, + "learning_rate": 4.552018869837197e-06, + "logits/chosen": -2.1564137935638428, + "logits/rejected": -1.83013916015625, + "logps/chosen": -609.3465576171875, + "logps/rejected": -699.4984130859375, + "loss": 0.0745, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3426007628440857, + "rewards/margins": 0.13322630524635315, + "rewards/rejected": -0.47582703828811646, + "step": 800 + }, + { + "epoch": 0.28, + "learning_rate": 4.534781912976546e-06, + "logits/chosen": -2.1592516899108887, + "logits/rejected": -1.8255417346954346, + "logps/chosen": -491.94366455078125, + "logps/rejected": -572.9866943359375, + "loss": 0.0519, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.29055461287498474, + "rewards/margins": 0.10878726094961166, + "rewards/rejected": -0.399341881275177, + "step": 810 + }, + { + "epoch": 0.28, + "learning_rate": 4.517253512991077e-06, + "logits/chosen": -2.1750411987304688, + "logits/rejected": -1.8149001598358154, + "logps/chosen": -588.0299682617188, + "logps/rejected": -729.2427368164062, + "loss": 0.0431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3461843430995941, + "rewards/margins": 0.1527387797832489, + "rewards/rejected": -0.498923122882843, + "step": 820 + }, + { + "epoch": 0.28, + "learning_rate": 4.499436180484816e-06, + "logits/chosen": -2.0803980827331543, + "logits/rejected": -1.8419866561889648, + "logps/chosen": -657.2296142578125, + "logps/rejected": -715.0289916992188, + "loss": 0.0468, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3858215808868408, + "rewards/margins": 0.1072002500295639, + "rewards/rejected": -0.49302178621292114, + "step": 830 + }, + { + "epoch": 0.29, + "learning_rate": 4.481332467445784e-06, + "logits/chosen": -2.1348459720611572, + "logits/rejected": -1.8123409748077393, + "logps/chosen": -555.2808837890625, + "logps/rejected": -704.3569946289062, + "loss": 0.0561, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.36617225408554077, + "rewards/margins": 0.1446944773197174, + "rewards/rejected": -0.5108667016029358, + "step": 840 + }, + { + "epoch": 0.29, + "learning_rate": 4.462944966880464e-06, + "logits/chosen": -2.137538433074951, + "logits/rejected": -1.9754664897918701, + "logps/chosen": -614.5938110351562, + "logps/rejected": -646.7432250976562, + "loss": 0.0661, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3780440390110016, + "rewards/margins": 0.04127226397395134, + "rewards/rejected": -0.41931629180908203, + "step": 850 + }, + { + "epoch": 0.29, + "learning_rate": 4.444276312442415e-06, + "logits/chosen": -2.0289080142974854, + "logits/rejected": -1.7629003524780273, + "logps/chosen": -541.0709838867188, + "logps/rejected": -660.1776123046875, + "loss": 0.1268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3162756562232971, + "rewards/margins": 0.12853361666202545, + "rewards/rejected": -0.4448092579841614, + "step": 860 + }, + { + "epoch": 0.3, + "learning_rate": 4.425329178055044e-06, + "logits/chosen": -2.194471597671509, + "logits/rejected": -2.0296597480773926, + "logps/chosen": -475.4537048339844, + "logps/rejected": -510.2806701660156, + "loss": 0.0885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2241251915693283, + "rewards/margins": 0.07522304356098175, + "rewards/rejected": -0.29934826493263245, + "step": 870 + }, + { + "epoch": 0.3, + "learning_rate": 4.40610627752862e-06, + "logits/chosen": -2.2251474857330322, + "logits/rejected": -1.7184536457061768, + "logps/chosen": -499.7225036621094, + "logps/rejected": -632.2643432617188, + "loss": 0.0669, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2541348338127136, + "rewards/margins": 0.1577138453722, + "rewards/rejected": -0.4118487238883972, + "step": 880 + }, + { + "epoch": 0.31, + "learning_rate": 4.386610364171575e-06, + "logits/chosen": -2.0547173023223877, + "logits/rejected": -1.9380995035171509, + "logps/chosen": -583.1637573242188, + "logps/rejected": -675.6350708007812, + "loss": 0.0608, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.31417426466941833, + "rewards/margins": 0.07938437163829803, + "rewards/rejected": -0.3935586214065552, + "step": 890 + }, + { + "epoch": 0.31, + "learning_rate": 4.366844230396145e-06, + "logits/chosen": -2.1797802448272705, + "logits/rejected": -1.739689588546753, + "logps/chosen": -619.1736450195312, + "logps/rejected": -779.1910400390625, + "loss": 0.0831, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.354561984539032, + "rewards/margins": 0.16057774424552917, + "rewards/rejected": -0.5151397585868835, + "step": 900 + }, + { + "epoch": 0.31, + "learning_rate": 4.346810707318409e-06, + "logits/chosen": -2.101902723312378, + "logits/rejected": -1.756699800491333, + "logps/chosen": -550.2091064453125, + "logps/rejected": -652.6905517578125, + "loss": 0.0876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34942546486854553, + "rewards/margins": 0.1116580218076706, + "rewards/rejected": -0.4610835015773773, + "step": 910 + }, + { + "epoch": 0.32, + "learning_rate": 4.326512664352788e-06, + "logits/chosen": -2.1261112689971924, + "logits/rejected": -1.695481538772583, + "logps/chosen": -602.4801635742188, + "logps/rejected": -667.7581176757812, + "loss": 0.0785, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3986489176750183, + "rewards/margins": 0.07721661776304245, + "rewards/rejected": -0.47586554288864136, + "step": 920 + }, + { + "epoch": 0.32, + "learning_rate": 4.30595300880106e-06, + "logits/chosen": -1.9621734619140625, + "logits/rejected": -1.8363412618637085, + "logps/chosen": -516.5333862304688, + "logps/rejected": -650.1600952148438, + "loss": 0.0876, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3551289439201355, + "rewards/margins": 0.12143947929143906, + "rewards/rejected": -0.47656846046447754, + "step": 930 + }, + { + "epoch": 0.32, + "learning_rate": 4.285134685435941e-06, + "logits/chosen": -2.111262321472168, + "logits/rejected": -1.8083570003509521, + "logps/chosen": -619.1475830078125, + "logps/rejected": -671.5328369140625, + "loss": 0.069, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3735567033290863, + "rewards/margins": 0.09376771748065948, + "rewards/rejected": -0.467324435710907, + "step": 940 + }, + { + "epoch": 0.33, + "learning_rate": 4.264060676079302e-06, + "logits/chosen": -1.897774338722229, + "logits/rejected": -1.758368730545044, + "logps/chosen": -648.8074951171875, + "logps/rejected": -753.0172119140625, + "loss": 0.099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3972818851470947, + "rewards/margins": 0.11452829837799072, + "rewards/rejected": -0.5118101239204407, + "step": 950 + }, + { + "epoch": 0.33, + "learning_rate": 4.242733999175087e-06, + "logits/chosen": -2.1442208290100098, + "logits/rejected": -1.8442842960357666, + "logps/chosen": -574.822265625, + "logps/rejected": -674.7249145507812, + "loss": 0.0727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3319811522960663, + "rewards/margins": 0.10243819653987885, + "rewards/rejected": -0.4344193935394287, + "step": 960 + }, + { + "epoch": 0.33, + "learning_rate": 4.221157709356973e-06, + "logits/chosen": -2.069833517074585, + "logits/rejected": -2.053417682647705, + "logps/chosen": -473.2567443847656, + "logps/rejected": -551.2691650390625, + "loss": 0.0748, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.28391993045806885, + "rewards/margins": 0.07798723131418228, + "rewards/rejected": -0.3619071841239929, + "step": 970 + }, + { + "epoch": 0.34, + "learning_rate": 4.199334897010857e-06, + "logits/chosen": -2.428363561630249, + "logits/rejected": -1.9904381036758423, + "logps/chosen": -574.2955932617188, + "logps/rejected": -636.1633911132812, + "loss": 0.0508, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.29294562339782715, + "rewards/margins": 0.11134348064661026, + "rewards/rejected": -0.404289186000824, + "step": 980 + }, + { + "epoch": 0.34, + "learning_rate": 4.177268687832216e-06, + "logits/chosen": -2.2618508338928223, + "logits/rejected": -1.9453493356704712, + "logps/chosen": -583.1436767578125, + "logps/rejected": -659.47998046875, + "loss": 0.0492, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3153453767299652, + "rewards/margins": 0.0904412493109703, + "rewards/rejected": -0.4057866036891937, + "step": 990 + }, + { + "epoch": 0.34, + "learning_rate": 4.154962242378413e-06, + "logits/chosen": -2.2178263664245605, + "logits/rejected": -1.5476510524749756, + "logps/chosen": -663.134765625, + "logps/rejected": -688.6644287109375, + "loss": 0.0723, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3838910460472107, + "rewards/margins": 0.10614491999149323, + "rewards/rejected": -0.4900360107421875, + "step": 1000 + }, + { + "epoch": 0.35, + "learning_rate": 4.132418755616006e-06, + "logits/chosen": -2.1056065559387207, + "logits/rejected": -1.8169822692871094, + "logps/chosen": -689.2779541015625, + "logps/rejected": -805.4830322265625, + "loss": 0.0795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.42020684480667114, + "rewards/margins": 0.09372207522392273, + "rewards/rejected": -0.5139288902282715, + "step": 1010 + }, + { + "epoch": 0.35, + "learning_rate": 4.109641456463135e-06, + "logits/chosen": -2.270031452178955, + "logits/rejected": -2.0831856727600098, + "logps/chosen": -583.760009765625, + "logps/rejected": -599.2212524414062, + "loss": 0.1385, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3357154428958893, + "rewards/margins": 0.05204144865274429, + "rewards/rejected": -0.387756884098053, + "step": 1020 + }, + { + "epoch": 0.35, + "learning_rate": 4.086633607327036e-06, + "logits/chosen": -1.9891811609268188, + "logits/rejected": -1.892112135887146, + "logps/chosen": -592.84814453125, + "logps/rejected": -705.7501831054688, + "loss": 0.0726, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.40938377380371094, + "rewards/margins": 0.07667826116085052, + "rewards/rejected": -0.4860619902610779, + "step": 1030 + }, + { + "epoch": 0.36, + "learning_rate": 4.06339850363677e-06, + "logits/chosen": -2.229407548904419, + "logits/rejected": -1.667838454246521, + "logps/chosen": -679.7462158203125, + "logps/rejected": -710.1227416992188, + "loss": 0.0926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.380540132522583, + "rewards/margins": 0.09510533511638641, + "rewards/rejected": -0.4756454527378082, + "step": 1040 + }, + { + "epoch": 0.36, + "learning_rate": 4.039939473371213e-06, + "logits/chosen": -2.238617420196533, + "logits/rejected": -1.8673069477081299, + "logps/chosen": -528.8187255859375, + "logps/rejected": -692.1638793945312, + "loss": 0.0882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.32173532247543335, + "rewards/margins": 0.16088572144508362, + "rewards/rejected": -0.48262104392051697, + "step": 1050 + }, + { + "epoch": 0.36, + "learning_rate": 4.01625987658239e-06, + "logits/chosen": -2.209980010986328, + "logits/rejected": -1.9042339324951172, + "logps/chosen": -692.3306884765625, + "logps/rejected": -672.4944458007812, + "loss": 0.0636, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3526589572429657, + "rewards/margins": 0.04882120341062546, + "rewards/rejected": -0.40148013830184937, + "step": 1060 + }, + { + "epoch": 0.37, + "learning_rate": 3.992363104914211e-06, + "logits/chosen": -2.256624698638916, + "logits/rejected": -1.9974693059921265, + "logps/chosen": -560.591796875, + "logps/rejected": -578.005859375, + "loss": 0.0446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27544528245925903, + "rewards/margins": 0.07316794246435165, + "rewards/rejected": -0.3486132025718689, + "step": 1070 + }, + { + "epoch": 0.37, + "learning_rate": 3.9682525811166835e-06, + "logits/chosen": -2.007150173187256, + "logits/rejected": -1.7732871770858765, + "logps/chosen": -560.3460693359375, + "logps/rejected": -629.5783081054688, + "loss": 0.0836, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.37917017936706543, + "rewards/margins": 0.04371767118573189, + "rewards/rejected": -0.4228878617286682, + "step": 1080 + }, + { + "epoch": 0.37, + "learning_rate": 3.943931758555669e-06, + "logits/chosen": -2.0913567543029785, + "logits/rejected": -1.7354761362075806, + "logps/chosen": -608.615478515625, + "logps/rejected": -674.1383666992188, + "loss": 0.0904, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3448850214481354, + "rewards/margins": 0.10200424492359161, + "rewards/rejected": -0.4468892514705658, + "step": 1090 + }, + { + "epoch": 0.38, + "learning_rate": 3.91940412071826e-06, + "logits/chosen": -2.225922107696533, + "logits/rejected": -1.8326537609100342, + "logps/chosen": -651.896728515625, + "logps/rejected": -687.8331298828125, + "loss": 0.0807, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3168756365776062, + "rewards/margins": 0.10071317106485367, + "rewards/rejected": -0.4175888001918793, + "step": 1100 + }, + { + "epoch": 0.38, + "learning_rate": 3.894673180713829e-06, + "logits/chosen": -2.0696487426757812, + "logits/rejected": -1.9062414169311523, + "logps/chosen": -542.2778930664062, + "logps/rejected": -659.351318359375, + "loss": 0.1094, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.29152894020080566, + "rewards/margins": 0.09752384573221207, + "rewards/rejected": -0.38905277848243713, + "step": 1110 + }, + { + "epoch": 0.38, + "learning_rate": 3.869742480770855e-06, + "logits/chosen": -2.371598958969116, + "logits/rejected": -2.1790008544921875, + "logps/chosen": -508.5203552246094, + "logps/rejected": -568.6731567382812, + "loss": 0.0912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2144095003604889, + "rewards/margins": 0.08987125009298325, + "rewards/rejected": -0.30428069829940796, + "step": 1120 + }, + { + "epoch": 0.39, + "learning_rate": 3.844615591729558e-06, + "logits/chosen": -2.045975685119629, + "logits/rejected": -1.9603124856948853, + "logps/chosen": -437.69635009765625, + "logps/rejected": -593.5106201171875, + "loss": 0.0675, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2701892852783203, + "rewards/margins": 0.0922120064496994, + "rewards/rejected": -0.3624013364315033, + "step": 1130 + }, + { + "epoch": 0.39, + "learning_rate": 3.819296112530448e-06, + "logits/chosen": -1.885154128074646, + "logits/rejected": -1.9763774871826172, + "logps/chosen": -534.5982055664062, + "logps/rejected": -644.5667114257812, + "loss": 0.103, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3009013533592224, + "rewards/margins": 0.07459478080272675, + "rewards/rejected": -0.37549614906311035, + "step": 1140 + }, + { + "epoch": 0.39, + "learning_rate": 3.7937876696988505e-06, + "logits/chosen": -2.228935718536377, + "logits/rejected": -1.992790937423706, + "logps/chosen": -595.5947265625, + "logps/rejected": -676.4202880859375, + "loss": 0.0764, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.33396145701408386, + "rewards/margins": 0.09032727777957916, + "rewards/rejected": -0.4242887496948242, + "step": 1150 + }, + { + "epoch": 0.4, + "learning_rate": 3.7680939168254733e-06, + "logits/chosen": -2.148974657058716, + "logits/rejected": -1.8327066898345947, + "logps/chosen": -651.0219116210938, + "logps/rejected": -701.9970092773438, + "loss": 0.0987, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37643447518348694, + "rewards/margins": 0.08819916099309921, + "rewards/rejected": -0.46463364362716675, + "step": 1160 + }, + { + "epoch": 0.4, + "learning_rate": 3.7422185340430983e-06, + "logits/chosen": -2.2028675079345703, + "logits/rejected": -2.0429883003234863, + "logps/chosen": -584.5667114257812, + "logps/rejected": -641.1693115234375, + "loss": 0.0937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34211236238479614, + "rewards/margins": 0.09406879544258118, + "rewards/rejected": -0.4361811578273773, + "step": 1170 + }, + { + "epoch": 0.4, + "learning_rate": 3.71616522749948e-06, + "logits/chosen": -2.465292453765869, + "logits/rejected": -2.052821159362793, + "logps/chosen": -604.7883911132812, + "logps/rejected": -693.3052978515625, + "loss": 0.0774, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.33726420998573303, + "rewards/margins": 0.13398997485637665, + "rewards/rejected": -0.4712541103363037, + "step": 1180 + }, + { + "epoch": 0.41, + "learning_rate": 3.6899377288265043e-06, + "logits/chosen": -2.0992684364318848, + "logits/rejected": -1.9688608646392822, + "logps/chosen": -596.2180786132812, + "logps/rejected": -685.1240844726562, + "loss": 0.0809, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3490615487098694, + "rewards/margins": 0.11136557906866074, + "rewards/rejected": -0.46042710542678833, + "step": 1190 + }, + { + "epoch": 0.41, + "learning_rate": 3.6635397946057114e-06, + "logits/chosen": -2.260376453399658, + "logits/rejected": -1.8176358938217163, + "logps/chosen": -605.2567138671875, + "logps/rejected": -649.65673828125, + "loss": 0.0525, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3633373975753784, + "rewards/margins": 0.11985437572002411, + "rewards/rejected": -0.4831917881965637, + "step": 1200 + }, + { + "epoch": 0.41, + "learning_rate": 3.6369752058302327e-06, + "logits/chosen": -2.275251865386963, + "logits/rejected": -1.9965429306030273, + "logps/chosen": -539.2314453125, + "logps/rejected": -636.8073120117188, + "loss": 0.0995, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3215656876564026, + "rewards/margins": 0.13287541270256042, + "rewards/rejected": -0.4544410705566406, + "step": 1210 + }, + { + "epoch": 0.42, + "learning_rate": 3.610247767363239e-06, + "logits/chosen": -1.9407621622085571, + "logits/rejected": -1.8752870559692383, + "logps/chosen": -587.7548828125, + "logps/rejected": -671.2868041992188, + "loss": 0.0756, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3427480161190033, + "rewards/margins": 0.06522272527217865, + "rewards/rejected": -0.40797075629234314, + "step": 1220 + }, + { + "epoch": 0.42, + "learning_rate": 3.5833613073929684e-06, + "logits/chosen": -2.1943631172180176, + "logits/rejected": -1.859442114830017, + "logps/chosen": -558.7838134765625, + "logps/rejected": -717.7572021484375, + "loss": 0.0603, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3341369032859802, + "rewards/margins": 0.13402590155601501, + "rewards/rejected": -0.46816277503967285, + "step": 1230 + }, + { + "epoch": 0.43, + "learning_rate": 3.55631967688441e-06, + "logits/chosen": -2.1805663108825684, + "logits/rejected": -1.8173482418060303, + "logps/chosen": -807.4880981445312, + "logps/rejected": -837.0833740234375, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45633822679519653, + "rewards/margins": 0.08246854692697525, + "rewards/rejected": -0.5388067364692688, + "step": 1240 + }, + { + "epoch": 0.43, + "learning_rate": 3.5291267490277316e-06, + "logits/chosen": -1.8296005725860596, + "logits/rejected": -1.6457901000976562, + "logps/chosen": -621.5354614257812, + "logps/rejected": -701.6083374023438, + "loss": 0.1176, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3704153597354889, + "rewards/margins": 0.11859778314828873, + "rewards/rejected": -0.48901304602622986, + "step": 1250 + }, + { + "epoch": 0.43, + "learning_rate": 3.501786418683515e-06, + "logits/chosen": -2.0369582176208496, + "logits/rejected": -1.9080692529678345, + "logps/chosen": -765.410400390625, + "logps/rejected": -825.2081298828125, + "loss": 0.0802, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5283640623092651, + "rewards/margins": 0.08725164830684662, + "rewards/rejected": -0.615615725517273, + "step": 1260 + }, + { + "epoch": 0.44, + "learning_rate": 3.474302601824896e-06, + "logits/chosen": -2.2294676303863525, + "logits/rejected": -1.7623846530914307, + "logps/chosen": -824.7463989257812, + "logps/rejected": -865.5594482421875, + "loss": 0.104, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5527879595756531, + "rewards/margins": 0.07611383497714996, + "rewards/rejected": -0.6289017796516418, + "step": 1270 + }, + { + "epoch": 0.44, + "learning_rate": 3.4466792349766767e-06, + "logits/chosen": -2.3877675533294678, + "logits/rejected": -2.136277198791504, + "logps/chosen": -631.8321533203125, + "logps/rejected": -653.0026245117188, + "loss": 0.0999, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3877051770687103, + "rewards/margins": 0.06024375557899475, + "rewards/rejected": -0.4479489326477051, + "step": 1280 + }, + { + "epoch": 0.44, + "learning_rate": 3.4189202746514938e-06, + "logits/chosen": -2.127175807952881, + "logits/rejected": -1.8073135614395142, + "logps/chosen": -673.6165771484375, + "logps/rejected": -750.7117309570312, + "loss": 0.0823, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4296782910823822, + "rewards/margins": 0.1071913093328476, + "rewards/rejected": -0.5368696451187134, + "step": 1290 + }, + { + "epoch": 0.45, + "learning_rate": 3.391029696783127e-06, + "logits/chosen": -1.9842946529388428, + "logits/rejected": -1.5577259063720703, + "logps/chosen": -650.3092041015625, + "logps/rejected": -740.8306274414062, + "loss": 0.0833, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4263342022895813, + "rewards/margins": 0.12290897220373154, + "rewards/rejected": -0.549243152141571, + "step": 1300 + }, + { + "epoch": 0.45, + "learning_rate": 3.3630114961570187e-06, + "logits/chosen": -2.326686382293701, + "logits/rejected": -1.8404957056045532, + "logps/chosen": -725.4781494140625, + "logps/rejected": -822.6609497070312, + "loss": 0.0923, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4563368260860443, + "rewards/margins": 0.14041298627853394, + "rewards/rejected": -0.5967497825622559, + "step": 1310 + }, + { + "epoch": 0.45, + "learning_rate": 3.3348696858381023e-06, + "logits/chosen": -2.081413984298706, + "logits/rejected": -1.8651702404022217, + "logps/chosen": -677.7269897460938, + "logps/rejected": -756.968505859375, + "loss": 0.0801, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4635470509529114, + "rewards/margins": 0.09350712597370148, + "rewards/rejected": -0.5570541024208069, + "step": 1320 + }, + { + "epoch": 0.46, + "learning_rate": 3.3066082965960082e-06, + "logits/chosen": -2.1301093101501465, + "logits/rejected": -2.035060405731201, + "logps/chosen": -712.52685546875, + "logps/rejected": -769.6886596679688, + "loss": 0.0568, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4348185062408447, + "rewards/margins": 0.09993582218885422, + "rewards/rejected": -0.5347543954849243, + "step": 1330 + }, + { + "epoch": 0.46, + "learning_rate": 3.278231376327731e-06, + "logits/chosen": -2.1865429878234863, + "logits/rejected": -1.7728700637817383, + "logps/chosen": -692.3992919921875, + "logps/rejected": -818.3483276367188, + "loss": 0.0822, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4515460431575775, + "rewards/margins": 0.13385489583015442, + "rewards/rejected": -0.5854009985923767, + "step": 1340 + }, + { + "epoch": 0.46, + "learning_rate": 3.249742989477851e-06, + "logits/chosen": -2.199068546295166, + "logits/rejected": -1.866813063621521, + "logps/chosen": -817.2093505859375, + "logps/rejected": -920.2403564453125, + "loss": 0.0511, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5562337040901184, + "rewards/margins": 0.11897413432598114, + "rewards/rejected": -0.6752079129219055, + "step": 1350 + }, + { + "epoch": 0.47, + "learning_rate": 3.2211472164563756e-06, + "logits/chosen": -2.109049081802368, + "logits/rejected": -1.7997972965240479, + "logps/chosen": -705.8816528320312, + "logps/rejected": -768.1865234375, + "loss": 0.095, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4462059438228607, + "rewards/margins": 0.09233604371547699, + "rewards/rejected": -0.5385419130325317, + "step": 1360 + }, + { + "epoch": 0.47, + "learning_rate": 3.192448153054306e-06, + "logits/chosen": -2.2047770023345947, + "logits/rejected": -1.8513100147247314, + "logps/chosen": -771.5567626953125, + "logps/rejected": -856.609375, + "loss": 0.0693, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5159443616867065, + "rewards/margins": 0.11039619147777557, + "rewards/rejected": -0.6263405680656433, + "step": 1370 + }, + { + "epoch": 0.47, + "learning_rate": 3.16364990985699e-06, + "logits/chosen": -2.357393980026245, + "logits/rejected": -1.782231330871582, + "logps/chosen": -714.89404296875, + "logps/rejected": -804.54150390625, + "loss": 0.0874, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.446491539478302, + "rewards/margins": 0.16014492511749268, + "rewards/rejected": -0.6066364645957947, + "step": 1380 + }, + { + "epoch": 0.48, + "learning_rate": 3.134756611655362e-06, + "logits/chosen": -2.3336434364318848, + "logits/rejected": -2.126812219619751, + "logps/chosen": -558.0953369140625, + "logps/rejected": -690.7258911132812, + "loss": 0.0954, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.36971205472946167, + "rewards/margins": 0.10154370963573456, + "rewards/rejected": -0.4712557792663574, + "step": 1390 + }, + { + "epoch": 0.48, + "learning_rate": 3.1057723968551427e-06, + "logits/chosen": -2.052511215209961, + "logits/rejected": -1.6268789768218994, + "logps/chosen": -709.0806884765625, + "logps/rejected": -791.6183471679688, + "loss": 0.0784, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.45772653818130493, + "rewards/margins": 0.14719423651695251, + "rewards/rejected": -0.6049207448959351, + "step": 1400 + }, + { + "epoch": 0.48, + "learning_rate": 3.0767014168841e-06, + "logits/chosen": -2.0302186012268066, + "logits/rejected": -1.9973528385162354, + "logps/chosen": -628.5819091796875, + "logps/rejected": -709.375, + "loss": 0.0716, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34966421127319336, + "rewards/margins": 0.07439250499010086, + "rewards/rejected": -0.4240567088127136, + "step": 1410 + }, + { + "epoch": 0.49, + "learning_rate": 3.047547835597432e-06, + "logits/chosen": -1.909949541091919, + "logits/rejected": -1.8792356252670288, + "logps/chosen": -582.7291259765625, + "logps/rejected": -683.0811767578125, + "loss": 0.0874, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3829612135887146, + "rewards/margins": 0.07136234641075134, + "rewards/rejected": -0.45432358980178833, + "step": 1420 + }, + { + "epoch": 0.49, + "learning_rate": 3.0183158286813755e-06, + "logits/chosen": -2.278263568878174, + "logits/rejected": -1.8273910284042358, + "logps/chosen": -655.0009765625, + "logps/rejected": -684.890625, + "loss": 0.075, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3779233396053314, + "rewards/margins": 0.10793854296207428, + "rewards/rejected": -0.4858619272708893, + "step": 1430 + }, + { + "epoch": 0.49, + "learning_rate": 2.989009583055121e-06, + "logits/chosen": -2.129441022872925, + "logits/rejected": -1.9885050058364868, + "logps/chosen": -765.9420166015625, + "logps/rejected": -878.65576171875, + "loss": 0.0642, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47016677260398865, + "rewards/margins": 0.11287762969732285, + "rewards/rejected": -0.5830444097518921, + "step": 1440 + }, + { + "epoch": 0.5, + "learning_rate": 2.959633296271117e-06, + "logits/chosen": -2.113435983657837, + "logits/rejected": -1.8716232776641846, + "logps/chosen": -611.4641723632812, + "logps/rejected": -689.0220336914062, + "loss": 0.0852, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4303819537162781, + "rewards/margins": 0.09832239896059036, + "rewards/rejected": -0.5287044048309326, + "step": 1450 + }, + { + "epoch": 0.5, + "learning_rate": 2.9301911759138535e-06, + "logits/chosen": -2.1812188625335693, + "logits/rejected": -1.9601389169692993, + "logps/chosen": -618.2063598632812, + "logps/rejected": -735.6884155273438, + "loss": 0.0752, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4212234914302826, + "rewards/margins": 0.11810547113418579, + "rewards/rejected": -0.539328932762146, + "step": 1460 + }, + { + "epoch": 0.5, + "learning_rate": 2.900687438997205e-06, + "logits/chosen": -2.0657143592834473, + "logits/rejected": -1.8039214611053467, + "logps/chosen": -632.2223510742188, + "logps/rejected": -755.59228515625, + "loss": 0.0795, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4010123312473297, + "rewards/margins": 0.09653668105602264, + "rewards/rejected": -0.49754899740219116, + "step": 1470 + }, + { + "epoch": 0.51, + "learning_rate": 2.871126311360424e-06, + "logits/chosen": -2.4564061164855957, + "logits/rejected": -1.8799717426300049, + "logps/chosen": -642.9153442382812, + "logps/rejected": -700.4526977539062, + "loss": 0.08, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.341757595539093, + "rewards/margins": 0.12287576496601105, + "rewards/rejected": -0.46463337540626526, + "step": 1480 + }, + { + "epoch": 0.51, + "learning_rate": 2.8415120270628756e-06, + "logits/chosen": -2.2899577617645264, + "logits/rejected": -1.8913564682006836, + "logps/chosen": -625.5499877929688, + "logps/rejected": -751.3653564453125, + "loss": 0.059, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.34753698110580444, + "rewards/margins": 0.17222611606121063, + "rewards/rejected": -0.5197631120681763, + "step": 1490 + }, + { + "epoch": 0.51, + "learning_rate": 2.8118488277775852e-06, + "logits/chosen": -2.1799449920654297, + "logits/rejected": -2.2029454708099365, + "logps/chosen": -541.7261352539062, + "logps/rejected": -652.3367919921875, + "loss": 0.0522, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3303873538970947, + "rewards/margins": 0.08029700815677643, + "rewards/rejected": -0.4106842875480652, + "step": 1500 + }, + { + "epoch": 0.52, + "learning_rate": 2.7821409621837042e-06, + "logits/chosen": -2.434246063232422, + "logits/rejected": -1.9443886280059814, + "logps/chosen": -614.01220703125, + "logps/rejected": -698.8470458984375, + "loss": 0.048, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.36497288942337036, + "rewards/margins": 0.10476745665073395, + "rewards/rejected": -0.4697403311729431, + "step": 1510 + }, + { + "epoch": 0.52, + "learning_rate": 2.7523926853579702e-06, + "logits/chosen": -2.0561671257019043, + "logits/rejected": -1.7254194021224976, + "logps/chosen": -624.192626953125, + "logps/rejected": -757.9207763671875, + "loss": 0.0956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4151741564273834, + "rewards/margins": 0.1256508082151413, + "rewards/rejected": -0.5408250093460083, + "step": 1520 + }, + { + "epoch": 0.52, + "learning_rate": 2.722608258165244e-06, + "logits/chosen": -2.2351367473602295, + "logits/rejected": -1.6919574737548828, + "logps/chosen": -727.2100830078125, + "logps/rejected": -801.911865234375, + "loss": 0.094, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.43864089250564575, + "rewards/margins": 0.1414739489555359, + "rewards/rejected": -0.5801147818565369, + "step": 1530 + }, + { + "epoch": 0.53, + "learning_rate": 2.6927919466482293e-06, + "logits/chosen": -2.0343658924102783, + "logits/rejected": -1.7758781909942627, + "logps/chosen": -717.9749755859375, + "logps/rejected": -791.5612182617188, + "loss": 0.0753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.501410186290741, + "rewards/margins": 0.08459311723709106, + "rewards/rejected": -0.5860033631324768, + "step": 1540 + }, + { + "epoch": 0.53, + "learning_rate": 2.662948021416441e-06, + "logits/chosen": -2.232266426086426, + "logits/rejected": -2.0454351902008057, + "logps/chosen": -727.7106323242188, + "logps/rejected": -814.9327392578125, + "loss": 0.0557, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.46008116006851196, + "rewards/margins": 0.10495243221521378, + "rewards/rejected": -0.5650335550308228, + "step": 1550 + }, + { + "epoch": 0.53, + "learning_rate": 2.6330807570345253e-06, + "logits/chosen": -2.1601340770721436, + "logits/rejected": -1.9538816213607788, + "logps/chosen": -685.18896484375, + "logps/rejected": -795.1231689453125, + "loss": 0.0791, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4878101348876953, + "rewards/margins": 0.1159614771604538, + "rewards/rejected": -0.6037715673446655, + "step": 1560 + }, + { + "epoch": 0.54, + "learning_rate": 2.6031944314100077e-06, + "logits/chosen": -2.442682981491089, + "logits/rejected": -2.2220892906188965, + "logps/chosen": -678.3485717773438, + "logps/rejected": -837.5185546875, + "loss": 0.0906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4468511641025543, + "rewards/margins": 0.12742993235588074, + "rewards/rejected": -0.5742811560630798, + "step": 1570 + }, + { + "epoch": 0.54, + "learning_rate": 2.5732933251805716e-06, + "logits/chosen": -2.238412380218506, + "logits/rejected": -2.0841736793518066, + "logps/chosen": -718.62060546875, + "logps/rejected": -867.1959228515625, + "loss": 0.0541, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4921676218509674, + "rewards/margins": 0.10790624469518661, + "rewards/rejected": -0.6000738143920898, + "step": 1580 + }, + { + "epoch": 0.54, + "learning_rate": 2.543381721100931e-06, + "logits/chosen": -2.335407257080078, + "logits/rejected": -2.028334140777588, + "logps/chosen": -614.9449462890625, + "logps/rejected": -788.1812133789062, + "loss": 0.0811, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.41624245047569275, + "rewards/margins": 0.17170578241348267, + "rewards/rejected": -0.587948203086853, + "step": 1590 + }, + { + "epoch": 0.55, + "learning_rate": 2.513463903429418e-06, + "logits/chosen": -2.4693076610565186, + "logits/rejected": -2.0430703163146973, + "logps/chosen": -725.3846435546875, + "logps/rejected": -755.0938720703125, + "loss": 0.1021, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4062575697898865, + "rewards/margins": 0.10169617831707001, + "rewards/rejected": -0.5079537630081177, + "step": 1600 + }, + { + "epoch": 0.55, + "learning_rate": 2.483544157314338e-06, + "logits/chosen": -2.293912410736084, + "logits/rejected": -1.9852508306503296, + "logps/chosen": -639.7050170898438, + "logps/rejected": -763.0711059570312, + "loss": 0.0737, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4078396260738373, + "rewards/margins": 0.1239791288971901, + "rewards/rejected": -0.5318187475204468, + "step": 1610 + }, + { + "epoch": 0.56, + "learning_rate": 2.453626768180214e-06, + "logits/chosen": -2.122490167617798, + "logits/rejected": -1.8506181240081787, + "logps/chosen": -720.9524536132812, + "logps/rejected": -728.7947998046875, + "loss": 0.1032, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4711574912071228, + "rewards/margins": 0.0486263632774353, + "rewards/rejected": -0.5197838544845581, + "step": 1620 + }, + { + "epoch": 0.56, + "learning_rate": 2.4237160211139697e-06, + "logits/chosen": -2.0483882427215576, + "logits/rejected": -1.869739294052124, + "logps/chosen": -622.5816650390625, + "logps/rejected": -701.8380126953125, + "loss": 0.0577, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.41618186235427856, + "rewards/margins": 0.08069188892841339, + "rewards/rejected": -0.49687376618385315, + "step": 1630 + }, + { + "epoch": 0.56, + "learning_rate": 2.393816200251187e-06, + "logits/chosen": -2.0031468868255615, + "logits/rejected": -1.54337739944458, + "logps/chosen": -700.4393310546875, + "logps/rejected": -752.5362548828125, + "loss": 0.0727, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4837239384651184, + "rewards/margins": 0.10101475566625595, + "rewards/rejected": -0.584738552570343, + "step": 1640 + }, + { + "epoch": 0.57, + "learning_rate": 2.3639315881624776e-06, + "logits/chosen": -2.3299944400787354, + "logits/rejected": -1.946599006652832, + "logps/chosen": -644.5682373046875, + "logps/rejected": -742.0753173828125, + "loss": 0.0999, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.44712233543395996, + "rewards/margins": 0.09644552320241928, + "rewards/rejected": -0.5435678362846375, + "step": 1650 + }, + { + "epoch": 0.57, + "learning_rate": 2.334066465240093e-06, + "logits/chosen": -2.055642604827881, + "logits/rejected": -1.5556762218475342, + "logps/chosen": -778.634033203125, + "logps/rejected": -831.9388427734375, + "loss": 0.0767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4928356111049652, + "rewards/margins": 0.1221829205751419, + "rewards/rejected": -0.6150184869766235, + "step": 1660 + }, + { + "epoch": 0.57, + "learning_rate": 2.3042251090848357e-06, + "logits/chosen": -2.259159564971924, + "logits/rejected": -1.7662973403930664, + "logps/chosen": -643.4910888671875, + "logps/rejected": -777.8291015625, + "loss": 0.0825, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4092964231967926, + "rewards/margins": 0.1640159636735916, + "rewards/rejected": -0.5733123421669006, + "step": 1670 + }, + { + "epoch": 0.58, + "learning_rate": 2.2744117938933814e-06, + "logits/chosen": -2.2976372241973877, + "logits/rejected": -1.9439456462860107, + "logps/chosen": -806.0792236328125, + "logps/rejected": -835.6940307617188, + "loss": 0.0729, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.49456721544265747, + "rewards/margins": 0.07213100790977478, + "rewards/rejected": -0.5666981935501099, + "step": 1680 + }, + { + "epoch": 0.58, + "learning_rate": 2.2446307898460807e-06, + "logits/chosen": -2.1391043663024902, + "logits/rejected": -1.7574889659881592, + "logps/chosen": -778.1689453125, + "logps/rejected": -875.1658935546875, + "loss": 0.0758, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5087161064147949, + "rewards/margins": 0.12866072356700897, + "rewards/rejected": -0.6373767852783203, + "step": 1690 + }, + { + "epoch": 0.58, + "learning_rate": 2.2148863624953364e-06, + "logits/chosen": -2.1262030601501465, + "logits/rejected": -1.8642327785491943, + "logps/chosen": -721.8367919921875, + "logps/rejected": -874.4264526367188, + "loss": 0.0491, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48331218957901, + "rewards/margins": 0.12461258471012115, + "rewards/rejected": -0.6079246997833252, + "step": 1700 + }, + { + "epoch": 0.59, + "learning_rate": 2.1851827721546483e-06, + "logits/chosen": -2.0042014122009277, + "logits/rejected": -1.7912979125976562, + "logps/chosen": -779.1425170898438, + "logps/rejected": -934.787109375, + "loss": 0.046, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5522741079330444, + "rewards/margins": 0.15596961975097656, + "rewards/rejected": -0.7082436680793762, + "step": 1710 + }, + { + "epoch": 0.59, + "learning_rate": 2.155524273288405e-06, + "logits/chosen": -2.2937986850738525, + "logits/rejected": -1.780461072921753, + "logps/chosen": -812.2808837890625, + "logps/rejected": -918.3689575195312, + "loss": 0.0805, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4778391420841217, + "rewards/margins": 0.1493079662322998, + "rewards/rejected": -0.6271471381187439, + "step": 1720 + }, + { + "epoch": 0.59, + "learning_rate": 2.125915113902514e-06, + "logits/chosen": -2.125365734100342, + "logits/rejected": -1.9615271091461182, + "logps/chosen": -657.5159912109375, + "logps/rejected": -758.6436767578125, + "loss": 0.0946, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.42783278226852417, + "rewards/margins": 0.09773501008749008, + "rewards/rejected": -0.5255678296089172, + "step": 1730 + }, + { + "epoch": 0.6, + "learning_rate": 2.096359534935958e-06, + "logits/chosen": -1.965488076210022, + "logits/rejected": -1.7874730825424194, + "logps/chosen": -719.5172119140625, + "logps/rejected": -838.3024291992188, + "loss": 0.0825, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.520493745803833, + "rewards/margins": 0.09665954858064651, + "rewards/rejected": -0.6171532869338989, + "step": 1740 + }, + { + "epoch": 0.6, + "learning_rate": 2.0668617696533603e-06, + "logits/chosen": -2.1595165729522705, + "logits/rejected": -1.795940637588501, + "logps/chosen": -750.8433227539062, + "logps/rejected": -791.1815795898438, + "loss": 0.0898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5119301080703735, + "rewards/margins": 0.08175458759069443, + "rewards/rejected": -0.5936846137046814, + "step": 1750 + }, + { + "epoch": 0.6, + "learning_rate": 2.0374260430386542e-06, + "logits/chosen": -2.072263717651367, + "logits/rejected": -1.8800386190414429, + "logps/chosen": -735.1260986328125, + "logps/rejected": -801.7227172851562, + "loss": 0.0447, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49476614594459534, + "rewards/margins": 0.09266269207000732, + "rewards/rejected": -0.5874288082122803, + "step": 1760 + }, + { + "epoch": 0.61, + "learning_rate": 2.0080565711899327e-06, + "logits/chosen": -1.9065930843353271, + "logits/rejected": -1.6727092266082764, + "logps/chosen": -684.2578125, + "logps/rejected": -754.9581298828125, + "loss": 0.0742, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.47697681188583374, + "rewards/margins": 0.08060398697853088, + "rewards/rejected": -0.5575807690620422, + "step": 1770 + }, + { + "epoch": 0.61, + "learning_rate": 1.978757560715579e-06, + "logits/chosen": -2.1670429706573486, + "logits/rejected": -1.99936842918396, + "logps/chosen": -735.7562866210938, + "logps/rejected": -859.4212036132812, + "loss": 0.0952, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4949742257595062, + "rewards/margins": 0.13264210522174835, + "rewards/rejected": -0.627616286277771, + "step": 1780 + }, + { + "epoch": 0.61, + "learning_rate": 1.9495332081317466e-06, + "logits/chosen": -2.0642776489257812, + "logits/rejected": -1.8938987255096436, + "logps/chosen": -819.5340576171875, + "logps/rejected": -883.53564453125, + "loss": 0.0872, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5653868317604065, + "rewards/margins": 0.07868463546037674, + "rewards/rejected": -0.6440714597702026, + "step": 1790 + }, + { + "epoch": 0.62, + "learning_rate": 1.9203876992612904e-06, + "logits/chosen": -2.0933823585510254, + "logits/rejected": -1.7218735218048096, + "logps/chosen": -661.1719360351562, + "logps/rejected": -781.9850463867188, + "loss": 0.1006, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43831753730773926, + "rewards/margins": 0.16069361567497253, + "rewards/rejected": -0.5990111231803894, + "step": 1800 + }, + { + "epoch": 0.62, + "learning_rate": 1.891325208634231e-06, + "logits/chosen": -2.287635087966919, + "logits/rejected": -1.9859931468963623, + "logps/chosen": -612.9700927734375, + "logps/rejected": -735.92919921875, + "loss": 0.0608, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4461449682712555, + "rewards/margins": 0.13415846228599548, + "rewards/rejected": -0.580303430557251, + "step": 1810 + }, + { + "epoch": 0.62, + "learning_rate": 1.8623498988898309e-06, + "logits/chosen": -2.2971930503845215, + "logits/rejected": -1.8358027935028076, + "logps/chosen": -716.4727172851562, + "logps/rejected": -849.65869140625, + "loss": 0.0872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47384414076805115, + "rewards/margins": 0.13949860632419586, + "rewards/rejected": -0.6133427023887634, + "step": 1820 + }, + { + "epoch": 0.63, + "learning_rate": 1.83346592018038e-06, + "logits/chosen": -2.231283664703369, + "logits/rejected": -1.9385484457015991, + "logps/chosen": -650.612060546875, + "logps/rejected": -773.4234008789062, + "loss": 0.1013, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4305783212184906, + "rewards/margins": 0.13019639253616333, + "rewards/rejected": -0.5607747435569763, + "step": 1830 + }, + { + "epoch": 0.63, + "learning_rate": 1.8046774095767652e-06, + "logits/chosen": -2.3146252632141113, + "logits/rejected": -2.038005828857422, + "logps/chosen": -650.5593872070312, + "logps/rejected": -713.4359130859375, + "loss": 0.0792, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4199472963809967, + "rewards/margins": 0.11954204738140106, + "rewards/rejected": -0.5394893884658813, + "step": 1840 + }, + { + "epoch": 0.63, + "learning_rate": 1.775988490475914e-06, + "logits/chosen": -1.9583438634872437, + "logits/rejected": -1.9360641241073608, + "logps/chosen": -607.3677978515625, + "logps/rejected": -768.2495727539062, + "loss": 0.0828, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4246648848056793, + "rewards/margins": 0.1216825619339943, + "rewards/rejected": -0.546347439289093, + "step": 1850 + }, + { + "epoch": 0.64, + "learning_rate": 1.7474032720101991e-06, + "logits/chosen": -2.302241802215576, + "logits/rejected": -2.053952932357788, + "logps/chosen": -578.2738647460938, + "logps/rejected": -709.3789672851562, + "loss": 0.0998, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3617783188819885, + "rewards/margins": 0.12743337452411652, + "rewards/rejected": -0.48921164870262146, + "step": 1860 + }, + { + "epoch": 0.64, + "learning_rate": 1.7189258484588853e-06, + "logits/chosen": -2.295841932296753, + "logits/rejected": -1.8487634658813477, + "logps/chosen": -805.4288330078125, + "logps/rejected": -849.3688354492188, + "loss": 0.0616, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5005868673324585, + "rewards/margins": 0.10474538803100586, + "rewards/rejected": -0.6053322553634644, + "step": 1870 + }, + { + "epoch": 0.64, + "learning_rate": 1.6905602986617006e-06, + "logits/chosen": -2.2122347354888916, + "logits/rejected": -1.8240172863006592, + "logps/chosen": -648.9810791015625, + "logps/rejected": -798.175048828125, + "loss": 0.0807, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4233244061470032, + "rewards/margins": 0.15655961632728577, + "rewards/rejected": -0.5798839926719666, + "step": 1880 + }, + { + "epoch": 0.65, + "learning_rate": 1.662310685434625e-06, + "logits/chosen": -2.3458707332611084, + "logits/rejected": -2.2734622955322266, + "logps/chosen": -647.6605224609375, + "logps/rejected": -782.6307373046875, + "loss": 0.073, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3885071873664856, + "rewards/margins": 0.09876324236392975, + "rewards/rejected": -0.48727044463157654, + "step": 1890 + }, + { + "epoch": 0.65, + "learning_rate": 1.6341810549879666e-06, + "logits/chosen": -2.344203233718872, + "logits/rejected": -2.0581679344177246, + "logps/chosen": -571.4796752929688, + "logps/rejected": -596.5723876953125, + "loss": 0.0721, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.35855913162231445, + "rewards/margins": 0.06313261389732361, + "rewards/rejected": -0.4216917157173157, + "step": 1900 + }, + { + "epoch": 0.65, + "learning_rate": 1.6061754363468255e-06, + "logits/chosen": -2.259507894515991, + "logits/rejected": -2.1293258666992188, + "logps/chosen": -650.2342529296875, + "logps/rejected": -721.8099365234375, + "loss": 0.0816, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3676004707813263, + "rewards/margins": 0.07684332877397537, + "rewards/rejected": -0.44444379210472107, + "step": 1910 + }, + { + "epoch": 0.66, + "learning_rate": 1.5782978407740087e-06, + "logits/chosen": -2.028473138809204, + "logits/rejected": -2.022217273712158, + "logps/chosen": -670.6177368164062, + "logps/rejected": -752.2406616210938, + "loss": 0.0834, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44136539101600647, + "rewards/margins": 0.08413775265216827, + "rewards/rejected": -0.5255030393600464, + "step": 1920 + }, + { + "epoch": 0.66, + "learning_rate": 1.5505522611954977e-06, + "logits/chosen": -2.2423367500305176, + "logits/rejected": -1.7249571084976196, + "logps/chosen": -652.0220947265625, + "logps/rejected": -791.6546020507812, + "loss": 0.1056, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.42029309272766113, + "rewards/margins": 0.17177040874958038, + "rewards/rejected": -0.5920634865760803, + "step": 1930 + }, + { + "epoch": 0.66, + "learning_rate": 1.522942671628537e-06, + "logits/chosen": -2.2815046310424805, + "logits/rejected": -2.0123918056488037, + "logps/chosen": -581.7095336914062, + "logps/rejected": -690.0474853515625, + "loss": 0.0764, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.37874311208724976, + "rewards/margins": 0.09943266212940216, + "rewards/rejected": -0.4781757891178131, + "step": 1940 + }, + { + "epoch": 0.67, + "learning_rate": 1.495473026612435e-06, + "logits/chosen": -2.2578682899475098, + "logits/rejected": -1.8968786001205444, + "logps/chosen": -684.0667724609375, + "logps/rejected": -732.0203247070312, + "loss": 0.0857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43941551446914673, + "rewards/margins": 0.09977763891220093, + "rewards/rejected": -0.5391931533813477, + "step": 1950 + }, + { + "epoch": 0.67, + "learning_rate": 1.4681472606421512e-06, + "logits/chosen": -2.2577121257781982, + "logits/rejected": -1.9029508829116821, + "logps/chosen": -682.9171142578125, + "logps/rejected": -752.029296875, + "loss": 0.0774, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46012359857559204, + "rewards/margins": 0.10639479011297226, + "rewards/rejected": -0.5665184259414673, + "step": 1960 + }, + { + "epoch": 0.68, + "learning_rate": 1.4409692876047582e-06, + "logits/chosen": -2.3715949058532715, + "logits/rejected": -1.9981542825698853, + "logps/chosen": -686.21630859375, + "logps/rejected": -765.2901611328125, + "loss": 0.0823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.44159477949142456, + "rewards/margins": 0.10385797917842865, + "rewards/rejected": -0.5454527735710144, + "step": 1970 + }, + { + "epoch": 0.68, + "learning_rate": 1.4139430002188486e-06, + "logits/chosen": -2.1407675743103027, + "logits/rejected": -2.0084660053253174, + "logps/chosen": -565.1947631835938, + "logps/rejected": -623.761474609375, + "loss": 0.0826, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.380687952041626, + "rewards/margins": 0.06562694162130356, + "rewards/rejected": -0.44631490111351013, + "step": 1980 + }, + { + "epoch": 0.68, + "learning_rate": 1.3870722694769858e-06, + "logits/chosen": -2.301060199737549, + "logits/rejected": -2.099889039993286, + "logps/chosen": -667.0670166015625, + "logps/rejected": -778.9105834960938, + "loss": 0.0774, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.43501314520835876, + "rewards/margins": 0.11666470766067505, + "rewards/rejected": -0.5516778230667114, + "step": 1990 + }, + { + "epoch": 0.69, + "learning_rate": 1.3603609440912508e-06, + "logits/chosen": -2.1625216007232666, + "logits/rejected": -2.026824951171875, + "logps/chosen": -466.3900451660156, + "logps/rejected": -600.93896484375, + "loss": 0.1211, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3264867663383484, + "rewards/margins": 0.1292310655117035, + "rewards/rejected": -0.4557178020477295, + "step": 2000 + }, + { + "epoch": 0.69, + "learning_rate": 1.3338128499419925e-06, + "logits/chosen": -2.277644634246826, + "logits/rejected": -1.7413593530654907, + "logps/chosen": -672.1463623046875, + "logps/rejected": -718.1644287109375, + "loss": 0.0756, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4492368698120117, + "rewards/margins": 0.10236699879169464, + "rewards/rejected": -0.5516039133071899, + "step": 2010 + }, + { + "epoch": 0.69, + "learning_rate": 1.3074317895298492e-06, + "logits/chosen": -2.2224433422088623, + "logits/rejected": -1.9981178045272827, + "logps/chosen": -816.6317138671875, + "logps/rejected": -843.4306640625, + "loss": 0.0589, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.539585292339325, + "rewards/margins": 0.07050606608390808, + "rewards/rejected": -0.6100913882255554, + "step": 2020 + }, + { + "epoch": 0.7, + "learning_rate": 1.2812215414311036e-06, + "logits/chosen": -2.049561023712158, + "logits/rejected": -1.8878101110458374, + "logps/chosen": -747.5322265625, + "logps/rejected": -847.0015869140625, + "loss": 0.0788, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5520612597465515, + "rewards/margins": 0.08974529802799225, + "rewards/rejected": -0.6418064832687378, + "step": 2030 + }, + { + "epoch": 0.7, + "learning_rate": 1.2551858597564859e-06, + "logits/chosen": -2.118635654449463, + "logits/rejected": -1.993412971496582, + "logps/chosen": -758.1939697265625, + "logps/rejected": -849.9517822265625, + "loss": 0.0765, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5349586009979248, + "rewards/margins": 0.06833256781101227, + "rewards/rejected": -0.6032911539077759, + "step": 2040 + }, + { + "epoch": 0.7, + "learning_rate": 1.2293284736134605e-06, + "logits/chosen": -2.226203441619873, + "logits/rejected": -2.0049796104431152, + "logps/chosen": -663.0206298828125, + "logps/rejected": -744.8146362304688, + "loss": 0.0652, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4578720033168793, + "rewards/margins": 0.0849481076002121, + "rewards/rejected": -0.5428200960159302, + "step": 2050 + }, + { + "epoch": 0.71, + "learning_rate": 1.2036530865721115e-06, + "logits/chosen": -2.1977055072784424, + "logits/rejected": -1.923651099205017, + "logps/chosen": -752.8314208984375, + "logps/rejected": -866.5847778320312, + "loss": 0.0559, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5004793405532837, + "rewards/margins": 0.11119532585144043, + "rewards/rejected": -0.6116746664047241, + "step": 2060 + }, + { + "epoch": 0.71, + "learning_rate": 1.178163376134671e-06, + "logits/chosen": -2.203216552734375, + "logits/rejected": -2.1631100177764893, + "logps/chosen": -730.8253784179688, + "logps/rejected": -790.9908447265625, + "loss": 0.086, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.49312084913253784, + "rewards/margins": 0.06093892455101013, + "rewards/rejected": -0.5540598034858704, + "step": 2070 + }, + { + "epoch": 0.71, + "learning_rate": 1.152862993208794e-06, + "logits/chosen": -2.1542184352874756, + "logits/rejected": -1.721289038658142, + "logps/chosen": -694.8465576171875, + "logps/rejected": -716.5363159179688, + "loss": 0.0579, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4218994081020355, + "rewards/margins": 0.10558446496725082, + "rewards/rejected": -0.5274838805198669, + "step": 2080 + }, + { + "epoch": 0.72, + "learning_rate": 1.1277555615846339e-06, + "logits/chosen": -2.0153145790100098, + "logits/rejected": -1.8428608179092407, + "logps/chosen": -670.2786865234375, + "logps/rejected": -854.0362548828125, + "loss": 0.0693, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43655434250831604, + "rewards/margins": 0.15288135409355164, + "rewards/rejected": -0.5894356966018677, + "step": 2090 + }, + { + "epoch": 0.72, + "learning_rate": 1.1028446774158021e-06, + "logits/chosen": -2.2273738384246826, + "logits/rejected": -2.0033624172210693, + "logps/chosen": -650.3299560546875, + "logps/rejected": -771.5960693359375, + "loss": 0.0669, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44391244649887085, + "rewards/margins": 0.1265546977519989, + "rewards/rejected": -0.5704671740531921, + "step": 2100 + }, + { + "epoch": 0.72, + "learning_rate": 1.0781339087042955e-06, + "logits/chosen": -2.233987808227539, + "logits/rejected": -1.9722740650177002, + "logps/chosen": -668.952392578125, + "logps/rejected": -746.8803100585938, + "loss": 0.092, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4446966052055359, + "rewards/margins": 0.10449746996164322, + "rewards/rejected": -0.5491940975189209, + "step": 2110 + }, + { + "epoch": 0.73, + "learning_rate": 1.053626794789441e-06, + "logits/chosen": -2.167900562286377, + "logits/rejected": -2.1640636920928955, + "logps/chosen": -722.8687744140625, + "logps/rejected": -834.3153076171875, + "loss": 0.0647, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5201176404953003, + "rewards/margins": 0.06028919294476509, + "rewards/rejected": -0.5804067850112915, + "step": 2120 + }, + { + "epoch": 0.73, + "learning_rate": 1.029326845840961e-06, + "logits/chosen": -2.3251118659973145, + "logits/rejected": -1.9303410053253174, + "logps/chosen": -679.392822265625, + "logps/rejected": -753.8246459960938, + "loss": 0.056, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43752750754356384, + "rewards/margins": 0.13090887665748596, + "rewards/rejected": -0.5684363842010498, + "step": 2130 + }, + { + "epoch": 0.73, + "learning_rate": 1.0052375423562038e-06, + "logits/chosen": -2.242652416229248, + "logits/rejected": -2.0578553676605225, + "logps/chosen": -684.14013671875, + "logps/rejected": -813.5594482421875, + "loss": 0.0601, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4639926850795746, + "rewards/margins": 0.08700541406869888, + "rewards/rejected": -0.5509980916976929, + "step": 2140 + }, + { + "epoch": 0.74, + "learning_rate": 9.813623346616325e-07, + "logits/chosen": -1.9593162536621094, + "logits/rejected": -1.3148808479309082, + "logps/chosen": -727.7343139648438, + "logps/rejected": -799.2838134765625, + "loss": 0.0755, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43542367219924927, + "rewards/margins": 0.15335293114185333, + "rewards/rejected": -0.5887765884399414, + "step": 2150 + }, + { + "epoch": 0.74, + "learning_rate": 9.577046424186336e-07, + "logits/chosen": -2.250488758087158, + "logits/rejected": -2.152696132659912, + "logps/chosen": -743.4170532226562, + "logps/rejected": -749.8002319335938, + "loss": 0.0858, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45544886589050293, + "rewards/margins": 0.04140906408429146, + "rewards/rejected": -0.4968579411506653, + "step": 2160 + }, + { + "epoch": 0.74, + "learning_rate": 9.342678541337155e-07, + "logits/chosen": -2.17391037940979, + "logits/rejected": -1.7850377559661865, + "logps/chosen": -670.3488159179688, + "logps/rejected": -719.6155395507812, + "loss": 0.1111, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.47339048981666565, + "rewards/margins": 0.07163342088460922, + "rewards/rejected": -0.5450239181518555, + "step": 2170 + }, + { + "epoch": 0.75, + "learning_rate": 9.110553266731676e-07, + "logits/chosen": -1.9487330913543701, + "logits/rejected": -1.920854926109314, + "logps/chosen": -684.4945068359375, + "logps/rejected": -799.9359130859375, + "loss": 0.0455, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.47916141152381897, + "rewards/margins": 0.1000744104385376, + "rewards/rejected": -0.5792357921600342, + "step": 2180 + }, + { + "epoch": 0.75, + "learning_rate": 8.880703847822603e-07, + "logits/chosen": -2.07055401802063, + "logits/rejected": -1.928727149963379, + "logps/chosen": -658.1580200195312, + "logps/rejected": -768.7928466796875, + "loss": 0.0684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4428200125694275, + "rewards/margins": 0.12602929770946503, + "rewards/rejected": -0.5688492655754089, + "step": 2190 + }, + { + "epoch": 0.75, + "learning_rate": 8.653163206090326e-07, + "logits/chosen": -2.4357573986053467, + "logits/rejected": -1.9457374811172485, + "logps/chosen": -623.8139038085938, + "logps/rejected": -643.5352783203125, + "loss": 0.0653, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.37632665038108826, + "rewards/margins": 0.08694492280483246, + "rewards/rejected": -0.4632716178894043, + "step": 2200 + }, + { + "epoch": 0.76, + "learning_rate": 8.427963932327621e-07, + "logits/chosen": -2.18113112449646, + "logits/rejected": -2.0196382999420166, + "logps/chosen": -602.2532958984375, + "logps/rejected": -781.3187255859375, + "loss": 0.0677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4085915982723236, + "rewards/margins": 0.16642124950885773, + "rewards/rejected": -0.5750128030776978, + "step": 2210 + }, + { + "epoch": 0.76, + "learning_rate": 8.205138281971617e-07, + "logits/chosen": -2.0964841842651367, + "logits/rejected": -1.816980004310608, + "logps/chosen": -691.4669189453125, + "logps/rejected": -711.4818725585938, + "loss": 0.0626, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4612464904785156, + "rewards/margins": 0.0856751948595047, + "rewards/rejected": -0.5469216704368591, + "step": 2220 + }, + { + "epoch": 0.76, + "learning_rate": 7.984718170483813e-07, + "logits/chosen": -2.1438546180725098, + "logits/rejected": -1.992221474647522, + "logps/chosen": -613.9456787109375, + "logps/rejected": -792.9467163085938, + "loss": 0.1208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41083821654319763, + "rewards/margins": 0.1366868019104004, + "rewards/rejected": -0.5475250482559204, + "step": 2230 + }, + { + "epoch": 0.77, + "learning_rate": 7.766735168778853e-07, + "logits/chosen": -2.3303608894348145, + "logits/rejected": -1.964838981628418, + "logps/chosen": -724.9031982421875, + "logps/rejected": -794.0396728515625, + "loss": 0.065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4382435381412506, + "rewards/margins": 0.1007271558046341, + "rewards/rejected": -0.5389707088470459, + "step": 2240 + }, + { + "epoch": 0.77, + "learning_rate": 7.551220498702547e-07, + "logits/chosen": -2.219709873199463, + "logits/rejected": -1.8147242069244385, + "logps/chosen": -690.845458984375, + "logps/rejected": -772.4855346679688, + "loss": 0.0714, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4500810205936432, + "rewards/margins": 0.10746470838785172, + "rewards/rejected": -0.5575457215309143, + "step": 2250 + }, + { + "epoch": 0.77, + "learning_rate": 7.338205028560003e-07, + "logits/chosen": -2.296119213104248, + "logits/rejected": -1.979353904724121, + "logps/chosen": -654.2385864257812, + "logps/rejected": -730.311767578125, + "loss": 0.0731, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4028875231742859, + "rewards/margins": 0.1070215255022049, + "rewards/rejected": -0.5099090337753296, + "step": 2260 + }, + { + "epoch": 0.78, + "learning_rate": 7.127719268694294e-07, + "logits/chosen": -2.161729097366333, + "logits/rejected": -1.8845264911651611, + "logps/chosen": -690.8424682617188, + "logps/rejected": -765.8018798828125, + "loss": 0.0763, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45226621627807617, + "rewards/margins": 0.1182960644364357, + "rewards/rejected": -0.5705623626708984, + "step": 2270 + }, + { + "epoch": 0.78, + "learning_rate": 6.919793367116453e-07, + "logits/chosen": -2.2758870124816895, + "logits/rejected": -2.1513137817382812, + "logps/chosen": -645.7116088867188, + "logps/rejected": -750.9886474609375, + "loss": 0.0744, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.39909255504608154, + "rewards/margins": 0.11400028318166733, + "rewards/rejected": -0.5130928158760071, + "step": 2280 + }, + { + "epoch": 0.78, + "learning_rate": 6.714457105187383e-07, + "logits/chosen": -2.358992338180542, + "logits/rejected": -1.8777456283569336, + "logps/chosen": -718.3897094726562, + "logps/rejected": -846.4613037109375, + "loss": 0.069, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4483721852302551, + "rewards/margins": 0.1424846649169922, + "rewards/rejected": -0.5908567905426025, + "step": 2290 + }, + { + "epoch": 0.79, + "learning_rate": 6.511739893352226e-07, + "logits/chosen": -2.1870875358581543, + "logits/rejected": -2.080655336380005, + "logps/chosen": -697.439208984375, + "logps/rejected": -725.9364013671875, + "loss": 0.0663, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4647183418273926, + "rewards/margins": 0.0482785627245903, + "rewards/rejected": -0.5129969120025635, + "step": 2300 + }, + { + "epoch": 0.79, + "learning_rate": 6.311670766927869e-07, + "logits/chosen": -1.9962774515151978, + "logits/rejected": -1.9848453998565674, + "logps/chosen": -651.7489013671875, + "logps/rejected": -729.2185668945312, + "loss": 0.1025, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.46112656593322754, + "rewards/margins": 0.07899859547615051, + "rewards/rejected": -0.5401251912117004, + "step": 2310 + }, + { + "epoch": 0.8, + "learning_rate": 6.114278381944253e-07, + "logits/chosen": -2.1914873123168945, + "logits/rejected": -2.372349500656128, + "logps/chosen": -576.5204467773438, + "logps/rejected": -652.2879638671875, + "loss": 0.0874, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.38175448775291443, + "rewards/margins": 0.045350007712841034, + "rewards/rejected": -0.42710447311401367, + "step": 2320 + }, + { + "epoch": 0.8, + "learning_rate": 5.91959101103988e-07, + "logits/chosen": -2.443941593170166, + "logits/rejected": -2.3585076332092285, + "logps/chosen": -623.3038940429688, + "logps/rejected": -749.5606689453125, + "loss": 0.0805, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.40153616666793823, + "rewards/margins": 0.12084267288446426, + "rewards/rejected": -0.5223788619041443, + "step": 2330 + }, + { + "epoch": 0.8, + "learning_rate": 5.727636539412368e-07, + "logits/chosen": -2.2379660606384277, + "logits/rejected": -1.8004734516143799, + "logps/chosen": -637.1851806640625, + "logps/rejected": -713.89697265625, + "loss": 0.0514, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3866319954395294, + "rewards/margins": 0.12998130917549133, + "rewards/rejected": -0.5166133642196655, + "step": 2340 + }, + { + "epoch": 0.81, + "learning_rate": 5.538442460824417e-07, + "logits/chosen": -2.189680576324463, + "logits/rejected": -1.9954487085342407, + "logps/chosen": -629.7688598632812, + "logps/rejected": -715.0001831054688, + "loss": 0.0985, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4157083034515381, + "rewards/margins": 0.10353025048971176, + "rewards/rejected": -0.5192385911941528, + "step": 2350 + }, + { + "epoch": 0.81, + "learning_rate": 5.352035873665817e-07, + "logits/chosen": -2.3851559162139893, + "logits/rejected": -2.140918016433716, + "logps/chosen": -579.01123046875, + "logps/rejected": -623.9471435546875, + "loss": 0.0898, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.35611358284950256, + "rewards/margins": 0.07924910634756088, + "rewards/rejected": -0.43536263704299927, + "step": 2360 + }, + { + "epoch": 0.81, + "learning_rate": 5.168443477072207e-07, + "logits/chosen": -2.320765256881714, + "logits/rejected": -1.9919421672821045, + "logps/chosen": -578.265625, + "logps/rejected": -717.7510375976562, + "loss": 0.065, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3810497522354126, + "rewards/margins": 0.13789904117584229, + "rewards/rejected": -0.5189487338066101, + "step": 2370 + }, + { + "epoch": 0.82, + "learning_rate": 4.987691567100866e-07, + "logits/chosen": -2.205519199371338, + "logits/rejected": -1.9962717294692993, + "logps/chosen": -694.5599365234375, + "logps/rejected": -781.102783203125, + "loss": 0.0873, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4531136155128479, + "rewards/margins": 0.08138440549373627, + "rewards/rejected": -0.5344979763031006, + "step": 2380 + }, + { + "epoch": 0.82, + "learning_rate": 4.809806032964351e-07, + "logits/chosen": -2.2204413414001465, + "logits/rejected": -2.0109105110168457, + "logps/chosen": -601.0872802734375, + "logps/rejected": -644.5216064453125, + "loss": 0.1044, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.3811626136302948, + "rewards/margins": 0.06587550789117813, + "rewards/rejected": -0.44703811407089233, + "step": 2390 + }, + { + "epoch": 0.82, + "learning_rate": 4.634812353322371e-07, + "logits/chosen": -2.3256497383117676, + "logits/rejected": -1.8949310779571533, + "logps/chosen": -661.0736083984375, + "logps/rejected": -761.2631225585938, + "loss": 0.0912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4190675616264343, + "rewards/margins": 0.11600930988788605, + "rewards/rejected": -0.535076916217804, + "step": 2400 + }, + { + "epoch": 0.83, + "learning_rate": 4.462735592632439e-07, + "logits/chosen": -2.003680944442749, + "logits/rejected": -1.8513685464859009, + "logps/chosen": -706.0535278320312, + "logps/rejected": -843.9226684570312, + "loss": 0.0716, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.45064783096313477, + "rewards/margins": 0.1290721446275711, + "rewards/rejected": -0.5797199606895447, + "step": 2410 + }, + { + "epoch": 0.83, + "learning_rate": 4.293600397559897e-07, + "logits/chosen": -2.1723484992980957, + "logits/rejected": -1.9190336465835571, + "logps/chosen": -624.4666137695312, + "logps/rejected": -677.8482055664062, + "loss": 0.0984, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41719359159469604, + "rewards/margins": 0.05301886796951294, + "rewards/rejected": -0.470212459564209, + "step": 2420 + }, + { + "epoch": 0.83, + "learning_rate": 4.1274309934477454e-07, + "logits/chosen": -2.106175422668457, + "logits/rejected": -1.8884546756744385, + "logps/chosen": -612.33544921875, + "logps/rejected": -652.5370483398438, + "loss": 0.0818, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.39127975702285767, + "rewards/margins": 0.054196156561374664, + "rewards/rejected": -0.44547590613365173, + "step": 2430 + }, + { + "epoch": 0.84, + "learning_rate": 3.964251180846826e-07, + "logits/chosen": -2.3211159706115723, + "logits/rejected": -1.9752849340438843, + "logps/chosen": -544.6336669921875, + "logps/rejected": -604.1102294921875, + "loss": 0.0891, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34942182898521423, + "rewards/margins": 0.07262709736824036, + "rewards/rejected": -0.422048956155777, + "step": 2440 + }, + { + "epoch": 0.84, + "learning_rate": 3.8040843321068746e-07, + "logits/chosen": -2.1404693126678467, + "logits/rejected": -1.9147093296051025, + "logps/chosen": -709.58154296875, + "logps/rejected": -804.4060668945312, + "loss": 0.0657, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.48798832297325134, + "rewards/margins": 0.10476765781641006, + "rewards/rejected": -0.5927559733390808, + "step": 2450 + }, + { + "epoch": 0.84, + "learning_rate": 3.646953388028854e-07, + "logits/chosen": -2.04292631149292, + "logits/rejected": -1.5764219760894775, + "logps/chosen": -687.10546875, + "logps/rejected": -760.7067260742188, + "loss": 0.0816, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4793581962585449, + "rewards/margins": 0.11864249408245087, + "rewards/rejected": -0.598000705242157, + "step": 2460 + }, + { + "epoch": 0.85, + "learning_rate": 3.4928808545791614e-07, + "logits/chosen": -2.073615550994873, + "logits/rejected": -2.1788148880004883, + "logps/chosen": -638.221435546875, + "logps/rejected": -761.6395263671875, + "loss": 0.0891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44060444831848145, + "rewards/margins": 0.09731185436248779, + "rewards/rejected": -0.5379163026809692, + "step": 2470 + }, + { + "epoch": 0.85, + "learning_rate": 3.341888799666068e-07, + "logits/chosen": -2.262629747390747, + "logits/rejected": -1.9284954071044922, + "logps/chosen": -686.7388916015625, + "logps/rejected": -743.3775634765625, + "loss": 0.0855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4059697091579437, + "rewards/margins": 0.09988965839147568, + "rewards/rejected": -0.505859375, + "step": 2480 + }, + { + "epoch": 0.85, + "learning_rate": 3.1939988499789075e-07, + "logits/chosen": -2.0978739261627197, + "logits/rejected": -1.8978351354599, + "logps/chosen": -708.4168701171875, + "logps/rejected": -820.39404296875, + "loss": 0.0743, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.49596619606018066, + "rewards/margins": 0.1125420555472374, + "rewards/rejected": -0.6085082292556763, + "step": 2490 + }, + { + "epoch": 0.86, + "learning_rate": 3.0492321878904913e-07, + "logits/chosen": -2.175656795501709, + "logits/rejected": -1.7882719039916992, + "logps/chosen": -786.6962280273438, + "logps/rejected": -890.03466796875, + "loss": 0.1061, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5101443529129028, + "rewards/margins": 0.11035291850566864, + "rewards/rejected": -0.6204972267150879, + "step": 2500 + }, + { + "epoch": 0.86, + "learning_rate": 2.907609548423135e-07, + "logits/chosen": -2.3059334754943848, + "logits/rejected": -2.0130183696746826, + "logps/chosen": -542.1705322265625, + "logps/rejected": -715.5118408203125, + "loss": 0.0811, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36655205488204956, + "rewards/margins": 0.14779023826122284, + "rewards/rejected": -0.5143422484397888, + "step": 2510 + }, + { + "epoch": 0.86, + "learning_rate": 2.7691512162787567e-07, + "logits/chosen": -2.0405287742614746, + "logits/rejected": -2.0915091037750244, + "logps/chosen": -702.5558471679688, + "logps/rejected": -788.6998291015625, + "loss": 0.0696, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4405028223991394, + "rewards/margins": 0.09354646503925323, + "rewards/rejected": -0.5340492129325867, + "step": 2520 + }, + { + "epoch": 0.87, + "learning_rate": 2.6338770229335176e-07, + "logits/chosen": -2.1023449897766113, + "logits/rejected": -1.8752168416976929, + "logps/chosen": -760.4970092773438, + "logps/rejected": -833.1339721679688, + "loss": 0.0622, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46654874086380005, + "rewards/margins": 0.10432098805904388, + "rewards/rejected": -0.5708697438240051, + "step": 2530 + }, + { + "epoch": 0.87, + "learning_rate": 2.501806343797303e-07, + "logits/chosen": -2.3457446098327637, + "logits/rejected": -2.3163278102874756, + "logps/chosen": -633.0409545898438, + "logps/rejected": -750.288818359375, + "loss": 0.093, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.43354344367980957, + "rewards/margins": 0.08344617486000061, + "rewards/rejected": -0.516989529132843, + "step": 2540 + }, + { + "epoch": 0.87, + "learning_rate": 2.3729580954386183e-07, + "logits/chosen": -2.204981803894043, + "logits/rejected": -1.9701652526855469, + "logps/chosen": -688.1578979492188, + "logps/rejected": -808.5591430664062, + "loss": 0.061, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44313064217567444, + "rewards/margins": 0.14626091718673706, + "rewards/rejected": -0.5893915891647339, + "step": 2550 + }, + { + "epoch": 0.88, + "learning_rate": 2.2473507328751086e-07, + "logits/chosen": -1.9686768054962158, + "logits/rejected": -1.7817811965942383, + "logps/chosen": -637.0069580078125, + "logps/rejected": -730.8543090820312, + "loss": 0.0792, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3913404643535614, + "rewards/margins": 0.11009415239095688, + "rewards/rejected": -0.5014346241950989, + "step": 2560 + }, + { + "epoch": 0.88, + "learning_rate": 2.1250022469302745e-07, + "logits/chosen": -2.3968021869659424, + "logits/rejected": -1.9710814952850342, + "logps/chosen": -783.5001220703125, + "logps/rejected": -783.8637084960938, + "loss": 0.0559, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4583209156990051, + "rewards/margins": 0.07906268537044525, + "rewards/rejected": -0.537383496761322, + "step": 2570 + }, + { + "epoch": 0.88, + "learning_rate": 2.0059301616566107e-07, + "logits/chosen": -1.9925310611724854, + "logits/rejected": -2.1229825019836426, + "logps/chosen": -628.9017944335938, + "logps/rejected": -814.9282836914062, + "loss": 0.0987, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.48036131262779236, + "rewards/margins": 0.1412503570318222, + "rewards/rejected": -0.6216117143630981, + "step": 2580 + }, + { + "epoch": 0.89, + "learning_rate": 1.8901515318256318e-07, + "logits/chosen": -2.3602724075317383, + "logits/rejected": -1.8974645137786865, + "logps/chosen": -728.95947265625, + "logps/rejected": -866.9889526367188, + "loss": 0.053, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4281562268733978, + "rewards/margins": 0.11615494638681412, + "rewards/rejected": -0.5443111658096313, + "step": 2590 + }, + { + "epoch": 0.89, + "learning_rate": 1.7776829404851092e-07, + "logits/chosen": -2.2596402168273926, + "logits/rejected": -1.877781867980957, + "logps/chosen": -777.1273193359375, + "logps/rejected": -868.2491455078125, + "loss": 0.0797, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4920421242713928, + "rewards/margins": 0.14914286136627197, + "rewards/rejected": -0.64118492603302, + "step": 2600 + }, + { + "epoch": 0.89, + "learning_rate": 1.6685404965838647e-07, + "logits/chosen": -2.3345799446105957, + "logits/rejected": -2.0257980823516846, + "logps/chosen": -681.239990234375, + "logps/rejected": -695.14501953125, + "loss": 0.0789, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4106011986732483, + "rewards/margins": 0.07871082425117493, + "rewards/rejected": -0.48931199312210083, + "step": 2610 + }, + { + "epoch": 0.9, + "learning_rate": 1.5627398326644811e-07, + "logits/chosen": -2.2587788105010986, + "logits/rejected": -2.0362513065338135, + "logps/chosen": -705.2887573242188, + "logps/rejected": -740.9500122070312, + "loss": 0.0835, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.45956987142562866, + "rewards/margins": 0.06559441983699799, + "rewards/rejected": -0.5251643061637878, + "step": 2620 + }, + { + "epoch": 0.9, + "learning_rate": 1.460296102624248e-07, + "logits/chosen": -2.258861541748047, + "logits/rejected": -2.2292912006378174, + "logps/chosen": -662.3797607421875, + "logps/rejected": -802.5979614257812, + "loss": 0.0791, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43469151854515076, + "rewards/margins": 0.11645804345607758, + "rewards/rejected": -0.5511494874954224, + "step": 2630 + }, + { + "epoch": 0.9, + "learning_rate": 1.3612239795446348e-07, + "logits/chosen": -2.229052782058716, + "logits/rejected": -1.8262239694595337, + "logps/chosen": -585.1064453125, + "logps/rejected": -665.6681518554688, + "loss": 0.0482, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3851460814476013, + "rewards/margins": 0.09434196352958679, + "rewards/rejected": -0.4794880449771881, + "step": 2640 + }, + { + "epoch": 0.91, + "learning_rate": 1.2655376535896852e-07, + "logits/chosen": -2.274597644805908, + "logits/rejected": -1.9262031316757202, + "logps/chosen": -641.3375244140625, + "logps/rejected": -756.7232666015625, + "loss": 0.0618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3911328911781311, + "rewards/margins": 0.13705289363861084, + "rewards/rejected": -0.5281857252120972, + "step": 2650 + }, + { + "epoch": 0.91, + "learning_rate": 1.1732508299735379e-07, + "logits/chosen": -2.2501373291015625, + "logits/rejected": -1.8707962036132812, + "logps/chosen": -603.0183715820312, + "logps/rejected": -690.1578369140625, + "loss": 0.0705, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39655619859695435, + "rewards/margins": 0.1162206381559372, + "rewards/rejected": -0.5127768516540527, + "step": 2660 + }, + { + "epoch": 0.92, + "learning_rate": 1.0843767269974131e-07, + "logits/chosen": -2.2717010974884033, + "logits/rejected": -2.084096908569336, + "logps/chosen": -690.6661987304688, + "logps/rejected": -738.0612182617188, + "loss": 0.0704, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.44601020216941833, + "rewards/margins": 0.0691831111907959, + "rewards/rejected": -0.5151932239532471, + "step": 2670 + }, + { + "epoch": 0.92, + "learning_rate": 9.989280741563689e-08, + "logits/chosen": -2.2434730529785156, + "logits/rejected": -1.928205132484436, + "logps/chosen": -684.7159423828125, + "logps/rejected": -753.5374755859375, + "loss": 0.0737, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.42067623138427734, + "rewards/margins": 0.11073021590709686, + "rewards/rejected": -0.5314064621925354, + "step": 2680 + }, + { + "epoch": 0.92, + "learning_rate": 9.169171103160123e-08, + "logits/chosen": -2.2055764198303223, + "logits/rejected": -1.9301769733428955, + "logps/chosen": -670.9398803710938, + "logps/rejected": -797.8900756835938, + "loss": 0.0828, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4384661316871643, + "rewards/margins": 0.12283768504858017, + "rewards/rejected": -0.5613037347793579, + "step": 2690 + }, + { + "epoch": 0.93, + "learning_rate": 8.383555819595601e-08, + "logits/chosen": -2.2271904945373535, + "logits/rejected": -2.005030393600464, + "logps/chosen": -687.7010498046875, + "logps/rejected": -819.2864379882812, + "loss": 0.0767, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4591870903968811, + "rewards/margins": 0.14114715158939362, + "rewards/rejected": -0.6003342270851135, + "step": 2700 + }, + { + "epoch": 0.93, + "learning_rate": 7.632547415053482e-08, + "logits/chosen": -2.4016735553741455, + "logits/rejected": -2.1341352462768555, + "logps/chosen": -751.8325805664062, + "logps/rejected": -827.7970581054688, + "loss": 0.1146, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.48245877027511597, + "rewards/margins": 0.08054832369089127, + "rewards/rejected": -0.5630070567131042, + "step": 2710 + }, + { + "epoch": 0.93, + "learning_rate": 6.916253456951572e-08, + "logits/chosen": -2.2689526081085205, + "logits/rejected": -2.0502147674560547, + "logps/chosen": -725.8679809570312, + "logps/rejected": -830.0611572265625, + "loss": 0.042, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46661239862442017, + "rewards/margins": 0.1116294115781784, + "rewards/rejected": -0.578241765499115, + "step": 2720 + }, + { + "epoch": 0.94, + "learning_rate": 6.23477654053517e-08, + "logits/chosen": -2.389768600463867, + "logits/rejected": -1.788630723953247, + "logps/chosen": -633.7202758789062, + "logps/rejected": -682.6315307617188, + "loss": 0.078, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.37593162059783936, + "rewards/margins": 0.1151694804430008, + "rewards/rejected": -0.49110108613967896, + "step": 2730 + }, + { + "epoch": 0.94, + "learning_rate": 5.588214274182158e-08, + "logits/chosen": -2.29005765914917, + "logits/rejected": -1.8733975887298584, + "logps/chosen": -736.537841796875, + "logps/rejected": -748.2416381835938, + "loss": 0.1075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42465394735336304, + "rewards/margins": 0.09247386455535889, + "rewards/rejected": -0.5171278119087219, + "step": 2740 + }, + { + "epoch": 0.94, + "learning_rate": 4.9766592654227344e-08, + "logits/chosen": -2.290553092956543, + "logits/rejected": -1.8525292873382568, + "logps/chosen": -776.4290161132812, + "logps/rejected": -816.3228149414062, + "loss": 0.0501, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.49286454916000366, + "rewards/margins": 0.10129784047603607, + "rewards/rejected": -0.5941623449325562, + "step": 2750 + }, + { + "epoch": 0.95, + "learning_rate": 4.400199107674946e-08, + "logits/chosen": -2.1630892753601074, + "logits/rejected": -1.940159559249878, + "logps/chosen": -665.2659912109375, + "logps/rejected": -710.6292724609375, + "loss": 0.0637, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4292607307434082, + "rewards/margins": 0.062006641179323196, + "rewards/rejected": -0.4912673532962799, + "step": 2760 + }, + { + "epoch": 0.95, + "learning_rate": 3.8589163676986674e-08, + "logits/chosen": -2.389782190322876, + "logits/rejected": -2.072308301925659, + "logps/chosen": -739.9847412109375, + "logps/rejected": -799.0582885742188, + "loss": 0.0508, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.42423492670059204, + "rewards/margins": 0.11466383934020996, + "rewards/rejected": -0.538898766040802, + "step": 2770 + }, + { + "epoch": 0.95, + "learning_rate": 3.3528885737696136e-08, + "logits/chosen": -2.229954957962036, + "logits/rejected": -2.001962661743164, + "logps/chosen": -763.1449584960938, + "logps/rejected": -890.884765625, + "loss": 0.0776, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4881008267402649, + "rewards/margins": 0.1101212278008461, + "rewards/rejected": -0.598222017288208, + "step": 2780 + }, + { + "epoch": 0.96, + "learning_rate": 2.8821882045748928e-08, + "logits/chosen": -2.1927006244659424, + "logits/rejected": -1.9519379138946533, + "logps/chosen": -609.7821044921875, + "logps/rejected": -670.4013671875, + "loss": 0.1037, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.43954816460609436, + "rewards/margins": 0.08661254495382309, + "rewards/rejected": -0.526160717010498, + "step": 2790 + }, + { + "epoch": 0.96, + "learning_rate": 2.4468826788316967e-08, + "logits/chosen": -2.2692456245422363, + "logits/rejected": -1.8533859252929688, + "logps/chosen": -694.7283935546875, + "logps/rejected": -838.3411865234375, + "loss": 0.0369, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41949623823165894, + "rewards/margins": 0.1585858017206192, + "rewards/rejected": -0.5780820846557617, + "step": 2800 + }, + { + "epoch": 0.96, + "learning_rate": 2.0470343456310827e-08, + "logits/chosen": -2.1465401649475098, + "logits/rejected": -1.92649245262146, + "logps/chosen": -801.5432739257812, + "logps/rejected": -876.83203125, + "loss": 0.0925, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5086441040039062, + "rewards/margins": 0.1165408119559288, + "rewards/rejected": -0.6251848936080933, + "step": 2810 + }, + { + "epoch": 0.97, + "learning_rate": 1.682700475507476e-08, + "logits/chosen": -2.319272518157959, + "logits/rejected": -2.125800609588623, + "logps/chosen": -730.833740234375, + "logps/rejected": -750.211669921875, + "loss": 0.08, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44253936409950256, + "rewards/margins": 0.06718467175960541, + "rewards/rejected": -0.5097240209579468, + "step": 2820 + }, + { + "epoch": 0.97, + "learning_rate": 1.3539332522359282e-08, + "logits/chosen": -2.2746429443359375, + "logits/rejected": -1.9740593433380127, + "logps/chosen": -795.7684936523438, + "logps/rejected": -879.1627197265625, + "loss": 0.0623, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.519045352935791, + "rewards/margins": 0.11925999075174332, + "rewards/rejected": -0.6383053660392761, + "step": 2830 + }, + { + "epoch": 0.97, + "learning_rate": 1.0607797653577333e-08, + "logits/chosen": -2.160733938217163, + "logits/rejected": -1.9417308568954468, + "logps/chosen": -561.7853393554688, + "logps/rejected": -670.7401733398438, + "loss": 0.0739, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3653794825077057, + "rewards/margins": 0.09239096939563751, + "rewards/rejected": -0.457770437002182, + "step": 2840 + }, + { + "epoch": 0.98, + "learning_rate": 8.032820034357126e-09, + "logits/chosen": -2.2955965995788574, + "logits/rejected": -2.136737585067749, + "logps/chosen": -670.2855224609375, + "logps/rejected": -771.9925537109375, + "loss": 0.0742, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4561973214149475, + "rewards/margins": 0.10345951467752457, + "rewards/rejected": -0.5596567988395691, + "step": 2850 + }, + { + "epoch": 0.98, + "learning_rate": 5.814768480403021e-09, + "logits/chosen": -2.0290589332580566, + "logits/rejected": -1.981529951095581, + "logps/chosen": -576.2341918945312, + "logps/rejected": -762.9359130859375, + "loss": 0.0692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4308190941810608, + "rewards/margins": 0.13548357784748077, + "rewards/rejected": -0.5663026571273804, + "step": 2860 + }, + { + "epoch": 0.98, + "learning_rate": 3.953960684668634e-09, + "logits/chosen": -2.110337018966675, + "logits/rejected": -2.0485644340515137, + "logps/chosen": -660.661865234375, + "logps/rejected": -738.81884765625, + "loss": 0.0598, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44688519835472107, + "rewards/margins": 0.08198593556880951, + "rewards/rejected": -0.5288710594177246, + "step": 2870 + }, + { + "epoch": 0.99, + "learning_rate": 2.4506631718534956e-09, + "logits/chosen": -2.2245066165924072, + "logits/rejected": -1.9781780242919922, + "logps/chosen": -732.562255859375, + "logps/rejected": -833.1080932617188, + "loss": 0.0685, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.48917946219444275, + "rewards/margins": 0.0765211433172226, + "rewards/rejected": -0.5657006502151489, + "step": 2880 + }, + { + "epoch": 0.99, + "learning_rate": 1.3050912602297071e-09, + "logits/chosen": -2.0510215759277344, + "logits/rejected": -1.8791606426239014, + "logps/chosen": -708.2174072265625, + "logps/rejected": -814.8040161132812, + "loss": 0.079, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46390801668167114, + "rewards/margins": 0.11161540448665619, + "rewards/rejected": -0.5755234360694885, + "step": 2890 + }, + { + "epoch": 0.99, + "learning_rate": 5.1740903080022e-10, + "logits/chosen": -2.4098868370056152, + "logits/rejected": -1.9928724765777588, + "logps/chosen": -596.1256713867188, + "logps/rejected": -630.7376708984375, + "loss": 0.0822, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3659174144268036, + "rewards/margins": 0.11456866562366486, + "rewards/rejected": -0.48048609495162964, + "step": 2900 + }, + { + "epoch": 1.0, + "learning_rate": 8.772930379846723e-11, + "logits/chosen": -2.240025043487549, + "logits/rejected": -2.1385819911956787, + "logps/chosen": -616.0072021484375, + "logps/rejected": -749.4381103515625, + "loss": 0.0707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4194551110267639, + "rewards/margins": 0.09669093787670135, + "rewards/rejected": -0.5161460041999817, + "step": 2910 + }, + { + "epoch": 1.0, + "step": 2917, + "total_flos": 0.0, + "train_loss": 0.0819665433958372, + "train_runtime": 16805.1619, + "train_samples_per_second": 1.389, + "train_steps_per_second": 0.174 + } + ], + "logging_steps": 10, + "max_steps": 2917, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}