{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2648831203231574, "eval_steps": 200, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 9.652509652509653e-06, "logits/chosen": -3.11246657371521, "logits/rejected": -3.086373805999756, "logps/chosen": -113.73238372802734, "logps/rejected": -109.32698822021484, "loss": 0.721, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.2993558943271637, "rewards/margins": -0.0277109295129776, "rewards/rejected": -0.2716449797153473, "step": 10 }, { "epoch": 0.0, "learning_rate": 1.9305019305019306e-05, "logits/chosen": -3.110931873321533, "logits/rejected": -3.1171531677246094, "logps/chosen": -122.4663314819336, "logps/rejected": -113.23054504394531, "loss": 0.7161, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.2805718183517456, "rewards/margins": -0.03167964145541191, "rewards/rejected": -0.2488921880722046, "step": 20 }, { "epoch": 0.01, "learning_rate": 2.895752895752896e-05, "logits/chosen": -3.139052629470825, "logits/rejected": -3.1156527996063232, "logps/chosen": -126.01689147949219, "logps/rejected": -100.77046203613281, "loss": 0.7224, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.3138067126274109, "rewards/margins": -0.038806475698947906, "rewards/rejected": -0.2750001847743988, "step": 30 }, { "epoch": 0.01, "learning_rate": 3.764478764478765e-05, "logits/chosen": -3.155150890350342, "logits/rejected": -3.1715970039367676, "logps/chosen": -133.27737426757812, "logps/rejected": -118.9439926147461, "loss": 0.713, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.2480003386735916, "rewards/margins": -0.023857835680246353, "rewards/rejected": -0.22414250671863556, "step": 40 }, { "epoch": 0.01, "learning_rate": 4.72972972972973e-05, "logits/chosen": -3.1669116020202637, "logits/rejected": -3.1525278091430664, "logps/chosen": -123.1195297241211, "logps/rejected": -128.38714599609375, "loss": 0.6781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19161827862262726, "rewards/margins": 0.04476013034582138, "rewards/rejected": -0.23637838661670685, "step": 50 }, { "epoch": 0.01, "learning_rate": 5.694980694980695e-05, "logits/chosen": -3.1426501274108887, "logits/rejected": -3.132570266723633, "logps/chosen": -120.64261627197266, "logps/rejected": -113.0268783569336, "loss": 0.7107, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.3047412037849426, "rewards/margins": -0.014080168679356575, "rewards/rejected": -0.2906610071659088, "step": 60 }, { "epoch": 0.01, "learning_rate": 6.660231660231661e-05, "logits/chosen": -3.170804500579834, "logits/rejected": -3.164586305618286, "logps/chosen": -116.2149887084961, "logps/rejected": -126.68898010253906, "loss": 0.6885, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.33613839745521545, "rewards/margins": 0.028235793113708496, "rewards/rejected": -0.36437422037124634, "step": 70 }, { "epoch": 0.02, "learning_rate": 7.625482625482626e-05, "logits/chosen": -3.17895245552063, "logits/rejected": -3.1590020656585693, "logps/chosen": -126.33811950683594, "logps/rejected": -103.02183532714844, "loss": 0.6215, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4042009711265564, "rewards/margins": 0.19598612189292908, "rewards/rejected": -0.6001870632171631, "step": 80 }, { "epoch": 0.02, "learning_rate": 8.59073359073359e-05, "logits/chosen": -3.180785894393921, "logits/rejected": -3.1576857566833496, "logps/chosen": -129.47866821289062, "logps/rejected": -126.99539947509766, "loss": 0.6889, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.8047823905944824, "rewards/margins": 0.08432246744632721, "rewards/rejected": -0.8891048431396484, "step": 90 }, { "epoch": 0.02, "learning_rate": 9.555984555984557e-05, "logits/chosen": -3.139349937438965, "logits/rejected": -3.114441394805908, "logps/chosen": -139.87002563476562, "logps/rejected": -135.06802368164062, "loss": 0.6326, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2678108215332031, "rewards/margins": 0.2349836528301239, "rewards/rejected": -1.5027945041656494, "step": 100 }, { "epoch": 0.02, "learning_rate": 0.00010424710424710426, "logits/chosen": -3.155695676803589, "logits/rejected": -3.1192359924316406, "logps/chosen": -126.76655578613281, "logps/rejected": -119.91800689697266, "loss": 0.6925, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1118037700653076, "rewards/margins": 0.12902173399925232, "rewards/rejected": -1.2408255338668823, "step": 110 }, { "epoch": 0.02, "learning_rate": 0.0001138996138996139, "logits/chosen": -3.203996181488037, "logits/rejected": -3.181511402130127, "logps/chosen": -121.38505554199219, "logps/rejected": -123.63691711425781, "loss": 0.6905, "rewards/accuracies": 0.5625, "rewards/chosen": -0.83983314037323, "rewards/margins": 0.14224112033843994, "rewards/rejected": -0.9820743799209595, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.00012355212355212355, "logits/chosen": -3.199700355529785, "logits/rejected": -3.142176628112793, "logps/chosen": -144.78311157226562, "logps/rejected": -131.46128845214844, "loss": 0.7421, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7906621694564819, "rewards/margins": 0.08599194139242172, "rewards/rejected": -0.8766541481018066, "step": 130 }, { "epoch": 0.03, "learning_rate": 0.00013223938223938227, "logits/chosen": -3.11432147026062, "logits/rejected": -3.0819637775421143, "logps/chosen": -131.96109008789062, "logps/rejected": -118.2151107788086, "loss": 0.7438, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.9345771074295044, "rewards/margins": 0.03206203132867813, "rewards/rejected": -0.9666391611099243, "step": 140 }, { "epoch": 0.03, "learning_rate": 0.00014189189189189188, "logits/chosen": -3.103154182434082, "logits/rejected": -3.0087850093841553, "logps/chosen": -111.38960266113281, "logps/rejected": -109.3032455444336, "loss": 0.6658, "rewards/accuracies": 0.625, "rewards/chosen": -0.4318141043186188, "rewards/margins": 0.15236088633537292, "rewards/rejected": -0.5841749906539917, "step": 150 }, { "epoch": 0.03, "learning_rate": 0.00015154440154440155, "logits/chosen": -3.065882444381714, "logits/rejected": -3.014258623123169, "logps/chosen": -115.94306945800781, "logps/rejected": -129.7348175048828, "loss": 0.7302, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6290556192398071, "rewards/margins": 0.06416401267051697, "rewards/rejected": -0.6932196021080017, "step": 160 }, { "epoch": 0.03, "learning_rate": 0.0001611969111969112, "logits/chosen": -2.8696858882904053, "logits/rejected": -2.820652723312378, "logps/chosen": -121.01307678222656, "logps/rejected": -122.356201171875, "loss": 0.7221, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7266199588775635, "rewards/margins": 0.14146149158477783, "rewards/rejected": -0.8680814504623413, "step": 170 }, { "epoch": 0.03, "learning_rate": 0.00017084942084942084, "logits/chosen": -2.89375638961792, "logits/rejected": -2.8223800659179688, "logps/chosen": -131.37777709960938, "logps/rejected": -125.69004821777344, "loss": 0.583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9062817692756653, "rewards/margins": 0.5299333333969116, "rewards/rejected": -1.4362150430679321, "step": 180 }, { "epoch": 0.04, "learning_rate": 0.0001805019305019305, "logits/chosen": -2.8086953163146973, "logits/rejected": -2.8810436725616455, "logps/chosen": -115.71038818359375, "logps/rejected": -133.2216339111328, "loss": 0.7593, "rewards/accuracies": 0.5625, "rewards/chosen": -1.390483021736145, "rewards/margins": 0.12736426293849945, "rewards/rejected": -1.5178472995758057, "step": 190 }, { "epoch": 0.04, "learning_rate": 0.00019015444015444015, "logits/chosen": -3.0167882442474365, "logits/rejected": -2.996938943862915, "logps/chosen": -118.67408752441406, "logps/rejected": -106.15169525146484, "loss": 0.672, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7122364640235901, "rewards/margins": 0.25746825337409973, "rewards/rejected": -0.9697047472000122, "step": 200 }, { "epoch": 0.04, "eval_logits/chosen": -3.1546952724456787, "eval_logits/rejected": -3.1338424682617188, "eval_logps/chosen": -127.55575561523438, "eval_logps/rejected": -128.7761993408203, "eval_loss": 0.743442177772522, "eval_rewards/accuracies": 0.5858798623085022, "eval_rewards/chosen": -1.0754988193511963, "eval_rewards/margins": 0.3047899305820465, "eval_rewards/rejected": -1.3802887201309204, "eval_runtime": 1335.2681, "eval_samples_per_second": 0.711, "eval_steps_per_second": 0.711, "step": 200 }, { "epoch": 0.04, "learning_rate": 0.0001998069498069498, "logits/chosen": -3.2106406688690186, "logits/rejected": -3.1746304035186768, "logps/chosen": -131.38128662109375, "logps/rejected": -121.36234283447266, "loss": 0.664, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.005927324295044, "rewards/margins": 0.5132917165756226, "rewards/rejected": -1.5192190408706665, "step": 210 }, { "epoch": 0.04, "learning_rate": 0.00020945945945945947, "logits/chosen": -3.2851333618164062, "logits/rejected": -3.267256259918213, "logps/chosen": -120.11392974853516, "logps/rejected": -117.89964294433594, "loss": 0.6965, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7865055203437805, "rewards/margins": 0.19706687331199646, "rewards/rejected": -0.9835723638534546, "step": 220 }, { "epoch": 0.04, "learning_rate": 0.0002191119691119691, "logits/chosen": -3.408318281173706, "logits/rejected": -3.351940870285034, "logps/chosen": -107.76014709472656, "logps/rejected": -107.02482604980469, "loss": 0.7385, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.41882553696632385, "rewards/margins": 0.01567765511572361, "rewards/rejected": -0.4345032274723053, "step": 230 }, { "epoch": 0.05, "learning_rate": 0.00022876447876447875, "logits/chosen": -3.155478000640869, "logits/rejected": -3.135077953338623, "logps/chosen": -131.94711303710938, "logps/rejected": -112.65836334228516, "loss": 0.7328, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6660552024841309, "rewards/margins": 0.10303208976984024, "rewards/rejected": -0.7690872550010681, "step": 240 }, { "epoch": 0.05, "learning_rate": 0.00023841698841698842, "logits/chosen": -3.290839433670044, "logits/rejected": -3.2837767601013184, "logps/chosen": -132.83676147460938, "logps/rejected": -119.4383773803711, "loss": 0.678, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.9368747472763062, "rewards/margins": 0.25725504755973816, "rewards/rejected": -1.1941298246383667, "step": 250 }, { "epoch": 0.05, "learning_rate": 0.0002480694980694981, "logits/chosen": -3.3188934326171875, "logits/rejected": -3.361811876296997, "logps/chosen": -130.7174835205078, "logps/rejected": -147.4563751220703, "loss": 0.7069, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9200389981269836, "rewards/margins": 0.4393937587738037, "rewards/rejected": -1.3594326972961426, "step": 260 }, { "epoch": 0.05, "learning_rate": 0.0002567567567567567, "logits/chosen": -3.249516010284424, "logits/rejected": -3.2256407737731934, "logps/chosen": -108.8515853881836, "logps/rejected": -132.24813842773438, "loss": 0.6983, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.42829591035842896, "rewards/margins": 0.3723019063472748, "rewards/rejected": -0.8005977869033813, "step": 270 }, { "epoch": 0.05, "learning_rate": 0.0002635135135135135, "logits/chosen": -3.0320539474487305, "logits/rejected": -2.982640027999878, "logps/chosen": -175.3157501220703, "logps/rejected": -154.48963928222656, "loss": 1.4131, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.973146915435791, "rewards/margins": -0.0525052547454834, "rewards/rejected": -2.9206414222717285, "step": 280 }, { "epoch": 0.06, "learning_rate": 0.00027316602316602317, "logits/chosen": -2.9082584381103516, "logits/rejected": -2.8451313972473145, "logps/chosen": -129.400146484375, "logps/rejected": -118.70863342285156, "loss": 0.9237, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8987582921981812, "rewards/margins": 0.024892251938581467, "rewards/rejected": -1.9236505031585693, "step": 290 }, { "epoch": 0.06, "learning_rate": 0.00028281853281853284, "logits/chosen": -3.066049337387085, "logits/rejected": -2.9455220699310303, "logps/chosen": -131.84317016601562, "logps/rejected": -121.24568176269531, "loss": 0.6998, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1568044424057007, "rewards/margins": 0.3450776934623718, "rewards/rejected": -1.5018823146820068, "step": 300 }, { "epoch": 0.06, "learning_rate": 0.0002924710424710425, "logits/chosen": -2.8079309463500977, "logits/rejected": -2.773160457611084, "logps/chosen": -169.48828125, "logps/rejected": -148.60374450683594, "loss": 2.3505, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -5.812082290649414, "rewards/margins": -0.6856533288955688, "rewards/rejected": -5.126428127288818, "step": 310 }, { "epoch": 0.06, "learning_rate": 0.0003021235521235521, "logits/chosen": -2.6579480171203613, "logits/rejected": -2.6659698486328125, "logps/chosen": -141.78701782226562, "logps/rejected": -160.35110473632812, "loss": 0.8982, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -3.301175594329834, "rewards/margins": 0.01409349124878645, "rewards/rejected": -3.3152689933776855, "step": 320 }, { "epoch": 0.06, "learning_rate": 0.0003117760617760618, "logits/chosen": -2.8775956630706787, "logits/rejected": -2.8255507946014404, "logps/chosen": -150.46177673339844, "logps/rejected": -144.97073364257812, "loss": 0.9436, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -3.29494047164917, "rewards/margins": 0.0633418932557106, "rewards/rejected": -3.3582825660705566, "step": 330 }, { "epoch": 0.07, "learning_rate": 0.0003204633204633205, "logits/chosen": -2.8166918754577637, "logits/rejected": -2.818556308746338, "logps/chosen": -171.34437561035156, "logps/rejected": -172.5870361328125, "loss": 1.0895, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.722414016723633, "rewards/margins": 0.23760518431663513, "rewards/rejected": -4.960019111633301, "step": 340 }, { "epoch": 0.07, "learning_rate": 0.00033011583011583015, "logits/chosen": -2.9005284309387207, "logits/rejected": -2.905046224594116, "logps/chosen": -179.5493621826172, "logps/rejected": -189.4880828857422, "loss": 1.4019, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.879184722900391, "rewards/margins": 0.2637065649032593, "rewards/rejected": -5.142890930175781, "step": 350 }, { "epoch": 0.07, "learning_rate": 0.00033976833976833977, "logits/chosen": -2.4955551624298096, "logits/rejected": -2.5379605293273926, "logps/chosen": -142.00270080566406, "logps/rejected": -147.41220092773438, "loss": 0.7586, "rewards/accuracies": 0.625, "rewards/chosen": -3.0069820880889893, "rewards/margins": 0.49537187814712524, "rewards/rejected": -3.502354383468628, "step": 360 }, { "epoch": 0.07, "learning_rate": 0.00034942084942084944, "logits/chosen": -2.451601266860962, "logits/rejected": -2.4406824111938477, "logps/chosen": -177.74746704101562, "logps/rejected": -181.7180633544922, "loss": 1.6042, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -6.1398024559021, "rewards/margins": -0.07387089729309082, "rewards/rejected": -6.065931797027588, "step": 370 }, { "epoch": 0.07, "learning_rate": 0.0003590733590733591, "logits/chosen": -2.4471051692962646, "logits/rejected": -2.409393548965454, "logps/chosen": -182.06051635742188, "logps/rejected": -163.97035217285156, "loss": 2.4314, "rewards/accuracies": 0.4375, "rewards/chosen": -6.733994483947754, "rewards/margins": -1.0469824075698853, "rewards/rejected": -5.687012672424316, "step": 380 }, { "epoch": 0.08, "learning_rate": 0.0003687258687258687, "logits/chosen": -2.8171439170837402, "logits/rejected": -2.7251698970794678, "logps/chosen": -160.04652404785156, "logps/rejected": -142.2588348388672, "loss": 1.0093, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.551713466644287, "rewards/margins": 0.07490300387144089, "rewards/rejected": -3.6266167163848877, "step": 390 }, { "epoch": 0.08, "learning_rate": 0.0003783783783783784, "logits/chosen": -2.8222527503967285, "logits/rejected": -2.8754923343658447, "logps/chosen": -138.70736694335938, "logps/rejected": -142.30128479003906, "loss": 0.945, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.170539140701294, "rewards/margins": 0.011356920003890991, "rewards/rejected": -3.1818957328796387, "step": 400 }, { "epoch": 0.08, "eval_logits/chosen": -2.9538896083831787, "eval_logits/rejected": -2.8971762657165527, "eval_logps/chosen": -152.2897491455078, "eval_logps/rejected": -150.13941955566406, "eval_loss": 1.0823436975479126, "eval_rewards/accuracies": 0.5100105404853821, "eval_rewards/chosen": -3.548898458480835, "eval_rewards/margins": -0.032288454473018646, "eval_rewards/rejected": -3.5166099071502686, "eval_runtime": 1347.2142, "eval_samples_per_second": 0.704, "eval_steps_per_second": 0.704, "step": 400 }, { "epoch": 0.08, "learning_rate": 0.00038803088803088807, "logits/chosen": -2.8151559829711914, "logits/rejected": -2.8057456016540527, "logps/chosen": -145.69772338867188, "logps/rejected": -156.96774291992188, "loss": 0.8857, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.8467044830322266, "rewards/margins": 0.3242764472961426, "rewards/rejected": -4.170981407165527, "step": 410 }, { "epoch": 0.08, "learning_rate": 0.0003976833976833977, "logits/chosen": -2.7274651527404785, "logits/rejected": -2.713927745819092, "logps/chosen": -164.06692504882812, "logps/rejected": -149.8723907470703, "loss": 0.9745, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.5636115074157715, "rewards/margins": 0.0345739908516407, "rewards/rejected": -3.5981857776641846, "step": 420 }, { "epoch": 0.08, "learning_rate": 0.00040733590733590735, "logits/chosen": -2.8782758712768555, "logits/rejected": -2.8031697273254395, "logps/chosen": -165.91473388671875, "logps/rejected": -147.03627014160156, "loss": 0.8281, "rewards/accuracies": 0.625, "rewards/chosen": -4.2417216300964355, "rewards/margins": 0.32142549753189087, "rewards/rejected": -4.563147068023682, "step": 430 }, { "epoch": 0.09, "learning_rate": 0.000416988416988417, "logits/chosen": -2.623711585998535, "logits/rejected": -2.622528314590454, "logps/chosen": -149.8426513671875, "logps/rejected": -159.93692016601562, "loss": 0.9961, "rewards/accuracies": 0.5, "rewards/chosen": -4.433084487915039, "rewards/margins": 0.28495556116104126, "rewards/rejected": -4.7180399894714355, "step": 440 }, { "epoch": 0.09, "learning_rate": 0.00042664092664092664, "logits/chosen": -2.580371618270874, "logits/rejected": -2.5741231441497803, "logps/chosen": -160.84347534179688, "logps/rejected": -141.23475646972656, "loss": 1.2914, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -4.076364994049072, "rewards/margins": -0.23183032870292664, "rewards/rejected": -3.8445351123809814, "step": 450 }, { "epoch": 0.09, "learning_rate": 0.0004362934362934363, "logits/chosen": -2.889563798904419, "logits/rejected": -2.8342082500457764, "logps/chosen": -186.77017211914062, "logps/rejected": -168.42330932617188, "loss": 1.1317, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -4.674792289733887, "rewards/margins": -0.265504390001297, "rewards/rejected": -4.409287929534912, "step": 460 }, { "epoch": 0.09, "learning_rate": 0.000445945945945946, "logits/chosen": -2.3731606006622314, "logits/rejected": -2.344404697418213, "logps/chosen": -172.8909454345703, "logps/rejected": -175.9696502685547, "loss": 0.9674, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -5.584943771362305, "rewards/margins": 0.14772634208202362, "rewards/rejected": -5.732670783996582, "step": 470 }, { "epoch": 0.09, "learning_rate": 0.0004555984555984556, "logits/chosen": -2.3436319828033447, "logits/rejected": -2.301845073699951, "logps/chosen": -173.07313537597656, "logps/rejected": -169.7339630126953, "loss": 1.2097, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -5.504385471343994, "rewards/margins": 0.23244301974773407, "rewards/rejected": -5.736828804016113, "step": 480 }, { "epoch": 0.09, "learning_rate": 0.00046525096525096526, "logits/chosen": -2.6778111457824707, "logits/rejected": -2.5807526111602783, "logps/chosen": -166.180419921875, "logps/rejected": -168.2677001953125, "loss": 1.2799, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.63643217086792, "rewards/margins": 0.5320998430252075, "rewards/rejected": -5.168532371520996, "step": 490 }, { "epoch": 0.1, "learning_rate": 0.00047490347490347493, "logits/chosen": -3.083743095397949, "logits/rejected": -3.080765962600708, "logps/chosen": -193.96841430664062, "logps/rejected": -182.3042755126953, "loss": 1.6246, "rewards/accuracies": 0.5, "rewards/chosen": -6.152979850769043, "rewards/margins": -0.20669928193092346, "rewards/rejected": -5.946280479431152, "step": 500 }, { "epoch": 0.1, "learning_rate": 0.0004777992277992278, "logits/chosen": -2.692495584487915, "logits/rejected": -2.6421055793762207, "logps/chosen": -172.01806640625, "logps/rejected": -165.9178466796875, "loss": 1.2994, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.898122787475586, "rewards/margins": 0.04086846113204956, "rewards/rejected": -4.938991546630859, "step": 510 }, { "epoch": 0.1, "learning_rate": 0.0004874517374517375, "logits/chosen": -2.7154297828674316, "logits/rejected": -2.5865087509155273, "logps/chosen": -134.430419921875, "logps/rejected": -144.77151489257812, "loss": 1.0245, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.9665591716766357, "rewards/margins": 0.17584654688835144, "rewards/rejected": -4.1424055099487305, "step": 520 }, { "epoch": 0.1, "learning_rate": 0.0004961389961389962, "logits/chosen": -2.863107442855835, "logits/rejected": -2.8306002616882324, "logps/chosen": -269.8907165527344, "logps/rejected": -273.8556823730469, "loss": 3.3909, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -13.715133666992188, "rewards/margins": 0.3205181956291199, "rewards/rejected": -14.035652160644531, "step": 530 }, { "epoch": 0.1, "learning_rate": 0.0004999979503849796, "logits/chosen": -3.0216221809387207, "logits/rejected": -3.014930248260498, "logps/chosen": -194.19422912597656, "logps/rejected": -201.08251953125, "loss": 2.3037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.459778785705566, "rewards/margins": -0.22785942256450653, "rewards/rejected": -7.231919288635254, "step": 540 }, { "epoch": 0.11, "learning_rate": 0.0004999854250815602, "logits/chosen": -2.967331647872925, "logits/rejected": -2.933845043182373, "logps/chosen": -215.81240844726562, "logps/rejected": -204.9075927734375, "loss": 2.1232, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.889649868011475, "rewards/margins": 0.17769476771354675, "rewards/rejected": -8.067344665527344, "step": 550 }, { "epoch": 0.11, "learning_rate": 0.00049996151371953, "logits/chosen": -3.052489757537842, "logits/rejected": -3.0332350730895996, "logps/chosen": -181.4810791015625, "logps/rejected": -172.9596405029297, "loss": 1.3367, "rewards/accuracies": 0.5, "rewards/chosen": -5.431277275085449, "rewards/margins": 0.03274815157055855, "rewards/rejected": -5.464025020599365, "step": 560 }, { "epoch": 0.11, "learning_rate": 0.0004999262173879769, "logits/chosen": -3.040531635284424, "logits/rejected": -3.037515163421631, "logps/chosen": -179.90150451660156, "logps/rejected": -185.0644989013672, "loss": 1.4866, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.061680316925049, "rewards/margins": 0.6980069279670715, "rewards/rejected": -6.759686470031738, "step": 570 }, { "epoch": 0.11, "learning_rate": 0.0004998795376945392, "logits/chosen": -3.021232843399048, "logits/rejected": -2.9936585426330566, "logps/chosen": -175.13389587402344, "logps/rejected": -156.2028350830078, "loss": 1.7235, "rewards/accuracies": 0.375, "rewards/chosen": -5.4631123542785645, "rewards/margins": -0.5610149502754211, "rewards/rejected": -4.902098178863525, "step": 580 }, { "epoch": 0.11, "learning_rate": 0.0004998214767653319, "logits/chosen": -2.944594621658325, "logits/rejected": -3.0027194023132324, "logps/chosen": -189.1737823486328, "logps/rejected": -175.7019500732422, "loss": 1.9699, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -6.499063014984131, "rewards/margins": -0.3209795355796814, "rewards/rejected": -6.178082466125488, "step": 590 }, { "epoch": 0.12, "learning_rate": 0.0004997520372448494, "logits/chosen": -2.8185458183288574, "logits/rejected": -2.798320770263672, "logps/chosen": -262.44500732421875, "logps/rejected": -245.83889770507812, "loss": 3.8268, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -12.636558532714844, "rewards/margins": -0.5035432577133179, "rewards/rejected": -12.133015632629395, "step": 600 }, { "epoch": 0.12, "eval_logits/chosen": -2.9935925006866455, "eval_logits/rejected": -2.961137533187866, "eval_logps/chosen": -184.5051727294922, "eval_logps/rejected": -181.61184692382812, "eval_loss": 1.3598365783691406, "eval_rewards/accuracies": 0.5193312168121338, "eval_rewards/chosen": -6.594781875610352, "eval_rewards/margins": 0.12169010192155838, "eval_rewards/rejected": -6.7164716720581055, "eval_runtime": 1314.8357, "eval_samples_per_second": 0.728, "eval_steps_per_second": 0.728, "step": 600 }, { "epoch": 0.12, "learning_rate": 0.0004996712222958462, "logits/chosen": -2.9863028526306152, "logits/rejected": -2.841834306716919, "logps/chosen": -234.49893188476562, "logps/rejected": -215.51123046875, "loss": 3.4529, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.542932510375977, "rewards/margins": -1.085506796836853, "rewards/rejected": -10.457425117492676, "step": 610 }, { "epoch": 0.12, "learning_rate": 0.0004995790355991916, "logits/chosen": -2.9887356758117676, "logits/rejected": -2.887108564376831, "logps/chosen": -202.21853637695312, "logps/rejected": -198.64749145507812, "loss": 1.6552, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -7.725579261779785, "rewards/margins": 0.2432982176542282, "rewards/rejected": -7.96887731552124, "step": 620 }, { "epoch": 0.12, "learning_rate": 0.0004994754813537031, "logits/chosen": -3.116293430328369, "logits/rejected": -3.126661539077759, "logps/chosen": -201.38604736328125, "logps/rejected": -192.84515380859375, "loss": 1.9923, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.627758979797363, "rewards/margins": -0.4443356990814209, "rewards/rejected": -8.183423042297363, "step": 630 }, { "epoch": 0.12, "learning_rate": 0.000499372567166064, "logits/chosen": -3.480961561203003, "logits/rejected": -3.476128339767456, "logps/chosen": -226.2942657470703, "logps/rejected": -198.20462036132812, "loss": 3.5697, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -10.370187759399414, "rewards/margins": -1.4912974834442139, "rewards/rejected": -8.878890037536621, "step": 640 }, { "epoch": 0.13, "learning_rate": 0.0004992474279997049, "logits/chosen": -3.380039930343628, "logits/rejected": -3.3540236949920654, "logps/chosen": -183.65228271484375, "logps/rejected": -192.43350219726562, "loss": 2.0561, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -6.808587551116943, "rewards/margins": 1.2175410985946655, "rewards/rejected": -8.026129722595215, "step": 650 }, { "epoch": 0.13, "learning_rate": 0.0004991109363882065, "logits/chosen": -2.5853641033172607, "logits/rejected": -2.6103484630584717, "logps/chosen": -464.68310546875, "logps/rejected": -447.2250061035156, "loss": 11.4738, "rewards/accuracies": 0.5625, "rewards/chosen": -34.34156036376953, "rewards/margins": -1.95975661277771, "rewards/rejected": -32.381797790527344, "step": 660 }, { "epoch": 0.13, "learning_rate": 0.0004989630985483375, "logits/chosen": -2.7174885272979736, "logits/rejected": -2.707397937774658, "logps/chosen": -447.7940979003906, "logps/rejected": -407.99493408203125, "loss": 12.9625, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -32.60967254638672, "rewards/margins": -3.206895112991333, "rewards/rejected": -29.40277671813965, "step": 670 }, { "epoch": 0.13, "learning_rate": 0.0004988203490218075, "logits/chosen": -2.946742296218872, "logits/rejected": -2.8993031978607178, "logps/chosen": -445.21142578125, "logps/rejected": -432.021240234375, "loss": 10.7741, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -32.348819732666016, "rewards/margins": -1.2561819553375244, "rewards/rejected": -31.092632293701172, "step": 680 }, { "epoch": 0.13, "learning_rate": 0.0004986509723258511, "logits/chosen": -3.1020426750183105, "logits/rejected": -3.133068561553955, "logps/chosen": -427.81756591796875, "logps/rejected": -416.2047424316406, "loss": 11.4242, "rewards/accuracies": 0.4375, "rewards/chosen": -31.230037689208984, "rewards/margins": -0.655289351940155, "rewards/rejected": -30.57474708557129, "step": 690 }, { "epoch": 0.14, "learning_rate": 0.0004984702703514565, "logits/chosen": -3.0160446166992188, "logits/rejected": -3.0138049125671387, "logps/chosen": -433.4644470214844, "logps/rejected": -405.3623046875, "loss": 10.8165, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -30.836090087890625, "rewards/margins": -2.514590263366699, "rewards/rejected": -28.321496963500977, "step": 700 }, { "epoch": 0.14, "learning_rate": 0.0004982782513290365, "logits/chosen": -3.1978919506073, "logits/rejected": -3.197380542755127, "logps/chosen": -422.511962890625, "logps/rejected": -402.30938720703125, "loss": 12.8189, "rewards/accuracies": 0.4375, "rewards/chosen": -30.785781860351562, "rewards/margins": -2.3963069915771484, "rewards/rejected": -28.389474868774414, "step": 710 }, { "epoch": 0.14, "learning_rate": 0.0004980749240044603, "logits/chosen": -3.1342532634735107, "logits/rejected": -3.1338047981262207, "logps/chosen": -403.13494873046875, "logps/rejected": -357.790771484375, "loss": 11.5675, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.892202377319336, "rewards/margins": -3.723827362060547, "rewards/rejected": -25.168371200561523, "step": 720 }, { "epoch": 0.14, "learning_rate": 0.0004978602976386554, "logits/chosen": -3.0739312171936035, "logits/rejected": -3.0738184452056885, "logps/chosen": -381.2265625, "logps/rejected": -378.4680480957031, "loss": 11.5225, "rewards/accuracies": 0.5, "rewards/chosen": -27.621994018554688, "rewards/margins": -0.9019744992256165, "rewards/rejected": -26.720022201538086, "step": 730 }, { "epoch": 0.14, "learning_rate": 0.0004976343820071849, "logits/chosen": -3.166983127593994, "logits/rejected": -3.1671650409698486, "logps/chosen": -408.42071533203125, "logps/rejected": -387.2364196777344, "loss": 13.9818, "rewards/accuracies": 0.5, "rewards/chosen": -29.39678382873535, "rewards/margins": -2.448943614959717, "rewards/rejected": -26.94784164428711, "step": 740 }, { "epoch": 0.14, "learning_rate": 0.0004973971873998035, "logits/chosen": -3.0561656951904297, "logits/rejected": -3.0557007789611816, "logps/chosen": -417.0025329589844, "logps/rejected": -349.56463623046875, "loss": 12.3073, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -29.997058868408203, "rewards/margins": -5.650521278381348, "rewards/rejected": -24.346534729003906, "step": 750 }, { "epoch": 0.15, "learning_rate": 0.0004971487246199875, "logits/chosen": -3.0265376567840576, "logits/rejected": -3.0265283584594727, "logps/chosen": -434.55419921875, "logps/rejected": -381.22808837890625, "loss": 12.0398, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -30.918407440185547, "rewards/margins": -4.258307456970215, "rewards/rejected": -26.66009521484375, "step": 760 }, { "epoch": 0.15, "learning_rate": 0.000496889004984444, "logits/chosen": -2.8932366371154785, "logits/rejected": -2.895204544067383, "logps/chosen": -396.3167419433594, "logps/rejected": -428.53839111328125, "loss": 9.4104, "rewards/accuracies": 0.5625, "rewards/chosen": -28.4284610748291, "rewards/margins": 1.9928890466690063, "rewards/rejected": -30.42134666442871, "step": 770 }, { "epoch": 0.15, "learning_rate": 0.0004966180403225946, "logits/chosen": -2.895068407058716, "logits/rejected": -2.894937753677368, "logps/chosen": -395.245849609375, "logps/rejected": -378.4429626464844, "loss": 10.2846, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -28.237285614013672, "rewards/margins": -1.503316879272461, "rewards/rejected": -26.733972549438477, "step": 780 }, { "epoch": 0.15, "learning_rate": 0.0004963358429760368, "logits/chosen": -2.551323652267456, "logits/rejected": -2.5523290634155273, "logps/chosen": -477.17327880859375, "logps/rejected": -427.474853515625, "loss": 12.1626, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -35.30048370361328, "rewards/margins": -4.13530158996582, "rewards/rejected": -31.165185928344727, "step": 790 }, { "epoch": 0.15, "learning_rate": 0.0004960424257979822, "logits/chosen": -2.7914838790893555, "logits/rejected": -2.790367841720581, "logps/chosen": -478.9364318847656, "logps/rejected": -461.02655029296875, "loss": 10.3404, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -35.25132369995117, "rewards/margins": -1.6512939929962158, "rewards/rejected": -33.60003662109375, "step": 800 }, { "epoch": 0.15, "eval_logits/chosen": -3.0016679763793945, "eval_logits/rejected": -3.0014870166778564, "eval_logps/chosen": -441.87054443359375, "eval_logps/rejected": -399.2597961425781, "eval_loss": 11.33322811126709, "eval_rewards/accuracies": 0.4555903971195221, "eval_rewards/chosen": -32.331321716308594, "eval_rewards/margins": -3.850048303604126, "eval_rewards/rejected": -28.481277465820312, "eval_runtime": 1312.1317, "eval_samples_per_second": 0.729, "eval_steps_per_second": 0.729, "step": 800 }, { "epoch": 0.16, "learning_rate": 0.0004957378021526705, "logits/chosen": -2.9228155612945557, "logits/rejected": -2.925412654876709, "logps/chosen": -481.1300354003906, "logps/rejected": -465.1952209472656, "loss": 11.3707, "rewards/accuracies": 0.4375, "rewards/chosen": -35.0019416809082, "rewards/margins": -1.9142730236053467, "rewards/rejected": -33.08766555786133, "step": 810 }, { "epoch": 0.16, "learning_rate": 0.0004954219859147614, "logits/chosen": -3.0219039916992188, "logits/rejected": -3.0174221992492676, "logps/chosen": -364.8722839355469, "logps/rejected": -295.5256652832031, "loss": 12.3607, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -25.810903549194336, "rewards/margins": -5.978564262390137, "rewards/rejected": -19.832340240478516, "step": 820 }, { "epoch": 0.16, "learning_rate": 0.0004950949914687023, "logits/chosen": -3.118417263031006, "logits/rejected": -3.1218113899230957, "logps/chosen": -469.4906311035156, "logps/rejected": -412.5103454589844, "loss": 11.9496, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -33.402565002441406, "rewards/margins": -4.156603813171387, "rewards/rejected": -29.245960235595703, "step": 830 }, { "epoch": 0.16, "learning_rate": 0.0004947568337080732, "logits/chosen": -3.0231635570526123, "logits/rejected": -3.0243794918060303, "logps/chosen": -382.8542785644531, "logps/rejected": -346.9595031738281, "loss": 9.7701, "rewards/accuracies": 0.4375, "rewards/chosen": -27.092443466186523, "rewards/margins": -3.154633045196533, "rewards/rejected": -23.93781089782715, "step": 840 }, { "epoch": 0.16, "learning_rate": 0.0004944075280349084, "logits/chosen": -3.034963846206665, "logits/rejected": -3.0339653491973877, "logps/chosen": -385.3253173828125, "logps/rejected": -367.23638916015625, "loss": 9.2328, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -27.111583709716797, "rewards/margins": -1.6716159582138062, "rewards/rejected": -25.43996810913086, "step": 850 }, { "epoch": 0.17, "learning_rate": 0.0004940470903589948, "logits/chosen": -3.1586310863494873, "logits/rejected": -3.128281831741333, "logps/chosen": -487.345703125, "logps/rejected": -389.9637756347656, "loss": 12.7587, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -34.73926544189453, "rewards/margins": -6.996462821960449, "rewards/rejected": -27.7428035736084, "step": 860 }, { "epoch": 0.17, "learning_rate": 0.0004936755370971475, "logits/chosen": -2.9109790325164795, "logits/rejected": -2.888288974761963, "logps/chosen": -463.9207458496094, "logps/rejected": -376.2655334472656, "loss": 13.8618, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -34.083106994628906, "rewards/margins": -7.244679927825928, "rewards/rejected": -26.838430404663086, "step": 870 }, { "epoch": 0.17, "learning_rate": 0.0004932928851724621, "logits/chosen": -2.8432798385620117, "logits/rejected": -2.8495278358459473, "logps/chosen": -365.90679931640625, "logps/rejected": -361.8638000488281, "loss": 8.7379, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -26.055978775024414, "rewards/margins": -0.3134794235229492, "rewards/rejected": -25.742502212524414, "step": 880 }, { "epoch": 0.17, "learning_rate": 0.0004928991520135436, "logits/chosen": -2.6536898612976074, "logits/rejected": -2.6361289024353027, "logps/chosen": -519.7141723632812, "logps/rejected": -389.46575927734375, "loss": 15.3084, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -37.92496871948242, "rewards/margins": -10.107343673706055, "rewards/rejected": -27.817623138427734, "step": 890 }, { "epoch": 0.17, "learning_rate": 0.0004924943555537128, "logits/chosen": -3.1115312576293945, "logits/rejected": -3.0791449546813965, "logps/chosen": -469.66351318359375, "logps/rejected": -401.14508056640625, "loss": 13.3952, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -33.486183166503906, "rewards/margins": -5.449090480804443, "rewards/rejected": -28.037090301513672, "step": 900 }, { "epoch": 0.18, "learning_rate": 0.0004920785142301893, "logits/chosen": -2.454453945159912, "logits/rejected": -2.4585988521575928, "logps/chosen": -463.24517822265625, "logps/rejected": -402.23773193359375, "loss": 13.7904, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -34.2692985534668, "rewards/margins": -5.860762596130371, "rewards/rejected": -28.40853500366211, "step": 910 }, { "epoch": 0.18, "learning_rate": 0.0004916516469832524, "logits/chosen": -2.8716561794281006, "logits/rejected": -2.8634109497070312, "logps/chosen": -347.35198974609375, "logps/rejected": -340.46942138671875, "loss": 7.8634, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -24.902542114257812, "rewards/margins": -1.0167404413223267, "rewards/rejected": -23.885799407958984, "step": 920 }, { "epoch": 0.18, "learning_rate": 0.0004912137732553772, "logits/chosen": -3.2272415161132812, "logits/rejected": -3.2235121726989746, "logps/chosen": -450.973876953125, "logps/rejected": -462.0083923339844, "loss": 9.2953, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -32.52794647216797, "rewards/margins": 0.38686689734458923, "rewards/rejected": -32.91481399536133, "step": 930 }, { "epoch": 0.18, "learning_rate": 0.0004907649129903504, "logits/chosen": -2.647204875946045, "logits/rejected": -2.6482150554656982, "logps/chosen": -373.737060546875, "logps/rejected": -383.5912780761719, "loss": 7.0717, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -26.5350284576416, "rewards/margins": 0.7499195337295532, "rewards/rejected": -27.284948348999023, "step": 940 }, { "epoch": 0.18, "learning_rate": 0.0004903050866323608, "logits/chosen": -3.079465627670288, "logits/rejected": -3.0794615745544434, "logps/chosen": -395.6184997558594, "logps/rejected": -406.3051452636719, "loss": 9.1411, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -29.57635498046875, "rewards/margins": 0.8788874745368958, "rewards/rejected": -30.45524024963379, "step": 950 }, { "epoch": 0.19, "learning_rate": 0.000489834315125069, "logits/chosen": -3.1981568336486816, "logits/rejected": -3.1923341751098633, "logps/chosen": -453.5596618652344, "logps/rejected": -425.4774475097656, "loss": 11.2943, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -32.61115646362305, "rewards/margins": -2.69136118888855, "rewards/rejected": -29.9197998046875, "step": 960 }, { "epoch": 0.19, "learning_rate": 0.0004893526199106531, "logits/chosen": -2.876206874847412, "logits/rejected": -2.881593942642212, "logps/chosen": -433.71636962890625, "logps/rejected": -391.15692138671875, "loss": 10.9992, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -31.739410400390625, "rewards/margins": -3.6809983253479004, "rewards/rejected": -28.058406829833984, "step": 970 }, { "epoch": 0.19, "learning_rate": 0.0004888600229288316, "logits/chosen": -2.865589141845703, "logits/rejected": -2.8664183616638184, "logps/chosen": -359.43023681640625, "logps/rejected": -321.70599365234375, "loss": 7.9915, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -25.906265258789062, "rewards/margins": -3.289003849029541, "rewards/rejected": -22.617259979248047, "step": 980 }, { "epoch": 0.19, "learning_rate": 0.0004883565466158652, "logits/chosen": -2.8116517066955566, "logits/rejected": -2.782489776611328, "logps/chosen": -494.1553649902344, "logps/rejected": -430.222412109375, "loss": 13.6529, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -36.09266662597656, "rewards/margins": -5.284867286682129, "rewards/rejected": -30.807796478271484, "step": 990 }, { "epoch": 0.19, "learning_rate": 0.0004878422139035341, "logits/chosen": -2.4114489555358887, "logits/rejected": -2.377622604370117, "logps/chosen": -482.96856689453125, "logps/rejected": -437.3125, "loss": 11.137, "rewards/accuracies": 0.5, "rewards/chosen": -34.935813903808594, "rewards/margins": -3.0001707077026367, "rewards/rejected": -31.93564224243164, "step": 1000 }, { "epoch": 0.19, "eval_logits/chosen": -2.2959094047546387, "eval_logits/rejected": -2.2838947772979736, "eval_logps/chosen": -438.8805236816406, "eval_logps/rejected": -399.5718688964844, "eval_loss": 10.402800559997559, "eval_rewards/accuracies": 0.45036572217941284, "eval_rewards/chosen": -32.032310485839844, "eval_rewards/margins": -3.519833564758301, "eval_rewards/rejected": -28.51247787475586, "eval_runtime": 1313.6418, "eval_samples_per_second": 0.729, "eval_steps_per_second": 0.729, "step": 1000 }, { "epoch": 0.18, "learning_rate": 0.0004917027842051741, "logits/chosen": -2.3743691444396973, "logits/rejected": -2.3645715713500977, "logps/chosen": -467.918701171875, "logps/rejected": -392.2208557128906, "loss": 13.2044, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -34.59047317504883, "rewards/margins": -6.761924743652344, "rewards/rejected": -27.82854652404785, "step": 1010 }, { "epoch": 0.18, "learning_rate": 0.0004913043488808868, "logits/chosen": -2.2389774322509766, "logits/rejected": -2.229212760925293, "logps/chosen": -486.40362548828125, "logps/rejected": -460.124267578125, "loss": 12.247, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -36.01793670654297, "rewards/margins": -2.4814937114715576, "rewards/rejected": -33.53643798828125, "step": 1020 }, { "epoch": 0.18, "learning_rate": 0.0004909379125837757, "logits/chosen": -2.3661270141601562, "logits/rejected": -2.35339093208313, "logps/chosen": -450.0957946777344, "logps/rejected": -419.64453125, "loss": 9.5754, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -32.51788330078125, "rewards/margins": -2.8007171154022217, "rewards/rejected": -29.717166900634766, "step": 1030 }, { "epoch": 0.18, "learning_rate": 0.000490689498823928, "logits/chosen": -2.502885341644287, "logits/rejected": -2.475445032119751, "logps/chosen": -544.1683349609375, "logps/rejected": -447.0438537597656, "loss": 14.0802, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -40.440181732177734, "rewards/margins": -7.666708946228027, "rewards/rejected": -32.773475646972656, "step": 1040 }, { "epoch": 0.19, "learning_rate": 0.0004903107023416835, "logits/chosen": -2.5913939476013184, "logits/rejected": -2.5680718421936035, "logps/chosen": -486.11907958984375, "logps/rejected": -368.26251220703125, "loss": 13.0584, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -35.45328140258789, "rewards/margins": -9.003652572631836, "rewards/rejected": -26.449630737304688, "step": 1050 }, { "epoch": 0.19, "learning_rate": 0.0004898811381307269, "logits/chosen": -2.5621819496154785, "logits/rejected": -2.5630691051483154, "logps/chosen": -415.2149353027344, "logps/rejected": -372.393798828125, "loss": 10.9069, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -30.74907875061035, "rewards/margins": -3.5560336112976074, "rewards/rejected": -27.193042755126953, "step": 1060 }, { "epoch": 0.19, "learning_rate": 0.0004894424536834149, "logits/chosen": -2.6120645999908447, "logits/rejected": -2.604825019836426, "logps/chosen": -513.4226684570312, "logps/rejected": -459.19647216796875, "loss": 11.3672, "rewards/accuracies": 0.4375, "rewards/chosen": -39.00333023071289, "rewards/margins": -4.482884883880615, "rewards/rejected": -34.520442962646484, "step": 1070 }, { "epoch": 0.19, "learning_rate": 0.000488994665678449, "logits/chosen": -2.880282163619995, "logits/rejected": -2.8795719146728516, "logps/chosen": -424.77874755859375, "logps/rejected": -367.4000244140625, "loss": 11.0388, "rewards/accuracies": 0.4375, "rewards/chosen": -31.353382110595703, "rewards/margins": -5.197685718536377, "rewards/rejected": -26.15569496154785, "step": 1080 }, { "epoch": 0.19, "learning_rate": 0.0004885377911406459, "logits/chosen": -2.947252035140991, "logits/rejected": -2.940441370010376, "logps/chosen": -442.61651611328125, "logps/rejected": -374.4497985839844, "loss": 11.9975, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -31.82815170288086, "rewards/margins": -5.211056709289551, "rewards/rejected": -26.617095947265625, "step": 1090 }, { "epoch": 0.19, "learning_rate": 0.00048807184744029076, "logits/chosen": -2.9204514026641846, "logits/rejected": -2.922818660736084, "logps/chosen": -416.2978515625, "logps/rejected": -395.44378662109375, "loss": 9.0951, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -30.16245460510254, "rewards/margins": -1.954272985458374, "rewards/rejected": -28.208179473876953, "step": 1100 }, { "epoch": 0.2, "learning_rate": 0.00048759685229247675, "logits/chosen": -2.950378179550171, "logits/rejected": -2.9522385597229004, "logps/chosen": -456.4190368652344, "logps/rejected": -417.6458435058594, "loss": 10.6616, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -33.50844192504883, "rewards/margins": -3.696442127227783, "rewards/rejected": -29.811996459960938, "step": 1110 }, { "epoch": 0.2, "learning_rate": 0.0004872103512563103, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 24.7455, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1120 }, { "epoch": 0.2, "learning_rate": 0.00048716163259071837, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 41.8228, "rewards/accuracies": 0.375, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1130 }, { "epoch": 0.2, "learning_rate": 0.000487112823756431, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 110.7265, "rewards/accuracies": 0.25, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1140 }, { "epoch": 0.2, "learning_rate": 0.000487112823756431, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 245.3322, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1150 }, { "epoch": 0.2, "learning_rate": 0.000487112823756431, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 1041.9771, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1160 }, { "epoch": 0.21, "learning_rate": 0.0004870639247720053, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 437.3346, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1170 }, { "epoch": 0.21, "learning_rate": 0.0004870639247720053, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 123.7268, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1180 }, { "epoch": 0.21, "learning_rate": 0.0004870639247720053, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 315.4085, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1190 }, { "epoch": 0.21, "learning_rate": 0.0004870639247720053, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 52542.0625, "rewards/accuracies": 0.25, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1200 }, { "epoch": 0.21, "eval_logits/chosen": NaN, "eval_logits/rejected": NaN, "eval_logps/chosen": NaN, "eval_logps/rejected": NaN, "eval_loss": NaN, "eval_rewards/accuracies": 0.2244604378938675, "eval_rewards/chosen": NaN, "eval_rewards/margins": NaN, "eval_rewards/rejected": NaN, "eval_runtime": 988.4543, "eval_samples_per_second": 0.703, "eval_steps_per_second": 0.703, "step": 1200 }, { "epoch": 0.21, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 197.8537, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1210 }, { "epoch": 0.22, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 4813.8465, "rewards/accuracies": 0.13750000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1220 }, { "epoch": 0.22, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 2048.3082, "rewards/accuracies": 0.25, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1230 }, { "epoch": 0.22, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 143.0693, "rewards/accuracies": 0.16249999403953552, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1240 }, { "epoch": 0.22, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 16874.5531, "rewards/accuracies": 0.21250000596046448, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1250 }, { "epoch": 0.22, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 2509.9437, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1260 }, { "epoch": 0.22, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 9635.2938, "rewards/accuracies": 0.1875, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1270 }, { "epoch": 0.23, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 3857.9113, "rewards/accuracies": 0.13750000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1280 }, { "epoch": 0.23, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 4572.7609, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1290 }, { "epoch": 0.23, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 606.0347, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1300 }, { "epoch": 0.23, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 1759.0598, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1310 }, { "epoch": 0.23, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 14436.2953, "rewards/accuracies": 0.21250000596046448, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1320 }, { "epoch": 0.23, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 386264.375, "rewards/accuracies": 0.0625, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1330 }, { "epoch": 0.24, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 8430.657, "rewards/accuracies": 0.1875, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1340 }, { "epoch": 0.24, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 6939.275, "rewards/accuracies": 0.13750000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1350 }, { "epoch": 0.24, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 4295.0949, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1360 }, { "epoch": 0.24, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 17283.3672, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1370 }, { "epoch": 0.24, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 24895.7469, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1380 }, { "epoch": 0.25, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 45136.4094, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1390 }, { "epoch": 0.25, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 165.0189, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1400 }, { "epoch": 0.25, "eval_logits/chosen": NaN, "eval_logits/rejected": NaN, "eval_logps/chosen": NaN, "eval_logps/rejected": NaN, "eval_loss": NaN, "eval_rewards/accuracies": 0.17553956806659698, "eval_rewards/chosen": NaN, "eval_rewards/margins": NaN, "eval_rewards/rejected": NaN, "eval_runtime": 987.1061, "eval_samples_per_second": 0.704, "eval_steps_per_second": 0.704, "step": 1400 }, { "epoch": 0.25, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 8688.7203, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1410 }, { "epoch": 0.25, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 9484.9836, "rewards/accuracies": 0.1875, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1420 }, { "epoch": 0.25, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 9308.6922, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1430 }, { "epoch": 0.25, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 2905.9373, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1440 }, { "epoch": 0.26, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 4738.7867, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1450 }, { "epoch": 0.26, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 424.0728, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1460 }, { "epoch": 0.26, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 2181.5133, "rewards/accuracies": 0.13750000298023224, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1470 }, { "epoch": 0.26, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 23597.5219, "rewards/accuracies": 0.16249999403953552, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1480 }, { "epoch": 0.26, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 1259.9596, "rewards/accuracies": 0.25, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1490 }, { "epoch": 0.26, "learning_rate": 0.0004870149356560326, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 7475.4719, "rewards/accuracies": 0.21250000596046448, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1500 } ], "logging_steps": 10, "max_steps": 5662, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }