{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998707732069783, "eval_steps": 100, "global_step": 5803, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.2412109375, "learning_rate": 8.605851979345955e-09, "logits/chosen": -3.5356550216674805, "logits/rejected": -3.5272138118743896, "logps/chosen": -54.58121871948242, "logps/rejected": -48.71324920654297, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.271484375, "learning_rate": 8.605851979345955e-08, "logits/chosen": -3.500865936279297, "logits/rejected": -3.4951529502868652, "logps/chosen": -61.67301559448242, "logps/rejected": -57.93098449707031, "loss": 0.693, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.00022746472677681595, "rewards/margins": 0.0002776283654384315, "rewards/rejected": -5.016366776544601e-05, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.2490234375, "learning_rate": 1.721170395869191e-07, "logits/chosen": -3.529344081878662, "logits/rejected": -3.5275402069091797, "logps/chosen": -63.89023971557617, "logps/rejected": -61.82512664794922, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 8.464425627607852e-05, "rewards/margins": 8.538198744645342e-05, "rewards/rejected": -7.377296924460097e-07, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.302734375, "learning_rate": 2.5817555938037866e-07, "logits/chosen": -3.519536256790161, "logits/rejected": -3.5142159461975098, "logps/chosen": -67.57096099853516, "logps/rejected": -66.90937042236328, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 4.063212691107765e-05, "rewards/margins": -0.0002516761014703661, "rewards/rejected": 0.0002923081920016557, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.23828125, "learning_rate": 3.442340791738382e-07, "logits/chosen": -3.505084276199341, "logits/rejected": -3.4988949298858643, "logps/chosen": -66.155517578125, "logps/rejected": -58.66449737548828, "loss": 0.6932, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0003996006562374532, "rewards/margins": -2.4692481019883417e-05, "rewards/rejected": 0.0004242931609041989, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.251953125, "learning_rate": 4.302925989672978e-07, "logits/chosen": -3.510054111480713, "logits/rejected": -3.505955457687378, "logps/chosen": -62.5504150390625, "logps/rejected": -58.739418029785156, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0005882784607820213, "rewards/margins": 7.251681381603703e-05, "rewards/rejected": 0.0005157616687938571, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.3046875, "learning_rate": 5.163511187607573e-07, "logits/chosen": -3.532723903656006, "logits/rejected": -3.5259487628936768, "logps/chosen": -67.88531494140625, "logps/rejected": -62.6588134765625, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0007649865001440048, "rewards/margins": 1.9654471543617547e-06, "rewards/rejected": 0.0007630210602656007, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.2578125, "learning_rate": 6.024096385542169e-07, "logits/chosen": -3.5082130432128906, "logits/rejected": -3.5031533241271973, "logps/chosen": -64.43404388427734, "logps/rejected": -61.06683349609375, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0014903316041454673, "rewards/margins": 9.859764395514503e-05, "rewards/rejected": 0.0013917339965701103, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.2734375, "learning_rate": 6.884681583476764e-07, "logits/chosen": -3.543382167816162, "logits/rejected": -3.536128282546997, "logps/chosen": -69.94771575927734, "logps/rejected": -62.211883544921875, "loss": 0.6929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0019744255114346743, "rewards/margins": 0.0004983833059668541, "rewards/rejected": 0.0014760419726371765, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.26171875, "learning_rate": 7.745266781411361e-07, "logits/chosen": -3.510272264480591, "logits/rejected": -3.5065231323242188, "logps/chosen": -64.821044921875, "logps/rejected": -60.5690803527832, "loss": 0.693, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.002713029272854328, "rewards/margins": 0.00037466129288077354, "rewards/rejected": 0.0023383679799735546, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.263671875, "learning_rate": 8.605851979345956e-07, "logits/chosen": -3.534849166870117, "logits/rejected": -3.5216782093048096, "logps/chosen": -66.38619232177734, "logps/rejected": -57.2902717590332, "loss": 0.6926, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003791350871324539, "rewards/margins": 0.0010863130446523428, "rewards/rejected": 0.0027050375938415527, "step": 100 }, { "epoch": 0.02, "eval_logits/chosen": -3.4979639053344727, "eval_logits/rejected": -3.4962334632873535, "eval_logps/chosen": -70.9185791015625, "eval_logps/rejected": -74.63919830322266, "eval_loss": 0.6930303573608398, "eval_rewards/accuracies": 0.5192843675613403, "eval_rewards/chosen": 0.004914432298392057, "eval_rewards/margins": 0.00023599098494742066, "eval_rewards/rejected": 0.00467844121158123, "eval_runtime": 486.9073, "eval_samples_per_second": 8.839, "eval_steps_per_second": 1.105, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.291015625, "learning_rate": 9.466437177280551e-07, "logits/chosen": -3.5418262481689453, "logits/rejected": -3.537301540374756, "logps/chosen": -66.12630462646484, "logps/rejected": -60.84473419189453, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0047551849856972694, "rewards/margins": 0.0006528754602186382, "rewards/rejected": 0.004102309234440327, "step": 110 }, { "epoch": 0.02, "grad_norm": 0.2890625, "learning_rate": 1.0327022375215146e-06, "logits/chosen": -3.5191383361816406, "logits/rejected": -3.515841007232666, "logps/chosen": -62.40716552734375, "logps/rejected": -63.18220138549805, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004575005732476711, "rewards/margins": 0.0005694165593013167, "rewards/rejected": 0.004005589056760073, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.251953125, "learning_rate": 1.1187607573149743e-06, "logits/chosen": -3.5376014709472656, "logits/rejected": -3.5348377227783203, "logps/chosen": -66.56596374511719, "logps/rejected": -61.85821533203125, "loss": 0.6928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.005736568011343479, "rewards/margins": 0.0006158509640954435, "rewards/rejected": 0.0051207165233790874, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.27734375, "learning_rate": 1.2048192771084338e-06, "logits/chosen": -3.5332303047180176, "logits/rejected": -3.528048038482666, "logps/chosen": -63.472206115722656, "logps/rejected": -60.507774353027344, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.005915635731071234, "rewards/margins": 0.0006704704137519002, "rewards/rejected": 0.005245164968073368, "step": 140 }, { "epoch": 0.03, "grad_norm": 0.26953125, "learning_rate": 1.2908777969018935e-06, "logits/chosen": -3.506129026412964, "logits/rejected": -3.502753496170044, "logps/chosen": -62.16828155517578, "logps/rejected": -59.287452697753906, "loss": 0.6925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00653434032574296, "rewards/margins": 0.0012093739351257682, "rewards/rejected": 0.00532496627420187, "step": 150 }, { "epoch": 0.03, "grad_norm": 0.2431640625, "learning_rate": 1.3769363166953528e-06, "logits/chosen": -3.504394054412842, "logits/rejected": -3.5007827281951904, "logps/chosen": -63.689369201660156, "logps/rejected": -62.33386993408203, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007361248135566711, "rewards/margins": 0.0008054021745920181, "rewards/rejected": 0.006555846426635981, "step": 160 }, { "epoch": 0.03, "grad_norm": 0.2734375, "learning_rate": 1.4629948364888125e-06, "logits/chosen": -3.5105483531951904, "logits/rejected": -3.506767749786377, "logps/chosen": -64.63548278808594, "logps/rejected": -58.397705078125, "loss": 0.6925, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.008624577894806862, "rewards/margins": 0.00120867311488837, "rewards/rejected": 0.007415904197841883, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.29296875, "learning_rate": 1.5490533562822722e-06, "logits/chosen": -3.5382277965545654, "logits/rejected": -3.5333220958709717, "logps/chosen": -64.91260528564453, "logps/rejected": -59.960670471191406, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00788290984928608, "rewards/margins": 0.0009173898724839091, "rewards/rejected": 0.006965520326048136, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.306640625, "learning_rate": 1.6351118760757316e-06, "logits/chosen": -3.517756700515747, "logits/rejected": -3.5076632499694824, "logps/chosen": -68.32835388183594, "logps/rejected": -59.00849151611328, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": 0.010297342203557491, "rewards/margins": 0.003029712475836277, "rewards/rejected": 0.007267629262059927, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.28125, "learning_rate": 1.7211703958691911e-06, "logits/chosen": -3.505796432495117, "logits/rejected": -3.5029215812683105, "logps/chosen": -63.019500732421875, "logps/rejected": -61.403404235839844, "loss": 0.6919, "rewards/accuracies": 0.59375, "rewards/chosen": 0.010612553916871548, "rewards/margins": 0.0024268892593681812, "rewards/rejected": 0.008185665123164654, "step": 200 }, { "epoch": 0.03, "eval_logits/chosen": -3.49253511428833, "eval_logits/rejected": -3.4908037185668945, "eval_logps/chosen": -69.95048522949219, "eval_logps/rejected": -73.75398254394531, "eval_loss": 0.6926223635673523, "eval_rewards/accuracies": 0.5678438544273376, "eval_rewards/chosen": 0.014595309272408485, "eval_rewards/margins": 0.0010647153249010444, "eval_rewards/rejected": 0.013530593365430832, "eval_runtime": 483.8947, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 200 }, { "epoch": 0.04, "grad_norm": 0.259765625, "learning_rate": 1.8072289156626508e-06, "logits/chosen": -3.5087637901306152, "logits/rejected": -3.5072245597839355, "logps/chosen": -62.840614318847656, "logps/rejected": -63.97832107543945, "loss": 0.6918, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.011572018265724182, "rewards/margins": 0.0027770684100687504, "rewards/rejected": 0.008794950321316719, "step": 210 }, { "epoch": 0.04, "grad_norm": 0.2578125, "learning_rate": 1.8932874354561103e-06, "logits/chosen": -3.502013683319092, "logits/rejected": -3.494791030883789, "logps/chosen": -62.28466796875, "logps/rejected": -58.1067008972168, "loss": 0.6912, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.014032213017344475, "rewards/margins": 0.0038597029633820057, "rewards/rejected": 0.010172510519623756, "step": 220 }, { "epoch": 0.04, "grad_norm": 0.271484375, "learning_rate": 1.9793459552495696e-06, "logits/chosen": -3.515824794769287, "logits/rejected": -3.512291431427002, "logps/chosen": -57.69233322143555, "logps/rejected": -56.74806594848633, "loss": 0.6917, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.014439724385738373, "rewards/margins": 0.0030108988285064697, "rewards/rejected": 0.011428825557231903, "step": 230 }, { "epoch": 0.04, "grad_norm": 0.255859375, "learning_rate": 2.0654044750430293e-06, "logits/chosen": -3.5206668376922607, "logits/rejected": -3.5107674598693848, "logps/chosen": -64.91423034667969, "logps/rejected": -59.14391326904297, "loss": 0.6907, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.016033673658967018, "rewards/margins": 0.004976017866283655, "rewards/rejected": 0.01105765625834465, "step": 240 }, { "epoch": 0.04, "grad_norm": 0.271484375, "learning_rate": 2.151462994836489e-06, "logits/chosen": -3.505598783493042, "logits/rejected": -3.502315044403076, "logps/chosen": -61.88445281982422, "logps/rejected": -57.96600341796875, "loss": 0.6914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.018100310117006302, "rewards/margins": 0.003605439094826579, "rewards/rejected": 0.014494871720671654, "step": 250 }, { "epoch": 0.04, "grad_norm": 0.271484375, "learning_rate": 2.2375215146299486e-06, "logits/chosen": -3.5090572834014893, "logits/rejected": -3.5079474449157715, "logps/chosen": -63.0159912109375, "logps/rejected": -62.857337951660156, "loss": 0.6894, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.018999049440026283, "rewards/margins": 0.007648964412510395, "rewards/rejected": 0.011350083164870739, "step": 260 }, { "epoch": 0.05, "grad_norm": 0.267578125, "learning_rate": 2.323580034423408e-06, "logits/chosen": -3.501816987991333, "logits/rejected": -3.497560501098633, "logps/chosen": -60.06858444213867, "logps/rejected": -59.70167922973633, "loss": 0.69, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.02066418156027794, "rewards/margins": 0.0063255527056753635, "rewards/rejected": 0.014338627457618713, "step": 270 }, { "epoch": 0.05, "grad_norm": 0.28125, "learning_rate": 2.4096385542168676e-06, "logits/chosen": -3.5297698974609375, "logits/rejected": -3.52246356010437, "logps/chosen": -66.10899353027344, "logps/rejected": -59.8133430480957, "loss": 0.6892, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.022841677069664, "rewards/margins": 0.0080425925552845, "rewards/rejected": 0.014799085445702076, "step": 280 }, { "epoch": 0.05, "grad_norm": 0.287109375, "learning_rate": 2.4956970740103273e-06, "logits/chosen": -3.5089409351348877, "logits/rejected": -3.506542682647705, "logps/chosen": -64.64678192138672, "logps/rejected": -60.837615966796875, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.020989563316106796, "rewards/margins": 0.005816182587295771, "rewards/rejected": 0.015173378400504589, "step": 290 }, { "epoch": 0.05, "grad_norm": 0.283203125, "learning_rate": 2.581755593803787e-06, "logits/chosen": -3.509040355682373, "logits/rejected": -3.5068492889404297, "logps/chosen": -62.775413513183594, "logps/rejected": -60.65922927856445, "loss": 0.6888, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.023325210437178612, "rewards/margins": 0.008827079087495804, "rewards/rejected": 0.014498132281005383, "step": 300 }, { "epoch": 0.05, "eval_logits/chosen": -3.486070156097412, "eval_logits/rejected": -3.4843297004699707, "eval_logps/chosen": -67.8994369506836, "eval_logps/rejected": -72.02379608154297, "eval_loss": 0.6910805106163025, "eval_rewards/accuracies": 0.5748141407966614, "eval_rewards/chosen": 0.035105764865875244, "eval_rewards/margins": 0.004273186903446913, "eval_rewards/rejected": 0.030832577496767044, "eval_runtime": 483.9038, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 300 }, { "epoch": 0.05, "grad_norm": 0.294921875, "learning_rate": 2.6678141135972463e-06, "logits/chosen": -3.5071263313293457, "logits/rejected": -3.506284236907959, "logps/chosen": -62.853904724121094, "logps/rejected": -59.47548294067383, "loss": 0.69, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02449950948357582, "rewards/margins": 0.006591873709112406, "rewards/rejected": 0.01790763810276985, "step": 310 }, { "epoch": 0.06, "grad_norm": 0.279296875, "learning_rate": 2.7538726333907055e-06, "logits/chosen": -3.4970526695251465, "logits/rejected": -3.4916062355041504, "logps/chosen": -64.11039733886719, "logps/rejected": -56.41526412963867, "loss": 0.6884, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.020039405673742294, "rewards/margins": 0.009693250991404057, "rewards/rejected": 0.010346156544983387, "step": 320 }, { "epoch": 0.06, "grad_norm": 0.279296875, "learning_rate": 2.8399311531841657e-06, "logits/chosen": -3.506737232208252, "logits/rejected": -3.502964496612549, "logps/chosen": -63.77069091796875, "logps/rejected": -58.83369064331055, "loss": 0.6875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.019164523109793663, "rewards/margins": 0.011498978361487389, "rewards/rejected": 0.007665542419999838, "step": 330 }, { "epoch": 0.06, "grad_norm": 0.30859375, "learning_rate": 2.925989672977625e-06, "logits/chosen": -3.5138423442840576, "logits/rejected": -3.509005308151245, "logps/chosen": -60.804222106933594, "logps/rejected": -60.57802200317383, "loss": 0.6877, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.017904791980981827, "rewards/margins": 0.011298349127173424, "rewards/rejected": 0.006606444716453552, "step": 340 }, { "epoch": 0.06, "grad_norm": 0.310546875, "learning_rate": 3.012048192771085e-06, "logits/chosen": -3.4848923683166504, "logits/rejected": -3.4764468669891357, "logps/chosen": -65.43782806396484, "logps/rejected": -62.749595642089844, "loss": 0.6885, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.014699401333928108, "rewards/margins": 0.009541595354676247, "rewards/rejected": 0.005157806910574436, "step": 350 }, { "epoch": 0.06, "grad_norm": 0.310546875, "learning_rate": 3.0981067125645443e-06, "logits/chosen": -3.5276389122009277, "logits/rejected": -3.523733139038086, "logps/chosen": -64.9666519165039, "logps/rejected": -58.31694412231445, "loss": 0.6882, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.006131452973932028, "rewards/margins": 0.010225333273410797, "rewards/rejected": -0.004093879368156195, "step": 360 }, { "epoch": 0.06, "grad_norm": 0.314453125, "learning_rate": 3.1841652323580036e-06, "logits/chosen": -3.5156378746032715, "logits/rejected": -3.5106139183044434, "logps/chosen": -64.2197494506836, "logps/rejected": -61.08530807495117, "loss": 0.6867, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.002987655345350504, "rewards/margins": 0.013294967822730541, "rewards/rejected": -0.010307312943041325, "step": 370 }, { "epoch": 0.07, "grad_norm": 0.298828125, "learning_rate": 3.2702237521514633e-06, "logits/chosen": -3.523965358734131, "logits/rejected": -3.5195980072021484, "logps/chosen": -64.2259292602539, "logps/rejected": -60.46531295776367, "loss": 0.6863, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.003907281905412674, "rewards/margins": 0.01409735344350338, "rewards/rejected": -0.010190071538090706, "step": 380 }, { "epoch": 0.07, "grad_norm": 0.3359375, "learning_rate": 3.356282271944923e-06, "logits/chosen": -3.508693218231201, "logits/rejected": -3.5038185119628906, "logps/chosen": -67.0108413696289, "logps/rejected": -63.60310745239258, "loss": 0.6856, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0033676461316645145, "rewards/margins": 0.015611497685313225, "rewards/rejected": -0.012243852019309998, "step": 390 }, { "epoch": 0.07, "grad_norm": 0.296875, "learning_rate": 3.4423407917383822e-06, "logits/chosen": -3.5082168579101562, "logits/rejected": -3.5035948753356934, "logps/chosen": -64.50901794433594, "logps/rejected": -61.93944549560547, "loss": 0.6864, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0032909319270402193, "rewards/margins": 0.01393433939665556, "rewards/rejected": -0.017225272953510284, "step": 400 }, { "epoch": 0.07, "eval_logits/chosen": -3.4826815128326416, "eval_logits/rejected": -3.4808835983276367, "eval_logps/chosen": -69.75044250488281, "eval_logps/rejected": -74.32178497314453, "eval_loss": 0.6890121698379517, "eval_rewards/accuracies": 0.5627323389053345, "eval_rewards/chosen": 0.016595730558037758, "eval_rewards/margins": 0.008743190206587315, "eval_rewards/rejected": 0.00785253755748272, "eval_runtime": 483.6906, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 400 }, { "epoch": 0.07, "grad_norm": 0.306640625, "learning_rate": 3.528399311531842e-06, "logits/chosen": -3.5057005882263184, "logits/rejected": -3.505286455154419, "logps/chosen": -60.99641036987305, "logps/rejected": -65.09056091308594, "loss": 0.6871, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.007737401872873306, "rewards/margins": 0.012748445384204388, "rewards/rejected": -0.02048584818840027, "step": 410 }, { "epoch": 0.07, "grad_norm": 0.3671875, "learning_rate": 3.6144578313253016e-06, "logits/chosen": -3.4838051795959473, "logits/rejected": -3.4827370643615723, "logps/chosen": -66.43543243408203, "logps/rejected": -64.77276611328125, "loss": 0.6903, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.015771260485053062, "rewards/margins": 0.006309186574071646, "rewards/rejected": -0.022080447524785995, "step": 420 }, { "epoch": 0.07, "grad_norm": 0.333984375, "learning_rate": 3.700516351118761e-06, "logits/chosen": -3.513059139251709, "logits/rejected": -3.5096065998077393, "logps/chosen": -65.94549560546875, "logps/rejected": -64.00807189941406, "loss": 0.6865, "rewards/accuracies": 0.625, "rewards/chosen": -0.006407345645129681, "rewards/margins": 0.013842855580151081, "rewards/rejected": -0.020250199362635612, "step": 430 }, { "epoch": 0.08, "grad_norm": 0.369140625, "learning_rate": 3.7865748709122206e-06, "logits/chosen": -3.5059781074523926, "logits/rejected": -3.498321056365967, "logps/chosen": -65.25879669189453, "logps/rejected": -62.97725296020508, "loss": 0.6843, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00661065150052309, "rewards/margins": 0.018645433709025383, "rewards/rejected": -0.02525608241558075, "step": 440 }, { "epoch": 0.08, "grad_norm": 0.322265625, "learning_rate": 3.87263339070568e-06, "logits/chosen": -3.4879813194274902, "logits/rejected": -3.4803619384765625, "logps/chosen": -66.70152282714844, "logps/rejected": -66.2663803100586, "loss": 0.6814, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0003645656688604504, "rewards/margins": 0.0243084616959095, "rewards/rejected": -0.02394389547407627, "step": 450 }, { "epoch": 0.08, "grad_norm": 0.34375, "learning_rate": 3.958691910499139e-06, "logits/chosen": -3.480090618133545, "logits/rejected": -3.4755382537841797, "logps/chosen": -68.15080261230469, "logps/rejected": -63.669464111328125, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01759023405611515, "rewards/margins": 0.012100599706172943, "rewards/rejected": -0.029690831899642944, "step": 460 }, { "epoch": 0.08, "grad_norm": 0.34765625, "learning_rate": 4.0447504302926e-06, "logits/chosen": -3.4969534873962402, "logits/rejected": -3.4901657104492188, "logps/chosen": -65.4578857421875, "logps/rejected": -62.07141876220703, "loss": 0.6826, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01021095085889101, "rewards/margins": 0.022059690207242966, "rewards/rejected": -0.0322706401348114, "step": 470 }, { "epoch": 0.08, "grad_norm": 0.384765625, "learning_rate": 4.1308089500860585e-06, "logits/chosen": -3.4784607887268066, "logits/rejected": -3.478670120239258, "logps/chosen": -65.68806457519531, "logps/rejected": -70.04661560058594, "loss": 0.6836, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.015385270118713379, "rewards/margins": 0.02035362645983696, "rewards/rejected": -0.03573889657855034, "step": 480 }, { "epoch": 0.08, "grad_norm": 0.392578125, "learning_rate": 4.216867469879519e-06, "logits/chosen": -3.473278760910034, "logits/rejected": -3.4622421264648438, "logps/chosen": -71.43934631347656, "logps/rejected": -62.09912872314453, "loss": 0.6781, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00411622179672122, "rewards/margins": 0.03165096789598465, "rewards/rejected": -0.03576719015836716, "step": 490 }, { "epoch": 0.09, "grad_norm": 0.390625, "learning_rate": 4.302925989672978e-06, "logits/chosen": -3.476792097091675, "logits/rejected": -3.4702491760253906, "logps/chosen": -67.82813262939453, "logps/rejected": -63.65681076049805, "loss": 0.6864, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01605495624244213, "rewards/margins": 0.01460212655365467, "rewards/rejected": -0.030657082796096802, "step": 500 }, { "epoch": 0.09, "eval_logits/chosen": -3.4687280654907227, "eval_logits/rejected": -3.4669294357299805, "eval_logps/chosen": -69.05587005615234, "eval_logps/rejected": -74.20924377441406, "eval_loss": 0.6864105463027954, "eval_rewards/accuracies": 0.571561336517334, "eval_rewards/chosen": 0.023541457951068878, "eval_rewards/margins": 0.014563486911356449, "eval_rewards/rejected": 0.00897796917706728, "eval_runtime": 483.8192, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 500 }, { "epoch": 0.09, "grad_norm": 0.423828125, "learning_rate": 4.388984509466438e-06, "logits/chosen": -3.469301700592041, "logits/rejected": -3.4632949829101562, "logps/chosen": -71.26062774658203, "logps/rejected": -65.58277130126953, "loss": 0.6858, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.026670118793845177, "rewards/margins": 0.01618189923465252, "rewards/rejected": -0.042852021753787994, "step": 510 }, { "epoch": 0.09, "grad_norm": 0.59375, "learning_rate": 4.475043029259897e-06, "logits/chosen": -3.4782233238220215, "logits/rejected": -3.4720966815948486, "logps/chosen": -72.25337982177734, "logps/rejected": -67.89925384521484, "loss": 0.6831, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0589582622051239, "rewards/margins": 0.021894726902246475, "rewards/rejected": -0.08085299283266068, "step": 520 }, { "epoch": 0.09, "grad_norm": 0.404296875, "learning_rate": 4.561101549053357e-06, "logits/chosen": -3.468972682952881, "logits/rejected": -3.4607043266296387, "logps/chosen": -70.99177551269531, "logps/rejected": -66.74734497070312, "loss": 0.6744, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.044935859739780426, "rewards/margins": 0.03998479247093201, "rewards/rejected": -0.08492065221071243, "step": 530 }, { "epoch": 0.09, "grad_norm": 0.435546875, "learning_rate": 4.647160068846816e-06, "logits/chosen": -3.4679083824157715, "logits/rejected": -3.4652633666992188, "logps/chosen": -70.12334442138672, "logps/rejected": -68.92913818359375, "loss": 0.6822, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05896232649683952, "rewards/margins": 0.024110907688736916, "rewards/rejected": -0.08307323604822159, "step": 540 }, { "epoch": 0.09, "grad_norm": 0.46875, "learning_rate": 4.7332185886402755e-06, "logits/chosen": -3.4796371459960938, "logits/rejected": -3.476511001586914, "logps/chosen": -68.4463119506836, "logps/rejected": -67.37413024902344, "loss": 0.676, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04435231536626816, "rewards/margins": 0.037112440913915634, "rewards/rejected": -0.08146476745605469, "step": 550 }, { "epoch": 0.1, "grad_norm": 0.5546875, "learning_rate": 4.819277108433735e-06, "logits/chosen": -3.4870948791503906, "logits/rejected": -3.486546754837036, "logps/chosen": -67.91651916503906, "logps/rejected": -71.3250961303711, "loss": 0.6749, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04496331140398979, "rewards/margins": 0.03978149592876434, "rewards/rejected": -0.08474480360746384, "step": 560 }, { "epoch": 0.1, "grad_norm": 0.455078125, "learning_rate": 4.905335628227195e-06, "logits/chosen": -3.4745421409606934, "logits/rejected": -3.4724769592285156, "logps/chosen": -66.85447692871094, "logps/rejected": -70.38420867919922, "loss": 0.6793, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04901731759309769, "rewards/margins": 0.03062910959124565, "rewards/rejected": -0.07964642345905304, "step": 570 }, { "epoch": 0.1, "grad_norm": 0.498046875, "learning_rate": 4.991394148020655e-06, "logits/chosen": -3.474553346633911, "logits/rejected": -3.473367214202881, "logps/chosen": -64.95735168457031, "logps/rejected": -66.52529907226562, "loss": 0.6809, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04008790850639343, "rewards/margins": 0.026910793036222458, "rewards/rejected": -0.06699870526790619, "step": 580 }, { "epoch": 0.1, "grad_norm": 0.62109375, "learning_rate": 4.999963354556567e-06, "logits/chosen": -3.468188524246216, "logits/rejected": -3.4631600379943848, "logps/chosen": -66.08407592773438, "logps/rejected": -68.20244598388672, "loss": 0.6766, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012904520146548748, "rewards/margins": 0.03578165918588638, "rewards/rejected": -0.04868617653846741, "step": 590 }, { "epoch": 0.1, "grad_norm": 0.5546875, "learning_rate": 4.9998366803288885e-06, "logits/chosen": -3.458890438079834, "logits/rejected": -3.4556713104248047, "logps/chosen": -68.03291320800781, "logps/rejected": -70.33735656738281, "loss": 0.6729, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.02153713069856167, "rewards/margins": 0.043937359005212784, "rewards/rejected": -0.0654744952917099, "step": 600 }, { "epoch": 0.1, "eval_logits/chosen": -3.450634479522705, "eval_logits/rejected": -3.448904514312744, "eval_logps/chosen": -71.3561782836914, "eval_logps/rejected": -77.16294860839844, "eval_loss": 0.6836954951286316, "eval_rewards/accuracies": 0.5868958830833435, "eval_rewards/chosen": 0.0005383504321798682, "eval_rewards/margins": 0.021097427234053612, "eval_rewards/rejected": -0.020559076219797134, "eval_runtime": 483.8167, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 600 }, { "epoch": 0.11, "grad_norm": 0.578125, "learning_rate": 4.9996195294877135e-06, "logits/chosen": -3.468038558959961, "logits/rejected": -3.4673168659210205, "logps/chosen": -72.26264953613281, "logps/rejected": -73.00647735595703, "loss": 0.6772, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07366601377725601, "rewards/margins": 0.03525187447667122, "rewards/rejected": -0.10891789197921753, "step": 610 }, { "epoch": 0.11, "grad_norm": 0.703125, "learning_rate": 4.999311909892384e-06, "logits/chosen": -3.4714443683624268, "logits/rejected": -3.4664993286132812, "logps/chosen": -76.85301208496094, "logps/rejected": -79.18445587158203, "loss": 0.6598, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.10608841478824615, "rewards/margins": 0.07308591902256012, "rewards/rejected": -0.17917433381080627, "step": 620 }, { "epoch": 0.11, "grad_norm": 0.625, "learning_rate": 4.998913832676579e-06, "logits/chosen": -3.4586329460144043, "logits/rejected": -3.454993724822998, "logps/chosen": -78.71052551269531, "logps/rejected": -76.41368103027344, "loss": 0.6742, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13212424516677856, "rewards/margins": 0.043377432972192764, "rewards/rejected": -0.17550167441368103, "step": 630 }, { "epoch": 0.11, "grad_norm": 0.99609375, "learning_rate": 4.998425312247913e-06, "logits/chosen": -3.470881700515747, "logits/rejected": -3.467952013015747, "logps/chosen": -77.85877990722656, "logps/rejected": -81.37117767333984, "loss": 0.6766, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.15040381252765656, "rewards/margins": 0.04006998986005783, "rewards/rejected": -0.19047380983829498, "step": 640 }, { "epoch": 0.11, "grad_norm": 0.59375, "learning_rate": 4.997846366287408e-06, "logits/chosen": -3.490457057952881, "logits/rejected": -3.4913439750671387, "logps/chosen": -80.30027770996094, "logps/rejected": -80.63001251220703, "loss": 0.6962, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.17833223938941956, "rewards/margins": -0.0010680959094315767, "rewards/rejected": -0.17726415395736694, "step": 650 }, { "epoch": 0.11, "grad_norm": 0.578125, "learning_rate": 4.997177015748862e-06, "logits/chosen": -3.451584577560425, "logits/rejected": -3.4501731395721436, "logps/chosen": -75.44596862792969, "logps/rejected": -75.20318603515625, "loss": 0.673, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09660879522562027, "rewards/margins": 0.04417235776782036, "rewards/rejected": -0.14078114926815033, "step": 660 }, { "epoch": 0.12, "grad_norm": 0.5234375, "learning_rate": 4.996417284858085e-06, "logits/chosen": -3.46248197555542, "logits/rejected": -3.461444854736328, "logps/chosen": -71.69635009765625, "logps/rejected": -77.65562438964844, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": -0.07967537641525269, "rewards/margins": 0.04023570939898491, "rewards/rejected": -0.1199110895395279, "step": 670 }, { "epoch": 0.12, "grad_norm": 0.53515625, "learning_rate": 4.995567201112025e-06, "logits/chosen": -3.4511241912841797, "logits/rejected": -3.447023868560791, "logps/chosen": -73.80950927734375, "logps/rejected": -71.80841827392578, "loss": 0.6741, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0922587662935257, "rewards/margins": 0.041829634457826614, "rewards/rejected": -0.134088397026062, "step": 680 }, { "epoch": 0.12, "grad_norm": 0.68359375, "learning_rate": 4.994626795277772e-06, "logits/chosen": -3.475782871246338, "logits/rejected": -3.4695351123809814, "logps/chosen": -81.76004791259766, "logps/rejected": -76.49635314941406, "loss": 0.6668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11172202974557877, "rewards/margins": 0.058455634862184525, "rewards/rejected": -0.1701776683330536, "step": 690 }, { "epoch": 0.12, "grad_norm": 0.61328125, "learning_rate": 4.993596101391443e-06, "logits/chosen": -3.4717812538146973, "logits/rejected": -3.4656291007995605, "logps/chosen": -82.42352294921875, "logps/rejected": -81.38658905029297, "loss": 0.6745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1619960516691208, "rewards/margins": 0.04331028833985329, "rewards/rejected": -0.20530633628368378, "step": 700 }, { "epoch": 0.12, "eval_logits/chosen": -3.4486892223358154, "eval_logits/rejected": -3.446747303009033, "eval_logps/chosen": -78.93724060058594, "eval_logps/rejected": -85.99559020996094, "eval_loss": 0.6785964965820312, "eval_rewards/accuracies": 0.5954925417900085, "eval_rewards/chosen": -0.07527217268943787, "eval_rewards/margins": 0.03361326456069946, "eval_rewards/rejected": -0.10888542979955673, "eval_runtime": 483.831, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 700 }, { "epoch": 0.12, "grad_norm": 0.73828125, "learning_rate": 4.992475156756952e-06, "logits/chosen": -3.457979679107666, "logits/rejected": -3.452868938446045, "logps/chosen": -78.66218566894531, "logps/rejected": -82.35487365722656, "loss": 0.6669, "rewards/accuracies": 0.625, "rewards/chosen": -0.14062485098838806, "rewards/margins": 0.05895204097032547, "rewards/rejected": -0.19957688450813293, "step": 710 }, { "epoch": 0.12, "grad_norm": 0.703125, "learning_rate": 4.991264001944659e-06, "logits/chosen": -3.448965549468994, "logits/rejected": -3.4486708641052246, "logps/chosen": -79.89633178710938, "logps/rejected": -84.61561584472656, "loss": 0.6662, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15700937807559967, "rewards/margins": 0.06110849231481552, "rewards/rejected": -0.2181178778409958, "step": 720 }, { "epoch": 0.13, "grad_norm": 0.98828125, "learning_rate": 4.989962680789901e-06, "logits/chosen": -3.4754321575164795, "logits/rejected": -3.47139048576355, "logps/chosen": -91.14657592773438, "logps/rejected": -93.26203918457031, "loss": 0.6583, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.24363143742084503, "rewards/margins": 0.07978501170873642, "rewards/rejected": -0.32341647148132324, "step": 730 }, { "epoch": 0.13, "grad_norm": 0.90234375, "learning_rate": 4.9885712403914095e-06, "logits/chosen": -3.4460208415985107, "logits/rejected": -3.4406116008758545, "logps/chosen": -96.26573181152344, "logps/rejected": -100.246826171875, "loss": 0.6611, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3055499196052551, "rewards/margins": 0.07419048994779587, "rewards/rejected": -0.3797404170036316, "step": 740 }, { "epoch": 0.13, "grad_norm": 0.81640625, "learning_rate": 4.9870897311096e-06, "logits/chosen": -3.4695637226104736, "logits/rejected": -3.4636406898498535, "logps/chosen": -96.90727233886719, "logps/rejected": -99.74751281738281, "loss": 0.6584, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2984744906425476, "rewards/margins": 0.08067616075277328, "rewards/rejected": -0.3791506886482239, "step": 750 }, { "epoch": 0.13, "grad_norm": 0.7578125, "learning_rate": 4.985518206564751e-06, "logits/chosen": -3.433748722076416, "logits/rejected": -3.428129196166992, "logps/chosen": -82.85520935058594, "logps/rejected": -78.89864349365234, "loss": 0.6736, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.17378118634223938, "rewards/margins": 0.04856756702065468, "rewards/rejected": -0.22234873473644257, "step": 760 }, { "epoch": 0.13, "grad_norm": 0.71484375, "learning_rate": 4.983856723635067e-06, "logits/chosen": -3.450777530670166, "logits/rejected": -3.446965456008911, "logps/chosen": -74.33808898925781, "logps/rejected": -76.29521179199219, "loss": 0.6671, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10210736095905304, "rewards/margins": 0.05967769771814346, "rewards/rejected": -0.1617850363254547, "step": 770 }, { "epoch": 0.13, "grad_norm": 0.7109375, "learning_rate": 4.982105342454616e-06, "logits/chosen": -3.4446187019348145, "logits/rejected": -3.438462734222412, "logps/chosen": -78.2662353515625, "logps/rejected": -80.68006896972656, "loss": 0.6607, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.13819073140621185, "rewards/margins": 0.07206545025110245, "rewards/rejected": -0.2102561742067337, "step": 780 }, { "epoch": 0.14, "grad_norm": 0.70703125, "learning_rate": 4.980264126411153e-06, "logits/chosen": -3.4261698722839355, "logits/rejected": -3.421785354614258, "logps/chosen": -79.30335998535156, "logps/rejected": -82.90193176269531, "loss": 0.6664, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14551669359207153, "rewards/margins": 0.062169916927814484, "rewards/rejected": -0.2076866179704666, "step": 790 }, { "epoch": 0.14, "grad_norm": 0.97265625, "learning_rate": 4.97833314214383e-06, "logits/chosen": -3.4357306957244873, "logits/rejected": -3.4315574169158936, "logps/chosen": -87.78187561035156, "logps/rejected": -92.76189422607422, "loss": 0.6681, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.23072782158851624, "rewards/margins": 0.06284385174512863, "rewards/rejected": -0.29357171058654785, "step": 800 }, { "epoch": 0.14, "eval_logits/chosen": -3.416940212249756, "eval_logits/rejected": -3.415050983428955, "eval_logps/chosen": -90.19148254394531, "eval_logps/rejected": -98.6569595336914, "eval_loss": 0.6738145351409912, "eval_rewards/accuracies": 0.5954925417900085, "eval_rewards/chosen": -0.1878146529197693, "eval_rewards/margins": 0.0476844422519207, "eval_rewards/rejected": -0.23549909889698029, "eval_runtime": 484.0365, "eval_samples_per_second": 8.892, "eval_steps_per_second": 1.111, "step": 800 }, { "epoch": 0.14, "grad_norm": 0.87890625, "learning_rate": 4.9763124595407785e-06, "logits/chosen": -3.4379210472106934, "logits/rejected": -3.4322097301483154, "logps/chosen": -96.42681884765625, "logps/rejected": -98.7668228149414, "loss": 0.6686, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.29752808809280396, "rewards/margins": 0.061955541372299194, "rewards/rejected": -0.35948362946510315, "step": 810 }, { "epoch": 0.14, "grad_norm": 0.70703125, "learning_rate": 4.974202151736584e-06, "logits/chosen": -3.4384703636169434, "logits/rejected": -3.4332988262176514, "logps/chosen": -87.07771301269531, "logps/rejected": -90.78187561035156, "loss": 0.6614, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24036762118339539, "rewards/margins": 0.07609249651432037, "rewards/rejected": -0.31646016240119934, "step": 820 }, { "epoch": 0.14, "grad_norm": 0.94921875, "learning_rate": 4.972002295109638e-06, "logits/chosen": -3.4144272804260254, "logits/rejected": -3.4126338958740234, "logps/chosen": -84.09645080566406, "logps/rejected": -88.27267456054688, "loss": 0.6581, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18734727799892426, "rewards/margins": 0.08099165558815002, "rewards/rejected": -0.2683389186859131, "step": 830 }, { "epoch": 0.14, "grad_norm": 0.93359375, "learning_rate": 4.969712969279372e-06, "logits/chosen": -3.4207637310028076, "logits/rejected": -3.41292142868042, "logps/chosen": -85.66937255859375, "logps/rejected": -87.91021728515625, "loss": 0.6529, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.203573539853096, "rewards/margins": 0.09274569153785706, "rewards/rejected": -0.29631924629211426, "step": 840 }, { "epoch": 0.15, "grad_norm": 0.90234375, "learning_rate": 4.967334257103379e-06, "logits/chosen": -3.4048008918762207, "logits/rejected": -3.4047226905822754, "logps/chosen": -88.8246841430664, "logps/rejected": -97.03109741210938, "loss": 0.6693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2746245265007019, "rewards/margins": 0.06276088953018188, "rewards/rejected": -0.3373854160308838, "step": 850 }, { "epoch": 0.15, "grad_norm": 1.5546875, "learning_rate": 4.9648662446744115e-06, "logits/chosen": -3.4170143604278564, "logits/rejected": -3.4106361865997314, "logps/chosen": -86.87603759765625, "logps/rejected": -92.53409576416016, "loss": 0.6534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.23126204311847687, "rewards/margins": 0.09931255877017975, "rewards/rejected": -0.3305746018886566, "step": 860 }, { "epoch": 0.15, "grad_norm": 1.015625, "learning_rate": 4.962309021317268e-06, "logits/chosen": -3.4022164344787598, "logits/rejected": -3.4014739990234375, "logps/chosen": -83.60894775390625, "logps/rejected": -93.01090240478516, "loss": 0.6531, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.220720574259758, "rewards/margins": 0.09608150273561478, "rewards/rejected": -0.316802054643631, "step": 870 }, { "epoch": 0.15, "grad_norm": 0.99609375, "learning_rate": 4.959662679585559e-06, "logits/chosen": -3.402447462081909, "logits/rejected": -3.3985283374786377, "logps/chosen": -91.08452606201172, "logps/rejected": -96.7422103881836, "loss": 0.6504, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2804550230503082, "rewards/margins": 0.10522060096263885, "rewards/rejected": -0.3856756091117859, "step": 880 }, { "epoch": 0.15, "grad_norm": 0.921875, "learning_rate": 4.956927315258356e-06, "logits/chosen": -3.385986804962158, "logits/rejected": -3.3800766468048096, "logps/chosen": -96.92965698242188, "logps/rejected": -94.84703826904297, "loss": 0.6758, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.271820604801178, "rewards/margins": 0.051150452345609665, "rewards/rejected": -0.32297104597091675, "step": 890 }, { "epoch": 0.16, "grad_norm": 1.09375, "learning_rate": 4.9541030273367276e-06, "logits/chosen": -3.386122226715088, "logits/rejected": -3.385578155517578, "logps/chosen": -91.31932067871094, "logps/rejected": -93.32504272460938, "loss": 0.6661, "rewards/accuracies": 0.59375, "rewards/chosen": -0.26877859234809875, "rewards/margins": 0.06785809248685837, "rewards/rejected": -0.3366366922855377, "step": 900 }, { "epoch": 0.16, "eval_logits/chosen": -3.375498056411743, "eval_logits/rejected": -3.374028444290161, "eval_logps/chosen": -88.59937286376953, "eval_logps/rejected": -97.6375961303711, "eval_loss": 0.671495795249939, "eval_rewards/accuracies": 0.5922397971153259, "eval_rewards/chosen": -0.17189349234104156, "eval_rewards/margins": 0.053412046283483505, "eval_rewards/rejected": -0.22530555725097656, "eval_runtime": 483.8194, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 900 }, { "epoch": 0.16, "grad_norm": 0.93359375, "learning_rate": 4.951189918040154e-06, "logits/chosen": -3.3894095420837402, "logits/rejected": -3.3857262134552, "logps/chosen": -91.1655044555664, "logps/rejected": -99.69454193115234, "loss": 0.6582, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.29526156187057495, "rewards/margins": 0.09328372031450272, "rewards/rejected": -0.3885452151298523, "step": 910 }, { "epoch": 0.16, "grad_norm": 1.0625, "learning_rate": 4.948188092802828e-06, "logits/chosen": -3.379016160964966, "logits/rejected": -3.3721976280212402, "logps/chosen": -94.36839294433594, "logps/rejected": -92.09681701660156, "loss": 0.6691, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2924317717552185, "rewards/margins": 0.06343097984790802, "rewards/rejected": -0.35586273670196533, "step": 920 }, { "epoch": 0.16, "grad_norm": 0.9609375, "learning_rate": 4.94509766026984e-06, "logits/chosen": -3.3837249279022217, "logits/rejected": -3.3793246746063232, "logps/chosen": -87.13783264160156, "logps/rejected": -95.68738555908203, "loss": 0.6522, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.24055281281471252, "rewards/margins": 0.10128758102655411, "rewards/rejected": -0.34184038639068604, "step": 930 }, { "epoch": 0.16, "grad_norm": 0.9296875, "learning_rate": 4.941918732293246e-06, "logits/chosen": -3.3826732635498047, "logits/rejected": -3.3760414123535156, "logps/chosen": -101.3785171508789, "logps/rejected": -102.48109436035156, "loss": 0.6574, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3002975881099701, "rewards/margins": 0.0871744379401207, "rewards/rejected": -0.3874720633029938, "step": 940 }, { "epoch": 0.16, "grad_norm": 0.94921875, "learning_rate": 4.9386514239280156e-06, "logits/chosen": -3.3428120613098145, "logits/rejected": -3.3408846855163574, "logps/chosen": -104.9477767944336, "logps/rejected": -110.55946350097656, "loss": 0.6662, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3990733325481415, "rewards/margins": 0.07994810491800308, "rewards/rejected": -0.47902145981788635, "step": 950 }, { "epoch": 0.17, "grad_norm": 0.91015625, "learning_rate": 4.935295853427875e-06, "logits/chosen": -3.345571517944336, "logits/rejected": -3.3475120067596436, "logps/chosen": -93.88117218017578, "logps/rejected": -102.92850494384766, "loss": 0.6674, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.32920488715171814, "rewards/margins": 0.06851590424776077, "rewards/rejected": -0.3977208137512207, "step": 960 }, { "epoch": 0.17, "grad_norm": 1.140625, "learning_rate": 4.9318521422410186e-06, "logits/chosen": -3.3579020500183105, "logits/rejected": -3.353294849395752, "logps/chosen": -101.0849838256836, "logps/rejected": -95.56061553955078, "loss": 0.6744, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.307528555393219, "rewards/margins": 0.05522537976503372, "rewards/rejected": -0.3627539277076721, "step": 970 }, { "epoch": 0.17, "grad_norm": 0.8828125, "learning_rate": 4.928320415005718e-06, "logits/chosen": -3.3926453590393066, "logits/rejected": -3.3888983726501465, "logps/chosen": -90.27783203125, "logps/rejected": -94.86637878417969, "loss": 0.6541, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.22734880447387695, "rewards/margins": 0.09869574755430222, "rewards/rejected": -0.3260445296764374, "step": 980 }, { "epoch": 0.17, "grad_norm": 1.0546875, "learning_rate": 4.924700799545815e-06, "logits/chosen": -3.3724656105041504, "logits/rejected": -3.3683433532714844, "logps/chosen": -92.61779022216797, "logps/rejected": -98.67154693603516, "loss": 0.6466, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.28296729922294617, "rewards/margins": 0.11286024749279022, "rewards/rejected": -0.3958275616168976, "step": 990 }, { "epoch": 0.17, "grad_norm": 1.3515625, "learning_rate": 4.920993426866085e-06, "logits/chosen": -3.3497300148010254, "logits/rejected": -3.3454887866973877, "logps/chosen": -116.35418701171875, "logps/rejected": -115.3893051147461, "loss": 0.6686, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4851459562778473, "rewards/margins": 0.07257985323667526, "rewards/rejected": -0.5577257871627808, "step": 1000 }, { "epoch": 0.17, "eval_logits/chosen": -3.3483028411865234, "eval_logits/rejected": -3.3466553688049316, "eval_logps/chosen": -111.16059112548828, "eval_logps/rejected": -121.91670227050781, "eval_loss": 0.6680831909179688, "eval_rewards/accuracies": 0.5936338305473328, "eval_rewards/chosen": -0.39750567078590393, "eval_rewards/margins": 0.07059081643819809, "eval_rewards/rejected": -0.4680964946746826, "eval_runtime": 483.8996, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 1000 }, { "epoch": 0.17, "grad_norm": 1.609375, "learning_rate": 4.917198431147504e-06, "logits/chosen": -3.3379311561584473, "logits/rejected": -3.3369412422180176, "logps/chosen": -116.28370666503906, "logps/rejected": -136.38829040527344, "loss": 0.6223, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5286880731582642, "rewards/margins": 0.19197988510131836, "rewards/rejected": -0.7206679582595825, "step": 1010 }, { "epoch": 0.18, "grad_norm": 1.3203125, "learning_rate": 4.91331594974239e-06, "logits/chosen": -3.3675708770751953, "logits/rejected": -3.3619797229766846, "logps/chosen": -126.566650390625, "logps/rejected": -133.0839385986328, "loss": 0.6425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5853036642074585, "rewards/margins": 0.12873545289039612, "rewards/rejected": -0.714039146900177, "step": 1020 }, { "epoch": 0.18, "grad_norm": 1.1953125, "learning_rate": 4.90934612316943e-06, "logits/chosen": -3.3567848205566406, "logits/rejected": -3.352221965789795, "logps/chosen": -112.8319320678711, "logps/rejected": -117.05516052246094, "loss": 0.6595, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.47556638717651367, "rewards/margins": 0.09899481385946274, "rewards/rejected": -0.5745611190795898, "step": 1030 }, { "epoch": 0.18, "grad_norm": 1.3984375, "learning_rate": 4.905289095108597e-06, "logits/chosen": -3.362544298171997, "logits/rejected": -3.360288619995117, "logps/chosen": -109.68827819824219, "logps/rejected": -116.13682556152344, "loss": 0.6821, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4699419140815735, "rewards/margins": 0.05295227840542793, "rewards/rejected": -0.5228942632675171, "step": 1040 }, { "epoch": 0.18, "grad_norm": 1.3125, "learning_rate": 4.901145012395945e-06, "logits/chosen": -3.340716600418091, "logits/rejected": -3.3355400562286377, "logps/chosen": -110.9808578491211, "logps/rejected": -114.1646728515625, "loss": 0.6485, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.43013253808021545, "rewards/margins": 0.12207241356372833, "rewards/rejected": -0.552204966545105, "step": 1050 }, { "epoch": 0.18, "grad_norm": 1.3828125, "learning_rate": 4.8969140250183036e-06, "logits/chosen": -3.3481738567352295, "logits/rejected": -3.344827175140381, "logps/chosen": -110.7804183959961, "logps/rejected": -113.61982727050781, "loss": 0.6655, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4308958649635315, "rewards/margins": 0.07684727013111115, "rewards/rejected": -0.5077431201934814, "step": 1060 }, { "epoch": 0.18, "grad_norm": 1.09375, "learning_rate": 4.892596286107838e-06, "logits/chosen": -3.381743907928467, "logits/rejected": -3.3770880699157715, "logps/chosen": -109.32756042480469, "logps/rejected": -108.6933822631836, "loss": 0.6764, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.41801881790161133, "rewards/margins": 0.06484386324882507, "rewards/rejected": -0.4828626215457916, "step": 1070 }, { "epoch": 0.19, "grad_norm": 0.859375, "learning_rate": 4.888191951936516e-06, "logits/chosen": -3.3660483360290527, "logits/rejected": -3.3634610176086426, "logps/chosen": -104.37919616699219, "logps/rejected": -105.4195785522461, "loss": 0.6605, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3800170421600342, "rewards/margins": 0.0873311385512352, "rewards/rejected": -0.46734818816185, "step": 1080 }, { "epoch": 0.19, "grad_norm": 1.0078125, "learning_rate": 4.883701181910447e-06, "logits/chosen": -3.3472914695739746, "logits/rejected": -3.345745801925659, "logps/chosen": -100.11200714111328, "logps/rejected": -107.61392974853516, "loss": 0.6578, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3755348324775696, "rewards/margins": 0.09375335276126862, "rewards/rejected": -0.469288170337677, "step": 1090 }, { "epoch": 0.19, "grad_norm": 1.1328125, "learning_rate": 4.879124138564116e-06, "logits/chosen": -3.357037305831909, "logits/rejected": -3.356238842010498, "logps/chosen": -97.80937194824219, "logps/rejected": -106.3392333984375, "loss": 0.665, "rewards/accuracies": 0.59375, "rewards/chosen": -0.35469111800193787, "rewards/margins": 0.08345221728086472, "rewards/rejected": -0.4381433427333832, "step": 1100 }, { "epoch": 0.19, "eval_logits/chosen": -3.3477072715759277, "eval_logits/rejected": -3.3463261127471924, "eval_logps/chosen": -92.17500305175781, "eval_logps/rejected": -101.6746597290039, "eval_loss": 0.6707614064216614, "eval_rewards/accuracies": 0.5950278639793396, "eval_rewards/chosen": -0.20764988660812378, "eval_rewards/margins": 0.058026209473609924, "eval_rewards/rejected": -0.2656761109828949, "eval_runtime": 483.5387, "eval_samples_per_second": 8.901, "eval_steps_per_second": 1.113, "step": 1100 }, { "epoch": 0.19, "grad_norm": 1.2578125, "learning_rate": 4.874460987554495e-06, "logits/chosen": -3.361199140548706, "logits/rejected": -3.363349437713623, "logps/chosen": -93.87645721435547, "logps/rejected": -103.97225189208984, "loss": 0.6512, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30472373962402344, "rewards/margins": 0.10587634146213531, "rewards/rejected": -0.41060003638267517, "step": 1110 }, { "epoch": 0.19, "grad_norm": 1.0, "learning_rate": 4.869711897655058e-06, "logits/chosen": -3.3703665733337402, "logits/rejected": -3.3666203022003174, "logps/chosen": -95.50496673583984, "logps/rejected": -99.84251403808594, "loss": 0.6539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.287777841091156, "rewards/margins": 0.09858438372612, "rewards/rejected": -0.386362224817276, "step": 1120 }, { "epoch": 0.19, "grad_norm": 1.1015625, "learning_rate": 4.864877040749659e-06, "logits/chosen": -3.348633289337158, "logits/rejected": -3.344589948654175, "logps/chosen": -96.18132019042969, "logps/rejected": -108.21700286865234, "loss": 0.6387, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3397810161113739, "rewards/margins": 0.13241739571094513, "rewards/rejected": -0.4721984267234802, "step": 1130 }, { "epoch": 0.2, "grad_norm": 1.3203125, "learning_rate": 4.859956591826323e-06, "logits/chosen": -3.3522610664367676, "logits/rejected": -3.347743272781372, "logps/chosen": -113.341064453125, "logps/rejected": -117.01106262207031, "loss": 0.6554, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4580538868904114, "rewards/margins": 0.10810957849025726, "rewards/rejected": -0.5661634802818298, "step": 1140 }, { "epoch": 0.2, "grad_norm": 1.3515625, "learning_rate": 4.854950728970905e-06, "logits/chosen": -3.325697660446167, "logits/rejected": -3.3218960762023926, "logps/chosen": -110.71246337890625, "logps/rejected": -122.715576171875, "loss": 0.6389, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.46634015440940857, "rewards/margins": 0.1538677215576172, "rewards/rejected": -0.6202079057693481, "step": 1150 }, { "epoch": 0.2, "grad_norm": 1.46875, "learning_rate": 4.849859633360649e-06, "logits/chosen": -3.3398876190185547, "logits/rejected": -3.3386433124542236, "logps/chosen": -107.8994140625, "logps/rejected": -120.6123275756836, "loss": 0.6295, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41763836145401, "rewards/margins": 0.16287484765052795, "rewards/rejected": -0.5805131793022156, "step": 1160 }, { "epoch": 0.2, "grad_norm": 1.5234375, "learning_rate": 4.84468348925763e-06, "logits/chosen": -3.3097126483917236, "logits/rejected": -3.3056182861328125, "logps/chosen": -118.73319244384766, "logps/rejected": -131.2072296142578, "loss": 0.6517, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5627143383026123, "rewards/margins": 0.12967202067375183, "rewards/rejected": -0.6923863291740417, "step": 1170 }, { "epoch": 0.2, "grad_norm": 2.0, "learning_rate": 4.83942248400208e-06, "logits/chosen": -3.298457384109497, "logits/rejected": -3.293428421020508, "logps/chosen": -129.34249877929688, "logps/rejected": -134.14891052246094, "loss": 0.6615, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6600443124771118, "rewards/margins": 0.1017390713095665, "rewards/rejected": -0.7617834210395813, "step": 1180 }, { "epoch": 0.21, "grad_norm": 1.6953125, "learning_rate": 4.834076808005615e-06, "logits/chosen": -3.3326334953308105, "logits/rejected": -3.327303409576416, "logps/chosen": -130.8115692138672, "logps/rejected": -134.27078247070312, "loss": 0.645, "rewards/accuracies": 0.625, "rewards/chosen": -0.625819206237793, "rewards/margins": 0.14104318618774414, "rewards/rejected": -0.7668623924255371, "step": 1190 }, { "epoch": 0.21, "grad_norm": 1.2578125, "learning_rate": 4.828646654744338e-06, "logits/chosen": -3.317185640335083, "logits/rejected": -3.3153128623962402, "logps/chosen": -117.51283264160156, "logps/rejected": -124.31459045410156, "loss": 0.6549, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5366128087043762, "rewards/margins": 0.11689277738332748, "rewards/rejected": -0.6535056233406067, "step": 1200 }, { "epoch": 0.21, "eval_logits/chosen": -3.317342758178711, "eval_logits/rejected": -3.3158934116363525, "eval_logps/chosen": -107.33206176757812, "eval_logps/rejected": -119.39058685302734, "eval_loss": 0.6630815267562866, "eval_rewards/accuracies": 0.5973513126373291, "eval_rewards/chosen": -0.3592205047607422, "eval_rewards/margins": 0.08361494541168213, "eval_rewards/rejected": -0.4428354501724243, "eval_runtime": 483.8876, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 1200 }, { "epoch": 0.21, "grad_norm": 1.4375, "learning_rate": 4.82313222075184e-06, "logits/chosen": -3.3175208568573, "logits/rejected": -3.3128483295440674, "logps/chosen": -119.05058288574219, "logps/rejected": -130.1739044189453, "loss": 0.6404, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5425583124160767, "rewards/margins": 0.14496661722660065, "rewards/rejected": -0.6875249147415161, "step": 1210 }, { "epoch": 0.21, "grad_norm": 1.3515625, "learning_rate": 4.8175337056120844e-06, "logits/chosen": -3.318523406982422, "logits/rejected": -3.313075542449951, "logps/chosen": -114.37835693359375, "logps/rejected": -130.38021850585938, "loss": 0.6252, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5133898258209229, "rewards/margins": 0.1854238063097, "rewards/rejected": -0.6988136172294617, "step": 1220 }, { "epoch": 0.21, "grad_norm": 1.7578125, "learning_rate": 4.811851311952185e-06, "logits/chosen": -3.3374500274658203, "logits/rejected": -3.3314366340637207, "logps/chosen": -117.9470443725586, "logps/rejected": -122.54820251464844, "loss": 0.6514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5068861246109009, "rewards/margins": 0.11951172351837158, "rewards/rejected": -0.6263978481292725, "step": 1230 }, { "epoch": 0.21, "grad_norm": 1.5859375, "learning_rate": 4.80608524543507e-06, "logits/chosen": -3.31003999710083, "logits/rejected": -3.307063579559326, "logps/chosen": -114.16108703613281, "logps/rejected": -123.75483703613281, "loss": 0.6615, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.516891360282898, "rewards/margins": 0.11133322864770889, "rewards/rejected": -0.6282245516777039, "step": 1240 }, { "epoch": 0.22, "grad_norm": 1.5703125, "learning_rate": 4.800235714752042e-06, "logits/chosen": -3.3228580951690674, "logits/rejected": -3.317410707473755, "logps/chosen": -107.6076431274414, "logps/rejected": -112.98020935058594, "loss": 0.655, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4211878776550293, "rewards/margins": 0.10868742316961288, "rewards/rejected": -0.5298753380775452, "step": 1250 }, { "epoch": 0.22, "grad_norm": 1.5703125, "learning_rate": 4.7943029316152235e-06, "logits/chosen": -3.303680896759033, "logits/rejected": -3.296949863433838, "logps/chosen": -109.7947769165039, "logps/rejected": -117.12117004394531, "loss": 0.6462, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4526275098323822, "rewards/margins": 0.13787886500358582, "rewards/rejected": -0.590506374835968, "step": 1260 }, { "epoch": 0.22, "grad_norm": 1.6796875, "learning_rate": 4.788287110749892e-06, "logits/chosen": -3.304987668991089, "logits/rejected": -3.3044636249542236, "logps/chosen": -117.20857238769531, "logps/rejected": -128.5338134765625, "loss": 0.6531, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5425751805305481, "rewards/margins": 0.11775865405797958, "rewards/rejected": -0.6603338122367859, "step": 1270 }, { "epoch": 0.22, "grad_norm": 1.5703125, "learning_rate": 4.782188469886711e-06, "logits/chosen": -3.3322014808654785, "logits/rejected": -3.3311448097229004, "logps/chosen": -118.02180480957031, "logps/rejected": -141.24038696289062, "loss": 0.6264, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5545832514762878, "rewards/margins": 0.1757308542728424, "rewards/rejected": -0.7303141355514526, "step": 1280 }, { "epoch": 0.22, "grad_norm": 1.65625, "learning_rate": 4.776007229753847e-06, "logits/chosen": -3.297036647796631, "logits/rejected": -3.292588710784912, "logps/chosen": -134.52230834960938, "logps/rejected": -141.50653076171875, "loss": 0.6549, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6898329854011536, "rewards/margins": 0.13372239470481873, "rewards/rejected": -0.8235553503036499, "step": 1290 }, { "epoch": 0.22, "grad_norm": 1.609375, "learning_rate": 4.7697436140689894e-06, "logits/chosen": -3.283554792404175, "logits/rejected": -3.280672788619995, "logps/chosen": -129.9937744140625, "logps/rejected": -145.84243774414062, "loss": 0.6536, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7106422185897827, "rewards/margins": 0.13219983875751495, "rewards/rejected": -0.8428421020507812, "step": 1300 }, { "epoch": 0.22, "eval_logits/chosen": -3.2736623287200928, "eval_logits/rejected": -3.2721986770629883, "eval_logps/chosen": -121.81114959716797, "eval_logps/rejected": -135.54385375976562, "eval_loss": 0.6590760946273804, "eval_rewards/accuracies": 0.597815990447998, "eval_rewards/chosen": -0.5040112733840942, "eval_rewards/margins": 0.10035695135593414, "eval_rewards/rejected": -0.6043682098388672, "eval_runtime": 483.6725, "eval_samples_per_second": 8.899, "eval_steps_per_second": 1.112, "step": 1300 }, { "epoch": 0.23, "grad_norm": 1.5859375, "learning_rate": 4.763397849531239e-06, "logits/chosen": -3.269982099533081, "logits/rejected": -3.26440167427063, "logps/chosen": -129.6372833251953, "logps/rejected": -141.26046752929688, "loss": 0.6313, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6593326330184937, "rewards/margins": 0.1772792637348175, "rewards/rejected": -0.8366119265556335, "step": 1310 }, { "epoch": 0.23, "grad_norm": 1.921875, "learning_rate": 4.756970165812914e-06, "logits/chosen": -3.2888553142547607, "logits/rejected": -3.2863876819610596, "logps/chosen": -132.07308959960938, "logps/rejected": -137.927001953125, "loss": 0.6499, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6589027643203735, "rewards/margins": 0.1327061951160431, "rewards/rejected": -0.791608989238739, "step": 1320 }, { "epoch": 0.23, "grad_norm": 1.3828125, "learning_rate": 4.750460795551235e-06, "logits/chosen": -3.291693925857544, "logits/rejected": -3.2884392738342285, "logps/chosen": -127.37693786621094, "logps/rejected": -136.66612243652344, "loss": 0.6309, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6089495420455933, "rewards/margins": 0.16879227757453918, "rewards/rejected": -0.7777417898178101, "step": 1330 }, { "epoch": 0.23, "grad_norm": 1.3984375, "learning_rate": 4.743869974339904e-06, "logits/chosen": -3.283473491668701, "logits/rejected": -3.2803750038146973, "logps/chosen": -123.50019836425781, "logps/rejected": -133.4915008544922, "loss": 0.6349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5720099806785583, "rewards/margins": 0.1526518613100052, "rewards/rejected": -0.7246618866920471, "step": 1340 }, { "epoch": 0.23, "grad_norm": 2.390625, "learning_rate": 4.737197940720577e-06, "logits/chosen": -3.2804999351501465, "logits/rejected": -3.277661085128784, "logps/chosen": -144.21261596679688, "logps/rejected": -147.98348999023438, "loss": 0.6961, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8038604855537415, "rewards/margins": 0.057847969233989716, "rewards/rejected": -0.8617083430290222, "step": 1350 }, { "epoch": 0.23, "grad_norm": 1.6796875, "learning_rate": 4.730444936174233e-06, "logits/chosen": -3.2664542198181152, "logits/rejected": -3.265286922454834, "logps/chosen": -131.89950561523438, "logps/rejected": -141.6063232421875, "loss": 0.6582, "rewards/accuracies": 0.625, "rewards/chosen": -0.655137836933136, "rewards/margins": 0.12065533548593521, "rewards/rejected": -0.775793194770813, "step": 1360 }, { "epoch": 0.24, "grad_norm": 1.78125, "learning_rate": 4.723611205112431e-06, "logits/chosen": -3.2844204902648926, "logits/rejected": -3.2812061309814453, "logps/chosen": -126.83049011230469, "logps/rejected": -141.07415771484375, "loss": 0.6365, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6328147649765015, "rewards/margins": 0.1619427502155304, "rewards/rejected": -0.794757604598999, "step": 1370 }, { "epoch": 0.24, "grad_norm": 1.578125, "learning_rate": 4.716696994868467e-06, "logits/chosen": -3.2731170654296875, "logits/rejected": -3.269042491912842, "logps/chosen": -130.5900115966797, "logps/rejected": -138.87216186523438, "loss": 0.6507, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6424145698547363, "rewards/margins": 0.1399935781955719, "rewards/rejected": -0.7824081182479858, "step": 1380 }, { "epoch": 0.24, "grad_norm": 1.453125, "learning_rate": 4.70970255568842e-06, "logits/chosen": -3.2995362281799316, "logits/rejected": -3.2949798107147217, "logps/chosen": -133.54818725585938, "logps/rejected": -139.49545288085938, "loss": 0.6564, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6576114892959595, "rewards/margins": 0.1290406435728073, "rewards/rejected": -0.7866522073745728, "step": 1390 }, { "epoch": 0.24, "grad_norm": 1.3359375, "learning_rate": 4.702628140722096e-06, "logits/chosen": -3.266584873199463, "logits/rejected": -3.2621452808380127, "logps/chosen": -122.13375091552734, "logps/rejected": -132.69252014160156, "loss": 0.6303, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5562669634819031, "rewards/margins": 0.16778331995010376, "rewards/rejected": -0.7240502238273621, "step": 1400 }, { "epoch": 0.24, "eval_logits/chosen": -3.279047966003418, "eval_logits/rejected": -3.2774879932403564, "eval_logps/chosen": -111.65291595458984, "eval_logps/rejected": -124.72959899902344, "eval_loss": 0.6592565774917603, "eval_rewards/accuracies": 0.6054832935333252, "eval_rewards/chosen": -0.40242883563041687, "eval_rewards/margins": 0.09379658102989197, "eval_rewards/rejected": -0.4962254762649536, "eval_runtime": 483.833, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 1400 }, { "epoch": 0.24, "grad_norm": 1.703125, "learning_rate": 4.695474006013865e-06, "logits/chosen": -3.281132459640503, "logits/rejected": -3.2756476402282715, "logps/chosen": -119.60661315917969, "logps/rejected": -133.5, "loss": 0.6398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.561919629573822, "rewards/margins": 0.16075286269187927, "rewards/rejected": -0.7226725220680237, "step": 1410 }, { "epoch": 0.24, "grad_norm": 1.4609375, "learning_rate": 4.688240410493394e-06, "logits/chosen": -3.2566521167755127, "logits/rejected": -3.2544357776641846, "logps/chosen": -121.44035339355469, "logps/rejected": -143.53671264648438, "loss": 0.6311, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6094900965690613, "rewards/margins": 0.19036082923412323, "rewards/rejected": -0.7998508214950562, "step": 1420 }, { "epoch": 0.25, "grad_norm": 2.1875, "learning_rate": 4.6809276159662785e-06, "logits/chosen": -3.2630069255828857, "logits/rejected": -3.262012004852295, "logps/chosen": -140.1363983154297, "logps/rejected": -158.3402557373047, "loss": 0.6233, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7564435005187988, "rewards/margins": 0.19494546949863434, "rewards/rejected": -0.9513890147209167, "step": 1430 }, { "epoch": 0.25, "grad_norm": 2.296875, "learning_rate": 4.673535887104561e-06, "logits/chosen": -3.2323620319366455, "logits/rejected": -3.22904896736145, "logps/chosen": -142.92724609375, "logps/rejected": -149.91787719726562, "loss": 0.6612, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.8208937644958496, "rewards/margins": 0.1224142462015152, "rewards/rejected": -0.9433080554008484, "step": 1440 }, { "epoch": 0.25, "grad_norm": 2.0, "learning_rate": 4.6660654914371575e-06, "logits/chosen": -3.2538230419158936, "logits/rejected": -3.250257968902588, "logps/chosen": -145.50843811035156, "logps/rejected": -160.82852172851562, "loss": 0.626, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8006998300552368, "rewards/margins": 0.19958093762397766, "rewards/rejected": -1.000280737876892, "step": 1450 }, { "epoch": 0.25, "grad_norm": 2.046875, "learning_rate": 4.658516699340171e-06, "logits/chosen": -3.237740993499756, "logits/rejected": -3.236027479171753, "logps/chosen": -146.0584716796875, "logps/rejected": -158.69305419921875, "loss": 0.6456, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8310154676437378, "rewards/margins": 0.17097845673561096, "rewards/rejected": -1.0019938945770264, "step": 1460 }, { "epoch": 0.25, "grad_norm": 1.53125, "learning_rate": 4.650889784027109e-06, "logits/chosen": -3.275007724761963, "logits/rejected": -3.270531415939331, "logps/chosen": -137.88790893554688, "logps/rejected": -147.54556274414062, "loss": 0.6391, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6958156824111938, "rewards/margins": 0.15993863344192505, "rewards/rejected": -0.8557542562484741, "step": 1470 }, { "epoch": 0.26, "grad_norm": 1.875, "learning_rate": 4.64318502153899e-06, "logits/chosen": -3.270411252975464, "logits/rejected": -3.2654106616973877, "logps/chosen": -133.38864135742188, "logps/rejected": -146.6710968017578, "loss": 0.646, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7125257253646851, "rewards/margins": 0.14800339937210083, "rewards/rejected": -0.8605291247367859, "step": 1480 }, { "epoch": 0.26, "grad_norm": 2.0, "learning_rate": 4.635402690734362e-06, "logits/chosen": -3.2487213611602783, "logits/rejected": -3.2444915771484375, "logps/chosen": -146.71372985839844, "logps/rejected": -153.9183807373047, "loss": 0.6525, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7878586649894714, "rewards/margins": 0.14298923313617706, "rewards/rejected": -0.9308478236198425, "step": 1490 }, { "epoch": 0.26, "grad_norm": 1.96875, "learning_rate": 4.627543073279197e-06, "logits/chosen": -3.250176191329956, "logits/rejected": -3.25103759765625, "logps/chosen": -146.10208129882812, "logps/rejected": -159.60694885253906, "loss": 0.6611, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8511150479316711, "rewards/margins": 0.13717147707939148, "rewards/rejected": -0.9882864952087402, "step": 1500 }, { "epoch": 0.26, "eval_logits/chosen": -3.2471868991851807, "eval_logits/rejected": -3.2453789710998535, "eval_logps/chosen": -132.2458038330078, "eval_logps/rejected": -148.12802124023438, "eval_loss": 0.6527114510536194, "eval_rewards/accuracies": 0.6138476133346558, "eval_rewards/chosen": -0.6083579063415527, "eval_rewards/margins": 0.12185192108154297, "eval_rewards/rejected": -0.7302098274230957, "eval_runtime": 483.8595, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 1500 }, { "epoch": 0.26, "grad_norm": 1.3203125, "learning_rate": 4.619606453636708e-06, "logits/chosen": -3.2485604286193848, "logits/rejected": -3.245424270629883, "logps/chosen": -145.904296875, "logps/rejected": -158.60733032226562, "loss": 0.6205, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7720716595649719, "rewards/margins": 0.1980876922607422, "rewards/rejected": -0.9701593518257141, "step": 1510 }, { "epoch": 0.26, "grad_norm": 1.7890625, "learning_rate": 4.611593119057047e-06, "logits/chosen": -3.263840436935425, "logits/rejected": -3.2636096477508545, "logps/chosen": -144.23703002929688, "logps/rejected": -151.71929931640625, "loss": 0.6676, "rewards/accuracies": 0.53125, "rewards/chosen": -0.8146566152572632, "rewards/margins": 0.10837093740701675, "rewards/rejected": -0.9230276942253113, "step": 1520 }, { "epoch": 0.26, "grad_norm": 2.015625, "learning_rate": 4.603503359566912e-06, "logits/chosen": -3.2455577850341797, "logits/rejected": -3.243157148361206, "logps/chosen": -143.5106658935547, "logps/rejected": -151.66490173339844, "loss": 0.6637, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8118158578872681, "rewards/margins": 0.12848272919654846, "rewards/rejected": -0.9402983784675598, "step": 1530 }, { "epoch": 0.27, "grad_norm": 2.171875, "learning_rate": 4.595337467959046e-06, "logits/chosen": -3.275472640991211, "logits/rejected": -3.269941806793213, "logps/chosen": -129.93222045898438, "logps/rejected": -141.9746856689453, "loss": 0.6267, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6406632661819458, "rewards/margins": 0.19468863308429718, "rewards/rejected": -0.8353517651557922, "step": 1540 }, { "epoch": 0.27, "grad_norm": 1.921875, "learning_rate": 4.587095739781645e-06, "logits/chosen": -3.2549495697021484, "logits/rejected": -3.2512078285217285, "logps/chosen": -123.9298095703125, "logps/rejected": -132.6895751953125, "loss": 0.6375, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5939527750015259, "rewards/margins": 0.1560622900724411, "rewards/rejected": -0.7500149607658386, "step": 1550 }, { "epoch": 0.27, "grad_norm": 1.890625, "learning_rate": 4.578778473327659e-06, "logits/chosen": -3.2367331981658936, "logits/rejected": -3.2333521842956543, "logps/chosen": -124.5437240600586, "logps/rejected": -136.7928466796875, "loss": 0.6426, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6369401216506958, "rewards/margins": 0.14378319680690765, "rewards/rejected": -0.7807233333587646, "step": 1560 }, { "epoch": 0.27, "grad_norm": 2.25, "learning_rate": 4.570385969623993e-06, "logits/chosen": -3.254424571990967, "logits/rejected": -3.2544474601745605, "logps/chosen": -132.67491149902344, "logps/rejected": -152.40585327148438, "loss": 0.6331, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6961439251899719, "rewards/margins": 0.1778034120798111, "rewards/rejected": -0.8739473223686218, "step": 1570 }, { "epoch": 0.27, "grad_norm": 1.625, "learning_rate": 4.561918532420615e-06, "logits/chosen": -3.2401375770568848, "logits/rejected": -3.2378089427948, "logps/chosen": -142.2194061279297, "logps/rejected": -158.73133850097656, "loss": 0.6443, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8008275032043457, "rewards/margins": 0.1721135675907135, "rewards/rejected": -0.9729412198066711, "step": 1580 }, { "epoch": 0.27, "grad_norm": 1.9921875, "learning_rate": 4.553376468179564e-06, "logits/chosen": -3.2497572898864746, "logits/rejected": -3.2477023601531982, "logps/chosen": -139.50601196289062, "logps/rejected": -152.8370819091797, "loss": 0.65, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7493128776550293, "rewards/margins": 0.1411186009645462, "rewards/rejected": -0.8904315233230591, "step": 1590 }, { "epoch": 0.28, "grad_norm": 1.6640625, "learning_rate": 4.544760086063856e-06, "logits/chosen": -3.2545619010925293, "logits/rejected": -3.2462317943573, "logps/chosen": -132.92030334472656, "logps/rejected": -144.53115844726562, "loss": 0.6395, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6942413449287415, "rewards/margins": 0.1516888588666916, "rewards/rejected": -0.845930278301239, "step": 1600 }, { "epoch": 0.28, "eval_logits/chosen": -3.252528190612793, "eval_logits/rejected": -3.250530958175659, "eval_logps/chosen": -126.27056121826172, "eval_logps/rejected": -141.61703491210938, "eval_loss": 0.6536267399787903, "eval_rewards/accuracies": 0.6154739856719971, "eval_rewards/chosen": -0.5486056208610535, "eval_rewards/margins": 0.11649421602487564, "eval_rewards/rejected": -0.6650997996330261, "eval_runtime": 483.7907, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 1600 }, { "epoch": 0.28, "grad_norm": 1.53125, "learning_rate": 4.536069697926291e-06, "logits/chosen": -3.2715904712677, "logits/rejected": -3.268829345703125, "logps/chosen": -142.8290557861328, "logps/rejected": -159.7598419189453, "loss": 0.6385, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7724953293800354, "rewards/margins": 0.1773601919412613, "rewards/rejected": -0.9498555064201355, "step": 1610 }, { "epoch": 0.28, "grad_norm": 2.640625, "learning_rate": 4.527305618298173e-06, "logits/chosen": -3.2432327270507812, "logits/rejected": -3.238971710205078, "logps/chosen": -154.33795166015625, "logps/rejected": -174.77117919921875, "loss": 0.6388, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9221574664115906, "rewards/margins": 0.2066594660282135, "rewards/rejected": -1.128816843032837, "step": 1620 }, { "epoch": 0.28, "grad_norm": 2.125, "learning_rate": 4.518468164377923e-06, "logits/chosen": -3.2425739765167236, "logits/rejected": -3.236532211303711, "logps/chosen": -153.01971435546875, "logps/rejected": -165.77304077148438, "loss": 0.6226, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8580877184867859, "rewards/margins": 0.2107679396867752, "rewards/rejected": -1.0688556432724, "step": 1630 }, { "epoch": 0.28, "grad_norm": 1.71875, "learning_rate": 4.5095576560195975e-06, "logits/chosen": -3.2647926807403564, "logits/rejected": -3.2611937522888184, "logps/chosen": -146.43026733398438, "logps/rejected": -149.50869750976562, "loss": 0.6614, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7866324186325073, "rewards/margins": 0.12221284210681915, "rewards/rejected": -0.9088452458381653, "step": 1640 }, { "epoch": 0.28, "grad_norm": 2.03125, "learning_rate": 4.500574415721311e-06, "logits/chosen": -3.27087664604187, "logits/rejected": -3.2663211822509766, "logps/chosen": -131.85440063476562, "logps/rejected": -147.97909545898438, "loss": 0.6225, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6372531652450562, "rewards/margins": 0.20080938935279846, "rewards/rejected": -0.838062584400177, "step": 1650 }, { "epoch": 0.29, "grad_norm": 1.4140625, "learning_rate": 4.491518768613569e-06, "logits/chosen": -3.263221025466919, "logits/rejected": -3.2580618858337402, "logps/chosen": -135.19631958007812, "logps/rejected": -139.4954071044922, "loss": 0.6525, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6873170137405396, "rewards/margins": 0.13492907583713531, "rewards/rejected": -0.8222460746765137, "step": 1660 }, { "epoch": 0.29, "grad_norm": 1.3828125, "learning_rate": 4.482391042447497e-06, "logits/chosen": -3.263942003250122, "logits/rejected": -3.2601189613342285, "logps/chosen": -129.59695434570312, "logps/rejected": -143.82839965820312, "loss": 0.6372, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6334278583526611, "rewards/margins": 0.16828377544879913, "rewards/rejected": -0.8017115592956543, "step": 1670 }, { "epoch": 0.29, "grad_norm": 1.9921875, "learning_rate": 4.473191567582975e-06, "logits/chosen": -3.239767074584961, "logits/rejected": -3.2362465858459473, "logps/chosen": -133.9244842529297, "logps/rejected": -149.71267700195312, "loss": 0.6279, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.67822265625, "rewards/margins": 0.19946426153182983, "rewards/rejected": -0.8776868581771851, "step": 1680 }, { "epoch": 0.29, "grad_norm": 1.8359375, "learning_rate": 4.46392067697669e-06, "logits/chosen": -3.2343764305114746, "logits/rejected": -3.2323060035705566, "logps/chosen": -127.72633361816406, "logps/rejected": -136.99798583984375, "loss": 0.6509, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6233143210411072, "rewards/margins": 0.14099445939064026, "rewards/rejected": -0.7643088102340698, "step": 1690 }, { "epoch": 0.29, "grad_norm": 2.15625, "learning_rate": 4.454578706170075e-06, "logits/chosen": -3.202291488647461, "logits/rejected": -3.204252243041992, "logps/chosen": -124.95072937011719, "logps/rejected": -136.50686645507812, "loss": 0.678, "rewards/accuracies": 0.59375, "rewards/chosen": -0.647601306438446, "rewards/margins": 0.07817380130290985, "rewards/rejected": -0.7257751226425171, "step": 1700 }, { "epoch": 0.29, "eval_logits/chosen": -3.2125089168548584, "eval_logits/rejected": -3.2106754779815674, "eval_logps/chosen": -117.87283325195312, "eval_logps/rejected": -131.22845458984375, "eval_loss": 0.6587392687797546, "eval_rewards/accuracies": 0.6168680191040039, "eval_rewards/chosen": -0.464628130197525, "eval_rewards/margins": 0.09658578038215637, "eval_rewards/rejected": -0.5612139701843262, "eval_runtime": 484.1087, "eval_samples_per_second": 8.891, "eval_steps_per_second": 1.111, "step": 1700 }, { "epoch": 0.29, "grad_norm": 2.40625, "learning_rate": 4.445165993277171e-06, "logits/chosen": -3.2155022621154785, "logits/rejected": -3.2090964317321777, "logps/chosen": -130.38784790039062, "logps/rejected": -139.3701629638672, "loss": 0.6437, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6083312630653381, "rewards/margins": 0.1592980921268463, "rewards/rejected": -0.7676293253898621, "step": 1710 }, { "epoch": 0.3, "grad_norm": 2.328125, "learning_rate": 4.435682878972389e-06, "logits/chosen": -3.2114365100860596, "logits/rejected": -3.206124782562256, "logps/chosen": -134.76806640625, "logps/rejected": -137.71871948242188, "loss": 0.6645, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6806883811950684, "rewards/margins": 0.10775679349899292, "rewards/rejected": -0.7884451746940613, "step": 1720 }, { "epoch": 0.3, "grad_norm": 2.515625, "learning_rate": 4.426129706478178e-06, "logits/chosen": -3.191399574279785, "logits/rejected": -3.1889612674713135, "logps/chosen": -136.0282745361328, "logps/rejected": -142.6835174560547, "loss": 0.6721, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7149096131324768, "rewards/margins": 0.10171695053577423, "rewards/rejected": -0.8166265487670898, "step": 1730 }, { "epoch": 0.3, "grad_norm": 1.6484375, "learning_rate": 4.416506821552603e-06, "logits/chosen": -3.1809914112091064, "logits/rejected": -3.1785573959350586, "logps/chosen": -129.92225646972656, "logps/rejected": -149.59298706054688, "loss": 0.6364, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6773825883865356, "rewards/margins": 0.19856294989585876, "rewards/rejected": -0.8759455680847168, "step": 1740 }, { "epoch": 0.3, "grad_norm": 1.96875, "learning_rate": 4.406814572476833e-06, "logits/chosen": -3.1837871074676514, "logits/rejected": -3.1800642013549805, "logps/chosen": -128.7941436767578, "logps/rejected": -145.5009765625, "loss": 0.6338, "rewards/accuracies": 0.625, "rewards/chosen": -0.6254864931106567, "rewards/margins": 0.17526546120643616, "rewards/rejected": -0.8007518649101257, "step": 1750 }, { "epoch": 0.3, "grad_norm": 2.046875, "learning_rate": 4.397053310042533e-06, "logits/chosen": -3.178910493850708, "logits/rejected": -3.1739954948425293, "logps/chosen": -136.34449768066406, "logps/rejected": -150.8899688720703, "loss": 0.627, "rewards/accuracies": 0.625, "rewards/chosen": -0.6980650424957275, "rewards/margins": 0.2017349898815155, "rewards/rejected": -0.8998001217842102, "step": 1760 }, { "epoch": 0.3, "grad_norm": 1.9140625, "learning_rate": 4.3872233875391715e-06, "logits/chosen": -3.1531150341033936, "logits/rejected": -3.1482579708099365, "logps/chosen": -144.26266479492188, "logps/rejected": -154.70135498046875, "loss": 0.6282, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7813176512718201, "rewards/margins": 0.20064513385295868, "rewards/rejected": -0.9819628596305847, "step": 1770 }, { "epoch": 0.31, "grad_norm": 2.328125, "learning_rate": 4.3773251607412294e-06, "logits/chosen": -3.1487860679626465, "logits/rejected": -3.147592782974243, "logps/chosen": -138.44879150390625, "logps/rejected": -167.2222442626953, "loss": 0.6012, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7416083216667175, "rewards/margins": 0.2687363028526306, "rewards/rejected": -1.0103447437286377, "step": 1780 }, { "epoch": 0.31, "grad_norm": 1.8515625, "learning_rate": 4.367358987895327e-06, "logits/chosen": -3.1224420070648193, "logits/rejected": -3.1191248893737793, "logps/chosen": -145.96359252929688, "logps/rejected": -164.42001342773438, "loss": 0.6217, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8266698122024536, "rewards/margins": 0.2106233835220337, "rewards/rejected": -1.0372931957244873, "step": 1790 }, { "epoch": 0.31, "grad_norm": 2.46875, "learning_rate": 4.3573252297072544e-06, "logits/chosen": -3.1158227920532227, "logits/rejected": -3.111827850341797, "logps/chosen": -153.73141479492188, "logps/rejected": -172.12696838378906, "loss": 0.629, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9160531163215637, "rewards/margins": 0.20312508940696716, "rewards/rejected": -1.1191781759262085, "step": 1800 }, { "epoch": 0.31, "eval_logits/chosen": -3.1113333702087402, "eval_logits/rejected": -3.1087486743927, "eval_logps/chosen": -146.88597106933594, "eval_logps/rejected": -164.902587890625, "eval_loss": 0.6488604545593262, "eval_rewards/accuracies": 0.6187267899513245, "eval_rewards/chosen": -0.7547595500946045, "eval_rewards/margins": 0.14319583773612976, "eval_rewards/rejected": -0.8979554176330566, "eval_runtime": 483.4992, "eval_samples_per_second": 8.902, "eval_steps_per_second": 1.113, "step": 1800 }, { "epoch": 0.31, "grad_norm": 2.1875, "learning_rate": 4.347224249328922e-06, "logits/chosen": -3.119220018386841, "logits/rejected": -3.1162116527557373, "logps/chosen": -164.25729370117188, "logps/rejected": -172.886962890625, "loss": 0.6747, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.96868497133255, "rewards/margins": 0.13878265023231506, "rewards/rejected": -1.107467532157898, "step": 1810 }, { "epoch": 0.31, "grad_norm": 2.484375, "learning_rate": 4.337056412345209e-06, "logits/chosen": -3.1267404556274414, "logits/rejected": -3.121772527694702, "logps/chosen": -152.7144775390625, "logps/rejected": -158.19595336914062, "loss": 0.6566, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8691015243530273, "rewards/margins": 0.1269044280052185, "rewards/rejected": -0.9960060119628906, "step": 1820 }, { "epoch": 0.32, "grad_norm": 1.9765625, "learning_rate": 4.326822086760743e-06, "logits/chosen": -3.1357274055480957, "logits/rejected": -3.128983497619629, "logps/chosen": -140.71310424804688, "logps/rejected": -157.28184509277344, "loss": 0.6349, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7482028007507324, "rewards/margins": 0.19133971631526947, "rewards/rejected": -0.9395424723625183, "step": 1830 }, { "epoch": 0.32, "grad_norm": 2.265625, "learning_rate": 4.316521642986566e-06, "logits/chosen": -3.1679399013519287, "logits/rejected": -3.1663129329681396, "logps/chosen": -146.0639190673828, "logps/rejected": -159.45327758789062, "loss": 0.6507, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8074442148208618, "rewards/margins": 0.1614806205034256, "rewards/rejected": -0.9689248204231262, "step": 1840 }, { "epoch": 0.32, "grad_norm": 1.984375, "learning_rate": 4.3061554538267444e-06, "logits/chosen": -3.163782835006714, "logits/rejected": -3.1629064083099365, "logps/chosen": -135.1237030029297, "logps/rejected": -145.4768524169922, "loss": 0.6644, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7083430290222168, "rewards/margins": 0.12782330811023712, "rewards/rejected": -0.8361663818359375, "step": 1850 }, { "epoch": 0.32, "grad_norm": 1.7734375, "learning_rate": 4.295723894464862e-06, "logits/chosen": -3.167217969894409, "logits/rejected": -3.1629233360290527, "logps/chosen": -129.897216796875, "logps/rejected": -137.09112548828125, "loss": 0.6383, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6009452939033508, "rewards/margins": 0.15543445944786072, "rewards/rejected": -0.7563797235488892, "step": 1860 }, { "epoch": 0.32, "grad_norm": 1.7109375, "learning_rate": 4.285227342450449e-06, "logits/chosen": -3.169278621673584, "logits/rejected": -3.1676392555236816, "logps/chosen": -123.1519546508789, "logps/rejected": -136.8570098876953, "loss": 0.6279, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5699488520622253, "rewards/margins": 0.19082696735858917, "rewards/rejected": -0.7607758045196533, "step": 1870 }, { "epoch": 0.32, "grad_norm": 1.7578125, "learning_rate": 4.274666177685317e-06, "logits/chosen": -3.168717622756958, "logits/rejected": -3.162923574447632, "logps/chosen": -125.75395202636719, "logps/rejected": -139.8086395263672, "loss": 0.6249, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6144317984580994, "rewards/margins": 0.192767933011055, "rewards/rejected": -0.8071997761726379, "step": 1880 }, { "epoch": 0.33, "grad_norm": 2.125, "learning_rate": 4.264040782409804e-06, "logits/chosen": -3.169412612915039, "logits/rejected": -3.1647706031799316, "logps/chosen": -125.84661865234375, "logps/rejected": -145.9022216796875, "loss": 0.6215, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6267572641372681, "rewards/margins": 0.21151788532733917, "rewards/rejected": -0.838275134563446, "step": 1890 }, { "epoch": 0.33, "grad_norm": 1.9453125, "learning_rate": 4.253351541188947e-06, "logits/chosen": -3.1573798656463623, "logits/rejected": -3.154900074005127, "logps/chosen": -139.20106506347656, "logps/rejected": -142.49725341796875, "loss": 0.6622, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7105225324630737, "rewards/margins": 0.11785916239023209, "rewards/rejected": -0.8283816576004028, "step": 1900 }, { "epoch": 0.33, "eval_logits/chosen": -3.141881227493286, "eval_logits/rejected": -3.1398515701293945, "eval_logps/chosen": -125.99919891357422, "eval_logps/rejected": -140.6699981689453, "eval_loss": 0.6555050015449524, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -0.5458918213844299, "eval_rewards/margins": 0.10973773896694183, "eval_rewards/rejected": -0.655629575252533, "eval_runtime": 483.8573, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 1900 }, { "epoch": 0.33, "grad_norm": 2.109375, "learning_rate": 4.242598840898558e-06, "logits/chosen": -3.1242589950561523, "logits/rejected": -3.121159791946411, "logps/chosen": -140.1244659423828, "logps/rejected": -156.30972290039062, "loss": 0.6312, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7592523694038391, "rewards/margins": 0.1901467740535736, "rewards/rejected": -0.9493991732597351, "step": 1910 }, { "epoch": 0.33, "grad_norm": 2.328125, "learning_rate": 4.231783070711223e-06, "logits/chosen": -3.15392804145813, "logits/rejected": -3.1520493030548096, "logps/chosen": -145.32569885253906, "logps/rejected": -157.24732971191406, "loss": 0.6421, "rewards/accuracies": 0.625, "rewards/chosen": -0.7951496243476868, "rewards/margins": 0.16277767717838287, "rewards/rejected": -0.9579272270202637, "step": 1920 }, { "epoch": 0.33, "grad_norm": 2.1875, "learning_rate": 4.22090462208222e-06, "logits/chosen": -3.1390862464904785, "logits/rejected": -3.134338617324829, "logps/chosen": -148.74644470214844, "logps/rejected": -165.2567596435547, "loss": 0.622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8294029235839844, "rewards/margins": 0.2131255865097046, "rewards/rejected": -1.042528510093689, "step": 1930 }, { "epoch": 0.33, "grad_norm": 2.046875, "learning_rate": 4.209963888735346e-06, "logits/chosen": -3.169201374053955, "logits/rejected": -3.1704461574554443, "logps/chosen": -141.8172607421875, "logps/rejected": -159.37860107421875, "loss": 0.628, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8085492849349976, "rewards/margins": 0.20195230841636658, "rewards/rejected": -1.0105016231536865, "step": 1940 }, { "epoch": 0.34, "grad_norm": 2.453125, "learning_rate": 4.198961266648671e-06, "logits/chosen": -3.156470537185669, "logits/rejected": -3.1528687477111816, "logps/chosen": -154.35397338867188, "logps/rejected": -175.27505493164062, "loss": 0.6384, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9214574098587036, "rewards/margins": 0.19601932168006897, "rewards/rejected": -1.1174767017364502, "step": 1950 }, { "epoch": 0.34, "grad_norm": 2.5625, "learning_rate": 4.187897154040205e-06, "logits/chosen": -3.1518867015838623, "logits/rejected": -3.1515769958496094, "logps/chosen": -159.7874298095703, "logps/rejected": -172.596923828125, "loss": 0.6478, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9553605318069458, "rewards/margins": 0.1520630419254303, "rewards/rejected": -1.1074237823486328, "step": 1960 }, { "epoch": 0.34, "grad_norm": 2.015625, "learning_rate": 4.176771951353481e-06, "logits/chosen": -3.169792652130127, "logits/rejected": -3.165846347808838, "logps/chosen": -151.9120330810547, "logps/rejected": -157.11441040039062, "loss": 0.6641, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8450883626937866, "rewards/margins": 0.11743706464767456, "rewards/rejected": -0.9625255465507507, "step": 1970 }, { "epoch": 0.34, "grad_norm": 2.0625, "learning_rate": 4.165586061243074e-06, "logits/chosen": -3.1747801303863525, "logits/rejected": -3.170104503631592, "logps/chosen": -153.86459350585938, "logps/rejected": -167.02542114257812, "loss": 0.6464, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8814204335212708, "rewards/margins": 0.16699686646461487, "rewards/rejected": -1.048417329788208, "step": 1980 }, { "epoch": 0.34, "grad_norm": 1.6796875, "learning_rate": 4.154339888560008e-06, "logits/chosen": -3.1978580951690674, "logits/rejected": -3.195103645324707, "logps/chosen": -152.34295654296875, "logps/rejected": -165.3287811279297, "loss": 0.6294, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9096066355705261, "rewards/margins": 0.1885077953338623, "rewards/rejected": -1.0981144905090332, "step": 1990 }, { "epoch": 0.34, "grad_norm": 2.140625, "learning_rate": 4.1430338403371275e-06, "logits/chosen": -3.1856448650360107, "logits/rejected": -3.186370372772217, "logps/chosen": -153.1247100830078, "logps/rejected": -176.1088409423828, "loss": 0.64, "rewards/accuracies": 0.625, "rewards/chosen": -0.9345780611038208, "rewards/margins": 0.1859598159790039, "rewards/rejected": -1.1205378770828247, "step": 2000 }, { "epoch": 0.34, "eval_logits/chosen": -3.1846792697906494, "eval_logits/rejected": -3.182372570037842, "eval_logps/chosen": -140.17138671875, "eval_logps/rejected": -156.38426208496094, "eval_loss": 0.6522988080978394, "eval_rewards/accuracies": 0.6101301312446594, "eval_rewards/chosen": -0.6876136660575867, "eval_rewards/margins": 0.12515859305858612, "eval_rewards/rejected": -0.8127721548080444, "eval_runtime": 483.6746, "eval_samples_per_second": 8.899, "eval_steps_per_second": 1.112, "step": 2000 }, { "epoch": 0.35, "grad_norm": 2.046875, "learning_rate": 4.131668325774343e-06, "logits/chosen": -3.1847736835479736, "logits/rejected": -3.180018424987793, "logps/chosen": -151.97802734375, "logps/rejected": -163.89077758789062, "loss": 0.6384, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8523083925247192, "rewards/margins": 0.19287896156311035, "rewards/rejected": -1.0451873540878296, "step": 2010 }, { "epoch": 0.35, "grad_norm": 2.546875, "learning_rate": 4.120243756223835e-06, "logits/chosen": -3.1633083820343018, "logits/rejected": -3.157390594482422, "logps/chosen": -151.98509216308594, "logps/rejected": -178.520751953125, "loss": 0.6136, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9141998291015625, "rewards/margins": 0.24424409866333008, "rewards/rejected": -1.1584439277648926, "step": 2020 }, { "epoch": 0.35, "grad_norm": 2.4375, "learning_rate": 4.108760545175163e-06, "logits/chosen": -3.1734821796417236, "logits/rejected": -3.169147491455078, "logps/chosen": -160.19607543945312, "logps/rejected": -176.56297302246094, "loss": 0.6368, "rewards/accuracies": 0.6875, "rewards/chosen": -0.928693950176239, "rewards/margins": 0.1943078488111496, "rewards/rejected": -1.1230019330978394, "step": 2030 }, { "epoch": 0.35, "grad_norm": 2.0, "learning_rate": 4.097219108240295e-06, "logits/chosen": -3.1459593772888184, "logits/rejected": -3.143369674682617, "logps/chosen": -152.8044891357422, "logps/rejected": -172.223876953125, "loss": 0.6289, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.887675940990448, "rewards/margins": 0.21855147182941437, "rewards/rejected": -1.1062272787094116, "step": 2040 }, { "epoch": 0.35, "grad_norm": 2.90625, "learning_rate": 4.085619863138574e-06, "logits/chosen": -3.141997814178467, "logits/rejected": -3.1412582397460938, "logps/chosen": -143.8739776611328, "logps/rejected": -167.2696075439453, "loss": 0.6185, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8224587440490723, "rewards/margins": 0.2269144058227539, "rewards/rejected": -1.0493730306625366, "step": 2050 }, { "epoch": 0.35, "grad_norm": 2.09375, "learning_rate": 4.0739632296815886e-06, "logits/chosen": -3.1491408348083496, "logits/rejected": -3.142490863800049, "logps/chosen": -146.3575439453125, "logps/rejected": -159.93038940429688, "loss": 0.6363, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8111427426338196, "rewards/margins": 0.1872086524963379, "rewards/rejected": -0.9983514547348022, "step": 2060 }, { "epoch": 0.36, "grad_norm": 2.078125, "learning_rate": 4.0622496297579905e-06, "logits/chosen": -3.157477855682373, "logits/rejected": -3.154186725616455, "logps/chosen": -143.78921508789062, "logps/rejected": -161.87496948242188, "loss": 0.6229, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7739632725715637, "rewards/margins": 0.21119961142539978, "rewards/rejected": -0.9851628541946411, "step": 2070 }, { "epoch": 0.36, "grad_norm": 2.3125, "learning_rate": 4.0504794873182144e-06, "logits/chosen": -3.13997220993042, "logits/rejected": -3.1348297595977783, "logps/chosen": -145.42245483398438, "logps/rejected": -164.66867065429688, "loss": 0.6139, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8327577710151672, "rewards/margins": 0.23421470820903778, "rewards/rejected": -1.0669724941253662, "step": 2080 }, { "epoch": 0.36, "grad_norm": 2.515625, "learning_rate": 4.038653228359143e-06, "logits/chosen": -3.138305902481079, "logits/rejected": -3.138002634048462, "logps/chosen": -156.08377075195312, "logps/rejected": -172.61456298828125, "loss": 0.6431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9054716229438782, "rewards/margins": 0.1922367513179779, "rewards/rejected": -1.0977084636688232, "step": 2090 }, { "epoch": 0.36, "grad_norm": 2.25, "learning_rate": 4.026771280908682e-06, "logits/chosen": -3.1057121753692627, "logits/rejected": -3.102771282196045, "logps/chosen": -163.07972717285156, "logps/rejected": -173.98348999023438, "loss": 0.6479, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.975011944770813, "rewards/margins": 0.1709790974855423, "rewards/rejected": -1.1459910869598389, "step": 2100 }, { "epoch": 0.36, "eval_logits/chosen": -3.1159632205963135, "eval_logits/rejected": -3.112962245941162, "eval_logps/chosen": -150.8987579345703, "eval_logps/rejected": -167.6335906982422, "eval_loss": 0.653691828250885, "eval_rewards/accuracies": 0.6103624701499939, "eval_rewards/chosen": -0.7948872447013855, "eval_rewards/margins": 0.13037818670272827, "eval_rewards/rejected": -0.9252654314041138, "eval_runtime": 483.6902, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 2100 }, { "epoch": 0.36, "grad_norm": 2.46875, "learning_rate": 4.014834075010271e-06, "logits/chosen": -3.117692470550537, "logits/rejected": -3.1150553226470947, "logps/chosen": -165.73184204101562, "logps/rejected": -192.96554565429688, "loss": 0.6186, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0519955158233643, "rewards/margins": 0.24650517106056213, "rewards/rejected": -1.298500657081604, "step": 2110 }, { "epoch": 0.37, "grad_norm": 2.671875, "learning_rate": 4.002842042707323e-06, "logits/chosen": -3.074627161026001, "logits/rejected": -3.068140745162964, "logps/chosen": -164.93692016601562, "logps/rejected": -188.56765747070312, "loss": 0.6003, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0069468021392822, "rewards/margins": 0.28368431329727173, "rewards/rejected": -1.2906310558319092, "step": 2120 }, { "epoch": 0.37, "grad_norm": 2.5625, "learning_rate": 3.9907956180275785e-06, "logits/chosen": -3.041842222213745, "logits/rejected": -3.0408873558044434, "logps/chosen": -180.57772827148438, "logps/rejected": -207.1053466796875, "loss": 0.5915, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1625645160675049, "rewards/margins": 0.30793333053588867, "rewards/rejected": -1.4704978466033936, "step": 2130 }, { "epoch": 0.37, "grad_norm": 2.6875, "learning_rate": 3.978695236967405e-06, "logits/chosen": -3.074413776397705, "logits/rejected": -3.0732433795928955, "logps/chosen": -173.1667022705078, "logps/rejected": -194.44573974609375, "loss": 0.635, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1367294788360596, "rewards/margins": 0.19113154709339142, "rewards/rejected": -1.3278610706329346, "step": 2140 }, { "epoch": 0.37, "grad_norm": 2.390625, "learning_rate": 3.966541337476012e-06, "logits/chosen": -3.066967487335205, "logits/rejected": -3.0620956420898438, "logps/chosen": -170.68936157226562, "logps/rejected": -187.6675567626953, "loss": 0.6023, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0465517044067383, "rewards/margins": 0.26273447275161743, "rewards/rejected": -1.30928635597229, "step": 2150 }, { "epoch": 0.37, "grad_norm": 2.71875, "learning_rate": 3.9543343594396035e-06, "logits/chosen": -3.0941786766052246, "logits/rejected": -3.0885229110717773, "logps/chosen": -177.64736938476562, "logps/rejected": -191.93521118164062, "loss": 0.625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1087653636932373, "rewards/margins": 0.23340511322021484, "rewards/rejected": -1.3421704769134521, "step": 2160 }, { "epoch": 0.37, "grad_norm": 3.171875, "learning_rate": 3.942074744665456e-06, "logits/chosen": -3.100074291229248, "logits/rejected": -3.09078311920166, "logps/chosen": -180.03762817382812, "logps/rejected": -204.2498321533203, "loss": 0.5995, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1526836156845093, "rewards/margins": 0.295266330242157, "rewards/rejected": -1.447949767112732, "step": 2170 }, { "epoch": 0.38, "grad_norm": 3.390625, "learning_rate": 3.929762936865926e-06, "logits/chosen": -3.1241230964660645, "logits/rejected": -3.121016025543213, "logps/chosen": -197.09913635253906, "logps/rejected": -210.7930908203125, "loss": 0.639, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2780972719192505, "rewards/margins": 0.2249462604522705, "rewards/rejected": -1.5030434131622314, "step": 2180 }, { "epoch": 0.38, "grad_norm": 2.125, "learning_rate": 3.917399381642395e-06, "logits/chosen": -3.154695510864258, "logits/rejected": -3.150545597076416, "logps/chosen": -175.54129028320312, "logps/rejected": -191.65377807617188, "loss": 0.6552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.126696228981018, "rewards/margins": 0.17622551321983337, "rewards/rejected": -1.3029218912124634, "step": 2190 }, { "epoch": 0.38, "grad_norm": 2.40625, "learning_rate": 3.904984526469139e-06, "logits/chosen": -3.1482882499694824, "logits/rejected": -3.144016742706299, "logps/chosen": -148.60595703125, "logps/rejected": -175.087646484375, "loss": 0.6023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8616147041320801, "rewards/margins": 0.26853346824645996, "rewards/rejected": -1.13014817237854, "step": 2200 }, { "epoch": 0.38, "eval_logits/chosen": -3.147921562194824, "eval_logits/rejected": -3.144941568374634, "eval_logps/chosen": -137.71630859375, "eval_logps/rejected": -153.79273986816406, "eval_loss": 0.653631329536438, "eval_rewards/accuracies": 0.6033921837806702, "eval_rewards/chosen": -0.6630630493164062, "eval_rewards/margins": 0.12379389256238937, "eval_rewards/rejected": -0.7868569493293762, "eval_runtime": 483.7826, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 2200 }, { "epoch": 0.38, "grad_norm": 2.515625, "learning_rate": 3.892518820677131e-06, "logits/chosen": -3.1459808349609375, "logits/rejected": -3.142317056655884, "logps/chosen": -148.6342010498047, "logps/rejected": -164.85739135742188, "loss": 0.6286, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8283321261405945, "rewards/margins": 0.2048930823802948, "rewards/rejected": -1.0332252979278564, "step": 2210 }, { "epoch": 0.38, "grad_norm": 2.125, "learning_rate": 3.880002715437786e-06, "logits/chosen": -3.129206418991089, "logits/rejected": -3.126734495162964, "logps/chosen": -147.17568969726562, "logps/rejected": -165.13308715820312, "loss": 0.6334, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8225167393684387, "rewards/margins": 0.20732179284095764, "rewards/rejected": -1.0298385620117188, "step": 2220 }, { "epoch": 0.38, "grad_norm": 2.703125, "learning_rate": 3.867436663746622e-06, "logits/chosen": -3.1609504222869873, "logits/rejected": -3.1583099365234375, "logps/chosen": -152.25564575195312, "logps/rejected": -160.70870971679688, "loss": 0.6572, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8773072957992554, "rewards/margins": 0.1397085189819336, "rewards/rejected": -1.0170156955718994, "step": 2230 }, { "epoch": 0.39, "grad_norm": 1.8359375, "learning_rate": 3.854821120406871e-06, "logits/chosen": -3.173096179962158, "logits/rejected": -3.171881914138794, "logps/chosen": -145.95648193359375, "logps/rejected": -153.50050354003906, "loss": 0.6612, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7911441326141357, "rewards/margins": 0.12236903607845306, "rewards/rejected": -0.91351318359375, "step": 2240 }, { "epoch": 0.39, "grad_norm": 2.953125, "learning_rate": 3.842156542013017e-06, "logits/chosen": -3.1604485511779785, "logits/rejected": -3.156759738922119, "logps/chosen": -151.40597534179688, "logps/rejected": -166.3770294189453, "loss": 0.6246, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.854582667350769, "rewards/margins": 0.19725193083286285, "rewards/rejected": -1.0518347024917603, "step": 2250 }, { "epoch": 0.39, "grad_norm": 2.296875, "learning_rate": 3.8294433869342695e-06, "logits/chosen": -3.1788666248321533, "logits/rejected": -3.177128553390503, "logps/chosen": -160.8689727783203, "logps/rejected": -169.00955200195312, "loss": 0.665, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9033733606338501, "rewards/margins": 0.13598643243312836, "rewards/rejected": -1.0393598079681396, "step": 2260 }, { "epoch": 0.39, "grad_norm": 2.203125, "learning_rate": 3.816682115297976e-06, "logits/chosen": -3.1626791954040527, "logits/rejected": -3.15694260597229, "logps/chosen": -157.9209747314453, "logps/rejected": -171.28599548339844, "loss": 0.6421, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9369276762008667, "rewards/margins": 0.18542329967021942, "rewards/rejected": -1.122351050376892, "step": 2270 }, { "epoch": 0.39, "grad_norm": 1.9296875, "learning_rate": 3.803873188972966e-06, "logits/chosen": -3.152496576309204, "logits/rejected": -3.1473207473754883, "logps/chosen": -154.1539764404297, "logps/rejected": -175.39974975585938, "loss": 0.6214, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8972307443618774, "rewards/margins": 0.2114594280719757, "rewards/rejected": -1.1086901426315308, "step": 2280 }, { "epoch": 0.39, "grad_norm": 2.28125, "learning_rate": 3.791017071552835e-06, "logits/chosen": -3.1055197715759277, "logits/rejected": -3.100161075592041, "logps/chosen": -158.73663330078125, "logps/rejected": -183.3207244873047, "loss": 0.5997, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9550874829292297, "rewards/margins": 0.27279362082481384, "rewards/rejected": -1.2278810739517212, "step": 2290 }, { "epoch": 0.4, "grad_norm": 3.15625, "learning_rate": 3.778114228339168e-06, "logits/chosen": -3.1239192485809326, "logits/rejected": -3.1184122562408447, "logps/chosen": -167.10992431640625, "logps/rejected": -189.25596618652344, "loss": 0.5962, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9823795557022095, "rewards/margins": 0.2892586290836334, "rewards/rejected": -1.2716381549835205, "step": 2300 }, { "epoch": 0.4, "eval_logits/chosen": -3.1011734008789062, "eval_logits/rejected": -3.097487449645996, "eval_logps/chosen": -159.4140625, "eval_logps/rejected": -177.23011779785156, "eval_loss": 0.6523010730743408, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.8800405263900757, "eval_rewards/margins": 0.14119039475917816, "eval_rewards/rejected": -1.021230936050415, "eval_runtime": 483.6223, "eval_samples_per_second": 8.9, "eval_steps_per_second": 1.112, "step": 2300 }, { "epoch": 0.4, "grad_norm": 3.296875, "learning_rate": 3.7651651263246947e-06, "logits/chosen": -3.0919950008392334, "logits/rejected": -3.086912155151367, "logps/chosen": -166.24557495117188, "logps/rejected": -190.60391235351562, "loss": 0.5973, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0429760217666626, "rewards/margins": 0.2770213186740875, "rewards/rejected": -1.3199971914291382, "step": 2310 }, { "epoch": 0.4, "grad_norm": 2.96875, "learning_rate": 3.752170234176392e-06, "logits/chosen": -3.055863857269287, "logits/rejected": -3.051103115081787, "logps/chosen": -181.3167724609375, "logps/rejected": -201.3364715576172, "loss": 0.6001, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1310415267944336, "rewards/margins": 0.29059791564941406, "rewards/rejected": -1.4216395616531372, "step": 2320 }, { "epoch": 0.4, "grad_norm": 2.671875, "learning_rate": 3.739130022218519e-06, "logits/chosen": -3.0658938884735107, "logits/rejected": -3.0609793663024902, "logps/chosen": -178.57443237304688, "logps/rejected": -207.0660400390625, "loss": 0.5796, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1136971712112427, "rewards/margins": 0.35894858837127686, "rewards/rejected": -1.4726457595825195, "step": 2330 }, { "epoch": 0.4, "grad_norm": 2.625, "learning_rate": 3.726044962415595e-06, "logits/chosen": -3.0586750507354736, "logits/rejected": -3.0569536685943604, "logps/chosen": -184.21685791015625, "logps/rejected": -205.1505889892578, "loss": 0.6387, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2161709070205688, "rewards/margins": 0.19973869621753693, "rewards/rejected": -1.4159094095230103, "step": 2340 }, { "epoch": 0.4, "grad_norm": 2.515625, "learning_rate": 3.712915528355317e-06, "logits/chosen": -3.0611510276794434, "logits/rejected": -3.0518746376037598, "logps/chosen": -177.49746704101562, "logps/rejected": -198.83865356445312, "loss": 0.6174, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0889817476272583, "rewards/margins": 0.27505603432655334, "rewards/rejected": -1.3640376329421997, "step": 2350 }, { "epoch": 0.41, "grad_norm": 3.171875, "learning_rate": 3.6997421952314223e-06, "logits/chosen": -3.062241792678833, "logits/rejected": -3.0571401119232178, "logps/chosen": -172.83241271972656, "logps/rejected": -198.0221710205078, "loss": 0.6132, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1134238243103027, "rewards/margins": 0.2823295295238495, "rewards/rejected": -1.3957535028457642, "step": 2360 }, { "epoch": 0.41, "grad_norm": 2.578125, "learning_rate": 3.686525439826484e-06, "logits/chosen": -3.058692216873169, "logits/rejected": -3.050508975982666, "logps/chosen": -177.1395263671875, "logps/rejected": -203.19869995117188, "loss": 0.6078, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1302058696746826, "rewards/margins": 0.2856082022190094, "rewards/rejected": -1.4158140420913696, "step": 2370 }, { "epoch": 0.41, "grad_norm": 3.53125, "learning_rate": 3.6732657404946624e-06, "logits/chosen": -3.0415139198303223, "logits/rejected": -3.03214955329895, "logps/chosen": -170.95999145507812, "logps/rejected": -192.58023071289062, "loss": 0.6288, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0619453191757202, "rewards/margins": 0.251602441072464, "rewards/rejected": -1.3135477304458618, "step": 2380 }, { "epoch": 0.41, "grad_norm": 2.765625, "learning_rate": 3.6599635771443844e-06, "logits/chosen": -3.0108182430267334, "logits/rejected": -3.004117488861084, "logps/chosen": -192.63619995117188, "logps/rejected": -220.47128295898438, "loss": 0.5854, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2579948902130127, "rewards/margins": 0.34344083070755005, "rewards/rejected": -1.601435661315918, "step": 2390 }, { "epoch": 0.41, "grad_norm": 2.859375, "learning_rate": 3.646619431220978e-06, "logits/chosen": -3.014111042022705, "logits/rejected": -3.0105881690979004, "logps/chosen": -183.05506896972656, "logps/rejected": -208.9149627685547, "loss": 0.6176, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.204532265663147, "rewards/margins": 0.27077314257621765, "rewards/rejected": -1.4753053188323975, "step": 2400 }, { "epoch": 0.41, "eval_logits/chosen": -3.03197979927063, "eval_logits/rejected": -3.0264739990234375, "eval_logps/chosen": -172.7088623046875, "eval_logps/rejected": -192.77476501464844, "eval_loss": 0.6506057977676392, "eval_rewards/accuracies": 0.6026951670646667, "eval_rewards/chosen": -1.0129884481430054, "eval_rewards/margins": 0.16368862986564636, "eval_rewards/rejected": -1.1766771078109741, "eval_runtime": 483.6491, "eval_samples_per_second": 8.899, "eval_steps_per_second": 1.112, "step": 2400 }, { "epoch": 0.42, "grad_norm": 3.34375, "learning_rate": 3.6332337856892475e-06, "logits/chosen": -3.0102365016937256, "logits/rejected": -3.0060172080993652, "logps/chosen": -186.672119140625, "logps/rejected": -205.01229858398438, "loss": 0.6124, "rewards/accuracies": 0.6875, "rewards/chosen": -1.209280014038086, "rewards/margins": 0.2705342769622803, "rewards/rejected": -1.4798142910003662, "step": 2410 }, { "epoch": 0.42, "grad_norm": 2.75, "learning_rate": 3.6198071250159945e-06, "logits/chosen": -3.0345890522003174, "logits/rejected": -3.0291411876678467, "logps/chosen": -187.89105224609375, "logps/rejected": -214.79861450195312, "loss": 0.5981, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2190016508102417, "rewards/margins": 0.3049257695674896, "rewards/rejected": -1.5239274501800537, "step": 2420 }, { "epoch": 0.42, "grad_norm": 2.703125, "learning_rate": 3.6063399351524793e-06, "logits/chosen": -3.056429386138916, "logits/rejected": -3.0526039600372314, "logps/chosen": -181.78106689453125, "logps/rejected": -193.84323120117188, "loss": 0.6579, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1559642553329468, "rewards/margins": 0.18488237261772156, "rewards/rejected": -1.3408466577529907, "step": 2430 }, { "epoch": 0.42, "grad_norm": 2.21875, "learning_rate": 3.592832703516836e-06, "logits/chosen": -3.0940728187561035, "logits/rejected": -3.0855565071105957, "logps/chosen": -167.07205200195312, "logps/rejected": -191.03402709960938, "loss": 0.6023, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.002306580543518, "rewards/margins": 0.28310757875442505, "rewards/rejected": -1.285414218902588, "step": 2440 }, { "epoch": 0.42, "grad_norm": 2.390625, "learning_rate": 3.5792859189764335e-06, "logits/chosen": -3.092224597930908, "logits/rejected": -3.0879569053649902, "logps/chosen": -167.45254516601562, "logps/rejected": -182.4717254638672, "loss": 0.639, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0350295305252075, "rewards/margins": 0.18815535306930542, "rewards/rejected": -1.2231849431991577, "step": 2450 }, { "epoch": 0.42, "grad_norm": 1.9765625, "learning_rate": 3.5657000718301765e-06, "logits/chosen": -3.0908710956573486, "logits/rejected": -3.0826985836029053, "logps/chosen": -161.75523376464844, "logps/rejected": -174.8667449951172, "loss": 0.6481, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9404891133308411, "rewards/margins": 0.19990110397338867, "rewards/rejected": -1.1403902769088745, "step": 2460 }, { "epoch": 0.43, "grad_norm": 2.484375, "learning_rate": 3.5520756537907645e-06, "logits/chosen": -3.103588819503784, "logits/rejected": -3.0951483249664307, "logps/chosen": -156.4438934326172, "logps/rejected": -175.2889862060547, "loss": 0.622, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9040303230285645, "rewards/margins": 0.23538950085639954, "rewards/rejected": -1.1394197940826416, "step": 2470 }, { "epoch": 0.43, "grad_norm": 2.328125, "learning_rate": 3.538413157966893e-06, "logits/chosen": -3.073765993118286, "logits/rejected": -3.0662198066711426, "logps/chosen": -166.85813903808594, "logps/rejected": -182.69912719726562, "loss": 0.6287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.026293158531189, "rewards/margins": 0.2210662066936493, "rewards/rejected": -1.2473593950271606, "step": 2480 }, { "epoch": 0.43, "grad_norm": 2.296875, "learning_rate": 3.5247130788454076e-06, "logits/chosen": -3.06006121635437, "logits/rejected": -3.0558817386627197, "logps/chosen": -157.5069580078125, "logps/rejected": -180.8059539794922, "loss": 0.6243, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9881949424743652, "rewards/margins": 0.22879700362682343, "rewards/rejected": -1.2169920206069946, "step": 2490 }, { "epoch": 0.43, "grad_norm": 2.390625, "learning_rate": 3.510975912273406e-06, "logits/chosen": -3.0647358894348145, "logits/rejected": -3.0556392669677734, "logps/chosen": -175.3211212158203, "logps/rejected": -197.31922912597656, "loss": 0.6255, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0933088064193726, "rewards/margins": 0.23709645867347717, "rewards/rejected": -1.3304052352905273, "step": 2500 }, { "epoch": 0.43, "eval_logits/chosen": -3.0629374980926514, "eval_logits/rejected": -3.058375358581543, "eval_logps/chosen": -156.9641571044922, "eval_logps/rejected": -175.3397979736328, "eval_loss": 0.6506990790367126, "eval_rewards/accuracies": 0.6101301312446594, "eval_rewards/chosen": -0.8555412888526917, "eval_rewards/margins": 0.1467861831188202, "eval_rewards/rejected": -1.002327561378479, "eval_runtime": 483.7437, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 2500 }, { "epoch": 0.43, "grad_norm": 2.0625, "learning_rate": 3.4972021554402924e-06, "logits/chosen": -3.0609078407287598, "logits/rejected": -3.0537776947021484, "logps/chosen": -174.7488555908203, "logps/rejected": -198.6224365234375, "loss": 0.6094, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1154464483261108, "rewards/margins": 0.27513909339904785, "rewards/rejected": -1.3905855417251587, "step": 2510 }, { "epoch": 0.43, "grad_norm": 2.515625, "learning_rate": 3.483392306859784e-06, "logits/chosen": -3.042370319366455, "logits/rejected": -3.0388169288635254, "logps/chosen": -176.43917846679688, "logps/rejected": -196.3103790283203, "loss": 0.6254, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.134268045425415, "rewards/margins": 0.2571788430213928, "rewards/rejected": -1.391446828842163, "step": 2520 }, { "epoch": 0.44, "grad_norm": 2.796875, "learning_rate": 3.469546866351866e-06, "logits/chosen": -3.0680062770843506, "logits/rejected": -3.0632071495056152, "logps/chosen": -171.026123046875, "logps/rejected": -188.93869018554688, "loss": 0.6492, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0946629047393799, "rewards/margins": 0.17611399292945862, "rewards/rejected": -1.2707767486572266, "step": 2530 }, { "epoch": 0.44, "grad_norm": 3.03125, "learning_rate": 3.455666335024701e-06, "logits/chosen": -3.0372817516326904, "logits/rejected": -3.0323173999786377, "logps/chosen": -186.8362274169922, "logps/rejected": -209.86343383789062, "loss": 0.6371, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2337965965270996, "rewards/margins": 0.24707002937793732, "rewards/rejected": -1.480866551399231, "step": 2540 }, { "epoch": 0.44, "grad_norm": 2.703125, "learning_rate": 3.4417512152564976e-06, "logits/chosen": -3.0695526599884033, "logits/rejected": -3.061310291290283, "logps/chosen": -177.91978454589844, "logps/rejected": -196.3243865966797, "loss": 0.628, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0750468969345093, "rewards/margins": 0.26663774251937866, "rewards/rejected": -1.3416846990585327, "step": 2550 }, { "epoch": 0.44, "grad_norm": 2.140625, "learning_rate": 3.42780201067732e-06, "logits/chosen": -3.0941805839538574, "logits/rejected": -3.0907230377197266, "logps/chosen": -160.5797119140625, "logps/rejected": -178.7494354248047, "loss": 0.6354, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9490875005722046, "rewards/margins": 0.21290269494056702, "rewards/rejected": -1.1619904041290283, "step": 2560 }, { "epoch": 0.44, "grad_norm": 1.953125, "learning_rate": 3.413819226150868e-06, "logits/chosen": -3.108309268951416, "logits/rejected": -3.102802038192749, "logps/chosen": -162.13426208496094, "logps/rejected": -179.61190795898438, "loss": 0.6276, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9272769689559937, "rewards/margins": 0.22005334496498108, "rewards/rejected": -1.1473302841186523, "step": 2570 }, { "epoch": 0.44, "grad_norm": 2.578125, "learning_rate": 3.399803367756198e-06, "logits/chosen": -3.0917704105377197, "logits/rejected": -3.086387872695923, "logps/chosen": -163.72564697265625, "logps/rejected": -176.5128936767578, "loss": 0.6543, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9989093542098999, "rewards/margins": 0.15462008118629456, "rewards/rejected": -1.153529405593872, "step": 2580 }, { "epoch": 0.45, "grad_norm": 2.609375, "learning_rate": 3.3857549427694114e-06, "logits/chosen": -3.1144728660583496, "logits/rejected": -3.10710072517395, "logps/chosen": -150.10018920898438, "logps/rejected": -160.92153930664062, "loss": 0.6288, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8386899828910828, "rewards/margins": 0.19079235196113586, "rewards/rejected": -1.029482364654541, "step": 2590 }, { "epoch": 0.45, "grad_norm": 3.0625, "learning_rate": 3.3716744596452918e-06, "logits/chosen": -3.0912861824035645, "logits/rejected": -3.0834295749664307, "logps/chosen": -157.7835235595703, "logps/rejected": -171.8795928955078, "loss": 0.6075, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8950634002685547, "rewards/margins": 0.25771036744117737, "rewards/rejected": -1.1527738571166992, "step": 2600 }, { "epoch": 0.45, "eval_logits/chosen": -3.087653160095215, "eval_logits/rejected": -3.0838546752929688, "eval_logps/chosen": -146.07357788085938, "eval_logps/rejected": -162.31468200683594, "eval_loss": 0.6547107100486755, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -0.7466354370117188, "eval_rewards/margins": 0.12544085085391998, "eval_rewards/rejected": -0.8720762729644775, "eval_runtime": 483.6537, "eval_samples_per_second": 8.899, "eval_steps_per_second": 1.112, "step": 2600 }, { "epoch": 0.45, "grad_norm": 2.359375, "learning_rate": 3.3575624279989017e-06, "logits/chosen": -3.077409029006958, "logits/rejected": -3.0710222721099854, "logps/chosen": -161.51071166992188, "logps/rejected": -175.16867065429688, "loss": 0.6298, "rewards/accuracies": 0.625, "rewards/chosen": -0.9341617822647095, "rewards/margins": 0.21895930171012878, "rewards/rejected": -1.153120994567871, "step": 2610 }, { "epoch": 0.45, "grad_norm": 2.5625, "learning_rate": 3.3434193585871405e-06, "logits/chosen": -3.0789408683776855, "logits/rejected": -3.07112193107605, "logps/chosen": -160.45144653320312, "logps/rejected": -184.2188720703125, "loss": 0.6051, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9884397387504578, "rewards/margins": 0.2745136320590973, "rewards/rejected": -1.262953281402588, "step": 2620 }, { "epoch": 0.45, "grad_norm": 2.265625, "learning_rate": 3.3292457632902603e-06, "logits/chosen": -3.0605623722076416, "logits/rejected": -3.0544679164886475, "logps/chosen": -169.61282348632812, "logps/rejected": -192.62991333007812, "loss": 0.6109, "rewards/accuracies": 0.6875, "rewards/chosen": -1.03895103931427, "rewards/margins": 0.27460330724716187, "rewards/rejected": -1.313554286956787, "step": 2630 }, { "epoch": 0.45, "grad_norm": 2.65625, "learning_rate": 3.315042155093334e-06, "logits/chosen": -3.048037052154541, "logits/rejected": -3.0404555797576904, "logps/chosen": -171.00094604492188, "logps/rejected": -193.4027557373047, "loss": 0.6088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0972424745559692, "rewards/margins": 0.26539820432662964, "rewards/rejected": -1.362640619277954, "step": 2640 }, { "epoch": 0.46, "grad_norm": 2.640625, "learning_rate": 3.300809048067692e-06, "logits/chosen": -3.036726474761963, "logits/rejected": -3.0282886028289795, "logps/chosen": -178.0996856689453, "logps/rejected": -201.49282836914062, "loss": 0.6315, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1393941640853882, "rewards/margins": 0.25805196166038513, "rewards/rejected": -1.3974461555480957, "step": 2650 }, { "epoch": 0.46, "grad_norm": 3.109375, "learning_rate": 3.2865469573523163e-06, "logits/chosen": -3.0682671070098877, "logits/rejected": -3.0621070861816406, "logps/chosen": -177.4022979736328, "logps/rejected": -191.1795654296875, "loss": 0.6378, "rewards/accuracies": 0.625, "rewards/chosen": -1.0945355892181396, "rewards/margins": 0.21731004118919373, "rewards/rejected": -1.3118455410003662, "step": 2660 }, { "epoch": 0.46, "grad_norm": 2.046875, "learning_rate": 3.2722563991351965e-06, "logits/chosen": -3.0843520164489746, "logits/rejected": -3.077232837677002, "logps/chosen": -162.65525817871094, "logps/rejected": -180.09799194335938, "loss": 0.6406, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9851263165473938, "rewards/margins": 0.2147480696439743, "rewards/rejected": -1.1998745203018188, "step": 2670 }, { "epoch": 0.46, "grad_norm": 1.96875, "learning_rate": 3.2579378906346464e-06, "logits/chosen": -3.1301608085632324, "logits/rejected": -3.1294732093811035, "logps/chosen": -152.4020538330078, "logps/rejected": -161.99658203125, "loss": 0.6408, "rewards/accuracies": 0.625, "rewards/chosen": -0.8279502987861633, "rewards/margins": 0.17696353793144226, "rewards/rejected": -1.0049139261245728, "step": 2680 }, { "epoch": 0.46, "grad_norm": 1.7578125, "learning_rate": 3.243591950080584e-06, "logits/chosen": -3.1527717113494873, "logits/rejected": -3.1481423377990723, "logps/chosen": -139.64170837402344, "logps/rejected": -159.52427673339844, "loss": 0.6076, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7912130951881409, "rewards/margins": 0.23410753905773163, "rewards/rejected": -1.0253206491470337, "step": 2690 }, { "epoch": 0.47, "grad_norm": 2.25, "learning_rate": 3.2292190966957776e-06, "logits/chosen": -3.1262025833129883, "logits/rejected": -3.1222102642059326, "logps/chosen": -148.70486450195312, "logps/rejected": -168.97586059570312, "loss": 0.6282, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8543796539306641, "rewards/margins": 0.22417394816875458, "rewards/rejected": -1.078553557395935, "step": 2700 }, { "epoch": 0.47, "eval_logits/chosen": -3.122093915939331, "eval_logits/rejected": -3.118544340133667, "eval_logps/chosen": -140.73252868652344, "eval_logps/rejected": -157.2623748779297, "eval_loss": 0.6531208157539368, "eval_rewards/accuracies": 0.6101301312446594, "eval_rewards/chosen": -0.693225085735321, "eval_rewards/margins": 0.12832820415496826, "eval_rewards/rejected": -0.8215532302856445, "eval_runtime": 483.8913, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 2700 }, { "epoch": 0.47, "grad_norm": 2.4375, "learning_rate": 3.21481985067705e-06, "logits/chosen": -3.1101999282836914, "logits/rejected": -3.1087584495544434, "logps/chosen": -159.03683471679688, "logps/rejected": -170.97689819335938, "loss": 0.6379, "rewards/accuracies": 0.625, "rewards/chosen": -0.8768089413642883, "rewards/margins": 0.18777629733085632, "rewards/rejected": -1.0645853281021118, "step": 2710 }, { "epoch": 0.47, "grad_norm": 2.6875, "learning_rate": 3.200394733176454e-06, "logits/chosen": -3.107483148574829, "logits/rejected": -3.1052358150482178, "logps/chosen": -151.2173309326172, "logps/rejected": -175.96107482910156, "loss": 0.6139, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8823947906494141, "rewards/margins": 0.2533818185329437, "rewards/rejected": -1.1357766389846802, "step": 2720 }, { "epoch": 0.47, "grad_norm": 3.25, "learning_rate": 3.1859442662824085e-06, "logits/chosen": -3.103843927383423, "logits/rejected": -3.0988357067108154, "logps/chosen": -162.48377990722656, "logps/rejected": -177.31251525878906, "loss": 0.6416, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.929785430431366, "rewards/margins": 0.21908466517925262, "rewards/rejected": -1.1488702297210693, "step": 2730 }, { "epoch": 0.47, "grad_norm": 2.15625, "learning_rate": 3.1714689730008043e-06, "logits/chosen": -3.117023468017578, "logits/rejected": -3.1135358810424805, "logps/chosen": -149.63278198242188, "logps/rejected": -160.5523223876953, "loss": 0.6527, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.857519805431366, "rewards/margins": 0.16857053339481354, "rewards/rejected": -1.026090383529663, "step": 2740 }, { "epoch": 0.47, "grad_norm": 2.15625, "learning_rate": 3.156969377236072e-06, "logits/chosen": -3.1075711250305176, "logits/rejected": -3.0988144874572754, "logps/chosen": -140.55865478515625, "logps/rejected": -167.2582244873047, "loss": 0.6084, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8101563453674316, "rewards/margins": 0.25592103600502014, "rewards/rejected": -1.066077470779419, "step": 2750 }, { "epoch": 0.48, "grad_norm": 3.3125, "learning_rate": 3.1424460037722237e-06, "logits/chosen": -3.096191644668579, "logits/rejected": -3.0911002159118652, "logps/chosen": -143.87744140625, "logps/rejected": -162.12014770507812, "loss": 0.6342, "rewards/accuracies": 0.625, "rewards/chosen": -0.7865437269210815, "rewards/margins": 0.20428061485290527, "rewards/rejected": -0.9908244013786316, "step": 2760 }, { "epoch": 0.48, "grad_norm": 2.4375, "learning_rate": 3.127899378253858e-06, "logits/chosen": -3.111184597015381, "logits/rejected": -3.106191635131836, "logps/chosen": -155.27023315429688, "logps/rejected": -169.6674346923828, "loss": 0.6283, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8901050686836243, "rewards/margins": 0.1963515430688858, "rewards/rejected": -1.0864566564559937, "step": 2770 }, { "epoch": 0.48, "grad_norm": 2.21875, "learning_rate": 3.1133300271671354e-06, "logits/chosen": -3.073341131210327, "logits/rejected": -3.0657191276550293, "logps/chosen": -164.946044921875, "logps/rejected": -182.87684631347656, "loss": 0.6265, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0226364135742188, "rewards/margins": 0.22265009582042694, "rewards/rejected": -1.2452863454818726, "step": 2780 }, { "epoch": 0.48, "grad_norm": 2.265625, "learning_rate": 3.0987384778207218e-06, "logits/chosen": -3.07016658782959, "logits/rejected": -3.06632661819458, "logps/chosen": -160.828857421875, "logps/rejected": -181.21853637695312, "loss": 0.6099, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9760662913322449, "rewards/margins": 0.2529025971889496, "rewards/rejected": -1.228968858718872, "step": 2790 }, { "epoch": 0.48, "grad_norm": 2.203125, "learning_rate": 3.0841252583267067e-06, "logits/chosen": -3.08182954788208, "logits/rejected": -3.0774779319763184, "logps/chosen": -178.40939331054688, "logps/rejected": -192.86624145507812, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": -1.087641954421997, "rewards/margins": 0.19885390996932983, "rewards/rejected": -1.2864959239959717, "step": 2800 }, { "epoch": 0.48, "eval_logits/chosen": -3.0925514698028564, "eval_logits/rejected": -3.0887064933776855, "eval_logps/chosen": -148.7371826171875, "eval_logps/rejected": -166.3008575439453, "eval_loss": 0.6517302989959717, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.7732716798782349, "eval_rewards/margins": 0.1386662721633911, "eval_rewards/rejected": -0.9119380116462708, "eval_runtime": 483.7013, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 2800 }, { "epoch": 0.48, "grad_norm": 2.59375, "learning_rate": 3.069490897581486e-06, "logits/chosen": -3.089717388153076, "logits/rejected": -3.0846445560455322, "logps/chosen": -158.58767700195312, "logps/rejected": -185.58558654785156, "loss": 0.5971, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9179424047470093, "rewards/margins": 0.29216495156288147, "rewards/rejected": -1.2101073265075684, "step": 2810 }, { "epoch": 0.49, "grad_norm": 2.875, "learning_rate": 3.054835925246622e-06, "logits/chosen": -3.0747859477996826, "logits/rejected": -3.0709011554718018, "logps/chosen": -172.0844268798828, "logps/rejected": -184.31924438476562, "loss": 0.6543, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.068491816520691, "rewards/margins": 0.17100703716278076, "rewards/rejected": -1.2394988536834717, "step": 2820 }, { "epoch": 0.49, "grad_norm": 2.34375, "learning_rate": 3.040160871729672e-06, "logits/chosen": -3.0449886322021484, "logits/rejected": -3.0391552448272705, "logps/chosen": -173.50198364257812, "logps/rejected": -198.10621643066406, "loss": 0.6103, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0645406246185303, "rewards/margins": 0.28309398889541626, "rewards/rejected": -1.3476345539093018, "step": 2830 }, { "epoch": 0.49, "grad_norm": 2.671875, "learning_rate": 3.025466268164992e-06, "logits/chosen": -3.068354368209839, "logits/rejected": -3.0641281604766846, "logps/chosen": -173.51577758789062, "logps/rejected": -186.96096801757812, "loss": 0.6479, "rewards/accuracies": 0.625, "rewards/chosen": -1.0937964916229248, "rewards/margins": 0.18635380268096924, "rewards/rejected": -1.2801504135131836, "step": 2840 }, { "epoch": 0.49, "grad_norm": 3.328125, "learning_rate": 3.0107526463945124e-06, "logits/chosen": -3.0679664611816406, "logits/rejected": -3.062669277191162, "logps/chosen": -166.79176330566406, "logps/rejected": -194.7895050048828, "loss": 0.6058, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.03748619556427, "rewards/margins": 0.28654587268829346, "rewards/rejected": -1.3240320682525635, "step": 2850 }, { "epoch": 0.49, "grad_norm": 2.953125, "learning_rate": 2.9960205389484918e-06, "logits/chosen": -3.078962564468384, "logits/rejected": -3.074683666229248, "logps/chosen": -166.84970092773438, "logps/rejected": -187.77969360351562, "loss": 0.6094, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.020708680152893, "rewards/margins": 0.25686854124069214, "rewards/rejected": -1.27757728099823, "step": 2860 }, { "epoch": 0.49, "grad_norm": 2.609375, "learning_rate": 2.981270479026239e-06, "logits/chosen": -3.0889625549316406, "logits/rejected": -3.0859227180480957, "logps/chosen": -175.95822143554688, "logps/rejected": -191.1319122314453, "loss": 0.6327, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0878520011901855, "rewards/margins": 0.22063109278678894, "rewards/rejected": -1.3084831237792969, "step": 2870 }, { "epoch": 0.5, "grad_norm": 2.65625, "learning_rate": 2.9665030004768158e-06, "logits/chosen": -3.0921990871429443, "logits/rejected": -3.0831103324890137, "logps/chosen": -172.875244140625, "logps/rejected": -190.37728881835938, "loss": 0.6286, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0688822269439697, "rewards/margins": 0.24558386206626892, "rewards/rejected": -1.314465880393982, "step": 2880 }, { "epoch": 0.5, "grad_norm": 3.34375, "learning_rate": 2.9517186377797203e-06, "logits/chosen": -3.0864691734313965, "logits/rejected": -3.081881046295166, "logps/chosen": -163.46083068847656, "logps/rejected": -187.11343383789062, "loss": 0.6167, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0008008480072021, "rewards/margins": 0.25917181372642517, "rewards/rejected": -1.2599728107452393, "step": 2890 }, { "epoch": 0.5, "grad_norm": 2.703125, "learning_rate": 2.936917926025536e-06, "logits/chosen": -3.0828022956848145, "logits/rejected": -3.078491687774658, "logps/chosen": -159.6892547607422, "logps/rejected": -181.262451171875, "loss": 0.6202, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9650337100028992, "rewards/margins": 0.24236789345741272, "rewards/rejected": -1.2074015140533447, "step": 2900 }, { "epoch": 0.5, "eval_logits/chosen": -3.0786919593811035, "eval_logits/rejected": -3.074411392211914, "eval_logps/chosen": -152.96585083007812, "eval_logps/rejected": -170.98321533203125, "eval_loss": 0.6511818170547485, "eval_rewards/accuracies": 0.604786217212677, "eval_rewards/chosen": -0.815558135509491, "eval_rewards/margins": 0.143203467130661, "eval_rewards/rejected": -0.9587615728378296, "eval_runtime": 483.7714, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 2900 }, { "epoch": 0.5, "grad_norm": 2.359375, "learning_rate": 2.9221014008965686e-06, "logits/chosen": -3.0856261253356934, "logits/rejected": -3.0778799057006836, "logps/chosen": -171.34780883789062, "logps/rejected": -201.55828857421875, "loss": 0.5884, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.042557716369629, "rewards/margins": 0.33153969049453735, "rewards/rejected": -1.3740973472595215, "step": 2910 }, { "epoch": 0.5, "grad_norm": 2.875, "learning_rate": 2.907269598647457e-06, "logits/chosen": -3.02229905128479, "logits/rejected": -3.0171494483947754, "logps/chosen": -185.18301391601562, "logps/rejected": -215.0197296142578, "loss": 0.6061, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2088510990142822, "rewards/margins": 0.3223053812980652, "rewards/rejected": -1.5311565399169922, "step": 2920 }, { "epoch": 0.5, "grad_norm": 2.53125, "learning_rate": 2.8924230560857657e-06, "logits/chosen": -3.023981809616089, "logits/rejected": -3.0177435874938965, "logps/chosen": -178.6295623779297, "logps/rejected": -202.44158935546875, "loss": 0.6029, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1470363140106201, "rewards/margins": 0.2915676534175873, "rewards/rejected": -1.4386039972305298, "step": 2930 }, { "epoch": 0.51, "grad_norm": 2.5, "learning_rate": 2.8775623105525557e-06, "logits/chosen": -3.05517840385437, "logits/rejected": -3.053927183151245, "logps/chosen": -166.50283813476562, "logps/rejected": -187.00949096679688, "loss": 0.6236, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0485191345214844, "rewards/margins": 0.23477879166603088, "rewards/rejected": -1.2832978963851929, "step": 2940 }, { "epoch": 0.51, "grad_norm": 3.09375, "learning_rate": 2.8626878999029354e-06, "logits/chosen": -3.046323299407959, "logits/rejected": -3.040755033493042, "logps/chosen": -176.8887939453125, "logps/rejected": -198.10455322265625, "loss": 0.6306, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.115063190460205, "rewards/margins": 0.23778335750102997, "rewards/rejected": -1.352846622467041, "step": 2950 }, { "epoch": 0.51, "grad_norm": 2.453125, "learning_rate": 2.847800362486596e-06, "logits/chosen": -3.0453903675079346, "logits/rejected": -3.0353028774261475, "logps/chosen": -170.50367736816406, "logps/rejected": -198.71578979492188, "loss": 0.5976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0572386980056763, "rewards/margins": 0.32733428478240967, "rewards/rejected": -1.384572982788086, "step": 2960 }, { "epoch": 0.51, "grad_norm": 3.1875, "learning_rate": 2.832900237128325e-06, "logits/chosen": -3.0243773460388184, "logits/rejected": -3.02040958404541, "logps/chosen": -179.3520965576172, "logps/rejected": -198.31239318847656, "loss": 0.6296, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1419975757598877, "rewards/margins": 0.24163858592510223, "rewards/rejected": -1.383636236190796, "step": 2970 }, { "epoch": 0.51, "grad_norm": 3.875, "learning_rate": 2.8179880631085053e-06, "logits/chosen": -3.0174291133880615, "logits/rejected": -3.0091402530670166, "logps/chosen": -177.42556762695312, "logps/rejected": -202.75509643554688, "loss": 0.6054, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1225911378860474, "rewards/margins": 0.30541062355041504, "rewards/rejected": -1.4280017614364624, "step": 2980 }, { "epoch": 0.52, "grad_norm": 3.03125, "learning_rate": 2.803064380143598e-06, "logits/chosen": -3.0254709720611572, "logits/rejected": -3.0239815711975098, "logps/chosen": -189.5607452392578, "logps/rejected": -203.75823974609375, "loss": 0.6487, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2223767042160034, "rewards/margins": 0.1916547566652298, "rewards/rejected": -1.4140313863754272, "step": 2990 }, { "epoch": 0.52, "grad_norm": 2.546875, "learning_rate": 2.7881297283666063e-06, "logits/chosen": -3.0681405067443848, "logits/rejected": -3.0591464042663574, "logps/chosen": -164.3688201904297, "logps/rejected": -189.56399536132812, "loss": 0.6252, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0210239887237549, "rewards/margins": 0.24706561863422394, "rewards/rejected": -1.2680894136428833, "step": 3000 }, { "epoch": 0.52, "eval_logits/chosen": -3.082378625869751, "eval_logits/rejected": -3.0782182216644287, "eval_logps/chosen": -148.42666625976562, "eval_logps/rejected": -166.3867950439453, "eval_loss": 0.6505253314971924, "eval_rewards/accuracies": 0.6054832935333252, "eval_rewards/chosen": -0.7701665759086609, "eval_rewards/margins": 0.14263089001178741, "eval_rewards/rejected": -0.9127974510192871, "eval_runtime": 483.7394, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 3000 }, { "epoch": 0.52, "grad_norm": 2.515625, "learning_rate": 2.77318464830753e-06, "logits/chosen": -3.0817339420318604, "logits/rejected": -3.075918674468994, "logps/chosen": -163.93724060058594, "logps/rejected": -179.66001892089844, "loss": 0.6391, "rewards/accuracies": 0.625, "rewards/chosen": -0.9847095608711243, "rewards/margins": 0.20721586048603058, "rewards/rejected": -1.191925287246704, "step": 3010 }, { "epoch": 0.52, "grad_norm": 2.171875, "learning_rate": 2.7582296808737964e-06, "logits/chosen": -3.099862575531006, "logits/rejected": -3.0948238372802734, "logps/chosen": -164.39498901367188, "logps/rejected": -180.0300750732422, "loss": 0.633, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9961506128311157, "rewards/margins": 0.23695461452007294, "rewards/rejected": -1.2331053018569946, "step": 3020 }, { "epoch": 0.52, "grad_norm": 2.390625, "learning_rate": 2.7432653673306896e-06, "logits/chosen": -3.1014318466186523, "logits/rejected": -3.0969395637512207, "logps/chosen": -155.86209106445312, "logps/rejected": -171.91366577148438, "loss": 0.6356, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9031952619552612, "rewards/margins": 0.1848280131816864, "rewards/rejected": -1.08802330493927, "step": 3030 }, { "epoch": 0.52, "grad_norm": 2.109375, "learning_rate": 2.7282922492817565e-06, "logits/chosen": -3.1041133403778076, "logits/rejected": -3.0992465019226074, "logps/chosen": -151.39370727539062, "logps/rejected": -179.11111450195312, "loss": 0.6077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.905647873878479, "rewards/margins": 0.27351897954940796, "rewards/rejected": -1.1791667938232422, "step": 3040 }, { "epoch": 0.53, "grad_norm": 2.875, "learning_rate": 2.7133108686492054e-06, "logits/chosen": -3.074476957321167, "logits/rejected": -3.074582576751709, "logps/chosen": -155.0388641357422, "logps/rejected": -175.10678100585938, "loss": 0.631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9062402844429016, "rewards/margins": 0.2049740105867386, "rewards/rejected": -1.111214280128479, "step": 3050 }, { "epoch": 0.53, "grad_norm": 2.5, "learning_rate": 2.6983217676542927e-06, "logits/chosen": -3.061591148376465, "logits/rejected": -3.0571389198303223, "logps/chosen": -162.79901123046875, "logps/rejected": -183.30624389648438, "loss": 0.6187, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.964520275592804, "rewards/margins": 0.23759868741035461, "rewards/rejected": -1.2021191120147705, "step": 3060 }, { "epoch": 0.53, "grad_norm": 2.625, "learning_rate": 2.6833254887976974e-06, "logits/chosen": -3.0584092140197754, "logits/rejected": -3.0508856773376465, "logps/chosen": -170.1864776611328, "logps/rejected": -185.8191680908203, "loss": 0.6235, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0173214673995972, "rewards/margins": 0.22812402248382568, "rewards/rejected": -1.2454454898834229, "step": 3070 }, { "epoch": 0.53, "grad_norm": 2.625, "learning_rate": 2.6683225748398877e-06, "logits/chosen": -3.055368185043335, "logits/rejected": -3.0461692810058594, "logps/chosen": -166.15420532226562, "logps/rejected": -192.15284729003906, "loss": 0.6274, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0636614561080933, "rewards/margins": 0.24721452593803406, "rewards/rejected": -1.3108760118484497, "step": 3080 }, { "epoch": 0.53, "grad_norm": 2.890625, "learning_rate": 2.6533135687814753e-06, "logits/chosen": -3.0698859691619873, "logits/rejected": -3.0649948120117188, "logps/chosen": -162.9439697265625, "logps/rejected": -186.91714477539062, "loss": 0.6026, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9822311401367188, "rewards/margins": 0.2814427614212036, "rewards/rejected": -1.2636739015579224, "step": 3090 }, { "epoch": 0.53, "grad_norm": 3.3125, "learning_rate": 2.638299013843564e-06, "logits/chosen": -3.0506327152252197, "logits/rejected": -3.0413401126861572, "logps/chosen": -169.4080047607422, "logps/rejected": -192.44485473632812, "loss": 0.6082, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0290443897247314, "rewards/margins": 0.290638267993927, "rewards/rejected": -1.3196827173233032, "step": 3100 }, { "epoch": 0.53, "eval_logits/chosen": -3.072305202484131, "eval_logits/rejected": -3.067795753479004, "eval_logps/chosen": -149.20468139648438, "eval_logps/rejected": -167.45477294921875, "eval_loss": 0.6500055193901062, "eval_rewards/accuracies": 0.6115241646766663, "eval_rewards/chosen": -0.7779466509819031, "eval_rewards/margins": 0.14553073048591614, "eval_rewards/rejected": -0.9234774708747864, "eval_runtime": 483.8699, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 3100 }, { "epoch": 0.54, "grad_norm": 2.890625, "learning_rate": 2.6232794534480866e-06, "logits/chosen": -3.064969301223755, "logits/rejected": -3.0633625984191895, "logps/chosen": -165.35995483398438, "logps/rejected": -188.6304168701172, "loss": 0.6339, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.046263337135315, "rewards/margins": 0.20431256294250488, "rewards/rejected": -1.2505757808685303, "step": 3110 }, { "epoch": 0.54, "grad_norm": 2.4375, "learning_rate": 2.6082554311981425e-06, "logits/chosen": -3.0731215476989746, "logits/rejected": -3.0652499198913574, "logps/chosen": -161.23680114746094, "logps/rejected": -184.52694702148438, "loss": 0.6057, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9424988627433777, "rewards/margins": 0.26860615611076355, "rewards/rejected": -1.2111051082611084, "step": 3120 }, { "epoch": 0.54, "grad_norm": 2.984375, "learning_rate": 2.5932274908583146e-06, "logits/chosen": -3.0556998252868652, "logits/rejected": -3.0475969314575195, "logps/chosen": -160.74111938476562, "logps/rejected": -189.58358764648438, "loss": 0.6158, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9874521493911743, "rewards/margins": 0.28721556067466736, "rewards/rejected": -1.2746676206588745, "step": 3130 }, { "epoch": 0.54, "grad_norm": 2.6875, "learning_rate": 2.578196176334995e-06, "logits/chosen": -3.0566956996917725, "logits/rejected": -3.05082106590271, "logps/chosen": -170.5960693359375, "logps/rejected": -200.77806091308594, "loss": 0.5993, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0843502283096313, "rewards/margins": 0.3123868405818939, "rewards/rejected": -1.3967368602752686, "step": 3140 }, { "epoch": 0.54, "grad_norm": 2.671875, "learning_rate": 2.5631620316566986e-06, "logits/chosen": -3.0403594970703125, "logits/rejected": -3.0373871326446533, "logps/chosen": -172.71676635742188, "logps/rejected": -191.23036193847656, "loss": 0.6288, "rewards/accuracies": 0.65625, "rewards/chosen": -1.069809079170227, "rewards/margins": 0.2294754534959793, "rewards/rejected": -1.2992844581604004, "step": 3150 }, { "epoch": 0.54, "grad_norm": 2.171875, "learning_rate": 2.548125600954371e-06, "logits/chosen": -3.0161185264587402, "logits/rejected": -3.0118935108184814, "logps/chosen": -172.9065399169922, "logps/rejected": -192.70851135253906, "loss": 0.629, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0712924003601074, "rewards/margins": 0.25309473276138306, "rewards/rejected": -1.3243870735168457, "step": 3160 }, { "epoch": 0.55, "grad_norm": 2.3125, "learning_rate": 2.5330874284416956e-06, "logits/chosen": -3.0632405281066895, "logits/rejected": -3.053253650665283, "logps/chosen": -173.99534606933594, "logps/rejected": -190.8984375, "loss": 0.6094, "rewards/accuracies": 0.625, "rewards/chosen": -1.0698506832122803, "rewards/margins": 0.27976253628730774, "rewards/rejected": -1.3496131896972656, "step": 3170 }, { "epoch": 0.55, "grad_norm": 2.765625, "learning_rate": 2.5180480583953974e-06, "logits/chosen": -3.075591564178467, "logits/rejected": -3.0682435035705566, "logps/chosen": -169.34739685058594, "logps/rejected": -191.22177124023438, "loss": 0.6188, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.057546615600586, "rewards/margins": 0.26352807879447937, "rewards/rejected": -1.3210748434066772, "step": 3180 }, { "epoch": 0.55, "grad_norm": 2.34375, "learning_rate": 2.5030080351355452e-06, "logits/chosen": -3.079733371734619, "logits/rejected": -3.0735690593719482, "logps/chosen": -154.4625244140625, "logps/rejected": -188.51358032226562, "loss": 0.5911, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9075784683227539, "rewards/margins": 0.32719746232032776, "rewards/rejected": -1.2347759008407593, "step": 3190 }, { "epoch": 0.55, "grad_norm": 2.171875, "learning_rate": 2.4879679030058478e-06, "logits/chosen": -3.0768027305603027, "logits/rejected": -3.0705323219299316, "logps/chosen": -158.77996826171875, "logps/rejected": -180.58169555664062, "loss": 0.6072, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9090372323989868, "rewards/margins": 0.2884618937969208, "rewards/rejected": -1.1974990367889404, "step": 3200 }, { "epoch": 0.55, "eval_logits/chosen": -3.0862679481506348, "eval_logits/rejected": -3.081866979598999, "eval_logps/chosen": -147.08103942871094, "eval_logps/rejected": -164.96694946289062, "eval_loss": 0.6499215364456177, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.7567103505134583, "eval_rewards/margins": 0.14188869297504425, "eval_rewards/rejected": -0.8985989689826965, "eval_runtime": 483.5034, "eval_samples_per_second": 8.902, "eval_steps_per_second": 1.113, "step": 3200 }, { "epoch": 0.55, "grad_norm": 2.53125, "learning_rate": 2.472928206353955e-06, "logits/chosen": -3.0501890182495117, "logits/rejected": -3.041801929473877, "logps/chosen": -160.53079223632812, "logps/rejected": -183.5560302734375, "loss": 0.6043, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9603055715560913, "rewards/margins": 0.26849913597106934, "rewards/rejected": -1.228804588317871, "step": 3210 }, { "epoch": 0.55, "grad_norm": 2.734375, "learning_rate": 2.4578894895117554e-06, "logits/chosen": -3.0532243251800537, "logits/rejected": -3.0485551357269287, "logps/chosen": -156.84738159179688, "logps/rejected": -185.34927368164062, "loss": 0.6054, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9624822735786438, "rewards/margins": 0.2839977741241455, "rewards/rejected": -1.2464802265167236, "step": 3220 }, { "epoch": 0.56, "grad_norm": 3.453125, "learning_rate": 2.442852296775674e-06, "logits/chosen": -3.0451407432556152, "logits/rejected": -3.040626287460327, "logps/chosen": -164.8292694091797, "logps/rejected": -191.13607788085938, "loss": 0.6239, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0101099014282227, "rewards/margins": 0.2563210129737854, "rewards/rejected": -1.2664308547973633, "step": 3230 }, { "epoch": 0.56, "grad_norm": 2.921875, "learning_rate": 2.427817172386977e-06, "logits/chosen": -3.0670342445373535, "logits/rejected": -3.06071138381958, "logps/chosen": -174.47415161132812, "logps/rejected": -188.95457458496094, "loss": 0.6368, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0790221691131592, "rewards/margins": 0.21176621317863464, "rewards/rejected": -1.290788173675537, "step": 3240 }, { "epoch": 0.56, "grad_norm": 2.9375, "learning_rate": 2.412784660512068e-06, "logits/chosen": -3.061464309692383, "logits/rejected": -3.0560860633850098, "logps/chosen": -163.1319580078125, "logps/rejected": -184.18960571289062, "loss": 0.6335, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.000069499015808, "rewards/margins": 0.22641122341156006, "rewards/rejected": -1.2264807224273682, "step": 3250 }, { "epoch": 0.56, "grad_norm": 2.359375, "learning_rate": 2.397755305222797e-06, "logits/chosen": -3.066709041595459, "logits/rejected": -3.0596566200256348, "logps/chosen": -156.2583770751953, "logps/rejected": -181.06417846679688, "loss": 0.6222, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9408279657363892, "rewards/margins": 0.2763696610927582, "rewards/rejected": -1.2171975374221802, "step": 3260 }, { "epoch": 0.56, "grad_norm": 1.9921875, "learning_rate": 2.3827296504767667e-06, "logits/chosen": -3.0858919620513916, "logits/rejected": -3.0763540267944336, "logps/chosen": -166.6835479736328, "logps/rejected": -190.68980407714844, "loss": 0.6257, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0163527727127075, "rewards/margins": 0.24425125122070312, "rewards/rejected": -1.2606040239334106, "step": 3270 }, { "epoch": 0.57, "grad_norm": 3.0, "learning_rate": 2.3677082400976473e-06, "logits/chosen": -3.06581449508667, "logits/rejected": -3.061218738555908, "logps/chosen": -161.67579650878906, "logps/rejected": -186.04104614257812, "loss": 0.6247, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.017340064048767, "rewards/margins": 0.23519904911518097, "rewards/rejected": -1.2525392770767212, "step": 3280 }, { "epoch": 0.57, "grad_norm": 2.6875, "learning_rate": 2.352691617755492e-06, "logits/chosen": -3.052398204803467, "logits/rejected": -3.039611339569092, "logps/chosen": -168.47596740722656, "logps/rejected": -199.4940185546875, "loss": 0.5945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.046691656112671, "rewards/margins": 0.320436954498291, "rewards/rejected": -1.367128610610962, "step": 3290 }, { "epoch": 0.57, "grad_norm": 3.640625, "learning_rate": 2.3376803269470604e-06, "logits/chosen": -3.0078794956207275, "logits/rejected": -2.995847225189209, "logps/chosen": -192.8760986328125, "logps/rejected": -219.29867553710938, "loss": 0.6142, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.276427149772644, "rewards/margins": 0.30553415417671204, "rewards/rejected": -1.5819613933563232, "step": 3300 }, { "epoch": 0.57, "eval_logits/chosen": -3.0087318420410156, "eval_logits/rejected": -3.002570629119873, "eval_logps/chosen": -179.26649475097656, "eval_logps/rejected": -200.59915161132812, "eval_loss": 0.6467859745025635, "eval_rewards/accuracies": 0.6175650358200073, "eval_rewards/chosen": -1.0785648822784424, "eval_rewards/margins": 0.17635630071163177, "eval_rewards/rejected": -1.2549211978912354, "eval_runtime": 484.158, "eval_samples_per_second": 8.89, "eval_steps_per_second": 1.111, "step": 3300 }, { "epoch": 0.57, "grad_norm": 2.5, "learning_rate": 2.3226749109761475e-06, "logits/chosen": -2.9723334312438965, "logits/rejected": -2.9634299278259277, "logps/chosen": -190.951904296875, "logps/rejected": -217.7271728515625, "loss": 0.6006, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.264145851135254, "rewards/margins": 0.32409703731536865, "rewards/rejected": -1.5882428884506226, "step": 3310 }, { "epoch": 0.57, "grad_norm": 2.234375, "learning_rate": 2.3076759129339222e-06, "logits/chosen": -3.0012853145599365, "logits/rejected": -2.989985704421997, "logps/chosen": -189.26229858398438, "logps/rejected": -215.1383056640625, "loss": 0.5977, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2482668161392212, "rewards/margins": 0.33457013964653015, "rewards/rejected": -1.5828371047973633, "step": 3320 }, { "epoch": 0.57, "grad_norm": 2.640625, "learning_rate": 2.2926838756792668e-06, "logits/chosen": -3.023132801055908, "logits/rejected": -3.014843463897705, "logps/chosen": -178.64822387695312, "logps/rejected": -212.94601440429688, "loss": 0.5775, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1388877630233765, "rewards/margins": 0.37283557653427124, "rewards/rejected": -1.511723279953003, "step": 3330 }, { "epoch": 0.58, "grad_norm": 4.28125, "learning_rate": 2.2776993418191332e-06, "logits/chosen": -3.0232415199279785, "logits/rejected": -3.01299786567688, "logps/chosen": -187.7949676513672, "logps/rejected": -215.0971221923828, "loss": 0.606, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2033436298370361, "rewards/margins": 0.3320937752723694, "rewards/rejected": -1.5354373455047607, "step": 3340 }, { "epoch": 0.58, "grad_norm": 2.21875, "learning_rate": 2.262722853688902e-06, "logits/chosen": -3.015946865081787, "logits/rejected": -3.005009889602661, "logps/chosen": -181.58934020996094, "logps/rejected": -211.24002075195312, "loss": 0.5997, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1483066082000732, "rewards/margins": 0.327314555644989, "rewards/rejected": -1.475620985031128, "step": 3350 }, { "epoch": 0.58, "grad_norm": 3.03125, "learning_rate": 2.247754953332754e-06, "logits/chosen": -3.0162365436553955, "logits/rejected": -3.0121781826019287, "logps/chosen": -179.1669158935547, "logps/rejected": -198.03746032714844, "loss": 0.6294, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1394041776657104, "rewards/margins": 0.24760189652442932, "rewards/rejected": -1.3870060443878174, "step": 3360 }, { "epoch": 0.58, "grad_norm": 2.953125, "learning_rate": 2.2327961824840564e-06, "logits/chosen": -3.0204949378967285, "logits/rejected": -3.0136327743530273, "logps/chosen": -171.4439239501953, "logps/rejected": -204.19378662109375, "loss": 0.5862, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0856364965438843, "rewards/margins": 0.338853657245636, "rewards/rejected": -1.424490213394165, "step": 3370 }, { "epoch": 0.58, "grad_norm": 3.484375, "learning_rate": 2.2178470825457464e-06, "logits/chosen": -3.031080484390259, "logits/rejected": -3.026292562484741, "logps/chosen": -176.1151580810547, "logps/rejected": -195.78326416015625, "loss": 0.6171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.095861554145813, "rewards/margins": 0.27515116333961487, "rewards/rejected": -1.3710126876831055, "step": 3380 }, { "epoch": 0.58, "grad_norm": 2.6875, "learning_rate": 2.2029081945707473e-06, "logits/chosen": -3.0416462421417236, "logits/rejected": -3.0330119132995605, "logps/chosen": -168.8120574951172, "logps/rejected": -191.52745056152344, "loss": 0.6212, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.056342363357544, "rewards/margins": 0.2708873152732849, "rewards/rejected": -1.3272297382354736, "step": 3390 }, { "epoch": 0.59, "grad_norm": 2.625, "learning_rate": 2.1879800592423758e-06, "logits/chosen": -3.0537242889404297, "logits/rejected": -3.044168472290039, "logps/chosen": -166.37783813476562, "logps/rejected": -195.04409790039062, "loss": 0.602, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.031355619430542, "rewards/margins": 0.329393595457077, "rewards/rejected": -1.3607490062713623, "step": 3400 }, { "epoch": 0.59, "eval_logits/chosen": -3.0674045085906982, "eval_logits/rejected": -3.0623619556427, "eval_logps/chosen": -150.30816650390625, "eval_logps/rejected": -168.40872192382812, "eval_loss": 0.6503540873527527, "eval_rewards/accuracies": 0.6136152148246765, "eval_rewards/chosen": -0.7889814972877502, "eval_rewards/margins": 0.14403526484966278, "eval_rewards/rejected": -0.933016836643219, "eval_runtime": 492.8396, "eval_samples_per_second": 8.733, "eval_steps_per_second": 1.092, "step": 3400 }, { "epoch": 0.59, "grad_norm": 2.828125, "learning_rate": 2.1730632168547807e-06, "logits/chosen": -3.045454263687134, "logits/rejected": -3.0381667613983154, "logps/chosen": -153.18984985351562, "logps/rejected": -168.77667236328125, "loss": 0.6296, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.931566059589386, "rewards/margins": 0.20537595450878143, "rewards/rejected": -1.1369420289993286, "step": 3410 }, { "epoch": 0.59, "grad_norm": 4.1875, "learning_rate": 2.1581582072933873e-06, "logits/chosen": -3.0582358837127686, "logits/rejected": -3.053211212158203, "logps/chosen": -157.33743286132812, "logps/rejected": -184.45919799804688, "loss": 0.5964, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9351167678833008, "rewards/margins": 0.2938837707042694, "rewards/rejected": -1.2290005683898926, "step": 3420 }, { "epoch": 0.59, "grad_norm": 2.640625, "learning_rate": 2.1432655700153496e-06, "logits/chosen": -3.0582404136657715, "logits/rejected": -3.050635576248169, "logps/chosen": -165.08010864257812, "logps/rejected": -189.5339813232422, "loss": 0.6278, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.032097578048706, "rewards/margins": 0.23791618645191193, "rewards/rejected": -1.2700138092041016, "step": 3430 }, { "epoch": 0.59, "grad_norm": 2.8125, "learning_rate": 2.1283858440300376e-06, "logits/chosen": -3.0382144451141357, "logits/rejected": -3.025315761566162, "logps/chosen": -173.4113006591797, "logps/rejected": -204.8324432373047, "loss": 0.5897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.112926721572876, "rewards/margins": 0.3371250629425049, "rewards/rejected": -1.4500519037246704, "step": 3440 }, { "epoch": 0.59, "grad_norm": 2.609375, "learning_rate": 2.113519567879517e-06, "logits/chosen": -3.0533881187438965, "logits/rejected": -3.050487995147705, "logps/chosen": -183.76519775390625, "logps/rejected": -200.22569274902344, "loss": 0.6285, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1319319009780884, "rewards/margins": 0.24390754103660583, "rewards/rejected": -1.375839352607727, "step": 3450 }, { "epoch": 0.6, "grad_norm": 2.671875, "learning_rate": 2.098667279619069e-06, "logits/chosen": -3.0273451805114746, "logits/rejected": -3.0172743797302246, "logps/chosen": -165.3519744873047, "logps/rejected": -193.7327117919922, "loss": 0.6054, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0217342376708984, "rewards/margins": 0.29740262031555176, "rewards/rejected": -1.3191369771957397, "step": 3460 }, { "epoch": 0.6, "grad_norm": 2.8125, "learning_rate": 2.0838295167977066e-06, "logits/chosen": -3.0574910640716553, "logits/rejected": -3.0506603717803955, "logps/chosen": -173.7638397216797, "logps/rejected": -199.7215118408203, "loss": 0.6039, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0668748617172241, "rewards/margins": 0.31231045722961426, "rewards/rejected": -1.379185438156128, "step": 3470 }, { "epoch": 0.6, "grad_norm": 2.96875, "learning_rate": 2.069006816438725e-06, "logits/chosen": -3.0340256690979004, "logits/rejected": -3.024568796157837, "logps/chosen": -173.4439697265625, "logps/rejected": -197.1393280029297, "loss": 0.6132, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0758858919143677, "rewards/margins": 0.2928495705127716, "rewards/rejected": -1.368735432624817, "step": 3480 }, { "epoch": 0.6, "grad_norm": 3.40625, "learning_rate": 2.054199715020266e-06, "logits/chosen": -3.0460727214813232, "logits/rejected": -3.0427908897399902, "logps/chosen": -176.40223693847656, "logps/rejected": -196.90025329589844, "loss": 0.629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1191596984863281, "rewards/margins": 0.22387664020061493, "rewards/rejected": -1.343036413192749, "step": 3490 }, { "epoch": 0.6, "grad_norm": 2.640625, "learning_rate": 2.039408748455894e-06, "logits/chosen": -3.0397393703460693, "logits/rejected": -3.0337138175964355, "logps/chosen": -166.6004638671875, "logps/rejected": -191.49612426757812, "loss": 0.605, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0314760208129883, "rewards/margins": 0.26015573740005493, "rewards/rejected": -1.2916316986083984, "step": 3500 }, { "epoch": 0.6, "eval_logits/chosen": -3.058988332748413, "eval_logits/rejected": -3.0537822246551514, "eval_logps/chosen": -154.17901611328125, "eval_logps/rejected": -172.9109344482422, "eval_loss": 0.6496783494949341, "eval_rewards/accuracies": 0.6122211813926697, "eval_rewards/chosen": -0.8276901245117188, "eval_rewards/margins": 0.15034890174865723, "eval_rewards/rejected": -0.9780389070510864, "eval_runtime": 497.1419, "eval_samples_per_second": 8.657, "eval_steps_per_second": 1.082, "step": 3500 }, { "epoch": 0.6, "grad_norm": 2.25, "learning_rate": 2.024634452075209e-06, "logits/chosen": -3.0433077812194824, "logits/rejected": -3.0365288257598877, "logps/chosen": -169.48248291015625, "logps/rejected": -189.098388671875, "loss": 0.6227, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0600249767303467, "rewards/margins": 0.2405003011226654, "rewards/rejected": -1.3005253076553345, "step": 3510 }, { "epoch": 0.61, "grad_norm": 2.671875, "learning_rate": 2.0098773606044627e-06, "logits/chosen": -3.042524814605713, "logits/rejected": -3.03288197517395, "logps/chosen": -166.3798370361328, "logps/rejected": -187.0223846435547, "loss": 0.6148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0103702545166016, "rewards/margins": 0.2540992498397827, "rewards/rejected": -1.2644695043563843, "step": 3520 }, { "epoch": 0.61, "grad_norm": 4.6875, "learning_rate": 1.9951380081472135e-06, "logits/chosen": -3.0522756576538086, "logits/rejected": -3.0436208248138428, "logps/chosen": -171.316650390625, "logps/rejected": -194.4859619140625, "loss": 0.6068, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0492935180664062, "rewards/margins": 0.3023799955844879, "rewards/rejected": -1.3516733646392822, "step": 3530 }, { "epoch": 0.61, "grad_norm": 2.890625, "learning_rate": 1.9804169281649873e-06, "logits/chosen": -3.038957118988037, "logits/rejected": -3.0326712131500244, "logps/chosen": -171.6193389892578, "logps/rejected": -189.50172424316406, "loss": 0.6394, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.062083125114441, "rewards/margins": 0.2282875031232834, "rewards/rejected": -1.2903707027435303, "step": 3540 }, { "epoch": 0.61, "grad_norm": 2.75, "learning_rate": 1.965714653457979e-06, "logits/chosen": -3.0525827407836914, "logits/rejected": -3.0477380752563477, "logps/chosen": -173.34603881835938, "logps/rejected": -188.1285858154297, "loss": 0.6458, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0868390798568726, "rewards/margins": 0.18998117744922638, "rewards/rejected": -1.276820421218872, "step": 3550 }, { "epoch": 0.61, "grad_norm": 3.484375, "learning_rate": 1.9510317161457586e-06, "logits/chosen": -3.055346965789795, "logits/rejected": -3.047273874282837, "logps/chosen": -164.53245544433594, "logps/rejected": -185.46621704101562, "loss": 0.6174, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9918788075447083, "rewards/margins": 0.25856080651283264, "rewards/rejected": -1.2504394054412842, "step": 3560 }, { "epoch": 0.62, "grad_norm": 2.703125, "learning_rate": 1.936368647648022e-06, "logits/chosen": -3.0525732040405273, "logits/rejected": -3.0441994667053223, "logps/chosen": -179.19534301757812, "logps/rejected": -193.872314453125, "loss": 0.6613, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1191743612289429, "rewards/margins": 0.19655336439609528, "rewards/rejected": -1.315727710723877, "step": 3570 }, { "epoch": 0.62, "grad_norm": 2.046875, "learning_rate": 1.9217259786653513e-06, "logits/chosen": -3.0564985275268555, "logits/rejected": -3.0520143508911133, "logps/chosen": -175.3017120361328, "logps/rejected": -195.2724151611328, "loss": 0.6279, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0685102939605713, "rewards/margins": 0.2402305155992508, "rewards/rejected": -1.3087408542633057, "step": 3580 }, { "epoch": 0.62, "grad_norm": 2.796875, "learning_rate": 1.9071042391600074e-06, "logits/chosen": -3.072998046875, "logits/rejected": -3.0672171115875244, "logps/chosen": -173.44651794433594, "logps/rejected": -193.4931640625, "loss": 0.6401, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0806306600570679, "rewards/margins": 0.24041788280010223, "rewards/rejected": -1.321048617362976, "step": 3590 }, { "epoch": 0.62, "grad_norm": 3.28125, "learning_rate": 1.8925039583367535e-06, "logits/chosen": -3.05549955368042, "logits/rejected": -3.0492753982543945, "logps/chosen": -166.55067443847656, "logps/rejected": -186.08181762695312, "loss": 0.6263, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0093283653259277, "rewards/margins": 0.24818992614746094, "rewards/rejected": -1.2575181722640991, "step": 3600 }, { "epoch": 0.62, "eval_logits/chosen": -3.0720903873443604, "eval_logits/rejected": -3.067195177078247, "eval_logps/chosen": -149.97569274902344, "eval_logps/rejected": -168.07350158691406, "eval_loss": 0.650830090045929, "eval_rewards/accuracies": 0.6043215394020081, "eval_rewards/chosen": -0.785656750202179, "eval_rewards/margins": 0.14400769770145416, "eval_rewards/rejected": -0.9296644330024719, "eval_runtime": 497.1837, "eval_samples_per_second": 8.657, "eval_steps_per_second": 1.082, "step": 3600 }, { "epoch": 0.62, "grad_norm": 2.65625, "learning_rate": 1.8779256646236945e-06, "logits/chosen": -3.050173044204712, "logits/rejected": -3.0390946865081787, "logps/chosen": -176.19290161132812, "logps/rejected": -194.0640411376953, "loss": 0.6329, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.14492928981781, "rewards/margins": 0.2223505675792694, "rewards/rejected": -1.3672797679901123, "step": 3610 }, { "epoch": 0.62, "grad_norm": 2.609375, "learning_rate": 1.8633698856531602e-06, "logits/chosen": -3.0332961082458496, "logits/rejected": -3.0208582878112793, "logps/chosen": -166.79562377929688, "logps/rejected": -198.54063415527344, "loss": 0.5671, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9951320886611938, "rewards/margins": 0.36578112840652466, "rewards/rejected": -1.3609130382537842, "step": 3620 }, { "epoch": 0.63, "grad_norm": 3.3125, "learning_rate": 1.8488371482425988e-06, "logits/chosen": -3.019685983657837, "logits/rejected": -3.0057766437530518, "logps/chosen": -181.3262176513672, "logps/rejected": -221.469970703125, "loss": 0.5866, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1980793476104736, "rewards/margins": 0.3820146918296814, "rewards/rejected": -1.5800940990447998, "step": 3630 }, { "epoch": 0.63, "grad_norm": 2.6875, "learning_rate": 1.8343279783755208e-06, "logits/chosen": -2.9965124130249023, "logits/rejected": -2.990307569503784, "logps/chosen": -184.87025451660156, "logps/rejected": -211.8139190673828, "loss": 0.6045, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1814584732055664, "rewards/margins": 0.29887276887893677, "rewards/rejected": -1.4803311824798584, "step": 3640 }, { "epoch": 0.63, "grad_norm": 3.28125, "learning_rate": 1.8198429011824515e-06, "logits/chosen": -2.991365671157837, "logits/rejected": -2.983931064605713, "logps/chosen": -182.23580932617188, "logps/rejected": -213.08407592773438, "loss": 0.6193, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2039625644683838, "rewards/margins": 0.291355162858963, "rewards/rejected": -1.4953176975250244, "step": 3650 }, { "epoch": 0.63, "grad_norm": 3.0, "learning_rate": 1.8053824409219322e-06, "logits/chosen": -2.9847915172576904, "logits/rejected": -2.9697489738464355, "logps/chosen": -196.5591583251953, "logps/rejected": -231.37966918945312, "loss": 0.5736, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3088271617889404, "rewards/margins": 0.39994674921035767, "rewards/rejected": -1.7087738513946533, "step": 3660 }, { "epoch": 0.63, "grad_norm": 2.625, "learning_rate": 1.7909471209615447e-06, "logits/chosen": -2.9805190563201904, "logits/rejected": -2.973362445831299, "logps/chosen": -197.0777587890625, "logps/rejected": -217.090576171875, "loss": 0.6475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.328054666519165, "rewards/margins": 0.23754820227622986, "rewards/rejected": -1.5656030178070068, "step": 3670 }, { "epoch": 0.63, "grad_norm": 2.890625, "learning_rate": 1.7765374637589632e-06, "logits/chosen": -3.008643627166748, "logits/rejected": -3.003309965133667, "logps/chosen": -195.97483825683594, "logps/rejected": -215.29763793945312, "loss": 0.6228, "rewards/accuracies": 0.625, "rewards/chosen": -1.2987301349639893, "rewards/margins": 0.2805440127849579, "rewards/rejected": -1.5792741775512695, "step": 3680 }, { "epoch": 0.64, "grad_norm": 3.078125, "learning_rate": 1.7621539908430555e-06, "logits/chosen": -3.013218879699707, "logits/rejected": -3.00187087059021, "logps/chosen": -180.8542938232422, "logps/rejected": -219.6073455810547, "loss": 0.5985, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1830642223358154, "rewards/margins": 0.3425402045249939, "rewards/rejected": -1.525604486465454, "step": 3690 }, { "epoch": 0.64, "grad_norm": 2.546875, "learning_rate": 1.7477972227949947e-06, "logits/chosen": -2.990821361541748, "logits/rejected": -2.9791314601898193, "logps/chosen": -187.114013671875, "logps/rejected": -219.70413208007812, "loss": 0.5961, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1857783794403076, "rewards/margins": 0.34895533323287964, "rewards/rejected": -1.534733772277832, "step": 3700 }, { "epoch": 0.64, "eval_logits/chosen": -3.0151216983795166, "eval_logits/rejected": -3.0089986324310303, "eval_logps/chosen": -169.4567108154297, "eval_logps/rejected": -189.36888122558594, "eval_loss": 0.6491547226905823, "eval_rewards/accuracies": 0.6136152148246765, "eval_rewards/chosen": -0.9804668426513672, "eval_rewards/margins": 0.162151500582695, "eval_rewards/rejected": -1.1426185369491577, "eval_runtime": 497.6263, "eval_samples_per_second": 8.649, "eval_steps_per_second": 1.081, "step": 3700 }, { "epoch": 0.64, "grad_norm": 3.171875, "learning_rate": 1.7334676792294303e-06, "logits/chosen": -3.0052781105041504, "logits/rejected": -2.9978158473968506, "logps/chosen": -183.4770050048828, "logps/rejected": -205.83377075195312, "loss": 0.6272, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.192054271697998, "rewards/margins": 0.2757667601108551, "rewards/rejected": -1.4678208827972412, "step": 3710 }, { "epoch": 0.64, "grad_norm": 2.640625, "learning_rate": 1.7191658787756705e-06, "logits/chosen": -3.0063445568084717, "logits/rejected": -2.99770450592041, "logps/chosen": -177.88133239746094, "logps/rejected": -215.02456665039062, "loss": 0.5729, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.14735746383667, "rewards/margins": 0.38209640979766846, "rewards/rejected": -1.5294538736343384, "step": 3720 }, { "epoch": 0.64, "grad_norm": 2.765625, "learning_rate": 1.7048923390589211e-06, "logits/chosen": -2.99979829788208, "logits/rejected": -2.982865571975708, "logps/chosen": -190.33706665039062, "logps/rejected": -222.1634521484375, "loss": 0.5925, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2473335266113281, "rewards/margins": 0.38343560695648193, "rewards/rejected": -1.63076913356781, "step": 3730 }, { "epoch": 0.64, "grad_norm": 2.65625, "learning_rate": 1.6906475766815455e-06, "logits/chosen": -3.015282392501831, "logits/rejected": -3.006476402282715, "logps/chosen": -177.47451782226562, "logps/rejected": -208.50765991210938, "loss": 0.6071, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1354906558990479, "rewards/margins": 0.3008057475090027, "rewards/rejected": -1.4362964630126953, "step": 3740 }, { "epoch": 0.65, "grad_norm": 2.375, "learning_rate": 1.676432107204367e-06, "logits/chosen": -3.021735191345215, "logits/rejected": -3.017714500427246, "logps/chosen": -183.16357421875, "logps/rejected": -203.0084991455078, "loss": 0.6401, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1851723194122314, "rewards/margins": 0.2241462767124176, "rewards/rejected": -1.4093185663223267, "step": 3750 }, { "epoch": 0.65, "grad_norm": 2.953125, "learning_rate": 1.6622464451280131e-06, "logits/chosen": -3.032392978668213, "logits/rejected": -3.025547504425049, "logps/chosen": -190.98033142089844, "logps/rejected": -205.5364227294922, "loss": 0.6542, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2406162023544312, "rewards/margins": 0.17366747558116913, "rewards/rejected": -1.4142837524414062, "step": 3760 }, { "epoch": 0.65, "grad_norm": 2.640625, "learning_rate": 1.6480911038742892e-06, "logits/chosen": -3.0329129695892334, "logits/rejected": -3.022871255874634, "logps/chosen": -177.671630859375, "logps/rejected": -199.72903442382812, "loss": 0.6289, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1310114860534668, "rewards/margins": 0.24106483161449432, "rewards/rejected": -1.3720762729644775, "step": 3770 }, { "epoch": 0.65, "grad_norm": 4.03125, "learning_rate": 1.6339665957676012e-06, "logits/chosen": -3.0127625465393066, "logits/rejected": -3.005437135696411, "logps/chosen": -182.5919189453125, "logps/rejected": -203.1020050048828, "loss": 0.6167, "rewards/accuracies": 0.6875, "rewards/chosen": -1.182477593421936, "rewards/margins": 0.27642253041267395, "rewards/rejected": -1.4589000940322876, "step": 3780 }, { "epoch": 0.65, "grad_norm": 3.328125, "learning_rate": 1.6198734320164084e-06, "logits/chosen": -3.0002925395965576, "logits/rejected": -2.9951531887054443, "logps/chosen": -184.12808227539062, "logps/rejected": -204.37026977539062, "loss": 0.62, "rewards/accuracies": 0.625, "rewards/chosen": -1.200259804725647, "rewards/margins": 0.2649845778942108, "rewards/rejected": -1.4652442932128906, "step": 3790 }, { "epoch": 0.65, "grad_norm": 4.8125, "learning_rate": 1.6058121226947265e-06, "logits/chosen": -2.9992926120758057, "logits/rejected": -2.9876251220703125, "logps/chosen": -187.41111755371094, "logps/rejected": -206.1591796875, "loss": 0.6273, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1964071989059448, "rewards/margins": 0.2528632581233978, "rewards/rejected": -1.449270486831665, "step": 3800 }, { "epoch": 0.65, "eval_logits/chosen": -3.011683702468872, "eval_logits/rejected": -3.005713701248169, "eval_logps/chosen": -167.98045349121094, "eval_logps/rejected": -187.6573028564453, "eval_loss": 0.6493790149688721, "eval_rewards/accuracies": 0.6140799522399902, "eval_rewards/chosen": -0.9657043814659119, "eval_rewards/margins": 0.1597982496023178, "eval_rewards/rejected": -1.1255027055740356, "eval_runtime": 496.5286, "eval_samples_per_second": 8.668, "eval_steps_per_second": 1.084, "step": 3800 }, { "epoch": 0.66, "grad_norm": 3.375, "learning_rate": 1.5917831767236597e-06, "logits/chosen": -3.01556134223938, "logits/rejected": -3.0062661170959473, "logps/chosen": -193.03201293945312, "logps/rejected": -212.33935546875, "loss": 0.6215, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2390611171722412, "rewards/margins": 0.2815569341182709, "rewards/rejected": -1.5206180810928345, "step": 3810 }, { "epoch": 0.66, "grad_norm": 2.671875, "learning_rate": 1.577787101852988e-06, "logits/chosen": -3.0076615810394287, "logits/rejected": -3.0015645027160645, "logps/chosen": -181.38314819335938, "logps/rejected": -206.2713623046875, "loss": 0.6031, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1591265201568604, "rewards/margins": 0.2929556965827942, "rewards/rejected": -1.4520821571350098, "step": 3820 }, { "epoch": 0.66, "grad_norm": 4.3125, "learning_rate": 1.5638244046427879e-06, "logits/chosen": -3.0184571743011475, "logits/rejected": -3.006317615509033, "logps/chosen": -186.52359008789062, "logps/rejected": -200.72528076171875, "loss": 0.6279, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.177769422531128, "rewards/margins": 0.23591813445091248, "rewards/rejected": -1.4136877059936523, "step": 3830 }, { "epoch": 0.66, "grad_norm": 2.09375, "learning_rate": 1.549895590445094e-06, "logits/chosen": -3.0161845684051514, "logits/rejected": -3.0071911811828613, "logps/chosen": -181.43736267089844, "logps/rejected": -220.2910614013672, "loss": 0.5827, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1619912385940552, "rewards/margins": 0.3711855411529541, "rewards/rejected": -1.5331767797470093, "step": 3840 }, { "epoch": 0.66, "grad_norm": 3.28125, "learning_rate": 1.5360011633856175e-06, "logits/chosen": -3.034837007522583, "logits/rejected": -3.0271098613739014, "logps/chosen": -180.33920288085938, "logps/rejected": -202.33938598632812, "loss": 0.6091, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1167991161346436, "rewards/margins": 0.28126758337020874, "rewards/rejected": -1.398066759109497, "step": 3850 }, { "epoch": 0.67, "grad_norm": 2.515625, "learning_rate": 1.5221416263454914e-06, "logits/chosen": -3.0175633430480957, "logits/rejected": -3.0098071098327637, "logps/chosen": -184.2635955810547, "logps/rejected": -209.46450805664062, "loss": 0.613, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1878941059112549, "rewards/margins": 0.2898927927017212, "rewards/rejected": -1.4777867794036865, "step": 3860 }, { "epoch": 0.67, "grad_norm": 3.140625, "learning_rate": 1.5083174809430773e-06, "logits/chosen": -3.014516592025757, "logits/rejected": -3.0030410289764404, "logps/chosen": -188.04342651367188, "logps/rejected": -216.844482421875, "loss": 0.5909, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.179465889930725, "rewards/margins": 0.35519683361053467, "rewards/rejected": -1.5346627235412598, "step": 3870 }, { "epoch": 0.67, "grad_norm": 3.40625, "learning_rate": 1.4945292275158044e-06, "logits/chosen": -2.9874320030212402, "logits/rejected": -2.9876856803894043, "logps/chosen": -191.178955078125, "logps/rejected": -210.03970336914062, "loss": 0.653, "rewards/accuracies": 0.625, "rewards/chosen": -1.294626235961914, "rewards/margins": 0.18726655840873718, "rewards/rejected": -1.4818929433822632, "step": 3880 }, { "epoch": 0.67, "grad_norm": 2.90625, "learning_rate": 1.4807773651020645e-06, "logits/chosen": -3.010956048965454, "logits/rejected": -3.004424571990967, "logps/chosen": -183.68826293945312, "logps/rejected": -208.5262451171875, "loss": 0.6147, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1942976713180542, "rewards/margins": 0.2909819781780243, "rewards/rejected": -1.4852796792984009, "step": 3890 }, { "epoch": 0.67, "grad_norm": 2.609375, "learning_rate": 1.467062391423149e-06, "logits/chosen": -3.0182414054870605, "logits/rejected": -3.0148303508758545, "logps/chosen": -188.4974822998047, "logps/rejected": -207.0945281982422, "loss": 0.6183, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2032827138900757, "rewards/margins": 0.2851700186729431, "rewards/rejected": -1.488452672958374, "step": 3900 }, { "epoch": 0.67, "eval_logits/chosen": -3.0137338638305664, "eval_logits/rejected": -3.0076637268066406, "eval_logps/chosen": -167.441650390625, "eval_logps/rejected": -187.2733917236328, "eval_loss": 0.6487549543380737, "eval_rewards/accuracies": 0.6166356801986694, "eval_rewards/chosen": -0.9603161215782166, "eval_rewards/margins": 0.16134725511074066, "eval_rewards/rejected": -1.1216634511947632, "eval_runtime": 493.6855, "eval_samples_per_second": 8.718, "eval_steps_per_second": 1.09, "step": 3900 }, { "epoch": 0.67, "grad_norm": 2.65625, "learning_rate": 1.4533848028652347e-06, "logits/chosen": -3.0164458751678467, "logits/rejected": -3.0086934566497803, "logps/chosen": -185.79513549804688, "logps/rejected": -216.59213256835938, "loss": 0.5912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.194959282875061, "rewards/margins": 0.36031073331832886, "rewards/rejected": -1.5552700757980347, "step": 3910 }, { "epoch": 0.68, "grad_norm": 2.546875, "learning_rate": 1.4397450944614185e-06, "logits/chosen": -3.0261919498443604, "logits/rejected": -3.018491744995117, "logps/chosen": -183.15394592285156, "logps/rejected": -201.19017028808594, "loss": 0.608, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1281392574310303, "rewards/margins": 0.296304315328598, "rewards/rejected": -1.4244437217712402, "step": 3920 }, { "epoch": 0.68, "grad_norm": 2.703125, "learning_rate": 1.426143759873801e-06, "logits/chosen": -3.005645751953125, "logits/rejected": -2.9987285137176514, "logps/chosen": -181.1148223876953, "logps/rejected": -209.9329071044922, "loss": 0.5978, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.180918574333191, "rewards/margins": 0.3122202157974243, "rewards/rejected": -1.4931389093399048, "step": 3930 }, { "epoch": 0.68, "grad_norm": 3.234375, "learning_rate": 1.4125812913756174e-06, "logits/chosen": -2.989655017852783, "logits/rejected": -2.9861814975738525, "logps/chosen": -181.75909423828125, "logps/rejected": -210.435302734375, "loss": 0.5962, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.17508065700531, "rewards/margins": 0.3153669238090515, "rewards/rejected": -1.4904476404190063, "step": 3940 }, { "epoch": 0.68, "grad_norm": 4.0, "learning_rate": 1.3990581798334236e-06, "logits/chosen": -2.9854207038879395, "logits/rejected": -2.971961498260498, "logps/chosen": -188.04066467285156, "logps/rejected": -217.80105590820312, "loss": 0.5735, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2220876216888428, "rewards/margins": 0.3745902180671692, "rewards/rejected": -1.5966777801513672, "step": 3950 }, { "epoch": 0.68, "grad_norm": 3.53125, "learning_rate": 1.3855749146893285e-06, "logits/chosen": -3.0040407180786133, "logits/rejected": -2.998903512954712, "logps/chosen": -191.17031860351562, "logps/rejected": -228.12728881835938, "loss": 0.5992, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.273728370666504, "rewards/margins": 0.33710581064224243, "rewards/rejected": -1.6108341217041016, "step": 3960 }, { "epoch": 0.68, "grad_norm": 2.8125, "learning_rate": 1.3721319839432794e-06, "logits/chosen": -2.9864563941955566, "logits/rejected": -2.977466344833374, "logps/chosen": -199.20652770996094, "logps/rejected": -229.17971801757812, "loss": 0.593, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.334026575088501, "rewards/margins": 0.3332452178001404, "rewards/rejected": -1.667271614074707, "step": 3970 }, { "epoch": 0.69, "grad_norm": 2.6875, "learning_rate": 1.3587298741353999e-06, "logits/chosen": -2.967489719390869, "logits/rejected": -2.9538886547088623, "logps/chosen": -192.2974853515625, "logps/rejected": -227.893310546875, "loss": 0.5772, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2904869318008423, "rewards/margins": 0.3781260848045349, "rewards/rejected": -1.668613076210022, "step": 3980 }, { "epoch": 0.69, "grad_norm": 2.875, "learning_rate": 1.3453690703283848e-06, "logits/chosen": -2.962489604949951, "logits/rejected": -2.96486234664917, "logps/chosen": -197.5770721435547, "logps/rejected": -218.3431854248047, "loss": 0.6503, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3310292959213257, "rewards/margins": 0.22468487918376923, "rewards/rejected": -1.5557141304016113, "step": 3990 }, { "epoch": 0.69, "grad_norm": 4.15625, "learning_rate": 1.3320500560899329e-06, "logits/chosen": -2.985219717025757, "logits/rejected": -2.9798381328582764, "logps/chosen": -200.13027954101562, "logps/rejected": -224.08981323242188, "loss": 0.6051, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3275507688522339, "rewards/margins": 0.3009306490421295, "rewards/rejected": -1.6284815073013306, "step": 4000 }, { "epoch": 0.69, "eval_logits/chosen": -2.9974019527435303, "eval_logits/rejected": -2.9908156394958496, "eval_logps/chosen": -176.37387084960938, "eval_logps/rejected": -197.12547302246094, "eval_loss": 0.6481823325157166, "eval_rewards/accuracies": 0.6177973747253418, "eval_rewards/chosen": -1.0496385097503662, "eval_rewards/margins": 0.17054608464241028, "eval_rewards/rejected": -1.220184564590454, "eval_runtime": 494.8129, "eval_samples_per_second": 8.698, "eval_steps_per_second": 1.087, "step": 4000 }, { "epoch": 0.69, "grad_norm": 3.453125, "learning_rate": 1.3187733134752622e-06, "logits/chosen": -2.9619107246398926, "logits/rejected": -2.9526774883270264, "logps/chosen": -188.21420288085938, "logps/rejected": -226.4281463623047, "loss": 0.5841, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2888858318328857, "rewards/margins": 0.3644692599773407, "rewards/rejected": -1.6533548831939697, "step": 4010 }, { "epoch": 0.69, "grad_norm": 3.109375, "learning_rate": 1.3055393230096433e-06, "logits/chosen": -2.975512981414795, "logits/rejected": -2.971684217453003, "logps/chosen": -195.65884399414062, "logps/rejected": -222.35800170898438, "loss": 0.6214, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3342936038970947, "rewards/margins": 0.2772584557533264, "rewards/rejected": -1.6115522384643555, "step": 4020 }, { "epoch": 0.69, "grad_norm": 2.5625, "learning_rate": 1.2923485636710275e-06, "logits/chosen": -2.986161947250366, "logits/rejected": -2.9778332710266113, "logps/chosen": -188.88314819335938, "logps/rejected": -211.6816864013672, "loss": 0.6201, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2178932428359985, "rewards/margins": 0.25717395544052124, "rewards/rejected": -1.475067377090454, "step": 4030 }, { "epoch": 0.7, "grad_norm": 2.9375, "learning_rate": 1.279201512872693e-06, "logits/chosen": -3.0042479038238525, "logits/rejected": -2.9908108711242676, "logps/chosen": -192.40591430664062, "logps/rejected": -224.39590454101562, "loss": 0.5853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2539700269699097, "rewards/margins": 0.35769122838974, "rewards/rejected": -1.6116611957550049, "step": 4040 }, { "epoch": 0.7, "grad_norm": 2.71875, "learning_rate": 1.2660986464459817e-06, "logits/chosen": -2.980405807495117, "logits/rejected": -2.97395920753479, "logps/chosen": -186.85433959960938, "logps/rejected": -212.6918487548828, "loss": 0.6253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.247534990310669, "rewards/margins": 0.2799908518791199, "rewards/rejected": -1.5275259017944336, "step": 4050 }, { "epoch": 0.7, "grad_norm": 2.546875, "learning_rate": 1.2530404386230637e-06, "logits/chosen": -2.9891881942749023, "logits/rejected": -2.9860939979553223, "logps/chosen": -202.81207275390625, "logps/rejected": -219.61532592773438, "loss": 0.6415, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3588091135025024, "rewards/margins": 0.24134831130504608, "rewards/rejected": -1.6001571416854858, "step": 4060 }, { "epoch": 0.7, "grad_norm": 2.828125, "learning_rate": 1.2400273620197856e-06, "logits/chosen": -2.985337257385254, "logits/rejected": -2.975621223449707, "logps/chosen": -193.52359008789062, "logps/rejected": -226.47958374023438, "loss": 0.5742, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2964990139007568, "rewards/margins": 0.3701520264148712, "rewards/rejected": -1.6666511297225952, "step": 4070 }, { "epoch": 0.7, "grad_norm": 2.578125, "learning_rate": 1.2270598876185553e-06, "logits/chosen": -2.9908981323242188, "logits/rejected": -2.982999563217163, "logps/chosen": -187.68170166015625, "logps/rejected": -217.3961944580078, "loss": 0.6105, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2406961917877197, "rewards/margins": 0.3032611310482025, "rewards/rejected": -1.5439573526382446, "step": 4080 }, { "epoch": 0.7, "grad_norm": 2.796875, "learning_rate": 1.2141384847513006e-06, "logits/chosen": -3.018590211868286, "logits/rejected": -3.004499912261963, "logps/chosen": -179.45327758789062, "logps/rejected": -208.90170288085938, "loss": 0.5894, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1329796314239502, "rewards/margins": 0.34134823083877563, "rewards/rejected": -1.474327802658081, "step": 4090 }, { "epoch": 0.71, "grad_norm": 4.0625, "learning_rate": 1.2012636210824833e-06, "logits/chosen": -2.995112180709839, "logits/rejected": -2.990323543548584, "logps/chosen": -178.38339233398438, "logps/rejected": -205.15158081054688, "loss": 0.5867, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1436684131622314, "rewards/margins": 0.3371172547340393, "rewards/rejected": -1.480785608291626, "step": 4100 }, { "epoch": 0.71, "eval_logits/chosen": -3.015127420425415, "eval_logits/rejected": -3.008767604827881, "eval_logps/chosen": -169.1083984375, "eval_logps/rejected": -189.3998260498047, "eval_loss": 0.6484485864639282, "eval_rewards/accuracies": 0.6124535202980042, "eval_rewards/chosen": -0.9769837260246277, "eval_rewards/margins": 0.1659441590309143, "eval_rewards/rejected": -1.1429280042648315, "eval_runtime": 486.6806, "eval_samples_per_second": 8.844, "eval_steps_per_second": 1.105, "step": 4100 }, { "epoch": 0.71, "grad_norm": 3.296875, "learning_rate": 1.1884357625921695e-06, "logits/chosen": -2.9936797618865967, "logits/rejected": -2.9857420921325684, "logps/chosen": -192.5975341796875, "logps/rejected": -207.32955932617188, "loss": 0.6497, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2625000476837158, "rewards/margins": 0.2128407210111618, "rewards/rejected": -1.475340723991394, "step": 4110 }, { "epoch": 0.71, "grad_norm": 3.140625, "learning_rate": 1.175655373559168e-06, "logits/chosen": -3.006986618041992, "logits/rejected": -2.996345043182373, "logps/chosen": -184.16152954101562, "logps/rejected": -209.470947265625, "loss": 0.6272, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2307326793670654, "rewards/margins": 0.23652946949005127, "rewards/rejected": -1.4672620296478271, "step": 4120 }, { "epoch": 0.71, "grad_norm": 3.0625, "learning_rate": 1.162922916544224e-06, "logits/chosen": -3.007054090499878, "logits/rejected": -2.995990037918091, "logps/chosen": -181.14195251464844, "logps/rejected": -208.498291015625, "loss": 0.5865, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1549180746078491, "rewards/margins": 0.32630079984664917, "rewards/rejected": -1.481218934059143, "step": 4130 }, { "epoch": 0.71, "grad_norm": 3.71875, "learning_rate": 1.15023885237328e-06, "logits/chosen": -2.9959869384765625, "logits/rejected": -2.9884610176086426, "logps/chosen": -195.40237426757812, "logps/rejected": -207.7620086669922, "loss": 0.6527, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2849228382110596, "rewards/margins": 0.20186881721019745, "rewards/rejected": -1.4867916107177734, "step": 4140 }, { "epoch": 0.72, "grad_norm": 2.609375, "learning_rate": 1.1376036401207939e-06, "logits/chosen": -3.0100998878479004, "logits/rejected": -3.0050740242004395, "logps/chosen": -189.6447296142578, "logps/rejected": -204.48178100585938, "loss": 0.6342, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.214764952659607, "rewards/margins": 0.23950621485710144, "rewards/rejected": -1.4542710781097412, "step": 4150 }, { "epoch": 0.72, "grad_norm": 2.4375, "learning_rate": 1.1250177370931265e-06, "logits/chosen": -3.004896640777588, "logits/rejected": -2.9942173957824707, "logps/chosen": -178.70838928222656, "logps/rejected": -208.43161010742188, "loss": 0.5788, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1545559167861938, "rewards/margins": 0.3637978136539459, "rewards/rejected": -1.518353819847107, "step": 4160 }, { "epoch": 0.72, "grad_norm": 3.296875, "learning_rate": 1.112481598811992e-06, "logits/chosen": -3.0165929794311523, "logits/rejected": -3.010983943939209, "logps/chosen": -174.68006896972656, "logps/rejected": -200.9302215576172, "loss": 0.6214, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1269192695617676, "rewards/margins": 0.2599869966506958, "rewards/rejected": -1.3869061470031738, "step": 4170 }, { "epoch": 0.72, "grad_norm": 2.84375, "learning_rate": 1.0999956789979626e-06, "logits/chosen": -3.0153422355651855, "logits/rejected": -3.0049679279327393, "logps/chosen": -180.34591674804688, "logps/rejected": -207.4092254638672, "loss": 0.6085, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1334244012832642, "rewards/margins": 0.2850673496723175, "rewards/rejected": -1.4184919595718384, "step": 4180 }, { "epoch": 0.72, "grad_norm": 3.53125, "learning_rate": 1.0875604295540607e-06, "logits/chosen": -3.009065628051758, "logits/rejected": -3.0044589042663574, "logps/chosen": -181.29495239257812, "logps/rejected": -211.8234100341797, "loss": 0.6012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1944143772125244, "rewards/margins": 0.3176630735397339, "rewards/rejected": -1.5120774507522583, "step": 4190 }, { "epoch": 0.72, "grad_norm": 2.578125, "learning_rate": 1.075176300549387e-06, "logits/chosen": -3.0220468044281006, "logits/rejected": -3.01804780960083, "logps/chosen": -181.4476776123047, "logps/rejected": -192.96243286132812, "loss": 0.6554, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1441035270690918, "rewards/margins": 0.17912748456001282, "rewards/rejected": -1.3232312202453613, "step": 4200 }, { "epoch": 0.72, "eval_logits/chosen": -3.0269603729248047, "eval_logits/rejected": -3.0208635330200195, "eval_logps/chosen": -164.2754669189453, "eval_logps/rejected": -184.01255798339844, "eval_loss": 0.6489173769950867, "eval_rewards/accuracies": 0.6175650358200073, "eval_rewards/chosen": -0.928654670715332, "eval_rewards/margins": 0.16040048003196716, "eval_rewards/rejected": -1.089055061340332, "eval_runtime": 484.1713, "eval_samples_per_second": 8.889, "eval_steps_per_second": 1.111, "step": 4200 }, { "epoch": 0.73, "grad_norm": 3.453125, "learning_rate": 1.0628437402028475e-06, "logits/chosen": -3.0186755657196045, "logits/rejected": -3.0084311962127686, "logps/chosen": -184.8686981201172, "logps/rejected": -200.7421112060547, "loss": 0.641, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.191839337348938, "rewards/margins": 0.21558180451393127, "rewards/rejected": -1.4074211120605469, "step": 4210 }, { "epoch": 0.73, "grad_norm": 2.390625, "learning_rate": 1.0505631948669184e-06, "logits/chosen": -3.0035483837127686, "logits/rejected": -2.9989542961120605, "logps/chosen": -178.74893188476562, "logps/rejected": -201.6924591064453, "loss": 0.6239, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1593618392944336, "rewards/margins": 0.24961857497692108, "rewards/rejected": -1.408980369567871, "step": 4220 }, { "epoch": 0.73, "grad_norm": 3.4375, "learning_rate": 1.038335109011498e-06, "logits/chosen": -3.0282905101776123, "logits/rejected": -3.024932861328125, "logps/chosen": -183.4084930419922, "logps/rejected": -201.07754516601562, "loss": 0.6335, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1595426797866821, "rewards/margins": 0.234393909573555, "rewards/rejected": -1.3939363956451416, "step": 4230 }, { "epoch": 0.73, "grad_norm": 2.390625, "learning_rate": 1.026159925207817e-06, "logits/chosen": -3.0245633125305176, "logits/rejected": -3.0180656909942627, "logps/chosen": -175.83587646484375, "logps/rejected": -205.62216186523438, "loss": 0.596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1032450199127197, "rewards/margins": 0.32235080003738403, "rewards/rejected": -1.4255958795547485, "step": 4240 }, { "epoch": 0.73, "grad_norm": 2.890625, "learning_rate": 1.014038084112423e-06, "logits/chosen": -3.018775224685669, "logits/rejected": -3.015329360961914, "logps/chosen": -172.9668426513672, "logps/rejected": -193.0954132080078, "loss": 0.6107, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0623455047607422, "rewards/margins": 0.2572886645793915, "rewards/rejected": -1.319634199142456, "step": 4250 }, { "epoch": 0.73, "grad_norm": 2.953125, "learning_rate": 1.001970024451229e-06, "logits/chosen": -3.0206713676452637, "logits/rejected": -3.011845588684082, "logps/chosen": -172.6252899169922, "logps/rejected": -201.23965454101562, "loss": 0.6096, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0669132471084595, "rewards/margins": 0.3177719712257385, "rewards/rejected": -1.3846852779388428, "step": 4260 }, { "epoch": 0.74, "grad_norm": 3.296875, "learning_rate": 9.899561830036372e-07, "logits/chosen": -3.020733118057251, "logits/rejected": -3.012129306793213, "logps/chosen": -167.78994750976562, "logps/rejected": -192.33631896972656, "loss": 0.6085, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.049797773361206, "rewards/margins": 0.2823103070259094, "rewards/rejected": -1.3321080207824707, "step": 4270 }, { "epoch": 0.74, "grad_norm": 2.625, "learning_rate": 9.779969945867288e-07, "logits/chosen": -3.0113422870635986, "logits/rejected": -3.0036492347717285, "logps/chosen": -171.27249145507812, "logps/rejected": -196.91317749023438, "loss": 0.6161, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0888071060180664, "rewards/margins": 0.2934654653072357, "rewards/rejected": -1.3822726011276245, "step": 4280 }, { "epoch": 0.74, "grad_norm": 3.265625, "learning_rate": 9.660928920395274e-07, "logits/chosen": -3.0022969245910645, "logits/rejected": -2.994992971420288, "logps/chosen": -184.11524963378906, "logps/rejected": -204.52259826660156, "loss": 0.6256, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1857540607452393, "rewards/margins": 0.27106592059135437, "rewards/rejected": -1.4568201303482056, "step": 4290 }, { "epoch": 0.74, "grad_norm": 2.671875, "learning_rate": 9.542443062073337e-07, "logits/chosen": -3.0421223640441895, "logits/rejected": -3.0338714122772217, "logps/chosen": -171.01522827148438, "logps/rejected": -196.96121215820312, "loss": 0.6053, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0753799676895142, "rewards/margins": 0.2902859151363373, "rewards/rejected": -1.3656659126281738, "step": 4300 }, { "epoch": 0.74, "eval_logits/chosen": -3.0361945629119873, "eval_logits/rejected": -3.0303401947021484, "eval_logps/chosen": -159.97738647460938, "eval_logps/rejected": -179.44456481933594, "eval_loss": 0.6488844752311707, "eval_rewards/accuracies": 0.6096654534339905, "eval_rewards/chosen": -0.8856736421585083, "eval_rewards/margins": 0.15770147740840912, "eval_rewards/rejected": -1.0433752536773682, "eval_runtime": 484.3352, "eval_samples_per_second": 8.886, "eval_steps_per_second": 1.111, "step": 4300 }, { "epoch": 0.74, "grad_norm": 2.40625, "learning_rate": 9.424516659261304e-07, "logits/chosen": -3.014176368713379, "logits/rejected": -3.002622127532959, "logps/chosen": -178.19253540039062, "logps/rejected": -199.57150268554688, "loss": 0.6146, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1301661729812622, "rewards/margins": 0.26318198442459106, "rewards/rejected": -1.393347978591919, "step": 4310 }, { "epoch": 0.74, "grad_norm": 2.65625, "learning_rate": 9.307153980070624e-07, "logits/chosen": -3.02925443649292, "logits/rejected": -3.017390727996826, "logps/chosen": -180.28123474121094, "logps/rejected": -212.3802947998047, "loss": 0.5645, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1029917001724243, "rewards/margins": 0.3897990584373474, "rewards/rejected": -1.492790699005127, "step": 4320 }, { "epoch": 0.75, "grad_norm": 2.671875, "learning_rate": 9.190359272209912e-07, "logits/chosen": -3.015522003173828, "logits/rejected": -3.007598400115967, "logps/chosen": -178.08860778808594, "logps/rejected": -195.5576171875, "loss": 0.6269, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1294702291488647, "rewards/margins": 0.24931836128234863, "rewards/rejected": -1.3787885904312134, "step": 4330 }, { "epoch": 0.75, "grad_norm": 3.046875, "learning_rate": 9.074136762831168e-07, "logits/chosen": -3.0052382946014404, "logits/rejected": -3.0017242431640625, "logps/chosen": -173.1341552734375, "logps/rejected": -204.6593780517578, "loss": 0.5954, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1217544078826904, "rewards/margins": 0.33835938572883606, "rewards/rejected": -1.4601138830184937, "step": 4340 }, { "epoch": 0.75, "grad_norm": 3.03125, "learning_rate": 8.958490658376815e-07, "logits/chosen": -3.0085527896881104, "logits/rejected": -3.002901554107666, "logps/chosen": -170.144287109375, "logps/rejected": -197.52249145507812, "loss": 0.6131, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.076792597770691, "rewards/margins": 0.27325382828712463, "rewards/rejected": -1.350046157836914, "step": 4350 }, { "epoch": 0.75, "grad_norm": 2.6875, "learning_rate": 8.843425144427442e-07, "logits/chosen": -3.0132718086242676, "logits/rejected": -3.00311017036438, "logps/chosen": -186.8450469970703, "logps/rejected": -204.13070678710938, "loss": 0.6485, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.223015546798706, "rewards/margins": 0.22089993953704834, "rewards/rejected": -1.4439154863357544, "step": 4360 }, { "epoch": 0.75, "grad_norm": 3.453125, "learning_rate": 8.728944385550328e-07, "logits/chosen": -3.0162041187286377, "logits/rejected": -3.0061447620391846, "logps/chosen": -179.4248046875, "logps/rejected": -203.92308044433594, "loss": 0.6156, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1377297639846802, "rewards/margins": 0.26933321356773376, "rewards/rejected": -1.4070630073547363, "step": 4370 }, { "epoch": 0.75, "grad_norm": 2.546875, "learning_rate": 8.615052525148701e-07, "logits/chosen": -3.0335440635681152, "logits/rejected": -3.0290024280548096, "logps/chosen": -178.56539916992188, "logps/rejected": -197.3820343017578, "loss": 0.6345, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1294496059417725, "rewards/margins": 0.23765477538108826, "rewards/rejected": -1.3671042919158936, "step": 4380 }, { "epoch": 0.76, "grad_norm": 3.328125, "learning_rate": 8.501753685311784e-07, "logits/chosen": -3.0319645404815674, "logits/rejected": -3.0271618366241455, "logps/chosen": -176.71009826660156, "logps/rejected": -203.8400115966797, "loss": 0.6112, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1303045749664307, "rewards/margins": 0.28727442026138306, "rewards/rejected": -1.4175790548324585, "step": 4390 }, { "epoch": 0.76, "grad_norm": 3.28125, "learning_rate": 8.389051966665596e-07, "logits/chosen": -3.027657985687256, "logits/rejected": -3.0215001106262207, "logps/chosen": -182.5607452392578, "logps/rejected": -203.29652404785156, "loss": 0.6153, "rewards/accuracies": 0.625, "rewards/chosen": -1.1382032632827759, "rewards/margins": 0.2578727602958679, "rewards/rejected": -1.396075963973999, "step": 4400 }, { "epoch": 0.76, "eval_logits/chosen": -3.0351431369781494, "eval_logits/rejected": -3.0291762351989746, "eval_logps/chosen": -160.54696655273438, "eval_logps/rejected": -180.12347412109375, "eval_loss": 0.64886873960495, "eval_rewards/accuracies": 0.6119888424873352, "eval_rewards/chosen": -0.8913692831993103, "eval_rewards/margins": 0.1587950885295868, "eval_rewards/rejected": -1.0501643419265747, "eval_runtime": 484.4593, "eval_samples_per_second": 8.884, "eval_steps_per_second": 1.111, "step": 4400 }, { "epoch": 0.76, "grad_norm": 2.734375, "learning_rate": 8.276951448224546e-07, "logits/chosen": -3.0109057426452637, "logits/rejected": -3.001814126968384, "logps/chosen": -185.36212158203125, "logps/rejected": -205.12521362304688, "loss": 0.6408, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1937808990478516, "rewards/margins": 0.22498579323291779, "rewards/rejected": -1.4187666177749634, "step": 4410 }, { "epoch": 0.76, "grad_norm": 3.421875, "learning_rate": 8.165456187243797e-07, "logits/chosen": -3.02956485748291, "logits/rejected": -3.024728536605835, "logps/chosen": -181.58412170410156, "logps/rejected": -200.54795837402344, "loss": 0.6143, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1534773111343384, "rewards/margins": 0.2757090926170349, "rewards/rejected": -1.4291863441467285, "step": 4420 }, { "epoch": 0.76, "grad_norm": 3.65625, "learning_rate": 8.054570219072419e-07, "logits/chosen": -3.0097193717956543, "logits/rejected": -3.0025272369384766, "logps/chosen": -175.24148559570312, "logps/rejected": -196.47402954101562, "loss": 0.6314, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1144134998321533, "rewards/margins": 0.2582167387008667, "rewards/rejected": -1.372630000114441, "step": 4430 }, { "epoch": 0.77, "grad_norm": 2.9375, "learning_rate": 7.944297557007366e-07, "logits/chosen": -3.033053398132324, "logits/rejected": -3.0255160331726074, "logps/chosen": -187.86953735351562, "logps/rejected": -211.07113647460938, "loss": 0.6031, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1804062128067017, "rewards/margins": 0.3040129244327545, "rewards/rejected": -1.4844191074371338, "step": 4440 }, { "epoch": 0.77, "grad_norm": 2.34375, "learning_rate": 7.834642192148151e-07, "logits/chosen": -3.0207505226135254, "logits/rejected": -3.013399600982666, "logps/chosen": -172.2884063720703, "logps/rejected": -196.8644256591797, "loss": 0.602, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0546090602874756, "rewards/margins": 0.29372638463974, "rewards/rejected": -1.3483353853225708, "step": 4450 }, { "epoch": 0.77, "grad_norm": 2.5625, "learning_rate": 7.725608093252496e-07, "logits/chosen": -3.0331952571868896, "logits/rejected": -3.024142026901245, "logps/chosen": -167.3407440185547, "logps/rejected": -199.12606811523438, "loss": 0.5877, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0478832721710205, "rewards/margins": 0.3440885841846466, "rewards/rejected": -1.3919718265533447, "step": 4460 }, { "epoch": 0.77, "grad_norm": 2.71875, "learning_rate": 7.617199206592584e-07, "logits/chosen": -3.0438404083251953, "logits/rejected": -3.0357441902160645, "logps/chosen": -175.3656768798828, "logps/rejected": -192.6163787841797, "loss": 0.6263, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0873674154281616, "rewards/margins": 0.25625208020210266, "rewards/rejected": -1.3436195850372314, "step": 4470 }, { "epoch": 0.77, "grad_norm": 4.0625, "learning_rate": 7.509419455812336e-07, "logits/chosen": -3.050670862197876, "logits/rejected": -3.042977809906006, "logps/chosen": -173.09136962890625, "logps/rejected": -199.6421661376953, "loss": 0.615, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.115952730178833, "rewards/margins": 0.2790243625640869, "rewards/rejected": -1.3949769735336304, "step": 4480 }, { "epoch": 0.77, "grad_norm": 3.0, "learning_rate": 7.402272741785322e-07, "logits/chosen": -3.0218803882598877, "logits/rejected": -3.0118536949157715, "logps/chosen": -170.727294921875, "logps/rejected": -197.43954467773438, "loss": 0.5931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0870743989944458, "rewards/margins": 0.30827030539512634, "rewards/rejected": -1.3953447341918945, "step": 4490 }, { "epoch": 0.78, "grad_norm": 2.65625, "learning_rate": 7.295762942473614e-07, "logits/chosen": -3.020730972290039, "logits/rejected": -3.009143352508545, "logps/chosen": -177.8622589111328, "logps/rejected": -201.65280151367188, "loss": 0.6145, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.102785587310791, "rewards/margins": 0.30618494749069214, "rewards/rejected": -1.408970594406128, "step": 4500 }, { "epoch": 0.78, "eval_logits/chosen": -3.037830114364624, "eval_logits/rejected": -3.0319135189056396, "eval_logps/chosen": -160.1719512939453, "eval_logps/rejected": -179.67282104492188, "eval_loss": 0.6489555239677429, "eval_rewards/accuracies": 0.6112918257713318, "eval_rewards/chosen": -0.8876191973686218, "eval_rewards/margins": 0.1580386459827423, "eval_rewards/rejected": -1.0456578731536865, "eval_runtime": 484.2849, "eval_samples_per_second": 8.887, "eval_steps_per_second": 1.111, "step": 4500 }, { "epoch": 0.78, "grad_norm": 2.734375, "learning_rate": 7.189893912787424e-07, "logits/chosen": -3.0372560024261475, "logits/rejected": -3.023256778717041, "logps/chosen": -174.68624877929688, "logps/rejected": -209.28921508789062, "loss": 0.5836, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0918270349502563, "rewards/margins": 0.357189804315567, "rewards/rejected": -1.449016809463501, "step": 4510 }, { "epoch": 0.78, "grad_norm": 2.671875, "learning_rate": 7.084669484445581e-07, "logits/chosen": -3.0338385105133057, "logits/rejected": -3.0261270999908447, "logps/chosen": -177.9006805419922, "logps/rejected": -195.68893432617188, "loss": 0.6342, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1469968557357788, "rewards/margins": 0.22922606766223907, "rewards/rejected": -1.3762229681015015, "step": 4520 }, { "epoch": 0.78, "grad_norm": 5.59375, "learning_rate": 6.980093465836852e-07, "logits/chosen": -3.0291950702667236, "logits/rejected": -3.0212788581848145, "logps/chosen": -176.28085327148438, "logps/rejected": -194.02845764160156, "loss": 0.6486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.120300531387329, "rewards/margins": 0.21059612929821014, "rewards/rejected": -1.3308966159820557, "step": 4530 }, { "epoch": 0.78, "grad_norm": 4.125, "learning_rate": 6.876169641882105e-07, "logits/chosen": -3.0162062644958496, "logits/rejected": -3.005119800567627, "logps/chosen": -172.72186279296875, "logps/rejected": -192.6821746826172, "loss": 0.6345, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0865572690963745, "rewards/margins": 0.2528809905052185, "rewards/rejected": -1.3394381999969482, "step": 4540 }, { "epoch": 0.78, "grad_norm": 2.859375, "learning_rate": 6.772901773897319e-07, "logits/chosen": -3.0391769409179688, "logits/rejected": -3.0284054279327393, "logps/chosen": -175.7732391357422, "logps/rejected": -200.0652618408203, "loss": 0.6028, "rewards/accuracies": 0.71875, "rewards/chosen": -1.075699806213379, "rewards/margins": 0.3025535047054291, "rewards/rejected": -1.37825345993042, "step": 4550 }, { "epoch": 0.79, "grad_norm": 2.5, "learning_rate": 6.670293599457459e-07, "logits/chosen": -3.0181899070739746, "logits/rejected": -3.0071330070495605, "logps/chosen": -173.14572143554688, "logps/rejected": -197.73306274414062, "loss": 0.6003, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.064906358718872, "rewards/margins": 0.30180588364601135, "rewards/rejected": -1.366712212562561, "step": 4560 }, { "epoch": 0.79, "grad_norm": 3.890625, "learning_rate": 6.568348832261174e-07, "logits/chosen": -3.027233839035034, "logits/rejected": -3.020333766937256, "logps/chosen": -179.43515014648438, "logps/rejected": -210.864013671875, "loss": 0.6053, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1779333353042603, "rewards/margins": 0.3318426311016083, "rewards/rejected": -1.5097758769989014, "step": 4570 }, { "epoch": 0.79, "grad_norm": 2.34375, "learning_rate": 6.467071161996447e-07, "logits/chosen": -3.0118486881256104, "logits/rejected": -3.0026967525482178, "logps/chosen": -167.81613159179688, "logps/rejected": -190.34109497070312, "loss": 0.6067, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0197126865386963, "rewards/margins": 0.27357515692710876, "rewards/rejected": -1.293287992477417, "step": 4580 }, { "epoch": 0.79, "grad_norm": 3.453125, "learning_rate": 6.366464254206966e-07, "logits/chosen": -3.031846523284912, "logits/rejected": -3.024355411529541, "logps/chosen": -179.59120178222656, "logps/rejected": -197.9866180419922, "loss": 0.6412, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1390419006347656, "rewards/margins": 0.23965242505073547, "rewards/rejected": -1.3786942958831787, "step": 4590 }, { "epoch": 0.79, "grad_norm": 3.046875, "learning_rate": 6.266531750159557e-07, "logits/chosen": -3.0279648303985596, "logits/rejected": -3.0112688541412354, "logps/chosen": -174.84622192382812, "logps/rejected": -210.6271514892578, "loss": 0.5798, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1091092824935913, "rewards/margins": 0.3708891272544861, "rewards/rejected": -1.4799985885620117, "step": 4600 }, { "epoch": 0.79, "eval_logits/chosen": -3.0308055877685547, "eval_logits/rejected": -3.0247178077697754, "eval_logps/chosen": -162.68133544921875, "eval_logps/rejected": -182.4701385498047, "eval_loss": 0.6488083004951477, "eval_rewards/accuracies": 0.6147769689559937, "eval_rewards/chosen": -0.9127131104469299, "eval_rewards/margins": 0.16091784834861755, "eval_rewards/rejected": -1.0736308097839355, "eval_runtime": 483.7515, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 4600 }, { "epoch": 0.79, "grad_norm": 3.15625, "learning_rate": 6.167277266712293e-07, "logits/chosen": -3.006178379058838, "logits/rejected": -3.000431776046753, "logps/chosen": -184.13197326660156, "logps/rejected": -201.56419372558594, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": -1.224673867225647, "rewards/margins": 0.20185346901416779, "rewards/rejected": -1.4265271425247192, "step": 4610 }, { "epoch": 0.8, "grad_norm": 3.375, "learning_rate": 6.068704396183694e-07, "logits/chosen": -3.0257067680358887, "logits/rejected": -3.0185368061065674, "logps/chosen": -173.94790649414062, "logps/rejected": -199.21034240722656, "loss": 0.5994, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.074373722076416, "rewards/margins": 0.2779634892940521, "rewards/rejected": -1.3523372411727905, "step": 4620 }, { "epoch": 0.8, "grad_norm": 2.859375, "learning_rate": 5.970816706222604e-07, "logits/chosen": -3.021345853805542, "logits/rejected": -3.0148777961730957, "logps/chosen": -186.1139678955078, "logps/rejected": -213.1942138671875, "loss": 0.5979, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1828062534332275, "rewards/margins": 0.3105699419975281, "rewards/rejected": -1.4933760166168213, "step": 4630 }, { "epoch": 0.8, "grad_norm": 2.734375, "learning_rate": 5.873617739679172e-07, "logits/chosen": -3.010087251663208, "logits/rejected": -3.0054357051849365, "logps/chosen": -188.48785400390625, "logps/rejected": -210.8941650390625, "loss": 0.6287, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2335959672927856, "rewards/margins": 0.24029798805713654, "rewards/rejected": -1.4738938808441162, "step": 4640 }, { "epoch": 0.8, "grad_norm": 2.734375, "learning_rate": 5.77711101447652e-07, "logits/chosen": -3.0194153785705566, "logits/rejected": -3.0138816833496094, "logps/chosen": -182.27523803710938, "logps/rejected": -202.95693969726562, "loss": 0.6223, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1761658191680908, "rewards/margins": 0.2618311941623688, "rewards/rejected": -1.4379971027374268, "step": 4650 }, { "epoch": 0.8, "grad_norm": 2.890625, "learning_rate": 5.681300023483521e-07, "logits/chosen": -3.02057147026062, "logits/rejected": -3.013674020767212, "logps/chosen": -180.13131713867188, "logps/rejected": -201.14138793945312, "loss": 0.6135, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1474506855010986, "rewards/margins": 0.2440837174654007, "rewards/rejected": -1.391534447669983, "step": 4660 }, { "epoch": 0.8, "grad_norm": 3.140625, "learning_rate": 5.586188234388306e-07, "logits/chosen": -3.022001028060913, "logits/rejected": -3.0123629570007324, "logps/chosen": -172.04263305664062, "logps/rejected": -200.15243530273438, "loss": 0.5797, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0928536653518677, "rewards/margins": 0.3391774594783783, "rewards/rejected": -1.4320310354232788, "step": 4670 }, { "epoch": 0.81, "grad_norm": 2.96875, "learning_rate": 5.491779089572793e-07, "logits/chosen": -3.0283255577087402, "logits/rejected": -3.0246620178222656, "logps/chosen": -179.98556518554688, "logps/rejected": -201.14598083496094, "loss": 0.6366, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1617043018341064, "rewards/margins": 0.24819931387901306, "rewards/rejected": -1.409903645515442, "step": 4680 }, { "epoch": 0.81, "grad_norm": 2.59375, "learning_rate": 5.398076005988082e-07, "logits/chosen": -3.0354971885681152, "logits/rejected": -3.0243539810180664, "logps/chosen": -184.61776733398438, "logps/rejected": -217.5264892578125, "loss": 0.592, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1839282512664795, "rewards/margins": 0.3616673946380615, "rewards/rejected": -1.5455955266952515, "step": 4690 }, { "epoch": 0.81, "grad_norm": 3.671875, "learning_rate": 5.305082375030798e-07, "logits/chosen": -3.0186095237731934, "logits/rejected": -3.010040044784546, "logps/chosen": -179.74301147460938, "logps/rejected": -208.00949096679688, "loss": 0.6218, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.171088457107544, "rewards/margins": 0.2796909809112549, "rewards/rejected": -1.4507795572280884, "step": 4700 }, { "epoch": 0.81, "eval_logits/chosen": -3.0306708812713623, "eval_logits/rejected": -3.02455735206604, "eval_logps/chosen": -163.04933166503906, "eval_logps/rejected": -182.94818115234375, "eval_loss": 0.6485710740089417, "eval_rewards/accuracies": 0.6152416467666626, "eval_rewards/chosen": -0.916392982006073, "eval_rewards/margins": 0.1620185673236847, "eval_rewards/rejected": -1.07841157913208, "eval_runtime": 484.3564, "eval_samples_per_second": 8.886, "eval_steps_per_second": 1.111, "step": 4700 }, { "epoch": 0.81, "grad_norm": 3.1875, "learning_rate": 5.212801562420342e-07, "logits/chosen": -3.0316972732543945, "logits/rejected": -3.0217230319976807, "logps/chosen": -181.59439086914062, "logps/rejected": -206.296875, "loss": 0.5887, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.130401372909546, "rewards/margins": 0.3333713412284851, "rewards/rejected": -1.4637725353240967, "step": 4710 }, { "epoch": 0.81, "grad_norm": 2.9375, "learning_rate": 5.121236908077063e-07, "logits/chosen": -3.0099377632141113, "logits/rejected": -3.002049207687378, "logps/chosen": -181.88710021972656, "logps/rejected": -211.2221221923828, "loss": 0.5857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1480029821395874, "rewards/margins": 0.3564620912075043, "rewards/rejected": -1.5044652223587036, "step": 4720 }, { "epoch": 0.81, "grad_norm": 3.25, "learning_rate": 5.030391726001394e-07, "logits/chosen": -3.00630521774292, "logits/rejected": -2.998715877532959, "logps/chosen": -180.12782287597656, "logps/rejected": -203.79656982421875, "loss": 0.6242, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1553746461868286, "rewards/margins": 0.2904283106327057, "rewards/rejected": -1.445802927017212, "step": 4730 }, { "epoch": 0.82, "grad_norm": 3.125, "learning_rate": 4.940269304153919e-07, "logits/chosen": -3.0037386417388916, "logits/rejected": -2.993199586868286, "logps/chosen": -173.32028198242188, "logps/rejected": -207.9495391845703, "loss": 0.5761, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.065929651260376, "rewards/margins": 0.38310036063194275, "rewards/rejected": -1.4490301609039307, "step": 4740 }, { "epoch": 0.82, "grad_norm": 2.859375, "learning_rate": 4.850872904336307e-07, "logits/chosen": -3.0079355239868164, "logits/rejected": -3.0077691078186035, "logps/chosen": -180.9278564453125, "logps/rejected": -201.39138793945312, "loss": 0.6212, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1507413387298584, "rewards/margins": 0.23052909970283508, "rewards/rejected": -1.3812705278396606, "step": 4750 }, { "epoch": 0.82, "grad_norm": 2.890625, "learning_rate": 4.762205762073363e-07, "logits/chosen": -3.006566286087036, "logits/rejected": -2.997593402862549, "logps/chosen": -180.80429077148438, "logps/rejected": -209.4434051513672, "loss": 0.599, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1853374242782593, "rewards/margins": 0.31632545590400696, "rewards/rejected": -1.5016629695892334, "step": 4760 }, { "epoch": 0.82, "grad_norm": 2.578125, "learning_rate": 4.6742710864958103e-07, "logits/chosen": -3.026768684387207, "logits/rejected": -3.0133731365203857, "logps/chosen": -187.4033660888672, "logps/rejected": -210.99179077148438, "loss": 0.6107, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1747559309005737, "rewards/margins": 0.31086036562919617, "rewards/rejected": -1.4856163263320923, "step": 4770 }, { "epoch": 0.82, "grad_norm": 2.984375, "learning_rate": 4.5870720602242513e-07, "logits/chosen": -3.0101070404052734, "logits/rejected": -2.9985129833221436, "logps/chosen": -177.30874633789062, "logps/rejected": -208.9322509765625, "loss": 0.589, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1518343687057495, "rewards/margins": 0.321411669254303, "rewards/rejected": -1.4732460975646973, "step": 4780 }, { "epoch": 0.83, "grad_norm": 2.6875, "learning_rate": 4.500611839253871e-07, "logits/chosen": -3.0094149112701416, "logits/rejected": -3.0022799968719482, "logps/chosen": -188.61123657226562, "logps/rejected": -201.9243621826172, "loss": 0.6452, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2073025703430176, "rewards/margins": 0.20692972838878632, "rewards/rejected": -1.4142323732376099, "step": 4790 }, { "epoch": 0.83, "grad_norm": 2.5625, "learning_rate": 4.4148935528403244e-07, "logits/chosen": -2.9978463649749756, "logits/rejected": -2.9894039630889893, "logps/chosen": -180.93643188476562, "logps/rejected": -207.8934326171875, "loss": 0.6102, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1843605041503906, "rewards/margins": 0.2998817563056946, "rewards/rejected": -1.4842422008514404, "step": 4800 }, { "epoch": 0.83, "eval_logits/chosen": -3.025871753692627, "eval_logits/rejected": -3.0196568965911865, "eval_logps/chosen": -164.8939208984375, "eval_logps/rejected": -184.97686767578125, "eval_loss": 0.6484309434890747, "eval_rewards/accuracies": 0.6150093078613281, "eval_rewards/chosen": -0.934839129447937, "eval_rewards/margins": 0.16385912895202637, "eval_rewards/rejected": -1.0986981391906738, "eval_runtime": 484.0715, "eval_samples_per_second": 8.891, "eval_steps_per_second": 1.111, "step": 4800 }, { "epoch": 0.83, "grad_norm": 3.09375, "learning_rate": 4.3299203033863643e-07, "logits/chosen": -3.003702163696289, "logits/rejected": -2.996516704559326, "logps/chosen": -183.0110321044922, "logps/rejected": -206.063232421875, "loss": 0.5994, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.151745319366455, "rewards/margins": 0.31077447533607483, "rewards/rejected": -1.462519645690918, "step": 4810 }, { "epoch": 0.83, "grad_norm": 3.671875, "learning_rate": 4.245695166329661e-07, "logits/chosen": -3.0186033248901367, "logits/rejected": -3.013110399246216, "logps/chosen": -175.73336791992188, "logps/rejected": -201.29922485351562, "loss": 0.6165, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1273748874664307, "rewards/margins": 0.2871752381324768, "rewards/rejected": -1.4145500659942627, "step": 4820 }, { "epoch": 0.83, "grad_norm": 2.859375, "learning_rate": 4.1622211900314235e-07, "logits/chosen": -3.0206007957458496, "logits/rejected": -3.0085816383361816, "logps/chosen": -178.76016235351562, "logps/rejected": -201.3628692626953, "loss": 0.6258, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1388037204742432, "rewards/margins": 0.26333537697792053, "rewards/rejected": -1.4021390676498413, "step": 4830 }, { "epoch": 0.83, "grad_norm": 2.40625, "learning_rate": 4.0795013956660884e-07, "logits/chosen": -3.0116143226623535, "logits/rejected": -2.9952967166900635, "logps/chosen": -187.82388305664062, "logps/rejected": -217.6949005126953, "loss": 0.5905, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2078577280044556, "rewards/margins": 0.3612230718135834, "rewards/rejected": -1.5690808296203613, "step": 4840 }, { "epoch": 0.84, "grad_norm": 2.59375, "learning_rate": 3.9975387771119925e-07, "logits/chosen": -3.0145621299743652, "logits/rejected": -3.007615327835083, "logps/chosen": -176.22604370117188, "logps/rejected": -202.64767456054688, "loss": 0.6038, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1093838214874268, "rewards/margins": 0.2972642481327057, "rewards/rejected": -1.4066479206085205, "step": 4850 }, { "epoch": 0.84, "grad_norm": 3.59375, "learning_rate": 3.916336300842988e-07, "logits/chosen": -3.0211424827575684, "logits/rejected": -3.013709545135498, "logps/chosen": -187.14520263671875, "logps/rejected": -200.6242218017578, "loss": 0.6361, "rewards/accuracies": 0.625, "rewards/chosen": -1.1789805889129639, "rewards/margins": 0.23755235970020294, "rewards/rejected": -1.41653311252594, "step": 4860 }, { "epoch": 0.84, "grad_norm": 3.125, "learning_rate": 3.8358969058210957e-07, "logits/chosen": -3.0137581825256348, "logits/rejected": -3.005734920501709, "logps/chosen": -189.39662170410156, "logps/rejected": -210.9857940673828, "loss": 0.61, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2036226987838745, "rewards/margins": 0.3072102665901184, "rewards/rejected": -1.5108331441879272, "step": 4870 }, { "epoch": 0.84, "grad_norm": 2.71875, "learning_rate": 3.7562235033901273e-07, "logits/chosen": -3.0142416954040527, "logits/rejected": -3.006392240524292, "logps/chosen": -176.6653289794922, "logps/rejected": -199.0810089111328, "loss": 0.6117, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1300135850906372, "rewards/margins": 0.28717657923698425, "rewards/rejected": -1.4171901941299438, "step": 4880 }, { "epoch": 0.84, "grad_norm": 3.328125, "learning_rate": 3.677318977170324e-07, "logits/chosen": -3.0293221473693848, "logits/rejected": -3.0215888023376465, "logps/chosen": -181.97988891601562, "logps/rejected": -212.58010864257812, "loss": 0.5876, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1641104221343994, "rewards/margins": 0.34244081377983093, "rewards/rejected": -1.5065511465072632, "step": 4890 }, { "epoch": 0.84, "grad_norm": 2.765625, "learning_rate": 3.599186182953973e-07, "logits/chosen": -3.0197079181671143, "logits/rejected": -3.009781837463379, "logps/chosen": -179.6743621826172, "logps/rejected": -203.55433654785156, "loss": 0.6176, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1642951965332031, "rewards/margins": 0.29456597566604614, "rewards/rejected": -1.4588611125946045, "step": 4900 }, { "epoch": 0.84, "eval_logits/chosen": -3.0273115634918213, "eval_logits/rejected": -3.021085739135742, "eval_logps/chosen": -165.75540161132812, "eval_logps/rejected": -185.9427947998047, "eval_loss": 0.6482663154602051, "eval_rewards/accuracies": 0.6157063245773315, "eval_rewards/chosen": -0.9434537291526794, "eval_rewards/margins": 0.16490375995635986, "eval_rewards/rejected": -1.1083574295043945, "eval_runtime": 484.1991, "eval_samples_per_second": 8.889, "eval_steps_per_second": 1.111, "step": 4900 }, { "epoch": 0.85, "grad_norm": 3.1875, "learning_rate": 3.5218279486020605e-07, "logits/chosen": -3.0414328575134277, "logits/rejected": -3.034557342529297, "logps/chosen": -179.6194305419922, "logps/rejected": -204.5117950439453, "loss": 0.6038, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1279804706573486, "rewards/margins": 0.3157906234264374, "rewards/rejected": -1.4437711238861084, "step": 4910 }, { "epoch": 0.85, "grad_norm": 2.453125, "learning_rate": 3.445247073941932e-07, "logits/chosen": -2.9959917068481445, "logits/rejected": -2.981940746307373, "logps/chosen": -184.15304565429688, "logps/rejected": -220.98568725585938, "loss": 0.5778, "rewards/accuracies": 0.65625, "rewards/chosen": -1.197116494178772, "rewards/margins": 0.39637279510498047, "rewards/rejected": -1.593489408493042, "step": 4920 }, { "epoch": 0.85, "grad_norm": 2.65625, "learning_rate": 3.369446330665918e-07, "logits/chosen": -3.0306859016418457, "logits/rejected": -3.021571397781372, "logps/chosen": -184.3625946044922, "logps/rejected": -214.49563598632812, "loss": 0.6277, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2076117992401123, "rewards/margins": 0.2767212390899658, "rewards/rejected": -1.4843331575393677, "step": 4930 }, { "epoch": 0.85, "grad_norm": 3.3125, "learning_rate": 3.2944284622310834e-07, "logits/chosen": -3.0334010124206543, "logits/rejected": -3.02193546295166, "logps/chosen": -185.15042114257812, "logps/rejected": -213.9960479736328, "loss": 0.5907, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1906287670135498, "rewards/margins": 0.35692232847213745, "rewards/rejected": -1.5475513935089111, "step": 4940 }, { "epoch": 0.85, "grad_norm": 3.078125, "learning_rate": 3.220196183759855e-07, "logits/chosen": -3.0085220336914062, "logits/rejected": -2.9998950958251953, "logps/chosen": -179.29806518554688, "logps/rejected": -208.2992706298828, "loss": 0.6052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1302188634872437, "rewards/margins": 0.317536324262619, "rewards/rejected": -1.447755217552185, "step": 4950 }, { "epoch": 0.85, "grad_norm": 3.125, "learning_rate": 3.146752181941834e-07, "logits/chosen": -3.0223827362060547, "logits/rejected": -3.0077877044677734, "logps/chosen": -177.91940307617188, "logps/rejected": -215.8114013671875, "loss": 0.5797, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1578664779663086, "rewards/margins": 0.36045825481414795, "rewards/rejected": -1.518324613571167, "step": 4960 }, { "epoch": 0.86, "grad_norm": 3.25, "learning_rate": 3.074099114936491e-07, "logits/chosen": -3.0086965560913086, "logits/rejected": -2.997326374053955, "logps/chosen": -178.9662628173828, "logps/rejected": -213.8463897705078, "loss": 0.5768, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.168075680732727, "rewards/margins": 0.37543433904647827, "rewards/rejected": -1.54351007938385, "step": 4970 }, { "epoch": 0.86, "grad_norm": 2.6875, "learning_rate": 3.002239612276991e-07, "logits/chosen": -3.013916492462158, "logits/rejected": -3.0011227130889893, "logps/chosen": -181.9564208984375, "logps/rejected": -211.95346069335938, "loss": 0.5983, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.177166223526001, "rewards/margins": 0.3119356036186218, "rewards/rejected": -1.489101767539978, "step": 4980 }, { "epoch": 0.86, "grad_norm": 2.78125, "learning_rate": 2.931176274775024e-07, "logits/chosen": -3.015437126159668, "logits/rejected": -3.0049312114715576, "logps/chosen": -181.30780029296875, "logps/rejected": -219.16586303710938, "loss": 0.5788, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.192025899887085, "rewards/margins": 0.36687594652175903, "rewards/rejected": -1.5589020252227783, "step": 4990 }, { "epoch": 0.86, "grad_norm": 3.265625, "learning_rate": 2.8609116744266586e-07, "logits/chosen": -3.024904727935791, "logits/rejected": -3.011610507965088, "logps/chosen": -180.6025390625, "logps/rejected": -209.94070434570312, "loss": 0.5907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1356312036514282, "rewards/margins": 0.35062500834465027, "rewards/rejected": -1.4862562417984009, "step": 5000 }, { "epoch": 0.86, "eval_logits/chosen": -3.0259499549865723, "eval_logits/rejected": -3.019639253616333, "eval_logps/chosen": -167.130126953125, "eval_logps/rejected": -187.4627227783203, "eval_loss": 0.6482229828834534, "eval_rewards/accuracies": 0.616403341293335, "eval_rewards/chosen": -0.9572010636329651, "eval_rewards/margins": 0.1663556545972824, "eval_rewards/rejected": -1.1235567331314087, "eval_runtime": 484.132, "eval_samples_per_second": 8.89, "eval_steps_per_second": 1.111, "step": 5000 }, { "epoch": 0.86, "grad_norm": 2.78125, "learning_rate": 2.791448354319265e-07, "logits/chosen": -3.0054192543029785, "logits/rejected": -2.994859218597412, "logps/chosen": -187.4678497314453, "logps/rejected": -218.24655151367188, "loss": 0.5867, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2418291568756104, "rewards/margins": 0.3632369637489319, "rewards/rejected": -1.6050662994384766, "step": 5010 }, { "epoch": 0.86, "grad_norm": 2.90625, "learning_rate": 2.722788828539469e-07, "logits/chosen": -2.994016170501709, "logits/rejected": -2.982360363006592, "logps/chosen": -179.8846893310547, "logps/rejected": -211.01028442382812, "loss": 0.5976, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.158063530921936, "rewards/margins": 0.34929779171943665, "rewards/rejected": -1.5073611736297607, "step": 5020 }, { "epoch": 0.87, "grad_norm": 3.96875, "learning_rate": 2.65493558208216e-07, "logits/chosen": -3.016785144805908, "logits/rejected": -3.006324052810669, "logps/chosen": -185.47933959960938, "logps/rejected": -212.0589141845703, "loss": 0.6153, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2257567644119263, "rewards/margins": 0.2794381380081177, "rewards/rejected": -1.5051950216293335, "step": 5030 }, { "epoch": 0.87, "grad_norm": 2.875, "learning_rate": 2.5878910707605535e-07, "logits/chosen": -3.027489185333252, "logits/rejected": -3.0190629959106445, "logps/chosen": -192.54586791992188, "logps/rejected": -210.1737823486328, "loss": 0.619, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2427799701690674, "rewards/margins": 0.2671624720096588, "rewards/rejected": -1.5099425315856934, "step": 5040 }, { "epoch": 0.87, "grad_norm": 2.203125, "learning_rate": 2.5216577211173045e-07, "logits/chosen": -3.0162785053253174, "logits/rejected": -3.010364532470703, "logps/chosen": -186.3660125732422, "logps/rejected": -210.4250030517578, "loss": 0.6196, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1881041526794434, "rewards/margins": 0.2793116867542267, "rewards/rejected": -1.4674158096313477, "step": 5050 }, { "epoch": 0.87, "grad_norm": 3.171875, "learning_rate": 2.4562379303366855e-07, "logits/chosen": -2.9955532550811768, "logits/rejected": -2.9888062477111816, "logps/chosen": -182.958251953125, "logps/rejected": -207.3423614501953, "loss": 0.6383, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2245664596557617, "rewards/margins": 0.2448711395263672, "rewards/rejected": -1.469437599182129, "step": 5060 }, { "epoch": 0.87, "grad_norm": 3.296875, "learning_rate": 2.39163406615783e-07, "logits/chosen": -2.996875762939453, "logits/rejected": -2.98545241355896, "logps/chosen": -189.33526611328125, "logps/rejected": -207.96871948242188, "loss": 0.6364, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2511800527572632, "rewards/margins": 0.2708195149898529, "rewards/rejected": -1.5219995975494385, "step": 5070 }, { "epoch": 0.88, "grad_norm": 2.703125, "learning_rate": 2.327848466789029e-07, "logits/chosen": -3.036180257797241, "logits/rejected": -3.02483868598938, "logps/chosen": -182.4312744140625, "logps/rejected": -206.91598510742188, "loss": 0.5997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1447256803512573, "rewards/margins": 0.3263944387435913, "rewards/rejected": -1.4711202383041382, "step": 5080 }, { "epoch": 0.88, "grad_norm": 2.96875, "learning_rate": 2.2648834408231012e-07, "logits/chosen": -3.031940221786499, "logits/rejected": -3.02018404006958, "logps/chosen": -178.73605346679688, "logps/rejected": -205.97500610351562, "loss": 0.5943, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1420974731445312, "rewards/margins": 0.3264358937740326, "rewards/rejected": -1.4685331583023071, "step": 5090 }, { "epoch": 0.88, "grad_norm": 4.0, "learning_rate": 2.2027412671538517e-07, "logits/chosen": -3.007051944732666, "logits/rejected": -3.0003175735473633, "logps/chosen": -186.65670776367188, "logps/rejected": -200.91976928710938, "loss": 0.6534, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2080416679382324, "rewards/margins": 0.1921565681695938, "rewards/rejected": -1.400198221206665, "step": 5100 }, { "epoch": 0.88, "eval_logits/chosen": -3.0211427211761475, "eval_logits/rejected": -3.014779806137085, "eval_logps/chosen": -167.22410583496094, "eval_logps/rejected": -187.57122802734375, "eval_loss": 0.6480957865715027, "eval_rewards/accuracies": 0.6154739856719971, "eval_rewards/chosen": -0.9581407904624939, "eval_rewards/margins": 0.1665009707212448, "eval_rewards/rejected": -1.124642014503479, "eval_runtime": 484.5249, "eval_samples_per_second": 8.883, "eval_steps_per_second": 1.11, "step": 5100 }, { "epoch": 0.88, "grad_norm": 3.046875, "learning_rate": 2.1414241948935822e-07, "logits/chosen": -3.0215511322021484, "logits/rejected": -3.0139107704162598, "logps/chosen": -198.515625, "logps/rejected": -212.1075897216797, "loss": 0.6693, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2991710901260376, "rewards/margins": 0.2032029926776886, "rewards/rejected": -1.5023739337921143, "step": 5110 }, { "epoch": 0.88, "grad_norm": 3.5, "learning_rate": 2.0809344432916905e-07, "logits/chosen": -3.020785093307495, "logits/rejected": -3.013937473297119, "logps/chosen": -184.99778747558594, "logps/rejected": -204.39956665039062, "loss": 0.6251, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2011207342147827, "rewards/margins": 0.25702404975891113, "rewards/rejected": -1.4581449031829834, "step": 5120 }, { "epoch": 0.88, "grad_norm": 2.640625, "learning_rate": 2.0212742016543468e-07, "logits/chosen": -3.0052871704101562, "logits/rejected": -3.000296115875244, "logps/chosen": -186.18795776367188, "logps/rejected": -210.3855743408203, "loss": 0.5988, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2200263738632202, "rewards/margins": 0.30521923303604126, "rewards/rejected": -1.5252454280853271, "step": 5130 }, { "epoch": 0.89, "grad_norm": 2.984375, "learning_rate": 1.9624456292652667e-07, "logits/chosen": -3.0031652450561523, "logits/rejected": -3.0017037391662598, "logps/chosen": -193.90243530273438, "logps/rejected": -201.4404754638672, "loss": 0.6729, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2456893920898438, "rewards/margins": 0.16400747001171112, "rewards/rejected": -1.4096968173980713, "step": 5140 }, { "epoch": 0.89, "grad_norm": 3.25, "learning_rate": 1.9044508553075436e-07, "logits/chosen": -3.016469955444336, "logits/rejected": -3.0100483894348145, "logps/chosen": -185.59703063964844, "logps/rejected": -211.82931518554688, "loss": 0.6145, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.218392252922058, "rewards/margins": 0.2722601890563965, "rewards/rejected": -1.4906524419784546, "step": 5150 }, { "epoch": 0.89, "grad_norm": 3.953125, "learning_rate": 1.8472919787865971e-07, "logits/chosen": -3.016695976257324, "logits/rejected": -3.016082286834717, "logps/chosen": -183.193359375, "logps/rejected": -199.03392028808594, "loss": 0.6535, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2164713144302368, "rewards/margins": 0.2021493911743164, "rewards/rejected": -1.4186207056045532, "step": 5160 }, { "epoch": 0.89, "grad_norm": 2.625, "learning_rate": 1.7909710684542225e-07, "logits/chosen": -3.0045788288116455, "logits/rejected": -2.9934210777282715, "logps/chosen": -185.44113159179688, "logps/rejected": -215.13961791992188, "loss": 0.5852, "rewards/accuracies": 0.75, "rewards/chosen": -1.1757686138153076, "rewards/margins": 0.3737691342830658, "rewards/rejected": -1.5495378971099854, "step": 5170 }, { "epoch": 0.89, "grad_norm": 3.625, "learning_rate": 1.735490162733658e-07, "logits/chosen": -3.0251495838165283, "logits/rejected": -3.017005443572998, "logps/chosen": -189.4347686767578, "logps/rejected": -210.934326171875, "loss": 0.629, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.235618233680725, "rewards/margins": 0.23946666717529297, "rewards/rejected": -1.475084900856018, "step": 5180 }, { "epoch": 0.89, "grad_norm": 2.859375, "learning_rate": 1.6808512696458862e-07, "logits/chosen": -3.014350175857544, "logits/rejected": -3.0113131999969482, "logps/chosen": -184.97525024414062, "logps/rejected": -206.00265502929688, "loss": 0.6419, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2191311120986938, "rewards/margins": 0.2178182601928711, "rewards/rejected": -1.4369492530822754, "step": 5190 }, { "epoch": 0.9, "grad_norm": 2.609375, "learning_rate": 1.6270563667368872e-07, "logits/chosen": -3.0289926528930664, "logits/rejected": -3.0216994285583496, "logps/chosen": -181.39625549316406, "logps/rejected": -210.9692840576172, "loss": 0.5973, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1923518180847168, "rewards/margins": 0.30694520473480225, "rewards/rejected": -1.4992971420288086, "step": 5200 }, { "epoch": 0.9, "eval_logits/chosen": -3.0193612575531006, "eval_logits/rejected": -3.0130417346954346, "eval_logps/chosen": -166.88232421875, "eval_logps/rejected": -187.1678924560547, "eval_loss": 0.6482971906661987, "eval_rewards/accuracies": 0.6168680191040039, "eval_rewards/chosen": -0.9547229409217834, "eval_rewards/margins": 0.16588544845581055, "eval_rewards/rejected": -1.1206083297729492, "eval_runtime": 483.9931, "eval_samples_per_second": 8.893, "eval_steps_per_second": 1.112, "step": 5200 }, { "epoch": 0.9, "grad_norm": 2.5, "learning_rate": 1.5741074010061252e-07, "logits/chosen": -3.009269952774048, "logits/rejected": -3.0040059089660645, "logps/chosen": -183.85488891601562, "logps/rejected": -201.80844116210938, "loss": 0.6429, "rewards/accuracies": 0.625, "rewards/chosen": -1.2172787189483643, "rewards/margins": 0.21514368057250977, "rewards/rejected": -1.432422399520874, "step": 5210 }, { "epoch": 0.9, "grad_norm": 2.453125, "learning_rate": 1.5220062888360172e-07, "logits/chosen": -3.026101589202881, "logits/rejected": -3.0166168212890625, "logps/chosen": -176.77809143066406, "logps/rejected": -197.70420837402344, "loss": 0.6466, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1455947160720825, "rewards/margins": 0.19701746106147766, "rewards/rejected": -1.3426120281219482, "step": 5220 }, { "epoch": 0.9, "grad_norm": 2.703125, "learning_rate": 1.4707549159226425e-07, "logits/chosen": -3.0132172107696533, "logits/rejected": -3.0083303451538086, "logps/chosen": -184.35971069335938, "logps/rejected": -216.65420532226562, "loss": 0.5877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1685558557510376, "rewards/margins": 0.3556649088859558, "rewards/rejected": -1.5242207050323486, "step": 5230 }, { "epoch": 0.9, "grad_norm": 2.875, "learning_rate": 1.4203551372074382e-07, "logits/chosen": -2.995349884033203, "logits/rejected": -2.9813692569732666, "logps/chosen": -186.01040649414062, "logps/rejected": -228.42507934570312, "loss": 0.5638, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2257850170135498, "rewards/margins": 0.4474519193172455, "rewards/rejected": -1.6732368469238281, "step": 5240 }, { "epoch": 0.9, "grad_norm": 3.390625, "learning_rate": 1.3708087768100897e-07, "logits/chosen": -2.9991250038146973, "logits/rejected": -2.9920661449432373, "logps/chosen": -179.68421936035156, "logps/rejected": -210.48654174804688, "loss": 0.5901, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1758867502212524, "rewards/margins": 0.33366623520851135, "rewards/rejected": -1.5095527172088623, "step": 5250 }, { "epoch": 0.91, "grad_norm": 3.4375, "learning_rate": 1.3221176279625047e-07, "logits/chosen": -3.015169143676758, "logits/rejected": -3.007452964782715, "logps/chosen": -179.83773803710938, "logps/rejected": -202.9173126220703, "loss": 0.6024, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1293582916259766, "rewards/margins": 0.3117751479148865, "rewards/rejected": -1.4411332607269287, "step": 5260 }, { "epoch": 0.91, "grad_norm": 2.640625, "learning_rate": 1.2742834529439112e-07, "logits/chosen": -3.0302436351776123, "logits/rejected": -3.025125741958618, "logps/chosen": -181.14987182617188, "logps/rejected": -208.68142700195312, "loss": 0.6103, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1812187433242798, "rewards/margins": 0.3093191087245941, "rewards/rejected": -1.4905378818511963, "step": 5270 }, { "epoch": 0.91, "grad_norm": 3.203125, "learning_rate": 1.2273079830170787e-07, "logits/chosen": -3.0149941444396973, "logits/rejected": -3.0027318000793457, "logps/chosen": -191.20547485351562, "logps/rejected": -210.00521850585938, "loss": 0.636, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2669647932052612, "rewards/margins": 0.2477412223815918, "rewards/rejected": -1.514706015586853, "step": 5280 }, { "epoch": 0.91, "grad_norm": 3.34375, "learning_rate": 1.181192918365645e-07, "logits/chosen": -3.0035336017608643, "logits/rejected": -2.9970505237579346, "logps/chosen": -185.7930908203125, "logps/rejected": -204.4087677001953, "loss": 0.6191, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1858208179473877, "rewards/margins": 0.2634347379207611, "rewards/rejected": -1.4492554664611816, "step": 5290 }, { "epoch": 0.91, "grad_norm": 3.15625, "learning_rate": 1.1359399280326034e-07, "logits/chosen": -3.012399911880493, "logits/rejected": -3.002013683319092, "logps/chosen": -186.54757690429688, "logps/rejected": -213.6743927001953, "loss": 0.5975, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1939321756362915, "rewards/margins": 0.3121901750564575, "rewards/rejected": -1.506122350692749, "step": 5300 }, { "epoch": 0.91, "eval_logits/chosen": -3.024763822555542, "eval_logits/rejected": -3.0184710025787354, "eval_logps/chosen": -166.61180114746094, "eval_logps/rejected": -186.8758544921875, "eval_loss": 0.6482287645339966, "eval_rewards/accuracies": 0.6161710023880005, "eval_rewards/chosen": -0.9520178437232971, "eval_rewards/margins": 0.16567029058933258, "eval_rewards/rejected": -1.1176881790161133, "eval_runtime": 484.3937, "eval_samples_per_second": 8.885, "eval_steps_per_second": 1.111, "step": 5300 }, { "epoch": 0.91, "grad_norm": 2.5625, "learning_rate": 1.0915506498598711e-07, "logits/chosen": -3.013671398162842, "logits/rejected": -3.0129668712615967, "logps/chosen": -194.44668579101562, "logps/rejected": -210.31838989257812, "loss": 0.6345, "rewards/accuracies": 0.625, "rewards/chosen": -1.260506272315979, "rewards/margins": 0.2346612960100174, "rewards/rejected": -1.4951674938201904, "step": 5310 }, { "epoch": 0.92, "grad_norm": 3.171875, "learning_rate": 1.0480266904290298e-07, "logits/chosen": -3.0138745307922363, "logits/rejected": -3.006173610687256, "logps/chosen": -189.1534423828125, "logps/rejected": -209.4146728515625, "loss": 0.6066, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2044683694839478, "rewards/margins": 0.31247133016586304, "rewards/rejected": -1.516939640045166, "step": 5320 }, { "epoch": 0.92, "grad_norm": 3.078125, "learning_rate": 1.0053696250031803e-07, "logits/chosen": -3.0049405097961426, "logits/rejected": -2.997436761856079, "logps/chosen": -179.34432983398438, "logps/rejected": -219.2215576171875, "loss": 0.5791, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1866565942764282, "rewards/margins": 0.40371227264404297, "rewards/rejected": -1.5903689861297607, "step": 5330 }, { "epoch": 0.92, "grad_norm": 3.140625, "learning_rate": 9.635809974698929e-08, "logits/chosen": -3.023958921432495, "logits/rejected": -3.018794536590576, "logps/chosen": -181.73666381835938, "logps/rejected": -202.86709594726562, "loss": 0.5928, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1352908611297607, "rewards/margins": 0.2984369397163391, "rewards/rejected": -1.4337279796600342, "step": 5340 }, { "epoch": 0.92, "grad_norm": 2.234375, "learning_rate": 9.22662320285389e-08, "logits/chosen": -3.009406089782715, "logits/rejected": -3.0080466270446777, "logps/chosen": -182.8820037841797, "logps/rejected": -203.56088256835938, "loss": 0.6348, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1950762271881104, "rewards/margins": 0.252188503742218, "rewards/rejected": -1.4472649097442627, "step": 5350 }, { "epoch": 0.92, "grad_norm": 2.890625, "learning_rate": 8.826150744197403e-08, "logits/chosen": -3.0183300971984863, "logits/rejected": -3.008608341217041, "logps/chosen": -185.93148803710938, "logps/rejected": -219.19528198242188, "loss": 0.6033, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2144702672958374, "rewards/margins": 0.33122771978378296, "rewards/rejected": -1.5456980466842651, "step": 5360 }, { "epoch": 0.93, "grad_norm": 3.296875, "learning_rate": 8.434407093033225e-08, "logits/chosen": -3.0214178562164307, "logits/rejected": -3.0202929973602295, "logps/chosen": -179.71841430664062, "logps/rejected": -200.45217895507812, "loss": 0.6364, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1635373830795288, "rewards/margins": 0.2255493402481079, "rewards/rejected": -1.3890868425369263, "step": 5370 }, { "epoch": 0.93, "grad_norm": 3.234375, "learning_rate": 8.051406427743047e-08, "logits/chosen": -3.0340845584869385, "logits/rejected": -3.02712082862854, "logps/chosen": -187.09100341796875, "logps/rejected": -205.0014190673828, "loss": 0.6099, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1768646240234375, "rewards/margins": 0.2828957140445709, "rewards/rejected": -1.459760308265686, "step": 5380 }, { "epoch": 0.93, "grad_norm": 2.625, "learning_rate": 7.677162610273819e-08, "logits/chosen": -3.0002315044403076, "logits/rejected": -2.9908108711242676, "logps/chosen": -190.63095092773438, "logps/rejected": -211.54226684570312, "loss": 0.6124, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.222454309463501, "rewards/margins": 0.29625049233436584, "rewards/rejected": -1.5187046527862549, "step": 5390 }, { "epoch": 0.93, "grad_norm": 4.53125, "learning_rate": 7.311689185635573e-08, "logits/chosen": -3.0046777725219727, "logits/rejected": -2.9948372840881348, "logps/chosen": -178.56069946289062, "logps/rejected": -214.04733276367188, "loss": 0.5986, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.168642282485962, "rewards/margins": 0.35686975717544556, "rewards/rejected": -1.5255119800567627, "step": 5400 }, { "epoch": 0.93, "eval_logits/chosen": -3.024919033050537, "eval_logits/rejected": -3.0186076164245605, "eval_logps/chosen": -166.65017700195312, "eval_logps/rejected": -186.89283752441406, "eval_loss": 0.6482614278793335, "eval_rewards/accuracies": 0.6189591288566589, "eval_rewards/chosen": -0.9524016380310059, "eval_rewards/margins": 0.16545623540878296, "eval_rewards/rejected": -1.117857813835144, "eval_runtime": 484.7866, "eval_samples_per_second": 8.878, "eval_steps_per_second": 1.11, "step": 5400 }, { "epoch": 0.93, "grad_norm": 4.1875, "learning_rate": 6.954999381411642e-08, "logits/chosen": -3.0219473838806152, "logits/rejected": -3.0172617435455322, "logps/chosen": -195.20822143554688, "logps/rejected": -205.44821166992188, "loss": 0.683, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2699486017227173, "rewards/margins": 0.14408110082149506, "rewards/rejected": -1.4140297174453735, "step": 5410 }, { "epoch": 0.93, "grad_norm": 3.15625, "learning_rate": 6.607106107279604e-08, "logits/chosen": -3.018432140350342, "logits/rejected": -3.015479326248169, "logps/chosen": -189.63400268554688, "logps/rejected": -209.18264770507812, "loss": 0.6452, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2422764301300049, "rewards/margins": 0.22592100501060486, "rewards/rejected": -1.4681974649429321, "step": 5420 }, { "epoch": 0.94, "grad_norm": 2.8125, "learning_rate": 6.268021954544095e-08, "logits/chosen": -3.0200390815734863, "logits/rejected": -3.0088791847229004, "logps/chosen": -185.31239318847656, "logps/rejected": -213.2115020751953, "loss": 0.6008, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2222305536270142, "rewards/margins": 0.3227187991142273, "rewards/rejected": -1.5449492931365967, "step": 5430 }, { "epoch": 0.94, "grad_norm": 2.984375, "learning_rate": 5.9377591956812364e-08, "logits/chosen": -3.0067832469940186, "logits/rejected": -2.9997334480285645, "logps/chosen": -186.7572479248047, "logps/rejected": -213.2431182861328, "loss": 0.6082, "rewards/accuracies": 0.6875, "rewards/chosen": -1.202883005142212, "rewards/margins": 0.30217069387435913, "rewards/rejected": -1.5050535202026367, "step": 5440 }, { "epoch": 0.94, "grad_norm": 5.1875, "learning_rate": 5.6163297838942866e-08, "logits/chosen": -3.0089011192321777, "logits/rejected": -3.0013954639434814, "logps/chosen": -186.43460083007812, "logps/rejected": -208.15872192382812, "loss": 0.6253, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1967378854751587, "rewards/margins": 0.26602086424827576, "rewards/rejected": -1.4627587795257568, "step": 5450 }, { "epoch": 0.94, "grad_norm": 2.953125, "learning_rate": 5.30374535268105e-08, "logits/chosen": -3.0090737342834473, "logits/rejected": -2.99898624420166, "logps/chosen": -185.4371337890625, "logps/rejected": -203.43051147460938, "loss": 0.6304, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2107499837875366, "rewards/margins": 0.21922187507152557, "rewards/rejected": -1.4299719333648682, "step": 5460 }, { "epoch": 0.94, "grad_norm": 2.96875, "learning_rate": 5.0000172154129887e-08, "logits/chosen": -3.0085060596466064, "logits/rejected": -3.007678508758545, "logps/chosen": -184.86740112304688, "logps/rejected": -201.18594360351562, "loss": 0.6685, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2326360940933228, "rewards/margins": 0.18991947174072266, "rewards/rejected": -1.4225553274154663, "step": 5470 }, { "epoch": 0.94, "grad_norm": 2.625, "learning_rate": 4.705156364925467e-08, "logits/chosen": -2.9995064735412598, "logits/rejected": -2.986332654953003, "logps/chosen": -181.80081176757812, "logps/rejected": -213.63790893554688, "loss": 0.579, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1795815229415894, "rewards/margins": 0.3651406764984131, "rewards/rejected": -1.5447221994400024, "step": 5480 }, { "epoch": 0.95, "grad_norm": 2.75, "learning_rate": 4.419173473120236e-08, "logits/chosen": -2.9968180656433105, "logits/rejected": -2.98726749420166, "logps/chosen": -181.09336853027344, "logps/rejected": -198.72418212890625, "loss": 0.6289, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1412298679351807, "rewards/margins": 0.24370375275611877, "rewards/rejected": -1.3849337100982666, "step": 5490 }, { "epoch": 0.95, "grad_norm": 2.796875, "learning_rate": 4.142078890578827e-08, "logits/chosen": -3.0323545932769775, "logits/rejected": -3.0259850025177, "logps/chosen": -179.64572143554688, "logps/rejected": -209.1461944580078, "loss": 0.6025, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1536697149276733, "rewards/margins": 0.31138697266578674, "rewards/rejected": -1.4650566577911377, "step": 5500 }, { "epoch": 0.95, "eval_logits/chosen": -3.0252060890197754, "eval_logits/rejected": -3.0189244747161865, "eval_logps/chosen": -166.7466583251953, "eval_logps/rejected": -186.9980010986328, "eval_loss": 0.6482976078987122, "eval_rewards/accuracies": 0.6168680191040039, "eval_rewards/chosen": -0.9533662796020508, "eval_rewards/margins": 0.16554316878318787, "eval_rewards/rejected": -1.118909478187561, "eval_runtime": 485.7241, "eval_samples_per_second": 8.861, "eval_steps_per_second": 1.108, "step": 5500 }, { "epoch": 0.95, "grad_norm": 2.640625, "learning_rate": 3.873882646188265e-08, "logits/chosen": -3.009657382965088, "logits/rejected": -3.0036680698394775, "logps/chosen": -195.98202514648438, "logps/rejected": -214.7615203857422, "loss": 0.6468, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.330396294593811, "rewards/margins": 0.2086518257856369, "rewards/rejected": -1.539048194885254, "step": 5510 }, { "epoch": 0.95, "grad_norm": 2.59375, "learning_rate": 3.6145944467777525e-08, "logits/chosen": -3.008087635040283, "logits/rejected": -2.9986672401428223, "logps/chosen": -182.70603942871094, "logps/rejected": -208.48681640625, "loss": 0.5892, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1439793109893799, "rewards/margins": 0.34625551104545593, "rewards/rejected": -1.4902350902557373, "step": 5520 }, { "epoch": 0.95, "grad_norm": 3.46875, "learning_rate": 3.364223676767725e-08, "logits/chosen": -3.005150556564331, "logits/rejected": -2.9966344833374023, "logps/chosen": -191.7229766845703, "logps/rejected": -206.38412475585938, "loss": 0.6328, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2086269855499268, "rewards/margins": 0.2549501061439514, "rewards/rejected": -1.463577151298523, "step": 5530 }, { "epoch": 0.95, "grad_norm": 2.96875, "learning_rate": 3.122779397829845e-08, "logits/chosen": -3.016235828399658, "logits/rejected": -3.0094943046569824, "logps/chosen": -179.7984619140625, "logps/rejected": -212.85830688476562, "loss": 0.6018, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1654951572418213, "rewards/margins": 0.3237941563129425, "rewards/rejected": -1.4892891645431519, "step": 5540 }, { "epoch": 0.96, "grad_norm": 2.953125, "learning_rate": 2.8902703485593208e-08, "logits/chosen": -2.991302967071533, "logits/rejected": -2.9869298934936523, "logps/chosen": -183.2467803955078, "logps/rejected": -202.32962036132812, "loss": 0.6599, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2108595371246338, "rewards/margins": 0.19090045988559723, "rewards/rejected": -1.4017599821090698, "step": 5550 }, { "epoch": 0.96, "grad_norm": 3.671875, "learning_rate": 2.666704944158438e-08, "logits/chosen": -3.012624740600586, "logits/rejected": -3.0047385692596436, "logps/chosen": -179.16905212402344, "logps/rejected": -196.2181854248047, "loss": 0.6331, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1613173484802246, "rewards/margins": 0.2275480031967163, "rewards/rejected": -1.388865351676941, "step": 5560 }, { "epoch": 0.96, "grad_norm": 3.015625, "learning_rate": 2.4520912761320515e-08, "logits/chosen": -3.006896495819092, "logits/rejected": -3.0060629844665527, "logps/chosen": -184.01107788085938, "logps/rejected": -204.49400329589844, "loss": 0.6539, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2240345478057861, "rewards/margins": 0.17588286101818085, "rewards/rejected": -1.3999173641204834, "step": 5570 }, { "epoch": 0.96, "grad_norm": 2.765625, "learning_rate": 2.2464371119947926e-08, "logits/chosen": -3.013871908187866, "logits/rejected": -3.0044972896575928, "logps/chosen": -185.55514526367188, "logps/rejected": -216.16012573242188, "loss": 0.5986, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2228542566299438, "rewards/margins": 0.31742024421691895, "rewards/rejected": -1.5402743816375732, "step": 5580 }, { "epoch": 0.96, "grad_norm": 3.65625, "learning_rate": 2.049749894989822e-08, "logits/chosen": -3.023019313812256, "logits/rejected": -3.017407178878784, "logps/chosen": -189.84158325195312, "logps/rejected": -214.7401123046875, "loss": 0.6201, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2183094024658203, "rewards/margins": 0.2928226590156555, "rewards/rejected": -1.5111321210861206, "step": 5590 }, { "epoch": 0.96, "grad_norm": 3.015625, "learning_rate": 1.8620367438194898e-08, "logits/chosen": -3.0226356983184814, "logits/rejected": -3.015810489654541, "logps/chosen": -182.97305297851562, "logps/rejected": -214.4624786376953, "loss": 0.6149, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1971943378448486, "rewards/margins": 0.2937503159046173, "rewards/rejected": -1.490944743156433, "step": 5600 }, { "epoch": 0.96, "eval_logits/chosen": -3.0243780612945557, "eval_logits/rejected": -3.0180587768554688, "eval_logps/chosen": -166.78590393066406, "eval_logps/rejected": -187.1136932373047, "eval_loss": 0.6480231881141663, "eval_rewards/accuracies": 0.6154739856719971, "eval_rewards/chosen": -0.953758955001831, "eval_rewards/margins": 0.1663074791431427, "eval_rewards/rejected": -1.120066523551941, "eval_runtime": 484.1208, "eval_samples_per_second": 8.89, "eval_steps_per_second": 1.111, "step": 5600 }, { "epoch": 0.97, "grad_norm": 2.84375, "learning_rate": 1.683304452387763e-08, "logits/chosen": -3.020547389984131, "logits/rejected": -3.0156941413879395, "logps/chosen": -179.31251525878906, "logps/rejected": -217.8151397705078, "loss": 0.585, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1683876514434814, "rewards/margins": 0.375874400138855, "rewards/rejected": -1.5442620515823364, "step": 5610 }, { "epoch": 0.97, "grad_norm": 4.03125, "learning_rate": 1.5135594895542005e-08, "logits/chosen": -2.9989235401153564, "logits/rejected": -2.9931704998016357, "logps/chosen": -190.47634887695312, "logps/rejected": -206.96914672851562, "loss": 0.6365, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2506822347640991, "rewards/margins": 0.22417505085468292, "rewards/rejected": -1.474857211112976, "step": 5620 }, { "epoch": 0.97, "grad_norm": 2.875, "learning_rate": 1.352807998899891e-08, "logits/chosen": -3.015920877456665, "logits/rejected": -3.008056879043579, "logps/chosen": -187.56515502929688, "logps/rejected": -210.36080932617188, "loss": 0.6209, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2147810459136963, "rewards/margins": 0.2961927354335785, "rewards/rejected": -1.5109736919403076, "step": 5630 }, { "epoch": 0.97, "grad_norm": 2.6875, "learning_rate": 1.2010557985051297e-08, "logits/chosen": -3.017130136489868, "logits/rejected": -3.0098698139190674, "logps/chosen": -178.4241485595703, "logps/rejected": -207.69338989257812, "loss": 0.6181, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.12577223777771, "rewards/margins": 0.3019818663597107, "rewards/rejected": -1.4277540445327759, "step": 5640 }, { "epoch": 0.97, "grad_norm": 2.609375, "learning_rate": 1.0583083807387818e-08, "logits/chosen": -3.018669843673706, "logits/rejected": -3.0065765380859375, "logps/chosen": -177.80455017089844, "logps/rejected": -208.6228790283203, "loss": 0.6129, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1684799194335938, "rewards/margins": 0.32036706805229187, "rewards/rejected": -1.488847017288208, "step": 5650 }, { "epoch": 0.98, "grad_norm": 3.203125, "learning_rate": 9.245709120595526e-09, "logits/chosen": -3.0136618614196777, "logits/rejected": -3.000986099243164, "logps/chosen": -181.29054260253906, "logps/rejected": -212.631591796875, "loss": 0.5992, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1949920654296875, "rewards/margins": 0.3406812846660614, "rewards/rejected": -1.5356733798980713, "step": 5660 }, { "epoch": 0.98, "grad_norm": 2.53125, "learning_rate": 7.998482328289702e-09, "logits/chosen": -3.011711597442627, "logits/rejected": -3.0005130767822266, "logps/chosen": -176.48794555664062, "logps/rejected": -198.55197143554688, "loss": 0.6149, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1117572784423828, "rewards/margins": 0.25985077023506165, "rewards/rejected": -1.371608018875122, "step": 5670 }, { "epoch": 0.98, "grad_norm": 4.4375, "learning_rate": 6.841448571361376e-09, "logits/chosen": -3.0058178901672363, "logits/rejected": -3.003483533859253, "logps/chosen": -186.4090118408203, "logps/rejected": -207.31539916992188, "loss": 0.6124, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2142975330352783, "rewards/margins": 0.2691487967967987, "rewards/rejected": -1.4834461212158203, "step": 5680 }, { "epoch": 0.98, "grad_norm": 2.890625, "learning_rate": 5.774649726345283e-09, "logits/chosen": -3.0200724601745605, "logits/rejected": -3.005723476409912, "logps/chosen": -187.30740356445312, "logps/rejected": -212.05178833007812, "loss": 0.5797, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1744122505187988, "rewards/margins": 0.34947630763053894, "rewards/rejected": -1.5238884687423706, "step": 5690 }, { "epoch": 0.98, "grad_norm": 5.1875, "learning_rate": 4.798124403902205e-09, "logits/chosen": -3.0057690143585205, "logits/rejected": -2.9969992637634277, "logps/chosen": -183.09750366210938, "logps/rejected": -202.24923706054688, "loss": 0.6275, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1517393589019775, "rewards/margins": 0.25165319442749023, "rewards/rejected": -1.4033925533294678, "step": 5700 }, { "epoch": 0.98, "eval_logits/chosen": -3.024533748626709, "eval_logits/rejected": -3.018249750137329, "eval_logps/chosen": -166.6790771484375, "eval_logps/rejected": -186.94839477539062, "eval_loss": 0.6482394337654114, "eval_rewards/accuracies": 0.6177973747253418, "eval_rewards/chosen": -0.9526904821395874, "eval_rewards/margins": 0.16572298109531403, "eval_rewards/rejected": -1.1184134483337402, "eval_runtime": 483.6009, "eval_samples_per_second": 8.9, "eval_steps_per_second": 1.112, "step": 5700 }, { "epoch": 0.98, "grad_norm": 2.921875, "learning_rate": 3.911907947422577e-09, "logits/chosen": -3.0172641277313232, "logits/rejected": -3.01008939743042, "logps/chosen": -184.43312072753906, "logps/rejected": -212.61965942382812, "loss": 0.5931, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1918230056762695, "rewards/margins": 0.332853227853775, "rewards/rejected": -1.5246760845184326, "step": 5710 }, { "epoch": 0.99, "grad_norm": 3.171875, "learning_rate": 3.116032431747518e-09, "logits/chosen": -3.0023066997528076, "logits/rejected": -2.9940028190612793, "logps/chosen": -186.46493530273438, "logps/rejected": -214.6597137451172, "loss": 0.6009, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2312668561935425, "rewards/margins": 0.3432316184043884, "rewards/rejected": -1.5744985342025757, "step": 5720 }, { "epoch": 0.99, "grad_norm": 3.125, "learning_rate": 2.410526662007251e-09, "logits/chosen": -3.014190912246704, "logits/rejected": -3.008674383163452, "logps/chosen": -181.78567504882812, "logps/rejected": -203.11676025390625, "loss": 0.6337, "rewards/accuracies": 0.65625, "rewards/chosen": -1.188835859298706, "rewards/margins": 0.24085529148578644, "rewards/rejected": -1.429691195487976, "step": 5730 }, { "epoch": 0.99, "grad_norm": 3.0, "learning_rate": 1.7954161725791674e-09, "logits/chosen": -3.0035383701324463, "logits/rejected": -2.988966464996338, "logps/chosen": -197.06796264648438, "logps/rejected": -226.5892791748047, "loss": 0.5916, "rewards/accuracies": 0.65625, "rewards/chosen": -1.283865213394165, "rewards/margins": 0.36810502409935, "rewards/rejected": -1.6519702672958374, "step": 5740 }, { "epoch": 0.99, "grad_norm": 4.0625, "learning_rate": 1.270723226163284e-09, "logits/chosen": -3.031846761703491, "logits/rejected": -3.027182102203369, "logps/chosen": -192.7350311279297, "logps/rejected": -200.8181610107422, "loss": 0.6526, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.245029330253601, "rewards/margins": 0.17930933833122253, "rewards/rejected": -1.424338698387146, "step": 5750 }, { "epoch": 0.99, "grad_norm": 2.640625, "learning_rate": 8.364668129762221e-10, "logits/chosen": -3.01188325881958, "logits/rejected": -3.001596212387085, "logps/chosen": -187.3595428466797, "logps/rejected": -212.9336395263672, "loss": 0.6134, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2060251235961914, "rewards/margins": 0.28464144468307495, "rewards/rejected": -1.490666389465332, "step": 5760 }, { "epoch": 0.99, "grad_norm": 3.015625, "learning_rate": 4.926626500648124e-10, "logits/chosen": -3.0015952587127686, "logits/rejected": -2.9895007610321045, "logps/chosen": -179.03176879882812, "logps/rejected": -207.6799774169922, "loss": 0.6018, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1616852283477783, "rewards/margins": 0.2887330949306488, "rewards/rejected": -1.45041823387146, "step": 5770 }, { "epoch": 1.0, "grad_norm": 3.53125, "learning_rate": 2.393231807362728e-10, "logits/chosen": -3.0064327716827393, "logits/rejected": -2.9963736534118652, "logps/chosen": -185.0785675048828, "logps/rejected": -212.9348602294922, "loss": 0.619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.22031569480896, "rewards/margins": 0.2880048155784607, "rewards/rejected": -1.5083205699920654, "step": 5780 }, { "epoch": 1.0, "grad_norm": 2.515625, "learning_rate": 7.645757410912336e-11, "logits/chosen": -2.999741315841675, "logits/rejected": -2.9899821281433105, "logps/chosen": -166.2742156982422, "logps/rejected": -199.00619506835938, "loss": 0.5938, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0550991296768188, "rewards/margins": 0.31244513392448425, "rewards/rejected": -1.3675440549850464, "step": 5790 }, { "epoch": 1.0, "grad_norm": 2.765625, "learning_rate": 4.071724779286523e-12, "logits/chosen": -3.018495559692383, "logits/rejected": -3.012821674346924, "logps/chosen": -173.92466735839844, "logps/rejected": -207.4680938720703, "loss": 0.5876, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1266343593597412, "rewards/margins": 0.35082951188087463, "rewards/rejected": -1.477463960647583, "step": 5800 }, { "epoch": 1.0, "eval_logits/chosen": -3.023923397064209, "eval_logits/rejected": -3.017603874206543, "eval_logps/chosen": -166.78807067871094, "eval_logps/rejected": -187.0471954345703, "eval_loss": 0.6481729745864868, "eval_rewards/accuracies": 0.6171003580093384, "eval_rewards/chosen": -0.9537805914878845, "eval_rewards/margins": 0.1656205952167511, "eval_rewards/rejected": -1.1194013357162476, "eval_runtime": 483.4522, "eval_samples_per_second": 8.903, "eval_steps_per_second": 1.113, "step": 5800 }, { "epoch": 1.0, "step": 5803, "total_flos": 0.0, "train_loss": 0.0003277428618961422, "train_runtime": 17.7068, "train_samples_per_second": 5244.214, "train_steps_per_second": 327.728 } ], "logging_steps": 10, "max_steps": 5803, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }