{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 5.786441925419282, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.619342088699341, "logits/rejected": -2.5526421070098877, "logps/chosen": -265.45428466796875, "logps/rejected": -236.1463165283203, "loss": 0.6931, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -8.460578828817233e-05, "rewards/margins": 7.87067401688546e-05, "rewards/rejected": -0.000163312564836815, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 5.491351697550114, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6576342582702637, "logits/rejected": -2.5759005546569824, "logps/chosen": -298.7987060546875, "logps/rejected": -274.3077392578125, "loss": 0.6923, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.00013454208965413272, "rewards/margins": 0.0018433767836540937, "rewards/rejected": -0.0019779191352427006, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 5.277174601748908, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6759681701660156, "logits/rejected": -2.602142333984375, "logps/chosen": -290.32366943359375, "logps/rejected": -234.38919067382812, "loss": 0.6891, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.010090610012412071, "rewards/margins": 0.014163595624268055, "rewards/rejected": -0.004072986543178558, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 5.753580388451781, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.660831928253174, "logits/rejected": -2.61124324798584, "logps/chosen": -280.9427490234375, "logps/rejected": -267.7388000488281, "loss": 0.68, "rewards/accuracies": 0.6875, "rewards/chosen": 0.042264439165592194, "rewards/margins": 0.04172234237194061, "rewards/rejected": 0.0005421031382866204, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 6.220985202592042, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.61970591545105, "logits/rejected": -2.6113502979278564, "logps/chosen": -294.2338562011719, "logps/rejected": -305.00823974609375, "loss": 0.6653, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.020173203200101852, "rewards/margins": 0.07841379195451736, "rewards/rejected": -0.058240581303834915, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.5731215476989746, "eval_logits/rejected": -2.4923324584960938, "eval_logps/chosen": -285.4681701660156, "eval_logps/rejected": -260.73919677734375, "eval_loss": 0.6512896418571472, "eval_rewards/accuracies": 0.693965494632721, "eval_rewards/chosen": -0.003780907019972801, "eval_rewards/margins": 0.13838669657707214, "eval_rewards/rejected": -0.1421675980091095, "eval_runtime": 91.4355, "eval_samples_per_second": 19.883, "eval_steps_per_second": 0.317, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 8.996643288143726, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.5592479705810547, "logits/rejected": -2.5012314319610596, "logps/chosen": -296.82501220703125, "logps/rejected": -279.3923645019531, "loss": 0.6437, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07340322434902191, "rewards/margins": 0.14495481550693512, "rewards/rejected": -0.21835803985595703, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 9.499312526523301, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.4377551078796387, "logits/rejected": -2.392364740371704, "logps/chosen": -343.14935302734375, "logps/rejected": -305.904541015625, "loss": 0.6225, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15578363835811615, "rewards/margins": 0.2891853451728821, "rewards/rejected": -0.4449689984321594, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 13.383949209241152, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.324429512023926, "logits/rejected": -2.2132680416107178, "logps/chosen": -312.23431396484375, "logps/rejected": -314.443115234375, "loss": 0.6094, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2638443112373352, "rewards/margins": 0.35402628779411316, "rewards/rejected": -0.617870569229126, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 13.193135435969568, "learning_rate": 4.832031033425662e-07, "logits/chosen": -1.3427231311798096, "logits/rejected": -1.2150843143463135, "logps/chosen": -337.8877258300781, "logps/rejected": -351.61114501953125, "loss": 0.588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5029428601264954, "rewards/margins": 0.5451563596725464, "rewards/rejected": -1.048099160194397, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 18.776758250311076, "learning_rate": 4.752422169756047e-07, "logits/chosen": -0.1327299326658249, "logits/rejected": 0.009161601774394512, "logps/chosen": -333.9382019042969, "logps/rejected": -386.19952392578125, "loss": 0.5699, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6151348352432251, "rewards/margins": 0.5275757312774658, "rewards/rejected": -1.142710566520691, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": 0.3393348455429077, "eval_logits/rejected": 0.8771037459373474, "eval_logps/chosen": -373.8545227050781, "eval_logps/rejected": -407.7341613769531, "eval_loss": 0.5577893257141113, "eval_rewards/accuracies": 0.6853448152542114, "eval_rewards/chosen": -0.8876444697380066, "eval_rewards/margins": 0.7244728207588196, "eval_rewards/rejected": -1.6121174097061157, "eval_runtime": 92.82, "eval_samples_per_second": 19.586, "eval_steps_per_second": 0.312, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 16.828561137860333, "learning_rate": 4.658354083558188e-07, "logits/chosen": 0.10855080932378769, "logits/rejected": 0.6684142351150513, "logps/chosen": -363.28369140625, "logps/rejected": -392.91302490234375, "loss": 0.5591, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9809554815292358, "rewards/margins": 0.6108429431915283, "rewards/rejected": -1.5917984247207642, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 19.631970765133257, "learning_rate": 4.550430636492389e-07, "logits/chosen": 0.46126747131347656, "logits/rejected": 0.8139599561691284, "logps/chosen": -382.4955139160156, "logps/rejected": -427.80206298828125, "loss": 0.5619, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0667617321014404, "rewards/margins": 0.6966198682785034, "rewards/rejected": -1.7633816003799438, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 23.999804178074818, "learning_rate": 4.429344633468004e-07, "logits/chosen": 1.1685855388641357, "logits/rejected": 1.665592908859253, "logps/chosen": -352.095458984375, "logps/rejected": -419.9580078125, "loss": 0.5499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9591536521911621, "rewards/margins": 0.8143006563186646, "rewards/rejected": -1.7734540700912476, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 18.64421738880492, "learning_rate": 4.2958733752443187e-07, "logits/chosen": 1.145119547843933, "logits/rejected": 1.878694772720337, "logps/chosen": -350.3140563964844, "logps/rejected": -365.4109802246094, "loss": 0.5568, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7412891387939453, "rewards/margins": 0.6698554754257202, "rewards/rejected": -1.411144733428955, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 20.200927048362935, "learning_rate": 4.150873668617898e-07, "logits/chosen": 1.7342513799667358, "logits/rejected": 2.4978702068328857, "logps/chosen": -387.82183837890625, "logps/rejected": -444.968505859375, "loss": 0.5416, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2127825021743774, "rewards/margins": 0.9050270915031433, "rewards/rejected": -2.117809772491455, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": 1.2442059516906738, "eval_logits/rejected": 2.1977624893188477, "eval_logps/chosen": -399.5646667480469, "eval_logps/rejected": -458.08807373046875, "eval_loss": 0.5319975018501282, "eval_rewards/accuracies": 0.7025862336158752, "eval_rewards/chosen": -1.1447453498840332, "eval_rewards/margins": 0.9709104895591736, "eval_rewards/rejected": -2.1156558990478516, "eval_runtime": 91.9947, "eval_samples_per_second": 19.762, "eval_steps_per_second": 0.315, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 21.063284567870923, "learning_rate": 3.9952763262280397e-07, "logits/chosen": 1.3700188398361206, "logits/rejected": 1.9712657928466797, "logps/chosen": -422.37750244140625, "logps/rejected": -497.3843688964844, "loss": 0.5292, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2582637071609497, "rewards/margins": 0.8558940887451172, "rewards/rejected": -2.1141579151153564, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 20.8460851273499, "learning_rate": 3.8300801912883414e-07, "logits/chosen": 1.7427918910980225, "logits/rejected": 2.409679412841797, "logps/chosen": -401.26861572265625, "logps/rejected": -495.7734375, "loss": 0.5061, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2966772317886353, "rewards/margins": 0.9603677988052368, "rewards/rejected": -2.257045269012451, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 17.79571920632354, "learning_rate": 3.6563457256020884e-07, "logits/chosen": 1.8351173400878906, "logits/rejected": 2.8393807411193848, "logps/chosen": -433.7701110839844, "logps/rejected": -456.1665954589844, "loss": 0.5391, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3041222095489502, "rewards/margins": 0.8725945353507996, "rewards/rejected": -2.1767165660858154, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 20.925215758529603, "learning_rate": 3.475188202022617e-07, "logits/chosen": 2.0596728324890137, "logits/rejected": 2.587247371673584, "logps/chosen": -375.49212646484375, "logps/rejected": -489.371826171875, "loss": 0.517, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1737760305404663, "rewards/margins": 1.0764976739883423, "rewards/rejected": -2.2502734661102295, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 21.33737350989529, "learning_rate": 3.287770545059052e-07, "logits/chosen": 2.203148365020752, "logits/rejected": 2.9923298358917236, "logps/chosen": -424.408203125, "logps/rejected": -482.44879150390625, "loss": 0.5318, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4612659215927124, "rewards/margins": 0.9246621131896973, "rewards/rejected": -2.38592791557312, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": 1.3448154926300049, "eval_logits/rejected": 2.4483416080474854, "eval_logps/chosen": -404.1459655761719, "eval_logps/rejected": -469.58026123046875, "eval_loss": 0.5122287273406982, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -1.1905584335327148, "eval_rewards/margins": 1.0400197505950928, "eval_rewards/rejected": -2.2305781841278076, "eval_runtime": 92.2657, "eval_samples_per_second": 19.704, "eval_steps_per_second": 0.314, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 22.48419176653969, "learning_rate": 3.0952958655864954e-07, "logits/chosen": 1.9083038568496704, "logits/rejected": 2.511636734008789, "logps/chosen": -405.66839599609375, "logps/rejected": -489.11663818359375, "loss": 0.5124, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3236396312713623, "rewards/margins": 1.0241048336029053, "rewards/rejected": -2.3477444648742676, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 24.286637463373633, "learning_rate": 2.898999737583448e-07, "logits/chosen": 1.8827180862426758, "logits/rejected": 2.9742465019226074, "logps/chosen": -462.0406799316406, "logps/rejected": -545.8687133789062, "loss": 0.503, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.3936035633087158, "rewards/margins": 1.215381383895874, "rewards/rejected": -2.6089847087860107, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 19.94988074398488, "learning_rate": 2.7001422664752333e-07, "logits/chosen": 1.1973925828933716, "logits/rejected": 1.7874739170074463, "logps/chosen": -377.791748046875, "logps/rejected": -472.1722717285156, "loss": 0.5198, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0984822511672974, "rewards/margins": 0.9670408368110657, "rewards/rejected": -2.0655229091644287, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 24.247680553138093, "learning_rate": 2.5e-07, "logits/chosen": 1.5466783046722412, "logits/rejected": 2.6804802417755127, "logps/chosen": -449.4345703125, "logps/rejected": -522.0684814453125, "loss": 0.5279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6190277338027954, "rewards/margins": 0.9881566762924194, "rewards/rejected": -2.6071841716766357, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 18.449169539289652, "learning_rate": 2.2998577335247667e-07, "logits/chosen": 1.7379165887832642, "logits/rejected": 2.9272639751434326, "logps/chosen": -468.9725646972656, "logps/rejected": -532.5216674804688, "loss": 0.5178, "rewards/accuracies": 0.75, "rewards/chosen": -1.6517877578735352, "rewards/margins": 1.06899094581604, "rewards/rejected": -2.720778226852417, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": 1.3214832544326782, "eval_logits/rejected": 2.5336461067199707, "eval_logps/chosen": -429.1148681640625, "eval_logps/rejected": -502.6709289550781, "eval_loss": 0.5028622150421143, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -1.440247893333435, "eval_rewards/margins": 1.1212375164031982, "eval_rewards/rejected": -2.561485528945923, "eval_runtime": 92.4677, "eval_samples_per_second": 19.661, "eval_steps_per_second": 0.314, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 18.218334579311854, "learning_rate": 2.1010002624165524e-07, "logits/chosen": 1.2481517791748047, "logits/rejected": 2.1310501098632812, "logps/chosen": -418.47222900390625, "logps/rejected": -541.3790283203125, "loss": 0.5073, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4775768518447876, "rewards/margins": 1.1922760009765625, "rewards/rejected": -2.6698527336120605, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 18.269676157352414, "learning_rate": 1.9047041344135043e-07, "logits/chosen": 1.8889141082763672, "logits/rejected": 2.4701380729675293, "logps/chosen": -426.7981872558594, "logps/rejected": -536.7744750976562, "loss": 0.504, "rewards/accuracies": 0.75, "rewards/chosen": -1.607358694076538, "rewards/margins": 1.241914987564087, "rewards/rejected": -2.849273920059204, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 21.937738574869034, "learning_rate": 1.7122294549409482e-07, "logits/chosen": 2.0935568809509277, "logits/rejected": 3.0940475463867188, "logps/chosen": -444.2198181152344, "logps/rejected": -559.611328125, "loss": 0.5276, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7122100591659546, "rewards/margins": 1.2459454536437988, "rewards/rejected": -2.958155393600464, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 23.622077882281282, "learning_rate": 1.524811797977383e-07, "logits/chosen": 2.260361433029175, "logits/rejected": 3.3843066692352295, "logps/chosen": -451.55938720703125, "logps/rejected": -546.6029052734375, "loss": 0.5071, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6382853984832764, "rewards/margins": 1.288576364517212, "rewards/rejected": -2.9268617630004883, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 20.079829542188833, "learning_rate": 1.3436542743979125e-07, "logits/chosen": 1.4699538946151733, "logits/rejected": 2.1643927097320557, "logps/chosen": -453.72039794921875, "logps/rejected": -500.624267578125, "loss": 0.519, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4777113199234009, "rewards/margins": 0.939932644367218, "rewards/rejected": -2.4176440238952637, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": 1.3894954919815063, "eval_logits/rejected": 2.688563346862793, "eval_logps/chosen": -433.8905944824219, "eval_logps/rejected": -514.7540283203125, "eval_loss": 0.4985021948814392, "eval_rewards/accuracies": 0.7370689511299133, "eval_rewards/chosen": -1.488004446029663, "eval_rewards/margins": 1.1943109035491943, "eval_rewards/rejected": -2.6823153495788574, "eval_runtime": 91.7036, "eval_samples_per_second": 19.825, "eval_steps_per_second": 0.316, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 20.41756706068905, "learning_rate": 1.1699198087116588e-07, "logits/chosen": 1.4253332614898682, "logits/rejected": 2.419015407562256, "logps/chosen": -438.324462890625, "logps/rejected": -531.0131225585938, "loss": 0.5304, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.603592872619629, "rewards/margins": 1.0420410633087158, "rewards/rejected": -2.6456339359283447, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 23.710557812034125, "learning_rate": 1.00472367377196e-07, "logits/chosen": 1.8713343143463135, "logits/rejected": 3.118149757385254, "logps/chosen": -430.84912109375, "logps/rejected": -534.192626953125, "loss": 0.5078, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5661717653274536, "rewards/margins": 1.4308522939682007, "rewards/rejected": -2.9970240592956543, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 22.190813187925453, "learning_rate": 8.49126331382102e-08, "logits/chosen": 2.05544376373291, "logits/rejected": 2.8256397247314453, "logps/chosen": -446.35931396484375, "logps/rejected": -508.9071350097656, "loss": 0.5035, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7183713912963867, "rewards/margins": 0.8996985554695129, "rewards/rejected": -2.618070125579834, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 26.678335434261424, "learning_rate": 7.041266247556812e-08, "logits/chosen": 1.8211781978607178, "logits/rejected": 2.5711045265197754, "logps/chosen": -450.910400390625, "logps/rejected": -517.4847412109375, "loss": 0.501, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6291780471801758, "rewards/margins": 0.9458298683166504, "rewards/rejected": -2.575007677078247, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 22.960680980618804, "learning_rate": 5.706553665319955e-08, "logits/chosen": 1.8633334636688232, "logits/rejected": 3.0288431644439697, "logps/chosen": -437.11260986328125, "logps/rejected": -511.2594299316406, "loss": 0.5137, "rewards/accuracies": 0.75, "rewards/chosen": -1.6262149810791016, "rewards/margins": 1.1097644567489624, "rewards/rejected": -2.7359795570373535, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": 1.8716328144073486, "eval_logits/rejected": 3.1580371856689453, "eval_logps/chosen": -446.3657531738281, "eval_logps/rejected": -532.5296020507812, "eval_loss": 0.49309831857681274, "eval_rewards/accuracies": 0.732758641242981, "eval_rewards/chosen": -1.612756371498108, "eval_rewards/margins": 1.247314453125, "eval_rewards/rejected": -2.8600711822509766, "eval_runtime": 91.4764, "eval_samples_per_second": 19.874, "eval_steps_per_second": 0.317, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 25.170197026630024, "learning_rate": 4.4956936350761005e-08, "logits/chosen": 1.9063358306884766, "logits/rejected": 2.6921870708465576, "logps/chosen": -398.64984130859375, "logps/rejected": -524.7725830078125, "loss": 0.4898, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4632974863052368, "rewards/margins": 1.1990567445755005, "rewards/rejected": -2.662353992462158, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 23.25403059611051, "learning_rate": 3.416459164418123e-08, "logits/chosen": 1.4586713314056396, "logits/rejected": 2.6852428913116455, "logps/chosen": -454.4266052246094, "logps/rejected": -542.4071044921875, "loss": 0.4926, "rewards/accuracies": 0.75, "rewards/chosen": -1.5484213829040527, "rewards/margins": 1.2499048709869385, "rewards/rejected": -2.798326015472412, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 28.54168436834153, "learning_rate": 2.475778302439524e-08, "logits/chosen": 1.7404067516326904, "logits/rejected": 3.0292108058929443, "logps/chosen": -451.39849853515625, "logps/rejected": -544.4757080078125, "loss": 0.5001, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5502045154571533, "rewards/margins": 1.3179900646209717, "rewards/rejected": -2.868194818496704, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 25.43427497983774, "learning_rate": 1.6796896657433805e-08, "logits/chosen": 2.222611904144287, "logits/rejected": 3.231661558151245, "logps/chosen": -413.8394470214844, "logps/rejected": -501.4020080566406, "loss": 0.515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6188195943832397, "rewards/margins": 1.1021382808685303, "rewards/rejected": -2.7209575176239014, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 24.959131634596684, "learning_rate": 1.0333036740834855e-08, "logits/chosen": 2.535111427307129, "logits/rejected": 3.2772762775421143, "logps/chosen": -382.0926208496094, "logps/rejected": -499.1607971191406, "loss": 0.5033, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5753300189971924, "rewards/margins": 1.1241223812103271, "rewards/rejected": -2.6994521617889404, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": 2.0773062705993652, "eval_logits/rejected": 3.358436346054077, "eval_logps/chosen": -449.9159851074219, "eval_logps/rejected": -541.3424072265625, "eval_loss": 0.49311476945877075, "eval_rewards/accuracies": 0.7413793206214905, "eval_rewards/chosen": -1.6482590436935425, "eval_rewards/margins": 1.299940824508667, "eval_rewards/rejected": -2.948199987411499, "eval_runtime": 92.5987, "eval_samples_per_second": 19.633, "eval_steps_per_second": 0.313, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 28.32448783876109, "learning_rate": 5.4076974448211685e-09, "logits/chosen": 2.5014076232910156, "logits/rejected": 3.729045867919922, "logps/chosen": -446.04974365234375, "logps/rejected": -546.2440185546875, "loss": 0.4985, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7509788274765015, "rewards/margins": 1.3309944868087769, "rewards/rejected": -3.0819735527038574, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 31.377213722949094, "learning_rate": 2.052496544188487e-09, "logits/chosen": 2.54590106010437, "logits/rejected": 3.5291149616241455, "logps/chosen": -431.8235778808594, "logps/rejected": -553.3231201171875, "loss": 0.4966, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.723846197128296, "rewards/margins": 1.350610613822937, "rewards/rejected": -3.0744569301605225, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 25.27983521226701, "learning_rate": 2.889724508297886e-10, "logits/chosen": 2.104177713394165, "logits/rejected": 3.586791515350342, "logps/chosen": -467.50860595703125, "logps/rejected": -525.298828125, "loss": 0.4944, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.64333176612854, "rewards/margins": 1.1692912578582764, "rewards/rejected": -2.8126227855682373, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.5464360101507344, "train_runtime": 11333.4438, "train_samples_per_second": 4.92, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }