{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 5.353972534143438, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.6582446098327637, "logits/rejected": -2.612395763397217, "logps/chosen": -310.3081359863281, "logps/rejected": -241.6246337890625, "loss": 0.6932, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00043685571290552616, "rewards/margins": -0.0005496515659615397, "rewards/rejected": 0.0001127958094002679, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 6.431385284276218, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.690976142883301, "logits/rejected": -2.615501880645752, "logps/chosen": -293.55859375, "logps/rejected": -265.65789794921875, "loss": 0.6924, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.001354431384243071, "rewards/margins": 0.0023786118254065514, "rewards/rejected": -0.0010241802083328366, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 5.140938328988767, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6976418495178223, "logits/rejected": -2.6304168701171875, "logps/chosen": -277.8341064453125, "logps/rejected": -297.1772155761719, "loss": 0.6892, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004078004974871874, "rewards/margins": 0.009664928540587425, "rewards/rejected": -0.005586923565715551, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 5.971632655809275, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.616170883178711, "logits/rejected": -2.5451369285583496, "logps/chosen": -283.9632568359375, "logps/rejected": -259.82861328125, "loss": 0.6798, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03654901683330536, "rewards/margins": 0.045721281319856644, "rewards/rejected": -0.00917226541787386, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 5.916656852320022, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.5970985889434814, "logits/rejected": -2.5133914947509766, "logps/chosen": -285.24835205078125, "logps/rejected": -247.303466796875, "loss": 0.6688, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006145569030195475, "rewards/margins": 0.0578111931681633, "rewards/rejected": -0.06395676732063293, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.6066324710845947, "eval_logits/rejected": -2.506901979446411, "eval_logps/chosen": -286.6465759277344, "eval_logps/rejected": -258.62078857421875, "eval_loss": 0.6561177968978882, "eval_rewards/accuracies": 0.6767241358757019, "eval_rewards/chosen": -0.02640603668987751, "eval_rewards/margins": 0.10332722216844559, "eval_rewards/rejected": -0.12973324954509735, "eval_runtime": 91.0244, "eval_samples_per_second": 19.973, "eval_steps_per_second": 0.319, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 7.499634288772489, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.582371234893799, "logits/rejected": -2.5081627368927, "logps/chosen": -292.10491943359375, "logps/rejected": -282.31195068359375, "loss": 0.6423, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11516664922237396, "rewards/margins": 0.19041100144386292, "rewards/rejected": -0.30557766556739807, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 17.30037068758165, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.4502875804901123, "logits/rejected": -2.3790054321289062, "logps/chosen": -298.32244873046875, "logps/rejected": -273.11859130859375, "loss": 0.6397, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2997247576713562, "rewards/margins": 0.23786215484142303, "rewards/rejected": -0.5375869870185852, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 11.228813057299567, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.5100908279418945, "logits/rejected": -2.4027259349823, "logps/chosen": -306.67510986328125, "logps/rejected": -322.7925720214844, "loss": 0.6257, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36647987365722656, "rewards/margins": 0.28450754284858704, "rewards/rejected": -0.6509873867034912, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 15.58920411413326, "learning_rate": 4.832031033425662e-07, "logits/chosen": -1.5505931377410889, "logits/rejected": -1.3694034814834595, "logps/chosen": -357.6716613769531, "logps/rejected": -372.05133056640625, "loss": 0.5967, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5348917245864868, "rewards/margins": 0.4341323971748352, "rewards/rejected": -0.9690243005752563, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 13.006583087547677, "learning_rate": 4.752422169756047e-07, "logits/chosen": -0.7837198972702026, "logits/rejected": -0.35428792238235474, "logps/chosen": -326.9918518066406, "logps/rejected": -346.36737060546875, "loss": 0.5822, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4772585332393646, "rewards/margins": 0.4704399108886719, "rewards/rejected": -0.9476984143257141, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -0.09357786923646927, "eval_logits/rejected": 0.4795497953891754, "eval_logps/chosen": -374.6986083984375, "eval_logps/rejected": -392.9424743652344, "eval_loss": 0.5813368558883667, "eval_rewards/accuracies": 0.6724137663841248, "eval_rewards/chosen": -0.9069267511367798, "eval_rewards/margins": 0.5660232305526733, "eval_rewards/rejected": -1.4729499816894531, "eval_runtime": 91.4662, "eval_samples_per_second": 19.876, "eval_steps_per_second": 0.317, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 14.7655268267239, "learning_rate": 4.658354083558188e-07, "logits/chosen": -0.23613190650939941, "logits/rejected": 0.2948758006095886, "logps/chosen": -371.15667724609375, "logps/rejected": -427.76885986328125, "loss": 0.5606, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7391853332519531, "rewards/margins": 0.7208150625228882, "rewards/rejected": -1.4600005149841309, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 28.232913631245626, "learning_rate": 4.550430636492389e-07, "logits/chosen": 0.3172193467617035, "logits/rejected": 1.228100299835205, "logps/chosen": -412.1929626464844, "logps/rejected": -428.08056640625, "loss": 0.5789, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0178120136260986, "rewards/margins": 0.6625908613204956, "rewards/rejected": -1.6804027557373047, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 23.106046920597972, "learning_rate": 4.429344633468004e-07, "logits/chosen": 1.296276330947876, "logits/rejected": 2.0952706336975098, "logps/chosen": -377.18572998046875, "logps/rejected": -435.5022888183594, "loss": 0.5712, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8953048586845398, "rewards/margins": 0.8917394876480103, "rewards/rejected": -1.7870445251464844, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 21.3617509080007, "learning_rate": 4.2958733752443187e-07, "logits/chosen": 1.0354318618774414, "logits/rejected": 2.103768825531006, "logps/chosen": -374.42938232421875, "logps/rejected": -408.32342529296875, "loss": 0.5477, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9035611152648926, "rewards/margins": 0.7404158115386963, "rewards/rejected": -1.643977165222168, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 21.53708307235743, "learning_rate": 4.150873668617898e-07, "logits/chosen": 0.8976553678512573, "logits/rejected": 2.0599629878997803, "logps/chosen": -370.0615234375, "logps/rejected": -411.820068359375, "loss": 0.5512, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7934576869010925, "rewards/margins": 0.716931939125061, "rewards/rejected": -1.5103896856307983, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": 1.9097994565963745, "eval_logits/rejected": 2.9840593338012695, "eval_logps/chosen": -382.4128112792969, "eval_logps/rejected": -426.216552734375, "eval_loss": 0.5533820390701294, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -0.9840683937072754, "eval_rewards/margins": 0.8216219544410706, "eval_rewards/rejected": -1.8056902885437012, "eval_runtime": 91.5586, "eval_samples_per_second": 19.856, "eval_steps_per_second": 0.317, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 17.634177990437703, "learning_rate": 3.9952763262280397e-07, "logits/chosen": 1.8957267999649048, "logits/rejected": 2.8357367515563965, "logps/chosen": -408.04705810546875, "logps/rejected": -450.3290100097656, "loss": 0.5609, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1164201498031616, "rewards/margins": 0.8357731103897095, "rewards/rejected": -1.952193021774292, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 24.6798606158854, "learning_rate": 3.8300801912883414e-07, "logits/chosen": 1.3192155361175537, "logits/rejected": 2.3843648433685303, "logps/chosen": -356.1672058105469, "logps/rejected": -387.1358947753906, "loss": 0.5414, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9024198651313782, "rewards/margins": 0.7546060681343079, "rewards/rejected": -1.6570260524749756, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 22.218831723434445, "learning_rate": 3.6563457256020884e-07, "logits/chosen": 1.3455697298049927, "logits/rejected": 2.5438590049743652, "logps/chosen": -351.62774658203125, "logps/rejected": -430.138427734375, "loss": 0.5396, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9148648381233215, "rewards/margins": 0.9510505795478821, "rewards/rejected": -1.8659156560897827, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 25.297690497973328, "learning_rate": 3.475188202022617e-07, "logits/chosen": 1.7387921810150146, "logits/rejected": 2.998396396636963, "logps/chosen": -333.116455078125, "logps/rejected": -437.8639221191406, "loss": 0.5399, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7444877624511719, "rewards/margins": 1.007294774055481, "rewards/rejected": -1.7517824172973633, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 25.947440207471573, "learning_rate": 3.287770545059052e-07, "logits/chosen": 1.9480648040771484, "logits/rejected": 2.9033870697021484, "logps/chosen": -380.21185302734375, "logps/rejected": -424.3128967285156, "loss": 0.5364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1435054540634155, "rewards/margins": 0.736918032169342, "rewards/rejected": -1.8804235458374023, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": 2.4303267002105713, "eval_logits/rejected": 3.876626491546631, "eval_logps/chosen": -425.8599853515625, "eval_logps/rejected": -481.8061828613281, "eval_loss": 0.5367683172225952, "eval_rewards/accuracies": 0.732758641242981, "eval_rewards/chosen": -1.4185398817062378, "eval_rewards/margins": 0.9430465698242188, "eval_rewards/rejected": -2.361586570739746, "eval_runtime": 91.0832, "eval_samples_per_second": 19.96, "eval_steps_per_second": 0.318, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 19.563478098052215, "learning_rate": 3.0952958655864954e-07, "logits/chosen": 3.3911328315734863, "logits/rejected": 4.194566249847412, "logps/chosen": -451.25238037109375, "logps/rejected": -539.4793701171875, "loss": 0.5321, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.744564414024353, "rewards/margins": 0.8036998510360718, "rewards/rejected": -2.548264503479004, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 24.94888206530693, "learning_rate": 2.898999737583448e-07, "logits/chosen": 2.1577422618865967, "logits/rejected": 3.670943021774292, "logps/chosen": -426.3487854003906, "logps/rejected": -508.80780029296875, "loss": 0.5365, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5799643993377686, "rewards/margins": 0.9677878618240356, "rewards/rejected": -2.5477521419525146, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 23.826139773858404, "learning_rate": 2.7001422664752333e-07, "logits/chosen": 0.8777297735214233, "logits/rejected": 2.3443570137023926, "logps/chosen": -384.8174743652344, "logps/rejected": -467.87298583984375, "loss": 0.5416, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1064906120300293, "rewards/margins": 1.0363706350326538, "rewards/rejected": -2.1428613662719727, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 18.150746967508407, "learning_rate": 2.5e-07, "logits/chosen": 2.9278922080993652, "logits/rejected": 3.511791706085205, "logps/chosen": -450.8653259277344, "logps/rejected": -547.5108642578125, "loss": 0.5638, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.810927391052246, "rewards/margins": 1.0066088438034058, "rewards/rejected": -2.8175363540649414, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 21.73286552315769, "learning_rate": 2.2998577335247667e-07, "logits/chosen": 2.6097209453582764, "logits/rejected": 3.895547389984131, "logps/chosen": -484.319091796875, "logps/rejected": -561.3115234375, "loss": 0.5308, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1941843032836914, "rewards/margins": 0.9737985730171204, "rewards/rejected": -3.167982578277588, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": 2.454780101776123, "eval_logits/rejected": 4.2408928871154785, "eval_logps/chosen": -519.5304565429688, "eval_logps/rejected": -602.6265869140625, "eval_loss": 0.5234553217887878, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -2.3552448749542236, "eval_rewards/margins": 1.2145458459854126, "eval_rewards/rejected": -3.5697906017303467, "eval_runtime": 91.7528, "eval_samples_per_second": 19.814, "eval_steps_per_second": 0.316, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 29.190042715796082, "learning_rate": 2.1010002624165524e-07, "logits/chosen": 2.5975215435028076, "logits/rejected": 4.360453128814697, "logps/chosen": -554.3533935546875, "logps/rejected": -655.7716064453125, "loss": 0.5307, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5185165405273438, "rewards/margins": 1.447409987449646, "rewards/rejected": -3.9659264087677, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 20.081922974931803, "learning_rate": 1.9047041344135043e-07, "logits/chosen": 2.110996723175049, "logits/rejected": 3.4121272563934326, "logps/chosen": -542.6881103515625, "logps/rejected": -613.2174682617188, "loss": 0.5514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.664341449737549, "rewards/margins": 0.9934176206588745, "rewards/rejected": -3.657759189605713, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 20.797156741141926, "learning_rate": 1.7122294549409482e-07, "logits/chosen": 1.95541250705719, "logits/rejected": 3.574702024459839, "logps/chosen": -541.0426025390625, "logps/rejected": -644.5146484375, "loss": 0.522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.637174129486084, "rewards/margins": 1.117336630821228, "rewards/rejected": -3.7545104026794434, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 22.250484161252675, "learning_rate": 1.524811797977383e-07, "logits/chosen": 1.9210926294326782, "logits/rejected": 3.2735812664031982, "logps/chosen": -576.992431640625, "logps/rejected": -663.1363525390625, "loss": 0.5191, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8690013885498047, "rewards/margins": 1.0208569765090942, "rewards/rejected": -3.8898582458496094, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 24.39105290704078, "learning_rate": 1.3436542743979125e-07, "logits/chosen": 1.646095871925354, "logits/rejected": 3.443913221359253, "logps/chosen": -558.287353515625, "logps/rejected": -657.3651123046875, "loss": 0.532, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7306606769561768, "rewards/margins": 1.3434104919433594, "rewards/rejected": -4.074070930480957, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": 1.1080348491668701, "eval_logits/rejected": 3.015399694442749, "eval_logps/chosen": -535.3407592773438, "eval_logps/rejected": -617.0262451171875, "eval_loss": 0.5116756558418274, "eval_rewards/accuracies": 0.7198275923728943, "eval_rewards/chosen": -2.513347864151001, "eval_rewards/margins": 1.200439691543579, "eval_rewards/rejected": -3.713787794113159, "eval_runtime": 91.1655, "eval_samples_per_second": 19.942, "eval_steps_per_second": 0.318, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 27.989896858900266, "learning_rate": 1.1699198087116588e-07, "logits/chosen": 2.1850762367248535, "logits/rejected": 3.5708484649658203, "logps/chosen": -533.7103271484375, "logps/rejected": -638.1661376953125, "loss": 0.54, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7955188751220703, "rewards/margins": 1.163648247718811, "rewards/rejected": -3.959167003631592, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 26.423781855718417, "learning_rate": 1.00472367377196e-07, "logits/chosen": 1.8076130151748657, "logits/rejected": 4.071971893310547, "logps/chosen": -614.3458251953125, "logps/rejected": -702.5189819335938, "loss": 0.5138, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0845131874084473, "rewards/margins": 1.4407538175582886, "rewards/rejected": -4.525267601013184, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 24.484829935546955, "learning_rate": 8.49126331382102e-08, "logits/chosen": 2.0375962257385254, "logits/rejected": 3.5112037658691406, "logps/chosen": -607.512939453125, "logps/rejected": -720.7943115234375, "loss": 0.5187, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2878499031066895, "rewards/margins": 1.2010066509246826, "rewards/rejected": -4.488856315612793, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 23.41145949202815, "learning_rate": 7.041266247556812e-08, "logits/chosen": 1.657248854637146, "logits/rejected": 3.65797758102417, "logps/chosen": -553.1179809570312, "logps/rejected": -692.9093017578125, "loss": 0.5258, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.909062623977661, "rewards/margins": 1.4310705661773682, "rewards/rejected": -4.340132713317871, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 28.908000369301963, "learning_rate": 5.706553665319955e-08, "logits/chosen": 0.9193560481071472, "logits/rejected": 3.4261555671691895, "logps/chosen": -548.4246215820312, "logps/rejected": -669.2120361328125, "loss": 0.5064, "rewards/accuracies": 0.8125, "rewards/chosen": -2.572448968887329, "rewards/margins": 1.67291259765625, "rewards/rejected": -4.245361804962158, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": 0.7126501202583313, "eval_logits/rejected": 2.685429096221924, "eval_logps/chosen": -547.6198120117188, "eval_logps/rejected": -636.6490478515625, "eval_loss": 0.5116574168205261, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -2.636138677597046, "eval_rewards/margins": 1.2738765478134155, "eval_rewards/rejected": -3.9100148677825928, "eval_runtime": 90.9536, "eval_samples_per_second": 19.988, "eval_steps_per_second": 0.319, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 24.395714355317615, "learning_rate": 4.4956936350761005e-08, "logits/chosen": 0.9300888180732727, "logits/rejected": 2.3748581409454346, "logps/chosen": -543.5307006835938, "logps/rejected": -659.3692016601562, "loss": 0.5084, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6378746032714844, "rewards/margins": 1.2973625659942627, "rewards/rejected": -3.935237169265747, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 22.09369129566989, "learning_rate": 3.416459164418123e-08, "logits/chosen": 0.2887948155403137, "logits/rejected": 2.4889461994171143, "logps/chosen": -591.7805786132812, "logps/rejected": -667.9407348632812, "loss": 0.5109, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5994677543640137, "rewards/margins": 1.3468105792999268, "rewards/rejected": -3.9462783336639404, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 26.209345843998328, "learning_rate": 2.475778302439524e-08, "logits/chosen": 0.8550162315368652, "logits/rejected": 3.1779205799102783, "logps/chosen": -580.2351684570312, "logps/rejected": -637.8566284179688, "loss": 0.5075, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.8695898056030273, "rewards/margins": 1.257644534111023, "rewards/rejected": -4.127234935760498, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 21.12858235158005, "learning_rate": 1.6796896657433805e-08, "logits/chosen": 0.47364893555641174, "logits/rejected": 2.78879451751709, "logps/chosen": -587.971435546875, "logps/rejected": -724.6598510742188, "loss": 0.5014, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7999932765960693, "rewards/margins": 1.9162429571151733, "rewards/rejected": -4.716236591339111, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 23.688779288096637, "learning_rate": 1.0333036740834855e-08, "logits/chosen": 1.2185232639312744, "logits/rejected": 2.7246413230895996, "logps/chosen": -606.762939453125, "logps/rejected": -717.7750244140625, "loss": 0.5105, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.1490612030029297, "rewards/margins": 1.1964399814605713, "rewards/rejected": -4.345500946044922, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": 0.863433837890625, "eval_logits/rejected": 2.9146454334259033, "eval_logps/chosen": -575.282958984375, "eval_logps/rejected": -674.3233032226562, "eval_loss": 0.5098804235458374, "eval_rewards/accuracies": 0.732758641242981, "eval_rewards/chosen": -2.9127700328826904, "eval_rewards/margins": 1.3739889860153198, "eval_rewards/rejected": -4.286758899688721, "eval_runtime": 91.1821, "eval_samples_per_second": 19.938, "eval_steps_per_second": 0.318, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 23.71509307695075, "learning_rate": 5.4076974448211685e-09, "logits/chosen": 1.313011884689331, "logits/rejected": 2.788435459136963, "logps/chosen": -609.2062377929688, "logps/rejected": -684.1138305664062, "loss": 0.5379, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2180449962615967, "rewards/margins": 1.0637754201889038, "rewards/rejected": -4.281820297241211, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 27.57431143958426, "learning_rate": 2.052496544188487e-09, "logits/chosen": 1.1412100791931152, "logits/rejected": 3.4668610095977783, "logps/chosen": -616.3365478515625, "logps/rejected": -685.5581665039062, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": -3.115288496017456, "rewards/margins": 1.404497504234314, "rewards/rejected": -4.5197858810424805, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 19.004038331375536, "learning_rate": 2.889724508297886e-10, "logits/chosen": 1.3194568157196045, "logits/rejected": 2.8764185905456543, "logps/chosen": -561.5572509765625, "logps/rejected": -680.3653564453125, "loss": 0.5165, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.9964187145233154, "rewards/margins": 1.2785086631774902, "rewards/rejected": -4.274927616119385, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.5604413999330013, "train_runtime": 11415.5934, "train_samples_per_second": 4.884, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }