{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 5.356178331285126, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.6583542823791504, "logits/rejected": -2.612396240234375, "logps/chosen": -310.2690124511719, "logps/rejected": -241.6248321533203, "loss": 0.6932, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -4.61353047285229e-05, "rewards/margins": -0.00015705036639701575, "rewards/rejected": 0.00011091506894445047, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 6.4233925318831595, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.691195011138916, "logits/rejected": -2.6153342723846436, "logps/chosen": -293.5455627441406, "logps/rejected": -265.6838684082031, "loss": 0.6924, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.001484546228311956, "rewards/margins": 0.002768759150058031, "rewards/rejected": -0.0012842128053307533, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 5.149124678509347, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6977083683013916, "logits/rejected": -2.63045072555542, "logps/chosen": -277.82159423828125, "logps/rejected": -297.18646240234375, "loss": 0.6892, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004203228745609522, "rewards/margins": 0.009881972335278988, "rewards/rejected": -0.005678744055330753, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 6.002207032235101, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.616579294204712, "logits/rejected": -2.5455870628356934, "logps/chosen": -283.92156982421875, "logps/rejected": -259.82562255859375, "loss": 0.6798, "rewards/accuracies": 0.6875, "rewards/chosen": 0.036965593695640564, "rewards/margins": 0.04610789567232132, "rewards/rejected": -0.009142300114035606, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 5.926817590245787, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.596590518951416, "logits/rejected": -2.512640953063965, "logps/chosen": -285.3323669433594, "logps/rejected": -247.4479522705078, "loss": 0.6687, "rewards/accuracies": 0.625, "rewards/chosen": -0.006985962390899658, "rewards/margins": 0.058415599167346954, "rewards/rejected": -0.06540156155824661, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.607215166091919, "eval_logits/rejected": -2.5074896812438965, "eval_logps/chosen": -286.6437683105469, "eval_logps/rejected": -258.6246032714844, "eval_loss": 0.6559526920318604, "eval_rewards/accuracies": 0.6724137663841248, "eval_rewards/chosen": -0.026378028094768524, "eval_rewards/margins": 0.10339301824569702, "eval_rewards/rejected": -0.12977103888988495, "eval_runtime": 92.1507, "eval_samples_per_second": 19.729, "eval_steps_per_second": 0.315, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 7.494952728753531, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.582334518432617, "logits/rejected": -2.508467197418213, "logps/chosen": -292.1842346191406, "logps/rejected": -282.423583984375, "loss": 0.6423, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11595962941646576, "rewards/margins": 0.1907343566417694, "rewards/rejected": -0.306693971157074, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 18.148816686471342, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.459238052368164, "logits/rejected": -2.3897058963775635, "logps/chosen": -298.2831115722656, "logps/rejected": -273.2386474609375, "loss": 0.6393, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.29933103919029236, "rewards/margins": 0.23945657908916473, "rewards/rejected": -0.5387876629829407, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 12.734144337443169, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.5085086822509766, "logits/rejected": -2.3976407051086426, "logps/chosen": -305.76031494140625, "logps/rejected": -321.8554992675781, "loss": 0.6235, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3573322296142578, "rewards/margins": 0.28428393602371216, "rewards/rejected": -0.6416162252426147, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 14.039079346644037, "learning_rate": 4.832031033425662e-07, "logits/chosen": -1.4997788667678833, "logits/rejected": -1.313194990158081, "logps/chosen": -348.44805908203125, "logps/rejected": -361.76226806640625, "loss": 0.5956, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.44265589118003845, "rewards/margins": 0.4234777092933655, "rewards/rejected": -0.8661335706710815, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 13.29279140070498, "learning_rate": 4.752422169756047e-07, "logits/chosen": -0.19194559752941132, "logits/rejected": 0.2622618079185486, "logps/chosen": -339.16339111328125, "logps/rejected": -359.37176513671875, "loss": 0.581, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.598974347114563, "rewards/margins": 0.4787676930427551, "rewards/rejected": -1.0777419805526733, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": 0.026995467022061348, "eval_logits/rejected": 0.6340460777282715, "eval_logps/chosen": -357.115966796875, "eval_logps/rejected": -377.3665771484375, "eval_loss": 0.5763944387435913, "eval_rewards/accuracies": 0.7155172228813171, "eval_rewards/chosen": -0.7311002016067505, "eval_rewards/margins": 0.5860908627510071, "eval_rewards/rejected": -1.3171910047531128, "eval_runtime": 91.0093, "eval_samples_per_second": 19.976, "eval_steps_per_second": 0.319, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 27.36521925016087, "learning_rate": 4.658354083558188e-07, "logits/chosen": -0.14074298739433289, "logits/rejected": 0.41164666414260864, "logps/chosen": -359.0007019042969, "logps/rejected": -422.62353515625, "loss": 0.5561, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6176259517669678, "rewards/margins": 0.7909212708473206, "rewards/rejected": -1.4085471630096436, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 18.22825267425928, "learning_rate": 4.550430636492389e-07, "logits/chosen": 0.28136759996414185, "logits/rejected": 1.2520945072174072, "logps/chosen": -414.25665283203125, "logps/rejected": -428.6090393066406, "loss": 0.5788, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0384491682052612, "rewards/margins": 0.647238552570343, "rewards/rejected": -1.6856876611709595, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 18.72996488177851, "learning_rate": 4.429344633468004e-07, "logits/chosen": 1.1580041646957397, "logits/rejected": 1.9673328399658203, "logps/chosen": -384.8316650390625, "logps/rejected": -440.20672607421875, "loss": 0.5744, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9717643857002258, "rewards/margins": 0.8623247146606445, "rewards/rejected": -1.8340890407562256, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 18.77533851044078, "learning_rate": 4.2958733752443187e-07, "logits/chosen": 0.9655276536941528, "logits/rejected": 1.986130952835083, "logps/chosen": -377.4757995605469, "logps/rejected": -408.6956481933594, "loss": 0.553, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9340255856513977, "rewards/margins": 0.7136737704277039, "rewards/rejected": -1.6476993560791016, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 22.441752676286086, "learning_rate": 4.150873668617898e-07, "logits/chosen": 1.651755928993225, "logits/rejected": 2.6961984634399414, "logps/chosen": -394.5315856933594, "logps/rejected": -437.6512756347656, "loss": 0.558, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0381582975387573, "rewards/margins": 0.7305435538291931, "rewards/rejected": -1.7687019109725952, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": 2.0827815532684326, "eval_logits/rejected": 3.0035645961761475, "eval_logps/chosen": -404.3199157714844, "eval_logps/rejected": -442.60711669921875, "eval_loss": 0.5509841442108154, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -1.203139305114746, "eval_rewards/margins": 0.7664569616317749, "eval_rewards/rejected": -1.9695963859558105, "eval_runtime": 90.3932, "eval_samples_per_second": 20.112, "eval_steps_per_second": 0.321, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 24.238500011603442, "learning_rate": 3.9952763262280397e-07, "logits/chosen": 1.6490274667739868, "logits/rejected": 2.5100581645965576, "logps/chosen": -409.46240234375, "logps/rejected": -448.33001708984375, "loss": 0.557, "rewards/accuracies": 0.75, "rewards/chosen": -1.1305733919143677, "rewards/margins": 0.8016298413276672, "rewards/rejected": -1.9322032928466797, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 29.076032215796957, "learning_rate": 3.8300801912883414e-07, "logits/chosen": 1.5585577487945557, "logits/rejected": 2.380032777786255, "logps/chosen": -372.0144958496094, "logps/rejected": -400.96905517578125, "loss": 0.5388, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0608928203582764, "rewards/margins": 0.7344645261764526, "rewards/rejected": -1.795357346534729, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 23.777603972721764, "learning_rate": 3.6563457256020884e-07, "logits/chosen": 1.052141785621643, "logits/rejected": 1.8935604095458984, "logps/chosen": -356.8204650878906, "logps/rejected": -432.20001220703125, "loss": 0.5439, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9667918086051941, "rewards/margins": 0.9197394251823425, "rewards/rejected": -1.886531114578247, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 20.231853124698564, "learning_rate": 3.475188202022617e-07, "logits/chosen": 1.569053292274475, "logits/rejected": 2.5012192726135254, "logps/chosen": -349.7216491699219, "logps/rejected": -458.28955078125, "loss": 0.5442, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9105401039123535, "rewards/margins": 1.0454990863800049, "rewards/rejected": -1.9560391902923584, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 20.18742592623794, "learning_rate": 3.287770545059052e-07, "logits/chosen": 2.6468214988708496, "logits/rejected": 3.313246965408325, "logps/chosen": -413.1968688964844, "logps/rejected": -454.881591796875, "loss": 0.5346, "rewards/accuracies": 0.6875, "rewards/chosen": -1.473356008529663, "rewards/margins": 0.712754487991333, "rewards/rejected": -2.186110258102417, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": 1.7577229738235474, "eval_logits/rejected": 2.7758734226226807, "eval_logps/chosen": -400.7710876464844, "eval_logps/rejected": -449.201904296875, "eval_loss": 0.5381261706352234, "eval_rewards/accuracies": 0.7112069129943848, "eval_rewards/chosen": -1.1676514148712158, "eval_rewards/margins": 0.8678924441337585, "eval_rewards/rejected": -2.03554368019104, "eval_runtime": 90.283, "eval_samples_per_second": 20.137, "eval_steps_per_second": 0.321, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 21.096800994630236, "learning_rate": 3.0952958655864954e-07, "logits/chosen": 2.1683189868927, "logits/rejected": 2.6720829010009766, "logps/chosen": -401.7050476074219, "logps/rejected": -487.34161376953125, "loss": 0.5345, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2490909099578857, "rewards/margins": 0.7777953743934631, "rewards/rejected": -2.026886463165283, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 35.955511790614246, "learning_rate": 2.898999737583448e-07, "logits/chosen": 1.9502754211425781, "logits/rejected": 2.887373447418213, "logps/chosen": -407.0714111328125, "logps/rejected": -475.75860595703125, "loss": 0.5405, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3871901035308838, "rewards/margins": 0.8300696611404419, "rewards/rejected": -2.2172598838806152, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 21.81682834473053, "learning_rate": 2.7001422664752333e-07, "logits/chosen": 2.0954604148864746, "logits/rejected": 3.134028673171997, "logps/chosen": -393.80865478515625, "logps/rejected": -481.6973571777344, "loss": 0.535, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1964021921157837, "rewards/margins": 1.084702968597412, "rewards/rejected": -2.281104803085327, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 20.331534801215742, "learning_rate": 2.5e-07, "logits/chosen": 2.4693617820739746, "logits/rejected": 2.7029402256011963, "logps/chosen": -397.209716796875, "logps/rejected": -480.30621337890625, "loss": 0.5634, "rewards/accuracies": 0.6875, "rewards/chosen": -1.274371862411499, "rewards/margins": 0.8711179494857788, "rewards/rejected": -2.1454896926879883, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 21.16814139127329, "learning_rate": 2.2998577335247667e-07, "logits/chosen": 2.334216356277466, "logits/rejected": 3.1122984886169434, "logps/chosen": -399.35968017578125, "logps/rejected": -462.42877197265625, "loss": 0.5391, "rewards/accuracies": 0.6875, "rewards/chosen": -1.344590425491333, "rewards/margins": 0.8345645070075989, "rewards/rejected": -2.179154872894287, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": 1.8166545629501343, "eval_logits/rejected": 2.9561386108398438, "eval_logps/chosen": -392.5903015136719, "eval_logps/rejected": -442.3040771484375, "eval_loss": 0.5333030819892883, "eval_rewards/accuracies": 0.7198275923728943, "eval_rewards/chosen": -1.0858436822891235, "eval_rewards/margins": 0.8807222843170166, "eval_rewards/rejected": -1.9665659666061401, "eval_runtime": 91.6089, "eval_samples_per_second": 19.845, "eval_steps_per_second": 0.317, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 24.05630881187602, "learning_rate": 2.1010002624165524e-07, "logits/chosen": 2.180393934249878, "logits/rejected": 3.2447829246520996, "logps/chosen": -416.7367248535156, "logps/rejected": -477.38671875, "loss": 0.5431, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1423505544662476, "rewards/margins": 1.0397279262542725, "rewards/rejected": -2.1820783615112305, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 16.426211814362816, "learning_rate": 1.9047041344135043e-07, "logits/chosen": 2.4754998683929443, "logits/rejected": 3.3202342987060547, "logps/chosen": -418.9905700683594, "logps/rejected": -466.9713439941406, "loss": 0.5554, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4273664951324463, "rewards/margins": 0.7679312229156494, "rewards/rejected": -2.1952977180480957, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 25.36799111369545, "learning_rate": 1.7122294549409482e-07, "logits/chosen": 2.9461216926574707, "logits/rejected": 3.8612606525421143, "logps/chosen": -443.60198974609375, "logps/rejected": -535.1948852539062, "loss": 0.5313, "rewards/accuracies": 0.71875, "rewards/chosen": -1.662767767906189, "rewards/margins": 0.998543918132782, "rewards/rejected": -2.6613118648529053, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 15.931208067906516, "learning_rate": 1.524811797977383e-07, "logits/chosen": 2.2281603813171387, "logits/rejected": 3.0743608474731445, "logps/chosen": -415.99908447265625, "logps/rejected": -480.72003173828125, "loss": 0.5279, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2590678930282593, "rewards/margins": 0.8066269159317017, "rewards/rejected": -2.065694808959961, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 18.614598999130695, "learning_rate": 1.3436542743979125e-07, "logits/chosen": 2.0644378662109375, "logits/rejected": 3.2977874279022217, "logps/chosen": -393.56756591796875, "logps/rejected": -459.68646240234375, "loss": 0.5479, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0834629535675049, "rewards/margins": 1.0138219594955444, "rewards/rejected": -2.097285032272339, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": 2.0026185512542725, "eval_logits/rejected": 3.223935604095459, "eval_logps/chosen": -388.63787841796875, "eval_logps/rejected": -442.7093200683594, "eval_loss": 0.5265418291091919, "eval_rewards/accuracies": 0.7068965435028076, "eval_rewards/chosen": -1.0463188886642456, "eval_rewards/margins": 0.9242996573448181, "eval_rewards/rejected": -1.970618486404419, "eval_runtime": 90.447, "eval_samples_per_second": 20.1, "eval_steps_per_second": 0.321, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 25.782071483124422, "learning_rate": 1.1699198087116588e-07, "logits/chosen": 2.8770992755889893, "logits/rejected": 3.6848435401916504, "logps/chosen": -387.76580810546875, "logps/rejected": -468.38275146484375, "loss": 0.5499, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.336073875427246, "rewards/margins": 0.9252589344978333, "rewards/rejected": -2.2613327503204346, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 23.531042495765035, "learning_rate": 1.00472367377196e-07, "logits/chosen": 2.587601900100708, "logits/rejected": 3.9543087482452393, "logps/chosen": -440.2958984375, "logps/rejected": -498.0613708496094, "loss": 0.5302, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3440136909484863, "rewards/margins": 1.1366775035858154, "rewards/rejected": -2.4806911945343018, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 22.178841978203927, "learning_rate": 8.49126331382102e-08, "logits/chosen": 2.5279412269592285, "logits/rejected": 3.4965198040008545, "logps/chosen": -422.66168212890625, "logps/rejected": -501.438720703125, "loss": 0.5342, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4393374919891357, "rewards/margins": 0.8559640645980835, "rewards/rejected": -2.295301914215088, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 19.61314237963683, "learning_rate": 7.041266247556812e-08, "logits/chosen": 2.785928726196289, "logits/rejected": 3.915510892868042, "logps/chosen": -388.799072265625, "logps/rejected": -494.65606689453125, "loss": 0.5294, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.265873670578003, "rewards/margins": 1.0917268991470337, "rewards/rejected": -2.357600450515747, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 22.588827480706584, "learning_rate": 5.706553665319955e-08, "logits/chosen": 2.3770060539245605, "logits/rejected": 4.068874835968018, "logps/chosen": -419.5255432128906, "logps/rejected": -510.02911376953125, "loss": 0.5232, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2834579944610596, "rewards/margins": 1.3700745105743408, "rewards/rejected": -2.6535322666168213, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": 2.348414182662964, "eval_logits/rejected": 3.6065878868103027, "eval_logps/chosen": -417.5965881347656, "eval_logps/rejected": -477.5577392578125, "eval_loss": 0.5262271761894226, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -1.3359062671661377, "eval_rewards/margins": 0.9831959009170532, "eval_rewards/rejected": -2.3191022872924805, "eval_runtime": 91.8801, "eval_samples_per_second": 19.787, "eval_steps_per_second": 0.316, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 22.898724036504742, "learning_rate": 4.4956936350761005e-08, "logits/chosen": 2.4756264686584473, "logits/rejected": 3.231902599334717, "logps/chosen": -419.9034118652344, "logps/rejected": -510.82781982421875, "loss": 0.5254, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.401601791381836, "rewards/margins": 1.0482218265533447, "rewards/rejected": -2.4498236179351807, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 21.290872916140614, "learning_rate": 3.416459164418123e-08, "logits/chosen": 1.8261902332305908, "logits/rejected": 3.2766151428222656, "logps/chosen": -459.34906005859375, "logps/rejected": -512.47314453125, "loss": 0.5204, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2751537561416626, "rewards/margins": 1.1164480447769165, "rewards/rejected": -2.391602039337158, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 20.41896976274452, "learning_rate": 2.475778302439524e-08, "logits/chosen": 2.1876559257507324, "logits/rejected": 3.5514347553253174, "logps/chosen": -429.52801513671875, "logps/rejected": -452.6607360839844, "loss": 0.5244, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.362518548965454, "rewards/margins": 0.9127564430236816, "rewards/rejected": -2.2752749919891357, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 20.106111939027084, "learning_rate": 1.6796896657433805e-08, "logits/chosen": 1.5682854652404785, "logits/rejected": 3.198239803314209, "logps/chosen": -423.41143798828125, "logps/rejected": -513.44140625, "loss": 0.5138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1543933153152466, "rewards/margins": 1.4496588706970215, "rewards/rejected": -2.6040520668029785, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 22.36268387575501, "learning_rate": 1.0333036740834855e-08, "logits/chosen": 2.2944397926330566, "logits/rejected": 3.2362308502197266, "logps/chosen": -427.0224609375, "logps/rejected": -509.18438720703125, "loss": 0.5267, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3516565561294556, "rewards/margins": 0.9079391360282898, "rewards/rejected": -2.2595956325531006, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": 1.9855237007141113, "eval_logits/rejected": 3.3069264888763428, "eval_logps/chosen": -402.9078674316406, "eval_logps/rejected": -463.85418701171875, "eval_loss": 0.5237594246864319, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -1.189018964767456, "eval_rewards/margins": 0.9930478930473328, "eval_rewards/rejected": -2.1820664405822754, "eval_runtime": 90.561, "eval_samples_per_second": 20.075, "eval_steps_per_second": 0.32, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 20.20141424383877, "learning_rate": 5.4076974448211685e-09, "logits/chosen": 2.3932690620422363, "logits/rejected": 3.2205722332000732, "logps/chosen": -426.5123596191406, "logps/rejected": -476.37139892578125, "loss": 0.5452, "rewards/accuracies": 0.71875, "rewards/chosen": -1.391105055809021, "rewards/margins": 0.8132905960083008, "rewards/rejected": -2.2043957710266113, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 20.629666257184397, "learning_rate": 2.052496544188487e-09, "logits/chosen": 2.141890048980713, "logits/rejected": 3.76823091506958, "logps/chosen": -436.96722412109375, "logps/rejected": -471.711181640625, "loss": 0.5323, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3215954303741455, "rewards/margins": 1.0597209930419922, "rewards/rejected": -2.381316661834717, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 17.42236283649955, "learning_rate": 2.889724508297886e-10, "logits/chosen": 2.458095073699951, "logits/rejected": 3.361394166946411, "logps/chosen": -389.62994384765625, "logps/rejected": -474.5247497558594, "loss": 0.5251, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2771459817886353, "rewards/margins": 0.9393760561943054, "rewards/rejected": -2.216521739959717, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.5659637576943144, "train_runtime": 11398.0027, "train_samples_per_second": 4.892, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }