sfulay's picture
Model save
a7ae430 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 436,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022935779816513763,
"grad_norm": 5.356178331285126,
"learning_rate": 1.1363636363636363e-07,
"logits/chosen": -2.6583542823791504,
"logits/rejected": -2.612396240234375,
"logps/chosen": -310.2690124511719,
"logps/rejected": -241.6248321533203,
"loss": 0.6932,
"rewards/accuracies": 0.36250001192092896,
"rewards/chosen": -4.61353047285229e-05,
"rewards/margins": -0.00015705036639701575,
"rewards/rejected": 0.00011091506894445047,
"step": 10
},
{
"epoch": 0.045871559633027525,
"grad_norm": 6.4233925318831595,
"learning_rate": 2.2727272727272726e-07,
"logits/chosen": -2.691195011138916,
"logits/rejected": -2.6153342723846436,
"logps/chosen": -293.5455627441406,
"logps/rejected": -265.6838684082031,
"loss": 0.6924,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.001484546228311956,
"rewards/margins": 0.002768759150058031,
"rewards/rejected": -0.0012842128053307533,
"step": 20
},
{
"epoch": 0.06880733944954129,
"grad_norm": 5.149124678509347,
"learning_rate": 3.4090909090909085e-07,
"logits/chosen": -2.6977083683013916,
"logits/rejected": -2.63045072555542,
"logps/chosen": -277.82159423828125,
"logps/rejected": -297.18646240234375,
"loss": 0.6892,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.004203228745609522,
"rewards/margins": 0.009881972335278988,
"rewards/rejected": -0.005678744055330753,
"step": 30
},
{
"epoch": 0.09174311926605505,
"grad_norm": 6.002207032235101,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -2.616579294204712,
"logits/rejected": -2.5455870628356934,
"logps/chosen": -283.92156982421875,
"logps/rejected": -259.82562255859375,
"loss": 0.6798,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.036965593695640564,
"rewards/margins": 0.04610789567232132,
"rewards/rejected": -0.009142300114035606,
"step": 40
},
{
"epoch": 0.11467889908256881,
"grad_norm": 5.926817590245787,
"learning_rate": 4.997110275491701e-07,
"logits/chosen": -2.596590518951416,
"logits/rejected": -2.512640953063965,
"logps/chosen": -285.3323669433594,
"logps/rejected": -247.4479522705078,
"loss": 0.6687,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.006985962390899658,
"rewards/margins": 0.058415599167346954,
"rewards/rejected": -0.06540156155824661,
"step": 50
},
{
"epoch": 0.11467889908256881,
"eval_logits/chosen": -2.607215166091919,
"eval_logits/rejected": -2.5074896812438965,
"eval_logps/chosen": -286.6437683105469,
"eval_logps/rejected": -258.6246032714844,
"eval_loss": 0.6559526920318604,
"eval_rewards/accuracies": 0.6724137663841248,
"eval_rewards/chosen": -0.026378028094768524,
"eval_rewards/margins": 0.10339301824569702,
"eval_rewards/rejected": -0.12977103888988495,
"eval_runtime": 92.1507,
"eval_samples_per_second": 19.729,
"eval_steps_per_second": 0.315,
"step": 50
},
{
"epoch": 0.13761467889908258,
"grad_norm": 7.494952728753531,
"learning_rate": 4.979475034558115e-07,
"logits/chosen": -2.582334518432617,
"logits/rejected": -2.508467197418213,
"logps/chosen": -292.1842346191406,
"logps/rejected": -282.423583984375,
"loss": 0.6423,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.11595962941646576,
"rewards/margins": 0.1907343566417694,
"rewards/rejected": -0.306693971157074,
"step": 60
},
{
"epoch": 0.16055045871559634,
"grad_norm": 18.148816686471342,
"learning_rate": 4.945923025551788e-07,
"logits/chosen": -2.459238052368164,
"logits/rejected": -2.3897058963775635,
"logps/chosen": -298.2831115722656,
"logps/rejected": -273.2386474609375,
"loss": 0.6393,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.29933103919029236,
"rewards/margins": 0.23945657908916473,
"rewards/rejected": -0.5387876629829407,
"step": 70
},
{
"epoch": 0.1834862385321101,
"grad_norm": 12.734144337443169,
"learning_rate": 4.896669632591651e-07,
"logits/chosen": -2.5085086822509766,
"logits/rejected": -2.3976407051086426,
"logps/chosen": -305.76031494140625,
"logps/rejected": -321.8554992675781,
"loss": 0.6235,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3573322296142578,
"rewards/margins": 0.28428393602371216,
"rewards/rejected": -0.6416162252426147,
"step": 80
},
{
"epoch": 0.20642201834862386,
"grad_norm": 14.039079346644037,
"learning_rate": 4.832031033425662e-07,
"logits/chosen": -1.4997788667678833,
"logits/rejected": -1.313194990158081,
"logps/chosen": -348.44805908203125,
"logps/rejected": -361.76226806640625,
"loss": 0.5956,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.44265589118003845,
"rewards/margins": 0.4234777092933655,
"rewards/rejected": -0.8661335706710815,
"step": 90
},
{
"epoch": 0.22935779816513763,
"grad_norm": 13.29279140070498,
"learning_rate": 4.752422169756047e-07,
"logits/chosen": -0.19194559752941132,
"logits/rejected": 0.2622618079185486,
"logps/chosen": -339.16339111328125,
"logps/rejected": -359.37176513671875,
"loss": 0.581,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.598974347114563,
"rewards/margins": 0.4787676930427551,
"rewards/rejected": -1.0777419805526733,
"step": 100
},
{
"epoch": 0.22935779816513763,
"eval_logits/chosen": 0.026995467022061348,
"eval_logits/rejected": 0.6340460777282715,
"eval_logps/chosen": -357.115966796875,
"eval_logps/rejected": -377.3665771484375,
"eval_loss": 0.5763944387435913,
"eval_rewards/accuracies": 0.7155172228813171,
"eval_rewards/chosen": -0.7311002016067505,
"eval_rewards/margins": 0.5860908627510071,
"eval_rewards/rejected": -1.3171910047531128,
"eval_runtime": 91.0093,
"eval_samples_per_second": 19.976,
"eval_steps_per_second": 0.319,
"step": 100
},
{
"epoch": 0.25229357798165136,
"grad_norm": 27.36521925016087,
"learning_rate": 4.658354083558188e-07,
"logits/chosen": -0.14074298739433289,
"logits/rejected": 0.41164666414260864,
"logps/chosen": -359.0007019042969,
"logps/rejected": -422.62353515625,
"loss": 0.5561,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6176259517669678,
"rewards/margins": 0.7909212708473206,
"rewards/rejected": -1.4085471630096436,
"step": 110
},
{
"epoch": 0.27522935779816515,
"grad_norm": 18.22825267425928,
"learning_rate": 4.550430636492389e-07,
"logits/chosen": 0.28136759996414185,
"logits/rejected": 1.2520945072174072,
"logps/chosen": -414.25665283203125,
"logps/rejected": -428.6090393066406,
"loss": 0.5788,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.0384491682052612,
"rewards/margins": 0.647238552570343,
"rewards/rejected": -1.6856876611709595,
"step": 120
},
{
"epoch": 0.2981651376146789,
"grad_norm": 18.72996488177851,
"learning_rate": 4.429344633468004e-07,
"logits/chosen": 1.1580041646957397,
"logits/rejected": 1.9673328399658203,
"logps/chosen": -384.8316650390625,
"logps/rejected": -440.20672607421875,
"loss": 0.5744,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9717643857002258,
"rewards/margins": 0.8623247146606445,
"rewards/rejected": -1.8340890407562256,
"step": 130
},
{
"epoch": 0.3211009174311927,
"grad_norm": 18.77533851044078,
"learning_rate": 4.2958733752443187e-07,
"logits/chosen": 0.9655276536941528,
"logits/rejected": 1.986130952835083,
"logps/chosen": -377.4757995605469,
"logps/rejected": -408.6956481933594,
"loss": 0.553,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9340255856513977,
"rewards/margins": 0.7136737704277039,
"rewards/rejected": -1.6476993560791016,
"step": 140
},
{
"epoch": 0.3440366972477064,
"grad_norm": 22.441752676286086,
"learning_rate": 4.150873668617898e-07,
"logits/chosen": 1.651755928993225,
"logits/rejected": 2.6961984634399414,
"logps/chosen": -394.5315856933594,
"logps/rejected": -437.6512756347656,
"loss": 0.558,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0381582975387573,
"rewards/margins": 0.7305435538291931,
"rewards/rejected": -1.7687019109725952,
"step": 150
},
{
"epoch": 0.3440366972477064,
"eval_logits/chosen": 2.0827815532684326,
"eval_logits/rejected": 3.0035645961761475,
"eval_logps/chosen": -404.3199157714844,
"eval_logps/rejected": -442.60711669921875,
"eval_loss": 0.5509841442108154,
"eval_rewards/accuracies": 0.7241379022598267,
"eval_rewards/chosen": -1.203139305114746,
"eval_rewards/margins": 0.7664569616317749,
"eval_rewards/rejected": -1.9695963859558105,
"eval_runtime": 90.3932,
"eval_samples_per_second": 20.112,
"eval_steps_per_second": 0.321,
"step": 150
},
{
"epoch": 0.3669724770642202,
"grad_norm": 24.238500011603442,
"learning_rate": 3.9952763262280397e-07,
"logits/chosen": 1.6490274667739868,
"logits/rejected": 2.5100581645965576,
"logps/chosen": -409.46240234375,
"logps/rejected": -448.33001708984375,
"loss": 0.557,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1305733919143677,
"rewards/margins": 0.8016298413276672,
"rewards/rejected": -1.9322032928466797,
"step": 160
},
{
"epoch": 0.38990825688073394,
"grad_norm": 29.076032215796957,
"learning_rate": 3.8300801912883414e-07,
"logits/chosen": 1.5585577487945557,
"logits/rejected": 2.380032777786255,
"logps/chosen": -372.0144958496094,
"logps/rejected": -400.96905517578125,
"loss": 0.5388,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.0608928203582764,
"rewards/margins": 0.7344645261764526,
"rewards/rejected": -1.795357346534729,
"step": 170
},
{
"epoch": 0.41284403669724773,
"grad_norm": 23.777603972721764,
"learning_rate": 3.6563457256020884e-07,
"logits/chosen": 1.052141785621643,
"logits/rejected": 1.8935604095458984,
"logps/chosen": -356.8204650878906,
"logps/rejected": -432.20001220703125,
"loss": 0.5439,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9667918086051941,
"rewards/margins": 0.9197394251823425,
"rewards/rejected": -1.886531114578247,
"step": 180
},
{
"epoch": 0.43577981651376146,
"grad_norm": 20.231853124698564,
"learning_rate": 3.475188202022617e-07,
"logits/chosen": 1.569053292274475,
"logits/rejected": 2.5012192726135254,
"logps/chosen": -349.7216491699219,
"logps/rejected": -458.28955078125,
"loss": 0.5442,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.9105401039123535,
"rewards/margins": 1.0454990863800049,
"rewards/rejected": -1.9560391902923584,
"step": 190
},
{
"epoch": 0.45871559633027525,
"grad_norm": 20.18742592623794,
"learning_rate": 3.287770545059052e-07,
"logits/chosen": 2.6468214988708496,
"logits/rejected": 3.313246965408325,
"logps/chosen": -413.1968688964844,
"logps/rejected": -454.881591796875,
"loss": 0.5346,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.473356008529663,
"rewards/margins": 0.712754487991333,
"rewards/rejected": -2.186110258102417,
"step": 200
},
{
"epoch": 0.45871559633027525,
"eval_logits/chosen": 1.7577229738235474,
"eval_logits/rejected": 2.7758734226226807,
"eval_logps/chosen": -400.7710876464844,
"eval_logps/rejected": -449.201904296875,
"eval_loss": 0.5381261706352234,
"eval_rewards/accuracies": 0.7112069129943848,
"eval_rewards/chosen": -1.1676514148712158,
"eval_rewards/margins": 0.8678924441337585,
"eval_rewards/rejected": -2.03554368019104,
"eval_runtime": 90.283,
"eval_samples_per_second": 20.137,
"eval_steps_per_second": 0.321,
"step": 200
},
{
"epoch": 0.481651376146789,
"grad_norm": 21.096800994630236,
"learning_rate": 3.0952958655864954e-07,
"logits/chosen": 2.1683189868927,
"logits/rejected": 2.6720829010009766,
"logps/chosen": -401.7050476074219,
"logps/rejected": -487.34161376953125,
"loss": 0.5345,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2490909099578857,
"rewards/margins": 0.7777953743934631,
"rewards/rejected": -2.026886463165283,
"step": 210
},
{
"epoch": 0.5045871559633027,
"grad_norm": 35.955511790614246,
"learning_rate": 2.898999737583448e-07,
"logits/chosen": 1.9502754211425781,
"logits/rejected": 2.887373447418213,
"logps/chosen": -407.0714111328125,
"logps/rejected": -475.75860595703125,
"loss": 0.5405,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.3871901035308838,
"rewards/margins": 0.8300696611404419,
"rewards/rejected": -2.2172598838806152,
"step": 220
},
{
"epoch": 0.5275229357798165,
"grad_norm": 21.81682834473053,
"learning_rate": 2.7001422664752333e-07,
"logits/chosen": 2.0954604148864746,
"logits/rejected": 3.134028673171997,
"logps/chosen": -393.80865478515625,
"logps/rejected": -481.6973571777344,
"loss": 0.535,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1964021921157837,
"rewards/margins": 1.084702968597412,
"rewards/rejected": -2.281104803085327,
"step": 230
},
{
"epoch": 0.5504587155963303,
"grad_norm": 20.331534801215742,
"learning_rate": 2.5e-07,
"logits/chosen": 2.4693617820739746,
"logits/rejected": 2.7029402256011963,
"logps/chosen": -397.209716796875,
"logps/rejected": -480.30621337890625,
"loss": 0.5634,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.274371862411499,
"rewards/margins": 0.8711179494857788,
"rewards/rejected": -2.1454896926879883,
"step": 240
},
{
"epoch": 0.573394495412844,
"grad_norm": 21.16814139127329,
"learning_rate": 2.2998577335247667e-07,
"logits/chosen": 2.334216356277466,
"logits/rejected": 3.1122984886169434,
"logps/chosen": -399.35968017578125,
"logps/rejected": -462.42877197265625,
"loss": 0.5391,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.344590425491333,
"rewards/margins": 0.8345645070075989,
"rewards/rejected": -2.179154872894287,
"step": 250
},
{
"epoch": 0.573394495412844,
"eval_logits/chosen": 1.8166545629501343,
"eval_logits/rejected": 2.9561386108398438,
"eval_logps/chosen": -392.5903015136719,
"eval_logps/rejected": -442.3040771484375,
"eval_loss": 0.5333030819892883,
"eval_rewards/accuracies": 0.7198275923728943,
"eval_rewards/chosen": -1.0858436822891235,
"eval_rewards/margins": 0.8807222843170166,
"eval_rewards/rejected": -1.9665659666061401,
"eval_runtime": 91.6089,
"eval_samples_per_second": 19.845,
"eval_steps_per_second": 0.317,
"step": 250
},
{
"epoch": 0.5963302752293578,
"grad_norm": 24.05630881187602,
"learning_rate": 2.1010002624165524e-07,
"logits/chosen": 2.180393934249878,
"logits/rejected": 3.2447829246520996,
"logps/chosen": -416.7367248535156,
"logps/rejected": -477.38671875,
"loss": 0.5431,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.1423505544662476,
"rewards/margins": 1.0397279262542725,
"rewards/rejected": -2.1820783615112305,
"step": 260
},
{
"epoch": 0.6192660550458715,
"grad_norm": 16.426211814362816,
"learning_rate": 1.9047041344135043e-07,
"logits/chosen": 2.4754998683929443,
"logits/rejected": 3.3202342987060547,
"logps/chosen": -418.9905700683594,
"logps/rejected": -466.9713439941406,
"loss": 0.5554,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.4273664951324463,
"rewards/margins": 0.7679312229156494,
"rewards/rejected": -2.1952977180480957,
"step": 270
},
{
"epoch": 0.6422018348623854,
"grad_norm": 25.36799111369545,
"learning_rate": 1.7122294549409482e-07,
"logits/chosen": 2.9461216926574707,
"logits/rejected": 3.8612606525421143,
"logps/chosen": -443.60198974609375,
"logps/rejected": -535.1948852539062,
"loss": 0.5313,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.662767767906189,
"rewards/margins": 0.998543918132782,
"rewards/rejected": -2.6613118648529053,
"step": 280
},
{
"epoch": 0.6651376146788991,
"grad_norm": 15.931208067906516,
"learning_rate": 1.524811797977383e-07,
"logits/chosen": 2.2281603813171387,
"logits/rejected": 3.0743608474731445,
"logps/chosen": -415.99908447265625,
"logps/rejected": -480.72003173828125,
"loss": 0.5279,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2590678930282593,
"rewards/margins": 0.8066269159317017,
"rewards/rejected": -2.065694808959961,
"step": 290
},
{
"epoch": 0.6880733944954128,
"grad_norm": 18.614598999130695,
"learning_rate": 1.3436542743979125e-07,
"logits/chosen": 2.0644378662109375,
"logits/rejected": 3.2977874279022217,
"logps/chosen": -393.56756591796875,
"logps/rejected": -459.68646240234375,
"loss": 0.5479,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.0834629535675049,
"rewards/margins": 1.0138219594955444,
"rewards/rejected": -2.097285032272339,
"step": 300
},
{
"epoch": 0.6880733944954128,
"eval_logits/chosen": 2.0026185512542725,
"eval_logits/rejected": 3.223935604095459,
"eval_logps/chosen": -388.63787841796875,
"eval_logps/rejected": -442.7093200683594,
"eval_loss": 0.5265418291091919,
"eval_rewards/accuracies": 0.7068965435028076,
"eval_rewards/chosen": -1.0463188886642456,
"eval_rewards/margins": 0.9242996573448181,
"eval_rewards/rejected": -1.970618486404419,
"eval_runtime": 90.447,
"eval_samples_per_second": 20.1,
"eval_steps_per_second": 0.321,
"step": 300
},
{
"epoch": 0.7110091743119266,
"grad_norm": 25.782071483124422,
"learning_rate": 1.1699198087116588e-07,
"logits/chosen": 2.8770992755889893,
"logits/rejected": 3.6848435401916504,
"logps/chosen": -387.76580810546875,
"logps/rejected": -468.38275146484375,
"loss": 0.5499,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.336073875427246,
"rewards/margins": 0.9252589344978333,
"rewards/rejected": -2.2613327503204346,
"step": 310
},
{
"epoch": 0.7339449541284404,
"grad_norm": 23.531042495765035,
"learning_rate": 1.00472367377196e-07,
"logits/chosen": 2.587601900100708,
"logits/rejected": 3.9543087482452393,
"logps/chosen": -440.2958984375,
"logps/rejected": -498.0613708496094,
"loss": 0.5302,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3440136909484863,
"rewards/margins": 1.1366775035858154,
"rewards/rejected": -2.4806911945343018,
"step": 320
},
{
"epoch": 0.7568807339449541,
"grad_norm": 22.178841978203927,
"learning_rate": 8.49126331382102e-08,
"logits/chosen": 2.5279412269592285,
"logits/rejected": 3.4965198040008545,
"logps/chosen": -422.66168212890625,
"logps/rejected": -501.438720703125,
"loss": 0.5342,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4393374919891357,
"rewards/margins": 0.8559640645980835,
"rewards/rejected": -2.295301914215088,
"step": 330
},
{
"epoch": 0.7798165137614679,
"grad_norm": 19.61314237963683,
"learning_rate": 7.041266247556812e-08,
"logits/chosen": 2.785928726196289,
"logits/rejected": 3.915510892868042,
"logps/chosen": -388.799072265625,
"logps/rejected": -494.65606689453125,
"loss": 0.5294,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.265873670578003,
"rewards/margins": 1.0917268991470337,
"rewards/rejected": -2.357600450515747,
"step": 340
},
{
"epoch": 0.8027522935779816,
"grad_norm": 22.588827480706584,
"learning_rate": 5.706553665319955e-08,
"logits/chosen": 2.3770060539245605,
"logits/rejected": 4.068874835968018,
"logps/chosen": -419.5255432128906,
"logps/rejected": -510.02911376953125,
"loss": 0.5232,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2834579944610596,
"rewards/margins": 1.3700745105743408,
"rewards/rejected": -2.6535322666168213,
"step": 350
},
{
"epoch": 0.8027522935779816,
"eval_logits/chosen": 2.348414182662964,
"eval_logits/rejected": 3.6065878868103027,
"eval_logps/chosen": -417.5965881347656,
"eval_logps/rejected": -477.5577392578125,
"eval_loss": 0.5262271761894226,
"eval_rewards/accuracies": 0.7241379022598267,
"eval_rewards/chosen": -1.3359062671661377,
"eval_rewards/margins": 0.9831959009170532,
"eval_rewards/rejected": -2.3191022872924805,
"eval_runtime": 91.8801,
"eval_samples_per_second": 19.787,
"eval_steps_per_second": 0.316,
"step": 350
},
{
"epoch": 0.8256880733944955,
"grad_norm": 22.898724036504742,
"learning_rate": 4.4956936350761005e-08,
"logits/chosen": 2.4756264686584473,
"logits/rejected": 3.231902599334717,
"logps/chosen": -419.9034118652344,
"logps/rejected": -510.82781982421875,
"loss": 0.5254,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.401601791381836,
"rewards/margins": 1.0482218265533447,
"rewards/rejected": -2.4498236179351807,
"step": 360
},
{
"epoch": 0.8486238532110092,
"grad_norm": 21.290872916140614,
"learning_rate": 3.416459164418123e-08,
"logits/chosen": 1.8261902332305908,
"logits/rejected": 3.2766151428222656,
"logps/chosen": -459.34906005859375,
"logps/rejected": -512.47314453125,
"loss": 0.5204,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.2751537561416626,
"rewards/margins": 1.1164480447769165,
"rewards/rejected": -2.391602039337158,
"step": 370
},
{
"epoch": 0.8715596330275229,
"grad_norm": 20.41896976274452,
"learning_rate": 2.475778302439524e-08,
"logits/chosen": 2.1876559257507324,
"logits/rejected": 3.5514347553253174,
"logps/chosen": -429.52801513671875,
"logps/rejected": -452.6607360839844,
"loss": 0.5244,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.362518548965454,
"rewards/margins": 0.9127564430236816,
"rewards/rejected": -2.2752749919891357,
"step": 380
},
{
"epoch": 0.8944954128440367,
"grad_norm": 20.106111939027084,
"learning_rate": 1.6796896657433805e-08,
"logits/chosen": 1.5682854652404785,
"logits/rejected": 3.198239803314209,
"logps/chosen": -423.41143798828125,
"logps/rejected": -513.44140625,
"loss": 0.5138,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1543933153152466,
"rewards/margins": 1.4496588706970215,
"rewards/rejected": -2.6040520668029785,
"step": 390
},
{
"epoch": 0.9174311926605505,
"grad_norm": 22.36268387575501,
"learning_rate": 1.0333036740834855e-08,
"logits/chosen": 2.2944397926330566,
"logits/rejected": 3.2362308502197266,
"logps/chosen": -427.0224609375,
"logps/rejected": -509.18438720703125,
"loss": 0.5267,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.3516565561294556,
"rewards/margins": 0.9079391360282898,
"rewards/rejected": -2.2595956325531006,
"step": 400
},
{
"epoch": 0.9174311926605505,
"eval_logits/chosen": 1.9855237007141113,
"eval_logits/rejected": 3.3069264888763428,
"eval_logps/chosen": -402.9078674316406,
"eval_logps/rejected": -463.85418701171875,
"eval_loss": 0.5237594246864319,
"eval_rewards/accuracies": 0.7241379022598267,
"eval_rewards/chosen": -1.189018964767456,
"eval_rewards/margins": 0.9930478930473328,
"eval_rewards/rejected": -2.1820664405822754,
"eval_runtime": 90.561,
"eval_samples_per_second": 20.075,
"eval_steps_per_second": 0.32,
"step": 400
},
{
"epoch": 0.9403669724770642,
"grad_norm": 20.20141424383877,
"learning_rate": 5.4076974448211685e-09,
"logits/chosen": 2.3932690620422363,
"logits/rejected": 3.2205722332000732,
"logps/chosen": -426.5123596191406,
"logps/rejected": -476.37139892578125,
"loss": 0.5452,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.391105055809021,
"rewards/margins": 0.8132905960083008,
"rewards/rejected": -2.2043957710266113,
"step": 410
},
{
"epoch": 0.963302752293578,
"grad_norm": 20.629666257184397,
"learning_rate": 2.052496544188487e-09,
"logits/chosen": 2.141890048980713,
"logits/rejected": 3.76823091506958,
"logps/chosen": -436.96722412109375,
"logps/rejected": -471.711181640625,
"loss": 0.5323,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.3215954303741455,
"rewards/margins": 1.0597209930419922,
"rewards/rejected": -2.381316661834717,
"step": 420
},
{
"epoch": 0.9862385321100917,
"grad_norm": 17.42236283649955,
"learning_rate": 2.889724508297886e-10,
"logits/chosen": 2.458095073699951,
"logits/rejected": 3.361394166946411,
"logps/chosen": -389.62994384765625,
"logps/rejected": -474.5247497558594,
"loss": 0.5251,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2771459817886353,
"rewards/margins": 0.9393760561943054,
"rewards/rejected": -2.216521739959717,
"step": 430
},
{
"epoch": 1.0,
"step": 436,
"total_flos": 0.0,
"train_loss": 0.5659637576943144,
"train_runtime": 11398.0027,
"train_samples_per_second": 4.892,
"train_steps_per_second": 0.038
}
],
"logging_steps": 10,
"max_steps": 436,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}