mamba_0_75_dpo_ep1 / trainer_state.json
Junxiong Wang
add models
6b7d8bd
raw
history blame
103 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 2000,
"global_step": 1911,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005232862375719519,
"grad_norm": 8.940283023609332,
"learning_rate": 2.6041666666666664e-09,
"logits/chosen": -3.4411821365356445,
"logits/rejected": -3.41083025932312,
"logps/chosen": -501.4610595703125,
"logps/rejected": -596.95849609375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0052328623757195184,
"grad_norm": 8.689551063328278,
"learning_rate": 2.6041666666666667e-08,
"logits/chosen": -3.0671932697296143,
"logits/rejected": -3.0745370388031006,
"logps/chosen": -335.75750732421875,
"logps/rejected": -280.19635009765625,
"loss": 0.6931,
"rewards/accuracies": 0.4166666567325592,
"rewards/chosen": -0.000958989083301276,
"rewards/margins": -0.0004560473607853055,
"rewards/rejected": -0.0005029416061006486,
"step": 10
},
{
"epoch": 0.010465724751439037,
"grad_norm": 8.212994986770337,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -3.061262845993042,
"logits/rejected": -3.061492443084717,
"logps/chosen": -226.217529296875,
"logps/rejected": -215.25961303710938,
"loss": 0.6931,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.00020114154904149473,
"rewards/margins": -0.0007163770496845245,
"rewards/rejected": 0.0005152354133315384,
"step": 20
},
{
"epoch": 0.015698587127158554,
"grad_norm": 8.003664804799678,
"learning_rate": 7.812499999999999e-08,
"logits/chosen": -2.958186388015747,
"logits/rejected": -2.946017026901245,
"logps/chosen": -300.97979736328125,
"logps/rejected": -276.68634033203125,
"loss": 0.6932,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0008273817365989089,
"rewards/margins": 0.001302235876210034,
"rewards/rejected": -0.00047485390678048134,
"step": 30
},
{
"epoch": 0.020931449502878074,
"grad_norm": 8.325071471418028,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -3.1442675590515137,
"logits/rejected": -3.0619583129882812,
"logps/chosen": -316.81396484375,
"logps/rejected": -308.1053771972656,
"loss": 0.6929,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0002774396270979196,
"rewards/margins": -0.0006835688254795969,
"rewards/rejected": 0.000406129052862525,
"step": 40
},
{
"epoch": 0.026164311878597593,
"grad_norm": 8.866152231216553,
"learning_rate": 1.3020833333333334e-07,
"logits/chosen": -3.1651201248168945,
"logits/rejected": -3.0696120262145996,
"logps/chosen": -297.9878845214844,
"logps/rejected": -271.53094482421875,
"loss": 0.6931,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00013881332415621728,
"rewards/margins": -0.0007885316153988242,
"rewards/rejected": 0.0006497182184830308,
"step": 50
},
{
"epoch": 0.03139717425431711,
"grad_norm": 8.702138127987917,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -3.0590338706970215,
"logits/rejected": -3.033952236175537,
"logps/chosen": -271.07489013671875,
"logps/rejected": -266.8861389160156,
"loss": 0.6927,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.0012097412254661322,
"rewards/margins": -0.0009210550342686474,
"rewards/rejected": -0.00028868610388599336,
"step": 60
},
{
"epoch": 0.03663003663003663,
"grad_norm": 9.17847945114856,
"learning_rate": 1.8229166666666666e-07,
"logits/chosen": -3.159991979598999,
"logits/rejected": -3.108083963394165,
"logps/chosen": -331.7613830566406,
"logps/rejected": -266.53192138671875,
"loss": 0.6927,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0022141693625599146,
"rewards/margins": 0.004044364206492901,
"rewards/rejected": -0.0018301953095942736,
"step": 70
},
{
"epoch": 0.04186289900575615,
"grad_norm": 8.70085934375087,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -3.105524778366089,
"logits/rejected": -3.0857417583465576,
"logps/chosen": -309.269287109375,
"logps/rejected": -304.593505859375,
"loss": 0.6923,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0005593494279310107,
"rewards/margins": 0.0006081314058974385,
"rewards/rejected": -0.0011674808338284492,
"step": 80
},
{
"epoch": 0.04709576138147567,
"grad_norm": 7.70370023214065,
"learning_rate": 2.3437499999999998e-07,
"logits/chosen": -3.1123859882354736,
"logits/rejected": -3.0873000621795654,
"logps/chosen": -244.9102783203125,
"logps/rejected": -213.2863311767578,
"loss": 0.6921,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.003876964095979929,
"rewards/margins": 0.003002858255058527,
"rewards/rejected": 0.0008741060155443847,
"step": 90
},
{
"epoch": 0.052328623757195186,
"grad_norm": 7.584430634088357,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -3.119906425476074,
"logits/rejected": -3.0290744304656982,
"logps/chosen": -222.6891326904297,
"logps/rejected": -199.02757263183594,
"loss": 0.6913,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0010819355957210064,
"rewards/margins": 0.0032298602163791656,
"rewards/rejected": -0.0021479243878275156,
"step": 100
},
{
"epoch": 0.0575614861329147,
"grad_norm": 8.205907431125933,
"learning_rate": 2.864583333333333e-07,
"logits/chosen": -3.0188896656036377,
"logits/rejected": -2.9393975734710693,
"logps/chosen": -261.2885437011719,
"logps/rejected": -197.14413452148438,
"loss": 0.69,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.00237687723711133,
"rewards/margins": 0.006252645049244165,
"rewards/rejected": -0.003875765949487686,
"step": 110
},
{
"epoch": 0.06279434850863422,
"grad_norm": 8.123874667175091,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -3.150888681411743,
"logits/rejected": -3.160604953765869,
"logps/chosen": -369.46343994140625,
"logps/rejected": -341.2452392578125,
"loss": 0.69,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.002890329109504819,
"rewards/margins": 0.0010580271482467651,
"rewards/rejected": 0.0018323017284274101,
"step": 120
},
{
"epoch": 0.06802721088435375,
"grad_norm": 7.876000482657188,
"learning_rate": 3.3854166666666667e-07,
"logits/chosen": -3.0063118934631348,
"logits/rejected": -3.01861834526062,
"logps/chosen": -227.5029296875,
"logps/rejected": -238.10140991210938,
"loss": 0.6884,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.004924282897263765,
"rewards/margins": 0.012934369035065174,
"rewards/rejected": -0.008010086603462696,
"step": 130
},
{
"epoch": 0.07326007326007326,
"grad_norm": 8.190955614521185,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -2.970432996749878,
"logits/rejected": -2.9969277381896973,
"logps/chosen": -298.6478576660156,
"logps/rejected": -310.3978271484375,
"loss": 0.6858,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.009556067176163197,
"rewards/margins": 0.02303231693804264,
"rewards/rejected": -0.013476249761879444,
"step": 140
},
{
"epoch": 0.07849293563579278,
"grad_norm": 9.052012593627612,
"learning_rate": 3.9062499999999997e-07,
"logits/chosen": -3.103742837905884,
"logits/rejected": -3.0245158672332764,
"logps/chosen": -263.4891662597656,
"logps/rejected": -248.58151245117188,
"loss": 0.6844,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.002595582278445363,
"rewards/margins": 0.0049351779744029045,
"rewards/rejected": -0.007530760020017624,
"step": 150
},
{
"epoch": 0.0837257980115123,
"grad_norm": 8.432630883076405,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -3.114664077758789,
"logits/rejected": -3.012159824371338,
"logps/chosen": -262.4814453125,
"logps/rejected": -236.3439178466797,
"loss": 0.6837,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.009164141491055489,
"rewards/margins": 0.02413203939795494,
"rewards/rejected": -0.014967897906899452,
"step": 160
},
{
"epoch": 0.08895866038723181,
"grad_norm": 9.124712051546432,
"learning_rate": 4.427083333333333e-07,
"logits/chosen": -3.08809494972229,
"logits/rejected": -3.0950043201446533,
"logps/chosen": -232.0874481201172,
"logps/rejected": -265.0681457519531,
"loss": 0.6811,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.01028988417237997,
"rewards/margins": 0.015653502196073532,
"rewards/rejected": -0.005363619886338711,
"step": 170
},
{
"epoch": 0.09419152276295134,
"grad_norm": 7.433236173747129,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -3.065229892730713,
"logits/rejected": -2.972902774810791,
"logps/chosen": -295.44805908203125,
"logps/rejected": -269.96026611328125,
"loss": 0.6817,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.028342243283987045,
"rewards/margins": 0.017250288277864456,
"rewards/rejected": -0.0455925352871418,
"step": 180
},
{
"epoch": 0.09942438513867086,
"grad_norm": 9.490780191701429,
"learning_rate": 4.947916666666667e-07,
"logits/chosen": -3.0910661220550537,
"logits/rejected": -3.00136137008667,
"logps/chosen": -304.7739562988281,
"logps/rejected": -260.3194274902344,
"loss": 0.6741,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0017203543102368712,
"rewards/margins": 0.03843419998884201,
"rewards/rejected": -0.036713846027851105,
"step": 190
},
{
"epoch": 0.10465724751439037,
"grad_norm": 8.139099488973592,
"learning_rate": 4.999732803821339e-07,
"logits/chosen": -2.9885010719299316,
"logits/rejected": -2.911945104598999,
"logps/chosen": -274.64801025390625,
"logps/rejected": -299.32476806640625,
"loss": 0.6711,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.015315435826778412,
"rewards/margins": 0.0384557843208313,
"rewards/rejected": -0.05377122014760971,
"step": 200
},
{
"epoch": 0.10989010989010989,
"grad_norm": 8.597620117357772,
"learning_rate": 4.998647417232375e-07,
"logits/chosen": -3.049499750137329,
"logits/rejected": -2.9921631813049316,
"logps/chosen": -214.9530487060547,
"logps/rejected": -198.6853485107422,
"loss": 0.6828,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.011373547837138176,
"rewards/margins": 0.011143224313855171,
"rewards/rejected": -0.022516775876283646,
"step": 210
},
{
"epoch": 0.1151229722658294,
"grad_norm": 7.814974338489718,
"learning_rate": 4.996727502703357e-07,
"logits/chosen": -3.0792040824890137,
"logits/rejected": -3.0471749305725098,
"logps/chosen": -280.1625671386719,
"logps/rejected": -250.3578643798828,
"loss": 0.6749,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.013256365433335304,
"rewards/margins": 0.07305942475795746,
"rewards/rejected": -0.0598030686378479,
"step": 220
},
{
"epoch": 0.12035583464154893,
"grad_norm": 8.021422865019257,
"learning_rate": 4.993973701470142e-07,
"logits/chosen": -3.0776336193084717,
"logits/rejected": -3.065192461013794,
"logps/chosen": -246.03701782226562,
"logps/rejected": -330.24395751953125,
"loss": 0.6617,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.004228830337524414,
"rewards/margins": 0.06871353834867477,
"rewards/rejected": -0.07294236868619919,
"step": 230
},
{
"epoch": 0.12558869701726844,
"grad_norm": 8.830102325688348,
"learning_rate": 4.990386933279972e-07,
"logits/chosen": -3.057614326477051,
"logits/rejected": -2.985898971557617,
"logps/chosen": -237.7249755859375,
"logps/rejected": -227.34384155273438,
"loss": 0.6631,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.05727599188685417,
"rewards/margins": 0.04271426051855087,
"rewards/rejected": -0.09999025613069534,
"step": 240
},
{
"epoch": 0.13082155939298795,
"grad_norm": 9.25379873829916,
"learning_rate": 4.985968396084284e-07,
"logits/chosen": -2.9885993003845215,
"logits/rejected": -2.986743688583374,
"logps/chosen": -295.55853271484375,
"logps/rejected": -264.4269104003906,
"loss": 0.6579,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.047038041055202484,
"rewards/margins": 0.10233273357152939,
"rewards/rejected": -0.14937077462673187,
"step": 250
},
{
"epoch": 0.1360544217687075,
"grad_norm": 10.58421450887609,
"learning_rate": 4.98071956563861e-07,
"logits/chosen": -3.092935085296631,
"logits/rejected": -3.026994466781616,
"logps/chosen": -293.06158447265625,
"logps/rejected": -282.7720642089844,
"loss": 0.6588,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.01732378825545311,
"rewards/margins": 0.11920974403619766,
"rewards/rejected": -0.13653352856636047,
"step": 260
},
{
"epoch": 0.141287284144427,
"grad_norm": 7.811018005032728,
"learning_rate": 4.97464219500968e-07,
"logits/chosen": -3.0531859397888184,
"logits/rejected": -2.9968655109405518,
"logps/chosen": -283.08465576171875,
"logps/rejected": -289.63763427734375,
"loss": 0.6429,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.10270702838897705,
"rewards/margins": 0.05593450739979744,
"rewards/rejected": -0.1586415320634842,
"step": 270
},
{
"epoch": 0.14652014652014653,
"grad_norm": 9.605514735660224,
"learning_rate": 4.967738313989918e-07,
"logits/chosen": -3.0212960243225098,
"logits/rejected": -3.0366883277893066,
"logps/chosen": -314.5849914550781,
"logps/rejected": -308.0143127441406,
"loss": 0.6574,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.0773221030831337,
"rewards/margins": 0.13322624564170837,
"rewards/rejected": -0.21054835617542267,
"step": 280
},
{
"epoch": 0.15175300889586604,
"grad_norm": 8.389574929032044,
"learning_rate": 4.960010228419499e-07,
"logits/chosen": -3.1096034049987793,
"logits/rejected": -3.0065712928771973,
"logps/chosen": -336.0714416503906,
"logps/rejected": -258.11334228515625,
"loss": 0.6613,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.12762203812599182,
"rewards/margins": 0.07918674498796463,
"rewards/rejected": -0.20680880546569824,
"step": 290
},
{
"epoch": 0.15698587127158556,
"grad_norm": 8.7670335055176,
"learning_rate": 4.951460519416227e-07,
"logits/chosen": -3.085927963256836,
"logits/rejected": -3.0476162433624268,
"logps/chosen": -332.5134582519531,
"logps/rejected": -279.91424560546875,
"loss": 0.6507,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.18624143302440643,
"rewards/margins": 0.0757715106010437,
"rewards/rejected": -0.26201292872428894,
"step": 300
},
{
"epoch": 0.16221873364730507,
"grad_norm": 8.663994541963719,
"learning_rate": 4.942092042513458e-07,
"logits/chosen": -3.14369535446167,
"logits/rejected": -3.0388572216033936,
"logps/chosen": -326.9494934082031,
"logps/rejected": -318.8394470214844,
"loss": 0.6461,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.05960199981927872,
"rewards/margins": 0.17124859988689423,
"rewards/rejected": -0.23085062205791473,
"step": 310
},
{
"epoch": 0.1674515960230246,
"grad_norm": 10.214098303939142,
"learning_rate": 4.931907926706373e-07,
"logits/chosen": -3.1341705322265625,
"logits/rejected": -3.0006356239318848,
"logps/chosen": -352.4683837890625,
"logps/rejected": -254.84872436523438,
"loss": 0.6359,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.207993745803833,
"rewards/margins": 0.12400402128696442,
"rewards/rejected": -0.3319977819919586,
"step": 320
},
{
"epoch": 0.1726844583987441,
"grad_norm": 11.9926488132874,
"learning_rate": 4.920911573406924e-07,
"logits/chosen": -2.988920211791992,
"logits/rejected": -2.8621726036071777,
"logps/chosen": -270.1966247558594,
"logps/rejected": -231.31307983398438,
"loss": 0.6416,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.17856433987617493,
"rewards/margins": 0.1818085014820099,
"rewards/rejected": -0.3603728413581848,
"step": 330
},
{
"epoch": 0.17791732077446362,
"grad_norm": 11.181487800996836,
"learning_rate": 4.909106655307787e-07,
"logits/chosen": -3.071873188018799,
"logits/rejected": -3.073513984680176,
"logps/chosen": -300.5464172363281,
"logps/rejected": -336.34735107421875,
"loss": 0.6082,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2116507738828659,
"rewards/margins": 0.15836475789546967,
"rewards/rejected": -0.37001553177833557,
"step": 340
},
{
"epoch": 0.18315018315018314,
"grad_norm": 11.847042665159117,
"learning_rate": 4.896497115155709e-07,
"logits/chosen": -2.988060235977173,
"logits/rejected": -3.0429558753967285,
"logps/chosen": -211.3376007080078,
"logps/rejected": -275.08087158203125,
"loss": 0.6017,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2585861086845398,
"rewards/margins": 0.3116615116596222,
"rewards/rejected": -0.5702476501464844,
"step": 350
},
{
"epoch": 0.18838304552590268,
"grad_norm": 10.78945471485412,
"learning_rate": 4.883087164434672e-07,
"logits/chosen": -3.053856372833252,
"logits/rejected": -2.9947004318237305,
"logps/chosen": -300.0679931640625,
"logps/rejected": -320.3852233886719,
"loss": 0.6157,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.35630059242248535,
"rewards/margins": 0.1511966437101364,
"rewards/rejected": -0.5074971914291382,
"step": 360
},
{
"epoch": 0.1936159079016222,
"grad_norm": 12.222844062228763,
"learning_rate": 4.868881281959282e-07,
"logits/chosen": -3.011864423751831,
"logits/rejected": -2.965883255004883,
"logps/chosen": -337.5576171875,
"logps/rejected": -347.80572509765625,
"loss": 0.6028,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4423811435699463,
"rewards/margins": 0.4099253714084625,
"rewards/rejected": -0.8523064851760864,
"step": 370
},
{
"epoch": 0.1988487702773417,
"grad_norm": 13.768587447328201,
"learning_rate": 4.853884212378889e-07,
"logits/chosen": -2.873631715774536,
"logits/rejected": -2.939685344696045,
"logps/chosen": -235.6674346923828,
"logps/rejected": -387.4900817871094,
"loss": 0.5963,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.33242642879486084,
"rewards/margins": 0.29951974749565125,
"rewards/rejected": -0.6319462060928345,
"step": 380
},
{
"epoch": 0.20408163265306123,
"grad_norm": 18.448504645275186,
"learning_rate": 4.838100964592904e-07,
"logits/chosen": -3.0143513679504395,
"logits/rejected": -2.915879726409912,
"logps/chosen": -423.55926513671875,
"logps/rejected": -330.427490234375,
"loss": 0.6128,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.36892321705818176,
"rewards/margins": 0.2754189372062683,
"rewards/rejected": -0.6443422436714172,
"step": 390
},
{
"epoch": 0.20931449502878074,
"grad_norm": 13.04561395514187,
"learning_rate": 4.821536810077878e-07,
"logits/chosen": -3.05737042427063,
"logits/rejected": -2.9584059715270996,
"logps/chosen": -340.3260192871094,
"logps/rejected": -336.16961669921875,
"loss": 0.5796,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5146622061729431,
"rewards/margins": 0.4159639775753021,
"rewards/rejected": -0.9306262731552124,
"step": 400
},
{
"epoch": 0.21454735740450026,
"grad_norm": 12.961227614879007,
"learning_rate": 4.804197281126862e-07,
"logits/chosen": -2.953254222869873,
"logits/rejected": -2.9386799335479736,
"logps/chosen": -332.95013427734375,
"logps/rejected": -379.0150146484375,
"loss": 0.6141,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5353738069534302,
"rewards/margins": 0.16674116253852844,
"rewards/rejected": -0.7021149396896362,
"step": 410
},
{
"epoch": 0.21978021978021978,
"grad_norm": 13.525221402164235,
"learning_rate": 4.786088169001671e-07,
"logits/chosen": -3.0310072898864746,
"logits/rejected": -3.0133447647094727,
"logps/chosen": -328.05743408203125,
"logps/rejected": -359.2528076171875,
"loss": 0.5931,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5046383738517761,
"rewards/margins": 0.18837007880210876,
"rewards/rejected": -0.6930084824562073,
"step": 420
},
{
"epoch": 0.2250130821559393,
"grad_norm": 15.343151149244173,
"learning_rate": 4.767215521998648e-07,
"logits/chosen": -3.097900867462158,
"logits/rejected": -2.9799551963806152,
"logps/chosen": -344.5359802246094,
"logps/rejected": -340.30279541015625,
"loss": 0.5977,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3433271050453186,
"rewards/margins": 0.6302416324615479,
"rewards/rejected": -0.9735687971115112,
"step": 430
},
{
"epoch": 0.2302459445316588,
"grad_norm": 15.463582386840484,
"learning_rate": 4.7475856434285853e-07,
"logits/chosen": -3.0013861656188965,
"logits/rejected": -2.9684953689575195,
"logps/chosen": -315.1118469238281,
"logps/rejected": -319.0517578125,
"loss": 0.5884,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.44304031133651733,
"rewards/margins": 0.31362444162368774,
"rewards/rejected": -0.7566647529602051,
"step": 440
},
{
"epoch": 0.23547880690737832,
"grad_norm": 18.270478359234886,
"learning_rate": 4.727205089511466e-07,
"logits/chosen": -2.8321094512939453,
"logits/rejected": -2.848545551300049,
"logps/chosen": -289.8956604003906,
"logps/rejected": -334.09063720703125,
"loss": 0.5472,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5760111808776855,
"rewards/margins": 0.3905490040779114,
"rewards/rejected": -0.9665601849555969,
"step": 450
},
{
"epoch": 0.24071166928309787,
"grad_norm": 18.394272193166348,
"learning_rate": 4.706080667186738e-07,
"logits/chosen": -2.9533510208129883,
"logits/rejected": -2.85048246383667,
"logps/chosen": -339.3458557128906,
"logps/rejected": -362.229736328125,
"loss": 0.5852,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.527037501335144,
"rewards/margins": 0.513110876083374,
"rewards/rejected": -1.040148377418518,
"step": 460
},
{
"epoch": 0.24594453165881738,
"grad_norm": 14.635877926172551,
"learning_rate": 4.68421943183986e-07,
"logits/chosen": -2.9311420917510986,
"logits/rejected": -2.9059174060821533,
"logps/chosen": -353.56158447265625,
"logps/rejected": -397.67315673828125,
"loss": 0.5503,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6963423490524292,
"rewards/margins": 0.4823771119117737,
"rewards/rejected": -1.178719401359558,
"step": 470
},
{
"epoch": 0.25117739403453687,
"grad_norm": 15.746195707332934,
"learning_rate": 4.661628684945851e-07,
"logits/chosen": -2.9695117473602295,
"logits/rejected": -2.9841675758361816,
"logps/chosen": -291.04376220703125,
"logps/rejected": -328.80816650390625,
"loss": 0.5728,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5066032409667969,
"rewards/margins": 0.49630409479141235,
"rewards/rejected": -1.002907156944275,
"step": 480
},
{
"epoch": 0.2564102564102564,
"grad_norm": 13.427286500899548,
"learning_rate": 4.638315971630662e-07,
"logits/chosen": -2.9672865867614746,
"logits/rejected": -2.9439778327941895,
"logps/chosen": -344.151611328125,
"logps/rejected": -350.19305419921875,
"loss": 0.5485,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.540307343006134,
"rewards/margins": 0.437363862991333,
"rewards/rejected": -0.9776712656021118,
"step": 490
},
{
"epoch": 0.2616431187859759,
"grad_norm": 17.8293277249549,
"learning_rate": 4.6142890781511635e-07,
"logits/chosen": -2.928375720977783,
"logits/rejected": -2.9310758113861084,
"logps/chosen": -301.03839111328125,
"logps/rejected": -364.05670166015625,
"loss": 0.5414,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.43716445565223694,
"rewards/margins": 0.5820390582084656,
"rewards/rejected": -1.0192034244537354,
"step": 500
},
{
"epoch": 0.2668759811616955,
"grad_norm": 13.134882476577088,
"learning_rate": 4.5895560292945996e-07,
"logits/chosen": -3.02789306640625,
"logits/rejected": -2.938778877258301,
"logps/chosen": -346.013427734375,
"logps/rejected": -349.2925109863281,
"loss": 0.5768,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6063824892044067,
"rewards/margins": 0.4094172418117523,
"rewards/rejected": -1.015799641609192,
"step": 510
},
{
"epoch": 0.272108843537415,
"grad_norm": 14.111790777809139,
"learning_rate": 4.5641250856983743e-07,
"logits/chosen": -2.993448495864868,
"logits/rejected": -2.9830093383789062,
"logps/chosen": -336.80517578125,
"logps/rejected": -378.89544677734375,
"loss": 0.5603,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.7271286845207214,
"rewards/margins": 0.24076862633228302,
"rewards/rejected": -0.9678972959518433,
"step": 520
},
{
"epoch": 0.2773417059131345,
"grad_norm": 23.979569462254332,
"learning_rate": 4.5380047410910655e-07,
"logits/chosen": -2.937358856201172,
"logits/rejected": -2.9407734870910645,
"logps/chosen": -383.3623962402344,
"logps/rejected": -362.71099853515625,
"loss": 0.547,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.45562949776649475,
"rewards/margins": 0.7254458665847778,
"rewards/rejected": -1.1810753345489502,
"step": 530
},
{
"epoch": 0.282574568288854,
"grad_norm": 19.701709513451974,
"learning_rate": 4.5112037194555876e-07,
"logits/chosen": -2.910482406616211,
"logits/rejected": -2.8749754428863525,
"logps/chosen": -393.69268798828125,
"logps/rejected": -415.40338134765625,
"loss": 0.5233,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9999048113822937,
"rewards/margins": 0.4355775713920593,
"rewards/rejected": -1.435482382774353,
"step": 540
},
{
"epoch": 0.28780743066457354,
"grad_norm": 16.547979220851758,
"learning_rate": 4.4837309721154536e-07,
"logits/chosen": -3.030179977416992,
"logits/rejected": -2.925313949584961,
"logps/chosen": -398.4112243652344,
"logps/rejected": -411.71881103515625,
"loss": 0.5809,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.6699625253677368,
"rewards/margins": 0.7033188939094543,
"rewards/rejected": -1.3732813596725464,
"step": 550
},
{
"epoch": 0.29304029304029305,
"grad_norm": 17.970228607671963,
"learning_rate": 4.4555956747451065e-07,
"logits/chosen": -3.0206010341644287,
"logits/rejected": -2.9801762104034424,
"logps/chosen": -325.3530578613281,
"logps/rejected": -367.80718994140625,
"loss": 0.5519,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3854089379310608,
"rewards/margins": 0.6328403353691101,
"rewards/rejected": -1.018249273300171,
"step": 560
},
{
"epoch": 0.29827315541601257,
"grad_norm": 14.821894805782252,
"learning_rate": 4.426807224305315e-07,
"logits/chosen": -3.058014392852783,
"logits/rejected": -2.9584782123565674,
"logps/chosen": -334.9781494140625,
"logps/rejected": -347.7608337402344,
"loss": 0.5869,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5377572774887085,
"rewards/margins": 0.4088308811187744,
"rewards/rejected": -0.9465881586074829,
"step": 570
},
{
"epoch": 0.3035060177917321,
"grad_norm": 15.304057814913962,
"learning_rate": 4.397375235904669e-07,
"logits/chosen": -3.0159687995910645,
"logits/rejected": -2.940138339996338,
"logps/chosen": -378.4619445800781,
"logps/rejected": -325.83795166015625,
"loss": 0.5461,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.6433390378952026,
"rewards/margins": 0.4729389250278473,
"rewards/rejected": -1.1162779331207275,
"step": 580
},
{
"epoch": 0.3087388801674516,
"grad_norm": 15.41023592760314,
"learning_rate": 4.3673095395882074e-07,
"logits/chosen": -2.826242446899414,
"logits/rejected": -2.8306374549865723,
"logps/chosen": -297.02642822265625,
"logps/rejected": -344.32562255859375,
"loss": 0.5647,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8071243166923523,
"rewards/margins": 0.44610705971717834,
"rewards/rejected": -1.2532315254211426,
"step": 590
},
{
"epoch": 0.3139717425431711,
"grad_norm": 19.465252228371234,
"learning_rate": 4.3366201770542687e-07,
"logits/chosen": -2.9092609882354736,
"logits/rejected": -2.910013198852539,
"logps/chosen": -344.72369384765625,
"logps/rejected": -383.1976623535156,
"loss": 0.5827,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6237624883651733,
"rewards/margins": 0.513608455657959,
"rewards/rejected": -1.1373710632324219,
"step": 600
},
{
"epoch": 0.31920460491889063,
"grad_norm": 20.05320596442739,
"learning_rate": 4.3053173983006395e-07,
"logits/chosen": -2.9518351554870605,
"logits/rejected": -2.872385263442993,
"logps/chosen": -259.81561279296875,
"logps/rejected": -335.95587158203125,
"loss": 0.5515,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5689873695373535,
"rewards/margins": 0.6198877096176147,
"rewards/rejected": -1.1888750791549683,
"step": 610
},
{
"epoch": 0.32443746729461015,
"grad_norm": 15.709917499515434,
"learning_rate": 4.2734116582011403e-07,
"logits/chosen": -2.9943740367889404,
"logits/rejected": -2.838672161102295,
"logps/chosen": -406.30401611328125,
"logps/rejected": -319.8934020996094,
"loss": 0.5606,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4886740744113922,
"rewards/margins": 0.6085253953933716,
"rewards/rejected": -1.097199559211731,
"step": 620
},
{
"epoch": 0.32967032967032966,
"grad_norm": 12.984865579984941,
"learning_rate": 4.2409136130137845e-07,
"logits/chosen": -2.9008967876434326,
"logits/rejected": -2.91571044921875,
"logps/chosen": -289.12115478515625,
"logps/rejected": -377.608154296875,
"loss": 0.6095,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6557156443595886,
"rewards/margins": 0.4830680787563324,
"rewards/rejected": -1.1387838125228882,
"step": 630
},
{
"epoch": 0.3349031920460492,
"grad_norm": 19.717030591939775,
"learning_rate": 4.207834116821672e-07,
"logits/chosen": -2.958247661590576,
"logits/rejected": -2.885899066925049,
"logps/chosen": -340.4076232910156,
"logps/rejected": -417.45916748046875,
"loss": 0.5439,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5420235991477966,
"rewards/margins": 0.7290440797805786,
"rewards/rejected": -1.2710676193237305,
"step": 640
},
{
"epoch": 0.3401360544217687,
"grad_norm": 16.729020989503553,
"learning_rate": 4.174184217907818e-07,
"logits/chosen": -2.9285740852355957,
"logits/rejected": -2.892252206802368,
"logps/chosen": -332.4837951660156,
"logps/rejected": -364.90606689453125,
"loss": 0.5831,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6785880923271179,
"rewards/margins": 0.5523657202720642,
"rewards/rejected": -1.2309538125991821,
"step": 650
},
{
"epoch": 0.3453689167974882,
"grad_norm": 20.83462510110715,
"learning_rate": 4.1399751550651084e-07,
"logits/chosen": -2.904776096343994,
"logits/rejected": -2.8706138134002686,
"logps/chosen": -321.9764099121094,
"logps/rejected": -347.6967468261719,
"loss": 0.561,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8602074384689331,
"rewards/margins": 0.3864290714263916,
"rewards/rejected": -1.2466365098953247,
"step": 660
},
{
"epoch": 0.35060177917320773,
"grad_norm": 16.939482005435558,
"learning_rate": 4.1052183538426426e-07,
"logits/chosen": -2.8574535846710205,
"logits/rejected": -2.8395981788635254,
"logps/chosen": -335.42327880859375,
"logps/rejected": -357.9546813964844,
"loss": 0.5691,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7099284529685974,
"rewards/margins": 0.5589883923530579,
"rewards/rejected": -1.2689168453216553,
"step": 670
},
{
"epoch": 0.35583464154892724,
"grad_norm": 14.529627338961385,
"learning_rate": 4.0699254227296884e-07,
"logits/chosen": -2.7800791263580322,
"logits/rejected": -2.7351772785186768,
"logps/chosen": -342.0890808105469,
"logps/rejected": -360.97552490234375,
"loss": 0.5595,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.009031057357788,
"rewards/margins": 0.451460063457489,
"rewards/rejected": -1.4604910612106323,
"step": 680
},
{
"epoch": 0.36106750392464676,
"grad_norm": 15.776547924608785,
"learning_rate": 4.034108149278543e-07,
"logits/chosen": -2.926506996154785,
"logits/rejected": -2.839150905609131,
"logps/chosen": -330.08648681640625,
"logps/rejected": -370.6238708496094,
"loss": 0.5372,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8743458986282349,
"rewards/margins": 0.5746399164199829,
"rewards/rejected": -1.4489858150482178,
"step": 690
},
{
"epoch": 0.3663003663003663,
"grad_norm": 15.377113085837133,
"learning_rate": 3.9977784961675833e-07,
"logits/chosen": -2.8555071353912354,
"logits/rejected": -2.8349578380584717,
"logps/chosen": -342.14251708984375,
"logps/rejected": -366.9553527832031,
"loss": 0.5548,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8782415390014648,
"rewards/margins": 0.4512537121772766,
"rewards/rejected": -1.3294951915740967,
"step": 700
},
{
"epoch": 0.3715332286760858,
"grad_norm": 14.215944756798452,
"learning_rate": 3.96094859720583e-07,
"logits/chosen": -2.897223472595215,
"logits/rejected": -2.7988810539245605,
"logps/chosen": -398.7936706542969,
"logps/rejected": -397.49578857421875,
"loss": 0.5165,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6832660436630249,
"rewards/margins": 0.5724307298660278,
"rewards/rejected": -1.2556967735290527,
"step": 710
},
{
"epoch": 0.37676609105180536,
"grad_norm": 15.259982422665061,
"learning_rate": 3.923630753280357e-07,
"logits/chosen": -2.7229888439178467,
"logits/rejected": -2.7818284034729004,
"logps/chosen": -273.22772216796875,
"logps/rejected": -326.69171142578125,
"loss": 0.5265,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5913980603218079,
"rewards/margins": 0.4389529228210449,
"rewards/rejected": -1.030350923538208,
"step": 720
},
{
"epoch": 0.3819989534275249,
"grad_norm": 16.540503842805457,
"learning_rate": 3.8858374282478893e-07,
"logits/chosen": -2.847386360168457,
"logits/rejected": -2.822706460952759,
"logps/chosen": -340.0333251953125,
"logps/rejected": -446.2408752441406,
"loss": 0.5689,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7927729487419128,
"rewards/margins": 0.775786280632019,
"rewards/rejected": -1.5685592889785767,
"step": 730
},
{
"epoch": 0.3872318158032444,
"grad_norm": 14.485113206171722,
"learning_rate": 3.8475812447719823e-07,
"logits/chosen": -2.7510900497436523,
"logits/rejected": -2.770341396331787,
"logps/chosen": -311.25360107421875,
"logps/rejected": -339.53631591796875,
"loss": 0.5326,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7380820512771606,
"rewards/margins": 0.47503456473350525,
"rewards/rejected": -1.2131164073944092,
"step": 740
},
{
"epoch": 0.3924646781789639,
"grad_norm": 20.826039047439295,
"learning_rate": 3.8088749801071496e-07,
"logits/chosen": -2.784240245819092,
"logits/rejected": -2.7459471225738525,
"logps/chosen": -353.0906677246094,
"logps/rejected": -405.60406494140625,
"loss": 0.5086,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7327491044998169,
"rewards/margins": 0.3788543939590454,
"rewards/rejected": -1.1116034984588623,
"step": 750
},
{
"epoch": 0.3976975405546834,
"grad_norm": 17.658078906056954,
"learning_rate": 3.7697315618313644e-07,
"logits/chosen": -2.7973134517669678,
"logits/rejected": -2.7560336589813232,
"logps/chosen": -279.4862365722656,
"logps/rejected": -299.8984375,
"loss": 0.5791,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5168331861495972,
"rewards/margins": 0.5671908855438232,
"rewards/rejected": -1.08402419090271,
"step": 760
},
{
"epoch": 0.40293040293040294,
"grad_norm": 13.415561426957483,
"learning_rate": 3.7301640635283584e-07,
"logits/chosen": -2.7720260620117188,
"logits/rejected": -2.748436450958252,
"logps/chosen": -328.19390869140625,
"logps/rejected": -405.95709228515625,
"loss": 0.5757,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8876395225524902,
"rewards/margins": 0.42753082513809204,
"rewards/rejected": -1.315170168876648,
"step": 770
},
{
"epoch": 0.40816326530612246,
"grad_norm": 14.769300667177983,
"learning_rate": 3.6901857004211443e-07,
"logits/chosen": -2.749990463256836,
"logits/rejected": -2.7357590198516846,
"logps/chosen": -313.04180908203125,
"logps/rejected": -345.66485595703125,
"loss": 0.5333,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7682701349258423,
"rewards/margins": 0.6885534524917603,
"rewards/rejected": -1.4568235874176025,
"step": 780
},
{
"epoch": 0.413396127681842,
"grad_norm": 14.678645322251919,
"learning_rate": 3.6498098249582444e-07,
"logits/chosen": -2.7600743770599365,
"logits/rejected": -2.767582416534424,
"logps/chosen": -304.5693664550781,
"logps/rejected": -396.30731201171875,
"loss": 0.5435,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.837949275970459,
"rewards/margins": 0.3060819208621979,
"rewards/rejected": -1.144031047821045,
"step": 790
},
{
"epoch": 0.4186289900575615,
"grad_norm": 19.043570676274012,
"learning_rate": 3.6090499223540757e-07,
"logits/chosen": -2.816871166229248,
"logits/rejected": -2.819472551345825,
"logps/chosen": -386.77410888671875,
"logps/rejected": -432.9602966308594,
"loss": 0.594,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9629328846931458,
"rewards/margins": 0.41979989409446716,
"rewards/rejected": -1.3827327489852905,
"step": 800
},
{
"epoch": 0.423861852433281,
"grad_norm": 16.07598371652075,
"learning_rate": 3.5679196060850034e-07,
"logits/chosen": -2.774369478225708,
"logits/rejected": -2.704817771911621,
"logps/chosen": -381.45367431640625,
"logps/rejected": -394.75054931640625,
"loss": 0.5502,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8025716543197632,
"rewards/margins": 0.5534734725952148,
"rewards/rejected": -1.356045126914978,
"step": 810
},
{
"epoch": 0.4290947148090005,
"grad_norm": 17.575598817541977,
"learning_rate": 3.5264326133425464e-07,
"logits/chosen": -2.808215856552124,
"logits/rejected": -2.7602808475494385,
"logps/chosen": -377.1565856933594,
"logps/rejected": -380.73577880859375,
"loss": 0.5746,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9608553051948547,
"rewards/margins": 0.5722512006759644,
"rewards/rejected": -1.5331064462661743,
"step": 820
},
{
"epoch": 0.43432757718472004,
"grad_norm": 16.32115598655498,
"learning_rate": 3.4846028004452693e-07,
"logits/chosen": -2.8719420433044434,
"logits/rejected": -2.8173866271972656,
"logps/chosen": -307.88714599609375,
"logps/rejected": -346.9542236328125,
"loss": 0.6023,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7333047389984131,
"rewards/margins": 0.4502180516719818,
"rewards/rejected": -1.1835228204727173,
"step": 830
},
{
"epoch": 0.43956043956043955,
"grad_norm": 16.91945418937996,
"learning_rate": 3.4424441382108826e-07,
"logits/chosen": -2.9018168449401855,
"logits/rejected": -2.726928472518921,
"logps/chosen": -491.023193359375,
"logps/rejected": -441.81494140625,
"loss": 0.5438,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9286457896232605,
"rewards/margins": 0.6778661012649536,
"rewards/rejected": -1.6065118312835693,
"step": 840
},
{
"epoch": 0.44479330193615907,
"grad_norm": 21.188197365998956,
"learning_rate": 3.399970707290105e-07,
"logits/chosen": -2.784093141555786,
"logits/rejected": -2.6797289848327637,
"logps/chosen": -347.7838439941406,
"logps/rejected": -357.4574279785156,
"loss": 0.5766,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9829579591751099,
"rewards/margins": 0.4461473822593689,
"rewards/rejected": -1.4291054010391235,
"step": 850
},
{
"epoch": 0.4500261643118786,
"grad_norm": 16.855672588469375,
"learning_rate": 3.3571966934638376e-07,
"logits/chosen": -2.818727731704712,
"logits/rejected": -2.8157877922058105,
"logps/chosen": -258.5130615234375,
"logps/rejected": -407.8525085449219,
"loss": 0.5452,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7257243394851685,
"rewards/margins": 0.8329108953475952,
"rewards/rejected": -1.5586349964141846,
"step": 860
},
{
"epoch": 0.4552590266875981,
"grad_norm": 13.734802575828237,
"learning_rate": 3.314136382905234e-07,
"logits/chosen": -2.577549457550049,
"logits/rejected": -2.634438991546631,
"logps/chosen": -296.9052429199219,
"logps/rejected": -361.5945739746094,
"loss": 0.5641,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.766598641872406,
"rewards/margins": 0.615802526473999,
"rewards/rejected": -1.3824012279510498,
"step": 870
},
{
"epoch": 0.4604918890633176,
"grad_norm": 16.099032009321178,
"learning_rate": 3.270804157408225e-07,
"logits/chosen": -2.836336612701416,
"logits/rejected": -2.819361925125122,
"logps/chosen": -354.25347900390625,
"logps/rejected": -366.732421875,
"loss": 0.539,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.8732249140739441,
"rewards/margins": 0.35818585753440857,
"rewards/rejected": -1.2314107418060303,
"step": 880
},
{
"epoch": 0.46572475143903713,
"grad_norm": 17.106740036561465,
"learning_rate": 3.227214489584128e-07,
"logits/chosen": -2.8799033164978027,
"logits/rejected": -2.8418822288513184,
"logps/chosen": -386.0233459472656,
"logps/rejected": -369.4678039550781,
"loss": 0.5312,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7949320077896118,
"rewards/margins": 0.5880209803581238,
"rewards/rejected": -1.3829529285430908,
"step": 890
},
{
"epoch": 0.47095761381475665,
"grad_norm": 21.890320275591346,
"learning_rate": 3.1833819380279023e-07,
"logits/chosen": -2.721087694168091,
"logits/rejected": -2.697716474533081,
"logps/chosen": -314.33087158203125,
"logps/rejected": -464.30413818359375,
"loss": 0.5303,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7493630051612854,
"rewards/margins": 1.0829684734344482,
"rewards/rejected": -1.8323314189910889,
"step": 900
},
{
"epoch": 0.47619047619047616,
"grad_norm": 18.614163560193642,
"learning_rate": 3.139321142455703e-07,
"logits/chosen": -2.7572057247161865,
"logits/rejected": -2.706200122833252,
"logps/chosen": -257.91607666015625,
"logps/rejected": -347.5008239746094,
"loss": 0.577,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8465842008590698,
"rewards/margins": 0.8367371559143066,
"rewards/rejected": -1.683321237564087,
"step": 910
},
{
"epoch": 0.48142333856619574,
"grad_norm": 15.269349106291124,
"learning_rate": 3.095046818815331e-07,
"logits/chosen": -2.8949315547943115,
"logits/rejected": -2.79899525642395,
"logps/chosen": -398.53765869140625,
"logps/rejected": -398.479736328125,
"loss": 0.5602,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9853115081787109,
"rewards/margins": 0.5784602761268616,
"rewards/rejected": -1.5637718439102173,
"step": 920
},
{
"epoch": 0.48665620094191525,
"grad_norm": 15.355520412605172,
"learning_rate": 3.0505737543712275e-07,
"logits/chosen": -2.806957960128784,
"logits/rejected": -2.785641670227051,
"logps/chosen": -353.44854736328125,
"logps/rejected": -384.927490234375,
"loss": 0.5432,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9327161908149719,
"rewards/margins": 0.3711306154727936,
"rewards/rejected": -1.303847074508667,
"step": 930
},
{
"epoch": 0.49188906331763477,
"grad_norm": 15.67157273698381,
"learning_rate": 3.0059168027656475e-07,
"logits/chosen": -2.888259172439575,
"logits/rejected": -2.8196072578430176,
"logps/chosen": -374.23443603515625,
"logps/rejected": -388.24578857421875,
"loss": 0.4706,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8297722935676575,
"rewards/margins": 0.6955471038818359,
"rewards/rejected": -1.5253194570541382,
"step": 940
},
{
"epoch": 0.4971219256933543,
"grad_norm": 14.462678359824464,
"learning_rate": 2.9610908790576663e-07,
"logits/chosen": -2.7698135375976562,
"logits/rejected": -2.661236524581909,
"logps/chosen": -364.80810546875,
"logps/rejected": -443.79425048828125,
"loss": 0.5139,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7954778671264648,
"rewards/margins": 0.9584125280380249,
"rewards/rejected": -1.7538903951644897,
"step": 950
},
{
"epoch": 0.5023547880690737,
"grad_norm": 13.595756965461069,
"learning_rate": 2.9161109547416667e-07,
"logits/chosen": -2.8679168224334717,
"logits/rejected": -2.795522689819336,
"logps/chosen": -391.12115478515625,
"logps/rejected": -440.32940673828125,
"loss": 0.5418,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1392987966537476,
"rewards/margins": 0.6391605734825134,
"rewards/rejected": -1.7784591913223267,
"step": 960
},
{
"epoch": 0.5075876504447933,
"grad_norm": 17.796999860342627,
"learning_rate": 2.8709920527469834e-07,
"logits/chosen": -2.7307040691375732,
"logits/rejected": -2.691157341003418,
"logps/chosen": -373.8775939941406,
"logps/rejected": -425.29400634765625,
"loss": 0.5328,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.8613603711128235,
"rewards/margins": 0.9478839039802551,
"rewards/rejected": -1.80924391746521,
"step": 970
},
{
"epoch": 0.5128205128205128,
"grad_norm": 11.72784157089814,
"learning_rate": 2.8257492424203685e-07,
"logits/chosen": -2.8900365829467773,
"logits/rejected": -2.731393337249756,
"logps/chosen": -380.15576171875,
"logps/rejected": -384.27630615234375,
"loss": 0.53,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.7285529971122742,
"rewards/margins": 0.7510075569152832,
"rewards/rejected": -1.4795606136322021,
"step": 980
},
{
"epoch": 0.5180533751962323,
"grad_norm": 17.896633281401765,
"learning_rate": 2.780397634492949e-07,
"logits/chosen": -2.7340455055236816,
"logits/rejected": -2.663761615753174,
"logps/chosen": -312.6453552246094,
"logps/rejected": -369.53790283203125,
"loss": 0.5483,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.68805330991745,
"rewards/margins": 0.8142625093460083,
"rewards/rejected": -1.5023157596588135,
"step": 990
},
{
"epoch": 0.5232862375719518,
"grad_norm": 18.34859064941708,
"learning_rate": 2.7349523760333674e-07,
"logits/chosen": -2.7495155334472656,
"logits/rejected": -2.6967289447784424,
"logps/chosen": -327.76214599609375,
"logps/rejected": -376.7933044433594,
"loss": 0.5416,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9751359820365906,
"rewards/margins": 0.5826241970062256,
"rewards/rejected": -1.55776047706604,
"step": 1000
},
{
"epoch": 0.5285190999476713,
"grad_norm": 17.334821505661097,
"learning_rate": 2.6894286453887827e-07,
"logits/chosen": -2.7708683013916016,
"logits/rejected": -2.763948440551758,
"logps/chosen": -315.4136047363281,
"logps/rejected": -405.6109924316406,
"loss": 0.5349,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.835088849067688,
"rewards/margins": 0.722664475440979,
"rewards/rejected": -1.5577532052993774,
"step": 1010
},
{
"epoch": 0.533751962323391,
"grad_norm": 16.461462996228413,
"learning_rate": 2.6438416471154273e-07,
"logits/chosen": -2.8052124977111816,
"logits/rejected": -2.7714879512786865,
"logps/chosen": -361.3446044921875,
"logps/rejected": -400.1153564453125,
"loss": 0.4871,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.7003945112228394,
"rewards/margins": 0.8514345288276672,
"rewards/rejected": -1.5518289804458618,
"step": 1020
},
{
"epoch": 0.5389848246991105,
"grad_norm": 16.993297156547936,
"learning_rate": 2.598206606900406e-07,
"logits/chosen": -2.812356472015381,
"logits/rejected": -2.779200553894043,
"logps/chosen": -351.882568359375,
"logps/rejected": -349.3376159667969,
"loss": 0.5472,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9720566868782043,
"rewards/margins": 0.37396326661109924,
"rewards/rejected": -1.3460201025009155,
"step": 1030
},
{
"epoch": 0.54421768707483,
"grad_norm": 16.369305843640827,
"learning_rate": 2.552538766476443e-07,
"logits/chosen": -2.807950973510742,
"logits/rejected": -2.8600311279296875,
"logps/chosen": -336.7136535644531,
"logps/rejected": -404.81646728515625,
"loss": 0.5614,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8792396783828735,
"rewards/margins": 0.5810363292694092,
"rewards/rejected": -1.4602760076522827,
"step": 1040
},
{
"epoch": 0.5494505494505495,
"grad_norm": 18.03446272194485,
"learning_rate": 2.5068533785312666e-07,
"logits/chosen": -2.873033046722412,
"logits/rejected": -2.799868583679199,
"logps/chosen": -392.97955322265625,
"logps/rejected": -442.9043884277344,
"loss": 0.5167,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.6563512086868286,
"rewards/margins": 0.8989084959030151,
"rewards/rejected": -1.5552598237991333,
"step": 1050
},
{
"epoch": 0.554683411826269,
"grad_norm": 20.315404295924612,
"learning_rate": 2.461165701613333e-07,
"logits/chosen": -2.759457588195801,
"logits/rejected": -2.7653160095214844,
"logps/chosen": -315.7008056640625,
"logps/rejected": -420.513671875,
"loss": 0.5364,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.5269359946250916,
"rewards/margins": 1.0427325963974,
"rewards/rejected": -1.5696685314178467,
"step": 1060
},
{
"epoch": 0.5599162742019885,
"grad_norm": 14.713699575864963,
"learning_rate": 2.415490995035596e-07,
"logits/chosen": -2.7564265727996826,
"logits/rejected": -2.7757070064544678,
"logps/chosen": -408.191162109375,
"logps/rejected": -409.313232421875,
"loss": 0.5296,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8357181549072266,
"rewards/margins": 0.5712189674377441,
"rewards/rejected": -1.4069370031356812,
"step": 1070
},
{
"epoch": 0.565149136577708,
"grad_norm": 18.09014309365496,
"learning_rate": 2.3698445137790258e-07,
"logits/chosen": -2.8548378944396973,
"logits/rejected": -2.7975635528564453,
"logps/chosen": -307.1778869628906,
"logps/rejected": -374.8013610839844,
"loss": 0.5449,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7622523307800293,
"rewards/margins": 0.7371785640716553,
"rewards/rejected": -1.4994310140609741,
"step": 1080
},
{
"epoch": 0.5703819989534276,
"grad_norm": 24.666086366933353,
"learning_rate": 2.3242415033975575e-07,
"logits/chosen": -2.7526750564575195,
"logits/rejected": -2.6710307598114014,
"logps/chosen": -406.1108093261719,
"logps/rejected": -334.859375,
"loss": 0.5465,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0330530405044556,
"rewards/margins": 0.43497997522354126,
"rewards/rejected": -1.4680330753326416,
"step": 1090
},
{
"epoch": 0.5756148613291471,
"grad_norm": 19.70750428042825,
"learning_rate": 2.2786971949262134e-07,
"logits/chosen": -2.7635788917541504,
"logits/rejected": -2.740485668182373,
"logps/chosen": -350.9692077636719,
"logps/rejected": -427.8968811035156,
"loss": 0.5149,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9213571548461914,
"rewards/margins": 0.6530935764312744,
"rewards/rejected": -1.5744506120681763,
"step": 1100
},
{
"epoch": 0.5808477237048666,
"grad_norm": 19.421047147213937,
"learning_rate": 2.2332267997940513e-07,
"logits/chosen": -2.593143939971924,
"logits/rejected": -2.5872962474823,
"logps/chosen": -264.0029296875,
"logps/rejected": -365.4161682128906,
"loss": 0.561,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9193994402885437,
"rewards/margins": 0.7727819681167603,
"rewards/rejected": -1.6921813488006592,
"step": 1110
},
{
"epoch": 0.5860805860805861,
"grad_norm": 20.25715075172959,
"learning_rate": 2.1878455047436753e-07,
"logits/chosen": -2.7330455780029297,
"logits/rejected": -2.701078176498413,
"logps/chosen": -377.7864074707031,
"logps/rejected": -417.6946716308594,
"loss": 0.5272,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0114750862121582,
"rewards/margins": 0.6698529124259949,
"rewards/rejected": -1.6813280582427979,
"step": 1120
},
{
"epoch": 0.5913134484563056,
"grad_norm": 18.65929450263775,
"learning_rate": 2.1425684667589852e-07,
"logits/chosen": -2.640451669692993,
"logits/rejected": -2.5956547260284424,
"logps/chosen": -344.3413391113281,
"logps/rejected": -445.24627685546875,
"loss": 0.5213,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2718535661697388,
"rewards/margins": 0.5820540189743042,
"rewards/rejected": -1.8539073467254639,
"step": 1130
},
{
"epoch": 0.5965463108320251,
"grad_norm": 23.91921863916153,
"learning_rate": 2.0974108080028692e-07,
"logits/chosen": -2.876276969909668,
"logits/rejected": -2.8425920009613037,
"logps/chosen": -358.00347900390625,
"logps/rejected": -435.3857421875,
"loss": 0.5311,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0057141780853271,
"rewards/margins": 0.6711603403091431,
"rewards/rejected": -1.6768745183944702,
"step": 1140
},
{
"epoch": 0.6017791732077447,
"grad_norm": 18.381841775077824,
"learning_rate": 2.0523876107665194e-07,
"logits/chosen": -2.829072952270508,
"logits/rejected": -2.6755383014678955,
"logps/chosen": -359.2571105957031,
"logps/rejected": -397.68743896484375,
"loss": 0.5262,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9120863080024719,
"rewards/margins": 0.7489384412765503,
"rewards/rejected": -1.6610246896743774,
"step": 1150
},
{
"epoch": 0.6070120355834642,
"grad_norm": 20.516414043729537,
"learning_rate": 2.0075139124320787e-07,
"logits/chosen": -2.6632769107818604,
"logits/rejected": -2.6845157146453857,
"logps/chosen": -317.9290771484375,
"logps/rejected": -327.5867614746094,
"loss": 0.553,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0191724300384521,
"rewards/margins": 0.4334065020084381,
"rewards/rejected": -1.452579140663147,
"step": 1160
},
{
"epoch": 0.6122448979591837,
"grad_norm": 18.076421208226538,
"learning_rate": 1.962804700450265e-07,
"logits/chosen": -2.7555832862854004,
"logits/rejected": -2.7026753425598145,
"logps/chosen": -358.18212890625,
"logps/rejected": -462.0071716308594,
"loss": 0.5437,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.876019299030304,
"rewards/margins": 1.0211801528930664,
"rewards/rejected": -1.8971996307373047,
"step": 1170
},
{
"epoch": 0.6174777603349032,
"grad_norm": 16.308484308025697,
"learning_rate": 1.9182749073346943e-07,
"logits/chosen": -2.806734085083008,
"logits/rejected": -2.7705514430999756,
"logps/chosen": -416.02520751953125,
"logps/rejected": -418.12347412109375,
"loss": 0.482,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.0437430143356323,
"rewards/margins": 0.47317224740982056,
"rewards/rejected": -1.516915202140808,
"step": 1180
},
{
"epoch": 0.6227106227106227,
"grad_norm": 19.5121441695378,
"learning_rate": 1.8739394056745372e-07,
"logits/chosen": -2.85368013381958,
"logits/rejected": -2.792527675628662,
"logps/chosen": -444.78948974609375,
"logps/rejected": -417.74078369140625,
"loss": 0.4903,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.825792670249939,
"rewards/margins": 0.635454535484314,
"rewards/rejected": -1.461247205734253,
"step": 1190
},
{
"epoch": 0.6279434850863422,
"grad_norm": 16.627989570364402,
"learning_rate": 1.8298130031671972e-07,
"logits/chosen": -2.568850517272949,
"logits/rejected": -2.4875073432922363,
"logps/chosen": -408.0639953613281,
"logps/rejected": -431.19329833984375,
"loss": 0.5083,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0868682861328125,
"rewards/margins": 0.5747972726821899,
"rewards/rejected": -1.6616655588150024,
"step": 1200
},
{
"epoch": 0.6331763474620618,
"grad_norm": 20.140246761449887,
"learning_rate": 1.785910437672658e-07,
"logits/chosen": -2.8672873973846436,
"logits/rejected": -2.822535514831543,
"logps/chosen": -386.6443786621094,
"logps/rejected": -416.11328125,
"loss": 0.5426,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.953647792339325,
"rewards/margins": 0.6560716032981873,
"rewards/rejected": -1.6097195148468018,
"step": 1210
},
{
"epoch": 0.6384092098377813,
"grad_norm": 20.20724844042649,
"learning_rate": 1.7422463722911624e-07,
"logits/chosen": -2.8543591499328613,
"logits/rejected": -2.8314263820648193,
"logps/chosen": -403.2071228027344,
"logps/rejected": -456.78924560546875,
"loss": 0.5261,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.862319827079773,
"rewards/margins": 0.9367027282714844,
"rewards/rejected": -1.7990226745605469,
"step": 1220
},
{
"epoch": 0.6436420722135008,
"grad_norm": 17.62902811045846,
"learning_rate": 1.6988353904658492e-07,
"logits/chosen": -2.796889543533325,
"logits/rejected": -2.7177813053131104,
"logps/chosen": -430.34515380859375,
"logps/rejected": -412.7137145996094,
"loss": 0.4988,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9104453921318054,
"rewards/margins": 0.9096380472183228,
"rewards/rejected": -1.8200836181640625,
"step": 1230
},
{
"epoch": 0.6488749345892203,
"grad_norm": 20.291430758326484,
"learning_rate": 1.6556919911120081e-07,
"logits/chosen": -2.7235121726989746,
"logits/rejected": -2.704380512237549,
"logps/chosen": -316.89495849609375,
"logps/rejected": -354.3640441894531,
"loss": 0.5081,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8046097755432129,
"rewards/margins": 0.6576355695724487,
"rewards/rejected": -1.4622454643249512,
"step": 1240
},
{
"epoch": 0.6541077969649398,
"grad_norm": 16.634847998265094,
"learning_rate": 1.6128305837745546e-07,
"logits/chosen": -2.8713958263397217,
"logits/rejected": -2.780726909637451,
"logps/chosen": -357.17352294921875,
"logps/rejected": -450.5567321777344,
"loss": 0.5163,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8796365857124329,
"rewards/margins": 0.8217870593070984,
"rewards/rejected": -1.7014236450195312,
"step": 1250
},
{
"epoch": 0.6593406593406593,
"grad_norm": 13.440845355698691,
"learning_rate": 1.570265483815364e-07,
"logits/chosen": -2.7988827228546143,
"logits/rejected": -2.7722063064575195,
"logps/chosen": -336.47259521484375,
"logps/rejected": -320.25933837890625,
"loss": 0.5074,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7505493760108948,
"rewards/margins": 0.728230357170105,
"rewards/rejected": -1.4787797927856445,
"step": 1260
},
{
"epoch": 0.6645735217163788,
"grad_norm": 17.012464455361766,
"learning_rate": 1.5280109076320506e-07,
"logits/chosen": -2.7736287117004395,
"logits/rejected": -2.7175097465515137,
"logps/chosen": -305.7979736328125,
"logps/rejected": -366.69677734375,
"loss": 0.5134,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.756097137928009,
"rewards/margins": 0.8465806841850281,
"rewards/rejected": -1.6026777029037476,
"step": 1270
},
{
"epoch": 0.6698063840920984,
"grad_norm": 20.946223504777155,
"learning_rate": 1.4860809679098158e-07,
"logits/chosen": -2.7644202709198,
"logits/rejected": -2.674136161804199,
"logps/chosen": -328.1389465332031,
"logps/rejected": -375.33331298828125,
"loss": 0.5189,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.8715450167655945,
"rewards/margins": 0.7798687219619751,
"rewards/rejected": -1.6514136791229248,
"step": 1280
},
{
"epoch": 0.6750392464678179,
"grad_norm": 16.38862564822018,
"learning_rate": 1.444489668907914e-07,
"logits/chosen": -2.688934087753296,
"logits/rejected": -2.6829075813293457,
"logps/chosen": -314.0862121582031,
"logps/rejected": -436.2018127441406,
"loss": 0.5481,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7365253567695618,
"rewards/margins": 0.8902775645256042,
"rewards/rejected": -1.6268030405044556,
"step": 1290
},
{
"epoch": 0.6802721088435374,
"grad_norm": 18.868265740047633,
"learning_rate": 1.403250901782354e-07,
"logits/chosen": -2.7281813621520996,
"logits/rejected": -2.748370409011841,
"logps/chosen": -362.42266845703125,
"logps/rejected": -434.9849548339844,
"loss": 0.5109,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9654933214187622,
"rewards/margins": 0.6172199249267578,
"rewards/rejected": -1.58271324634552,
"step": 1300
},
{
"epoch": 0.6855049712192569,
"grad_norm": 22.766524494989497,
"learning_rate": 1.3623784399463584e-07,
"logits/chosen": -2.8341379165649414,
"logits/rejected": -2.8009238243103027,
"logps/chosen": -321.3329772949219,
"logps/rejected": -359.7023010253906,
"loss": 0.4985,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.7877645492553711,
"rewards/margins": 0.7746297717094421,
"rewards/rejected": -1.562394380569458,
"step": 1310
},
{
"epoch": 0.6907378335949764,
"grad_norm": 18.14832280420879,
"learning_rate": 1.3218859344701632e-07,
"logits/chosen": -2.7510125637054443,
"logits/rejected": -2.7323222160339355,
"logps/chosen": -392.82965087890625,
"logps/rejected": -459.291015625,
"loss": 0.4874,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0846540927886963,
"rewards/margins": 0.6299933195114136,
"rewards/rejected": -1.7146475315093994,
"step": 1320
},
{
"epoch": 0.6959706959706959,
"grad_norm": 22.852759896856153,
"learning_rate": 1.2817869095216624e-07,
"logits/chosen": -2.7460663318634033,
"logits/rejected": -2.7363333702087402,
"logps/chosen": -344.58636474609375,
"logps/rejected": -462.2234802246094,
"loss": 0.4831,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7783873677253723,
"rewards/margins": 0.8798893094062805,
"rewards/rejected": -1.6582765579223633,
"step": 1330
},
{
"epoch": 0.7012035583464155,
"grad_norm": 23.6570903287471,
"learning_rate": 1.2420947578494522e-07,
"logits/chosen": -2.689542293548584,
"logits/rejected": -2.6176483631134033,
"logps/chosen": -351.36468505859375,
"logps/rejected": -383.5470275878906,
"loss": 0.5132,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9131054878234863,
"rewards/margins": 0.8674660921096802,
"rewards/rejected": -1.7805715799331665,
"step": 1340
},
{
"epoch": 0.706436420722135,
"grad_norm": 24.783115737011485,
"learning_rate": 1.202822736309758e-07,
"logits/chosen": -2.7429165840148926,
"logits/rejected": -2.718522787094116,
"logps/chosen": -374.50323486328125,
"logps/rejected": -395.9026794433594,
"loss": 0.5146,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9068658947944641,
"rewards/margins": 0.5722583532333374,
"rewards/rejected": -1.4791243076324463,
"step": 1350
},
{
"epoch": 0.7116692830978545,
"grad_norm": 16.39264775799894,
"learning_rate": 1.1639839614387572e-07,
"logits/chosen": -2.6672797203063965,
"logits/rejected": -2.646698474884033,
"logps/chosen": -431.44305419921875,
"logps/rejected": -459.53387451171875,
"loss": 0.5512,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0058305263519287,
"rewards/margins": 0.6214712262153625,
"rewards/rejected": -1.627301573753357,
"step": 1360
},
{
"epoch": 0.716902145473574,
"grad_norm": 21.609096898964978,
"learning_rate": 1.1255914050717552e-07,
"logits/chosen": -2.791228771209717,
"logits/rejected": -2.6643431186676025,
"logps/chosen": -397.3729553222656,
"logps/rejected": -365.84893798828125,
"loss": 0.4799,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8865281343460083,
"rewards/margins": 0.6994263529777527,
"rewards/rejected": -1.5859544277191162,
"step": 1370
},
{
"epoch": 0.7221350078492935,
"grad_norm": 18.052304623231368,
"learning_rate": 1.0876578900107053e-07,
"logits/chosen": -2.771291494369507,
"logits/rejected": -2.7225091457366943,
"logps/chosen": -280.9075622558594,
"logps/rejected": -402.49908447265625,
"loss": 0.4933,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.6819049119949341,
"rewards/margins": 0.9486406445503235,
"rewards/rejected": -1.6305453777313232,
"step": 1380
},
{
"epoch": 0.727367870225013,
"grad_norm": 19.948472450962296,
"learning_rate": 1.050196085741491e-07,
"logits/chosen": -2.708111047744751,
"logits/rejected": -2.616854667663574,
"logps/chosen": -350.962158203125,
"logps/rejected": -393.85357666015625,
"loss": 0.5001,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9579183459281921,
"rewards/margins": 0.8560064435005188,
"rewards/rejected": -1.81392502784729,
"step": 1390
},
{
"epoch": 0.7326007326007326,
"grad_norm": 19.160243750600223,
"learning_rate": 1.0132185042024246e-07,
"logits/chosen": -2.6855554580688477,
"logits/rejected": -2.6964261531829834,
"logps/chosen": -345.172119140625,
"logps/rejected": -429.0968322753906,
"loss": 0.5215,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.147552728652954,
"rewards/margins": 0.7010320425033569,
"rewards/rejected": -1.848584532737732,
"step": 1400
},
{
"epoch": 0.7378335949764521,
"grad_norm": 20.042088035388982,
"learning_rate": 9.767374956053584e-08,
"logits/chosen": -2.6926827430725098,
"logits/rejected": -2.65360689163208,
"logps/chosen": -353.872314453125,
"logps/rejected": -421.33447265625,
"loss": 0.5652,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.064784049987793,
"rewards/margins": 0.7993738055229187,
"rewards/rejected": -1.864158034324646,
"step": 1410
},
{
"epoch": 0.7430664573521716,
"grad_norm": 21.422151300583028,
"learning_rate": 9.407652443108192e-08,
"logits/chosen": -2.754948616027832,
"logits/rejected": -2.6873762607574463,
"logps/chosen": -410.3388671875,
"logps/rejected": -412.1189880371094,
"loss": 0.5796,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0343502759933472,
"rewards/margins": 0.6344264149665833,
"rewards/rejected": -1.668776512145996,
"step": 1420
},
{
"epoch": 0.7482993197278912,
"grad_norm": 15.10072418425115,
"learning_rate": 9.053137647585229e-08,
"logits/chosen": -2.7673416137695312,
"logits/rejected": -2.683150053024292,
"logps/chosen": -380.3271179199219,
"logps/rejected": -419.106201171875,
"loss": 0.4956,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8902686834335327,
"rewards/margins": 0.8359274864196777,
"rewards/rejected": -1.7261962890625,
"step": 1430
},
{
"epoch": 0.7535321821036107,
"grad_norm": 20.007719817569754,
"learning_rate": 8.70394897454659e-08,
"logits/chosen": -2.7903285026550293,
"logits/rejected": -2.726536750793457,
"logps/chosen": -412.11669921875,
"logps/rejected": -453.485107421875,
"loss": 0.5241,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.6117042899131775,
"rewards/margins": 0.9241981506347656,
"rewards/rejected": -1.5359022617340088,
"step": 1440
},
{
"epoch": 0.7587650444793302,
"grad_norm": 15.725739369442051,
"learning_rate": 8.360203050172488e-08,
"logits/chosen": -2.761046886444092,
"logits/rejected": -2.6572835445404053,
"logps/chosen": -380.8763427734375,
"logps/rejected": -428.72601318359375,
"loss": 0.5176,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9222623705863953,
"rewards/margins": 0.817249596118927,
"rewards/rejected": -1.7395120859146118,
"step": 1450
},
{
"epoch": 0.7639979068550498,
"grad_norm": 15.913216171443693,
"learning_rate": 8.022014682809305e-08,
"logits/chosen": -2.680180072784424,
"logits/rejected": -2.6794424057006836,
"logps/chosen": -297.6555480957031,
"logps/rejected": -357.39703369140625,
"loss": 0.5246,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8991565704345703,
"rewards/margins": 0.5045825242996216,
"rewards/rejected": -1.4037392139434814,
"step": 1460
},
{
"epoch": 0.7692307692307693,
"grad_norm": 16.251635097636957,
"learning_rate": 7.689496824624525e-08,
"logits/chosen": -2.7646737098693848,
"logits/rejected": -2.642939567565918,
"logps/chosen": -400.4357604980469,
"logps/rejected": -399.53228759765625,
"loss": 0.5194,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.7477626204490662,
"rewards/margins": 0.9188524484634399,
"rewards/rejected": -1.6666151285171509,
"step": 1470
},
{
"epoch": 0.7744636316064888,
"grad_norm": 18.848669382089422,
"learning_rate": 7.362760533881649e-08,
"logits/chosen": -2.682922124862671,
"logits/rejected": -2.678013324737549,
"logps/chosen": -340.21295166015625,
"logps/rejected": -394.0873107910156,
"loss": 0.5165,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9958998560905457,
"rewards/margins": 0.7448866367340088,
"rewards/rejected": -1.7407863140106201,
"step": 1480
},
{
"epoch": 0.7796964939822083,
"grad_norm": 18.023386643013044,
"learning_rate": 7.041914937847584e-08,
"logits/chosen": -2.489797592163086,
"logits/rejected": -2.509646415710449,
"logps/chosen": -401.4407653808594,
"logps/rejected": -446.206298828125,
"loss": 0.5291,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2514934539794922,
"rewards/margins": 0.5801483392715454,
"rewards/rejected": -1.8316421508789062,
"step": 1490
},
{
"epoch": 0.7849293563579278,
"grad_norm": 16.75491775995985,
"learning_rate": 6.727067196345099e-08,
"logits/chosen": -2.6368460655212402,
"logits/rejected": -2.5974481105804443,
"logps/chosen": -296.484130859375,
"logps/rejected": -450.65826416015625,
"loss": 0.4935,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9263995289802551,
"rewards/margins": 1.0427716970443726,
"rewards/rejected": -1.9691712856292725,
"step": 1500
},
{
"epoch": 0.7901622187336473,
"grad_norm": 19.21477092868735,
"learning_rate": 6.418322465962233e-08,
"logits/chosen": -2.6289784908294678,
"logits/rejected": -2.6378707885742188,
"logps/chosen": -389.85357666015625,
"logps/rejected": -506.17926025390625,
"loss": 0.5505,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3396786451339722,
"rewards/margins": 0.8516691327095032,
"rewards/rejected": -2.191347599029541,
"step": 1510
},
{
"epoch": 0.7953950811093669,
"grad_norm": 18.1853712960554,
"learning_rate": 6.115783864930905e-08,
"logits/chosen": -2.6589932441711426,
"logits/rejected": -2.61903715133667,
"logps/chosen": -309.13134765625,
"logps/rejected": -423.0372619628906,
"loss": 0.5275,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9086647033691406,
"rewards/margins": 0.8935861587524414,
"rewards/rejected": -1.802250862121582,
"step": 1520
},
{
"epoch": 0.8006279434850864,
"grad_norm": 21.592413720973216,
"learning_rate": 5.8195524386862374e-08,
"logits/chosen": -2.820551872253418,
"logits/rejected": -2.7724428176879883,
"logps/chosen": -423.28363037109375,
"logps/rejected": -555.06298828125,
"loss": 0.4936,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1178683042526245,
"rewards/margins": 0.7847362756729126,
"rewards/rejected": -1.9026046991348267,
"step": 1530
},
{
"epoch": 0.8058608058608059,
"grad_norm": 16.018713976629193,
"learning_rate": 5.529727126118228e-08,
"logits/chosen": -2.716298818588257,
"logits/rejected": -2.724658489227295,
"logps/chosen": -468.96856689453125,
"logps/rejected": -467.7854919433594,
"loss": 0.5403,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1532551050186157,
"rewards/margins": 0.45239201188087463,
"rewards/rejected": -1.605647325515747,
"step": 1540
},
{
"epoch": 0.8110936682365254,
"grad_norm": 14.945499187412246,
"learning_rate": 5.246404726526918e-08,
"logits/chosen": -2.677903652191162,
"logits/rejected": -2.5905513763427734,
"logps/chosen": -396.8397521972656,
"logps/rejected": -377.557861328125,
"loss": 0.5017,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.7912027835845947,
"rewards/margins": 0.7971407771110535,
"rewards/rejected": -1.588343858718872,
"step": 1550
},
{
"epoch": 0.8163265306122449,
"grad_norm": 15.362047445414715,
"learning_rate": 4.969679867292276e-08,
"logits/chosen": -2.6830527782440186,
"logits/rejected": -2.6041407585144043,
"logps/chosen": -436.54473876953125,
"logps/rejected": -448.3851623535156,
"loss": 0.5104,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9174336194992065,
"rewards/margins": 0.7332735061645508,
"rewards/rejected": -1.6507068872451782,
"step": 1560
},
{
"epoch": 0.8215593929879644,
"grad_norm": 17.576121150498565,
"learning_rate": 4.6996449722693315e-08,
"logits/chosen": -2.6931352615356445,
"logits/rejected": -2.6350607872009277,
"logps/chosen": -315.1622619628906,
"logps/rejected": -392.06292724609375,
"loss": 0.515,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8371334075927734,
"rewards/margins": 0.6744937896728516,
"rewards/rejected": -1.5116270780563354,
"step": 1570
},
{
"epoch": 0.826792255363684,
"grad_norm": 19.810352695452664,
"learning_rate": 4.436390230919465e-08,
"logits/chosen": -2.766540050506592,
"logits/rejected": -2.647644519805908,
"logps/chosen": -376.6566467285156,
"logps/rejected": -402.571533203125,
"loss": 0.5633,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9764121770858765,
"rewards/margins": 0.7432926297187805,
"rewards/rejected": -1.7197048664093018,
"step": 1580
},
{
"epoch": 0.8320251177394035,
"grad_norm": 23.05429981365939,
"learning_rate": 4.180003568187776e-08,
"logits/chosen": -2.5337836742401123,
"logits/rejected": -2.474341869354248,
"logps/chosen": -308.1025390625,
"logps/rejected": -385.6150817871094,
"loss": 0.5154,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.7418735027313232,
"rewards/margins": 1.1077954769134521,
"rewards/rejected": -1.849669098854065,
"step": 1590
},
{
"epoch": 0.837257980115123,
"grad_norm": 16.3861829287344,
"learning_rate": 3.930570615136919e-08,
"logits/chosen": -2.568213939666748,
"logits/rejected": -2.600006103515625,
"logps/chosen": -364.5643615722656,
"logps/rejected": -459.32794189453125,
"loss": 0.5075,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1484493017196655,
"rewards/margins": 0.7362042665481567,
"rewards/rejected": -1.8846536874771118,
"step": 1600
},
{
"epoch": 0.8424908424908425,
"grad_norm": 19.191362805490126,
"learning_rate": 3.6881746803469756e-08,
"logits/chosen": -2.7753946781158447,
"logits/rejected": -2.6937546730041504,
"logps/chosen": -451.4925842285156,
"logps/rejected": -490.1671447753906,
"loss": 0.552,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8986201286315918,
"rewards/margins": 0.839145839214325,
"rewards/rejected": -1.737765908241272,
"step": 1610
},
{
"epoch": 0.847723704866562,
"grad_norm": 13.924933587722114,
"learning_rate": 3.452896722091128e-08,
"logits/chosen": -2.6632466316223145,
"logits/rejected": -2.57369327545166,
"logps/chosen": -392.18756103515625,
"logps/rejected": -403.74053955078125,
"loss": 0.5298,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1617246866226196,
"rewards/margins": 0.6037346124649048,
"rewards/rejected": -1.7654592990875244,
"step": 1620
},
{
"epoch": 0.8529565672422815,
"grad_norm": 20.923127069029967,
"learning_rate": 3.2248153212961677e-08,
"logits/chosen": -2.776475429534912,
"logits/rejected": -2.7884275913238525,
"logps/chosen": -319.9564514160156,
"logps/rejected": -403.90985107421875,
"loss": 0.5115,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.85566246509552,
"rewards/margins": 0.800037682056427,
"rewards/rejected": -1.6557003259658813,
"step": 1630
},
{
"epoch": 0.858189429618001,
"grad_norm": 17.71228621644989,
"learning_rate": 3.004006655297209e-08,
"logits/chosen": -2.704780340194702,
"logits/rejected": -2.690717935562134,
"logps/chosen": -387.93804931640625,
"logps/rejected": -438.994873046875,
"loss": 0.5445,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8455182909965515,
"rewards/margins": 0.7660375833511353,
"rewards/rejected": -1.611555814743042,
"step": 1640
},
{
"epoch": 0.8634222919937206,
"grad_norm": 17.25547789651305,
"learning_rate": 2.7905444723949762e-08,
"logits/chosen": -2.635859251022339,
"logits/rejected": -2.592531681060791,
"logps/chosen": -379.08502197265625,
"logps/rejected": -430.8294372558594,
"loss": 0.5025,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0563156604766846,
"rewards/margins": 0.5091310739517212,
"rewards/rejected": -1.5654467344284058,
"step": 1650
},
{
"epoch": 0.8686551543694401,
"grad_norm": 17.75386295782151,
"learning_rate": 2.5845000672245572e-08,
"logits/chosen": -2.652233839035034,
"logits/rejected": -2.6020119190216064,
"logps/chosen": -291.4010314941406,
"logps/rejected": -421.83251953125,
"loss": 0.5178,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8415043950080872,
"rewards/margins": 1.0191437005996704,
"rewards/rejected": -1.8606479167938232,
"step": 1660
},
{
"epoch": 0.8738880167451596,
"grad_norm": 20.26581048622113,
"learning_rate": 2.385942256943499e-08,
"logits/chosen": -2.7823240756988525,
"logits/rejected": -2.7179102897644043,
"logps/chosen": -368.48687744140625,
"logps/rejected": -421.83831787109375,
"loss": 0.5155,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1159892082214355,
"rewards/margins": 0.6273930668830872,
"rewards/rejected": -1.743382215499878,
"step": 1670
},
{
"epoch": 0.8791208791208791,
"grad_norm": 17.254193295992085,
"learning_rate": 2.194937358247506e-08,
"logits/chosen": -2.6762166023254395,
"logits/rejected": -2.680424213409424,
"logps/chosen": -341.16790771484375,
"logps/rejected": -436.9229431152344,
"loss": 0.4885,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9166282415390015,
"rewards/margins": 0.8245986104011536,
"rewards/rejected": -1.7412269115447998,
"step": 1680
},
{
"epoch": 0.8843537414965986,
"grad_norm": 21.18803904087079,
"learning_rate": 2.011549165221127e-08,
"logits/chosen": -2.5892271995544434,
"logits/rejected": -2.5530495643615723,
"logps/chosen": -343.75274658203125,
"logps/rejected": -402.83343505859375,
"loss": 0.4921,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9632428288459778,
"rewards/margins": 0.8480204343795776,
"rewards/rejected": -1.8112634420394897,
"step": 1690
},
{
"epoch": 0.8895866038723181,
"grad_norm": 18.038523734567626,
"learning_rate": 1.8358389280311303e-08,
"logits/chosen": -2.706275463104248,
"logits/rejected": -2.649019718170166,
"logps/chosen": -383.904296875,
"logps/rejected": -418.57098388671875,
"loss": 0.5314,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.1623432636260986,
"rewards/margins": 0.5158635377883911,
"rewards/rejected": -1.6782068014144897,
"step": 1700
},
{
"epoch": 0.8948194662480377,
"grad_norm": 19.626755939860992,
"learning_rate": 1.6678653324693787e-08,
"logits/chosen": -2.744741916656494,
"logits/rejected": -2.661057710647583,
"logps/chosen": -415.6158142089844,
"logps/rejected": -489.20001220703125,
"loss": 0.5109,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9210155606269836,
"rewards/margins": 0.8741633296012878,
"rewards/rejected": -1.795178771018982,
"step": 1710
},
{
"epoch": 0.9000523286237572,
"grad_norm": 19.57980158791374,
"learning_rate": 1.507684480352292e-08,
"logits/chosen": -2.717519998550415,
"logits/rejected": -2.6023640632629395,
"logps/chosen": -398.86126708984375,
"logps/rejected": -399.9064636230469,
"loss": 0.5261,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1560735702514648,
"rewards/margins": 0.4242979884147644,
"rewards/rejected": -1.580371618270874,
"step": 1720
},
{
"epoch": 0.9052851909994767,
"grad_norm": 16.818277672769657,
"learning_rate": 1.3553498707832761e-08,
"logits/chosen": -2.6509828567504883,
"logits/rejected": -2.614760637283325,
"logps/chosen": -306.6497802734375,
"logps/rejected": -347.5115051269531,
"loss": 0.4896,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8303035497665405,
"rewards/margins": 0.8390544652938843,
"rewards/rejected": -1.6693580150604248,
"step": 1730
},
{
"epoch": 0.9105180533751962,
"grad_norm": 17.711495458783695,
"learning_rate": 1.2109123822844653e-08,
"logits/chosen": -2.7017340660095215,
"logits/rejected": -2.539865732192993,
"logps/chosen": -346.7171936035156,
"logps/rejected": -381.2705383300781,
"loss": 0.5339,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.024086356163025,
"rewards/margins": 0.7089843153953552,
"rewards/rejected": -1.7330706119537354,
"step": 1740
},
{
"epoch": 0.9157509157509157,
"grad_norm": 16.25718099914919,
"learning_rate": 1.0744202558037014e-08,
"logits/chosen": -2.7817070484161377,
"logits/rejected": -2.7584192752838135,
"logps/chosen": -422.7364196777344,
"logps/rejected": -449.0599060058594,
"loss": 0.5541,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0160216093063354,
"rewards/margins": 0.6888941526412964,
"rewards/rejected": -1.7049156427383423,
"step": 1750
},
{
"epoch": 0.9209837781266352,
"grad_norm": 19.79616779504436,
"learning_rate": 9.459190786024696e-09,
"logits/chosen": -2.729013442993164,
"logits/rejected": -2.692884683609009,
"logps/chosen": -314.32562255859375,
"logps/rejected": -348.45233154296875,
"loss": 0.4649,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9323883056640625,
"rewards/margins": 0.5154863595962524,
"rewards/rejected": -1.4478747844696045,
"step": 1760
},
{
"epoch": 0.9262166405023547,
"grad_norm": 15.466044673164172,
"learning_rate": 8.254517690300944e-09,
"logits/chosen": -2.5260634422302246,
"logits/rejected": -2.4602036476135254,
"logps/chosen": -379.27996826171875,
"logps/rejected": -432.5758361816406,
"loss": 0.4982,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.931941032409668,
"rewards/margins": 0.9537526965141296,
"rewards/rejected": -1.8856935501098633,
"step": 1770
},
{
"epoch": 0.9314495028780743,
"grad_norm": 14.105513003072497,
"learning_rate": 7.130585621893809e-09,
"logits/chosen": -2.612941026687622,
"logits/rejected": -2.601433277130127,
"logps/chosen": -337.0567932128906,
"logps/rejected": -356.3701477050781,
"loss": 0.5522,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0178024768829346,
"rewards/margins": 0.5061073303222656,
"rewards/rejected": -1.5239098072052002,
"step": 1780
},
{
"epoch": 0.9366823652537938,
"grad_norm": 17.67412326556611,
"learning_rate": 6.0877699649840574e-09,
"logits/chosen": -2.676441192626953,
"logits/rejected": -2.6985154151916504,
"logps/chosen": -410.93975830078125,
"logps/rejected": -474.650390625,
"loss": 0.5175,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8644577860832214,
"rewards/margins": 0.6866751313209534,
"rewards/rejected": -1.5511329174041748,
"step": 1790
},
{
"epoch": 0.9419152276295133,
"grad_norm": 21.08771075280092,
"learning_rate": 5.126419011529992e-09,
"logits/chosen": -2.514744997024536,
"logits/rejected": -2.462218999862671,
"logps/chosen": -340.0183410644531,
"logps/rejected": -472.80145263671875,
"loss": 0.5,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1128913164138794,
"rewards/margins": 1.0394203662872314,
"rewards/rejected": -2.1523118019104004,
"step": 1800
},
{
"epoch": 0.9471480900052328,
"grad_norm": 20.973025190997458,
"learning_rate": 4.246853844940723e-09,
"logits/chosen": -2.711920738220215,
"logits/rejected": -2.6448938846588135,
"logps/chosen": -358.5855407714844,
"logps/rejected": -377.30828857421875,
"loss": 0.5545,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0431839227676392,
"rewards/margins": 0.5513032078742981,
"rewards/rejected": -1.5944870710372925,
"step": 1810
},
{
"epoch": 0.9523809523809523,
"grad_norm": 19.679618715790966,
"learning_rate": 3.449368232836869e-09,
"logits/chosen": -2.5656230449676514,
"logits/rejected": -2.5163912773132324,
"logps/chosen": -295.70684814453125,
"logps/rejected": -333.41778564453125,
"loss": 0.5332,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8301293253898621,
"rewards/margins": 0.7227026224136353,
"rewards/rejected": -1.5528318881988525,
"step": 1820
},
{
"epoch": 0.957613814756672,
"grad_norm": 23.243692564813756,
"learning_rate": 2.734228528934679e-09,
"logits/chosen": -2.750614881515503,
"logits/rejected": -2.7145471572875977,
"logps/chosen": -371.66656494140625,
"logps/rejected": -392.2098693847656,
"loss": 0.5439,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9127969741821289,
"rewards/margins": 0.6625674366950989,
"rewards/rejected": -1.575364351272583,
"step": 1830
},
{
"epoch": 0.9628466771323915,
"grad_norm": 19.814979577148716,
"learning_rate": 2.1016735840859447e-09,
"logits/chosen": -2.708528995513916,
"logits/rejected": -2.5766711235046387,
"logps/chosen": -431.52508544921875,
"logps/rejected": -454.4827575683594,
"loss": 0.5108,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0717358589172363,
"rewards/margins": 0.8303524255752563,
"rewards/rejected": -1.9020881652832031,
"step": 1840
},
{
"epoch": 0.968079539508111,
"grad_norm": 17.66585714924501,
"learning_rate": 1.551914666503812e-09,
"logits/chosen": -2.656337261199951,
"logits/rejected": -2.6318295001983643,
"logps/chosen": -453.3070373535156,
"logps/rejected": -438.5719299316406,
"loss": 0.5218,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8247343897819519,
"rewards/margins": 0.6178687810897827,
"rewards/rejected": -1.4426031112670898,
"step": 1850
},
{
"epoch": 0.9733124018838305,
"grad_norm": 19.12046100454679,
"learning_rate": 1.0851353912008642e-09,
"logits/chosen": -2.67498779296875,
"logits/rejected": -2.5374255180358887,
"logps/chosen": -420.9612731933594,
"logps/rejected": -403.1297302246094,
"loss": 0.5131,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0306751728057861,
"rewards/margins": 0.6978103518486023,
"rewards/rejected": -1.7284857034683228,
"step": 1860
},
{
"epoch": 0.97854526425955,
"grad_norm": 23.698665321678508,
"learning_rate": 7.014916586632336e-10,
"logits/chosen": -2.6748759746551514,
"logits/rejected": -2.589711904525757,
"logps/chosen": -330.1553649902344,
"logps/rejected": -370.8642578125,
"loss": 0.5288,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8707270622253418,
"rewards/margins": 0.5380834341049194,
"rewards/rejected": -1.4088103771209717,
"step": 1870
},
{
"epoch": 0.9837781266352695,
"grad_norm": 14.404582481778125,
"learning_rate": 4.011116027811956e-10,
"logits/chosen": -2.7190022468566895,
"logits/rejected": -2.7610325813293457,
"logps/chosen": -337.45513916015625,
"logps/rejected": -476.40069580078125,
"loss": 0.505,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.878827691078186,
"rewards/margins": 0.7724133133888245,
"rewards/rejected": -1.6512409448623657,
"step": 1880
},
{
"epoch": 0.989010989010989,
"grad_norm": 20.80788365647563,
"learning_rate": 1.840955480532924e-10,
"logits/chosen": -2.709712505340576,
"logits/rejected": -2.666142225265503,
"logps/chosen": -482.58636474609375,
"logps/rejected": -484.43060302734375,
"loss": 0.513,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9788215756416321,
"rewards/margins": 0.6121380925178528,
"rewards/rejected": -1.5909597873687744,
"step": 1890
},
{
"epoch": 0.9942438513867086,
"grad_norm": 19.04109274522655,
"learning_rate": 5.051597607894087e-11,
"logits/chosen": -2.6989779472351074,
"logits/rejected": -2.5977814197540283,
"logps/chosen": -321.80120849609375,
"logps/rejected": -401.3192138671875,
"loss": 0.5193,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0180636644363403,
"rewards/margins": 0.8546449542045593,
"rewards/rejected": -1.8727085590362549,
"step": 1900
},
{
"epoch": 0.9994767137624281,
"grad_norm": 21.903615655450324,
"learning_rate": 4.1750135001961117e-13,
"logits/chosen": -2.6709775924682617,
"logits/rejected": -2.6583666801452637,
"logps/chosen": -406.5054626464844,
"logps/rejected": -506.41973876953125,
"loss": 0.5064,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8070088624954224,
"rewards/margins": 1.0546354055404663,
"rewards/rejected": -1.8616443872451782,
"step": 1910
},
{
"epoch": 1.0,
"step": 1911,
"total_flos": 0.0,
"train_loss": 0.5618298566509827,
"train_runtime": 11326.5504,
"train_samples_per_second": 5.397,
"train_steps_per_second": 0.169
}
],
"logging_steps": 10,
"max_steps": 1911,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}