kto-llama3 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9959925193694897,
"eval_steps": 400,
"global_step": 233,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02137323002938819,
"grad_norm": 0.4608515202999115,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -1.7747037410736084,
"logits/rejected": -1.6486629247665405,
"logps/chosen": -247.47836303710938,
"logps/ref_chosen": -247.4757537841797,
"logps/ref_rejected": -250.2177734375,
"logps/rejected": -250.17874145507812,
"loss": 0.5,
"rewards/accuracies": 0.29374998807907104,
"rewards/chosen": -2.605724148452282e-05,
"rewards/margins": -0.00041639525443315506,
"rewards/rejected": 0.0003903379547409713,
"step": 5
},
{
"epoch": 0.04274646005877638,
"grad_norm": 0.426495224237442,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -1.7335236072540283,
"logits/rejected": -1.6989978551864624,
"logps/chosen": -222.6909637451172,
"logps/ref_chosen": -222.6491241455078,
"logps/ref_rejected": -223.95663452148438,
"logps/rejected": -223.9930877685547,
"loss": 0.5,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00041838129982352257,
"rewards/margins": -5.400222653406672e-05,
"rewards/rejected": -0.00036437893868424,
"step": 10
},
{
"epoch": 0.06411969008816458,
"grad_norm": 0.4453659653663635,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.9023773670196533,
"logits/rejected": -1.789849042892456,
"logps/chosen": -218.5724334716797,
"logps/ref_chosen": -218.7084503173828,
"logps/ref_rejected": -224.755615234375,
"logps/rejected": -224.6824493408203,
"loss": 0.5,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0013600520323961973,
"rewards/margins": 0.0006284656701609492,
"rewards/rejected": 0.000731586420442909,
"step": 15
},
{
"epoch": 0.08549292011755276,
"grad_norm": 0.5101017951965332,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -1.7127611637115479,
"logits/rejected": -1.6293315887451172,
"logps/chosen": -226.1074676513672,
"logps/ref_chosen": -226.7457275390625,
"logps/ref_rejected": -235.77908325195312,
"logps/rejected": -235.2657928466797,
"loss": 0.4999,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.006382433231920004,
"rewards/margins": 0.0012494683032855392,
"rewards/rejected": 0.005132964812219143,
"step": 20
},
{
"epoch": 0.10686615014694095,
"grad_norm": 0.4738335609436035,
"learning_rate": 4.999717571181741e-07,
"logits/chosen": -1.6099249124526978,
"logits/rejected": -1.5539109706878662,
"logps/chosen": -229.36843872070312,
"logps/ref_chosen": -230.34494018554688,
"logps/ref_rejected": -231.64236450195312,
"logps/rejected": -230.74813842773438,
"loss": 0.4999,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.009765096008777618,
"rewards/margins": 0.000822968955617398,
"rewards/rejected": 0.008942126296460629,
"step": 25
},
{
"epoch": 0.12823938017632916,
"grad_norm": 0.4367460608482361,
"learning_rate": 4.98983926127519e-07,
"logits/chosen": -1.6448577642440796,
"logits/rejected": -1.560329794883728,
"logps/chosen": -239.9384002685547,
"logps/ref_chosen": -241.2040557861328,
"logps/ref_rejected": -253.18862915039062,
"logps/rejected": -251.95547485351562,
"loss": 0.4998,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.012656150385737419,
"rewards/margins": 0.00032438611378893256,
"rewards/rejected": 0.012331764213740826,
"step": 30
},
{
"epoch": 0.14961261020571734,
"grad_norm": 0.5036317706108093,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": -1.65009343624115,
"logits/rejected": -1.6685165166854858,
"logps/chosen": -240.6787109375,
"logps/ref_chosen": -242.33291625976562,
"logps/ref_rejected": -237.6911163330078,
"logps/rejected": -236.1189422607422,
"loss": 0.4997,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.016541726887226105,
"rewards/margins": 0.0008201012387871742,
"rewards/rejected": 0.015721624717116356,
"step": 35
},
{
"epoch": 0.17098584023510552,
"grad_norm": 0.5212914347648621,
"learning_rate": 4.928044706128802e-07,
"logits/chosen": -1.6572792530059814,
"logits/rejected": -1.6342990398406982,
"logps/chosen": -224.078857421875,
"logps/ref_chosen": -226.43637084960938,
"logps/ref_rejected": -224.00546264648438,
"logps/rejected": -221.7003173828125,
"loss": 0.4996,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.02357516996562481,
"rewards/margins": 0.0005238516023382545,
"rewards/rejected": 0.023051317781209946,
"step": 40
},
{
"epoch": 0.19235907026449373,
"grad_norm": 0.5110143423080444,
"learning_rate": 4.876477354446189e-07,
"logits/chosen": -1.4905364513397217,
"logits/rejected": -1.3957011699676514,
"logps/chosen": -216.25308227539062,
"logps/ref_chosen": -219.16494750976562,
"logps/ref_rejected": -227.38040161132812,
"logps/rejected": -224.87564086914062,
"loss": 0.4994,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.02911846712231636,
"rewards/margins": 0.004071122966706753,
"rewards/rejected": 0.025047341361641884,
"step": 45
},
{
"epoch": 0.2137323002938819,
"grad_norm": 0.48523762822151184,
"learning_rate": 4.811492353977365e-07,
"logits/chosen": -1.7010364532470703,
"logits/rejected": -1.6736198663711548,
"logps/chosen": -218.8837127685547,
"logps/ref_chosen": -221.23171997070312,
"logps/ref_rejected": -223.6177215576172,
"logps/rejected": -221.6636199951172,
"loss": 0.4993,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.023480093106627464,
"rewards/margins": 0.0039388458244502544,
"rewards/rejected": 0.019541248679161072,
"step": 50
},
{
"epoch": 0.2351055303232701,
"grad_norm": 0.4816797971725464,
"learning_rate": 4.7334566116112327e-07,
"logits/chosen": -1.62349534034729,
"logits/rejected": -1.5281016826629639,
"logps/chosen": -237.206787109375,
"logps/ref_chosen": -239.38412475585938,
"logps/ref_rejected": -245.71304321289062,
"logps/rejected": -244.2113800048828,
"loss": 0.4989,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.021773329004645348,
"rewards/margins": 0.006756873335689306,
"rewards/rejected": 0.015016456134617329,
"step": 55
},
{
"epoch": 0.2564787603526583,
"grad_norm": 0.5273976922035217,
"learning_rate": 4.6428107190419983e-07,
"logits/chosen": -1.6468950510025024,
"logits/rejected": -1.599461317062378,
"logps/chosen": -228.3268585205078,
"logps/ref_chosen": -231.1789093017578,
"logps/ref_rejected": -231.9095001220703,
"logps/rejected": -229.9440460205078,
"loss": 0.4988,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.028520625084638596,
"rewards/margins": 0.008865959011018276,
"rewards/rejected": 0.019654670730233192,
"step": 60
},
{
"epoch": 0.2778519903820465,
"grad_norm": 0.47698166966438293,
"learning_rate": 4.540066465177783e-07,
"logits/chosen": -1.7030376195907593,
"logits/rejected": -1.7270011901855469,
"logps/chosen": -218.37466430664062,
"logps/ref_chosen": -222.1732635498047,
"logps/ref_rejected": -221.90371704101562,
"logps/rejected": -219.0262451171875,
"loss": 0.4985,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.03798612207174301,
"rewards/margins": 0.009211419150233269,
"rewards/rejected": 0.028774702921509743,
"step": 65
},
{
"epoch": 0.2992252204114347,
"grad_norm": 0.4908115863800049,
"learning_rate": 4.425803946568032e-07,
"logits/chosen": -1.701042890548706,
"logits/rejected": -1.642853021621704,
"logps/chosen": -237.1160430908203,
"logps/ref_chosen": -241.13235473632812,
"logps/ref_rejected": -247.3893585205078,
"logps/rejected": -243.56692504882812,
"loss": 0.4985,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.040162790566682816,
"rewards/margins": 0.0019384495681151748,
"rewards/rejected": 0.038224343210458755,
"step": 70
},
{
"epoch": 0.32059845044082286,
"grad_norm": 0.48811107873916626,
"learning_rate": 4.300668292164329e-07,
"logits/chosen": -1.6175544261932373,
"logits/rejected": -1.6155774593353271,
"logps/chosen": -223.8777618408203,
"logps/ref_chosen": -228.91860961914062,
"logps/ref_rejected": -227.78170776367188,
"logps/rejected": -223.22732543945312,
"loss": 0.4981,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.05040856450796127,
"rewards/margins": 0.004864625167101622,
"rewards/rejected": 0.04554395005106926,
"step": 75
},
{
"epoch": 0.34197168047021104,
"grad_norm": 0.5498376488685608,
"learning_rate": 4.165366020906683e-07,
"logits/chosen": -1.721421480178833,
"logits/rejected": -1.6703542470932007,
"logps/chosen": -220.573486328125,
"logps/ref_chosen": -226.90060424804688,
"logps/ref_rejected": -232.0827178955078,
"logps/rejected": -227.0341339111328,
"loss": 0.4975,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.06327112019062042,
"rewards/margins": 0.012785114347934723,
"rewards/rejected": 0.0504860058426857,
"step": 80
},
{
"epoch": 0.36334491049959927,
"grad_norm": 0.5343174338340759,
"learning_rate": 4.0206610527004607e-07,
"logits/chosen": -1.630051612854004,
"logits/rejected": -1.571542739868164,
"logps/chosen": -231.68496704101562,
"logps/ref_chosen": -237.4697723388672,
"logps/ref_rejected": -240.751953125,
"logps/rejected": -236.31600952148438,
"loss": 0.4978,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05784807354211807,
"rewards/margins": 0.013488592579960823,
"rewards/rejected": 0.0443594828248024,
"step": 85
},
{
"epoch": 0.38471814052898745,
"grad_norm": 0.5112692713737488,
"learning_rate": 3.867370395306068e-07,
"logits/chosen": -1.7595088481903076,
"logits/rejected": -1.7580636739730835,
"logps/chosen": -211.63906860351562,
"logps/ref_chosen": -217.63436889648438,
"logps/ref_rejected": -222.6137237548828,
"logps/rejected": -217.2650909423828,
"loss": 0.4977,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.05995314195752144,
"rewards/margins": 0.00646712351590395,
"rewards/rejected": 0.053486019372940063,
"step": 90
},
{
"epoch": 0.40609137055837563,
"grad_norm": 0.4654058516025543,
"learning_rate": 3.7063595314933156e-07,
"logits/chosen": -1.8619199991226196,
"logits/rejected": -1.786892294883728,
"logps/chosen": -208.5725555419922,
"logps/ref_chosen": -213.7164306640625,
"logps/ref_rejected": -228.556396484375,
"logps/rejected": -224.4815216064453,
"loss": 0.498,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.051438819617033005,
"rewards/margins": 0.010690188966691494,
"rewards/rejected": 0.04074862599372864,
"step": 95
},
{
"epoch": 0.4274646005877638,
"grad_norm": 0.5265087485313416,
"learning_rate": 3.5385375325047163e-07,
"logits/chosen": -1.6727230548858643,
"logits/rejected": -1.677062749862671,
"logps/chosen": -239.5093994140625,
"logps/ref_chosen": -245.71194458007812,
"logps/ref_rejected": -240.1134490966797,
"logps/rejected": -235.6671142578125,
"loss": 0.4967,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.06202547624707222,
"rewards/margins": 0.01756184920668602,
"rewards/rejected": 0.0444636233150959,
"step": 100
},
{
"epoch": 0.448837830617152,
"grad_norm": 0.53775554895401,
"learning_rate": 3.36485192541719e-07,
"logits/chosen": -1.8463099002838135,
"logits/rejected": -1.7264705896377563,
"logps/chosen": -224.50320434570312,
"logps/ref_chosen": -232.00527954101562,
"logps/ref_rejected": -232.0154266357422,
"logps/rejected": -225.75454711914062,
"loss": 0.4968,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0750209242105484,
"rewards/margins": 0.012411920353770256,
"rewards/rejected": 0.062609001994133,
"step": 105
},
{
"epoch": 0.4702110606465402,
"grad_norm": 0.5438077449798584,
"learning_rate": 3.186283343381213e-07,
"logits/chosen": -1.7997539043426514,
"logits/rejected": -1.7138378620147705,
"logps/chosen": -220.4825897216797,
"logps/ref_chosen": -229.9724578857422,
"logps/ref_rejected": -238.1800079345703,
"logps/rejected": -230.29736328125,
"loss": 0.4966,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.09489865601062775,
"rewards/margins": 0.016072329133749008,
"rewards/rejected": 0.07882632315158844,
"step": 110
},
{
"epoch": 0.4915842906759284,
"grad_norm": 0.5453912019729614,
"learning_rate": 3.003839988942255e-07,
"logits/chosen": -1.8438644409179688,
"logits/rejected": -1.7028881311416626,
"logps/chosen": -203.79205322265625,
"logps/ref_chosen": -214.1478729248047,
"logps/ref_rejected": -226.24618530273438,
"logps/rejected": -217.4800567626953,
"loss": 0.4968,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.1035580188035965,
"rewards/margins": 0.015896398574113846,
"rewards/rejected": 0.08766160905361176,
"step": 115
},
{
"epoch": 0.5129575207053166,
"grad_norm": 0.5030398964881897,
"learning_rate": 2.8185519417047623e-07,
"logits/chosen": -1.8514922857284546,
"logits/rejected": -1.7740070819854736,
"logps/chosen": -214.818359375,
"logps/ref_chosen": -227.9495086669922,
"logps/ref_rejected": -230.5752410888672,
"logps/rejected": -218.9449005126953,
"loss": 0.496,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.13131138682365417,
"rewards/margins": 0.015008069574832916,
"rewards/rejected": 0.11630330979824066,
"step": 120
},
{
"epoch": 0.5343307507347048,
"grad_norm": 0.5339066982269287,
"learning_rate": 2.631465342477719e-07,
"logits/chosen": -1.9007892608642578,
"logits/rejected": -1.8334102630615234,
"logps/chosen": -218.14743041992188,
"logps/ref_chosen": -232.6212158203125,
"logps/ref_rejected": -234.5932159423828,
"logps/rejected": -222.1468505859375,
"loss": 0.4958,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.1447378695011139,
"rewards/margins": 0.020274382084608078,
"rewards/rejected": 0.12446349859237671,
"step": 125
},
{
"epoch": 0.555703980764093,
"grad_norm": 0.5313855409622192,
"learning_rate": 2.44363648673827e-07,
"logits/chosen": -1.7636210918426514,
"logits/rejected": -1.7406389713287354,
"logps/chosen": -211.9698944091797,
"logps/ref_chosen": -226.790771484375,
"logps/ref_rejected": -231.8648223876953,
"logps/rejected": -219.543212890625,
"loss": 0.4945,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.1482090801000595,
"rewards/margins": 0.024992961436510086,
"rewards/rejected": 0.12321610748767853,
"step": 130
},
{
"epoch": 0.5770772107934812,
"grad_norm": 0.5537051558494568,
"learning_rate": 2.2561258607618294e-07,
"logits/chosen": -1.8008477687835693,
"logits/rejected": -1.8080832958221436,
"logps/chosen": -234.68893432617188,
"logps/ref_chosen": -247.26119995117188,
"logps/ref_rejected": -241.82345581054688,
"logps/rejected": -231.585693359375,
"loss": 0.4949,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.12572243809700012,
"rewards/margins": 0.023345012217760086,
"rewards/rejected": 0.10237739980220795,
"step": 135
},
{
"epoch": 0.5984504408228694,
"grad_norm": 0.5528976321220398,
"learning_rate": 2.069992154090854e-07,
"logits/chosen": -1.775397539138794,
"logits/rejected": -1.6931631565093994,
"logps/chosen": -219.74072265625,
"logps/ref_chosen": -230.71826171875,
"logps/ref_rejected": -227.7001953125,
"logps/rejected": -218.38241577148438,
"loss": 0.495,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.10977540910243988,
"rewards/margins": 0.01659761555492878,
"rewards/rejected": 0.09317778795957565,
"step": 140
},
{
"epoch": 0.6198236708522575,
"grad_norm": 0.5473525524139404,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": -1.7711913585662842,
"logits/rejected": -1.7026926279067993,
"logps/chosen": -195.3854217529297,
"logps/ref_chosen": -208.07254028320312,
"logps/ref_rejected": -210.4279022216797,
"logps/rejected": -199.79165649414062,
"loss": 0.4946,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.12687113881111145,
"rewards/margins": 0.020508771762251854,
"rewards/rejected": 0.10636236518621445,
"step": 145
},
{
"epoch": 0.6411969008816457,
"grad_norm": 0.5966719388961792,
"learning_rate": 1.7060454527421686e-07,
"logits/chosen": -1.8688771724700928,
"logits/rejected": -1.810694932937622,
"logps/chosen": -211.9062042236328,
"logps/ref_chosen": -224.8968505859375,
"logps/ref_rejected": -226.1548309326172,
"logps/rejected": -215.7084503173828,
"loss": 0.4943,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.12990659475326538,
"rewards/margins": 0.02544253133237362,
"rewards/rejected": 0.10446406900882721,
"step": 150
},
{
"epoch": 0.6625701309110339,
"grad_norm": 0.5334843993186951,
"learning_rate": 1.5302873099680374e-07,
"logits/chosen": -1.786595344543457,
"logits/rejected": -1.7971456050872803,
"logps/chosen": -225.0083465576172,
"logps/ref_chosen": -237.4626922607422,
"logps/ref_rejected": -234.39547729492188,
"logps/rejected": -223.2943572998047,
"loss": 0.4955,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.12454362213611603,
"rewards/margins": 0.013532285578548908,
"rewards/rejected": 0.1110113263130188,
"step": 155
},
{
"epoch": 0.6839433609404221,
"grad_norm": 0.5639063715934753,
"learning_rate": 1.360004188562841e-07,
"logits/chosen": -2.0527145862579346,
"logits/rejected": -1.9811140298843384,
"logps/chosen": -217.0570068359375,
"logps/ref_chosen": -231.03369140625,
"logps/ref_rejected": -232.6383819580078,
"logps/rejected": -220.0625457763672,
"loss": 0.4952,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.1397666186094284,
"rewards/margins": 0.014008410274982452,
"rewards/rejected": 0.12575821578502655,
"step": 160
},
{
"epoch": 0.7053165909698104,
"grad_norm": 0.5417853593826294,
"learning_rate": 1.1961575111603586e-07,
"logits/chosen": -1.8371235132217407,
"logits/rejected": -1.7954612970352173,
"logps/chosen": -220.7694854736328,
"logps/ref_chosen": -234.5041046142578,
"logps/ref_rejected": -235.61181640625,
"logps/rejected": -224.56640625,
"loss": 0.4944,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.1373465657234192,
"rewards/margins": 0.026892542839050293,
"rewards/rejected": 0.1104540079832077,
"step": 165
},
{
"epoch": 0.7266898209991985,
"grad_norm": 0.565830409526825,
"learning_rate": 1.0396723600754143e-07,
"logits/chosen": -1.8288425207138062,
"logits/rejected": -1.83499276638031,
"logps/chosen": -213.2861785888672,
"logps/ref_chosen": -227.1809844970703,
"logps/ref_rejected": -230.8953094482422,
"logps/rejected": -218.4414520263672,
"loss": 0.4954,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.13894793391227722,
"rewards/margins": 0.014409348368644714,
"rewards/rejected": 0.12453857809305191,
"step": 170
},
{
"epoch": 0.7480630510285867,
"grad_norm": 0.5855058431625366,
"learning_rate": 8.914322542666822e-08,
"logits/chosen": -1.8145122528076172,
"logits/rejected": -1.7646887302398682,
"logps/chosen": -212.070068359375,
"logps/ref_chosen": -224.17794799804688,
"logps/ref_rejected": -225.526123046875,
"logps/rejected": -214.7656707763672,
"loss": 0.4947,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.12107895314693451,
"rewards/margins": 0.013474419713020325,
"rewards/rejected": 0.10760452598333359,
"step": 175
},
{
"epoch": 0.7694362810579749,
"grad_norm": 0.6223751902580261,
"learning_rate": 7.522741609672193e-08,
"logits/chosen": -1.8675405979156494,
"logits/rejected": -1.8476943969726562,
"logps/chosen": -216.3776092529297,
"logps/ref_chosen": -230.77182006835938,
"logps/ref_rejected": -227.00619506835938,
"logps/rejected": -214.32931518554688,
"loss": 0.4945,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.1439422070980072,
"rewards/margins": 0.017173700034618378,
"rewards/rejected": 0.12676851451396942,
"step": 180
},
{
"epoch": 0.7908095110873631,
"grad_norm": 0.5778200030326843,
"learning_rate": 6.229837701471644e-08,
"logits/chosen": -1.9124794006347656,
"logits/rejected": -1.8135532140731812,
"logps/chosen": -216.97702026367188,
"logps/ref_chosen": -229.8362274169922,
"logps/ref_rejected": -233.65390014648438,
"logps/rejected": -222.93417358398438,
"loss": 0.4945,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.1285921037197113,
"rewards/margins": 0.021394768729805946,
"rewards/rejected": 0.10719730705022812,
"step": 185
},
{
"epoch": 0.8121827411167513,
"grad_norm": 0.5558175444602966,
"learning_rate": 5.0429105848910996e-08,
"logits/chosen": -1.9621855020523071,
"logits/rejected": -1.9175077676773071,
"logps/chosen": -215.39450073242188,
"logps/ref_chosen": -229.72836303710938,
"logps/ref_rejected": -233.65237426757812,
"logps/rejected": -222.21798706054688,
"loss": 0.4937,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.14333853125572205,
"rewards/margins": 0.028994807973504066,
"rewards/rejected": 0.11434372514486313,
"step": 190
},
{
"epoch": 0.8335559711461394,
"grad_norm": 0.5308636426925659,
"learning_rate": 3.968661679220467e-08,
"logits/chosen": -1.971208930015564,
"logits/rejected": -1.9112732410430908,
"logps/chosen": -210.79598999023438,
"logps/ref_chosen": -224.2023468017578,
"logps/ref_rejected": -224.3248748779297,
"logps/rejected": -212.8175811767578,
"loss": 0.4932,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.1340634524822235,
"rewards/margins": 0.018990488722920418,
"rewards/rejected": 0.11507296562194824,
"step": 195
},
{
"epoch": 0.8549292011755276,
"grad_norm": 0.615912675857544,
"learning_rate": 3.013156219837776e-08,
"logits/chosen": -1.7899879217147827,
"logits/rejected": -1.6696176528930664,
"logps/chosen": -215.92288208007812,
"logps/ref_chosen": -228.88381958007812,
"logps/ref_rejected": -231.0583953857422,
"logps/rejected": -220.5959930419922,
"loss": 0.4932,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.12960924208164215,
"rewards/margins": 0.024985069409012794,
"rewards/rejected": 0.1046241745352745,
"step": 200
},
{
"epoch": 0.8763024312049158,
"grad_norm": 0.590220034122467,
"learning_rate": 2.1817890137430932e-08,
"logits/chosen": -1.81471848487854,
"logits/rejected": -1.714023232460022,
"logps/chosen": -205.69888305664062,
"logps/ref_chosen": -221.30752563476562,
"logps/ref_rejected": -224.98486328125,
"logps/rejected": -211.78884887695312,
"loss": 0.4937,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.15608620643615723,
"rewards/margins": 0.024126073345541954,
"rewards/rejected": 0.13196012377738953,
"step": 205
},
{
"epoch": 0.897675661234304,
"grad_norm": 0.5369106531143188,
"learning_rate": 1.479253980347392e-08,
"logits/chosen": -1.8037662506103516,
"logits/rejected": -1.7787643671035767,
"logps/chosen": -225.9608612060547,
"logps/ref_chosen": -241.4657440185547,
"logps/ref_rejected": -241.3707733154297,
"logps/rejected": -228.4087371826172,
"loss": 0.4931,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.15504886209964752,
"rewards/margins": 0.025428583845496178,
"rewards/rejected": 0.1296202689409256,
"step": 210
},
{
"epoch": 0.9190488912636923,
"grad_norm": 0.5737273097038269,
"learning_rate": 9.095176494896661e-09,
"logits/chosen": -1.8023388385772705,
"logits/rejected": -1.7160924673080444,
"logps/chosen": -218.32034301757812,
"logps/ref_chosen": -231.6717071533203,
"logps/ref_rejected": -236.741943359375,
"logps/rejected": -225.2128448486328,
"loss": 0.4933,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.13351376354694366,
"rewards/margins": 0.018222931772470474,
"rewards/rejected": 0.11529083549976349,
"step": 215
},
{
"epoch": 0.9404221212930804,
"grad_norm": 0.6087775826454163,
"learning_rate": 4.757967663132689e-09,
"logits/chosen": -1.833620309829712,
"logits/rejected": -1.7870299816131592,
"logps/chosen": -221.86032104492188,
"logps/ref_chosen": -236.0878448486328,
"logps/ref_rejected": -230.54141235351562,
"logps/rejected": -218.8464813232422,
"loss": 0.4935,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.14227530360221863,
"rewards/margins": 0.025325754657387733,
"rewards/rejected": 0.11694953590631485,
"step": 220
},
{
"epoch": 0.9617953513224686,
"grad_norm": 0.6274195909500122,
"learning_rate": 1.8054012944479224e-09,
"logits/chosen": -1.7650978565216064,
"logits/rejected": -1.7383601665496826,
"logps/chosen": -231.64111328125,
"logps/ref_chosen": -244.44155883789062,
"logps/ref_rejected": -240.8953094482422,
"logps/rejected": -230.3839874267578,
"loss": 0.4932,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.12800416350364685,
"rewards/margins": 0.022890925407409668,
"rewards/rejected": 0.10511324554681778,
"step": 225
},
{
"epoch": 0.9831685813518568,
"grad_norm": 0.5350868105888367,
"learning_rate": 2.541476501764228e-10,
"logits/chosen": -1.8503191471099854,
"logits/rejected": -1.878313660621643,
"logps/chosen": -206.16665649414062,
"logps/ref_chosen": -219.6629638671875,
"logps/ref_rejected": -212.42172241210938,
"logps/rejected": -200.54551696777344,
"loss": 0.494,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.13496311008930206,
"rewards/margins": 0.016201000660657883,
"rewards/rejected": 0.11876209825277328,
"step": 230
},
{
"epoch": 0.9959925193694897,
"step": 233,
"total_flos": 0.0,
"train_loss": 0.49642937480124283,
"train_runtime": 16410.2083,
"train_samples_per_second": 3.649,
"train_steps_per_second": 0.014
}
],
"logging_steps": 5,
"max_steps": 233,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
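
For reference, a minimal sketch of how this log could be inspected programmatically. The filename `trainer_state.json` and the field names (`log_history`, `step`, `loss`, `rewards/margins`, `rewards/accuracies`, `train_loss`, `train_runtime`) are taken directly from the file above; the script itself is only an illustrative reader, not part of the training run.

```python
import json

# Minimal sketch: summarize the KTO training log stored in trainer_state.json.
# Assumes the layout shown above: a "log_history" list whose per-step entries
# carry "step", "loss", "rewards/margins", and "rewards/accuracies", and whose
# final entry holds only the run-level summary (train_loss, train_runtime, ...).

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the periodic logging entries; the trailing summary entry has no
# per-step reward fields and is handled separately below.
steps = [e for e in state["log_history"] if "rewards/margins" in e]

print(f"{'step':>6} {'loss':>8} {'margin':>10} {'accuracy':>10}")
for e in steps:
    print(f"{e['step']:>6} {e['loss']:>8.4f} "
          f"{e['rewards/margins']:>10.5f} {e['rewards/accuracies']:>10.3f}")

summary = state["log_history"][-1]
print(f"\nfinal train_loss: {summary['train_loss']:.4f} "
      f"over {summary['step']} steps "
      f"({summary['train_runtime'] / 3600:.1f} h)")
```

Run from the checkpoint directory, this prints one row per logged step (every 5 steps here, per `logging_steps`) followed by the run-level loss and runtime from the final summary entry.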