7b-kto-i0 / trainer_state.json
BraylonDash's picture
Model save
3dd3a7b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 500,
"global_step": 955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -2.578385353088379,
"logits/rejected": -2.53226900100708,
"logps/chosen": -286.13739013671875,
"logps/rejected": -212.73016357421875,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -2.468435525894165,
"logits/rejected": -2.5060648918151855,
"logps/chosen": -258.7095947265625,
"logps/rejected": -233.5037384033203,
"loss": 0.5,
"rewards/accuracies": 0.3819444477558136,
"rewards/chosen": -7.664680015295744e-05,
"rewards/margins": 6.3225775193131994e-06,
"rewards/rejected": -8.296939631691203e-05,
"step": 10
},
{
"epoch": 0.02,
"learning_rate": 1.0416666666666667e-06,
"logits/chosen": -2.346015214920044,
"logits/rejected": -2.4067437648773193,
"logps/chosen": -196.97122192382812,
"logps/rejected": -193.7008056640625,
"loss": 0.5,
"rewards/accuracies": 0.39375001192092896,
"rewards/chosen": 6.571458652615547e-05,
"rewards/margins": -3.4166391742473934e-06,
"rewards/rejected": 6.913123070262372e-05,
"step": 20
},
{
"epoch": 0.03,
"learning_rate": 1.5625e-06,
"logits/chosen": -2.4858970642089844,
"logits/rejected": -2.451706886291504,
"logps/chosen": -236.32901000976562,
"logps/rejected": -208.12997436523438,
"loss": 0.5,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0012510574888437986,
"rewards/margins": 0.00014562405704054981,
"rewards/rejected": 0.0011054335627704859,
"step": 30
},
{
"epoch": 0.04,
"learning_rate": 2.0833333333333334e-06,
"logits/chosen": -2.4558634757995605,
"logits/rejected": -2.477804183959961,
"logps/chosen": -242.181640625,
"logps/rejected": -234.55661010742188,
"loss": 0.4999,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0018630999838933349,
"rewards/margins": 0.0003440978180151433,
"rewards/rejected": 0.001519002253189683,
"step": 40
},
{
"epoch": 0.05,
"learning_rate": 2.604166666666667e-06,
"logits/chosen": -2.489288806915283,
"logits/rejected": -2.5008156299591064,
"logps/chosen": -235.43099975585938,
"logps/rejected": -222.29641723632812,
"loss": 0.4998,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0019815764389932156,
"rewards/margins": 0.0009142985800281167,
"rewards/rejected": 0.0010672778589650989,
"step": 50
},
{
"epoch": 0.06,
"learning_rate": 3.125e-06,
"logits/chosen": -2.450463056564331,
"logits/rejected": -2.443624258041382,
"logps/chosen": -256.23590087890625,
"logps/rejected": -230.0203094482422,
"loss": 0.4997,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.004073253367096186,
"rewards/margins": 0.001231834408827126,
"rewards/rejected": 0.002841418841853738,
"step": 60
},
{
"epoch": 0.07,
"learning_rate": 3.6458333333333333e-06,
"logits/chosen": -2.459900379180908,
"logits/rejected": -2.4804420471191406,
"logps/chosen": -251.94174194335938,
"logps/rejected": -234.4181671142578,
"loss": 0.4995,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.004830378107726574,
"rewards/margins": 0.0021367089357227087,
"rewards/rejected": 0.002693668706342578,
"step": 70
},
{
"epoch": 0.08,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": -2.4172418117523193,
"logits/rejected": -2.3813605308532715,
"logps/chosen": -235.5308074951172,
"logps/rejected": -215.6710968017578,
"loss": 0.4992,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.006930059753358364,
"rewards/margins": 0.002805978525429964,
"rewards/rejected": 0.004124081693589687,
"step": 80
},
{
"epoch": 0.09,
"learning_rate": 4.6875000000000004e-06,
"logits/chosen": -2.4015233516693115,
"logits/rejected": -2.3940534591674805,
"logps/chosen": -226.13137817382812,
"logps/rejected": -213.4936981201172,
"loss": 0.4991,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.00892677903175354,
"rewards/margins": 0.004470665007829666,
"rewards/rejected": 0.0044561149552464485,
"step": 90
},
{
"epoch": 0.1,
"learning_rate": 4.9997324926814375e-06,
"logits/chosen": -2.382424831390381,
"logits/rejected": -2.3642446994781494,
"logps/chosen": -204.63381958007812,
"logps/rejected": -217.92910766601562,
"loss": 0.4989,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.010236050002276897,
"rewards/margins": 0.00466396939009428,
"rewards/rejected": 0.005572080612182617,
"step": 100
},
{
"epoch": 0.12,
"learning_rate": 4.996723692767927e-06,
"logits/chosen": -2.395820140838623,
"logits/rejected": -2.407099485397339,
"logps/chosen": -215.1260223388672,
"logps/rejected": -210.58309936523438,
"loss": 0.4986,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.010834300890564919,
"rewards/margins": 0.006746213883161545,
"rewards/rejected": 0.004088086076080799,
"step": 110
},
{
"epoch": 0.13,
"learning_rate": 4.9903757462135984e-06,
"logits/chosen": -2.361361503601074,
"logits/rejected": -2.4017128944396973,
"logps/chosen": -208.5069122314453,
"logps/rejected": -193.96817016601562,
"loss": 0.4984,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.009557174518704414,
"rewards/margins": 0.00605000089854002,
"rewards/rejected": 0.0035071733873337507,
"step": 120
},
{
"epoch": 0.14,
"learning_rate": 4.980697142834315e-06,
"logits/chosen": -2.3745360374450684,
"logits/rejected": -2.3689522743225098,
"logps/chosen": -226.9114990234375,
"logps/rejected": -210.3325653076172,
"loss": 0.4982,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.010273845866322517,
"rewards/margins": 0.005562370643019676,
"rewards/rejected": 0.004711476154625416,
"step": 130
},
{
"epoch": 0.15,
"learning_rate": 4.967700826904229e-06,
"logits/chosen": -2.39690899848938,
"logits/rejected": -2.4031527042388916,
"logps/chosen": -207.76968383789062,
"logps/rejected": -206.6008758544922,
"loss": 0.4978,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.011523631401360035,
"rewards/margins": 0.009559462778270245,
"rewards/rejected": 0.0019641686230897903,
"step": 140
},
{
"epoch": 0.16,
"learning_rate": 4.951404179843963e-06,
"logits/chosen": -2.2971677780151367,
"logits/rejected": -2.3256301879882812,
"logps/chosen": -223.7987823486328,
"logps/rejected": -211.0409393310547,
"loss": 0.498,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.007091984152793884,
"rewards/margins": 0.009901536628603935,
"rewards/rejected": -0.002809552475810051,
"step": 150
},
{
"epoch": 0.17,
"learning_rate": 4.931828996974498e-06,
"logits/chosen": -2.3667495250701904,
"logits/rejected": -2.2750391960144043,
"logps/chosen": -207.93814086914062,
"logps/rejected": -217.41806030273438,
"loss": 0.497,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.00839292537420988,
"rewards/margins": 0.015968123450875282,
"rewards/rejected": -0.0075751966796815395,
"step": 160
},
{
"epoch": 0.18,
"learning_rate": 4.909001458367867e-06,
"logits/chosen": -2.3504929542541504,
"logits/rejected": -2.328986644744873,
"logps/chosen": -262.8653259277344,
"logps/rejected": -238.43017578125,
"loss": 0.4971,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.005286640953272581,
"rewards/margins": 0.01585621014237404,
"rewards/rejected": -0.010569569654762745,
"step": 170
},
{
"epoch": 0.19,
"learning_rate": 4.882952093833628e-06,
"logits/chosen": -2.2022526264190674,
"logits/rejected": -2.163339138031006,
"logps/chosen": -211.5063018798828,
"logps/rejected": -248.37081909179688,
"loss": 0.4967,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.00028603168902918696,
"rewards/margins": 0.02056037448346615,
"rewards/rejected": -0.020274341106414795,
"step": 180
},
{
"epoch": 0.2,
"learning_rate": 4.853715742087947e-06,
"logits/chosen": -2.2854294776916504,
"logits/rejected": -2.230767011642456,
"logps/chosen": -295.3899230957031,
"logps/rejected": -293.7907409667969,
"loss": 0.4955,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.026039790362119675,
"rewards/margins": 0.04283389076590538,
"rewards/rejected": -0.06887368112802505,
"step": 190
},
{
"epoch": 0.21,
"learning_rate": 4.821331504159906e-06,
"logits/chosen": -2.1649932861328125,
"logits/rejected": -2.122584819793701,
"logps/chosen": -251.9384307861328,
"logps/rejected": -293.52923583984375,
"loss": 0.4952,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.030307698994874954,
"rewards/margins": 0.03713207319378853,
"rewards/rejected": -0.06743976473808289,
"step": 200
},
{
"epoch": 0.22,
"learning_rate": 4.7858426910973435e-06,
"logits/chosen": -2.0010428428649902,
"logits/rejected": -1.9664274454116821,
"logps/chosen": -352.85986328125,
"logps/rejected": -415.43768310546875,
"loss": 0.492,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.11694659292697906,
"rewards/margins": 0.0868750587105751,
"rewards/rejected": -0.20382165908813477,
"step": 210
},
{
"epoch": 0.23,
"learning_rate": 4.747296766042161e-06,
"logits/chosen": -1.5235364437103271,
"logits/rejected": -1.551948070526123,
"logps/chosen": -565.6112060546875,
"logps/rejected": -687.0833129882812,
"loss": 0.4871,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3442026376724243,
"rewards/margins": 0.13101640343666077,
"rewards/rejected": -0.47521907091140747,
"step": 220
},
{
"epoch": 0.24,
"learning_rate": 4.705745280752586e-06,
"logits/chosen": -1.5635735988616943,
"logits/rejected": -1.5089839696884155,
"logps/chosen": -867.3118286132812,
"logps/rejected": -959.4519653320312,
"loss": 0.4819,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6232264041900635,
"rewards/margins": 0.11986882984638214,
"rewards/rejected": -0.7430952787399292,
"step": 230
},
{
"epoch": 0.25,
"learning_rate": 4.661243806657256e-06,
"logits/chosen": -1.8420759439468384,
"logits/rejected": -1.7493212223052979,
"logps/chosen": -739.1602783203125,
"logps/rejected": -1018.7802734375,
"loss": 0.4822,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5061613321304321,
"rewards/margins": 0.31740203499794006,
"rewards/rejected": -0.8235633969306946,
"step": 240
},
{
"epoch": 0.26,
"learning_rate": 4.613851860533367e-06,
"logits/chosen": -1.7359821796417236,
"logits/rejected": -1.664820909500122,
"logps/chosen": -867.6565551757812,
"logps/rejected": -1373.392333984375,
"loss": 0.4804,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6370395421981812,
"rewards/margins": 0.5256737470626831,
"rewards/rejected": -1.1627132892608643,
"step": 250
},
{
"epoch": 0.27,
"learning_rate": 4.563632824908252e-06,
"logits/chosen": -1.8760831356048584,
"logits/rejected": -1.84027898311615,
"logps/chosen": -652.6282958984375,
"logps/rejected": -1129.7745361328125,
"loss": 0.4724,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.47100549936294556,
"rewards/margins": 0.4714561402797699,
"rewards/rejected": -0.9424616694450378,
"step": 260
},
{
"epoch": 0.28,
"learning_rate": 4.510653863290871e-06,
"logits/chosen": -1.8029680252075195,
"logits/rejected": -1.7324800491333008,
"logps/chosen": -1249.8055419921875,
"logps/rejected": -1807.9056396484375,
"loss": 0.4742,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.0133898258209229,
"rewards/margins": 0.5813573002815247,
"rewards/rejected": -1.5947470664978027,
"step": 270
},
{
"epoch": 0.29,
"learning_rate": 4.454985830346574e-06,
"logits/chosen": -1.9399795532226562,
"logits/rejected": -1.8146251440048218,
"logps/chosen": -902.8073120117188,
"logps/rejected": -1436.665283203125,
"loss": 0.469,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.6662728786468506,
"rewards/margins": 0.5495506525039673,
"rewards/rejected": -1.2158234119415283,
"step": 280
},
{
"epoch": 0.3,
"learning_rate": 4.396703177135262e-06,
"logits/chosen": -1.8842103481292725,
"logits/rejected": -1.7202155590057373,
"logps/chosen": -1762.968994140625,
"logps/rejected": -1725.986572265625,
"loss": 0.4717,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -1.510768175125122,
"rewards/margins": 0.030063262209296227,
"rewards/rejected": -1.540831446647644,
"step": 290
},
{
"epoch": 0.31,
"learning_rate": 4.335883851539693e-06,
"logits/chosen": -2.0325398445129395,
"logits/rejected": -1.8330217599868774,
"logps/chosen": -1008.1095581054688,
"logps/rejected": -1700.26171875,
"loss": 0.4671,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.7875405550003052,
"rewards/margins": 0.731080174446106,
"rewards/rejected": -1.5186206102371216,
"step": 300
},
{
"epoch": 0.32,
"learning_rate": 4.2726091940171055e-06,
"logits/chosen": -1.969412088394165,
"logits/rejected": -1.8437814712524414,
"logps/chosen": -798.6140747070312,
"logps/rejected": -1872.577392578125,
"loss": 0.4564,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.5798918604850769,
"rewards/margins": 1.0649895668029785,
"rewards/rejected": -1.6448814868927002,
"step": 310
},
{
"epoch": 0.33,
"learning_rate": 4.206963828813555e-06,
"logits/chosen": -2.004281759262085,
"logits/rejected": -1.8325812816619873,
"logps/chosen": -1020.1710205078125,
"logps/rejected": -2015.131591796875,
"loss": 0.4614,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.8064894676208496,
"rewards/margins": 0.9903135299682617,
"rewards/rejected": -1.7968031167984009,
"step": 320
},
{
"epoch": 0.35,
"learning_rate": 4.139035550786495e-06,
"logits/chosen": -2.0901331901550293,
"logits/rejected": -1.9698684215545654,
"logps/chosen": -915.0389404296875,
"logps/rejected": -1435.4764404296875,
"loss": 0.4679,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.6817190051078796,
"rewards/margins": 0.5508195161819458,
"rewards/rejected": -1.2325387001037598,
"step": 330
},
{
"epoch": 0.36,
"learning_rate": 4.068915207986931e-06,
"logits/chosen": -1.9757936000823975,
"logits/rejected": -1.897470474243164,
"logps/chosen": -1229.423828125,
"logps/rejected": -1853.947265625,
"loss": 0.4667,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.007495641708374,
"rewards/margins": 0.6511304974555969,
"rewards/rejected": -1.6586261987686157,
"step": 340
},
{
"epoch": 0.37,
"learning_rate": 3.996696580158211e-06,
"logits/chosen": -1.968062162399292,
"logits/rejected": -1.8332984447479248,
"logps/chosen": -1563.551025390625,
"logps/rejected": -2647.005615234375,
"loss": 0.4615,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.3222496509552002,
"rewards/margins": 1.0978131294250488,
"rewards/rejected": -2.42006254196167,
"step": 350
},
{
"epoch": 0.38,
"learning_rate": 3.922476253313921e-06,
"logits/chosen": -2.2060952186584473,
"logits/rejected": -2.1268954277038574,
"logps/chosen": -1001.3084106445312,
"logps/rejected": -1724.416015625,
"loss": 0.4584,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7821061611175537,
"rewards/margins": 0.7285407781600952,
"rewards/rejected": -1.5106468200683594,
"step": 360
},
{
"epoch": 0.39,
"learning_rate": 3.846353490562664e-06,
"logits/chosen": -2.1300292015075684,
"logits/rejected": -2.000924587249756,
"logps/chosen": -1243.8800048828125,
"logps/rejected": -2480.47021484375,
"loss": 0.4514,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.0223863124847412,
"rewards/margins": 1.2320432662963867,
"rewards/rejected": -2.254429578781128,
"step": 370
},
{
"epoch": 0.4,
"learning_rate": 3.768430099352445e-06,
"logits/chosen": -2.221879243850708,
"logits/rejected": -2.128418207168579,
"logps/chosen": -875.8338623046875,
"logps/rejected": -1593.1365966796875,
"loss": 0.4588,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.6446818113327026,
"rewards/margins": 0.7323214411735535,
"rewards/rejected": -1.3770033121109009,
"step": 380
},
{
"epoch": 0.41,
"learning_rate": 3.6888102953122307e-06,
"logits/chosen": -2.0890746116638184,
"logits/rejected": -1.9689449071884155,
"logps/chosen": -1064.7686767578125,
"logps/rejected": -1700.5394287109375,
"loss": 0.4656,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.8423658609390259,
"rewards/margins": 0.6700539588928223,
"rewards/rejected": -1.5124199390411377,
"step": 390
},
{
"epoch": 0.42,
"learning_rate": 3.607600562872785e-06,
"logits/chosen": -2.2589426040649414,
"logits/rejected": -2.160431385040283,
"logps/chosen": -959.44140625,
"logps/rejected": -1434.484130859375,
"loss": 0.4572,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.69480961561203,
"rewards/margins": 0.5226942896842957,
"rewards/rejected": -1.2175039052963257,
"step": 400
},
{
"epoch": 0.43,
"learning_rate": 3.5249095128531863e-06,
"logits/chosen": -2.2115917205810547,
"logits/rejected": -2.127436399459839,
"logps/chosen": -1019.97509765625,
"logps/rejected": -1477.9644775390625,
"loss": 0.4597,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7589401602745056,
"rewards/margins": 0.49645981192588806,
"rewards/rejected": -1.2553999423980713,
"step": 410
},
{
"epoch": 0.44,
"learning_rate": 3.4408477372034743e-06,
"logits/chosen": -1.9758100509643555,
"logits/rejected": -1.8129494190216064,
"logps/chosen": -1256.257568359375,
"logps/rejected": -2412.14794921875,
"loss": 0.4568,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.0521903038024902,
"rewards/margins": 1.142812728881836,
"rewards/rejected": -2.1950032711029053,
"step": 420
},
{
"epoch": 0.45,
"learning_rate": 3.355527661097728e-06,
"logits/chosen": -2.1546552181243896,
"logits/rejected": -2.0900943279266357,
"logps/chosen": -718.0090942382812,
"logps/rejected": -1487.015380859375,
"loss": 0.4635,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.5029612183570862,
"rewards/margins": 0.7773032188415527,
"rewards/rejected": -1.2802644968032837,
"step": 430
},
{
"epoch": 0.46,
"learning_rate": 3.269063392575352e-06,
"logits/chosen": -2.1480519771575928,
"logits/rejected": -2.071498394012451,
"logps/chosen": -1282.3341064453125,
"logps/rejected": -2054.326171875,
"loss": 0.4626,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.0578222274780273,
"rewards/margins": 0.7977155447006226,
"rewards/rejected": -1.855538010597229,
"step": 440
},
{
"epoch": 0.47,
"learning_rate": 3.181570569931697e-06,
"logits/chosen": -1.928739309310913,
"logits/rejected": -1.8378665447235107,
"logps/chosen": -1254.6744384765625,
"logps/rejected": -2223.1025390625,
"loss": 0.4614,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.060748815536499,
"rewards/margins": 0.9665641784667969,
"rewards/rejected": -2.027312755584717,
"step": 450
},
{
"epoch": 0.48,
"learning_rate": 3.09316620706208e-06,
"logits/chosen": -2.2325401306152344,
"logits/rejected": -2.123627185821533,
"logps/chosen": -882.6212768554688,
"logps/rejected": -1591.8580322265625,
"loss": 0.459,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.6440384387969971,
"rewards/margins": 0.7315788269042969,
"rewards/rejected": -1.3756173849105835,
"step": 460
},
{
"epoch": 0.49,
"learning_rate": 3.0039685369660785e-06,
"logits/chosen": -2.02402663230896,
"logits/rejected": -1.9017149209976196,
"logps/chosen": -1743.1324462890625,
"logps/rejected": -2763.71923828125,
"loss": 0.454,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.5182123184204102,
"rewards/margins": 1.0456020832061768,
"rewards/rejected": -2.563814401626587,
"step": 470
},
{
"epoch": 0.5,
"learning_rate": 2.91409685362137e-06,
"logits/chosen": -2.046326160430908,
"logits/rejected": -1.985815405845642,
"logps/chosen": -1439.734130859375,
"logps/rejected": -2124.520263671875,
"loss": 0.4659,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -1.2466154098510742,
"rewards/margins": 0.6888442039489746,
"rewards/rejected": -1.935459852218628,
"step": 480
},
{
"epoch": 0.51,
"learning_rate": 2.8236713524386085e-06,
"logits/chosen": -2.134103775024414,
"logits/rejected": -2.0179543495178223,
"logps/chosen": -1013.251953125,
"logps/rejected": -2033.077392578125,
"loss": 0.4508,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.815003514289856,
"rewards/margins": 1.0394479036331177,
"rewards/rejected": -1.8544514179229736,
"step": 490
},
{
"epoch": 0.52,
"learning_rate": 2.7328129695107205e-06,
"logits/chosen": -2.16344952583313,
"logits/rejected": -1.9862359762191772,
"logps/chosen": -1314.334228515625,
"logps/rejected": -2926.594970703125,
"loss": 0.458,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.0824334621429443,
"rewards/margins": 1.6397478580474854,
"rewards/rejected": -2.7221813201904297,
"step": 500
},
{
"epoch": 0.53,
"learning_rate": 2.641643219871597e-06,
"logits/chosen": -2.2062978744506836,
"logits/rejected": -2.057356357574463,
"logps/chosen": -1038.0162353515625,
"logps/rejected": -2059.95458984375,
"loss": 0.4491,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.8147695660591125,
"rewards/margins": 1.0522905588150024,
"rewards/rejected": -1.8670603036880493,
"step": 510
},
{
"epoch": 0.54,
"learning_rate": 2.5502840349805074e-06,
"logits/chosen": -2.2159011363983154,
"logits/rejected": -2.0826644897460938,
"logps/chosen": -936.4302978515625,
"logps/rejected": -1992.089599609375,
"loss": 0.4483,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.6953409910202026,
"rewards/margins": 1.08708918094635,
"rewards/rejected": -1.7824300527572632,
"step": 520
},
{
"epoch": 0.55,
"learning_rate": 2.4588575996495797e-06,
"logits/chosen": -2.2215633392333984,
"logits/rejected": -2.053880214691162,
"logps/chosen": -846.26611328125,
"logps/rejected": -2448.76416015625,
"loss": 0.4513,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5850510001182556,
"rewards/margins": 1.622568130493164,
"rewards/rejected": -2.2076191902160645,
"step": 530
},
{
"epoch": 0.57,
"learning_rate": 2.367486188632446e-06,
"logits/chosen": -2.221585273742676,
"logits/rejected": -2.0297319889068604,
"logps/chosen": -1184.786865234375,
"logps/rejected": -2758.436767578125,
"loss": 0.4455,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9226329922676086,
"rewards/margins": 1.6065568923950195,
"rewards/rejected": -2.5291898250579834,
"step": 540
},
{
"epoch": 0.58,
"learning_rate": 2.276292003092593e-06,
"logits/chosen": -2.196733236312866,
"logits/rejected": -2.057121992111206,
"logps/chosen": -1445.136962890625,
"logps/rejected": -2547.5927734375,
"loss": 0.4602,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2020736932754517,
"rewards/margins": 1.1371889114379883,
"rewards/rejected": -2.3392627239227295,
"step": 550
},
{
"epoch": 0.59,
"learning_rate": 2.1853970071701415e-06,
"logits/chosen": -2.14131498336792,
"logits/rejected": -2.0334861278533936,
"logps/chosen": -913.9793090820312,
"logps/rejected": -1585.7960205078125,
"loss": 0.4608,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.7021154165267944,
"rewards/margins": 0.7157109975814819,
"rewards/rejected": -1.417826533317566,
"step": 560
},
{
"epoch": 0.6,
"learning_rate": 2.0949227648656194e-06,
"logits/chosen": -2.150709629058838,
"logits/rejected": -2.051652431488037,
"logps/chosen": -930.1282348632812,
"logps/rejected": -1828.182861328125,
"loss": 0.4531,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.7185366749763489,
"rewards/margins": 0.9112586975097656,
"rewards/rejected": -1.6297954320907593,
"step": 570
},
{
"epoch": 0.61,
"learning_rate": 2.00499027745888e-06,
"logits/chosen": -2.165860891342163,
"logits/rejected": -1.990290641784668,
"logps/chosen": -1457.848388671875,
"logps/rejected": -3095.86181640625,
"loss": 0.4532,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2225459814071655,
"rewards/margins": 1.659259557723999,
"rewards/rejected": -2.881805658340454,
"step": 580
},
{
"epoch": 0.62,
"learning_rate": 1.915719821680624e-06,
"logits/chosen": -2.033405303955078,
"logits/rejected": -1.9788004159927368,
"logps/chosen": -1339.5030517578125,
"logps/rejected": -1871.1273193359375,
"loss": 0.4542,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.1169803142547607,
"rewards/margins": 0.5490958094596863,
"rewards/rejected": -1.6660760641098022,
"step": 590
},
{
"epoch": 0.63,
"learning_rate": 1.8272307888529276e-06,
"logits/chosen": -2.2021541595458984,
"logits/rejected": -2.0622100830078125,
"logps/chosen": -1060.39404296875,
"logps/rejected": -2731.3662109375,
"loss": 0.4515,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.8206619024276733,
"rewards/margins": 1.6797775030136108,
"rewards/rejected": -2.5004396438598633,
"step": 600
},
{
"epoch": 0.64,
"learning_rate": 1.739641525213929e-06,
"logits/chosen": -2.200084686279297,
"logits/rejected": -2.0629191398620605,
"logps/chosen": -1195.5421142578125,
"logps/rejected": -2445.993408203125,
"loss": 0.4489,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.9617778658866882,
"rewards/margins": 1.294762134552002,
"rewards/rejected": -2.256540298461914,
"step": 610
},
{
"epoch": 0.65,
"learning_rate": 1.6530691736402317e-06,
"logits/chosen": -2.1470463275909424,
"logits/rejected": -2.028573513031006,
"logps/chosen": -1541.4970703125,
"logps/rejected": -2454.397705078125,
"loss": 0.4469,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3200973272323608,
"rewards/margins": 0.9341692924499512,
"rewards/rejected": -2.2542667388916016,
"step": 620
},
{
"epoch": 0.66,
"learning_rate": 1.5676295169786864e-06,
"logits/chosen": -2.1975948810577393,
"logits/rejected": -2.060920476913452,
"logps/chosen": -1427.9593505859375,
"logps/rejected": -3144.806884765625,
"loss": 0.4485,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.176941156387329,
"rewards/margins": 1.7532637119293213,
"rewards/rejected": -2.9302048683166504,
"step": 630
},
{
"epoch": 0.67,
"learning_rate": 1.4834368231970922e-06,
"logits/chosen": -2.156165838241577,
"logits/rejected": -2.045762777328491,
"logps/chosen": -1413.2603759765625,
"logps/rejected": -3491.219970703125,
"loss": 0.4488,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1946003437042236,
"rewards/margins": 2.0828354358673096,
"rewards/rejected": -3.2774360179901123,
"step": 640
},
{
"epoch": 0.68,
"learning_rate": 1.4006036925609245e-06,
"logits/chosen": -2.2174124717712402,
"logits/rejected": -2.088347911834717,
"logps/chosen": -1448.646484375,
"logps/rejected": -2385.66259765625,
"loss": 0.4474,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.2005541324615479,
"rewards/margins": 0.9563320875167847,
"rewards/rejected": -2.156886339187622,
"step": 650
},
{
"epoch": 0.69,
"learning_rate": 1.3192409070404582e-06,
"logits/chosen": -2.2460074424743652,
"logits/rejected": -2.1426172256469727,
"logps/chosen": -1455.069580078125,
"logps/rejected": -2169.728515625,
"loss": 0.4531,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -1.21938955783844,
"rewards/margins": 0.7492297887802124,
"rewards/rejected": -1.9686193466186523,
"step": 660
},
{
"epoch": 0.7,
"learning_rate": 1.2394572821496953e-06,
"logits/chosen": -2.229182481765747,
"logits/rejected": -2.115177631378174,
"logps/chosen": -1379.844482421875,
"logps/rejected": -2796.632568359375,
"loss": 0.4498,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1495221853256226,
"rewards/margins": 1.4514307975769043,
"rewards/rejected": -2.6009533405303955,
"step": 670
},
{
"epoch": 0.71,
"learning_rate": 1.1613595214152713e-06,
"logits/chosen": -2.2372703552246094,
"logits/rejected": -2.1571171283721924,
"logps/chosen": -1203.8697509765625,
"logps/rejected": -1857.9075927734375,
"loss": 0.4523,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.9586626291275024,
"rewards/margins": 0.6804057359695435,
"rewards/rejected": -1.6390682458877563,
"step": 680
},
{
"epoch": 0.72,
"learning_rate": 1.0850520736699362e-06,
"logits/chosen": -2.1900107860565186,
"logits/rejected": -2.080841064453125,
"logps/chosen": -1038.609619140625,
"logps/rejected": -1972.8609619140625,
"loss": 0.4589,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.8293488621711731,
"rewards/margins": 0.9573895335197449,
"rewards/rejected": -1.7867381572723389,
"step": 690
},
{
"epoch": 0.73,
"learning_rate": 1.0106369933615043e-06,
"logits/chosen": -2.208099126815796,
"logits/rejected": -2.082400321960449,
"logps/chosen": -1469.6568603515625,
"logps/rejected": -3010.545654296875,
"loss": 0.4501,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.243025302886963,
"rewards/margins": 1.5714446306228638,
"rewards/rejected": -2.814469814300537,
"step": 700
},
{
"epoch": 0.74,
"learning_rate": 9.382138040640714e-07,
"logits/chosen": -2.220716953277588,
"logits/rejected": -2.1422343254089355,
"logps/chosen": -1299.326904296875,
"logps/rejected": -2368.89111328125,
"loss": 0.4548,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.0415502786636353,
"rewards/margins": 1.1175090074539185,
"rewards/rejected": -2.1590590476989746,
"step": 710
},
{
"epoch": 0.75,
"learning_rate": 8.678793653740633e-07,
"logits/chosen": -2.222707509994507,
"logits/rejected": -2.1047911643981934,
"logps/chosen": -1158.9268798828125,
"logps/rejected": -2246.68212890625,
"loss": 0.4549,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9390741586685181,
"rewards/margins": 1.1018383502960205,
"rewards/rejected": -2.040912389755249,
"step": 720
},
{
"epoch": 0.76,
"learning_rate": 7.997277433690984e-07,
"logits/chosen": -2.187948226928711,
"logits/rejected": -2.105868101119995,
"logps/chosen": -1287.493896484375,
"logps/rejected": -2305.227783203125,
"loss": 0.453,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.0323898792266846,
"rewards/margins": 1.072412133216858,
"rewards/rejected": -2.104801893234253,
"step": 730
},
{
"epoch": 0.77,
"learning_rate": 7.338500848029603e-07,
"logits/chosen": -2.212477445602417,
"logits/rejected": -2.127330780029297,
"logps/chosen": -960.9691162109375,
"logps/rejected": -2517.481689453125,
"loss": 0.4508,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.7053281664848328,
"rewards/margins": 1.5753790140151978,
"rewards/rejected": -2.2807071208953857,
"step": 740
},
{
"epoch": 0.79,
"learning_rate": 6.70334495204884e-07,
"logits/chosen": -2.230347156524658,
"logits/rejected": -2.119199275970459,
"logps/chosen": -1013.6222534179688,
"logps/rejected": -2389.435302734375,
"loss": 0.4505,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7983914017677307,
"rewards/margins": 1.369321346282959,
"rewards/rejected": -2.167712688446045,
"step": 750
},
{
"epoch": 0.8,
"learning_rate": 6.092659210462232e-07,
"logits/chosen": -2.2451062202453613,
"logits/rejected": -2.2009005546569824,
"logps/chosen": -1014.0054931640625,
"logps/rejected": -2509.18310546875,
"loss": 0.4428,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7636991143226624,
"rewards/margins": 1.5256783962249756,
"rewards/rejected": -2.2893776893615723,
"step": 760
},
{
"epoch": 0.81,
"learning_rate": 5.507260361320738e-07,
"logits/chosen": -2.2545554637908936,
"logits/rejected": -2.1696648597717285,
"logps/chosen": -1256.4468994140625,
"logps/rejected": -3355.20849609375,
"loss": 0.4417,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.9988948702812195,
"rewards/margins": 2.1163055896759033,
"rewards/rejected": -3.1152002811431885,
"step": 770
},
{
"epoch": 0.82,
"learning_rate": 4.947931323697983e-07,
"logits/chosen": -2.236833333969116,
"logits/rejected": -2.116788625717163,
"logps/chosen": -1157.8489990234375,
"logps/rejected": -2627.938720703125,
"loss": 0.4404,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.9315347671508789,
"rewards/margins": 1.4900939464569092,
"rewards/rejected": -2.421628475189209,
"step": 780
},
{
"epoch": 0.83,
"learning_rate": 4.4154201506053985e-07,
"logits/chosen": -2.2656142711639404,
"logits/rejected": -2.1838631629943848,
"logps/chosen": -1444.336181640625,
"logps/rejected": -2568.44873046875,
"loss": 0.4565,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.2109791040420532,
"rewards/margins": 1.1344887018203735,
"rewards/rejected": -2.345468044281006,
"step": 790
},
{
"epoch": 0.84,
"learning_rate": 3.910439028537638e-07,
"logits/chosen": -2.1305015087127686,
"logits/rejected": -2.0338492393493652,
"logps/chosen": -1384.163330078125,
"logps/rejected": -3120.740234375,
"loss": 0.44,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1574374437332153,
"rewards/margins": 1.7521774768829346,
"rewards/rejected": -2.9096148014068604,
"step": 800
},
{
"epoch": 0.85,
"learning_rate": 3.4336633249862084e-07,
"logits/chosen": -2.1786255836486816,
"logits/rejected": -2.1090734004974365,
"logps/chosen": -1811.431640625,
"logps/rejected": -1879.762939453125,
"loss": 0.4564,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -1.5657621622085571,
"rewards/margins": 0.12173604965209961,
"rewards/rejected": -1.6874980926513672,
"step": 810
},
{
"epoch": 0.86,
"learning_rate": 2.98573068519539e-07,
"logits/chosen": -2.204667091369629,
"logits/rejected": -2.1335368156433105,
"logps/chosen": -1094.656982421875,
"logps/rejected": -2983.633056640625,
"loss": 0.4524,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.8493936657905579,
"rewards/margins": 1.935779333114624,
"rewards/rejected": -2.785172939300537,
"step": 820
},
{
"epoch": 0.87,
"learning_rate": 2.5672401793681854e-07,
"logits/chosen": -2.2261288166046143,
"logits/rejected": -2.156919002532959,
"logps/chosen": -1517.5009765625,
"logps/rejected": -2918.010986328125,
"loss": 0.4484,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.2897742986679077,
"rewards/margins": 1.39667546749115,
"rewards/rejected": -2.6864495277404785,
"step": 830
},
{
"epoch": 0.88,
"learning_rate": 2.178751501463036e-07,
"logits/chosen": -2.181530475616455,
"logits/rejected": -2.1404881477355957,
"logps/chosen": -1569.401611328125,
"logps/rejected": -2147.50927734375,
"loss": 0.4607,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -1.3682358264923096,
"rewards/margins": 0.5883899331092834,
"rewards/rejected": -1.9566256999969482,
"step": 840
},
{
"epoch": 0.89,
"learning_rate": 1.820784220652766e-07,
"logits/chosen": -2.213731050491333,
"logits/rejected": -2.1309895515441895,
"logps/chosen": -1591.1544189453125,
"logps/rejected": -2607.8837890625,
"loss": 0.4582,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.339202642440796,
"rewards/margins": 1.0517059564590454,
"rewards/rejected": -2.390908718109131,
"step": 850
},
{
"epoch": 0.9,
"learning_rate": 1.4938170864468636e-07,
"logits/chosen": -2.1618194580078125,
"logits/rejected": -2.054898738861084,
"logps/chosen": -1653.2855224609375,
"logps/rejected": -3330.978515625,
"loss": 0.453,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.4334652423858643,
"rewards/margins": 1.6885372400283813,
"rewards/rejected": -3.122002601623535,
"step": 860
},
{
"epoch": 0.91,
"learning_rate": 1.1982873884064466e-07,
"logits/chosen": -2.2253754138946533,
"logits/rejected": -2.132044792175293,
"logps/chosen": -1268.3460693359375,
"logps/rejected": -2814.267578125,
"loss": 0.4513,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.069705843925476,
"rewards/margins": 1.5451180934906006,
"rewards/rejected": -2.614823818206787,
"step": 870
},
{
"epoch": 0.92,
"learning_rate": 9.345903713082305e-08,
"logits/chosen": -2.2452187538146973,
"logits/rejected": -2.1340882778167725,
"logps/chosen": -1735.882568359375,
"logps/rejected": -2934.236328125,
"loss": 0.4454,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.4951032400131226,
"rewards/margins": 1.2158784866333008,
"rewards/rejected": -2.710981845855713,
"step": 880
},
{
"epoch": 0.93,
"learning_rate": 7.030787065396866e-08,
"logits/chosen": -2.2767229080200195,
"logits/rejected": -2.1917612552642822,
"logps/chosen": -1188.2750244140625,
"logps/rejected": -2986.377197265625,
"loss": 0.4469,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.9747017025947571,
"rewards/margins": 1.8013055324554443,
"rewards/rejected": -2.7760071754455566,
"step": 890
},
{
"epoch": 0.94,
"learning_rate": 5.0406202043228604e-08,
"logits/chosen": -2.2052557468414307,
"logits/rejected": -2.0551769733428955,
"logps/chosen": -1045.6455078125,
"logps/rejected": -2074.9453125,
"loss": 0.4586,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8170153498649597,
"rewards/margins": 1.0543075799942017,
"rewards/rejected": -1.8713228702545166,
"step": 900
},
{
"epoch": 0.95,
"learning_rate": 3.378064801637687e-08,
"logits/chosen": -2.2373902797698975,
"logits/rejected": -2.1471071243286133,
"logps/chosen": -1441.9759521484375,
"logps/rejected": -3050.029296875,
"loss": 0.4416,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.1936613321304321,
"rewards/margins": 1.6291347742080688,
"rewards/rejected": -2.822796106338501,
"step": 910
},
{
"epoch": 0.96,
"learning_rate": 2.0453443778310766e-08,
"logits/chosen": -2.1485986709594727,
"logits/rejected": -2.0270209312438965,
"logps/chosen": -1386.749755859375,
"logps/rejected": -3182.661865234375,
"loss": 0.4479,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.131797432899475,
"rewards/margins": 1.841048002243042,
"rewards/rejected": -2.9728455543518066,
"step": 920
},
{
"epoch": 0.97,
"learning_rate": 1.0442413283435759e-08,
"logits/chosen": -2.273899793624878,
"logits/rejected": -2.1478586196899414,
"logps/chosen": -800.504150390625,
"logps/rejected": -2901.39501953125,
"loss": 0.4385,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5838597416877747,
"rewards/margins": 2.1213278770446777,
"rewards/rejected": -2.7051875591278076,
"step": 930
},
{
"epoch": 0.98,
"learning_rate": 3.760945397705828e-09,
"logits/chosen": -2.2581982612609863,
"logits/rejected": -2.1440868377685547,
"logps/chosen": -1637.424072265625,
"logps/rejected": -2622.310791015625,
"loss": 0.4462,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.387995958328247,
"rewards/margins": 1.0160502195358276,
"rewards/rejected": -2.404046058654785,
"step": 940
},
{
"epoch": 0.99,
"learning_rate": 4.1797599220405605e-10,
"logits/chosen": -2.2077736854553223,
"logits/rejected": -2.0697312355041504,
"logps/chosen": -1709.1343994140625,
"logps/rejected": -3128.599853515625,
"loss": 0.4485,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.506415843963623,
"rewards/margins": 1.4321677684783936,
"rewards/rejected": -2.9385836124420166,
"step": 950
},
{
"epoch": 1.0,
"step": 955,
"total_flos": 0.0,
"train_loss": 0.46542558670043943,
"train_runtime": 22527.5186,
"train_samples_per_second": 2.714,
"train_steps_per_second": 0.042
}
],
"logging_steps": 10,
"max_steps": 955,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}