simonycl's picture
Upload folder using huggingface_hub
602bf00 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9990762978015888,
"eval_steps": 400,
"global_step": 507,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001970564689943962,
"grad_norm": 3.444017553990766,
"learning_rate": 9.803921568627451e-09,
"logits/chosen": -0.23276051878929138,
"logits/rejected": -0.43208426237106323,
"logps/chosen": -95.95150756835938,
"logps/rejected": -103.43749237060547,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.00985282344971981,
"grad_norm": 3.522148904992214,
"learning_rate": 4.901960784313725e-08,
"logits/chosen": -0.07696018368005753,
"logits/rejected": -0.3334544003009796,
"logps/chosen": -106.13592529296875,
"logps/rejected": -98.82791137695312,
"loss": 0.6931,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.000650706235319376,
"rewards/margins": 0.00010106083936989307,
"rewards/rejected": -0.00075176713289693,
"step": 5
},
{
"epoch": 0.01970564689943962,
"grad_norm": 3.219795158164747,
"learning_rate": 9.80392156862745e-08,
"logits/chosen": -0.06049077585339546,
"logits/rejected": -0.260353147983551,
"logps/chosen": -98.03811645507812,
"logps/rejected": -97.61465454101562,
"loss": 0.6932,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0005443535046651959,
"rewards/margins": -0.0005626807105727494,
"rewards/rejected": 1.8327215002500452e-05,
"step": 10
},
{
"epoch": 0.02955847034915943,
"grad_norm": 3.446656700249776,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": -0.12729588150978088,
"logits/rejected": -0.295417845249176,
"logps/chosen": -99.15818786621094,
"logps/rejected": -97.37001037597656,
"loss": 0.6931,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -7.940361683722585e-05,
"rewards/margins": -0.0002843155525624752,
"rewards/rejected": 0.00020491195027716458,
"step": 15
},
{
"epoch": 0.03941129379887924,
"grad_norm": 3.2961842224501425,
"learning_rate": 1.96078431372549e-07,
"logits/chosen": -0.11793007701635361,
"logits/rejected": -0.29867538809776306,
"logps/chosen": -103.20733642578125,
"logps/rejected": -97.68354797363281,
"loss": 0.6929,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0002625386696308851,
"rewards/margins": 0.0006132128764875233,
"rewards/rejected": -0.0003506741486489773,
"step": 20
},
{
"epoch": 0.049264117248599054,
"grad_norm": 3.486650848831331,
"learning_rate": 2.4509803921568627e-07,
"logits/chosen": -0.17689576745033264,
"logits/rejected": -0.34940794110298157,
"logps/chosen": -106.73783874511719,
"logps/rejected": -104.61811828613281,
"loss": 0.6926,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.00013853741984348744,
"rewards/margins": 0.001048876903951168,
"rewards/rejected": -0.0009103395859710872,
"step": 25
},
{
"epoch": 0.05911694069831886,
"grad_norm": 3.326203197649224,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": -0.07684741169214249,
"logits/rejected": -0.3008119761943817,
"logps/chosen": -102.79545593261719,
"logps/rejected": -101.55641174316406,
"loss": 0.692,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.0009806936141103506,
"rewards/margins": 0.003102914895862341,
"rewards/rejected": -0.0021222210489213467,
"step": 30
},
{
"epoch": 0.06896976414803867,
"grad_norm": 3.3448990755681747,
"learning_rate": 3.431372549019608e-07,
"logits/chosen": -0.05913068726658821,
"logits/rejected": -0.357162743806839,
"logps/chosen": -105.96171569824219,
"logps/rejected": -97.8105697631836,
"loss": 0.6904,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 9.926412894856185e-05,
"rewards/margins": 0.0057004219852387905,
"rewards/rejected": -0.005601157899945974,
"step": 35
},
{
"epoch": 0.07882258759775848,
"grad_norm": 3.507668256246831,
"learning_rate": 3.92156862745098e-07,
"logits/chosen": -0.08776526153087616,
"logits/rejected": -0.24237962067127228,
"logps/chosen": -90.3080825805664,
"logps/rejected": -92.34758758544922,
"loss": 0.6883,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.000393406196963042,
"rewards/margins": 0.00874106865376234,
"rewards/rejected": -0.009134475141763687,
"step": 40
},
{
"epoch": 0.0886754110474783,
"grad_norm": 3.4196928203304737,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": -0.10117539018392563,
"logits/rejected": -0.3224117159843445,
"logps/chosen": -109.0184326171875,
"logps/rejected": -108.93272399902344,
"loss": 0.6848,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.003931170795112848,
"rewards/margins": 0.021240899339318275,
"rewards/rejected": -0.025172073394060135,
"step": 45
},
{
"epoch": 0.09852823449719811,
"grad_norm": 3.5401415663183173,
"learning_rate": 4.901960784313725e-07,
"logits/chosen": -0.15071377158164978,
"logits/rejected": -0.3232310116291046,
"logps/chosen": -104.10140228271484,
"logps/rejected": -111.49967193603516,
"loss": 0.6794,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.015169525519013405,
"rewards/margins": 0.028989236801862717,
"rewards/rejected": -0.04415876418352127,
"step": 50
},
{
"epoch": 0.10838105794691791,
"grad_norm": 3.5990117325571944,
"learning_rate": 4.999050767562379e-07,
"logits/chosen": -0.08029767870903015,
"logits/rejected": -0.28052276372909546,
"logps/chosen": -99.97483825683594,
"logps/rejected": -106.4577407836914,
"loss": 0.6768,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.052169084548950195,
"rewards/margins": 0.041819095611572266,
"rewards/rejected": -0.09398818016052246,
"step": 55
},
{
"epoch": 0.11823388139663772,
"grad_norm": 3.764415157715389,
"learning_rate": 4.99519574616467e-07,
"logits/chosen": -0.13724075257778168,
"logits/rejected": -0.307682603597641,
"logps/chosen": -112.85994720458984,
"logps/rejected": -114.5948715209961,
"loss": 0.6671,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.08050791174173355,
"rewards/margins": 0.06749774515628815,
"rewards/rejected": -0.1480056643486023,
"step": 60
},
{
"epoch": 0.12808670484635754,
"grad_norm": 4.280188058589697,
"learning_rate": 4.988380179235842e-07,
"logits/chosen": -0.04185126721858978,
"logits/rejected": -0.2287481129169464,
"logps/chosen": -116.09354400634766,
"logps/rejected": -124.45157623291016,
"loss": 0.6566,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.12864899635314941,
"rewards/margins": 0.07999014109373093,
"rewards/rejected": -0.20863911509513855,
"step": 65
},
{
"epoch": 0.13793952829607734,
"grad_norm": 4.477584197323834,
"learning_rate": 4.978612153434526e-07,
"logits/chosen": -0.06294523924589157,
"logits/rejected": -0.2265748679637909,
"logps/chosen": -115.08953857421875,
"logps/rejected": -129.55136108398438,
"loss": 0.6387,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2160721719264984,
"rewards/margins": 0.10703370720148087,
"rewards/rejected": -0.32310590147972107,
"step": 70
},
{
"epoch": 0.14779235174579716,
"grad_norm": 14.949806540573308,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": -0.07772421091794968,
"logits/rejected": -0.21028895676136017,
"logps/chosen": -133.30584716796875,
"logps/rejected": -150.30581665039062,
"loss": 0.6349,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.30895400047302246,
"rewards/margins": 0.1755586862564087,
"rewards/rejected": -0.48451265692710876,
"step": 75
},
{
"epoch": 0.15764517519551696,
"grad_norm": 4.905767972397329,
"learning_rate": 4.950268573535011e-07,
"logits/chosen": -0.04325466603040695,
"logits/rejected": -0.2531152367591858,
"logps/chosen": -144.97561645507812,
"logps/rejected": -158.16122436523438,
"loss": 0.6042,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.4763612151145935,
"rewards/margins": 0.16494214534759521,
"rewards/rejected": -0.6413034200668335,
"step": 80
},
{
"epoch": 0.16749799864523676,
"grad_norm": 5.265890759040485,
"learning_rate": 4.93172664904641e-07,
"logits/chosen": -0.11157502233982086,
"logits/rejected": -0.26035866141319275,
"logps/chosen": -159.24551391601562,
"logps/rejected": -204.39495849609375,
"loss": 0.5821,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.5800860524177551,
"rewards/margins": 0.4450142979621887,
"rewards/rejected": -1.0251003503799438,
"step": 85
},
{
"epoch": 0.1773508220949566,
"grad_norm": 5.725906040480978,
"learning_rate": 4.910299485003033e-07,
"logits/chosen": -0.07629499584436417,
"logits/rejected": -0.29128947854042053,
"logps/chosen": -174.70559692382812,
"logps/rejected": -207.3603515625,
"loss": 0.5548,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.7107561230659485,
"rewards/margins": 0.3972470760345459,
"rewards/rejected": -1.10800302028656,
"step": 90
},
{
"epoch": 0.1872036455446764,
"grad_norm": 6.403573729253865,
"learning_rate": 4.886012504698769e-07,
"logits/chosen": -0.1544032096862793,
"logits/rejected": -0.37451326847076416,
"logps/chosen": -184.72915649414062,
"logps/rejected": -229.79541015625,
"loss": 0.5305,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8728511929512024,
"rewards/margins": 0.4681348204612732,
"rewards/rejected": -1.3409860134124756,
"step": 95
},
{
"epoch": 0.19705646899439622,
"grad_norm": 7.412632948891829,
"learning_rate": 4.858894524594652e-07,
"logits/chosen": -0.15470778942108154,
"logits/rejected": -0.4357427656650543,
"logps/chosen": -234.9979705810547,
"logps/rejected": -306.1597595214844,
"loss": 0.4988,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.2750121355056763,
"rewards/margins": 0.8118740916252136,
"rewards/rejected": -2.086885929107666,
"step": 100
},
{
"epoch": 0.20690929244411602,
"grad_norm": 8.591568819157517,
"learning_rate": 4.828977720128198e-07,
"logits/chosen": -0.13351579010486603,
"logits/rejected": -0.47733497619628906,
"logps/chosen": -268.23260498046875,
"logps/rejected": -370.4355163574219,
"loss": 0.5061,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7161176204681396,
"rewards/margins": 1.0792567729949951,
"rewards/rejected": -2.7953743934631348,
"step": 105
},
{
"epoch": 0.21676211589383582,
"grad_norm": 9.939985555056147,
"learning_rate": 4.796297587537285e-07,
"logits/chosen": -0.2192394733428955,
"logits/rejected": -0.42173558473587036,
"logps/chosen": -294.60870361328125,
"logps/rejected": -394.9258117675781,
"loss": 0.4457,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9103400707244873,
"rewards/margins": 1.0444176197052002,
"rewards/rejected": -2.9547572135925293,
"step": 110
},
{
"epoch": 0.22661493934355564,
"grad_norm": 9.525042212102928,
"learning_rate": 4.760892901743944e-07,
"logits/chosen": -0.20480632781982422,
"logits/rejected": -0.46547192335128784,
"logps/chosen": -295.8061218261719,
"logps/rejected": -422.6036071777344,
"loss": 0.4309,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.9569038152694702,
"rewards/margins": 1.2766072750091553,
"rewards/rejected": -3.233510971069336,
"step": 115
},
{
"epoch": 0.23646776279327544,
"grad_norm": 12.501263690005059,
"learning_rate": 4.7228056703479626e-07,
"logits/chosen": -0.21756932139396667,
"logits/rejected": -0.45747238397598267,
"logps/chosen": -335.4225158691406,
"logps/rejected": -476.47637939453125,
"loss": 0.4602,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.304030179977417,
"rewards/margins": 1.4334566593170166,
"rewards/rejected": -3.7374866008758545,
"step": 120
},
{
"epoch": 0.24632058624299527,
"grad_norm": 10.295571989073236,
"learning_rate": 4.6820810837849535e-07,
"logits/chosen": -0.24834315478801727,
"logits/rejected": -0.47704511880874634,
"logps/chosen": -360.4291687011719,
"logps/rejected": -489.4334411621094,
"loss": 0.408,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.5335352420806885,
"rewards/margins": 1.3203860521316528,
"rewards/rejected": -3.8539211750030518,
"step": 125
},
{
"epoch": 0.25617340969271507,
"grad_norm": 10.797289668730418,
"learning_rate": 4.63876746170797e-07,
"logits/chosen": -0.2283620834350586,
"logits/rejected": -0.45370951294898987,
"logps/chosen": -362.5042724609375,
"logps/rejected": -505.86968994140625,
"loss": 0.4348,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.5926971435546875,
"rewards/margins": 1.4749071598052979,
"rewards/rejected": -4.067604064941406,
"step": 130
},
{
"epoch": 0.2660262331424349,
"grad_norm": 12.27688423712984,
"learning_rate": 4.592916195656321e-07,
"logits/chosen": -0.20323459804058075,
"logits/rejected": -0.4668886065483093,
"logps/chosen": -317.35345458984375,
"logps/rejected": -467.3150329589844,
"loss": 0.4105,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.133204460144043,
"rewards/margins": 1.5337626934051514,
"rewards/rejected": -3.6669669151306152,
"step": 135
},
{
"epoch": 0.27587905659215467,
"grad_norm": 15.818054338660462,
"learning_rate": 4.544581688079602e-07,
"logits/chosen": -0.18876026570796967,
"logits/rejected": -0.47572746872901917,
"logps/chosen": -343.0673828125,
"logps/rejected": -514.2898559570312,
"loss": 0.3999,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.457926034927368,
"rewards/margins": 1.7443277835845947,
"rewards/rejected": -4.2022528648376465,
"step": 140
},
{
"epoch": 0.2857318800418745,
"grad_norm": 10.354898946060759,
"learning_rate": 4.493821287789272e-07,
"logits/chosen": -0.2576290965080261,
"logits/rejected": -0.4858337938785553,
"logps/chosen": -385.8844299316406,
"logps/rejected": -581.2125244140625,
"loss": 0.3801,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.86683988571167,
"rewards/margins": 1.994633436203003,
"rewards/rejected": -4.861473083496094,
"step": 145
},
{
"epoch": 0.2955847034915943,
"grad_norm": 14.110695862987818,
"learning_rate": 4.4406952219143934e-07,
"logits/chosen": -0.2310657501220703,
"logits/rejected": -0.44463711977005005,
"logps/chosen": -385.70623779296875,
"logps/rejected": -559.1126708984375,
"loss": 0.3314,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.8108325004577637,
"rewards/margins": 1.7815577983856201,
"rewards/rejected": -4.592390060424805,
"step": 150
},
{
"epoch": 0.3054375269413141,
"grad_norm": 15.545527108248088,
"learning_rate": 4.38526652444224e-07,
"logits/chosen": -0.26962828636169434,
"logits/rejected": -0.41183385252952576,
"logps/chosen": -400.4664611816406,
"logps/rejected": -608.4261474609375,
"loss": 0.3854,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.9346907138824463,
"rewards/margins": 2.0727694034576416,
"rewards/rejected": -5.007460594177246,
"step": 155
},
{
"epoch": 0.3152903503910339,
"grad_norm": 14.215413027429468,
"learning_rate": 4.3276009614285824e-07,
"logits/chosen": -0.22768807411193848,
"logits/rejected": -0.4115552306175232,
"logps/chosen": -361.54876708984375,
"logps/rejected": -561.5390625,
"loss": 0.3959,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.6572659015655518,
"rewards/margins": 1.9841495752334595,
"rewards/rejected": -4.641415596008301,
"step": 160
},
{
"epoch": 0.32514317384075375,
"grad_norm": 15.569517834142987,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -0.20264258980751038,
"logits/rejected": -0.40150555968284607,
"logps/chosen": -376.89971923828125,
"logps/rejected": -603.7943115234375,
"loss": 0.3664,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.7810330390930176,
"rewards/margins": 2.239793300628662,
"rewards/rejected": -5.02082633972168,
"step": 165
},
{
"epoch": 0.3349959972904735,
"grad_norm": 12.496148003358819,
"learning_rate": 4.2058354920054043e-07,
"logits/chosen": -0.28801003098487854,
"logits/rejected": -0.4120853543281555,
"logps/chosen": -348.6145324707031,
"logps/rejected": -565.19482421875,
"loss": 0.3559,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.543839454650879,
"rewards/margins": 2.167828321456909,
"rewards/rejected": -4.711668014526367,
"step": 170
},
{
"epoch": 0.34484882074019335,
"grad_norm": 14.212926345332207,
"learning_rate": 4.141880060119336e-07,
"logits/chosen": -0.2191300094127655,
"logits/rejected": -0.4566856324672699,
"logps/chosen": -396.66876220703125,
"logps/rejected": -567.9503784179688,
"loss": 0.3842,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.9396305084228516,
"rewards/margins": 1.7750612497329712,
"rewards/rejected": -4.714691162109375,
"step": 175
},
{
"epoch": 0.3547016441899132,
"grad_norm": 12.696873162753882,
"learning_rate": 4.0759765403198877e-07,
"logits/chosen": -0.28399690985679626,
"logits/rejected": -0.4555039405822754,
"logps/chosen": -343.26263427734375,
"logps/rejected": -577.4595336914062,
"loss": 0.3109,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.4329833984375,
"rewards/margins": 2.351548671722412,
"rewards/rejected": -4.784532070159912,
"step": 180
},
{
"epoch": 0.364554467639633,
"grad_norm": 15.682818507341654,
"learning_rate": 4.008203127021797e-07,
"logits/chosen": -0.30414339900016785,
"logits/rejected": -0.49127548933029175,
"logps/chosen": -401.3171081542969,
"logps/rejected": -638.1029663085938,
"loss": 0.3526,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.013474702835083,
"rewards/margins": 2.391366720199585,
"rewards/rejected": -5.404841899871826,
"step": 185
},
{
"epoch": 0.3744072910893528,
"grad_norm": 15.137404252638685,
"learning_rate": 3.9386402332652754e-07,
"logits/chosen": -0.3183102011680603,
"logits/rejected": -0.478738397359848,
"logps/chosen": -362.82513427734375,
"logps/rejected": -609.9462890625,
"loss": 0.3586,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.608335018157959,
"rewards/margins": 2.479790687561035,
"rewards/rejected": -5.088125705718994,
"step": 190
},
{
"epoch": 0.3842601145390726,
"grad_norm": 14.191789849116672,
"learning_rate": 3.867370395306068e-07,
"logits/chosen": -0.2816595137119293,
"logits/rejected": -0.5143749117851257,
"logps/chosen": -354.79388427734375,
"logps/rejected": -573.3958740234375,
"loss": 0.3561,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.5655016899108887,
"rewards/margins": 2.193372964859009,
"rewards/rejected": -4.758874416351318,
"step": 195
},
{
"epoch": 0.39411293798879243,
"grad_norm": 16.13473953627841,
"learning_rate": 3.794478174686328e-07,
"logits/chosen": -0.30765408277511597,
"logits/rejected": -0.5184319615364075,
"logps/chosen": -380.7965087890625,
"logps/rejected": -623.9815063476562,
"loss": 0.3343,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.7764999866485596,
"rewards/margins": 2.448732614517212,
"rewards/rejected": -5.2252326011657715,
"step": 200
},
{
"epoch": 0.4039657614385122,
"grad_norm": 13.799406477566153,
"learning_rate": 3.720050057902495e-07,
"logits/chosen": -0.28575196862220764,
"logits/rejected": -0.5561090707778931,
"logps/chosen": -412.14154052734375,
"logps/rejected": -629.4691772460938,
"loss": 0.3917,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.0631420612335205,
"rewards/margins": 2.176858425140381,
"rewards/rejected": -5.2400007247924805,
"step": 205
},
{
"epoch": 0.41381858488823203,
"grad_norm": 13.376596964391368,
"learning_rate": 3.644174353789204e-07,
"logits/chosen": -0.3736252188682556,
"logits/rejected": -0.514769434928894,
"logps/chosen": -383.2430725097656,
"logps/rejected": -640.6544189453125,
"loss": 0.334,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.76784086227417,
"rewards/margins": 2.553081512451172,
"rewards/rejected": -5.320921897888184,
"step": 210
},
{
"epoch": 0.42367140833795186,
"grad_norm": 11.451472713393272,
"learning_rate": 3.566941088741009e-07,
"logits/chosen": -0.3407908082008362,
"logits/rejected": -0.6174032688140869,
"logps/chosen": -347.8931884765625,
"logps/rejected": -581.584228515625,
"loss": 0.3268,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.4030208587646484,
"rewards/margins": 2.3824656009674072,
"rewards/rejected": -4.785486221313477,
"step": 215
},
{
"epoch": 0.43352423178767163,
"grad_norm": 17.46089866269522,
"learning_rate": 3.488441899896217e-07,
"logits/chosen": -0.3514137864112854,
"logits/rejected": -0.49381551146507263,
"logps/chosen": -360.62103271484375,
"logps/rejected": -615.2965087890625,
"loss": 0.3361,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.5678088665008545,
"rewards/margins": 2.5628271102905273,
"rewards/rejected": -5.130635738372803,
"step": 220
},
{
"epoch": 0.44337705523739146,
"grad_norm": 30.418216991487416,
"learning_rate": 3.408769926409574e-07,
"logits/chosen": -0.32667240500450134,
"logits/rejected": -0.46130138635635376,
"logps/chosen": -370.0388488769531,
"logps/rejected": -607.0093994140625,
"loss": 0.3309,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.7761330604553223,
"rewards/margins": 2.300311803817749,
"rewards/rejected": -5.076444149017334,
"step": 225
},
{
"epoch": 0.4532298786871113,
"grad_norm": 22.496163988303977,
"learning_rate": 3.3280196989428263e-07,
"logits/chosen": -0.23927278816699982,
"logits/rejected": -0.4683164060115814,
"logps/chosen": -415.4649963378906,
"logps/rejected": -715.1581420898438,
"loss": 0.399,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.170175790786743,
"rewards/margins": 3.04463529586792,
"rewards/rejected": -6.214810848236084,
"step": 230
},
{
"epoch": 0.46308270213683106,
"grad_norm": 15.358280098952724,
"learning_rate": 3.2462870275042367e-07,
"logits/chosen": -0.2206590175628662,
"logits/rejected": -0.4772756099700928,
"logps/chosen": -437.8194274902344,
"logps/rejected": -676.9967041015625,
"loss": 0.3334,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.432298183441162,
"rewards/margins": 2.3835928440093994,
"rewards/rejected": -5.815890789031982,
"step": 235
},
{
"epoch": 0.4729355255865509,
"grad_norm": 14.636033595582514,
"learning_rate": 3.1636688877701806e-07,
"logits/chosen": -0.379282683134079,
"logits/rejected": -0.5302340388298035,
"logps/chosen": -425.94024658203125,
"logps/rejected": -697.734130859375,
"loss": 0.2963,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.2339348793029785,
"rewards/margins": 2.747666835784912,
"rewards/rejected": -5.981600761413574,
"step": 240
},
{
"epoch": 0.4827883490362707,
"grad_norm": 15.394330260666242,
"learning_rate": 3.080263306023669e-07,
"logits/chosen": -0.2971007823944092,
"logits/rejected": -0.4823763370513916,
"logps/chosen": -433.9507751464844,
"logps/rejected": -664.0755615234375,
"loss": 0.2877,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.3310959339141846,
"rewards/margins": 2.319425344467163,
"rewards/rejected": -5.650521278381348,
"step": 245
},
{
"epoch": 0.49264117248599054,
"grad_norm": 14.169015562499252,
"learning_rate": 2.996169242846328e-07,
"logits/chosen": -0.3509058952331543,
"logits/rejected": -0.5016965270042419,
"logps/chosen": -401.8634948730469,
"logps/rejected": -719.4425048828125,
"loss": 0.2637,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.0134997367858887,
"rewards/margins": 3.148432970046997,
"rewards/rejected": -6.161932945251465,
"step": 250
},
{
"epoch": 0.5024939959357103,
"grad_norm": 15.084142231117205,
"learning_rate": 2.911486475701835e-07,
"logits/chosen": -0.2690550684928894,
"logits/rejected": -0.4977152943611145,
"logps/chosen": -400.71649169921875,
"logps/rejected": -628.1175537109375,
"loss": 0.322,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.0442709922790527,
"rewards/margins": 2.318587064743042,
"rewards/rejected": -5.362858772277832,
"step": 255
},
{
"epoch": 0.5123468193854301,
"grad_norm": 10.556248114155261,
"learning_rate": 2.826315480550129e-07,
"logits/chosen": -0.3703750967979431,
"logits/rejected": -0.4834713041782379,
"logps/chosen": -379.0237121582031,
"logps/rejected": -712.1115112304688,
"loss": 0.2737,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.82161283493042,
"rewards/margins": 3.283346652984619,
"rewards/rejected": -6.104959011077881,
"step": 260
},
{
"epoch": 0.52219964283515,
"grad_norm": 16.00385439318665,
"learning_rate": 2.740757312632854e-07,
"logits/chosen": -0.2965068221092224,
"logits/rejected": -0.4728400707244873,
"logps/chosen": -432.16650390625,
"logps/rejected": -744.01806640625,
"loss": 0.2898,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -3.3073620796203613,
"rewards/margins": 3.125457286834717,
"rewards/rejected": -6.432818412780762,
"step": 265
},
{
"epoch": 0.5320524662848698,
"grad_norm": 15.37457746102107,
"learning_rate": 2.654913486571487e-07,
"logits/chosen": -0.26870328187942505,
"logits/rejected": -0.5099757313728333,
"logps/chosen": -461.7821350097656,
"logps/rejected": -804.3317260742188,
"loss": 0.281,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.477935791015625,
"rewards/margins": 3.4908695220947266,
"rewards/rejected": -6.968804836273193,
"step": 270
},
{
"epoch": 0.5419052897345896,
"grad_norm": 17.39033230160854,
"learning_rate": 2.5688858559204053e-07,
"logits/chosen": -0.2892334461212158,
"logits/rejected": -0.5267232656478882,
"logps/chosen": -488.491943359375,
"logps/rejected": -848.1380615234375,
"loss": 0.2953,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.8831119537353516,
"rewards/margins": 3.5666236877441406,
"rewards/rejected": -7.44973611831665,
"step": 275
},
{
"epoch": 0.5517581131843093,
"grad_norm": 19.079707368930343,
"learning_rate": 2.4827764923178246e-07,
"logits/chosen": -0.2866331934928894,
"logits/rejected": -0.5004242062568665,
"logps/chosen": -503.42572021484375,
"logps/rejected": -888.0465698242188,
"loss": 0.2978,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.999812602996826,
"rewards/margins": 3.85322904586792,
"rewards/rejected": -7.853041648864746,
"step": 280
},
{
"epoch": 0.5616109366340292,
"grad_norm": 19.106895081330208,
"learning_rate": 2.3966875643779667e-07,
"logits/chosen": -0.3676909804344177,
"logits/rejected": -0.5896965861320496,
"logps/chosen": -404.20770263671875,
"logps/rejected": -752.9081420898438,
"loss": 0.288,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.025970935821533,
"rewards/margins": 3.454861879348755,
"rewards/rejected": -6.480832576751709,
"step": 285
},
{
"epoch": 0.571463760083749,
"grad_norm": 12.765348172490336,
"learning_rate": 2.3107212164681774e-07,
"logits/chosen": -0.3087966740131378,
"logits/rejected": -0.5579283833503723,
"logps/chosen": -426.75372314453125,
"logps/rejected": -739.0400390625,
"loss": 0.3286,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.3023364543914795,
"rewards/margins": 3.1090188026428223,
"rewards/rejected": -6.411355495452881,
"step": 290
},
{
"epoch": 0.5813165835334688,
"grad_norm": 16.231816408507715,
"learning_rate": 2.2249794475148019e-07,
"logits/chosen": -0.4001474976539612,
"logits/rejected": -0.5297967195510864,
"logps/chosen": -417.8780822753906,
"logps/rejected": -676.3338623046875,
"loss": 0.3072,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.1984708309173584,
"rewards/margins": 2.528751850128174,
"rewards/rejected": -5.727222919464111,
"step": 295
},
{
"epoch": 0.5911694069831886,
"grad_norm": 17.338695196365542,
"learning_rate": 2.1395639899816332e-07,
"logits/chosen": -0.4291343688964844,
"logits/rejected": -0.556428849697113,
"logps/chosen": -390.78582763671875,
"logps/rejected": -669.5917358398438,
"loss": 0.3313,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.9017252922058105,
"rewards/margins": 2.7559847831726074,
"rewards/rejected": -5.657710075378418,
"step": 300
},
{
"epoch": 0.6010222304329085,
"grad_norm": 12.126664974793542,
"learning_rate": 2.0545761891645177e-07,
"logits/chosen": -0.37195321917533875,
"logits/rejected": -0.5651625394821167,
"logps/chosen": -414.18927001953125,
"logps/rejected": -704.6055297851562,
"loss": 0.2784,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.175144910812378,
"rewards/margins": 2.9191794395446777,
"rewards/rejected": -6.094325065612793,
"step": 305
},
{
"epoch": 0.6108750538826282,
"grad_norm": 22.304119797273902,
"learning_rate": 1.9701168829453305e-07,
"logits/chosen": -0.3536795973777771,
"logits/rejected": -0.5591118931770325,
"logps/chosen": -393.33905029296875,
"logps/rejected": -676.2900390625,
"loss": 0.303,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.915839672088623,
"rewards/margins": 2.8338570594787598,
"rewards/rejected": -5.749696731567383,
"step": 310
},
{
"epoch": 0.620727877332348,
"grad_norm": 14.597142317559607,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": -0.36113765835762024,
"logits/rejected": -0.5111299157142639,
"logps/chosen": -416.2958984375,
"logps/rejected": -689.5440673828125,
"loss": 0.2944,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.1453917026519775,
"rewards/margins": 2.7569806575775146,
"rewards/rejected": -5.90237283706665,
"step": 315
},
{
"epoch": 0.6305807007820678,
"grad_norm": 14.66404722177273,
"learning_rate": 1.8031838516385422e-07,
"logits/chosen": -0.29976287484169006,
"logits/rejected": -0.5753315687179565,
"logps/chosen": -426.15081787109375,
"logps/rejected": -668.0872802734375,
"loss": 0.3077,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.2156474590301514,
"rewards/margins": 2.5099997520446777,
"rewards/rejected": -5.725647926330566,
"step": 320
},
{
"epoch": 0.6404335242317877,
"grad_norm": 24.997450008790477,
"learning_rate": 1.7209081923101472e-07,
"logits/chosen": -0.3522949814796448,
"logits/rejected": -0.571318507194519,
"logps/chosen": -428.62615966796875,
"logps/rejected": -739.1427001953125,
"loss": 0.2803,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.299776554107666,
"rewards/margins": 3.1473708152770996,
"rewards/rejected": -6.447146415710449,
"step": 325
},
{
"epoch": 0.6502863476815075,
"grad_norm": 21.077844248854888,
"learning_rate": 1.639556924093404e-07,
"logits/chosen": -0.3864585757255554,
"logits/rejected": -0.5924688577651978,
"logps/chosen": -460.22955322265625,
"logps/rejected": -780.5281372070312,
"loss": 0.29,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -3.5728142261505127,
"rewards/margins": 3.2330241203308105,
"rewards/rejected": -6.805838584899902,
"step": 330
},
{
"epoch": 0.6601391711312273,
"grad_norm": 13.281944316685474,
"learning_rate": 1.5592265701304114e-07,
"logits/chosen": -0.31550538539886475,
"logits/rejected": -0.5574745535850525,
"logps/chosen": -432.3729553222656,
"logps/rejected": -738.777587890625,
"loss": 0.2802,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.270681858062744,
"rewards/margins": 3.0939764976501465,
"rewards/rejected": -6.364658832550049,
"step": 335
},
{
"epoch": 0.669991994580947,
"grad_norm": 13.14308400069569,
"learning_rate": 1.4800124422502334e-07,
"logits/chosen": -0.35619470477104187,
"logits/rejected": -0.5215466618537903,
"logps/chosen": -402.02227783203125,
"logps/rejected": -689.3594360351562,
"loss": 0.2762,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.078434944152832,
"rewards/margins": 2.8699941635131836,
"rewards/rejected": -5.948429107666016,
"step": 340
},
{
"epoch": 0.6798448180306669,
"grad_norm": 13.958916358154339,
"learning_rate": 1.4020085278815743e-07,
"logits/chosen": -0.348470538854599,
"logits/rejected": -0.554540753364563,
"logps/chosen": -393.90362548828125,
"logps/rejected": -701.527099609375,
"loss": 0.2845,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.926769733428955,
"rewards/margins": 3.087486743927002,
"rewards/rejected": -6.014256477355957,
"step": 345
},
{
"epoch": 0.6896976414803867,
"grad_norm": 17.268899996459307,
"learning_rate": 1.3253073785368545e-07,
"logits/chosen": -0.37151604890823364,
"logits/rejected": -0.5311695337295532,
"logps/chosen": -426.25567626953125,
"logps/rejected": -754.6126708984375,
"loss": 0.2906,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.2909317016601562,
"rewards/margins": 3.2550930976867676,
"rewards/rejected": -6.546025276184082,
"step": 350
},
{
"epoch": 0.6995504649301065,
"grad_norm": 23.463103741079866,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -0.29037579894065857,
"logits/rejected": -0.5232888460159302,
"logps/chosen": -434.14862060546875,
"logps/rejected": -760.4425048828125,
"loss": 0.3051,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.306781053543091,
"rewards/margins": 3.264423370361328,
"rewards/rejected": -6.571203708648682,
"step": 355
},
{
"epoch": 0.7094032883798264,
"grad_norm": 15.524231108582358,
"learning_rate": 1.1761757443482285e-07,
"logits/chosen": -0.34106117486953735,
"logits/rejected": -0.5115201473236084,
"logps/chosen": -463.2628479003906,
"logps/rejected": -801.8025512695312,
"loss": 0.3022,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.6255767345428467,
"rewards/margins": 3.3763046264648438,
"rewards/rejected": -7.0018815994262695,
"step": 360
},
{
"epoch": 0.7192561118295462,
"grad_norm": 13.880093445950616,
"learning_rate": 1.1039222039359644e-07,
"logits/chosen": -0.3453301787376404,
"logits/rejected": -0.5254852175712585,
"logps/chosen": -411.9518127441406,
"logps/rejected": -712.6190795898438,
"loss": 0.2746,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.0844180583953857,
"rewards/margins": 3.044656991958618,
"rewards/rejected": -6.129075527191162,
"step": 365
},
{
"epoch": 0.729108935279266,
"grad_norm": 14.175670082816483,
"learning_rate": 1.0333251074666608e-07,
"logits/chosen": -0.3751557469367981,
"logits/rejected": -0.5939264297485352,
"logps/chosen": -429.10809326171875,
"logps/rejected": -725.9818115234375,
"loss": 0.3023,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.204840898513794,
"rewards/margins": 3.0460567474365234,
"rewards/rejected": -6.2508978843688965,
"step": 370
},
{
"epoch": 0.7389617587289857,
"grad_norm": 16.474329546472436,
"learning_rate": 9.644682182758304e-08,
"logits/chosen": -0.35652074217796326,
"logits/rejected": -0.5406717658042908,
"logps/chosen": -442.0631408691406,
"logps/rejected": -735.47412109375,
"loss": 0.2696,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.3726630210876465,
"rewards/margins": 2.9257071018218994,
"rewards/rejected": -6.298369407653809,
"step": 375
},
{
"epoch": 0.7488145821787056,
"grad_norm": 18.213192833293316,
"learning_rate": 8.974332349459992e-08,
"logits/chosen": -0.3362307846546173,
"logits/rejected": -0.5135927796363831,
"logps/chosen": -418.939453125,
"logps/rejected": -711.0570068359375,
"loss": 0.2748,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.2188174724578857,
"rewards/margins": 2.915469169616699,
"rewards/rejected": -6.134286403656006,
"step": 380
},
{
"epoch": 0.7586674056284254,
"grad_norm": 15.38124786698222,
"learning_rate": 8.322996943714672e-08,
"logits/chosen": -0.3672960698604584,
"logits/rejected": -0.5296192765235901,
"logps/chosen": -425.13427734375,
"logps/rejected": -741.996337890625,
"loss": 0.2826,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.2366461753845215,
"rewards/margins": 3.166731119155884,
"rewards/rejected": -6.403376579284668,
"step": 385
},
{
"epoch": 0.7685202290781452,
"grad_norm": 20.521680628902402,
"learning_rate": 7.691448773879256e-08,
"logits/chosen": -0.38358789682388306,
"logits/rejected": -0.5248023867607117,
"logps/chosen": -465.2935485839844,
"logps/rejected": -843.0035400390625,
"loss": 0.3038,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -3.6291823387145996,
"rewards/margins": 3.776301145553589,
"rewards/rejected": -7.405484199523926,
"step": 390
},
{
"epoch": 0.778373052527865,
"grad_norm": 22.670752535686123,
"learning_rate": 7.080437170788722e-08,
"logits/chosen": -0.3909732699394226,
"logits/rejected": -0.46815380454063416,
"logps/chosen": -467.8148498535156,
"logps/rejected": -813.1572265625,
"loss": 0.2818,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.7561402320861816,
"rewards/margins": 3.3577582836151123,
"rewards/rejected": -7.113898277282715,
"step": 395
},
{
"epoch": 0.7882258759775849,
"grad_norm": 21.305345425176412,
"learning_rate": 6.490687098676332e-08,
"logits/chosen": -0.40551847219467163,
"logits/rejected": -0.5719416737556458,
"logps/chosen": -435.22393798828125,
"logps/rejected": -831.7904052734375,
"loss": 0.2563,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.3291544914245605,
"rewards/margins": 3.9090828895568848,
"rewards/rejected": -7.2382378578186035,
"step": 400
},
{
"epoch": 0.7882258759775849,
"eval_logits/chosen": -1.0068713426589966,
"eval_logits/rejected": -0.773371696472168,
"eval_logps/chosen": -503.0605773925781,
"eval_logps/rejected": -703.8283081054688,
"eval_loss": 0.7016588449478149,
"eval_rewards/accuracies": 0.7020000219345093,
"eval_rewards/chosen": -4.100825786590576,
"eval_rewards/margins": 1.7457716464996338,
"eval_rewards/rejected": -5.846597671508789,
"eval_runtime": 224.226,
"eval_samples_per_second": 8.915,
"eval_steps_per_second": 1.115,
"step": 400
},
{
"epoch": 0.7980786994273046,
"grad_norm": 16.817512778396765,
"learning_rate": 5.9228982950048414e-08,
"logits/chosen": -0.34627681970596313,
"logits/rejected": -0.648546576499939,
"logps/chosen": -444.9021911621094,
"logps/rejected": -793.8383178710938,
"loss": 0.3108,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.4342200756073,
"rewards/margins": 3.5523598194122314,
"rewards/rejected": -6.986579895019531,
"step": 405
},
{
"epoch": 0.8079315228770244,
"grad_norm": 18.65876742845436,
"learning_rate": 5.3777444402291345e-08,
"logits/chosen": -0.3808293342590332,
"logits/rejected": -0.5627844929695129,
"logps/chosen": -447.5360412597656,
"logps/rejected": -714.4208374023438,
"loss": 0.3028,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.4805076122283936,
"rewards/margins": 2.6875407695770264,
"rewards/rejected": -6.168048858642578,
"step": 410
},
{
"epoch": 0.8177843463267442,
"grad_norm": 21.614577011578998,
"learning_rate": 4.855872358475546e-08,
"logits/chosen": -0.44765299558639526,
"logits/rejected": -0.5962406992912292,
"logps/chosen": -454.58135986328125,
"logps/rejected": -746.631103515625,
"loss": 0.3112,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.5401859283447266,
"rewards/margins": 2.886667251586914,
"rewards/rejected": -6.426853179931641,
"step": 415
},
{
"epoch": 0.8276371697764641,
"grad_norm": 18.508069249718975,
"learning_rate": 4.357901250086107e-08,
"logits/chosen": -0.3780784606933594,
"logits/rejected": -0.6282440423965454,
"logps/chosen": -464.607666015625,
"logps/rejected": -732.405029296875,
"loss": 0.2871,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.5666542053222656,
"rewards/margins": 2.74709153175354,
"rewards/rejected": -6.313745498657227,
"step": 420
},
{
"epoch": 0.8374899932261839,
"grad_norm": 21.590957086592525,
"learning_rate": 3.884421956938377e-08,
"logits/chosen": -0.38022860884666443,
"logits/rejected": -0.6045624017715454,
"logps/chosen": -429.6410217285156,
"logps/rejected": -730.5736083984375,
"loss": 0.2923,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.25273060798645,
"rewards/margins": 3.082298994064331,
"rewards/rejected": -6.335029125213623,
"step": 425
},
{
"epoch": 0.8473428166759037,
"grad_norm": 15.197391870273057,
"learning_rate": 3.435996261412591e-08,
"logits/chosen": -0.4843681752681732,
"logits/rejected": -0.5831128358840942,
"logps/chosen": -393.3730773925781,
"logps/rejected": -707.8683471679688,
"loss": 0.2692,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.8446130752563477,
"rewards/margins": 3.154205799102783,
"rewards/rejected": -5.998819351196289,
"step": 430
},
{
"epoch": 0.8571956401256235,
"grad_norm": 15.288363962234385,
"learning_rate": 3.013156219837776e-08,
"logits/chosen": -0.4089592397212982,
"logits/rejected": -0.5905001759529114,
"logps/chosen": -391.95831298828125,
"logps/rejected": -707.7792358398438,
"loss": 0.2841,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.9444174766540527,
"rewards/margins": 3.1579596996307373,
"rewards/rejected": -6.102376937866211,
"step": 435
},
{
"epoch": 0.8670484635753433,
"grad_norm": 17.671998119316797,
"learning_rate": 2.6164035312078447e-08,
"logits/chosen": -0.42121395468711853,
"logits/rejected": -0.5983023047447205,
"logps/chosen": -422.3511657714844,
"logps/rejected": -786.9813232421875,
"loss": 0.3052,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.2139759063720703,
"rewards/margins": 3.617478847503662,
"rewards/rejected": -6.831455230712891,
"step": 440
},
{
"epoch": 0.8769012870250631,
"grad_norm": 15.463536800192488,
"learning_rate": 2.2462089419165776e-08,
"logits/chosen": -0.39272046089172363,
"logits/rejected": -0.5592643618583679,
"logps/chosen": -421.66619873046875,
"logps/rejected": -675.9927368164062,
"loss": 0.3122,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.1869752407073975,
"rewards/margins": 2.581495761871338,
"rewards/rejected": -5.7684712409973145,
"step": 445
},
{
"epoch": 0.8867541104747829,
"grad_norm": 17.36075063464836,
"learning_rate": 1.9030116872178314e-08,
"logits/chosen": -0.42874231934547424,
"logits/rejected": -0.5679312944412231,
"logps/chosen": -416.0025939941406,
"logps/rejected": -734.6575317382812,
"loss": 0.289,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.1724095344543457,
"rewards/margins": 3.161487102508545,
"rewards/rejected": -6.333896160125732,
"step": 450
},
{
"epoch": 0.8966069339245027,
"grad_norm": 15.642265040018296,
"learning_rate": 1.5872189700736337e-08,
"logits/chosen": -0.4477892518043518,
"logits/rejected": -0.579742431640625,
"logps/chosen": -446.33349609375,
"logps/rejected": -779.8458862304688,
"loss": 0.2806,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.4629955291748047,
"rewards/margins": 3.308091640472412,
"rewards/rejected": -6.771086692810059,
"step": 455
},
{
"epoch": 0.9064597573742226,
"grad_norm": 14.160026400581446,
"learning_rate": 1.2992054780085692e-08,
"logits/chosen": -0.3982602059841156,
"logits/rejected": -0.5977696776390076,
"logps/chosen": -418.07061767578125,
"logps/rejected": -713.667236328125,
"loss": 0.2735,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.1576714515686035,
"rewards/margins": 2.992490768432617,
"rewards/rejected": -6.150162696838379,
"step": 460
},
{
"epoch": 0.9163125808239424,
"grad_norm": 19.50813975803542,
"learning_rate": 1.0393129385436823e-08,
"logits/chosen": -0.364255428314209,
"logits/rejected": -0.6485485434532166,
"logps/chosen": -426.3414001464844,
"logps/rejected": -729.6509399414062,
"loss": 0.2897,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.199294328689575,
"rewards/margins": 3.0993099212646484,
"rewards/rejected": -6.2986040115356445,
"step": 465
},
{
"epoch": 0.9261654042736621,
"grad_norm": 19.508164166434455,
"learning_rate": 8.078497137373242e-09,
"logits/chosen": -0.46434831619262695,
"logits/rejected": -0.5418060421943665,
"logps/chosen": -456.2374572753906,
"logps/rejected": -767.5300903320312,
"loss": 0.2769,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.5864601135253906,
"rewards/margins": 3.058790922164917,
"rewards/rejected": -6.645251274108887,
"step": 470
},
{
"epoch": 0.936018227723382,
"grad_norm": 14.71074195682132,
"learning_rate": 6.0509043431410945e-09,
"logits/chosen": -0.36522990465164185,
"logits/rejected": -0.5609266757965088,
"logps/chosen": -441.4654235839844,
"logps/rejected": -724.3133544921875,
"loss": 0.2906,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.432781934738159,
"rewards/margins": 2.8102941513061523,
"rewards/rejected": -6.243076324462891,
"step": 475
},
{
"epoch": 0.9458710511731018,
"grad_norm": 15.31419630581113,
"learning_rate": 4.312756738160145e-09,
"logits/chosen": -0.4177488386631012,
"logits/rejected": -0.6274289488792419,
"logps/chosen": -437.7715759277344,
"logps/rejected": -771.16552734375,
"loss": 0.2431,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.4045562744140625,
"rewards/margins": 3.3424010276794434,
"rewards/rejected": -6.746957302093506,
"step": 480
},
{
"epoch": 0.9557238746228216,
"grad_norm": 17.527039297327423,
"learning_rate": 2.8661166316229223e-09,
"logits/chosen": -0.4048340320587158,
"logits/rejected": -0.5873134732246399,
"logps/chosen": -422.97003173828125,
"logps/rejected": -773.1441650390625,
"loss": 0.2806,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.2094051837921143,
"rewards/margins": 3.512725830078125,
"rewards/rejected": -6.722131252288818,
"step": 485
},
{
"epoch": 0.9655766980725414,
"grad_norm": 14.493022786248968,
"learning_rate": 1.7127004595681727e-09,
"logits/chosen": -0.39268267154693604,
"logits/rejected": -0.5996861457824707,
"logps/chosen": -428.91986083984375,
"logps/rejected": -711.802001953125,
"loss": 0.3532,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.3111557960510254,
"rewards/margins": 2.841479539871216,
"rewards/rejected": -6.152635097503662,
"step": 490
},
{
"epoch": 0.9754295215222613,
"grad_norm": 18.440387807689753,
"learning_rate": 8.538767483325383e-10,
"logits/chosen": -0.4720977246761322,
"logits/rejected": -0.5540111064910889,
"logps/chosen": -421.6551208496094,
"logps/rejected": -734.9032592773438,
"loss": 0.2881,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.2461752891540527,
"rewards/margins": 3.103208065032959,
"rewards/rejected": -6.349383354187012,
"step": 495
},
{
"epoch": 0.9852823449719811,
"grad_norm": 24.055730221765614,
"learning_rate": 2.9066449079634404e-10,
"logits/chosen": -0.3708307147026062,
"logits/rejected": -0.6160975694656372,
"logps/chosen": -466.91717529296875,
"logps/rejected": -752.3226318359375,
"loss": 0.2672,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.598177671432495,
"rewards/margins": 2.9196102619171143,
"rewards/rejected": -6.517787933349609,
"step": 500
},
{
"epoch": 0.9951351684217008,
"grad_norm": 16.011780285680448,
"learning_rate": 2.3731937350224273e-11,
"logits/chosen": -0.3636520802974701,
"logits/rejected": -0.6337639689445496,
"logps/chosen": -420.154052734375,
"logps/rejected": -775.5059814453125,
"loss": 0.2825,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.1779532432556152,
"rewards/margins": 3.574568510055542,
"rewards/rejected": -6.752521514892578,
"step": 505
},
{
"epoch": 0.9990762978015888,
"step": 507,
"total_flos": 0.0,
"train_loss": 0.3847499547390308,
"train_runtime": 27570.709,
"train_samples_per_second": 2.356,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 507,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}