PyTorch
llama
alignment-handbook
Generated from Trainer
MambaInLlama_0_875 / trainer_state.json
Junxiong Wang
add models
d350c32
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 2000,
"global_step": 4168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002399232245681382,
"grad_norm": 4.854994271032306,
"learning_rate": 1.199040767386091e-09,
"logits/chosen": -0.3870464563369751,
"logits/rejected": -0.3449973464012146,
"logps/chosen": -161.37554931640625,
"logps/rejected": -150.78668212890625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0023992322456813818,
"grad_norm": 5.007999188048557,
"learning_rate": 1.199040767386091e-08,
"logits/chosen": -0.38755929470062256,
"logits/rejected": -0.40367352962493896,
"logps/chosen": -389.556640625,
"logps/rejected": -313.19439697265625,
"loss": 0.6932,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.000382223108317703,
"rewards/margins": 0.000817837193608284,
"rewards/rejected": -0.0004356140270829201,
"step": 10
},
{
"epoch": 0.0047984644913627635,
"grad_norm": 5.4251682517128685,
"learning_rate": 2.398081534772182e-08,
"logits/chosen": -0.40382856130599976,
"logits/rejected": -0.4116736352443695,
"logps/chosen": -253.2971649169922,
"logps/rejected": -222.39187622070312,
"loss": 0.6934,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.0011010636808350682,
"rewards/margins": 0.0007788551738485694,
"rewards/rejected": -0.0018799189710989594,
"step": 20
},
{
"epoch": 0.007197696737044146,
"grad_norm": 4.855409710988802,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -0.37188905477523804,
"logits/rejected": -0.41493088006973267,
"logps/chosen": -264.1092834472656,
"logps/rejected": -276.79327392578125,
"loss": 0.6932,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0006352605996653438,
"rewards/margins": 0.0009311493486166,
"rewards/rejected": -0.001566409831866622,
"step": 30
},
{
"epoch": 0.009596928982725527,
"grad_norm": 4.773650762501925,
"learning_rate": 4.796163069544364e-08,
"logits/chosen": -0.4388062357902527,
"logits/rejected": -0.4551084041595459,
"logps/chosen": -283.5164489746094,
"logps/rejected": -264.680419921875,
"loss": 0.6931,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -6.967745866859332e-05,
"rewards/margins": 0.0002998415438923985,
"rewards/rejected": -0.0003695189952850342,
"step": 40
},
{
"epoch": 0.01199616122840691,
"grad_norm": 5.249008388011441,
"learning_rate": 5.995203836930455e-08,
"logits/chosen": -0.44028440117836,
"logits/rejected": -0.41670793294906616,
"logps/chosen": -289.8959045410156,
"logps/rejected": -249.32876586914062,
"loss": 0.693,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.00012158080789959058,
"rewards/margins": 0.0013795382110401988,
"rewards/rejected": -0.0012579575413838029,
"step": 50
},
{
"epoch": 0.014395393474088292,
"grad_norm": 5.451149941549421,
"learning_rate": 7.194244604316546e-08,
"logits/chosen": -0.41042596101760864,
"logits/rejected": -0.39021745324134827,
"logps/chosen": -293.70989990234375,
"logps/rejected": -274.8219909667969,
"loss": 0.6932,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.00042934497469104826,
"rewards/margins": 5.6701617722865194e-05,
"rewards/rejected": 0.00037264340789988637,
"step": 60
},
{
"epoch": 0.016794625719769675,
"grad_norm": 4.9176821013212955,
"learning_rate": 8.393285371702638e-08,
"logits/chosen": -0.37676185369491577,
"logits/rejected": -0.3675565719604492,
"logps/chosen": -300.6470031738281,
"logps/rejected": -285.7118225097656,
"loss": 0.6931,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.0005408605793491006,
"rewards/margins": -0.0005098087713122368,
"rewards/rejected": -3.105172800133005e-05,
"step": 70
},
{
"epoch": 0.019193857965451054,
"grad_norm": 5.324153930467038,
"learning_rate": 9.592326139088728e-08,
"logits/chosen": -0.3833390474319458,
"logits/rejected": -0.3485874533653259,
"logps/chosen": -202.55172729492188,
"logps/rejected": -266.27801513671875,
"loss": 0.693,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0002449182793498039,
"rewards/margins": -0.00045424007112160325,
"rewards/rejected": 0.00020932205370627344,
"step": 80
},
{
"epoch": 0.021593090211132437,
"grad_norm": 4.902493446985089,
"learning_rate": 1.0791366906474819e-07,
"logits/chosen": -0.42354243993759155,
"logits/rejected": -0.4241662621498108,
"logps/chosen": -345.2079162597656,
"logps/rejected": -297.83392333984375,
"loss": 0.6931,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.00027865776792168617,
"rewards/margins": -7.414491847157478e-05,
"rewards/rejected": -0.00020451273303478956,
"step": 90
},
{
"epoch": 0.02399232245681382,
"grad_norm": 5.335283741962904,
"learning_rate": 1.199040767386091e-07,
"logits/chosen": -0.3970239758491516,
"logits/rejected": -0.36344924569129944,
"logps/chosen": -279.8683166503906,
"logps/rejected": -301.5334167480469,
"loss": 0.6926,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0004452617431525141,
"rewards/margins": 0.0009495856938883662,
"rewards/rejected": -0.0005043239216320217,
"step": 100
},
{
"epoch": 0.026391554702495202,
"grad_norm": 4.56016585237767,
"learning_rate": 1.3189448441247004e-07,
"logits/chosen": -0.3945187032222748,
"logits/rejected": -0.40393415093421936,
"logps/chosen": -245.4661102294922,
"logps/rejected": -244.88955688476562,
"loss": 0.6927,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0013859450118616223,
"rewards/margins": -9.293450420955196e-05,
"rewards/rejected": -0.0012930103112012148,
"step": 110
},
{
"epoch": 0.028790786948176585,
"grad_norm": 5.117639055978934,
"learning_rate": 1.4388489208633092e-07,
"logits/chosen": -0.3993036150932312,
"logits/rejected": -0.413900762796402,
"logps/chosen": -301.570068359375,
"logps/rejected": -287.91998291015625,
"loss": 0.6919,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0009120000759139657,
"rewards/margins": 0.002243091817945242,
"rewards/rejected": -0.003155091777443886,
"step": 120
},
{
"epoch": 0.031190019193857964,
"grad_norm": 4.533876465192458,
"learning_rate": 1.5587529976019183e-07,
"logits/chosen": -0.40214699506759644,
"logits/rejected": -0.4113968014717102,
"logps/chosen": -219.7006072998047,
"logps/rejected": -321.1871337890625,
"loss": 0.6918,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0008342149667441845,
"rewards/margins": 0.0029901477973908186,
"rewards/rejected": -0.0038243632297962904,
"step": 130
},
{
"epoch": 0.03358925143953935,
"grad_norm": 5.008152220168436,
"learning_rate": 1.6786570743405277e-07,
"logits/chosen": -0.33501917123794556,
"logits/rejected": -0.3522827625274658,
"logps/chosen": -314.4662780761719,
"logps/rejected": -304.34869384765625,
"loss": 0.6911,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0010583497351035476,
"rewards/margins": 0.004176832735538483,
"rewards/rejected": -0.005235183052718639,
"step": 140
},
{
"epoch": 0.03598848368522073,
"grad_norm": 5.230326203624785,
"learning_rate": 1.7985611510791365e-07,
"logits/chosen": -0.3923170566558838,
"logits/rejected": -0.3953098952770233,
"logps/chosen": -236.24685668945312,
"logps/rejected": -234.350830078125,
"loss": 0.6911,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0009860789868980646,
"rewards/margins": 0.004025029484182596,
"rewards/rejected": -0.0050111087039113045,
"step": 150
},
{
"epoch": 0.03838771593090211,
"grad_norm": 5.013822574107228,
"learning_rate": 1.9184652278177456e-07,
"logits/chosen": -0.31543251872062683,
"logits/rejected": -0.3109430968761444,
"logps/chosen": -316.65985107421875,
"logps/rejected": -250.0377960205078,
"loss": 0.6906,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0033806234132498503,
"rewards/margins": 0.0072640664875507355,
"rewards/rejected": -0.010644689202308655,
"step": 160
},
{
"epoch": 0.040786948176583494,
"grad_norm": 4.673205835529891,
"learning_rate": 2.038369304556355e-07,
"logits/chosen": -0.3539220094680786,
"logits/rejected": -0.3610088527202606,
"logps/chosen": -352.55316162109375,
"logps/rejected": -340.3455810546875,
"loss": 0.689,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0013684859732165933,
"rewards/margins": 0.009599483571946621,
"rewards/rejected": -0.01096796989440918,
"step": 170
},
{
"epoch": 0.04318618042226487,
"grad_norm": 5.280440292000614,
"learning_rate": 2.1582733812949638e-07,
"logits/chosen": -0.414604127407074,
"logits/rejected": -0.40983182191848755,
"logps/chosen": -251.31930541992188,
"logps/rejected": -246.71249389648438,
"loss": 0.6893,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0037162289954721928,
"rewards/margins": 0.016576949506998062,
"rewards/rejected": -0.020293179899454117,
"step": 180
},
{
"epoch": 0.04558541266794626,
"grad_norm": 6.110987926729073,
"learning_rate": 2.278177458033573e-07,
"logits/chosen": -0.37201982736587524,
"logits/rejected": -0.3684994578361511,
"logps/chosen": -334.75994873046875,
"logps/rejected": -276.002197265625,
"loss": 0.6885,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0038164921570569277,
"rewards/margins": 0.0039043619763106108,
"rewards/rejected": -0.0077208541333675385,
"step": 190
},
{
"epoch": 0.04798464491362764,
"grad_norm": 4.693128517095156,
"learning_rate": 2.398081534772182e-07,
"logits/chosen": -0.3985927700996399,
"logits/rejected": -0.3652064800262451,
"logps/chosen": -327.9397888183594,
"logps/rejected": -314.77703857421875,
"loss": 0.6871,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.0021692051086574793,
"rewards/margins": 0.02031012810766697,
"rewards/rejected": -0.022479332983493805,
"step": 200
},
{
"epoch": 0.05038387715930902,
"grad_norm": 4.82808785130016,
"learning_rate": 2.517985611510791e-07,
"logits/chosen": -0.416436105966568,
"logits/rejected": -0.42406004667282104,
"logps/chosen": -256.06634521484375,
"logps/rejected": -279.0437316894531,
"loss": 0.6868,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.006927967071533203,
"rewards/margins": 0.01621558703482151,
"rewards/rejected": -0.023143552243709564,
"step": 210
},
{
"epoch": 0.052783109404990404,
"grad_norm": 4.930218028749106,
"learning_rate": 2.637889688249401e-07,
"logits/chosen": -0.4129602015018463,
"logits/rejected": -0.42116695642471313,
"logps/chosen": -326.42987060546875,
"logps/rejected": -336.3708801269531,
"loss": 0.6874,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.011486181057989597,
"rewards/margins": 0.01089237816631794,
"rewards/rejected": -0.02237856015563011,
"step": 220
},
{
"epoch": 0.05518234165067178,
"grad_norm": 5.209995561674771,
"learning_rate": 2.7577937649880093e-07,
"logits/chosen": -0.39636367559432983,
"logits/rejected": -0.37121134996414185,
"logps/chosen": -249.68606567382812,
"logps/rejected": -287.6152038574219,
"loss": 0.6826,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.014097055420279503,
"rewards/margins": 0.01727898046374321,
"rewards/rejected": -0.031376034021377563,
"step": 230
},
{
"epoch": 0.05758157389635317,
"grad_norm": 5.637285290487847,
"learning_rate": 2.8776978417266184e-07,
"logits/chosen": -0.3950192332267761,
"logits/rejected": -0.38908690214157104,
"logps/chosen": -302.5347595214844,
"logps/rejected": -257.3473205566406,
"loss": 0.6811,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.006957252975553274,
"rewards/margins": 0.02547440305352211,
"rewards/rejected": -0.03243165463209152,
"step": 240
},
{
"epoch": 0.05998080614203455,
"grad_norm": 5.523154228755914,
"learning_rate": 2.997601918465228e-07,
"logits/chosen": -0.3644653558731079,
"logits/rejected": -0.35463160276412964,
"logps/chosen": -244.20401000976562,
"logps/rejected": -235.6724395751953,
"loss": 0.6804,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.01351804006844759,
"rewards/margins": 0.01623005047440529,
"rewards/rejected": -0.029748091474175453,
"step": 250
},
{
"epoch": 0.06238003838771593,
"grad_norm": 4.946428082161042,
"learning_rate": 3.1175059952038366e-07,
"logits/chosen": -0.44501185417175293,
"logits/rejected": -0.39841514825820923,
"logps/chosen": -285.33624267578125,
"logps/rejected": -289.7425842285156,
"loss": 0.6786,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.024796243757009506,
"rewards/margins": 0.03235085308551788,
"rewards/rejected": -0.05714709684252739,
"step": 260
},
{
"epoch": 0.0647792706333973,
"grad_norm": 5.212561603527658,
"learning_rate": 3.2374100719424457e-07,
"logits/chosen": -0.3279297947883606,
"logits/rejected": -0.38405635952949524,
"logps/chosen": -295.61566162109375,
"logps/rejected": -237.0383758544922,
"loss": 0.676,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.021820012480020523,
"rewards/margins": 0.008004983887076378,
"rewards/rejected": -0.029824992641806602,
"step": 270
},
{
"epoch": 0.0671785028790787,
"grad_norm": 5.188924605918914,
"learning_rate": 3.3573141486810554e-07,
"logits/chosen": -0.3912425935268402,
"logits/rejected": -0.38816994428634644,
"logps/chosen": -309.39617919921875,
"logps/rejected": -302.08001708984375,
"loss": 0.6706,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.01541377604007721,
"rewards/margins": 0.04586619883775711,
"rewards/rejected": -0.06127997115254402,
"step": 280
},
{
"epoch": 0.06957773512476008,
"grad_norm": 4.7601490852750326,
"learning_rate": 3.477218225419664e-07,
"logits/chosen": -0.3547287583351135,
"logits/rejected": -0.32382625341415405,
"logps/chosen": -301.15460205078125,
"logps/rejected": -276.5646667480469,
"loss": 0.6714,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.021407341584563255,
"rewards/margins": 0.03352125734090805,
"rewards/rejected": -0.05492859333753586,
"step": 290
},
{
"epoch": 0.07197696737044146,
"grad_norm": 5.589212470867438,
"learning_rate": 3.597122302158273e-07,
"logits/chosen": -0.4183027744293213,
"logits/rejected": -0.4140304923057556,
"logps/chosen": -278.68707275390625,
"logps/rejected": -302.8797912597656,
"loss": 0.6729,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.03756223991513252,
"rewards/margins": 0.041960276663303375,
"rewards/rejected": -0.0795225277543068,
"step": 300
},
{
"epoch": 0.07437619961612284,
"grad_norm": 5.107027000775508,
"learning_rate": 3.7170263788968827e-07,
"logits/chosen": -0.38045555353164673,
"logits/rejected": -0.3847430646419525,
"logps/chosen": -288.02801513671875,
"logps/rejected": -258.888671875,
"loss": 0.6756,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.043108247220516205,
"rewards/margins": 0.04828093200922012,
"rewards/rejected": -0.09138917922973633,
"step": 310
},
{
"epoch": 0.07677543186180422,
"grad_norm": 4.7093696590111955,
"learning_rate": 3.836930455635491e-07,
"logits/chosen": -0.3643363118171692,
"logits/rejected": -0.35694578289985657,
"logps/chosen": -284.8294372558594,
"logps/rejected": -257.43133544921875,
"loss": 0.6681,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.04265808314085007,
"rewards/margins": 0.04907030984759331,
"rewards/rejected": -0.09172839671373367,
"step": 320
},
{
"epoch": 0.07917466410748561,
"grad_norm": 4.910514032068699,
"learning_rate": 3.9568345323741003e-07,
"logits/chosen": -0.31883081793785095,
"logits/rejected": -0.28674525022506714,
"logps/chosen": -272.1695251464844,
"logps/rejected": -318.9271240234375,
"loss": 0.6639,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08635957539081573,
"rewards/margins": 0.05848981812596321,
"rewards/rejected": -0.14484938979148865,
"step": 330
},
{
"epoch": 0.08157389635316699,
"grad_norm": 5.0332599907751545,
"learning_rate": 4.07673860911271e-07,
"logits/chosen": -0.3443170189857483,
"logits/rejected": -0.35223323106765747,
"logps/chosen": -254.35726928710938,
"logps/rejected": -281.94195556640625,
"loss": 0.6642,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.04620979353785515,
"rewards/margins": 0.08755537122488022,
"rewards/rejected": -0.13376514613628387,
"step": 340
},
{
"epoch": 0.08397312859884837,
"grad_norm": 5.508072548843049,
"learning_rate": 4.1966426858513185e-07,
"logits/chosen": -0.3824247717857361,
"logits/rejected": -0.3841249346733093,
"logps/chosen": -319.01995849609375,
"logps/rejected": -318.2255554199219,
"loss": 0.6667,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.08988650888204575,
"rewards/margins": 0.05337555333971977,
"rewards/rejected": -0.1432620733976364,
"step": 350
},
{
"epoch": 0.08637236084452975,
"grad_norm": 5.385600660244676,
"learning_rate": 4.3165467625899276e-07,
"logits/chosen": -0.36046355962753296,
"logits/rejected": -0.3866155743598938,
"logps/chosen": -274.48846435546875,
"logps/rejected": -234.24191284179688,
"loss": 0.6672,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0927540734410286,
"rewards/margins": 0.045326970517635345,
"rewards/rejected": -0.13808102905750275,
"step": 360
},
{
"epoch": 0.08877159309021113,
"grad_norm": 5.5450235472250675,
"learning_rate": 4.436450839328537e-07,
"logits/chosen": -0.3677825331687927,
"logits/rejected": -0.3560819625854492,
"logps/chosen": -265.2862243652344,
"logps/rejected": -291.4950256347656,
"loss": 0.6608,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.1145174503326416,
"rewards/margins": 0.10159511864185333,
"rewards/rejected": -0.21611256897449493,
"step": 370
},
{
"epoch": 0.09117082533589252,
"grad_norm": 4.681358528597076,
"learning_rate": 4.556354916067146e-07,
"logits/chosen": -0.39217817783355713,
"logits/rejected": -0.36524298787117004,
"logps/chosen": -255.4202117919922,
"logps/rejected": -269.22540283203125,
"loss": 0.65,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.0964360311627388,
"rewards/margins": 0.10729198157787323,
"rewards/rejected": -0.20372800529003143,
"step": 380
},
{
"epoch": 0.0935700575815739,
"grad_norm": 5.3926706949666325,
"learning_rate": 4.676258992805755e-07,
"logits/chosen": -0.31998729705810547,
"logits/rejected": -0.30327945947647095,
"logps/chosen": -294.6556396484375,
"logps/rejected": -271.11346435546875,
"loss": 0.6514,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.11293824017047882,
"rewards/margins": 0.07754337787628174,
"rewards/rejected": -0.19048163294792175,
"step": 390
},
{
"epoch": 0.09596928982725528,
"grad_norm": 5.420438925013907,
"learning_rate": 4.796163069544364e-07,
"logits/chosen": -0.35674285888671875,
"logits/rejected": -0.3888497054576874,
"logps/chosen": -275.8892517089844,
"logps/rejected": -273.18157958984375,
"loss": 0.6491,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1275942027568817,
"rewards/margins": 0.13519003987312317,
"rewards/rejected": -0.26278427243232727,
"step": 400
},
{
"epoch": 0.09836852207293666,
"grad_norm": 5.2798127437891775,
"learning_rate": 4.916067146282974e-07,
"logits/chosen": -0.362439900636673,
"logits/rejected": -0.3545471131801605,
"logps/chosen": -278.9376525878906,
"logps/rejected": -321.1399841308594,
"loss": 0.6355,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.15209965407848358,
"rewards/margins": 0.1249600201845169,
"rewards/rejected": -0.2770597040653229,
"step": 410
},
{
"epoch": 0.10076775431861804,
"grad_norm": 5.259650468657089,
"learning_rate": 4.999992108529978e-07,
"logits/chosen": -0.31682300567626953,
"logits/rejected": -0.3138789236545563,
"logps/chosen": -353.08843994140625,
"logps/rejected": -335.11993408203125,
"loss": 0.6426,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1913050264120102,
"rewards/margins": 0.15328899025917053,
"rewards/rejected": -0.3445940315723419,
"step": 420
},
{
"epoch": 0.10316698656429943,
"grad_norm": 5.7901843585434305,
"learning_rate": 4.999851817115532e-07,
"logits/chosen": -0.4503898620605469,
"logits/rejected": -0.406587690114975,
"logps/chosen": -276.3069152832031,
"logps/rejected": -298.9613342285156,
"loss": 0.6437,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.1920853555202484,
"rewards/margins": 0.21766385436058044,
"rewards/rejected": -0.40974926948547363,
"step": 430
},
{
"epoch": 0.10556621880998081,
"grad_norm": 5.5732351937176325,
"learning_rate": 4.999536171027889e-07,
"logits/chosen": -0.3798277974128723,
"logits/rejected": -0.38922780752182007,
"logps/chosen": -321.9649658203125,
"logps/rejected": -325.6181640625,
"loss": 0.6342,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.23407897353172302,
"rewards/margins": 0.13595230877399445,
"rewards/rejected": -0.37003129720687866,
"step": 440
},
{
"epoch": 0.10796545105566219,
"grad_norm": 6.182065387597852,
"learning_rate": 4.999045192408369e-07,
"logits/chosen": -0.28310176730155945,
"logits/rejected": -0.28306809067726135,
"logps/chosen": -274.2243347167969,
"logps/rejected": -265.6375732421875,
"loss": 0.6339,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2778427004814148,
"rewards/margins": 0.08695127815008163,
"rewards/rejected": -0.36479395627975464,
"step": 450
},
{
"epoch": 0.11036468330134357,
"grad_norm": 5.712218150903385,
"learning_rate": 4.998378915697171e-07,
"logits/chosen": -0.3742767870426178,
"logits/rejected": -0.3694307804107666,
"logps/chosen": -301.56683349609375,
"logps/rejected": -318.60382080078125,
"loss": 0.6177,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2279098480939865,
"rewards/margins": 0.23911185562610626,
"rewards/rejected": -0.4670217037200928,
"step": 460
},
{
"epoch": 0.11276391554702495,
"grad_norm": 5.207613785560333,
"learning_rate": 4.997537387630958e-07,
"logits/chosen": -0.30902743339538574,
"logits/rejected": -0.3139379322528839,
"logps/chosen": -238.130859375,
"logps/rejected": -261.97259521484375,
"loss": 0.6099,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.21456749737262726,
"rewards/margins": 0.2124086171388626,
"rewards/rejected": -0.42697611451148987,
"step": 470
},
{
"epoch": 0.11516314779270634,
"grad_norm": 6.373579896233462,
"learning_rate": 4.996520667239582e-07,
"logits/chosen": -0.44946223497390747,
"logits/rejected": -0.4517344534397125,
"logps/chosen": -263.78277587890625,
"logps/rejected": -343.26947021484375,
"loss": 0.6111,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2376352846622467,
"rewards/margins": 0.2664358913898468,
"rewards/rejected": -0.5040711164474487,
"step": 480
},
{
"epoch": 0.11756238003838772,
"grad_norm": 6.494273680385882,
"learning_rate": 4.995328825841939e-07,
"logits/chosen": -0.32751840353012085,
"logits/rejected": -0.33484649658203125,
"logps/chosen": -246.5781707763672,
"logps/rejected": -297.93353271484375,
"loss": 0.6083,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.27829456329345703,
"rewards/margins": 0.44562092423439026,
"rewards/rejected": -0.7239154577255249,
"step": 490
},
{
"epoch": 0.1199616122840691,
"grad_norm": 6.156723989230307,
"learning_rate": 4.993961947040967e-07,
"logits/chosen": -0.3556281626224518,
"logits/rejected": -0.37649574875831604,
"logps/chosen": -335.5538024902344,
"logps/rejected": -312.1348876953125,
"loss": 0.6281,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4268563687801361,
"rewards/margins": 0.14374002814292908,
"rewards/rejected": -0.5705963373184204,
"step": 500
},
{
"epoch": 0.12236084452975048,
"grad_norm": 5.5439607137629,
"learning_rate": 4.992420126717784e-07,
"logits/chosen": -0.39222702383995056,
"logits/rejected": -0.3753407597541809,
"logps/chosen": -280.47393798828125,
"logps/rejected": -329.1144104003906,
"loss": 0.6064,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.24301853775978088,
"rewards/margins": 0.4458102285861969,
"rewards/rejected": -0.6888288259506226,
"step": 510
},
{
"epoch": 0.12476007677543186,
"grad_norm": 6.961165174392256,
"learning_rate": 4.990703473024958e-07,
"logits/chosen": -0.3275012969970703,
"logits/rejected": -0.35214173793792725,
"logps/chosen": -332.95355224609375,
"logps/rejected": -348.1015930175781,
"loss": 0.6268,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.42038726806640625,
"rewards/margins": 0.20834532380104065,
"rewards/rejected": -0.6287325620651245,
"step": 520
},
{
"epoch": 0.12715930902111325,
"grad_norm": 6.451198360785239,
"learning_rate": 4.98881210637893e-07,
"logits/chosen": -0.35043126344680786,
"logits/rejected": -0.3229239583015442,
"logps/chosen": -256.13128662109375,
"logps/rejected": -325.2506408691406,
"loss": 0.6186,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.35000094771385193,
"rewards/margins": 0.2998683452606201,
"rewards/rejected": -0.6498693227767944,
"step": 530
},
{
"epoch": 0.1295585412667946,
"grad_norm": 5.138402880291277,
"learning_rate": 4.986746159451553e-07,
"logits/chosen": -0.2927325963973999,
"logits/rejected": -0.2958449721336365,
"logps/chosen": -296.5050964355469,
"logps/rejected": -315.69061279296875,
"loss": 0.6091,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2853540778160095,
"rewards/margins": 0.2909182012081146,
"rewards/rejected": -0.5762723088264465,
"step": 540
},
{
"epoch": 0.131957773512476,
"grad_norm": 5.337841343228059,
"learning_rate": 4.984505777160795e-07,
"logits/chosen": -0.2978525757789612,
"logits/rejected": -0.3084144592285156,
"logps/chosen": -360.9128112792969,
"logps/rejected": -391.237060546875,
"loss": 0.6252,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.40219253301620483,
"rewards/margins": 0.2555919587612152,
"rewards/rejected": -0.6577844619750977,
"step": 550
},
{
"epoch": 0.1343570057581574,
"grad_norm": 5.899251959062901,
"learning_rate": 4.982091116660574e-07,
"logits/chosen": -0.44975343346595764,
"logits/rejected": -0.4612964689731598,
"logps/chosen": -247.9381561279297,
"logps/rejected": -239.43057250976562,
"loss": 0.6346,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.4517020285129547,
"rewards/margins": 0.14873093366622925,
"rewards/rejected": -0.6004330515861511,
"step": 560
},
{
"epoch": 0.13675623800383876,
"grad_norm": 6.842342229687067,
"learning_rate": 4.979502347329732e-07,
"logits/chosen": -0.3259963393211365,
"logits/rejected": -0.32738104462623596,
"logps/chosen": -363.0848083496094,
"logps/rejected": -425.85650634765625,
"loss": 0.6152,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.53682541847229,
"rewards/margins": 0.35178542137145996,
"rewards/rejected": -0.88861083984375,
"step": 570
},
{
"epoch": 0.13915547024952016,
"grad_norm": 7.709446348310538,
"learning_rate": 4.976739650760151e-07,
"logits/chosen": -0.4362337589263916,
"logits/rejected": -0.4145421087741852,
"logps/chosen": -322.2565002441406,
"logps/rejected": -323.690673828125,
"loss": 0.6084,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.503047525882721,
"rewards/margins": 0.2108907401561737,
"rewards/rejected": -0.7139382362365723,
"step": 580
},
{
"epoch": 0.14155470249520152,
"grad_norm": 7.389107341643373,
"learning_rate": 4.97380322074402e-07,
"logits/chosen": -0.3247602880001068,
"logits/rejected": -0.33087000250816345,
"logps/chosen": -276.3142395019531,
"logps/rejected": -307.705078125,
"loss": 0.6167,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4834076464176178,
"rewards/margins": 0.29755571484565735,
"rewards/rejected": -0.7809633016586304,
"step": 590
},
{
"epoch": 0.14395393474088292,
"grad_norm": 6.3757200147393585,
"learning_rate": 4.970693263260237e-07,
"logits/chosen": -0.34789201617240906,
"logits/rejected": -0.3619535267353058,
"logps/chosen": -332.65240478515625,
"logps/rejected": -348.0079650878906,
"loss": 0.6153,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4377606511116028,
"rewards/margins": 0.43048110604286194,
"rewards/rejected": -0.8682417869567871,
"step": 600
},
{
"epoch": 0.1463531669865643,
"grad_norm": 6.992245943787356,
"learning_rate": 4.967409996459966e-07,
"logits/chosen": -0.40890535712242126,
"logits/rejected": -0.41852784156799316,
"logps/chosen": -343.47113037109375,
"logps/rejected": -352.51739501953125,
"loss": 0.6028,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.44202518463134766,
"rewards/margins": 0.35426777601242065,
"rewards/rejected": -0.7962929010391235,
"step": 610
},
{
"epoch": 0.14875239923224567,
"grad_norm": 5.871485349553309,
"learning_rate": 4.963953650651326e-07,
"logits/chosen": -0.3329642117023468,
"logits/rejected": -0.33251506090164185,
"logps/chosen": -415.4046936035156,
"logps/rejected": -351.1783447265625,
"loss": 0.5993,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.527837336063385,
"rewards/margins": 0.2701931595802307,
"rewards/rejected": -0.798030436038971,
"step": 620
},
{
"epoch": 0.15115163147792707,
"grad_norm": 6.6189704018639155,
"learning_rate": 4.960324468283248e-07,
"logits/chosen": -0.4594503343105316,
"logits/rejected": -0.4743649363517761,
"logps/chosen": -287.46697998046875,
"logps/rejected": -318.0199279785156,
"loss": 0.5864,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5368043184280396,
"rewards/margins": 0.2990504205226898,
"rewards/rejected": -0.8358548283576965,
"step": 630
},
{
"epoch": 0.15355086372360843,
"grad_norm": 6.440240303252386,
"learning_rate": 4.956522703928451e-07,
"logits/chosen": -0.4017508625984192,
"logits/rejected": -0.3634760081768036,
"logps/chosen": -301.4740295410156,
"logps/rejected": -330.0491943359375,
"loss": 0.5777,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.5480097532272339,
"rewards/margins": 0.28381380438804626,
"rewards/rejected": -0.8318235278129578,
"step": 640
},
{
"epoch": 0.15595009596928983,
"grad_norm": 8.733595781101256,
"learning_rate": 4.952548624265606e-07,
"logits/chosen": -0.33765482902526855,
"logits/rejected": -0.3307989537715912,
"logps/chosen": -368.00457763671875,
"logps/rejected": -375.5353088378906,
"loss": 0.6137,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6871958374977112,
"rewards/margins": 0.2888778746128082,
"rewards/rejected": -0.9760736227035522,
"step": 650
},
{
"epoch": 0.15834932821497122,
"grad_norm": 6.492977447810359,
"learning_rate": 4.948402508060607e-07,
"logits/chosen": -0.41905927658081055,
"logits/rejected": -0.4047884941101074,
"logps/chosen": -299.0521240234375,
"logps/rejected": -338.4973449707031,
"loss": 0.6203,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5227022767066956,
"rewards/margins": 0.4564470648765564,
"rewards/rejected": -0.979149341583252,
"step": 660
},
{
"epoch": 0.16074856046065258,
"grad_norm": 7.072838685765256,
"learning_rate": 4.944084646147038e-07,
"logits/chosen": -0.3967028260231018,
"logits/rejected": -0.3915463387966156,
"logps/chosen": -393.5112609863281,
"logps/rejected": -390.46771240234375,
"loss": 0.6382,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6279814839363098,
"rewards/margins": 0.20776453614234924,
"rewards/rejected": -0.8357461094856262,
"step": 670
},
{
"epoch": 0.16314779270633398,
"grad_norm": 7.229422542340886,
"learning_rate": 4.939595341405754e-07,
"logits/chosen": -0.4420618414878845,
"logits/rejected": -0.47058361768722534,
"logps/chosen": -320.97833251953125,
"logps/rejected": -354.74896240234375,
"loss": 0.6077,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5401324033737183,
"rewards/margins": 0.39541110396385193,
"rewards/rejected": -0.935543417930603,
"step": 680
},
{
"epoch": 0.16554702495201534,
"grad_norm": 6.416644260001778,
"learning_rate": 4.93493490874365e-07,
"logits/chosen": -0.3352740705013275,
"logits/rejected": -0.3355199694633484,
"logps/chosen": -320.6838073730469,
"logps/rejected": -349.72637939453125,
"loss": 0.5704,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6109887361526489,
"rewards/margins": 0.24212434887886047,
"rewards/rejected": -0.8531131744384766,
"step": 690
},
{
"epoch": 0.16794625719769674,
"grad_norm": 8.587832635723272,
"learning_rate": 4.93010367507156e-07,
"logits/chosen": -0.4223472476005554,
"logits/rejected": -0.40697455406188965,
"logps/chosen": -279.3757629394531,
"logps/rejected": -302.53887939453125,
"loss": 0.5809,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5754938125610352,
"rewards/margins": 0.49033960700035095,
"rewards/rejected": -1.065833330154419,
"step": 700
},
{
"epoch": 0.17034548944337813,
"grad_norm": 8.280473447647031,
"learning_rate": 4.925101979281332e-07,
"logits/chosen": -0.308775395154953,
"logits/rejected": -0.3469308018684387,
"logps/chosen": -367.63006591796875,
"logps/rejected": -369.24102783203125,
"loss": 0.5982,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.54417484998703,
"rewards/margins": 0.5548971891403198,
"rewards/rejected": -1.0990720987319946,
"step": 710
},
{
"epoch": 0.1727447216890595,
"grad_norm": 7.262952451036594,
"learning_rate": 4.919930172222054e-07,
"logits/chosen": -0.40510883927345276,
"logits/rejected": -0.4301750659942627,
"logps/chosen": -333.3446350097656,
"logps/rejected": -367.443359375,
"loss": 0.562,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6565228700637817,
"rewards/margins": 0.41670140624046326,
"rewards/rejected": -1.0732243061065674,
"step": 720
},
{
"epoch": 0.1751439539347409,
"grad_norm": 7.527915939450412,
"learning_rate": 4.914588616675445e-07,
"logits/chosen": -0.5176496505737305,
"logits/rejected": -0.5337257981300354,
"logps/chosen": -276.7431945800781,
"logps/rejected": -333.2959899902344,
"loss": 0.5984,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.5182946920394897,
"rewards/margins": 0.4113486707210541,
"rewards/rejected": -0.9296433329582214,
"step": 730
},
{
"epoch": 0.17754318618042225,
"grad_norm": 7.676539707780335,
"learning_rate": 4.909077687330404e-07,
"logits/chosen": -0.38037022948265076,
"logits/rejected": -0.3775717318058014,
"logps/chosen": -361.13360595703125,
"logps/rejected": -350.4933166503906,
"loss": 0.5737,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6827607154846191,
"rewards/margins": 0.23238630592823029,
"rewards/rejected": -0.9151470065116882,
"step": 740
},
{
"epoch": 0.17994241842610365,
"grad_norm": 7.668861235099345,
"learning_rate": 4.903397770756729e-07,
"logits/chosen": -0.4016490876674652,
"logits/rejected": -0.41701728105545044,
"logps/chosen": -345.77093505859375,
"logps/rejected": -386.59478759765625,
"loss": 0.5849,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6805583834648132,
"rewards/margins": 0.4540184438228607,
"rewards/rejected": -1.1345769166946411,
"step": 750
},
{
"epoch": 0.18234165067178504,
"grad_norm": 5.8948610663264756,
"learning_rate": 4.897549265378004e-07,
"logits/chosen": -0.39282411336898804,
"logits/rejected": -0.40464964509010315,
"logps/chosen": -409.1741638183594,
"logps/rejected": -438.93896484375,
"loss": 0.5763,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6418434381484985,
"rewards/margins": 0.33645665645599365,
"rewards/rejected": -0.9782999753952026,
"step": 760
},
{
"epoch": 0.1847408829174664,
"grad_norm": 7.575295799966419,
"learning_rate": 4.891532581443643e-07,
"logits/chosen": -0.4237458109855652,
"logits/rejected": -0.43563684821128845,
"logps/chosen": -363.11199951171875,
"logps/rejected": -443.242919921875,
"loss": 0.5533,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.48444193601608276,
"rewards/margins": 0.8247998356819153,
"rewards/rejected": -1.309241771697998,
"step": 770
},
{
"epoch": 0.1871401151631478,
"grad_norm": 8.560608469390486,
"learning_rate": 4.885348141000122e-07,
"logits/chosen": -0.37164923548698425,
"logits/rejected": -0.38602423667907715,
"logps/chosen": -325.58111572265625,
"logps/rejected": -402.8592224121094,
"loss": 0.5743,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.6478859186172485,
"rewards/margins": 0.5546708106994629,
"rewards/rejected": -1.202556848526001,
"step": 780
},
{
"epoch": 0.18953934740882916,
"grad_norm": 6.7270126026080295,
"learning_rate": 4.878996377861367e-07,
"logits/chosen": -0.46751460433006287,
"logits/rejected": -0.5031236410140991,
"logps/chosen": -311.3717346191406,
"logps/rejected": -359.74786376953125,
"loss": 0.5381,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8375557065010071,
"rewards/margins": 0.37298810482025146,
"rewards/rejected": -1.2105437517166138,
"step": 790
},
{
"epoch": 0.19193857965451055,
"grad_norm": 8.21855435965664,
"learning_rate": 4.872477737578327e-07,
"logits/chosen": -0.42351236939430237,
"logits/rejected": -0.37146827578544617,
"logps/chosen": -370.0285339355469,
"logps/rejected": -442.88092041015625,
"loss": 0.544,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8096299171447754,
"rewards/margins": 0.7599128484725952,
"rewards/rejected": -1.569542646408081,
"step": 800
},
{
"epoch": 0.19433781190019195,
"grad_norm": 11.575192192623518,
"learning_rate": 4.865792677407718e-07,
"logits/chosen": -0.4782884120941162,
"logits/rejected": -0.48071250319480896,
"logps/chosen": -352.44830322265625,
"logps/rejected": -357.5110778808594,
"loss": 0.5849,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.840336799621582,
"rewards/margins": 0.3332519829273224,
"rewards/rejected": -1.173588752746582,
"step": 810
},
{
"epoch": 0.1967370441458733,
"grad_norm": 10.038226850588464,
"learning_rate": 4.858941666279955e-07,
"logits/chosen": -0.4984146058559418,
"logits/rejected": -0.5030771493911743,
"logps/chosen": -356.491455078125,
"logps/rejected": -367.9295959472656,
"loss": 0.5948,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6749507784843445,
"rewards/margins": 0.2715102434158325,
"rewards/rejected": -0.946461021900177,
"step": 820
},
{
"epoch": 0.1991362763915547,
"grad_norm": 9.05117258722249,
"learning_rate": 4.851925184766247e-07,
"logits/chosen": -0.4640856683254242,
"logits/rejected": -0.4853819012641907,
"logps/chosen": -357.955078125,
"logps/rejected": -393.90478515625,
"loss": 0.5816,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8941922187805176,
"rewards/margins": 0.591671347618103,
"rewards/rejected": -1.4858636856079102,
"step": 830
},
{
"epoch": 0.20153550863723607,
"grad_norm": 9.681421628090424,
"learning_rate": 4.844743725044897e-07,
"logits/chosen": -0.4659281373023987,
"logits/rejected": -0.5088318586349487,
"logps/chosen": -329.5724182128906,
"logps/rejected": -351.2308044433594,
"loss": 0.5644,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7480869293212891,
"rewards/margins": 0.4603498876094818,
"rewards/rejected": -1.2084368467330933,
"step": 840
},
{
"epoch": 0.20393474088291746,
"grad_norm": 9.601153198405655,
"learning_rate": 4.837397790866774e-07,
"logits/chosen": -0.47188258171081543,
"logits/rejected": -0.4746321141719818,
"logps/chosen": -362.03021240234375,
"logps/rejected": -425.6300354003906,
"loss": 0.5614,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.596829354763031,
"rewards/margins": 0.8421809077262878,
"rewards/rejected": -1.4390103816986084,
"step": 850
},
{
"epoch": 0.20633397312859886,
"grad_norm": 8.338440429434868,
"learning_rate": 4.829887897519974e-07,
"logits/chosen": -0.4840044379234314,
"logits/rejected": -0.4713994860649109,
"logps/chosen": -310.5907897949219,
"logps/rejected": -378.32049560546875,
"loss": 0.5839,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6725698709487915,
"rewards/margins": 0.4422430992126465,
"rewards/rejected": -1.114812970161438,
"step": 860
},
{
"epoch": 0.20873320537428022,
"grad_norm": 7.422844726445461,
"learning_rate": 4.82221457179368e-07,
"logits/chosen": -0.4798775613307953,
"logits/rejected": -0.4771656095981598,
"logps/chosen": -346.35284423828125,
"logps/rejected": -404.0704040527344,
"loss": 0.5487,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.653545081615448,
"rewards/margins": 0.7137434482574463,
"rewards/rejected": -1.36728835105896,
"step": 870
},
{
"epoch": 0.21113243761996162,
"grad_norm": 7.498203140238108,
"learning_rate": 4.814378351941206e-07,
"logits/chosen": -0.4903596341609955,
"logits/rejected": -0.4909774363040924,
"logps/chosen": -333.675048828125,
"logps/rejected": -356.37615966796875,
"loss": 0.5783,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7229236960411072,
"rewards/margins": 0.3018752932548523,
"rewards/rejected": -1.0247989892959595,
"step": 880
},
{
"epoch": 0.21353166986564298,
"grad_norm": 8.122602026965641,
"learning_rate": 4.806379787642241e-07,
"logits/chosen": -0.46273237466812134,
"logits/rejected": -0.45851221680641174,
"logps/chosen": -316.8045959472656,
"logps/rejected": -386.3130798339844,
"loss": 0.5996,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6047753095626831,
"rewards/margins": 0.5871869921684265,
"rewards/rejected": -1.1919623613357544,
"step": 890
},
{
"epoch": 0.21593090211132437,
"grad_norm": 7.941797087490691,
"learning_rate": 4.798219439964293e-07,
"logits/chosen": -0.5175309777259827,
"logits/rejected": -0.5462228059768677,
"logps/chosen": -322.66717529296875,
"logps/rejected": -351.2758483886719,
"loss": 0.56,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7025772333145142,
"rewards/margins": 0.17370259761810303,
"rewards/rejected": -0.8762798309326172,
"step": 900
},
{
"epoch": 0.21833013435700577,
"grad_norm": 11.409843882847035,
"learning_rate": 4.78989788132333e-07,
"logits/chosen": -0.5460485219955444,
"logits/rejected": -0.5394560098648071,
"logps/chosen": -274.5609130859375,
"logps/rejected": -357.4576416015625,
"loss": 0.5144,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.47791242599487305,
"rewards/margins": 0.7380608320236206,
"rewards/rejected": -1.215973138809204,
"step": 910
},
{
"epoch": 0.22072936660268713,
"grad_norm": 7.234637207163524,
"learning_rate": 4.781415695443631e-07,
"logits/chosen": -0.47741183638572693,
"logits/rejected": -0.46385058760643005,
"logps/chosen": -409.548583984375,
"logps/rejected": -444.76080322265625,
"loss": 0.5728,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9967001080513,
"rewards/margins": 0.2708914577960968,
"rewards/rejected": -1.2675915956497192,
"step": 920
},
{
"epoch": 0.22312859884836853,
"grad_norm": 7.053932572600293,
"learning_rate": 4.772773477316836e-07,
"logits/chosen": -0.4638640284538269,
"logits/rejected": -0.4659477174282074,
"logps/chosen": -399.1170654296875,
"logps/rejected": -439.843994140625,
"loss": 0.5505,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0153203010559082,
"rewards/margins": 0.4187864661216736,
"rewards/rejected": -1.434106707572937,
"step": 930
},
{
"epoch": 0.2255278310940499,
"grad_norm": 8.914653285618083,
"learning_rate": 4.7639718331602117e-07,
"logits/chosen": -0.4590983986854553,
"logits/rejected": -0.45087337493896484,
"logps/chosen": -361.59393310546875,
"logps/rejected": -440.941650390625,
"loss": 0.5377,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7911983728408813,
"rewards/margins": 0.8489359617233276,
"rewards/rejected": -1.6401344537734985,
"step": 940
},
{
"epoch": 0.22792706333973128,
"grad_norm": 9.061462258268621,
"learning_rate": 4.7550113803741275e-07,
"logits/chosen": -0.4630160331726074,
"logits/rejected": -0.5072802305221558,
"logps/chosen": -387.6048278808594,
"logps/rejected": -365.64532470703125,
"loss": 0.5737,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9304320216178894,
"rewards/margins": 0.43897590041160583,
"rewards/rejected": -1.3694080114364624,
"step": 950
},
{
"epoch": 0.23032629558541268,
"grad_norm": 8.866753051371537,
"learning_rate": 4.7458927474987454e-07,
"logits/chosen": -0.4371200501918793,
"logits/rejected": -0.43364763259887695,
"logps/chosen": -409.8465881347656,
"logps/rejected": -384.90234375,
"loss": 0.5325,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.7246421575546265,
"rewards/margins": 0.3903957009315491,
"rewards/rejected": -1.1150379180908203,
"step": 960
},
{
"epoch": 0.23272552783109404,
"grad_norm": 7.704129412482537,
"learning_rate": 4.7366165741699347e-07,
"logits/chosen": -0.5143966674804688,
"logits/rejected": -0.5406373143196106,
"logps/chosen": -423.09014892578125,
"logps/rejected": -443.0372619628906,
"loss": 0.557,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8278281092643738,
"rewards/margins": 0.45336437225341797,
"rewards/rejected": -1.2811925411224365,
"step": 970
},
{
"epoch": 0.23512476007677544,
"grad_norm": 7.64309056534733,
"learning_rate": 4.727183511074401e-07,
"logits/chosen": -0.5884715914726257,
"logits/rejected": -0.5859401822090149,
"logps/chosen": -379.3627624511719,
"logps/rejected": -392.21710205078125,
"loss": 0.5616,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.920864462852478,
"rewards/margins": 0.23082181811332703,
"rewards/rejected": -1.151686191558838,
"step": 980
},
{
"epoch": 0.2375239923224568,
"grad_norm": 8.955148603646292,
"learning_rate": 4.717594219904043e-07,
"logits/chosen": -0.5043666958808899,
"logits/rejected": -0.5165797472000122,
"logps/chosen": -397.373046875,
"logps/rejected": -394.7057189941406,
"loss": 0.5553,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1149301528930664,
"rewards/margins": 0.4297923147678375,
"rewards/rejected": -1.544722318649292,
"step": 990
},
{
"epoch": 0.2399232245681382,
"grad_norm": 9.68741680379089,
"learning_rate": 4.7078493733095393e-07,
"logits/chosen": -0.5751169919967651,
"logits/rejected": -0.5922902822494507,
"logps/chosen": -351.0492858886719,
"logps/rejected": -421.7478942871094,
"loss": 0.5413,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8745172619819641,
"rewards/margins": 0.5707848072052002,
"rewards/rejected": -1.445302128791809,
"step": 1000
},
{
"epoch": 0.2423224568138196,
"grad_norm": 7.552967651026979,
"learning_rate": 4.6979496548531614e-07,
"logits/chosen": -0.5039399862289429,
"logits/rejected": -0.49812453985214233,
"logps/chosen": -357.84130859375,
"logps/rejected": -476.9107360839844,
"loss": 0.5415,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8784712553024292,
"rewards/margins": 0.6386504173278809,
"rewards/rejected": -1.5171215534210205,
"step": 1010
},
{
"epoch": 0.24472168905950095,
"grad_norm": 7.847866875926576,
"learning_rate": 4.6878957589608293e-07,
"logits/chosen": -0.5514234304428101,
"logits/rejected": -0.5487276911735535,
"logps/chosen": -358.778076171875,
"logps/rejected": -490.60150146484375,
"loss": 0.5386,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7840293645858765,
"rewards/margins": 0.8377860188484192,
"rewards/rejected": -1.6218153238296509,
"step": 1020
},
{
"epoch": 0.24712092130518235,
"grad_norm": 7.46939724134765,
"learning_rate": 4.6776883908733956e-07,
"logits/chosen": -0.5816242098808289,
"logits/rejected": -0.5803698897361755,
"logps/chosen": -391.3583679199219,
"logps/rejected": -390.5106506347656,
"loss": 0.5283,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8500774502754211,
"rewards/margins": 0.6333375573158264,
"rewards/rejected": -1.4834150075912476,
"step": 1030
},
{
"epoch": 0.2495201535508637,
"grad_norm": 9.499290159289549,
"learning_rate": 4.667328266597178e-07,
"logits/chosen": -0.5638601779937744,
"logits/rejected": -0.5747939348220825,
"logps/chosen": -380.7355651855469,
"logps/rejected": -426.9833984375,
"loss": 0.5105,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9775155782699585,
"rewards/margins": 0.575149416923523,
"rewards/rejected": -1.5526649951934814,
"step": 1040
},
{
"epoch": 0.2519193857965451,
"grad_norm": 7.863264950292173,
"learning_rate": 4.6568161128537354e-07,
"logits/chosen": -0.5107079148292542,
"logits/rejected": -0.5302263498306274,
"logps/chosen": -362.51165771484375,
"logps/rejected": -357.8714294433594,
"loss": 0.5418,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9620344042778015,
"rewards/margins": 0.46561723947525024,
"rewards/rejected": -1.4276517629623413,
"step": 1050
},
{
"epoch": 0.2543186180422265,
"grad_norm": 9.95847080901471,
"learning_rate": 4.6461526670288877e-07,
"logits/chosen": -0.5152772068977356,
"logits/rejected": -0.5086151957511902,
"logps/chosen": -378.06707763671875,
"logps/rejected": -412.7275390625,
"loss": 0.5773,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7655351758003235,
"rewards/margins": 0.6718277335166931,
"rewards/rejected": -1.4373629093170166,
"step": 1060
},
{
"epoch": 0.2567178502879079,
"grad_norm": 6.795589468002909,
"learning_rate": 4.635338677120994e-07,
"logits/chosen": -0.6026760935783386,
"logits/rejected": -0.605171799659729,
"logps/chosen": -363.8572998046875,
"logps/rejected": -459.39764404296875,
"loss": 0.5105,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9281944036483765,
"rewards/margins": 0.7557355761528015,
"rewards/rejected": -1.6839300394058228,
"step": 1070
},
{
"epoch": 0.2591170825335892,
"grad_norm": 8.768106225235062,
"learning_rate": 4.6243749016884835e-07,
"logits/chosen": -0.4381980001926422,
"logits/rejected": -0.4824441969394684,
"logps/chosen": -396.9549255371094,
"logps/rejected": -550.8186645507812,
"loss": 0.5355,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1090790033340454,
"rewards/margins": 0.9858474731445312,
"rewards/rejected": -2.094926357269287,
"step": 1080
},
{
"epoch": 0.2615163147792706,
"grad_norm": 12.341914291404834,
"learning_rate": 4.613262109796645e-07,
"logits/chosen": -0.5834716558456421,
"logits/rejected": -0.5614360570907593,
"logps/chosen": -384.7580261230469,
"logps/rejected": -524.4951171875,
"loss": 0.5445,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0333119630813599,
"rewards/margins": 0.9544838666915894,
"rewards/rejected": -1.9877955913543701,
"step": 1090
},
{
"epoch": 0.263915547024952,
"grad_norm": 7.9475026844690975,
"learning_rate": 4.602001080963678e-07,
"logits/chosen": -0.5507039427757263,
"logits/rejected": -0.5546278953552246,
"logps/chosen": -414.7740783691406,
"logps/rejected": -445.2301330566406,
"loss": 0.5363,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1491403579711914,
"rewards/margins": 0.5966090559959412,
"rewards/rejected": -1.7457494735717773,
"step": 1100
},
{
"epoch": 0.2663147792706334,
"grad_norm": 8.769481722876568,
"learning_rate": 4.590592605106017e-07,
"logits/chosen": -0.6439992189407349,
"logits/rejected": -0.650866687297821,
"logps/chosen": -414.07147216796875,
"logps/rejected": -447.4894104003906,
"loss": 0.5726,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8819205164909363,
"rewards/margins": 0.6305669546127319,
"rewards/rejected": -1.5124876499176025,
"step": 1110
},
{
"epoch": 0.2687140115163148,
"grad_norm": 8.351140167515311,
"learning_rate": 4.5790374824829165e-07,
"logits/chosen": -0.5511302947998047,
"logits/rejected": -0.5896275639533997,
"logps/chosen": -292.4022216796875,
"logps/rejected": -368.3529052734375,
"loss": 0.5342,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9656060934066772,
"rewards/margins": 0.5942641496658325,
"rewards/rejected": -1.5598702430725098,
"step": 1120
},
{
"epoch": 0.27111324376199614,
"grad_norm": 8.859939366313267,
"learning_rate": 4.5673365236403216e-07,
"logits/chosen": -0.6408380270004272,
"logits/rejected": -0.6961749792098999,
"logps/chosen": -286.7148132324219,
"logps/rejected": -403.5390625,
"loss": 0.521,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8016033172607422,
"rewards/margins": 0.8615763783454895,
"rewards/rejected": -1.6631797552108765,
"step": 1130
},
{
"epoch": 0.27351247600767753,
"grad_norm": 8.034697339520084,
"learning_rate": 4.5554905493540075e-07,
"logits/chosen": -0.6324433088302612,
"logits/rejected": -0.6281362175941467,
"logps/chosen": -320.374755859375,
"logps/rejected": -417.0606994628906,
"loss": 0.5135,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8844473958015442,
"rewards/margins": 0.9001449346542358,
"rewards/rejected": -1.7845920324325562,
"step": 1140
},
{
"epoch": 0.2759117082533589,
"grad_norm": 8.194750667905803,
"learning_rate": 4.5435003905720074e-07,
"logits/chosen": -0.5551676750183105,
"logits/rejected": -0.5857855081558228,
"logps/chosen": -384.4462585449219,
"logps/rejected": -426.1817321777344,
"loss": 0.5131,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8710935711860657,
"rewards/margins": 0.703058660030365,
"rewards/rejected": -1.5741522312164307,
"step": 1150
},
{
"epoch": 0.2783109404990403,
"grad_norm": 9.865565713899901,
"learning_rate": 4.531366888356324e-07,
"logits/chosen": -0.6332504153251648,
"logits/rejected": -0.6072624325752258,
"logps/chosen": -294.1646423339844,
"logps/rejected": -434.31024169921875,
"loss": 0.5061,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0756479501724243,
"rewards/margins": 0.913143515586853,
"rewards/rejected": -1.9887917041778564,
"step": 1160
},
{
"epoch": 0.2807101727447217,
"grad_norm": 10.884390589564477,
"learning_rate": 4.519090893823931e-07,
"logits/chosen": -0.6246575117111206,
"logits/rejected": -0.6475778818130493,
"logps/chosen": -372.89215087890625,
"logps/rejected": -443.2525329589844,
"loss": 0.5261,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2081364393234253,
"rewards/margins": 0.7674375772476196,
"rewards/rejected": -1.9755741357803345,
"step": 1170
},
{
"epoch": 0.28310940499040305,
"grad_norm": 7.778148160208306,
"learning_rate": 4.5066732680870734e-07,
"logits/chosen": -0.568785548210144,
"logits/rejected": -0.608304500579834,
"logps/chosen": -348.26702880859375,
"logps/rejected": -393.5550231933594,
"loss": 0.5014,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8094667196273804,
"rewards/margins": 0.8615515828132629,
"rewards/rejected": -1.671018362045288,
"step": 1180
},
{
"epoch": 0.28550863723608444,
"grad_norm": 8.167033695573148,
"learning_rate": 4.494114882192862e-07,
"logits/chosen": -0.659604012966156,
"logits/rejected": -0.6383468508720398,
"logps/chosen": -355.82061767578125,
"logps/rejected": -419.57440185546875,
"loss": 0.4999,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.7848014831542969,
"rewards/margins": 0.9513505697250366,
"rewards/rejected": -1.7361520528793335,
"step": 1190
},
{
"epoch": 0.28790786948176583,
"grad_norm": 8.531778053136817,
"learning_rate": 4.4814166170621735e-07,
"logits/chosen": -0.6497922539710999,
"logits/rejected": -0.6769246459007263,
"logps/chosen": -341.2376708984375,
"logps/rejected": -416.5484924316406,
"loss": 0.5219,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8200058937072754,
"rewards/margins": 0.9894447326660156,
"rewards/rejected": -1.8094505071640015,
"step": 1200
},
{
"epoch": 0.2903071017274472,
"grad_norm": 9.115422018620432,
"learning_rate": 4.468579363427858e-07,
"logits/chosen": -0.6323500871658325,
"logits/rejected": -0.6400243043899536,
"logps/chosen": -378.551025390625,
"logps/rejected": -431.50628662109375,
"loss": 0.5342,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1116712093353271,
"rewards/margins": 0.7979139089584351,
"rewards/rejected": -1.9095849990844727,
"step": 1210
},
{
"epoch": 0.2927063339731286,
"grad_norm": 10.369810004009349,
"learning_rate": 4.4556040217722555e-07,
"logits/chosen": -0.7057371735572815,
"logits/rejected": -0.6883876919746399,
"logps/chosen": -335.6352233886719,
"logps/rejected": -475.59588623046875,
"loss": 0.5004,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9118591547012329,
"rewards/margins": 0.989273190498352,
"rewards/rejected": -1.9011322259902954,
"step": 1220
},
{
"epoch": 0.29510556621880996,
"grad_norm": 8.78559151507744,
"learning_rate": 4.442491502264033e-07,
"logits/chosen": -0.6325684785842896,
"logits/rejected": -0.6488875150680542,
"logps/chosen": -321.8913879394531,
"logps/rejected": -360.14776611328125,
"loss": 0.5278,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9378086924552917,
"rewards/margins": 0.4965507388114929,
"rewards/rejected": -1.4343595504760742,
"step": 1230
},
{
"epoch": 0.29750479846449135,
"grad_norm": 7.996922099069886,
"learning_rate": 4.429242724694338e-07,
"logits/chosen": -0.7028544545173645,
"logits/rejected": -0.6892791390419006,
"logps/chosen": -346.9255676269531,
"logps/rejected": -447.5694274902344,
"loss": 0.5193,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.8944141268730164,
"rewards/margins": 0.8071637153625488,
"rewards/rejected": -1.7015777826309204,
"step": 1240
},
{
"epoch": 0.29990403071017274,
"grad_norm": 8.834617984631441,
"learning_rate": 4.4158586184122817e-07,
"logits/chosen": -0.6356642246246338,
"logits/rejected": -0.6852391362190247,
"logps/chosen": -391.93524169921875,
"logps/rejected": -455.7276306152344,
"loss": 0.4991,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8954633474349976,
"rewards/margins": 0.9801417589187622,
"rewards/rejected": -1.8756049871444702,
"step": 1250
},
{
"epoch": 0.30230326295585414,
"grad_norm": 10.623650311123297,
"learning_rate": 4.4023401222597443e-07,
"logits/chosen": -0.6282129287719727,
"logits/rejected": -0.7119131088256836,
"logps/chosen": -408.7945251464844,
"logps/rejected": -447.96533203125,
"loss": 0.5039,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1023727655410767,
"rewards/margins": 0.621015191078186,
"rewards/rejected": -1.7233880758285522,
"step": 1260
},
{
"epoch": 0.30470249520153553,
"grad_norm": 9.541014472218308,
"learning_rate": 4.3886881845055235e-07,
"logits/chosen": -0.6741599440574646,
"logits/rejected": -0.7359042167663574,
"logps/chosen": -343.3130187988281,
"logps/rejected": -435.29681396484375,
"loss": 0.5013,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8718856573104858,
"rewards/margins": 1.0113645792007446,
"rewards/rejected": -1.8832504749298096,
"step": 1270
},
{
"epoch": 0.30710172744721687,
"grad_norm": 9.400894165088678,
"learning_rate": 4.374903762778814e-07,
"logits/chosen": -0.7107304334640503,
"logits/rejected": -0.7214982509613037,
"logps/chosen": -361.5565185546875,
"logps/rejected": -422.3573303222656,
"loss": 0.4996,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9224702715873718,
"rewards/margins": 0.8412786722183228,
"rewards/rejected": -1.7637488842010498,
"step": 1280
},
{
"epoch": 0.30950095969289826,
"grad_norm": 8.935621537238024,
"learning_rate": 4.3609878240020356e-07,
"logits/chosen": -0.6704460978507996,
"logits/rejected": -0.7258785963058472,
"logps/chosen": -425.38690185546875,
"logps/rejected": -457.9751892089844,
"loss": 0.4947,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0847482681274414,
"rewards/margins": 0.9454424977302551,
"rewards/rejected": -2.0301907062530518,
"step": 1290
},
{
"epoch": 0.31190019193857965,
"grad_norm": 9.26956553898586,
"learning_rate": 4.346941344323005e-07,
"logits/chosen": -0.7358589768409729,
"logits/rejected": -0.8038908243179321,
"logps/chosen": -376.4468688964844,
"logps/rejected": -392.2120361328125,
"loss": 0.5534,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.231223464012146,
"rewards/margins": 0.6490092277526855,
"rewards/rejected": -1.880232572555542,
"step": 1300
},
{
"epoch": 0.31429942418426104,
"grad_norm": 8.919630982293898,
"learning_rate": 4.332765309046467e-07,
"logits/chosen": -0.6560064554214478,
"logits/rejected": -0.6517816781997681,
"logps/chosen": -403.7508239746094,
"logps/rejected": -460.4183654785156,
"loss": 0.5389,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1548296213150024,
"rewards/margins": 0.9737479090690613,
"rewards/rejected": -2.128577470779419,
"step": 1310
},
{
"epoch": 0.31669865642994244,
"grad_norm": 10.42362251711121,
"learning_rate": 4.3184607125649754e-07,
"logits/chosen": -0.700032114982605,
"logits/rejected": -0.7237606048583984,
"logps/chosen": -376.8808288574219,
"logps/rejected": -488.0193786621094,
"loss": 0.5239,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9241411089897156,
"rewards/margins": 0.9761545062065125,
"rewards/rejected": -1.900295615196228,
"step": 1320
},
{
"epoch": 0.3190978886756238,
"grad_norm": 8.823907511461925,
"learning_rate": 4.304028558289141e-07,
"logits/chosen": -0.7183485627174377,
"logits/rejected": -0.7400873899459839,
"logps/chosen": -375.0442810058594,
"logps/rejected": -453.552978515625,
"loss": 0.4986,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7883042693138123,
"rewards/margins": 1.0131797790527344,
"rewards/rejected": -1.8014838695526123,
"step": 1330
},
{
"epoch": 0.32149712092130517,
"grad_norm": 9.056325230872618,
"learning_rate": 4.28946985857725e-07,
"logits/chosen": -0.7189252972602844,
"logits/rejected": -0.698843777179718,
"logps/chosen": -391.2516174316406,
"logps/rejected": -495.9942932128906,
"loss": 0.5001,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0153273344039917,
"rewards/margins": 1.1640372276306152,
"rewards/rejected": -2.1793646812438965,
"step": 1340
},
{
"epoch": 0.32389635316698656,
"grad_norm": 9.219431888564412,
"learning_rate": 4.2747856346642445e-07,
"logits/chosen": -0.720720648765564,
"logits/rejected": -0.7227288484573364,
"logps/chosen": -323.8959045410156,
"logps/rejected": -411.42633056640625,
"loss": 0.4889,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8822928667068481,
"rewards/margins": 0.907570481300354,
"rewards/rejected": -1.7898629903793335,
"step": 1350
},
{
"epoch": 0.32629558541266795,
"grad_norm": 9.219391981411931,
"learning_rate": 4.2599769165900933e-07,
"logits/chosen": -0.7076966166496277,
"logits/rejected": -0.7374303340911865,
"logps/chosen": -400.1958923339844,
"logps/rejected": -457.45538330078125,
"loss": 0.5265,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.270545482635498,
"rewards/margins": 0.8403311967849731,
"rewards/rejected": -2.1108765602111816,
"step": 1360
},
{
"epoch": 0.32869481765834935,
"grad_norm": 7.961461255371797,
"learning_rate": 4.245044743127535e-07,
"logits/chosen": -0.8138734698295593,
"logits/rejected": -0.8004827499389648,
"logps/chosen": -375.55255126953125,
"logps/rejected": -462.57452392578125,
"loss": 0.51,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1249182224273682,
"rewards/margins": 0.7031105160713196,
"rewards/rejected": -1.828028678894043,
"step": 1370
},
{
"epoch": 0.3310940499040307,
"grad_norm": 17.415326900164427,
"learning_rate": 4.229990161709214e-07,
"logits/chosen": -0.7364221811294556,
"logits/rejected": -0.6838979721069336,
"logps/chosen": -354.063720703125,
"logps/rejected": -506.49468994140625,
"loss": 0.533,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.1404192447662354,
"rewards/margins": 1.2287505865097046,
"rewards/rejected": -2.3691699504852295,
"step": 1380
},
{
"epoch": 0.3334932821497121,
"grad_norm": 8.490137472524392,
"learning_rate": 4.214814228354204e-07,
"logits/chosen": -0.7031981348991394,
"logits/rejected": -0.7180779576301575,
"logps/chosen": -381.91839599609375,
"logps/rejected": -509.70391845703125,
"loss": 0.4942,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9766290783882141,
"rewards/margins": 1.4473177194595337,
"rewards/rejected": -2.4239466190338135,
"step": 1390
},
{
"epoch": 0.33589251439539347,
"grad_norm": 9.025625908308374,
"learning_rate": 4.1995180075939375e-07,
"logits/chosen": -0.7361186742782593,
"logits/rejected": -0.7329140901565552,
"logps/chosen": -412.9568786621094,
"logps/rejected": -463.9566955566406,
"loss": 0.4945,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0486990213394165,
"rewards/margins": 0.7866090536117554,
"rewards/rejected": -1.8353080749511719,
"step": 1400
},
{
"epoch": 0.33829174664107486,
"grad_norm": 9.968153480159424,
"learning_rate": 4.1841025723975297e-07,
"logits/chosen": -0.685051441192627,
"logits/rejected": -0.6909801959991455,
"logps/chosen": -381.12054443359375,
"logps/rejected": -472.12628173828125,
"loss": 0.4854,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7610518932342529,
"rewards/margins": 1.0710885524749756,
"rewards/rejected": -1.832140326499939,
"step": 1410
},
{
"epoch": 0.34069097888675626,
"grad_norm": 10.86480858776822,
"learning_rate": 4.168569004096516e-07,
"logits/chosen": -0.6658666133880615,
"logits/rejected": -0.6583417654037476,
"logps/chosen": -361.16693115234375,
"logps/rejected": -498.03997802734375,
"loss": 0.4812,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.107450246810913,
"rewards/margins": 1.1987017393112183,
"rewards/rejected": -2.306152105331421,
"step": 1420
},
{
"epoch": 0.3430902111324376,
"grad_norm": 9.37088980367882,
"learning_rate": 4.152918392308997e-07,
"logits/chosen": -0.8107253313064575,
"logits/rejected": -0.792646050453186,
"logps/chosen": -417.71197509765625,
"logps/rejected": -471.83721923828125,
"loss": 0.4813,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.6705278158187866,
"rewards/margins": 0.6656574010848999,
"rewards/rejected": -2.3361852169036865,
"step": 1430
},
{
"epoch": 0.345489443378119,
"grad_norm": 13.27114348199083,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -0.7099133729934692,
"logits/rejected": -0.6920545697212219,
"logps/chosen": -407.37493896484375,
"logps/rejected": -562.7499389648438,
"loss": 0.5386,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.6011251211166382,
"rewards/margins": 1.1472370624542236,
"rewards/rejected": -2.7483620643615723,
"step": 1440
},
{
"epoch": 0.3478886756238004,
"grad_norm": 12.106940133149287,
"learning_rate": 4.121270437720526e-07,
"logits/chosen": -0.6640886068344116,
"logits/rejected": -0.6298462748527527,
"logps/chosen": -366.8916931152344,
"logps/rejected": -493.5521545410156,
"loss": 0.5162,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4960647821426392,
"rewards/margins": 0.7681604623794556,
"rewards/rejected": -2.2642252445220947,
"step": 1450
},
{
"epoch": 0.3502879078694818,
"grad_norm": 8.795405985800107,
"learning_rate": 4.105275314897852e-07,
"logits/chosen": -0.6949892640113831,
"logits/rejected": -0.7056195139884949,
"logps/chosen": -351.6402282714844,
"logps/rejected": -530.5101318359375,
"loss": 0.5051,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.323290467262268,
"rewards/margins": 1.3428999185562134,
"rewards/rejected": -2.6661901473999023,
"step": 1460
},
{
"epoch": 0.35268714011516317,
"grad_norm": 9.62012729406584,
"learning_rate": 4.089167588389508e-07,
"logits/chosen": -0.6170503497123718,
"logits/rejected": -0.6489865183830261,
"logps/chosen": -479.70001220703125,
"logps/rejected": -550.5726318359375,
"loss": 0.4903,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2569714784622192,
"rewards/margins": 1.206158995628357,
"rewards/rejected": -2.463130235671997,
"step": 1470
},
{
"epoch": 0.3550863723608445,
"grad_norm": 11.864875238047302,
"learning_rate": 4.072948388088515e-07,
"logits/chosen": -0.5827468037605286,
"logits/rejected": -0.58921217918396,
"logps/chosen": -419.8984375,
"logps/rejected": -524.0806274414062,
"loss": 0.5201,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3793574571609497,
"rewards/margins": 1.0044056177139282,
"rewards/rejected": -2.383762836456299,
"step": 1480
},
{
"epoch": 0.3574856046065259,
"grad_norm": 10.119374435549082,
"learning_rate": 4.056618851707334e-07,
"logits/chosen": -0.6013773679733276,
"logits/rejected": -0.6345557570457458,
"logps/chosen": -384.5671691894531,
"logps/rejected": -508.9178771972656,
"loss": 0.4721,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.068169355392456,
"rewards/margins": 1.1706373691558838,
"rewards/rejected": -2.238806962966919,
"step": 1490
},
{
"epoch": 0.3598848368522073,
"grad_norm": 10.659792949010905,
"learning_rate": 4.0401801246980675e-07,
"logits/chosen": -0.7585668563842773,
"logits/rejected": -0.7779415249824524,
"logps/chosen": -384.1558532714844,
"logps/rejected": -441.42010498046875,
"loss": 0.5083,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.535270094871521,
"rewards/margins": 0.7871710062026978,
"rewards/rejected": -2.3224408626556396,
"step": 1500
},
{
"epoch": 0.3622840690978887,
"grad_norm": 10.123601349785917,
"learning_rate": 4.0236333601721043e-07,
"logits/chosen": -0.6317464709281921,
"logits/rejected": -0.6271511316299438,
"logps/chosen": -450.973876953125,
"logps/rejected": -524.8987426757812,
"loss": 0.5378,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4312914609909058,
"rewards/margins": 0.6362205147743225,
"rewards/rejected": -2.067512035369873,
"step": 1510
},
{
"epoch": 0.3646833013435701,
"grad_norm": 10.190915974376356,
"learning_rate": 4.0069797188192364e-07,
"logits/chosen": -0.6999167203903198,
"logits/rejected": -0.6949875354766846,
"logps/chosen": -410.099853515625,
"logps/rejected": -510.3067321777344,
"loss": 0.4944,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1585137844085693,
"rewards/margins": 1.2204173803329468,
"rewards/rejected": -2.3789315223693848,
"step": 1520
},
{
"epoch": 0.3670825335892514,
"grad_norm": 10.837356546075833,
"learning_rate": 3.9902203688262417e-07,
"logits/chosen": -0.6491087675094604,
"logits/rejected": -0.681550145149231,
"logps/chosen": -389.38946533203125,
"logps/rejected": -480.93701171875,
"loss": 0.4764,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.0993043184280396,
"rewards/margins": 1.0439014434814453,
"rewards/rejected": -2.1432056427001953,
"step": 1530
},
{
"epoch": 0.3694817658349328,
"grad_norm": 9.953869181979533,
"learning_rate": 3.9733564857949365e-07,
"logits/chosen": -0.637142539024353,
"logits/rejected": -0.6458380222320557,
"logps/chosen": -504.4761657714844,
"logps/rejected": -539.9736938476562,
"loss": 0.502,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.6645132303237915,
"rewards/margins": 0.8288125991821289,
"rewards/rejected": -2.493325710296631,
"step": 1540
},
{
"epoch": 0.3718809980806142,
"grad_norm": 11.690441286702658,
"learning_rate": 3.9563892526597177e-07,
"logits/chosen": -0.6920310258865356,
"logits/rejected": -0.6713690161705017,
"logps/chosen": -376.0501708984375,
"logps/rejected": -492.35382080078125,
"loss": 0.4707,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3901115655899048,
"rewards/margins": 0.6692919731140137,
"rewards/rejected": -2.059403419494629,
"step": 1550
},
{
"epoch": 0.3742802303262956,
"grad_norm": 9.497977320733629,
"learning_rate": 3.9393198596045795e-07,
"logits/chosen": -0.7644148468971252,
"logits/rejected": -0.7483991384506226,
"logps/chosen": -376.76885986328125,
"logps/rejected": -497.29595947265625,
"loss": 0.543,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3605177402496338,
"rewards/margins": 0.9189019203186035,
"rewards/rejected": -2.279419422149658,
"step": 1560
},
{
"epoch": 0.376679462571977,
"grad_norm": 7.866666448729003,
"learning_rate": 3.922149503979628e-07,
"logits/chosen": -0.6804630160331726,
"logits/rejected": -0.7361734509468079,
"logps/chosen": -405.5047912597656,
"logps/rejected": -583.3118286132812,
"loss": 0.4749,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.131927728652954,
"rewards/margins": 1.7377452850341797,
"rewards/rejected": -2.869673013687134,
"step": 1570
},
{
"epoch": 0.3790786948176583,
"grad_norm": 9.850531494942707,
"learning_rate": 3.904879390217095e-07,
"logits/chosen": -0.8319008946418762,
"logits/rejected": -0.8503821492195129,
"logps/chosen": -379.04827880859375,
"logps/rejected": -460.70184326171875,
"loss": 0.4687,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1876747608184814,
"rewards/margins": 0.9654358625411987,
"rewards/rejected": -2.1531107425689697,
"step": 1580
},
{
"epoch": 0.3814779270633397,
"grad_norm": 9.904464624440974,
"learning_rate": 3.8875107297468463e-07,
"logits/chosen": -0.7686917781829834,
"logits/rejected": -0.7551219463348389,
"logps/chosen": -396.49847412109375,
"logps/rejected": -589.2566528320312,
"loss": 0.5065,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.386002540588379,
"rewards/margins": 1.4360870122909546,
"rewards/rejected": -2.822089195251465,
"step": 1590
},
{
"epoch": 0.3838771593090211,
"grad_norm": 9.880008920909678,
"learning_rate": 3.87004474091141e-07,
"logits/chosen": -0.621880829334259,
"logits/rejected": -0.6349480152130127,
"logps/chosen": -381.6803894042969,
"logps/rejected": -495.33538818359375,
"loss": 0.5038,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3982946872711182,
"rewards/margins": 0.9522615671157837,
"rewards/rejected": -2.3505563735961914,
"step": 1600
},
{
"epoch": 0.3862763915547025,
"grad_norm": 9.893501375511553,
"learning_rate": 3.8524826488805114e-07,
"logits/chosen": -0.7854813933372498,
"logits/rejected": -0.7525703310966492,
"logps/chosen": -448.2474670410156,
"logps/rejected": -512.2816162109375,
"loss": 0.5077,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.513240098953247,
"rewards/margins": 1.0552384853363037,
"rewards/rejected": -2.5684781074523926,
"step": 1610
},
{
"epoch": 0.3886756238003839,
"grad_norm": 11.71127933022176,
"learning_rate": 3.834825685565133e-07,
"logits/chosen": -0.7181990146636963,
"logits/rejected": -0.7670043706893921,
"logps/chosen": -360.92669677734375,
"logps/rejected": -421.00372314453125,
"loss": 0.4668,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1134910583496094,
"rewards/margins": 0.9747357368469238,
"rewards/rejected": -2.088226795196533,
"step": 1620
},
{
"epoch": 0.39107485604606523,
"grad_norm": 10.523368241496188,
"learning_rate": 3.8170750895311007e-07,
"logits/chosen": -0.7231374979019165,
"logits/rejected": -0.7205518484115601,
"logps/chosen": -411.99884033203125,
"logps/rejected": -491.25201416015625,
"loss": 0.4704,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.136505126953125,
"rewards/margins": 1.0490738153457642,
"rewards/rejected": -2.185579299926758,
"step": 1630
},
{
"epoch": 0.3934740882917466,
"grad_norm": 9.516920015983152,
"learning_rate": 3.7992321059122045e-07,
"logits/chosen": -0.6618058681488037,
"logits/rejected": -0.7061210870742798,
"logps/chosen": -389.2023010253906,
"logps/rejected": -476.97998046875,
"loss": 0.5002,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4337702989578247,
"rewards/margins": 1.021848201751709,
"rewards/rejected": -2.455618381500244,
"step": 1640
},
{
"epoch": 0.395873320537428,
"grad_norm": 9.385395418767361,
"learning_rate": 3.7812979863228576e-07,
"logits/chosen": -0.7980898022651672,
"logits/rejected": -0.8222333788871765,
"logps/chosen": -364.509521484375,
"logps/rejected": -491.23175048828125,
"loss": 0.4547,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4786386489868164,
"rewards/margins": 1.1454684734344482,
"rewards/rejected": -2.6241071224212646,
"step": 1650
},
{
"epoch": 0.3982725527831094,
"grad_norm": 10.324873859774062,
"learning_rate": 3.763273988770296e-07,
"logits/chosen": -0.6266960501670837,
"logits/rejected": -0.6783492565155029,
"logps/chosen": -393.72125244140625,
"logps/rejected": -528.2474365234375,
"loss": 0.4626,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2447704076766968,
"rewards/margins": 1.3596256971359253,
"rewards/rejected": -2.604396104812622,
"step": 1660
},
{
"epoch": 0.4006717850287908,
"grad_norm": 11.992709858901152,
"learning_rate": 3.7451613775663405e-07,
"logits/chosen": -0.7533406615257263,
"logits/rejected": -0.7324401140213013,
"logps/chosen": -424.1239318847656,
"logps/rejected": -602.9987182617188,
"loss": 0.5282,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7245795726776123,
"rewards/margins": 1.6655222177505493,
"rewards/rejected": -3.390101671218872,
"step": 1670
},
{
"epoch": 0.40307101727447214,
"grad_norm": 10.879572547394886,
"learning_rate": 3.726961423238706e-07,
"logits/chosen": -0.7729811668395996,
"logits/rejected": -0.7984837293624878,
"logps/chosen": -386.8953552246094,
"logps/rejected": -555.2294311523438,
"loss": 0.4942,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5044174194335938,
"rewards/margins": 1.4247596263885498,
"rewards/rejected": -2.9291768074035645,
"step": 1680
},
{
"epoch": 0.40547024952015354,
"grad_norm": 11.218700739770185,
"learning_rate": 3.708675402441882e-07,
"logits/chosen": -0.6574599742889404,
"logits/rejected": -0.6961637139320374,
"logps/chosen": -434.22650146484375,
"logps/rejected": -494.91607666015625,
"loss": 0.5091,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3373010158538818,
"rewards/margins": 0.9446905255317688,
"rewards/rejected": -2.281991481781006,
"step": 1690
},
{
"epoch": 0.40786948176583493,
"grad_norm": 8.667993544637785,
"learning_rate": 3.6903045978675775e-07,
"logits/chosen": -0.7062468528747559,
"logits/rejected": -0.7464607954025269,
"logps/chosen": -384.9060363769531,
"logps/rejected": -543.77685546875,
"loss": 0.4989,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4272969961166382,
"rewards/margins": 1.7380192279815674,
"rewards/rejected": -3.165316343307495,
"step": 1700
},
{
"epoch": 0.4102687140115163,
"grad_norm": 8.468965922840697,
"learning_rate": 3.6718502981547474e-07,
"logits/chosen": -0.7246867418289185,
"logits/rejected": -0.7426190376281738,
"logps/chosen": -419.00439453125,
"logps/rejected": -550.689453125,
"loss": 0.5076,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4041268825531006,
"rewards/margins": 0.8987758755683899,
"rewards/rejected": -2.3029026985168457,
"step": 1710
},
{
"epoch": 0.4126679462571977,
"grad_norm": 9.099426609662448,
"learning_rate": 3.6533137977991986e-07,
"logits/chosen": -0.7021734118461609,
"logits/rejected": -0.7220349311828613,
"logps/chosen": -433.3681640625,
"logps/rejected": -527.8214111328125,
"loss": 0.5319,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.4180127382278442,
"rewards/margins": 0.7351676225662231,
"rewards/rejected": -2.1531801223754883,
"step": 1720
},
{
"epoch": 0.41506717850287905,
"grad_norm": 8.435202779591016,
"learning_rate": 3.6346963970627865e-07,
"logits/chosen": -0.639062762260437,
"logits/rejected": -0.6104099154472351,
"logps/chosen": -357.98175048828125,
"logps/rejected": -512.7109375,
"loss": 0.4567,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9540430307388306,
"rewards/margins": 1.3174707889556885,
"rewards/rejected": -2.2715137004852295,
"step": 1730
},
{
"epoch": 0.41746641074856045,
"grad_norm": 11.561877784247002,
"learning_rate": 3.615999401882207e-07,
"logits/chosen": -0.7886170148849487,
"logits/rejected": -0.7725807428359985,
"logps/chosen": -376.1795959472656,
"logps/rejected": -529.3326416015625,
"loss": 0.484,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.531253457069397,
"rewards/margins": 1.2733750343322754,
"rewards/rejected": -2.804628610610962,
"step": 1740
},
{
"epoch": 0.41986564299424184,
"grad_norm": 10.686288534465577,
"learning_rate": 3.597224123777389e-07,
"logits/chosen": -0.6760295629501343,
"logits/rejected": -0.6590694785118103,
"logps/chosen": -399.1770324707031,
"logps/rejected": -554.7936401367188,
"loss": 0.4812,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3701995611190796,
"rewards/margins": 1.3539022207260132,
"rewards/rejected": -2.7241015434265137,
"step": 1750
},
{
"epoch": 0.42226487523992323,
"grad_norm": 9.333815894429002,
"learning_rate": 3.5783718797595e-07,
"logits/chosen": -0.759990394115448,
"logits/rejected": -0.777604877948761,
"logps/chosen": -457.8511657714844,
"logps/rejected": -525.8488159179688,
"loss": 0.5006,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.607553482055664,
"rewards/margins": 1.082852840423584,
"rewards/rejected": -2.690406322479248,
"step": 1760
},
{
"epoch": 0.4246641074856046,
"grad_norm": 10.014756401369624,
"learning_rate": 3.559443992238558e-07,
"logits/chosen": -0.7365792393684387,
"logits/rejected": -0.7805954217910767,
"logps/chosen": -389.59808349609375,
"logps/rejected": -584.1541137695312,
"loss": 0.5067,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.296053171157837,
"rewards/margins": 1.6537158489227295,
"rewards/rejected": -2.9497690200805664,
"step": 1770
},
{
"epoch": 0.42706333973128596,
"grad_norm": 10.070273415353826,
"learning_rate": 3.540441788930673e-07,
"logits/chosen": -0.6368024945259094,
"logits/rejected": -0.6715587377548218,
"logps/chosen": -434.8896484375,
"logps/rejected": -555.1212158203125,
"loss": 0.4727,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2840174436569214,
"rewards/margins": 1.5494660139083862,
"rewards/rejected": -2.833483934402466,
"step": 1780
},
{
"epoch": 0.42946257197696736,
"grad_norm": 9.237418058304717,
"learning_rate": 3.5213666027649123e-07,
"logits/chosen": -0.7330187559127808,
"logits/rejected": -0.7538883686065674,
"logps/chosen": -488.0216369628906,
"logps/rejected": -523.1275024414062,
"loss": 0.4906,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0400612354278564,
"rewards/margins": 0.7529541850090027,
"rewards/rejected": -2.793015241622925,
"step": 1790
},
{
"epoch": 0.43186180422264875,
"grad_norm": 10.85433893317022,
"learning_rate": 3.5022197717898017e-07,
"logits/chosen": -0.7300796508789062,
"logits/rejected": -0.7841044664382935,
"logps/chosen": -403.67730712890625,
"logps/rejected": -509.16558837890625,
"loss": 0.4465,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7776187658309937,
"rewards/margins": 1.2936731576919556,
"rewards/rejected": -3.07129168510437,
"step": 1800
},
{
"epoch": 0.43426103646833014,
"grad_norm": 9.714614186627546,
"learning_rate": 3.4830026390794633e-07,
"logits/chosen": -0.7365170121192932,
"logits/rejected": -0.7708272337913513,
"logps/chosen": -494.6708984375,
"logps/rejected": -567.5706176757812,
"loss": 0.4518,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.851345419883728,
"rewards/margins": 1.2546782493591309,
"rewards/rejected": -3.1060233116149902,
"step": 1810
},
{
"epoch": 0.43666026871401153,
"grad_norm": 7.553890272409832,
"learning_rate": 3.4637165526394104e-07,
"logits/chosen": -0.7593089938163757,
"logits/rejected": -0.7586512565612793,
"logps/chosen": -378.30596923828125,
"logps/rejected": -479.9606018066406,
"loss": 0.4896,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3854007720947266,
"rewards/margins": 0.8888666033744812,
"rewards/rejected": -2.2742676734924316,
"step": 1820
},
{
"epoch": 0.43905950095969287,
"grad_norm": 8.674418910529546,
"learning_rate": 3.4443628653119814e-07,
"logits/chosen": -0.6358439922332764,
"logits/rejected": -0.6491922736167908,
"logps/chosen": -425.07568359375,
"logps/rejected": -638.0093994140625,
"loss": 0.4962,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3985021114349365,
"rewards/margins": 1.68317449092865,
"rewards/rejected": -3.081676959991455,
"step": 1830
},
{
"epoch": 0.44145873320537427,
"grad_norm": 9.311707071153172,
"learning_rate": 3.424942934681453e-07,
"logits/chosen": -0.7188653349876404,
"logits/rejected": -0.7695431709289551,
"logps/chosen": -372.1746520996094,
"logps/rejected": -530.4584350585938,
"loss": 0.4715,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1345326900482178,
"rewards/margins": 1.6814968585968018,
"rewards/rejected": -2.8160295486450195,
"step": 1840
},
{
"epoch": 0.44385796545105566,
"grad_norm": 12.609351544952487,
"learning_rate": 3.405458122978804e-07,
"logits/chosen": -0.7544587850570679,
"logits/rejected": -0.7642985582351685,
"logps/chosen": -424.3369140625,
"logps/rejected": -506.177978515625,
"loss": 0.4883,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.232379674911499,
"rewards/margins": 1.0791466236114502,
"rewards/rejected": -2.3115265369415283,
"step": 1850
},
{
"epoch": 0.44625719769673705,
"grad_norm": 10.750691343448636,
"learning_rate": 3.3859097969861633e-07,
"logits/chosen": -0.6986510157585144,
"logits/rejected": -0.6798522472381592,
"logps/chosen": -440.4366149902344,
"logps/rejected": -503.4756774902344,
"loss": 0.4636,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.483278512954712,
"rewards/margins": 1.0428184270858765,
"rewards/rejected": -2.526096820831299,
"step": 1860
},
{
"epoch": 0.44865642994241844,
"grad_norm": 10.209697273000886,
"learning_rate": 3.366299327940936e-07,
"logits/chosen": -0.7111358046531677,
"logits/rejected": -0.6854827404022217,
"logps/chosen": -484.577392578125,
"logps/rejected": -609.7952880859375,
"loss": 0.4982,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7832101583480835,
"rewards/margins": 1.0032484531402588,
"rewards/rejected": -2.7864584922790527,
"step": 1870
},
{
"epoch": 0.4510556621880998,
"grad_norm": 10.739925690898465,
"learning_rate": 3.3466280914396117e-07,
"logits/chosen": -0.6824935078620911,
"logits/rejected": -0.7022455334663391,
"logps/chosen": -411.59375,
"logps/rejected": -576.3400268554688,
"loss": 0.4636,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5962189435958862,
"rewards/margins": 1.4433557987213135,
"rewards/rejected": -3.0395750999450684,
"step": 1880
},
{
"epoch": 0.4534548944337812,
"grad_norm": 15.164146543864618,
"learning_rate": 3.326897467341281e-07,
"logits/chosen": -0.7167527079582214,
"logits/rejected": -0.74461829662323,
"logps/chosen": -349.6092529296875,
"logps/rejected": -494.6625061035156,
"loss": 0.4884,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3855880498886108,
"rewards/margins": 1.2441637516021729,
"rewards/rejected": -2.629751682281494,
"step": 1890
},
{
"epoch": 0.45585412667946257,
"grad_norm": 12.377059840959534,
"learning_rate": 3.3071088396708335e-07,
"logits/chosen": -0.7990108132362366,
"logits/rejected": -0.7646141052246094,
"logps/chosen": -343.55450439453125,
"logps/rejected": -517.2105712890625,
"loss": 0.489,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3536632061004639,
"rewards/margins": 1.533830165863037,
"rewards/rejected": -2.88749361038208,
"step": 1900
},
{
"epoch": 0.45825335892514396,
"grad_norm": 11.613569882909587,
"learning_rate": 3.2872635965218824e-07,
"logits/chosen": -0.5556444525718689,
"logits/rejected": -0.5901921391487122,
"logps/chosen": -421.22119140625,
"logps/rejected": -584.3763427734375,
"loss": 0.5208,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6016429662704468,
"rewards/margins": 1.2570686340332031,
"rewards/rejected": -2.8587117195129395,
"step": 1910
},
{
"epoch": 0.46065259117082535,
"grad_norm": 9.955391170965928,
"learning_rate": 3.2673631299593905e-07,
"logits/chosen": -0.658098578453064,
"logits/rejected": -0.7359055876731873,
"logps/chosen": -450.887451171875,
"logps/rejected": -559.0867919921875,
"loss": 0.4893,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6648181676864624,
"rewards/margins": 1.1666862964630127,
"rewards/rejected": -2.8315043449401855,
"step": 1920
},
{
"epoch": 0.4630518234165067,
"grad_norm": 11.201011724113032,
"learning_rate": 3.247408835922024e-07,
"logits/chosen": -0.6952091455459595,
"logits/rejected": -0.6913400292396545,
"logps/chosen": -496.1107482910156,
"logps/rejected": -632.0260009765625,
"loss": 0.4992,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.7909389734268188,
"rewards/margins": 1.2750978469848633,
"rewards/rejected": -3.0660367012023926,
"step": 1930
},
{
"epoch": 0.4654510556621881,
"grad_norm": 9.304186497298465,
"learning_rate": 3.2274021141242306e-07,
"logits/chosen": -0.6521833539009094,
"logits/rejected": -0.6770762205123901,
"logps/chosen": -436.94500732421875,
"logps/rejected": -563.1138916015625,
"loss": 0.452,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.661968469619751,
"rewards/margins": 1.2037100791931152,
"rewards/rejected": -2.865678548812866,
"step": 1940
},
{
"epoch": 0.4678502879078695,
"grad_norm": 14.57039869779609,
"learning_rate": 3.2073443679580613e-07,
"logits/chosen": -0.710097074508667,
"logits/rejected": -0.7277542352676392,
"logps/chosen": -424.69091796875,
"logps/rejected": -525.2821044921875,
"loss": 0.4701,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3532383441925049,
"rewards/margins": 0.9686284065246582,
"rewards/rejected": -2.321866750717163,
"step": 1950
},
{
"epoch": 0.47024952015355087,
"grad_norm": 8.6232802821884,
"learning_rate": 3.1872370043947194e-07,
"logits/chosen": -0.7797672152519226,
"logits/rejected": -0.8235223889350891,
"logps/chosen": -389.4978332519531,
"logps/rejected": -578.5491333007812,
"loss": 0.4749,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.0760996341705322,
"rewards/margins": 1.9451076984405518,
"rewards/rejected": -3.021207809448242,
"step": 1960
},
{
"epoch": 0.47264875239923226,
"grad_norm": 10.70725522123182,
"learning_rate": 3.167081433885874e-07,
"logits/chosen": -0.5467191338539124,
"logits/rejected": -0.573945164680481,
"logps/chosen": -495.3836975097656,
"logps/rejected": -635.158203125,
"loss": 0.44,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5290435552597046,
"rewards/margins": 0.9863445162773132,
"rewards/rejected": -2.515388011932373,
"step": 1970
},
{
"epoch": 0.4750479846449136,
"grad_norm": 12.578146181704888,
"learning_rate": 3.14687907026472e-07,
"logits/chosen": -0.6268805265426636,
"logits/rejected": -0.6679359674453735,
"logps/chosen": -384.5243835449219,
"logps/rejected": -525.4344482421875,
"loss": 0.4628,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.4070085287094116,
"rewards/margins": 1.292812705039978,
"rewards/rejected": -2.6998214721679688,
"step": 1980
},
{
"epoch": 0.477447216890595,
"grad_norm": 9.948295850627304,
"learning_rate": 3.126631330646801e-07,
"logits/chosen": -0.635405421257019,
"logits/rejected": -0.6675763726234436,
"logps/chosen": -497.21466064453125,
"logps/rejected": -574.4089965820312,
"loss": 0.4961,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.751558542251587,
"rewards/margins": 0.7816027402877808,
"rewards/rejected": -2.533161163330078,
"step": 1990
},
{
"epoch": 0.4798464491362764,
"grad_norm": 10.378623964749199,
"learning_rate": 3.1063396353306097e-07,
"logits/chosen": -0.698126494884491,
"logits/rejected": -0.7467767000198364,
"logps/chosen": -417.2310485839844,
"logps/rejected": -496.03173828125,
"loss": 0.4627,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2678186893463135,
"rewards/margins": 1.3022050857543945,
"rewards/rejected": -2.570024013519287,
"step": 2000
},
{
"epoch": 0.4798464491362764,
"eval_logits/chosen": -0.704944372177124,
"eval_logits/rejected": -0.7196417450904846,
"eval_logps/chosen": -413.8262023925781,
"eval_logps/rejected": -571.4524536132812,
"eval_loss": 0.46414685249328613,
"eval_rewards/accuracies": 0.8035714030265808,
"eval_rewards/chosen": -1.4788715839385986,
"eval_rewards/margins": 1.4836254119873047,
"eval_rewards/rejected": -2.9624969959259033,
"eval_runtime": 234.411,
"eval_samples_per_second": 19.031,
"eval_steps_per_second": 0.299,
"step": 2000
},
{
"epoch": 0.4822456813819578,
"grad_norm": 10.645209258851928,
"learning_rate": 3.0860054076979535e-07,
"logits/chosen": -0.6905248761177063,
"logits/rejected": -0.681174635887146,
"logps/chosen": -447.33978271484375,
"logps/rejected": -541.9505004882812,
"loss": 0.4867,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.6291488409042358,
"rewards/margins": 1.1908628940582275,
"rewards/rejected": -2.820011615753174,
"step": 2010
},
{
"epoch": 0.4846449136276392,
"grad_norm": 12.91171406230696,
"learning_rate": 3.065630074114115e-07,
"logits/chosen": -0.7098181843757629,
"logits/rejected": -0.7330686450004578,
"logps/chosen": -465.2738342285156,
"logps/rejected": -591.0724487304688,
"loss": 0.4784,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6661344766616821,
"rewards/margins": 1.6645488739013672,
"rewards/rejected": -3.330683469772339,
"step": 2020
},
{
"epoch": 0.4870441458733205,
"grad_norm": 11.6716276344079,
"learning_rate": 3.0452150638277947e-07,
"logits/chosen": -0.6548662185668945,
"logits/rejected": -0.6267608404159546,
"logps/chosen": -399.82501220703125,
"logps/rejected": -510.04425048828125,
"loss": 0.5167,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6882944107055664,
"rewards/margins": 0.975196361541748,
"rewards/rejected": -2.6634907722473145,
"step": 2030
},
{
"epoch": 0.4894433781190019,
"grad_norm": 8.381022963011278,
"learning_rate": 3.024761808870856e-07,
"logits/chosen": -0.7615236043930054,
"logits/rejected": -0.7732762098312378,
"logps/chosen": -385.1295471191406,
"logps/rejected": -582.0843505859375,
"loss": 0.462,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4341049194335938,
"rewards/margins": 1.9455454349517822,
"rewards/rejected": -3.379650592803955,
"step": 2040
},
{
"epoch": 0.4918426103646833,
"grad_norm": 14.700023421409464,
"learning_rate": 3.004271743957875e-07,
"logits/chosen": -0.6434902548789978,
"logits/rejected": -0.6391478180885315,
"logps/chosen": -473.94256591796875,
"logps/rejected": -567.1292114257812,
"loss": 0.5136,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.904388666152954,
"rewards/margins": 0.7521687746047974,
"rewards/rejected": -2.656557559967041,
"step": 2050
},
{
"epoch": 0.4942418426103647,
"grad_norm": 9.948082556212318,
"learning_rate": 2.983746306385499e-07,
"logits/chosen": -0.8040687441825867,
"logits/rejected": -0.750956118106842,
"logps/chosen": -404.974365234375,
"logps/rejected": -571.2884521484375,
"loss": 0.4606,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5249295234680176,
"rewards/margins": 1.4364125728607178,
"rewards/rejected": -2.9613418579101562,
"step": 2060
},
{
"epoch": 0.4966410748560461,
"grad_norm": 10.22438537397434,
"learning_rate": 2.963186935931628e-07,
"logits/chosen": -0.7241095304489136,
"logits/rejected": -0.6997084021568298,
"logps/chosen": -448.89300537109375,
"logps/rejected": -555.0241088867188,
"loss": 0.4714,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5664496421813965,
"rewards/margins": 1.0874204635620117,
"rewards/rejected": -2.653870105743408,
"step": 2070
},
{
"epoch": 0.4990403071017274,
"grad_norm": 8.109581248601415,
"learning_rate": 2.9425950747544176e-07,
"logits/chosen": -0.641141414642334,
"logits/rejected": -0.7013910412788391,
"logps/chosen": -510.6063537597656,
"logps/rejected": -640.6654052734375,
"loss": 0.4353,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8512099981307983,
"rewards/margins": 1.588141679763794,
"rewards/rejected": -3.4393515586853027,
"step": 2080
},
{
"epoch": 0.5014395393474088,
"grad_norm": 12.084926653784903,
"learning_rate": 2.921972167291119e-07,
"logits/chosen": -0.7068333625793457,
"logits/rejected": -0.7458164691925049,
"logps/chosen": -449.1051330566406,
"logps/rejected": -605.0886840820312,
"loss": 0.4457,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4715017080307007,
"rewards/margins": 1.2597967386245728,
"rewards/rejected": -2.7312982082366943,
"step": 2090
},
{
"epoch": 0.5038387715930902,
"grad_norm": 10.473433486601849,
"learning_rate": 2.9013196601567567e-07,
"logits/chosen": -0.672719419002533,
"logits/rejected": -0.6805760264396667,
"logps/chosen": -399.2666931152344,
"logps/rejected": -524.7140502929688,
"loss": 0.5356,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.396222472190857,
"rewards/margins": 1.1300289630889893,
"rewards/rejected": -2.5262515544891357,
"step": 2100
},
{
"epoch": 0.5062380038387716,
"grad_norm": 8.259116612360256,
"learning_rate": 2.8806390020426555e-07,
"logits/chosen": -0.7717374563217163,
"logits/rejected": -0.7531148195266724,
"logps/chosen": -406.16351318359375,
"logps/rejected": -557.4243774414062,
"loss": 0.4464,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2984120845794678,
"rewards/margins": 1.5021198987960815,
"rewards/rejected": -2.800532102584839,
"step": 2110
},
{
"epoch": 0.508637236084453,
"grad_norm": 12.480170050902744,
"learning_rate": 2.8599316436148187e-07,
"logits/chosen": -0.6736984252929688,
"logits/rejected": -0.6641879081726074,
"logps/chosen": -438.26092529296875,
"logps/rejected": -534.37890625,
"loss": 0.4671,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.752355933189392,
"rewards/margins": 0.9608189463615417,
"rewards/rejected": -2.713174819946289,
"step": 2120
},
{
"epoch": 0.5110364683301344,
"grad_norm": 13.113194375488565,
"learning_rate": 2.8391990374121723e-07,
"logits/chosen": -0.7215433120727539,
"logits/rejected": -0.7145394086837769,
"logps/chosen": -429.87689208984375,
"logps/rejected": -588.1317138671875,
"loss": 0.5056,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8144502639770508,
"rewards/margins": 1.295245885848999,
"rewards/rejected": -3.10969614982605,
"step": 2130
},
{
"epoch": 0.5134357005758158,
"grad_norm": 10.389514519832314,
"learning_rate": 2.818442637744669e-07,
"logits/chosen": -0.7280897498130798,
"logits/rejected": -0.7563216686248779,
"logps/chosen": -451.57037353515625,
"logps/rejected": -583.136962890625,
"loss": 0.4995,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9117393493652344,
"rewards/margins": 1.2607026100158691,
"rewards/rejected": -3.1724419593811035,
"step": 2140
},
{
"epoch": 0.5158349328214972,
"grad_norm": 9.893178585305161,
"learning_rate": 2.797663900591284e-07,
"logits/chosen": -0.7491916418075562,
"logits/rejected": -0.7631763815879822,
"logps/chosen": -454.49542236328125,
"logps/rejected": -534.6981201171875,
"loss": 0.4507,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7530666589736938,
"rewards/margins": 1.1452219486236572,
"rewards/rejected": -2.8982887268066406,
"step": 2150
},
{
"epoch": 0.5182341650671785,
"grad_norm": 10.78966013478198,
"learning_rate": 2.776864283497874e-07,
"logits/chosen": -0.7122198343276978,
"logits/rejected": -0.7713319063186646,
"logps/chosen": -410.6304626464844,
"logps/rejected": -599.1976928710938,
"loss": 0.479,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6474449634552002,
"rewards/margins": 1.9179481267929077,
"rewards/rejected": -3.5653927326202393,
"step": 2160
},
{
"epoch": 0.5206333973128598,
"grad_norm": 7.331401403742752,
"learning_rate": 2.756045245474943e-07,
"logits/chosen": -0.672527551651001,
"logits/rejected": -0.6674192547798157,
"logps/chosen": -429.77374267578125,
"logps/rejected": -540.6340942382812,
"loss": 0.4662,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4047420024871826,
"rewards/margins": 0.8396526575088501,
"rewards/rejected": -2.2443947792053223,
"step": 2170
},
{
"epoch": 0.5230326295585412,
"grad_norm": 11.47735860978489,
"learning_rate": 2.7352082468952977e-07,
"logits/chosen": -0.7144309282302856,
"logits/rejected": -0.7627060413360596,
"logps/chosen": -419.7461853027344,
"logps/rejected": -631.6406860351562,
"loss": 0.5053,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7272449731826782,
"rewards/margins": 1.802207589149475,
"rewards/rejected": -3.529452085494995,
"step": 2180
},
{
"epoch": 0.5254318618042226,
"grad_norm": 12.66627984144212,
"learning_rate": 2.7143547493916e-07,
"logits/chosen": -0.7830231785774231,
"logits/rejected": -0.7730289697647095,
"logps/chosen": -392.6502380371094,
"logps/rejected": -612.9030151367188,
"loss": 0.4365,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.296020746231079,
"rewards/margins": 2.1511874198913574,
"rewards/rejected": -3.4472084045410156,
"step": 2190
},
{
"epoch": 0.527831094049904,
"grad_norm": 10.665323165924152,
"learning_rate": 2.693486215753853e-07,
"logits/chosen": -0.7580839395523071,
"logits/rejected": -0.778628945350647,
"logps/chosen": -419.266845703125,
"logps/rejected": -601.0218505859375,
"loss": 0.4777,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.676492691040039,
"rewards/margins": 1.9639472961425781,
"rewards/rejected": -3.640439987182617,
"step": 2200
},
{
"epoch": 0.5302303262955854,
"grad_norm": 15.168171864262504,
"learning_rate": 2.6726041098267805e-07,
"logits/chosen": -0.8083688616752625,
"logits/rejected": -0.835811972618103,
"logps/chosen": -480.6859436035156,
"logps/rejected": -533.2108764648438,
"loss": 0.4966,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.826184868812561,
"rewards/margins": 0.8893225789070129,
"rewards/rejected": -2.7155075073242188,
"step": 2210
},
{
"epoch": 0.5326295585412668,
"grad_norm": 13.565039831991228,
"learning_rate": 2.6517098964071507e-07,
"logits/chosen": -0.6329632997512817,
"logits/rejected": -0.658043384552002,
"logps/chosen": -456.0284118652344,
"logps/rejected": -526.2221069335938,
"loss": 0.5196,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.818780541419983,
"rewards/margins": 0.5884894132614136,
"rewards/rejected": -2.4072699546813965,
"step": 2220
},
{
"epoch": 0.5350287907869482,
"grad_norm": 11.227729437674412,
"learning_rate": 2.630805041141023e-07,
"logits/chosen": -0.7310200333595276,
"logits/rejected": -0.7426483035087585,
"logps/chosen": -385.03326416015625,
"logps/rejected": -597.54541015625,
"loss": 0.4777,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4363114833831787,
"rewards/margins": 1.9664011001586914,
"rewards/rejected": -3.402712345123291,
"step": 2230
},
{
"epoch": 0.5374280230326296,
"grad_norm": 12.301180793094087,
"learning_rate": 2.609891010420941e-07,
"logits/chosen": -0.756328284740448,
"logits/rejected": -0.731390118598938,
"logps/chosen": -422.31524658203125,
"logps/rejected": -570.0447998046875,
"loss": 0.4578,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5025454759597778,
"rewards/margins": 1.4451904296875,
"rewards/rejected": -2.9477362632751465,
"step": 2240
},
{
"epoch": 0.539827255278311,
"grad_norm": 11.203287249603568,
"learning_rate": 2.5889692712830674e-07,
"logits/chosen": -0.7012640237808228,
"logits/rejected": -0.734104335308075,
"logps/chosen": -366.77569580078125,
"logps/rejected": -487.832763671875,
"loss": 0.452,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.3068386316299438,
"rewards/margins": 1.2463617324829102,
"rewards/rejected": -2.5532002449035645,
"step": 2250
},
{
"epoch": 0.5422264875239923,
"grad_norm": 11.401920927705493,
"learning_rate": 2.5680412913042843e-07,
"logits/chosen": -0.7200027704238892,
"logits/rejected": -0.7047854661941528,
"logps/chosen": -408.1886291503906,
"logps/rejected": -583.43896484375,
"loss": 0.4436,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.5970829725265503,
"rewards/margins": 1.6908365488052368,
"rewards/rejected": -3.2879199981689453,
"step": 2260
},
{
"epoch": 0.5446257197696737,
"grad_norm": 13.14986787185228,
"learning_rate": 2.5471085384992404e-07,
"logits/chosen": -0.7282342910766602,
"logits/rejected": -0.7267628312110901,
"logps/chosen": -395.9429016113281,
"logps/rejected": -668.2164916992188,
"loss": 0.4305,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5156352519989014,
"rewards/margins": 2.5282320976257324,
"rewards/rejected": -4.043867588043213,
"step": 2270
},
{
"epoch": 0.5470249520153551,
"grad_norm": 9.395110944586639,
"learning_rate": 2.526172481217381e-07,
"logits/chosen": -0.6741994619369507,
"logits/rejected": -0.6505922675132751,
"logps/chosen": -426.7835388183594,
"logps/rejected": -573.0272827148438,
"loss": 0.4759,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0611157417297363,
"rewards/margins": 1.2577565908432007,
"rewards/rejected": -3.3188719749450684,
"step": 2280
},
{
"epoch": 0.5494241842610365,
"grad_norm": 13.157814593299042,
"learning_rate": 2.5052345880399456e-07,
"logits/chosen": -0.727673351764679,
"logits/rejected": -0.7585957050323486,
"logps/chosen": -428.2842712402344,
"logps/rejected": -550.08642578125,
"loss": 0.4445,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9122216701507568,
"rewards/margins": 1.1558340787887573,
"rewards/rejected": -3.0680556297302246,
"step": 2290
},
{
"epoch": 0.5518234165067178,
"grad_norm": 11.84384992024014,
"learning_rate": 2.4842963276769555e-07,
"logits/chosen": -0.6177406311035156,
"logits/rejected": -0.5921697616577148,
"logps/chosen": -425.8565368652344,
"logps/rejected": -591.1513061523438,
"loss": 0.4711,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9242675304412842,
"rewards/margins": 1.2209120988845825,
"rewards/rejected": -3.1451797485351562,
"step": 2300
},
{
"epoch": 0.5542226487523992,
"grad_norm": 10.558621692008028,
"learning_rate": 2.463359168864189e-07,
"logits/chosen": -0.6363598108291626,
"logits/rejected": -0.7210627794265747,
"logps/chosen": -480.21820068359375,
"logps/rejected": -575.8844604492188,
"loss": 0.4867,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.584695816040039,
"rewards/margins": 1.3920116424560547,
"rewards/rejected": -2.9767074584960938,
"step": 2310
},
{
"epoch": 0.5566218809980806,
"grad_norm": 13.809223972324538,
"learning_rate": 2.4424245802601555e-07,
"logits/chosen": -0.7176483869552612,
"logits/rejected": -0.7233623266220093,
"logps/chosen": -392.2666931152344,
"logps/rejected": -544.047607421875,
"loss": 0.4604,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4483287334442139,
"rewards/margins": 0.9412357211112976,
"rewards/rejected": -2.3895645141601562,
"step": 2320
},
{
"epoch": 0.559021113243762,
"grad_norm": 10.320795589655923,
"learning_rate": 2.421494030343072e-07,
"logits/chosen": -0.5995772480964661,
"logits/rejected": -0.665002703666687,
"logps/chosen": -429.8282165527344,
"logps/rejected": -476.87384033203125,
"loss": 0.5063,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5427360534667969,
"rewards/margins": 1.0240482091903687,
"rewards/rejected": -2.566784143447876,
"step": 2330
},
{
"epoch": 0.5614203454894434,
"grad_norm": 11.374067564936935,
"learning_rate": 2.400568987307861e-07,
"logits/chosen": -0.6323488354682922,
"logits/rejected": -0.6464725732803345,
"logps/chosen": -405.65814208984375,
"logps/rejected": -461.2422790527344,
"loss": 0.4381,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.6847827434539795,
"rewards/margins": 0.6106036305427551,
"rewards/rejected": -2.29538631439209,
"step": 2340
},
{
"epoch": 0.5638195777351248,
"grad_norm": 11.798719511944995,
"learning_rate": 2.379650918963156e-07,
"logits/chosen": -0.7201340198516846,
"logits/rejected": -0.7137752771377563,
"logps/chosen": -407.8214416503906,
"logps/rejected": -557.0501708984375,
"loss": 0.4396,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9475882053375244,
"rewards/margins": 1.3403925895690918,
"rewards/rejected": -3.287980556488037,
"step": 2350
},
{
"epoch": 0.5662188099808061,
"grad_norm": 18.10704498970427,
"learning_rate": 2.3587412926283438e-07,
"logits/chosen": -0.7477551698684692,
"logits/rejected": -0.7495108842849731,
"logps/chosen": -487.01788330078125,
"logps/rejected": -621.1641235351562,
"loss": 0.4846,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7702643871307373,
"rewards/margins": 1.6865708827972412,
"rewards/rejected": -3.4568352699279785,
"step": 2360
},
{
"epoch": 0.5686180422264875,
"grad_norm": 9.023431566315685,
"learning_rate": 2.337841575030642e-07,
"logits/chosen": -0.6413623690605164,
"logits/rejected": -0.6585931777954102,
"logps/chosen": -468.298095703125,
"logps/rejected": -593.5641479492188,
"loss": 0.4868,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6977428197860718,
"rewards/margins": 1.104875087738037,
"rewards/rejected": -2.8026180267333984,
"step": 2370
},
{
"epoch": 0.5710172744721689,
"grad_norm": 9.106685136975523,
"learning_rate": 2.316953232202206e-07,
"logits/chosen": -0.6131690740585327,
"logits/rejected": -0.705342710018158,
"logps/chosen": -403.64483642578125,
"logps/rejected": -455.7774353027344,
"loss": 0.4345,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.58620285987854,
"rewards/margins": 1.1645066738128662,
"rewards/rejected": -2.7507095336914062,
"step": 2380
},
{
"epoch": 0.5734165067178503,
"grad_norm": 12.485032451040595,
"learning_rate": 2.2960777293772958e-07,
"logits/chosen": -0.5965815186500549,
"logits/rejected": -0.6691153049468994,
"logps/chosen": -375.786376953125,
"logps/rejected": -546.0467529296875,
"loss": 0.4677,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4897288084030151,
"rewards/margins": 1.8273935317993164,
"rewards/rejected": -3.3171226978302,
"step": 2390
},
{
"epoch": 0.5758157389635317,
"grad_norm": 9.104242742336105,
"learning_rate": 2.2752165308894974e-07,
"logits/chosen": -0.6820736527442932,
"logits/rejected": -0.6869875192642212,
"logps/chosen": -366.31060791015625,
"logps/rejected": -517.474365234375,
"loss": 0.4591,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6863653659820557,
"rewards/margins": 1.5893397331237793,
"rewards/rejected": -3.275705337524414,
"step": 2400
},
{
"epoch": 0.5782149712092131,
"grad_norm": 15.565842828873247,
"learning_rate": 2.254371100069005e-07,
"logits/chosen": -0.6215115189552307,
"logits/rejected": -0.5873704552650452,
"logps/chosen": -400.32440185546875,
"logps/rejected": -548.8357543945312,
"loss": 0.4672,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.3634350299835205,
"rewards/margins": 1.2007476091384888,
"rewards/rejected": -2.564182758331299,
"step": 2410
},
{
"epoch": 0.5806142034548945,
"grad_norm": 10.548601465166293,
"learning_rate": 2.2335428991399725e-07,
"logits/chosen": -0.6729727387428284,
"logits/rejected": -0.6920270919799805,
"logps/chosen": -388.9903564453125,
"logps/rejected": -711.9734497070312,
"loss": 0.4679,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8660037517547607,
"rewards/margins": 3.018165111541748,
"rewards/rejected": -4.884169101715088,
"step": 2420
},
{
"epoch": 0.5830134357005758,
"grad_norm": 9.156657974638184,
"learning_rate": 2.2127333891179458e-07,
"logits/chosen": -0.7091829180717468,
"logits/rejected": -0.7354472875595093,
"logps/chosen": -383.76605224609375,
"logps/rejected": -601.3203735351562,
"loss": 0.48,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6603578329086304,
"rewards/margins": 1.862908959388733,
"rewards/rejected": -3.523266553878784,
"step": 2430
},
{
"epoch": 0.5854126679462572,
"grad_norm": 13.232575343054773,
"learning_rate": 2.1919440297073782e-07,
"logits/chosen": -0.7067408561706543,
"logits/rejected": -0.7342425584793091,
"logps/chosen": -383.7411804199219,
"logps/rejected": -589.1730346679688,
"loss": 0.4993,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7060611248016357,
"rewards/margins": 1.9039154052734375,
"rewards/rejected": -3.609976291656494,
"step": 2440
},
{
"epoch": 0.5878119001919386,
"grad_norm": 9.071189608456551,
"learning_rate": 2.1711762791992368e-07,
"logits/chosen": -0.6443785429000854,
"logits/rejected": -0.6465337872505188,
"logps/chosen": -449.90753173828125,
"logps/rejected": -567.1222534179688,
"loss": 0.4979,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4745603799819946,
"rewards/margins": 1.4671189785003662,
"rewards/rejected": -2.941678762435913,
"step": 2450
},
{
"epoch": 0.5902111324376199,
"grad_norm": 10.300854259670189,
"learning_rate": 2.1504315943687114e-07,
"logits/chosen": -0.7359960675239563,
"logits/rejected": -0.72270667552948,
"logps/chosen": -403.669189453125,
"logps/rejected": -606.3935546875,
"loss": 0.4464,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.6255872249603271,
"rewards/margins": 1.5683765411376953,
"rewards/rejected": -3.1939637660980225,
"step": 2460
},
{
"epoch": 0.5926103646833013,
"grad_norm": 12.922478808451812,
"learning_rate": 2.1297114303730248e-07,
"logits/chosen": -0.6276537775993347,
"logits/rejected": -0.5880897045135498,
"logps/chosen": -394.9385986328125,
"logps/rejected": -579.1409912109375,
"loss": 0.5033,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5723788738250732,
"rewards/margins": 1.3019646406173706,
"rewards/rejected": -2.8743433952331543,
"step": 2470
},
{
"epoch": 0.5950095969289827,
"grad_norm": 11.565229978778513,
"learning_rate": 2.1090172406493616e-07,
"logits/chosen": -0.6361690163612366,
"logits/rejected": -0.6227170825004578,
"logps/chosen": -399.3015441894531,
"logps/rejected": -556.216064453125,
"loss": 0.4182,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.5578765869140625,
"rewards/margins": 1.3979572057724,
"rewards/rejected": -2.955833911895752,
"step": 2480
},
{
"epoch": 0.5974088291746641,
"grad_norm": 13.512372450905238,
"learning_rate": 2.0883504768129146e-07,
"logits/chosen": -0.7200502157211304,
"logits/rejected": -0.7266454696655273,
"logps/chosen": -463.3838806152344,
"logps/rejected": -626.7127075195312,
"loss": 0.4704,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8044846057891846,
"rewards/margins": 1.658591628074646,
"rewards/rejected": -3.46307635307312,
"step": 2490
},
{
"epoch": 0.5998080614203455,
"grad_norm": 10.955905269308658,
"learning_rate": 2.0677125885550571e-07,
"logits/chosen": -0.560949444770813,
"logits/rejected": -0.6333897113800049,
"logps/chosen": -404.170654296875,
"logps/rejected": -486.53631591796875,
"loss": 0.4561,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.536853551864624,
"rewards/margins": 1.2563847303390503,
"rewards/rejected": -2.7932381629943848,
"step": 2500
},
{
"epoch": 0.6022072936660269,
"grad_norm": 11.614988822427518,
"learning_rate": 2.0471050235416587e-07,
"logits/chosen": -0.6411922574043274,
"logits/rejected": -0.7291480898857117,
"logps/chosen": -442.89910888671875,
"logps/rejected": -545.2685546875,
"loss": 0.438,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.7440217733383179,
"rewards/margins": 1.5008093118667603,
"rewards/rejected": -3.24483060836792,
"step": 2510
},
{
"epoch": 0.6046065259117083,
"grad_norm": 12.607898741358827,
"learning_rate": 2.026529227311532e-07,
"logits/chosen": -0.7110682725906372,
"logits/rejected": -0.7056074142456055,
"logps/chosen": -416.0528259277344,
"logps/rejected": -573.2369384765625,
"loss": 0.4998,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.9202907085418701,
"rewards/margins": 1.4374290704727173,
"rewards/rejected": -3.357719898223877,
"step": 2520
},
{
"epoch": 0.6070057581573897,
"grad_norm": 10.178927094903639,
"learning_rate": 2.005986643175036e-07,
"logits/chosen": -0.6290922164916992,
"logits/rejected": -0.5829756259918213,
"logps/chosen": -434.39288330078125,
"logps/rejected": -613.2723999023438,
"loss": 0.4025,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4553261995315552,
"rewards/margins": 1.7923997640609741,
"rewards/rejected": -3.2477259635925293,
"step": 2530
},
{
"epoch": 0.6094049904030711,
"grad_norm": 13.408248580673138,
"learning_rate": 1.9854787121128328e-07,
"logits/chosen": -0.6658229231834412,
"logits/rejected": -0.7100438475608826,
"logps/chosen": -389.3979797363281,
"logps/rejected": -438.6206970214844,
"loss": 0.4888,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6204715967178345,
"rewards/margins": 0.9446170926094055,
"rewards/rejected": -2.5650887489318848,
"step": 2540
},
{
"epoch": 0.6118042226487524,
"grad_norm": 12.880050575259938,
"learning_rate": 1.9650068726748106e-07,
"logits/chosen": -0.6123485565185547,
"logits/rejected": -0.6827987432479858,
"logps/chosen": -440.87896728515625,
"logps/rejected": -584.0040283203125,
"loss": 0.4767,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.731406569480896,
"rewards/margins": 1.396422028541565,
"rewards/rejected": -3.127828359603882,
"step": 2550
},
{
"epoch": 0.6142034548944337,
"grad_norm": 10.788734052701793,
"learning_rate": 1.9445725608791718e-07,
"logits/chosen": -0.6031758785247803,
"logits/rejected": -0.648442268371582,
"logps/chosen": -400.7000732421875,
"logps/rejected": -660.7904052734375,
"loss": 0.4752,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2926169633865356,
"rewards/margins": 2.534430980682373,
"rewards/rejected": -3.827047824859619,
"step": 2560
},
{
"epoch": 0.6166026871401151,
"grad_norm": 10.567224853092739,
"learning_rate": 1.924177210111705e-07,
"logits/chosen": -0.7051092386245728,
"logits/rejected": -0.7292466759681702,
"logps/chosen": -377.7469787597656,
"logps/rejected": -552.3789672851562,
"loss": 0.4712,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5156134366989136,
"rewards/margins": 1.6158390045166016,
"rewards/rejected": -3.1314525604248047,
"step": 2570
},
{
"epoch": 0.6190019193857965,
"grad_norm": 9.777520636965628,
"learning_rate": 1.9038222510252364e-07,
"logits/chosen": -0.6945359110832214,
"logits/rejected": -0.6680124998092651,
"logps/chosen": -410.309326171875,
"logps/rejected": -502.10760498046875,
"loss": 0.4815,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4556920528411865,
"rewards/margins": 1.0640310049057007,
"rewards/rejected": -2.5197231769561768,
"step": 2580
},
{
"epoch": 0.6214011516314779,
"grad_norm": 11.680219063781859,
"learning_rate": 1.883509111439277e-07,
"logits/chosen": -0.6178931593894958,
"logits/rejected": -0.6295452117919922,
"logps/chosen": -406.61749267578125,
"logps/rejected": -649.4032592773438,
"loss": 0.4905,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6173099279403687,
"rewards/margins": 1.7425906658172607,
"rewards/rejected": -3.359900712966919,
"step": 2590
},
{
"epoch": 0.6238003838771593,
"grad_norm": 8.702965805638529,
"learning_rate": 1.8632392162398665e-07,
"logits/chosen": -0.7019624710083008,
"logits/rejected": -0.6865247488021851,
"logps/chosen": -422.4203186035156,
"logps/rejected": -627.749755859375,
"loss": 0.446,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.275315761566162,
"rewards/margins": 2.0367074012756348,
"rewards/rejected": -3.312023639678955,
"step": 2600
},
{
"epoch": 0.6261996161228407,
"grad_norm": 10.5146873413561,
"learning_rate": 1.84301398727962e-07,
"logits/chosen": -0.6342155933380127,
"logits/rejected": -0.5797609686851501,
"logps/chosen": -340.15179443359375,
"logps/rejected": -614.032470703125,
"loss": 0.449,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4568018913269043,
"rewards/margins": 2.3389487266540527,
"rewards/rejected": -3.795750379562378,
"step": 2610
},
{
"epoch": 0.6285988483685221,
"grad_norm": 10.737478950061426,
"learning_rate": 1.8228348432779966e-07,
"logits/chosen": -0.7070366740226746,
"logits/rejected": -0.717880129814148,
"logps/chosen": -418.17706298828125,
"logps/rejected": -548.4085083007812,
"loss": 0.501,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8227876424789429,
"rewards/margins": 1.3973190784454346,
"rewards/rejected": -3.220106840133667,
"step": 2620
},
{
"epoch": 0.6309980806142035,
"grad_norm": 9.684713099622218,
"learning_rate": 1.8027031997217773e-07,
"logits/chosen": -0.7213168740272522,
"logits/rejected": -0.7527577877044678,
"logps/chosen": -404.2679748535156,
"logps/rejected": -679.9412841796875,
"loss": 0.4013,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9124399423599243,
"rewards/margins": 2.531831741333008,
"rewards/rejected": -4.444272041320801,
"step": 2630
},
{
"epoch": 0.6333973128598849,
"grad_norm": 11.280247842445823,
"learning_rate": 1.7826204687657758e-07,
"logits/chosen": -0.6257885098457336,
"logits/rejected": -0.5951186418533325,
"logps/chosen": -457.1598205566406,
"logps/rejected": -517.6695556640625,
"loss": 0.4193,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6708452701568604,
"rewards/margins": 1.033866286277771,
"rewards/rejected": -2.704711437225342,
"step": 2640
},
{
"epoch": 0.6357965451055663,
"grad_norm": 14.057616949353399,
"learning_rate": 1.762588059133781e-07,
"logits/chosen": -0.6142539381980896,
"logits/rejected": -0.6413928270339966,
"logps/chosen": -473.72979736328125,
"logps/rejected": -600.7677001953125,
"loss": 0.4437,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7104568481445312,
"rewards/margins": 1.6506239175796509,
"rewards/rejected": -3.3610808849334717,
"step": 2650
},
{
"epoch": 0.6381957773512476,
"grad_norm": 10.007284775964992,
"learning_rate": 1.7426073760197406e-07,
"logits/chosen": -0.768576979637146,
"logits/rejected": -0.7556449174880981,
"logps/chosen": -412.39984130859375,
"logps/rejected": -669.0284423828125,
"loss": 0.4858,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6826118230819702,
"rewards/margins": 2.168964147567749,
"rewards/rejected": -3.8515758514404297,
"step": 2660
},
{
"epoch": 0.6405950095969289,
"grad_norm": 8.96835406227991,
"learning_rate": 1.7226798209891935e-07,
"logits/chosen": -0.583341658115387,
"logits/rejected": -0.6630910038948059,
"logps/chosen": -434.9918518066406,
"logps/rejected": -545.5853271484375,
"loss": 0.4433,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7140676975250244,
"rewards/margins": 1.6664565801620483,
"rewards/rejected": -3.380524158477783,
"step": 2670
},
{
"epoch": 0.6429942418426103,
"grad_norm": 10.326630103018084,
"learning_rate": 1.7028067918809535e-07,
"logits/chosen": -0.6508952379226685,
"logits/rejected": -0.6744917631149292,
"logps/chosen": -381.95367431640625,
"logps/rejected": -674.4815673828125,
"loss": 0.4495,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.5953381061553955,
"rewards/margins": 2.39109468460083,
"rewards/rejected": -3.9864323139190674,
"step": 2680
},
{
"epoch": 0.6453934740882917,
"grad_norm": 12.773425150047508,
"learning_rate": 1.6829896827090584e-07,
"logits/chosen": -0.7672047019004822,
"logits/rejected": -0.7807837724685669,
"logps/chosen": -420.0484313964844,
"logps/rejected": -511.435302734375,
"loss": 0.4804,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6788091659545898,
"rewards/margins": 1.1816326379776,
"rewards/rejected": -2.8604416847229004,
"step": 2690
},
{
"epoch": 0.6477927063339731,
"grad_norm": 8.097344363797557,
"learning_rate": 1.6632298835649844e-07,
"logits/chosen": -0.6450155973434448,
"logits/rejected": -0.6305941343307495,
"logps/chosen": -443.16607666015625,
"logps/rejected": -679.9429321289062,
"loss": 0.4316,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5886855125427246,
"rewards/margins": 1.9584792852401733,
"rewards/rejected": -3.5471644401550293,
"step": 2700
},
{
"epoch": 0.6501919385796545,
"grad_norm": 17.48613369684693,
"learning_rate": 1.6435287805201364e-07,
"logits/chosen": -0.6135013103485107,
"logits/rejected": -0.6035945415496826,
"logps/chosen": -451.90380859375,
"logps/rejected": -557.508544921875,
"loss": 0.4885,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8703639507293701,
"rewards/margins": 1.1114423274993896,
"rewards/rejected": -2.9818062782287598,
"step": 2710
},
{
"epoch": 0.6525911708253359,
"grad_norm": 10.490682463258194,
"learning_rate": 1.6238877555286207e-07,
"logits/chosen": -0.6777503490447998,
"logits/rejected": -0.6797904968261719,
"logps/chosen": -436.258056640625,
"logps/rejected": -608.3687744140625,
"loss": 0.4457,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.422296166419983,
"rewards/margins": 1.5763972997665405,
"rewards/rejected": -2.9986937046051025,
"step": 2720
},
{
"epoch": 0.6549904030710173,
"grad_norm": 12.224882835993649,
"learning_rate": 1.60430818633031e-07,
"logits/chosen": -0.6939619779586792,
"logits/rejected": -0.6975654363632202,
"logps/chosen": -427.3936462402344,
"logps/rejected": -593.472412109375,
"loss": 0.4393,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.530230164527893,
"rewards/margins": 1.6595938205718994,
"rewards/rejected": -3.189823865890503,
"step": 2730
},
{
"epoch": 0.6573896353166987,
"grad_norm": 10.153894922340301,
"learning_rate": 1.5847914463541939e-07,
"logits/chosen": -0.6700283288955688,
"logits/rejected": -0.6896187663078308,
"logps/chosen": -357.0517578125,
"logps/rejected": -522.7760620117188,
"loss": 0.4347,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.513653039932251,
"rewards/margins": 1.3927968740463257,
"rewards/rejected": -2.906449794769287,
"step": 2740
},
{
"epoch": 0.6597888675623801,
"grad_norm": 8.373116906245645,
"learning_rate": 1.5653389046220427e-07,
"logits/chosen": -0.60322105884552,
"logits/rejected": -0.6336368322372437,
"logps/chosen": -365.42059326171875,
"logps/rejected": -530.8365478515625,
"loss": 0.4393,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.241115927696228,
"rewards/margins": 1.4525740146636963,
"rewards/rejected": -2.693690299987793,
"step": 2750
},
{
"epoch": 0.6621880998080614,
"grad_norm": 13.542426256092975,
"learning_rate": 1.545951925652375e-07,
"logits/chosen": -0.6148853302001953,
"logits/rejected": -0.6405919194221497,
"logps/chosen": -476.22589111328125,
"logps/rejected": -573.9639282226562,
"loss": 0.4321,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4998128414154053,
"rewards/margins": 1.6009342670440674,
"rewards/rejected": -3.1007466316223145,
"step": 2760
},
{
"epoch": 0.6645873320537428,
"grad_norm": 11.419777174024256,
"learning_rate": 1.5266318693647423e-07,
"logits/chosen": -0.6193658113479614,
"logits/rejected": -0.6027348637580872,
"logps/chosen": -428.71490478515625,
"logps/rejected": -535.6744995117188,
"loss": 0.4509,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.558387041091919,
"rewards/margins": 1.1832726001739502,
"rewards/rejected": -2.741659641265869,
"step": 2770
},
{
"epoch": 0.6669865642994242,
"grad_norm": 12.984704637723206,
"learning_rate": 1.5073800909843353e-07,
"logits/chosen": -0.6714409589767456,
"logits/rejected": -0.7126461267471313,
"logps/chosen": -428.8514099121094,
"logps/rejected": -529.1953735351562,
"loss": 0.4526,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5292034149169922,
"rewards/margins": 1.5168203115463257,
"rewards/rejected": -3.0460238456726074,
"step": 2780
},
{
"epoch": 0.6693857965451055,
"grad_norm": 14.138835125115458,
"learning_rate": 1.488197940946922e-07,
"logits/chosen": -0.6455475687980652,
"logits/rejected": -0.642737090587616,
"logps/chosen": -417.8585510253906,
"logps/rejected": -533.5113525390625,
"loss": 0.4155,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2823712825775146,
"rewards/margins": 1.770923376083374,
"rewards/rejected": -3.0532946586608887,
"step": 2790
},
{
"epoch": 0.6717850287907869,
"grad_norm": 15.610740173237787,
"learning_rate": 1.4690867648041167e-07,
"logits/chosen": -0.6172278523445129,
"logits/rejected": -0.6776692271232605,
"logps/chosen": -418.36578369140625,
"logps/rejected": -581.3909912109375,
"loss": 0.4866,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4995958805084229,
"rewards/margins": 1.887927770614624,
"rewards/rejected": -3.387523651123047,
"step": 2800
},
{
"epoch": 0.6741842610364683,
"grad_norm": 10.722965313997076,
"learning_rate": 1.4500479031289987e-07,
"logits/chosen": -0.6301898956298828,
"logits/rejected": -0.6822000741958618,
"logps/chosen": -449.79840087890625,
"logps/rejected": -575.036865234375,
"loss": 0.4839,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.575657606124878,
"rewards/margins": 1.3444297313690186,
"rewards/rejected": -2.9200873374938965,
"step": 2810
},
{
"epoch": 0.6765834932821497,
"grad_norm": 10.427262900663766,
"learning_rate": 1.4310826914220747e-07,
"logits/chosen": -0.6417717933654785,
"logits/rejected": -0.6694071888923645,
"logps/chosen": -496.41064453125,
"logps/rejected": -595.4664306640625,
"loss": 0.468,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6847069263458252,
"rewards/margins": 1.2323038578033447,
"rewards/rejected": -2.91701078414917,
"step": 2820
},
{
"epoch": 0.6789827255278311,
"grad_norm": 11.587445881578779,
"learning_rate": 1.412192460017597e-07,
"logits/chosen": -0.6943923234939575,
"logits/rejected": -0.6795603036880493,
"logps/chosen": -427.15155029296875,
"logps/rejected": -583.9935913085938,
"loss": 0.4638,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.664332628250122,
"rewards/margins": 1.5579102039337158,
"rewards/rejected": -3.222242832183838,
"step": 2830
},
{
"epoch": 0.6813819577735125,
"grad_norm": 8.40191068876138,
"learning_rate": 1.3933785339902504e-07,
"logits/chosen": -0.6684115529060364,
"logits/rejected": -0.6200501918792725,
"logps/chosen": -355.61358642578125,
"logps/rejected": -538.16650390625,
"loss": 0.4765,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4830284118652344,
"rewards/margins": 1.320604681968689,
"rewards/rejected": -2.803633213043213,
"step": 2840
},
{
"epoch": 0.6837811900191939,
"grad_norm": 9.705292210107526,
"learning_rate": 1.374642233062197e-07,
"logits/chosen": -0.6299320459365845,
"logits/rejected": -0.6841608285903931,
"logps/chosen": -473.47222900390625,
"logps/rejected": -577.1047973632812,
"loss": 0.4608,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.683821439743042,
"rewards/margins": 1.3992817401885986,
"rewards/rejected": -3.0831027030944824,
"step": 2850
},
{
"epoch": 0.6861804222648752,
"grad_norm": 10.377222974537387,
"learning_rate": 1.355984871510511e-07,
"logits/chosen": -0.6160681247711182,
"logits/rejected": -0.5785273313522339,
"logps/chosen": -480.8841857910156,
"logps/rejected": -627.6201171875,
"loss": 0.4366,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7052888870239258,
"rewards/margins": 1.3676353693008423,
"rewards/rejected": -3.0729241371154785,
"step": 2860
},
{
"epoch": 0.6885796545105566,
"grad_norm": 9.428651099395465,
"learning_rate": 1.3374077580749783e-07,
"logits/chosen": -0.6848248243331909,
"logits/rejected": -0.6872170567512512,
"logps/chosen": -386.19873046875,
"logps/rejected": -546.9776000976562,
"loss": 0.4409,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8132606744766235,
"rewards/margins": 1.4182673692703247,
"rewards/rejected": -3.2315280437469482,
"step": 2870
},
{
"epoch": 0.690978886756238,
"grad_norm": 13.945186305417156,
"learning_rate": 1.3189121958663024e-07,
"logits/chosen": -0.6140165328979492,
"logits/rejected": -0.6878429651260376,
"logps/chosen": -531.7335205078125,
"logps/rejected": -573.6845092773438,
"loss": 0.4699,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3029494285583496,
"rewards/margins": 0.8266263008117676,
"rewards/rejected": -3.129575490951538,
"step": 2880
},
{
"epoch": 0.6933781190019194,
"grad_norm": 12.851850092870997,
"learning_rate": 1.3004994822746895e-07,
"logits/chosen": -0.7798065543174744,
"logits/rejected": -0.7741595506668091,
"logps/chosen": -430.0048828125,
"logps/rejected": -566.271240234375,
"loss": 0.4687,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7590996026992798,
"rewards/margins": 1.2983884811401367,
"rewards/rejected": -3.057487964630127,
"step": 2890
},
{
"epoch": 0.6957773512476008,
"grad_norm": 12.345371556886562,
"learning_rate": 1.2821709088788434e-07,
"logits/chosen": -0.5772908329963684,
"logits/rejected": -0.6081336140632629,
"logps/chosen": -378.8913879394531,
"logps/rejected": -535.4404907226562,
"loss": 0.4542,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5490992069244385,
"rewards/margins": 1.543939232826233,
"rewards/rejected": -3.093038558959961,
"step": 2900
},
{
"epoch": 0.6981765834932822,
"grad_norm": 14.289160494822747,
"learning_rate": 1.2639277613553736e-07,
"logits/chosen": -0.6734031438827515,
"logits/rejected": -0.6486900448799133,
"logps/chosen": -372.4572448730469,
"logps/rejected": -487.9190979003906,
"loss": 0.461,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.6701858043670654,
"rewards/margins": 1.1198476552963257,
"rewards/rejected": -2.7900338172912598,
"step": 2910
},
{
"epoch": 0.7005758157389635,
"grad_norm": 12.042269042020521,
"learning_rate": 1.2457713193885975e-07,
"logits/chosen": -0.6462276577949524,
"logits/rejected": -0.660896897315979,
"logps/chosen": -339.7335510253906,
"logps/rejected": -547.5769653320312,
"loss": 0.4327,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.608481764793396,
"rewards/margins": 1.6944421529769897,
"rewards/rejected": -3.3029239177703857,
"step": 2920
},
{
"epoch": 0.7029750479846449,
"grad_norm": 15.19569101654986,
"learning_rate": 1.2277028565807838e-07,
"logits/chosen": -0.6471028923988342,
"logits/rejected": -0.6764336824417114,
"logps/chosen": -425.2708435058594,
"logps/rejected": -565.1405639648438,
"loss": 0.4666,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5919914245605469,
"rewards/margins": 1.4726965427398682,
"rewards/rejected": -3.064688205718994,
"step": 2930
},
{
"epoch": 0.7053742802303263,
"grad_norm": 13.971571066599896,
"learning_rate": 1.209723640362815e-07,
"logits/chosen": -0.6739888191223145,
"logits/rejected": -0.6833500266075134,
"logps/chosen": -457.6881408691406,
"logps/rejected": -645.4636840820312,
"loss": 0.5118,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7702935934066772,
"rewards/margins": 1.9555785655975342,
"rewards/rejected": -3.725872039794922,
"step": 2940
},
{
"epoch": 0.7077735124760077,
"grad_norm": 11.688443402207835,
"learning_rate": 1.191834931905277e-07,
"logits/chosen": -0.6156803369522095,
"logits/rejected": -0.6335949897766113,
"logps/chosen": -491.3165588378906,
"logps/rejected": -634.9505004882812,
"loss": 0.4246,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7908356189727783,
"rewards/margins": 1.4069457054138184,
"rewards/rejected": -3.1977813243865967,
"step": 2950
},
{
"epoch": 0.710172744721689,
"grad_norm": 10.926862919127126,
"learning_rate": 1.1740379860299988e-07,
"logits/chosen": -0.6044400334358215,
"logits/rejected": -0.6133986711502075,
"logps/chosen": -445.9336853027344,
"logps/rejected": -599.2249755859375,
"loss": 0.4724,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5493725538253784,
"rewards/margins": 1.346935749053955,
"rewards/rejected": -2.896308183670044,
"step": 2960
},
{
"epoch": 0.7125719769673704,
"grad_norm": 10.901886694528228,
"learning_rate": 1.1563340511220254e-07,
"logits/chosen": -0.6457855105400085,
"logits/rejected": -0.6682008504867554,
"logps/chosen": -491.98797607421875,
"logps/rejected": -631.3724975585938,
"loss": 0.4918,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8513435125350952,
"rewards/margins": 1.5765281915664673,
"rewards/rejected": -3.4278717041015625,
"step": 2970
},
{
"epoch": 0.7149712092130518,
"grad_norm": 10.052694308856596,
"learning_rate": 1.1387243690420556e-07,
"logits/chosen": -0.6152561902999878,
"logits/rejected": -0.6195570230484009,
"logps/chosen": -467.3623962402344,
"logps/rejected": -649.93994140625,
"loss": 0.4614,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5257104635238647,
"rewards/margins": 1.9131603240966797,
"rewards/rejected": -3.438870906829834,
"step": 2980
},
{
"epoch": 0.7173704414587332,
"grad_norm": 13.617310196785702,
"learning_rate": 1.1212101750393235e-07,
"logits/chosen": -0.652159571647644,
"logits/rejected": -0.669161856174469,
"logps/chosen": -421.92620849609375,
"logps/rejected": -561.7144775390625,
"loss": 0.4315,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6268739700317383,
"rewards/margins": 1.6520655155181885,
"rewards/rejected": -3.278939723968506,
"step": 2990
},
{
"epoch": 0.7197696737044146,
"grad_norm": 9.598022401827114,
"learning_rate": 1.1037926976649562e-07,
"logits/chosen": -0.6599806547164917,
"logits/rejected": -0.6797146797180176,
"logps/chosen": -446.956787109375,
"logps/rejected": -640.7252807617188,
"loss": 0.5011,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.7980464696884155,
"rewards/margins": 1.6290537118911743,
"rewards/rejected": -3.427100419998169,
"step": 3000
},
{
"epoch": 0.722168905950096,
"grad_norm": 10.802854623400075,
"learning_rate": 1.0864731586857936e-07,
"logits/chosen": -0.5600841045379639,
"logits/rejected": -0.5877747535705566,
"logps/chosen": -460.4256286621094,
"logps/rejected": -603.6989135742188,
"loss": 0.4476,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7123295068740845,
"rewards/margins": 1.7583147287368774,
"rewards/rejected": -3.470644474029541,
"step": 3010
},
{
"epoch": 0.7245681381957774,
"grad_norm": 10.40502367000872,
"learning_rate": 1.0692527729986839e-07,
"logits/chosen": -0.6589699387550354,
"logits/rejected": -0.6785061955451965,
"logps/chosen": -431.74407958984375,
"logps/rejected": -569.8807373046875,
"loss": 0.4092,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.640982985496521,
"rewards/margins": 1.5366647243499756,
"rewards/rejected": -3.177647829055786,
"step": 3020
},
{
"epoch": 0.7269673704414588,
"grad_norm": 13.119977378434658,
"learning_rate": 1.0521327485452692e-07,
"logits/chosen": -0.5950068831443787,
"logits/rejected": -0.6270568370819092,
"logps/chosen": -422.1985778808594,
"logps/rejected": -560.9178466796875,
"loss": 0.4589,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6238855123519897,
"rewards/margins": 1.5490210056304932,
"rewards/rejected": -3.1729063987731934,
"step": 3030
},
{
"epoch": 0.7293666026871402,
"grad_norm": 12.65683765122561,
"learning_rate": 1.0351142862272468e-07,
"logits/chosen": -0.6144478917121887,
"logits/rejected": -0.673326849937439,
"logps/chosen": -397.64007568359375,
"logps/rejected": -620.5786743164062,
"loss": 0.4609,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.721166968345642,
"rewards/margins": 2.2845423221588135,
"rewards/rejected": -4.005709648132324,
"step": 3040
},
{
"epoch": 0.7317658349328215,
"grad_norm": 12.041290560966866,
"learning_rate": 1.0181985798221343e-07,
"logits/chosen": -0.550287663936615,
"logits/rejected": -0.5756568908691406,
"logps/chosen": -453.2012634277344,
"logps/rejected": -650.4305419921875,
"loss": 0.5115,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7032783031463623,
"rewards/margins": 1.8229873180389404,
"rewards/rejected": -3.5262656211853027,
"step": 3050
},
{
"epoch": 0.7341650671785028,
"grad_norm": 13.307257346339915,
"learning_rate": 1.0013868158995329e-07,
"logits/chosen": -0.5099418759346008,
"logits/rejected": -0.5463215112686157,
"logps/chosen": -423.4984436035156,
"logps/rejected": -534.3104248046875,
"loss": 0.488,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6515445709228516,
"rewards/margins": 1.3121881484985352,
"rewards/rejected": -2.9637324810028076,
"step": 3060
},
{
"epoch": 0.7365642994241842,
"grad_norm": 10.351075268333476,
"learning_rate": 9.84680173737887e-08,
"logits/chosen": -0.6528457403182983,
"logits/rejected": -0.6615931987762451,
"logps/chosen": -444.66717529296875,
"logps/rejected": -533.7962646484375,
"loss": 0.4646,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.675614595413208,
"rewards/margins": 1.2561010122299194,
"rewards/rejected": -2.931715488433838,
"step": 3070
},
{
"epoch": 0.7389635316698656,
"grad_norm": 9.443696454315829,
"learning_rate": 9.680798252417713e-08,
"logits/chosen": -0.6959069967269897,
"logits/rejected": -0.7350667715072632,
"logps/chosen": -372.649658203125,
"logps/rejected": -555.9948120117188,
"loss": 0.4445,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.5621325969696045,
"rewards/margins": 1.445049524307251,
"rewards/rejected": -3.0071818828582764,
"step": 3080
},
{
"epoch": 0.741362763915547,
"grad_norm": 11.47296651628307,
"learning_rate": 9.515869348596808e-08,
"logits/chosen": -0.6271109580993652,
"logits/rejected": -0.6984132528305054,
"logps/chosen": -474.30230712890625,
"logps/rejected": -584.3538818359375,
"loss": 0.4685,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6461492776870728,
"rewards/margins": 1.4845248460769653,
"rewards/rejected": -3.130673885345459,
"step": 3090
},
{
"epoch": 0.7437619961612284,
"grad_norm": 10.021909817688707,
"learning_rate": 9.352026595023493e-08,
"logits/chosen": -0.6822315454483032,
"logits/rejected": -0.6859509944915771,
"logps/chosen": -464.59588623046875,
"logps/rejected": -531.1356201171875,
"loss": 0.4735,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.5760090351104736,
"rewards/margins": 1.0710208415985107,
"rewards/rejected": -2.6470298767089844,
"step": 3100
},
{
"epoch": 0.7461612284069098,
"grad_norm": 12.68094636270811,
"learning_rate": 9.189281484616004e-08,
"logits/chosen": -0.6438357830047607,
"logits/rejected": -0.6551543474197388,
"logps/chosen": -383.98248291015625,
"logps/rejected": -559.4468383789062,
"loss": 0.4749,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.792620301246643,
"rewards/margins": 1.217846155166626,
"rewards/rejected": -3.0104660987854004,
"step": 3110
},
{
"epoch": 0.7485604606525912,
"grad_norm": 11.028927932236359,
"learning_rate": 9.027645433297249e-08,
"logits/chosen": -0.5930813550949097,
"logits/rejected": -0.5801911950111389,
"logps/chosen": -542.3621826171875,
"logps/rejected": -639.3001708984375,
"loss": 0.4961,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.2047979831695557,
"rewards/margins": 1.3231174945831299,
"rewards/rejected": -3.5279152393341064,
"step": 3120
},
{
"epoch": 0.7509596928982726,
"grad_norm": 13.142799913341205,
"learning_rate": 8.867129779194066e-08,
"logits/chosen": -0.6943696141242981,
"logits/rejected": -0.7373479604721069,
"logps/chosen": -370.00933837890625,
"logps/rejected": -546.3580322265625,
"loss": 0.4559,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5144741535186768,
"rewards/margins": 1.7807083129882812,
"rewards/rejected": -3.295182466506958,
"step": 3130
},
{
"epoch": 0.753358925143954,
"grad_norm": 11.188015298374586,
"learning_rate": 8.707745781841866e-08,
"logits/chosen": -0.6203271150588989,
"logits/rejected": -0.6621488332748413,
"logps/chosen": -396.5414123535156,
"logps/rejected": -583.9200439453125,
"loss": 0.4851,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.708814263343811,
"rewards/margins": 1.800474762916565,
"rewards/rejected": -3.509288787841797,
"step": 3140
},
{
"epoch": 0.7557581573896354,
"grad_norm": 6.520933471158698,
"learning_rate": 8.549504621394831e-08,
"logits/chosen": -0.7140206694602966,
"logits/rejected": -0.7158041000366211,
"logps/chosen": -390.3932189941406,
"logps/rejected": -608.0358276367188,
"loss": 0.3909,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4705148935317993,
"rewards/margins": 2.1252219676971436,
"rewards/rejected": -3.5957369804382324,
"step": 3150
},
{
"epoch": 0.7581573896353166,
"grad_norm": 13.927678669681274,
"learning_rate": 8.392417397841703e-08,
"logits/chosen": -0.6197787523269653,
"logits/rejected": -0.6519285440444946,
"logps/chosen": -416.1162109375,
"logps/rejected": -562.5856323242188,
"loss": 0.4768,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.56044602394104,
"rewards/margins": 1.3187508583068848,
"rewards/rejected": -2.879196882247925,
"step": 3160
},
{
"epoch": 0.760556621880998,
"grad_norm": 10.074795778090818,
"learning_rate": 8.236495130227083e-08,
"logits/chosen": -0.5864537358283997,
"logits/rejected": -0.6371886730194092,
"logps/chosen": -442.40313720703125,
"logps/rejected": -615.7220458984375,
"loss": 0.4786,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.4601584672927856,
"rewards/margins": 2.044390916824341,
"rewards/rejected": -3.504549503326416,
"step": 3170
},
{
"epoch": 0.7629558541266794,
"grad_norm": 13.663011005109329,
"learning_rate": 8.081748755878612e-08,
"logits/chosen": -0.6179635524749756,
"logits/rejected": -0.6545384526252747,
"logps/chosen": -452.73382568359375,
"logps/rejected": -525.165283203125,
"loss": 0.4475,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.757580041885376,
"rewards/margins": 1.2399779558181763,
"rewards/rejected": -2.9975578784942627,
"step": 3180
},
{
"epoch": 0.7653550863723608,
"grad_norm": 11.361503632955099,
"learning_rate": 7.928189129639632e-08,
"logits/chosen": -0.5514404773712158,
"logits/rejected": -0.5337514281272888,
"logps/chosen": -404.6654052734375,
"logps/rejected": -557.6652221679688,
"loss": 0.4311,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.5441641807556152,
"rewards/margins": 1.4246327877044678,
"rewards/rejected": -2.968797206878662,
"step": 3190
},
{
"epoch": 0.7677543186180422,
"grad_norm": 14.862474133401097,
"learning_rate": 7.775827023107834e-08,
"logits/chosen": -0.6156660318374634,
"logits/rejected": -0.643264889717102,
"logps/chosen": -428.32989501953125,
"logps/rejected": -573.34130859375,
"loss": 0.4982,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.7375695705413818,
"rewards/margins": 1.2411071062088013,
"rewards/rejected": -2.9786763191223145,
"step": 3200
},
{
"epoch": 0.7701535508637236,
"grad_norm": 11.951225803984062,
"learning_rate": 7.624673123879682e-08,
"logits/chosen": -0.6403513550758362,
"logits/rejected": -0.6913474798202515,
"logps/chosen": -411.776611328125,
"logps/rejected": -524.7989501953125,
"loss": 0.4563,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.625312089920044,
"rewards/margins": 1.2939417362213135,
"rewards/rejected": -2.9192535877227783,
"step": 3210
},
{
"epoch": 0.772552783109405,
"grad_norm": 9.593855988626308,
"learning_rate": 7.474738034800663e-08,
"logits/chosen": -0.7377493381500244,
"logits/rejected": -0.728441596031189,
"logps/chosen": -364.731201171875,
"logps/rejected": -579.6549682617188,
"loss": 0.4688,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6445392370224,
"rewards/margins": 2.2165136337280273,
"rewards/rejected": -3.861053466796875,
"step": 3220
},
{
"epoch": 0.7749520153550864,
"grad_norm": 12.640503406225628,
"learning_rate": 7.326032273221606e-08,
"logits/chosen": -0.6615322828292847,
"logits/rejected": -0.6496458053588867,
"logps/chosen": -469.51007080078125,
"logps/rejected": -604.8178100585938,
"loss": 0.4379,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.678299903869629,
"rewards/margins": 1.5208964347839355,
"rewards/rejected": -3.1991963386535645,
"step": 3230
},
{
"epoch": 0.7773512476007678,
"grad_norm": 13.937512065135813,
"learning_rate": 7.178566270260872e-08,
"logits/chosen": -0.6518770456314087,
"logits/rejected": -0.6966899633407593,
"logps/chosen": -453.48797607421875,
"logps/rejected": -625.8817138671875,
"loss": 0.4913,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9149278402328491,
"rewards/margins": 1.3741127252578735,
"rewards/rejected": -3.2890403270721436,
"step": 3240
},
{
"epoch": 0.7797504798464492,
"grad_norm": 11.229901391937847,
"learning_rate": 7.032350370072709e-08,
"logits/chosen": -0.5851765871047974,
"logits/rejected": -0.6159471273422241,
"logps/chosen": -437.6891174316406,
"logps/rejected": -598.682861328125,
"loss": 0.4339,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.56214439868927,
"rewards/margins": 1.7052253484725952,
"rewards/rejected": -3.2673697471618652,
"step": 3250
},
{
"epoch": 0.7821497120921305,
"grad_norm": 11.146263352042403,
"learning_rate": 6.887394829121596e-08,
"logits/chosen": -0.6397983431816101,
"logits/rejected": -0.7075640559196472,
"logps/chosen": -459.11553955078125,
"logps/rejected": -686.8187255859375,
"loss": 0.4385,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8990970849990845,
"rewards/margins": 2.3527369499206543,
"rewards/rejected": -4.251833915710449,
"step": 3260
},
{
"epoch": 0.7845489443378119,
"grad_norm": 11.067281012027754,
"learning_rate": 6.743709815462833e-08,
"logits/chosen": -0.702612042427063,
"logits/rejected": -0.7422297596931458,
"logps/chosen": -440.0489196777344,
"logps/rejected": -574.8292846679688,
"loss": 0.4316,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7014598846435547,
"rewards/margins": 1.6431381702423096,
"rewards/rejected": -3.3445980548858643,
"step": 3270
},
{
"epoch": 0.7869481765834933,
"grad_norm": 9.830989747261166,
"learning_rate": 6.601305408029287e-08,
"logits/chosen": -0.5348180532455444,
"logits/rejected": -0.5696184039115906,
"logps/chosen": -442.927490234375,
"logps/rejected": -583.0240478515625,
"loss": 0.4608,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.906508445739746,
"rewards/margins": 1.4029037952423096,
"rewards/rejected": -3.3094124794006348,
"step": 3280
},
{
"epoch": 0.7893474088291746,
"grad_norm": 12.175693604464533,
"learning_rate": 6.460191595924366e-08,
"logits/chosen": -0.5926901698112488,
"logits/rejected": -0.6048527956008911,
"logps/chosen": -458.3211975097656,
"logps/rejected": -586.0518798828125,
"loss": 0.4435,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8188337087631226,
"rewards/margins": 1.2453868389129639,
"rewards/rejected": -3.064220905303955,
"step": 3290
},
{
"epoch": 0.791746641074856,
"grad_norm": 11.750711543669468,
"learning_rate": 6.320378277721342e-08,
"logits/chosen": -0.6148731112480164,
"logits/rejected": -0.6050759553909302,
"logps/chosen": -457.68878173828125,
"logps/rejected": -555.8333740234375,
"loss": 0.4667,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.960931420326233,
"rewards/margins": 1.0489509105682373,
"rewards/rejected": -3.0098819732666016,
"step": 3300
},
{
"epoch": 0.7941458733205374,
"grad_norm": 13.919290209857113,
"learning_rate": 6.181875260769032e-08,
"logits/chosen": -0.6457343101501465,
"logits/rejected": -0.6977934241294861,
"logps/chosen": -435.4483947753906,
"logps/rejected": -547.3876953125,
"loss": 0.4771,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3289425373077393,
"rewards/margins": 1.8005937337875366,
"rewards/rejected": -3.1295368671417236,
"step": 3310
},
{
"epoch": 0.7965451055662188,
"grad_norm": 12.774155044726525,
"learning_rate": 6.044692260503797e-08,
"logits/chosen": -0.5455335378646851,
"logits/rejected": -0.569166362285614,
"logps/chosen": -488.89923095703125,
"logps/rejected": -631.3294677734375,
"loss": 0.4144,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8063256740570068,
"rewards/margins": 1.6628071069717407,
"rewards/rejected": -3.469132661819458,
"step": 3320
},
{
"epoch": 0.7989443378119002,
"grad_norm": 11.394655126013888,
"learning_rate": 5.9088388997680984e-08,
"logits/chosen": -0.5993139147758484,
"logits/rejected": -0.6322329044342041,
"logps/chosen": -519.2103271484375,
"logps/rejected": -596.4190063476562,
"loss": 0.4286,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7323102951049805,
"rewards/margins": 1.5095983743667603,
"rewards/rejected": -3.241908550262451,
"step": 3330
},
{
"epoch": 0.8013435700575816,
"grad_norm": 12.200878623728958,
"learning_rate": 5.774324708135439e-08,
"logits/chosen": -0.6741082668304443,
"logits/rejected": -0.7074322700500488,
"logps/chosen": -373.7408447265625,
"logps/rejected": -486.2967224121094,
"loss": 0.4564,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4916250705718994,
"rewards/margins": 1.3450305461883545,
"rewards/rejected": -2.836656093597412,
"step": 3340
},
{
"epoch": 0.803742802303263,
"grad_norm": 10.453220701732908,
"learning_rate": 5.641159121241953e-08,
"logits/chosen": -0.651732325553894,
"logits/rejected": -0.6395163536071777,
"logps/chosen": -382.8707580566406,
"logps/rejected": -592.2684936523438,
"loss": 0.4593,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6059911251068115,
"rewards/margins": 1.7453718185424805,
"rewards/rejected": -3.351362943649292,
"step": 3350
},
{
"epoch": 0.8061420345489443,
"grad_norm": 11.15644450768947,
"learning_rate": 5.5093514801245106e-08,
"logits/chosen": -0.58311527967453,
"logits/rejected": -0.6077650189399719,
"logps/chosen": -422.38287353515625,
"logps/rejected": -605.2293701171875,
"loss": 0.4403,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6799404621124268,
"rewards/margins": 1.5105533599853516,
"rewards/rejected": -3.1904940605163574,
"step": 3360
},
{
"epoch": 0.8085412667946257,
"grad_norm": 13.646112951195525,
"learning_rate": 5.378911030565453e-08,
"logits/chosen": -0.520195722579956,
"logits/rejected": -0.5245386362075806,
"logps/chosen": -505.70819091796875,
"logps/rejected": -675.5585327148438,
"loss": 0.4727,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0382139682769775,
"rewards/margins": 1.3585295677185059,
"rewards/rejected": -3.3967432975769043,
"step": 3370
},
{
"epoch": 0.8109404990403071,
"grad_norm": 10.108220217158234,
"learning_rate": 5.249846922444101e-08,
"logits/chosen": -0.6458074450492859,
"logits/rejected": -0.7096244096755981,
"logps/chosen": -390.3080139160156,
"logps/rejected": -658.1427001953125,
"loss": 0.4316,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.7075704336166382,
"rewards/margins": 2.7826123237609863,
"rewards/rejected": -4.490181922912598,
"step": 3380
},
{
"epoch": 0.8133397312859885,
"grad_norm": 12.66986825653512,
"learning_rate": 5.122168209094865e-08,
"logits/chosen": -0.5679661631584167,
"logits/rejected": -0.5969215631484985,
"logps/chosen": -402.6626892089844,
"logps/rejected": -498.47479248046875,
"loss": 0.4469,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8141701221466064,
"rewards/margins": 0.9520853757858276,
"rewards/rejected": -2.7662553787231445,
"step": 3390
},
{
"epoch": 0.8157389635316699,
"grad_norm": 9.834215615688265,
"learning_rate": 4.995883846672222e-08,
"logits/chosen": -0.5988723039627075,
"logits/rejected": -0.6316601037979126,
"logps/chosen": -566.8046875,
"logps/rejected": -627.3985595703125,
"loss": 0.4445,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.795799970626831,
"rewards/margins": 1.456854224205017,
"rewards/rejected": -3.2526543140411377,
"step": 3400
},
{
"epoch": 0.8181381957773513,
"grad_norm": 11.502139473474164,
"learning_rate": 4.871002693522486e-08,
"logits/chosen": -0.5939972996711731,
"logits/rejected": -0.5957666635513306,
"logps/chosen": -431.6761169433594,
"logps/rejected": -533.3076171875,
"loss": 0.471,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6297683715820312,
"rewards/margins": 1.2114887237548828,
"rewards/rejected": -2.841256856918335,
"step": 3410
},
{
"epoch": 0.8205374280230326,
"grad_norm": 9.038305230217524,
"learning_rate": 4.7475335095623956e-08,
"logits/chosen": -0.598876416683197,
"logits/rejected": -0.6024787425994873,
"logps/chosen": -451.1143493652344,
"logps/rejected": -610.4466552734375,
"loss": 0.4555,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8450918197631836,
"rewards/margins": 1.7127292156219482,
"rewards/rejected": -3.5578207969665527,
"step": 3420
},
{
"epoch": 0.822936660268714,
"grad_norm": 17.498105828738847,
"learning_rate": 4.6254849556646714e-08,
"logits/chosen": -0.5433209538459778,
"logits/rejected": -0.5503520965576172,
"logps/chosen": -476.6543884277344,
"logps/rejected": -635.9801635742188,
"loss": 0.4553,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6800941228866577,
"rewards/margins": 1.8871549367904663,
"rewards/rejected": -3.567249298095703,
"step": 3430
},
{
"epoch": 0.8253358925143954,
"grad_norm": 12.97369382211523,
"learning_rate": 4.504865593050483e-08,
"logits/chosen": -0.5857795476913452,
"logits/rejected": -0.6014319658279419,
"logps/chosen": -460.88348388671875,
"logps/rejected": -596.8003540039062,
"loss": 0.4711,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8348299264907837,
"rewards/margins": 1.2898591756820679,
"rewards/rejected": -3.1246893405914307,
"step": 3440
},
{
"epoch": 0.8277351247600768,
"grad_norm": 12.980103635170739,
"learning_rate": 4.385683882688895e-08,
"logits/chosen": -0.5943895578384399,
"logits/rejected": -0.6202970743179321,
"logps/chosen": -484.2207946777344,
"logps/rejected": -530.2681884765625,
"loss": 0.5219,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8965202569961548,
"rewards/margins": 1.0006572008132935,
"rewards/rejected": -2.897177219390869,
"step": 3450
},
{
"epoch": 0.8301343570057581,
"grad_norm": 13.02526735064428,
"learning_rate": 4.2679481847033985e-08,
"logits/chosen": -0.6043378114700317,
"logits/rejected": -0.6147600412368774,
"logps/chosen": -457.24072265625,
"logps/rejected": -620.4854125976562,
"loss": 0.4657,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8802976608276367,
"rewards/margins": 1.4927122592926025,
"rewards/rejected": -3.3730101585388184,
"step": 3460
},
{
"epoch": 0.8325335892514395,
"grad_norm": 10.240226182122116,
"learning_rate": 4.151666757785435e-08,
"logits/chosen": -0.6241481304168701,
"logits/rejected": -0.6314017176628113,
"logps/chosen": -401.0560302734375,
"logps/rejected": -634.5782470703125,
"loss": 0.4354,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.427109956741333,
"rewards/margins": 2.2766079902648926,
"rewards/rejected": -3.7037181854248047,
"step": 3470
},
{
"epoch": 0.8349328214971209,
"grad_norm": 12.450431309081564,
"learning_rate": 4.036847758615136e-08,
"logits/chosen": -0.5069397687911987,
"logits/rejected": -0.5848828554153442,
"logps/chosen": -474.01043701171875,
"logps/rejected": -627.2423706054688,
"loss": 0.4704,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.224709987640381,
"rewards/margins": 1.437635898590088,
"rewards/rejected": -3.6623454093933105,
"step": 3480
},
{
"epoch": 0.8373320537428023,
"grad_norm": 10.138959991207273,
"learning_rate": 3.923499241289113e-08,
"logits/chosen": -0.6470298171043396,
"logits/rejected": -0.6826261878013611,
"logps/chosen": -513.265869140625,
"logps/rejected": -608.2982177734375,
"loss": 0.4976,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9213802814483643,
"rewards/margins": 1.5853230953216553,
"rewards/rejected": -3.5067031383514404,
"step": 3490
},
{
"epoch": 0.8397312859884837,
"grad_norm": 7.737508793759033,
"learning_rate": 3.811629156755541e-08,
"logits/chosen": -0.5882548093795776,
"logits/rejected": -0.6023901104927063,
"logps/chosen": -484.13787841796875,
"logps/rejected": -623.9624633789062,
"loss": 0.4558,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8149983882904053,
"rewards/margins": 1.4490883350372314,
"rewards/rejected": -3.2640864849090576,
"step": 3500
},
{
"epoch": 0.8421305182341651,
"grad_norm": 9.568696750818255,
"learning_rate": 3.701245352256391e-08,
"logits/chosen": -0.5902693867683411,
"logits/rejected": -0.6269119381904602,
"logps/chosen": -477.4012756347656,
"logps/rejected": -554.7974853515625,
"loss": 0.4444,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.74009108543396,
"rewards/margins": 1.0817813873291016,
"rewards/rejected": -2.8218724727630615,
"step": 3510
},
{
"epoch": 0.8445297504798465,
"grad_norm": 12.00695207332559,
"learning_rate": 3.592355570776984e-08,
"logits/chosen": -0.6704959273338318,
"logits/rejected": -0.6980074644088745,
"logps/chosen": -379.5652770996094,
"logps/rejected": -551.5906372070312,
"loss": 0.4273,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.503880500793457,
"rewards/margins": 1.5545501708984375,
"rewards/rejected": -3.0584301948547363,
"step": 3520
},
{
"epoch": 0.8469289827255279,
"grad_norm": 9.257056695952906,
"learning_rate": 3.484967450502904e-08,
"logits/chosen": -0.583393394947052,
"logits/rejected": -0.626370370388031,
"logps/chosen": -371.80194091796875,
"logps/rejected": -579.4976806640625,
"loss": 0.4622,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.546886920928955,
"rewards/margins": 1.578477382659912,
"rewards/rejected": -3.1253647804260254,
"step": 3530
},
{
"epoch": 0.8493282149712092,
"grad_norm": 14.494964173795129,
"learning_rate": 3.3790885242841296e-08,
"logits/chosen": -0.610164999961853,
"logits/rejected": -0.6447314023971558,
"logps/chosen": -432.84210205078125,
"logps/rejected": -672.5138549804688,
"loss": 0.419,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.7256730794906616,
"rewards/margins": 2.3827433586120605,
"rewards/rejected": -4.1084160804748535,
"step": 3540
},
{
"epoch": 0.8517274472168906,
"grad_norm": 10.946709475218693,
"learning_rate": 3.274726219106677e-08,
"logits/chosen": -0.6309023499488831,
"logits/rejected": -0.6657734513282776,
"logps/chosen": -481.0071716308594,
"logps/rejected": -643.405029296875,
"loss": 0.4691,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7343776226043701,
"rewards/margins": 1.648224115371704,
"rewards/rejected": -3.3826019763946533,
"step": 3550
},
{
"epoch": 0.8541266794625719,
"grad_norm": 10.689900945283165,
"learning_rate": 3.171887855571642e-08,
"logits/chosen": -0.618683934211731,
"logits/rejected": -0.5845375657081604,
"logps/chosen": -405.7684631347656,
"logps/rejected": -498.9336853027344,
"loss": 0.4636,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7307859659194946,
"rewards/margins": 0.9967991709709167,
"rewards/rejected": -2.7275853157043457,
"step": 3560
},
{
"epoch": 0.8565259117082533,
"grad_norm": 12.01909548635221,
"learning_rate": 3.070580647381643e-08,
"logits/chosen": -0.5815375447273254,
"logits/rejected": -0.62933748960495,
"logps/chosen": -413.2583923339844,
"logps/rejected": -569.9872436523438,
"loss": 0.4664,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.719911813735962,
"rewards/margins": 1.5588918924331665,
"rewards/rejected": -3.278803586959839,
"step": 3570
},
{
"epoch": 0.8589251439539347,
"grad_norm": 11.866805408924924,
"learning_rate": 2.9708117008348576e-08,
"logits/chosen": -0.57561856508255,
"logits/rejected": -0.6152477264404297,
"logps/chosen": -487.6710510253906,
"logps/rejected": -560.8670043945312,
"loss": 0.4308,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7181422710418701,
"rewards/margins": 1.274820327758789,
"rewards/rejected": -2.992962598800659,
"step": 3580
},
{
"epoch": 0.8613243761996161,
"grad_norm": 11.368661908364057,
"learning_rate": 2.8725880143264992e-08,
"logits/chosen": -0.6161478757858276,
"logits/rejected": -0.621782660484314,
"logps/chosen": -455.6564025878906,
"logps/rejected": -615.894775390625,
"loss": 0.5076,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.038883686065674,
"rewards/margins": 1.1252130270004272,
"rewards/rejected": -3.1640963554382324,
"step": 3590
},
{
"epoch": 0.8637236084452975,
"grad_norm": 16.14783859072051,
"learning_rate": 2.775916477857948e-08,
"logits/chosen": -0.6006742715835571,
"logits/rejected": -0.6039419770240784,
"logps/chosen": -416.3548278808594,
"logps/rejected": -545.7139892578125,
"loss": 0.4477,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8900909423828125,
"rewards/margins": 1.278205394744873,
"rewards/rejected": -3.1682963371276855,
"step": 3600
},
{
"epoch": 0.8661228406909789,
"grad_norm": 11.990331851376549,
"learning_rate": 2.680803872553408e-08,
"logits/chosen": -0.624252200126648,
"logits/rejected": -0.7020074725151062,
"logps/chosen": -399.41583251953125,
"logps/rejected": -661.79638671875,
"loss": 0.4534,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.495900273323059,
"rewards/margins": 2.714350461959839,
"rewards/rejected": -4.210250377655029,
"step": 3610
},
{
"epoch": 0.8685220729366603,
"grad_norm": 14.100883523011712,
"learning_rate": 2.5872568701842706e-08,
"logits/chosen": -0.58869868516922,
"logits/rejected": -0.6457717418670654,
"logps/chosen": -384.4469299316406,
"logps/rejected": -560.6753540039062,
"loss": 0.5292,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.7529369592666626,
"rewards/margins": 1.4963237047195435,
"rewards/rejected": -3.249260425567627,
"step": 3620
},
{
"epoch": 0.8709213051823417,
"grad_norm": 14.528120128162028,
"learning_rate": 2.495282032701096e-08,
"logits/chosen": -0.6547173857688904,
"logits/rejected": -0.7049331068992615,
"logps/chosen": -343.5353698730469,
"logps/rejected": -484.39617919921875,
"loss": 0.4662,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.712456464767456,
"rewards/margins": 1.597670316696167,
"rewards/rejected": -3.310126781463623,
"step": 3630
},
{
"epoch": 0.8733205374280231,
"grad_norm": 14.220779667409527,
"learning_rate": 2.4048858117733133e-08,
"logits/chosen": -0.6767258644104004,
"logits/rejected": -0.7082260847091675,
"logps/chosen": -444.30328369140625,
"logps/rejected": -614.4713745117188,
"loss": 0.4496,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7594902515411377,
"rewards/margins": 2.157029628753662,
"rewards/rejected": -3.9165198802948,
"step": 3640
},
{
"epoch": 0.8757197696737045,
"grad_norm": 11.643290676232011,
"learning_rate": 2.3160745483366938e-08,
"logits/chosen": -0.6050413846969604,
"logits/rejected": -0.6035085916519165,
"logps/chosen": -438.84942626953125,
"logps/rejected": -599.5989990234375,
"loss": 0.4611,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.013329029083252,
"rewards/margins": 1.2097079753875732,
"rewards/rejected": -3.223036527633667,
"step": 3650
},
{
"epoch": 0.8781190019193857,
"grad_norm": 12.836965077883955,
"learning_rate": 2.2288544721485197e-08,
"logits/chosen": -0.7066579461097717,
"logits/rejected": -0.7247270941734314,
"logps/chosen": -371.9989929199219,
"logps/rejected": -582.3410034179688,
"loss": 0.4348,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5276857614517212,
"rewards/margins": 1.909120798110962,
"rewards/rejected": -3.4368062019348145,
"step": 3660
},
{
"epoch": 0.8805182341650671,
"grad_norm": 11.52899292103731,
"learning_rate": 2.1432317013506117e-08,
"logits/chosen": -0.7156012654304504,
"logits/rejected": -0.7498332262039185,
"logps/chosen": -447.65216064453125,
"logps/rejected": -560.2888793945312,
"loss": 0.5002,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9097896814346313,
"rewards/margins": 1.5398523807525635,
"rewards/rejected": -3.449641704559326,
"step": 3670
},
{
"epoch": 0.8829174664107485,
"grad_norm": 13.634086015337521,
"learning_rate": 2.0592122420401704e-08,
"logits/chosen": -0.5250085592269897,
"logits/rejected": -0.57183438539505,
"logps/chosen": -402.1532287597656,
"logps/rejected": -514.146484375,
"loss": 0.4715,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7628848552703857,
"rewards/margins": 0.9792253375053406,
"rewards/rejected": -2.7421107292175293,
"step": 3680
},
{
"epoch": 0.8853166986564299,
"grad_norm": 10.931227876480555,
"learning_rate": 1.976801987848459e-08,
"logits/chosen": -0.6632574796676636,
"logits/rejected": -0.6789246201515198,
"logps/chosen": -438.82598876953125,
"logps/rejected": -618.0784912109375,
"loss": 0.4579,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6889044046401978,
"rewards/margins": 1.663873314857483,
"rewards/rejected": -3.3527779579162598,
"step": 3690
},
{
"epoch": 0.8877159309021113,
"grad_norm": 12.88089932313707,
"learning_rate": 1.8960067195273987e-08,
"logits/chosen": -0.6466517448425293,
"logits/rejected": -0.6885952949523926,
"logps/chosen": -386.842529296875,
"logps/rejected": -563.5064697265625,
"loss": 0.4368,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6945703029632568,
"rewards/margins": 1.814061164855957,
"rewards/rejected": -3.508631467819214,
"step": 3700
},
{
"epoch": 0.8901151631477927,
"grad_norm": 11.33604603257016,
"learning_rate": 1.816832104544072e-08,
"logits/chosen": -0.5263174772262573,
"logits/rejected": -0.5546278953552246,
"logps/chosen": -467.58062744140625,
"logps/rejected": -575.5345458984375,
"loss": 0.4632,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8927171230316162,
"rewards/margins": 1.3500896692276,
"rewards/rejected": -3.2428061962127686,
"step": 3710
},
{
"epoch": 0.8925143953934741,
"grad_norm": 10.620251504859734,
"learning_rate": 1.7392836966831553e-08,
"logits/chosen": -0.5253115892410278,
"logits/rejected": -0.5563468933105469,
"logps/chosen": -434.1669921875,
"logps/rejected": -624.7203369140625,
"loss": 0.4168,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.62359619140625,
"rewards/margins": 2.1972010135650635,
"rewards/rejected": -3.8207976818084717,
"step": 3720
},
{
"epoch": 0.8949136276391555,
"grad_norm": 13.275739947749992,
"learning_rate": 1.663366935657373e-08,
"logits/chosen": -0.6284958124160767,
"logits/rejected": -0.6665322780609131,
"logps/chosen": -392.99346923828125,
"logps/rejected": -562.0194091796875,
"loss": 0.4876,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7423441410064697,
"rewards/margins": 1.6086766719818115,
"rewards/rejected": -3.3510212898254395,
"step": 3730
},
{
"epoch": 0.8973128598848369,
"grad_norm": 15.530425079669428,
"learning_rate": 1.5890871467258898e-08,
"logits/chosen": -0.5325186252593994,
"logits/rejected": -0.5298448204994202,
"logps/chosen": -506.15118408203125,
"logps/rejected": -606.4880981445312,
"loss": 0.4461,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7241334915161133,
"rewards/margins": 1.3581666946411133,
"rewards/rejected": -3.0822999477386475,
"step": 3740
},
{
"epoch": 0.8997120921305183,
"grad_norm": 10.19750465326913,
"learning_rate": 1.5164495403207967e-08,
"logits/chosen": -0.6508103609085083,
"logits/rejected": -0.6848149299621582,
"logps/chosen": -467.98046875,
"logps/rejected": -680.7239379882812,
"loss": 0.4473,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8734798431396484,
"rewards/margins": 1.8285648822784424,
"rewards/rejected": -3.702044725418091,
"step": 3750
},
{
"epoch": 0.9021113243761996,
"grad_norm": 12.346658548809334,
"learning_rate": 1.4454592116815962e-08,
"logits/chosen": -0.5491658449172974,
"logits/rejected": -0.5658199787139893,
"logps/chosen": -436.49505615234375,
"logps/rejected": -605.0525512695312,
"loss": 0.4476,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5681920051574707,
"rewards/margins": 1.4636965990066528,
"rewards/rejected": -3.031888484954834,
"step": 3760
},
{
"epoch": 0.904510556621881,
"grad_norm": 8.029053228209763,
"learning_rate": 1.3761211404977934e-08,
"logits/chosen": -0.6819595098495483,
"logits/rejected": -0.6787452101707458,
"logps/chosen": -416.7828063964844,
"logps/rejected": -647.9034423828125,
"loss": 0.4124,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8057578802108765,
"rewards/margins": 2.299898862838745,
"rewards/rejected": -4.105656623840332,
"step": 3770
},
{
"epoch": 0.9069097888675623,
"grad_norm": 12.064654099310772,
"learning_rate": 1.3084401905596177e-08,
"logits/chosen": -0.6439425349235535,
"logits/rejected": -0.7118849158287048,
"logps/chosen": -462.96624755859375,
"logps/rejected": -571.2410278320312,
"loss": 0.4681,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6763248443603516,
"rewards/margins": 1.4996883869171143,
"rewards/rejected": -3.176013469696045,
"step": 3780
},
{
"epoch": 0.9093090211132437,
"grad_norm": 11.710340217031446,
"learning_rate": 1.2424211094168053e-08,
"logits/chosen": -0.4598866403102875,
"logits/rejected": -0.5065708756446838,
"logps/chosen": -502.1546936035156,
"logps/rejected": -631.9319458007812,
"loss": 0.4355,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6645265817642212,
"rewards/margins": 1.4170030355453491,
"rewards/rejected": -3.0815296173095703,
"step": 3790
},
{
"epoch": 0.9117082533589251,
"grad_norm": 10.967428634066259,
"learning_rate": 1.1780685280456143e-08,
"logits/chosen": -0.5946656465530396,
"logits/rejected": -0.6331689953804016,
"logps/chosen": -513.2894287109375,
"logps/rejected": -712.6598510742188,
"loss": 0.5091,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.061929225921631,
"rewards/margins": 1.8402255773544312,
"rewards/rejected": -3.9021544456481934,
"step": 3800
},
{
"epoch": 0.9141074856046065,
"grad_norm": 12.665118166759022,
"learning_rate": 1.1153869605239564e-08,
"logits/chosen": -0.5937948226928711,
"logits/rejected": -0.6147378087043762,
"logps/chosen": -453.622314453125,
"logps/rejected": -509.7294006347656,
"loss": 0.4617,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6426572799682617,
"rewards/margins": 1.0369822978973389,
"rewards/rejected": -2.6796395778656006,
"step": 3810
},
{
"epoch": 0.9165067178502879,
"grad_norm": 13.103880470124455,
"learning_rate": 1.0543808037147606e-08,
"logits/chosen": -0.6876846551895142,
"logits/rejected": -0.7068900465965271,
"logps/chosen": -422.0281677246094,
"logps/rejected": -667.2548828125,
"loss": 0.448,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5988212823867798,
"rewards/margins": 2.381234645843506,
"rewards/rejected": -3.980056047439575,
"step": 3820
},
{
"epoch": 0.9189059500959693,
"grad_norm": 8.997982934710759,
"learning_rate": 9.95054336957557e-09,
"logits/chosen": -0.6352418661117554,
"logits/rejected": -0.6384015083312988,
"logps/chosen": -425.876220703125,
"logps/rejected": -567.1570434570312,
"loss": 0.4062,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.594925045967102,
"rewards/margins": 1.3477894067764282,
"rewards/rejected": -2.9427144527435303,
"step": 3830
},
{
"epoch": 0.9213051823416507,
"grad_norm": 11.532295976661894,
"learning_rate": 9.37411721768286e-09,
"logits/chosen": -0.5964576601982117,
"logits/rejected": -0.6410446763038635,
"logps/chosen": -464.87823486328125,
"logps/rejected": -694.9117431640625,
"loss": 0.4116,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9001286029815674,
"rewards/margins": 1.8672752380371094,
"rewards/rejected": -3.767404079437256,
"step": 3840
},
{
"epoch": 0.9237044145873321,
"grad_norm": 11.154793279181051,
"learning_rate": 8.81457001547392e-09,
"logits/chosen": -0.5532232522964478,
"logits/rejected": -0.537521481513977,
"logps/chosen": -463.60540771484375,
"logps/rejected": -587.6224365234375,
"loss": 0.4493,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8449652194976807,
"rewards/margins": 1.1032222509384155,
"rewards/rejected": -2.9481875896453857,
"step": 3850
},
{
"epoch": 0.9261036468330134,
"grad_norm": 11.05082468706879,
"learning_rate": 8.271941012961942e-09,
"logits/chosen": -0.546120822429657,
"logits/rejected": -0.5496604442596436,
"logps/chosen": -394.8270568847656,
"logps/rejected": -661.518310546875,
"loss": 0.452,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7090051174163818,
"rewards/margins": 2.0601634979248047,
"rewards/rejected": -3.7691688537597656,
"step": 3860
},
{
"epoch": 0.9285028790786948,
"grad_norm": 12.11334503986506,
"learning_rate": 7.746268273415568e-09,
"logits/chosen": -0.6160927414894104,
"logits/rejected": -0.5806897878646851,
"logps/chosen": -448.4591369628906,
"logps/rejected": -560.7288818359375,
"loss": 0.4697,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6977341175079346,
"rewards/margins": 0.6812986135482788,
"rewards/rejected": -2.379032611846924,
"step": 3870
},
{
"epoch": 0.9309021113243762,
"grad_norm": 11.544284346542328,
"learning_rate": 7.237588670689076e-09,
"logits/chosen": -0.6722389459609985,
"logits/rejected": -0.7193390727043152,
"logps/chosen": -424.52545166015625,
"logps/rejected": -611.8565063476562,
"loss": 0.4297,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7004144191741943,
"rewards/margins": 2.1898179054260254,
"rewards/rejected": -3.890232801437378,
"step": 3880
},
{
"epoch": 0.9333013435700576,
"grad_norm": 11.715147679206341,
"learning_rate": 6.745937886635606e-09,
"logits/chosen": -0.5881049633026123,
"logits/rejected": -0.6139761805534363,
"logps/chosen": -464.87908935546875,
"logps/rejected": -680.6104125976562,
"loss": 0.4339,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7303098440170288,
"rewards/margins": 2.0931639671325684,
"rewards/rejected": -3.8234734535217285,
"step": 3890
},
{
"epoch": 0.935700575815739,
"grad_norm": 10.400118656620084,
"learning_rate": 6.271350408604409e-09,
"logits/chosen": -0.6110928058624268,
"logits/rejected": -0.6207016706466675,
"logps/chosen": -360.6642761230469,
"logps/rejected": -565.9451904296875,
"loss": 0.427,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.2418937683105469,
"rewards/margins": 1.8387107849121094,
"rewards/rejected": -3.0806050300598145,
"step": 3900
},
{
"epoch": 0.9380998080614203,
"grad_norm": 10.41650137772363,
"learning_rate": 5.813859527021487e-09,
"logits/chosen": -0.5888563394546509,
"logits/rejected": -0.6184204816818237,
"logps/chosen": -425.99176025390625,
"logps/rejected": -601.2418823242188,
"loss": 0.4341,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6858470439910889,
"rewards/margins": 2.0192112922668457,
"rewards/rejected": -3.7050583362579346,
"step": 3910
},
{
"epoch": 0.9404990403071017,
"grad_norm": 11.8325653977223,
"learning_rate": 5.373497333054616e-09,
"logits/chosen": -0.627325713634491,
"logits/rejected": -0.6293385028839111,
"logps/chosen": -477.244140625,
"logps/rejected": -571.5916748046875,
"loss": 0.485,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9243043661117554,
"rewards/margins": 1.038652777671814,
"rewards/rejected": -2.9629569053649902,
"step": 3920
},
{
"epoch": 0.9428982725527831,
"grad_norm": 12.78783329314139,
"learning_rate": 4.950294716362213e-09,
"logits/chosen": -0.5876864194869995,
"logits/rejected": -0.6184590458869934,
"logps/chosen": -502.6764221191406,
"logps/rejected": -607.631103515625,
"loss": 0.4598,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.936902403831482,
"rewards/margins": 1.1025243997573853,
"rewards/rejected": -3.0394270420074463,
"step": 3930
},
{
"epoch": 0.9452975047984645,
"grad_norm": 9.281745185619053,
"learning_rate": 4.544281362926422e-09,
"logits/chosen": -0.6346616148948669,
"logits/rejected": -0.6384531855583191,
"logps/chosen": -471.59912109375,
"logps/rejected": -625.6812133789062,
"loss": 0.4382,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.560572862625122,
"rewards/margins": 1.6168543100357056,
"rewards/rejected": -3.177427291870117,
"step": 3940
},
{
"epoch": 0.9476967370441459,
"grad_norm": 11.774371944259814,
"learning_rate": 4.15548575297095e-09,
"logits/chosen": -0.6360484957695007,
"logits/rejected": -0.6688522696495056,
"logps/chosen": -415.83685302734375,
"logps/rejected": -604.4151611328125,
"loss": 0.4287,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7383111715316772,
"rewards/margins": 1.9439513683319092,
"rewards/rejected": -3.682262420654297,
"step": 3950
},
{
"epoch": 0.9500959692898272,
"grad_norm": 9.746861342706476,
"learning_rate": 3.7839351589631366e-09,
"logits/chosen": -0.6292358040809631,
"logits/rejected": -0.5761995911598206,
"logps/chosen": -410.7013244628906,
"logps/rejected": -591.5662841796875,
"loss": 0.4475,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7394983768463135,
"rewards/margins": 1.2296544313430786,
"rewards/rejected": -2.9691526889801025,
"step": 3960
},
{
"epoch": 0.9524952015355086,
"grad_norm": 11.778423651753915,
"learning_rate": 3.4296556437010405e-09,
"logits/chosen": -0.6809018850326538,
"logits/rejected": -0.695138692855835,
"logps/chosen": -382.27337646484375,
"logps/rejected": -545.0702514648438,
"loss": 0.4623,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.746303915977478,
"rewards/margins": 1.5834695100784302,
"rewards/rejected": -3.32977294921875,
"step": 3970
},
{
"epoch": 0.95489443378119,
"grad_norm": 12.78112886016458,
"learning_rate": 3.092672058485124e-09,
"logits/chosen": -0.6508705615997314,
"logits/rejected": -0.6388789415359497,
"logps/chosen": -406.3719482421875,
"logps/rejected": -647.2185668945312,
"loss": 0.4965,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6541764736175537,
"rewards/margins": 2.194026231765747,
"rewards/rejected": -3.8482024669647217,
"step": 3980
},
{
"epoch": 0.9572936660268714,
"grad_norm": 11.56058546078559,
"learning_rate": 2.7730080413750356e-09,
"logits/chosen": -0.5227060914039612,
"logits/rejected": -0.5568557977676392,
"logps/chosen": -435.97833251953125,
"logps/rejected": -572.4579467773438,
"loss": 0.4687,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5814177989959717,
"rewards/margins": 1.316449522972107,
"rewards/rejected": -2.897867441177368,
"step": 3990
},
{
"epoch": 0.9596928982725528,
"grad_norm": 10.601860302681265,
"learning_rate": 2.4706860155316033e-09,
"logits/chosen": -0.6122329235076904,
"logits/rejected": -0.6286668181419373,
"logps/chosen": -524.0120849609375,
"logps/rejected": -665.0354614257812,
"loss": 0.4644,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8280586004257202,
"rewards/margins": 1.3872017860412598,
"rewards/rejected": -3.2152607440948486,
"step": 4000
},
{
"epoch": 0.9596928982725528,
"eval_logits/chosen": -0.6084198951721191,
"eval_logits/rejected": -0.626106858253479,
"eval_logps/chosen": -439.457275390625,
"eval_logps/rejected": -623.4827270507812,
"eval_loss": 0.43927037715911865,
"eval_rewards/accuracies": 0.8285714387893677,
"eval_rewards/chosen": -1.7351824045181274,
"eval_rewards/margins": 1.7476173639297485,
"eval_rewards/rejected": -3.482799530029297,
"eval_runtime": 205.925,
"eval_samples_per_second": 21.663,
"eval_steps_per_second": 0.34,
"step": 4000
},
{
"epoch": 0.9620921305182342,
"grad_norm": 12.00737344425498,
"learning_rate": 2.185727187643843e-09,
"logits/chosen": -0.6551751494407654,
"logits/rejected": -0.6754254102706909,
"logps/chosen": -384.97015380859375,
"logps/rejected": -610.7628784179688,
"loss": 0.4753,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.6510511636734009,
"rewards/margins": 2.127711296081543,
"rewards/rejected": -3.7787623405456543,
"step": 4010
},
{
"epoch": 0.9644913627639156,
"grad_norm": 13.508620118248077,
"learning_rate": 1.9181515464413434e-09,
"logits/chosen": -0.5892629623413086,
"logits/rejected": -0.6146517395973206,
"logps/chosen": -518.9400024414062,
"logps/rejected": -719.6813354492188,
"loss": 0.4167,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4437657594680786,
"rewards/margins": 1.971710205078125,
"rewards/rejected": -3.4154763221740723,
"step": 4020
},
{
"epoch": 0.966890595009597,
"grad_norm": 10.39563083030194,
"learning_rate": 1.6679778612923302e-09,
"logits/chosen": -0.5608581304550171,
"logits/rejected": -0.6161444187164307,
"logps/chosen": -488.674560546875,
"logps/rejected": -594.23681640625,
"loss": 0.4207,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8151352405548096,
"rewards/margins": 1.053483247756958,
"rewards/rejected": -2.8686180114746094,
"step": 4030
},
{
"epoch": 0.9692898272552783,
"grad_norm": 11.778106767916414,
"learning_rate": 1.43522368088686e-09,
"logits/chosen": -0.5647310018539429,
"logits/rejected": -0.6243175864219666,
"logps/chosen": -465.7294006347656,
"logps/rejected": -713.4627685546875,
"loss": 0.4842,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9448398351669312,
"rewards/margins": 2.4202308654785156,
"rewards/rejected": -4.365070819854736,
"step": 4040
},
{
"epoch": 0.9716890595009597,
"grad_norm": 13.924155842753079,
"learning_rate": 1.2199053320059993e-09,
"logits/chosen": -0.5596794486045837,
"logits/rejected": -0.5762395858764648,
"logps/chosen": -459.1459045410156,
"logps/rejected": -605.2967529296875,
"loss": 0.4569,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7754650115966797,
"rewards/margins": 1.3398131132125854,
"rewards/rejected": -3.1152782440185547,
"step": 4050
},
{
"epoch": 0.974088291746641,
"grad_norm": 10.101674481691141,
"learning_rate": 1.0220379183764338e-09,
"logits/chosen": -0.6857717633247375,
"logits/rejected": -0.67542564868927,
"logps/chosen": -366.9244384765625,
"logps/rejected": -573.9992065429688,
"loss": 0.4449,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5600941181182861,
"rewards/margins": 1.933546781539917,
"rewards/rejected": -3.493640899658203,
"step": 4060
},
{
"epoch": 0.9764875239923224,
"grad_norm": 10.9782818923637,
"learning_rate": 8.416353196111503e-10,
"logits/chosen": -0.5480167269706726,
"logits/rejected": -0.551671028137207,
"logps/chosen": -435.30120849609375,
"logps/rejected": -583.2462158203125,
"loss": 0.4928,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.900551438331604,
"rewards/margins": 1.5537331104278564,
"rewards/rejected": -3.45428466796875,
"step": 4070
},
{
"epoch": 0.9788867562380038,
"grad_norm": 13.709217546306082,
"learning_rate": 6.787101902356873e-10,
"logits/chosen": -0.5955997705459595,
"logits/rejected": -0.5746399760246277,
"logps/chosen": -457.44512939453125,
"logps/rejected": -655.3438110351562,
"loss": 0.4125,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.843754529953003,
"rewards/margins": 1.7090778350830078,
"rewards/rejected": -3.5528323650360107,
"step": 4080
},
{
"epoch": 0.9812859884836852,
"grad_norm": 15.002773863211088,
"learning_rate": 5.332739588005953e-10,
"logits/chosen": -0.7055156826972961,
"logits/rejected": -0.7228876352310181,
"logps/chosen": -376.8843994140625,
"logps/rejected": -592.8224487304688,
"loss": 0.4463,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6472818851470947,
"rewards/margins": 1.8167731761932373,
"rewards/rejected": -3.464055299758911,
"step": 4090
},
{
"epoch": 0.9836852207293666,
"grad_norm": 13.389436536453555,
"learning_rate": 4.053368270797164e-10,
"logits/chosen": -0.5252457857131958,
"logits/rejected": -0.5572882890701294,
"logps/chosen": -437.04620361328125,
"logps/rejected": -581.270751953125,
"loss": 0.4427,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8863855600357056,
"rewards/margins": 1.4563047885894775,
"rewards/rejected": -3.3426902294158936,
"step": 4100
},
{
"epoch": 0.986084452975048,
"grad_norm": 9.112429191727882,
"learning_rate": 2.949077693545354e-10,
"logits/chosen": -0.49874311685562134,
"logits/rejected": -0.5566374063491821,
"logps/chosen": -467.4441833496094,
"logps/rejected": -625.98681640625,
"loss": 0.4872,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7639166116714478,
"rewards/margins": 1.2108697891235352,
"rewards/rejected": -2.9747862815856934,
"step": 4110
},
{
"epoch": 0.9884836852207294,
"grad_norm": 10.18504682387471,
"learning_rate": 2.0199453178471047e-10,
"logits/chosen": -0.5316934585571289,
"logits/rejected": -0.5874772071838379,
"logps/chosen": -509.3759765625,
"logps/rejected": -588.4547119140625,
"loss": 0.4279,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8416917324066162,
"rewards/margins": 1.1702340841293335,
"rewards/rejected": -3.0119261741638184,
"step": 4120
},
{
"epoch": 0.9908829174664108,
"grad_norm": 9.481256396419454,
"learning_rate": 1.266036318647301e-10,
"logits/chosen": -0.593826413154602,
"logits/rejected": -0.6198239326477051,
"logps/chosen": -489.977294921875,
"logps/rejected": -679.3530883789062,
"loss": 0.42,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.5848186016082764,
"rewards/margins": 2.1017608642578125,
"rewards/rejected": -3.686579465866089,
"step": 4130
},
{
"epoch": 0.9932821497120922,
"grad_norm": 14.774926148490673,
"learning_rate": 6.874035796672339e-11,
"logits/chosen": -0.6392898559570312,
"logits/rejected": -0.65348219871521,
"logps/chosen": -449.4820251464844,
"logps/rejected": -634.1976318359375,
"loss": 0.4374,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.4988961219787598,
"rewards/margins": 2.359536647796631,
"rewards/rejected": -3.858433246612549,
"step": 4140
},
{
"epoch": 0.9956813819577736,
"grad_norm": 13.569592726410221,
"learning_rate": 2.8408768969423458e-11,
"logits/chosen": -0.6521973609924316,
"logits/rejected": -0.6605287194252014,
"logps/chosen": -451.91217041015625,
"logps/rejected": -617.4226684570312,
"loss": 0.446,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6109716892242432,
"rewards/margins": 1.5008330345153809,
"rewards/rejected": -3.111804485321045,
"step": 4150
},
{
"epoch": 0.9980806142034548,
"grad_norm": 14.763770162561924,
"learning_rate": 5.611693973617271e-12,
"logits/chosen": -0.5573083162307739,
"logits/rejected": -0.5667176246643066,
"logps/chosen": -397.83111572265625,
"logps/rejected": -570.945068359375,
"loss": 0.465,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6926639080047607,
"rewards/margins": 1.5600066184997559,
"rewards/rejected": -3.2526707649230957,
"step": 4160
},
{
"epoch": 1.0,
"step": 4168,
"total_flos": 0.0,
"train_loss": 0.5098277106738136,
"train_runtime": 16630.655,
"train_samples_per_second": 8.019,
"train_steps_per_second": 0.251
}
],
"logging_steps": 10,
"max_steps": 4168,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}