qwen2.5-7B-instruct-kto / trainer_state.json
ptrdvn's picture
Upload folder using huggingface_hub
7a3ab06 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9974025974025974,
"eval_steps": 36,
"global_step": 360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027705627705627706,
"grad_norm": 14.627778578717658,
"kl": 0.0,
"learning_rate": 1.25e-06,
"logits/chosen": -36907408.0,
"logits/rejected": -36792634.666666664,
"logps/chosen": -312.351806640625,
"logps/rejected": -283.6378580729167,
"loss": 0.5,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.005541125541125541,
"grad_norm": 13.780174124436396,
"kl": 0.0,
"learning_rate": 2.5e-06,
"logits/chosen": -34840466.666666664,
"logits/rejected": -43695128.0,
"logps/chosen": -182.85721842447916,
"logps/rejected": -222.22654724121094,
"loss": 0.5,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.008311688311688312,
"grad_norm": 12.294259063686624,
"kl": 4.698612689971924,
"learning_rate": 3.7500000000000005e-06,
"logits/chosen": -19804460.8,
"logits/rejected": -16712610.909090908,
"logps/chosen": -222.3800048828125,
"logps/rejected": -122.40013538707386,
"loss": 0.4986,
"rewards/chosen": 0.04441284537315369,
"rewards/margins": 0.043410241061990915,
"rewards/rejected": 0.0010026043111627753,
"step": 3
},
{
"epoch": 0.011082251082251082,
"grad_norm": 17.203944077808572,
"kl": 8.57018756866455,
"learning_rate": 5e-06,
"logits/chosen": -42728661.333333336,
"logits/rejected": -37368886.85714286,
"logps/chosen": -206.19590928819446,
"logps/rejected": -164.41081891741072,
"loss": 0.4789,
"rewards/chosen": -0.018145985073513456,
"rewards/margins": 0.18317703027573842,
"rewards/rejected": -0.20132301534925187,
"step": 4
},
{
"epoch": 0.013852813852813853,
"grad_norm": 10.50938907109465,
"kl": 34.45105743408203,
"learning_rate": 4.999902656502973e-06,
"logits/chosen": -18186068.0,
"logits/rejected": -24469592.0,
"logps/chosen": -188.94903564453125,
"logps/rejected": -102.71192932128906,
"loss": 0.4732,
"rewards/chosen": 0.12755107879638672,
"rewards/margins": 0.456174373626709,
"rewards/rejected": -0.32862329483032227,
"step": 5
},
{
"epoch": 0.016623376623376623,
"grad_norm": 9.308233094927798,
"kl": 64.32754516601562,
"learning_rate": 4.9996106335924965e-06,
"logits/chosen": -58021513.14285714,
"logits/rejected": -46324174.222222224,
"logps/chosen": -431.99476841517856,
"logps/rejected": -169.90737575954861,
"loss": 0.3975,
"rewards/chosen": 0.04927550894873483,
"rewards/margins": 1.1982065883893815,
"rewards/rejected": -1.1489310794406467,
"step": 6
},
{
"epoch": 0.019393939393939394,
"grad_norm": 8.26019431779866,
"kl": 64.02778625488281,
"learning_rate": 4.999123954009797e-06,
"logits/chosen": -39251945.14285714,
"logits/rejected": -45070531.55555555,
"logps/chosen": -216.63478306361608,
"logps/rejected": -215.42609320746527,
"loss": 0.352,
"rewards/chosen": -1.9811251504080636,
"rewards/margins": 2.4016610705663286,
"rewards/rejected": -4.382786220974392,
"step": 7
},
{
"epoch": 0.022164502164502164,
"grad_norm": 8.217719696009635,
"kl": 48.159217834472656,
"learning_rate": 4.998442655654946e-06,
"logits/chosen": -34504220.8,
"logits/rejected": -36456395.63636363,
"logps/chosen": -324.70927734375,
"logps/rejected": -180.1541082208807,
"loss": 0.3637,
"rewards/chosen": -1.7615163803100586,
"rewards/margins": 3.1409979906949133,
"rewards/rejected": -4.902514371004972,
"step": 8
},
{
"epoch": 0.024935064935064935,
"grad_norm": 10.654775495500138,
"kl": 23.938976287841797,
"learning_rate": 4.997566791583916e-06,
"logits/chosen": -67123216.0,
"logits/rejected": -84117072.0,
"logps/chosen": -435.34124755859375,
"logps/rejected": -230.71749877929688,
"loss": 0.4793,
"rewards/chosen": -2.759248733520508,
"rewards/margins": 2.205596923828125,
"rewards/rejected": -4.964845657348633,
"step": 9
},
{
"epoch": 0.027705627705627706,
"grad_norm": 8.39236794673388,
"kl": 63.119232177734375,
"learning_rate": 4.996496430004446e-06,
"logits/chosen": -67407008.0,
"logits/rejected": -40225140.0,
"logps/chosen": -231.02957153320312,
"logps/rejected": -157.6824951171875,
"loss": 0.3755,
"rewards/chosen": -1.482393503189087,
"rewards/margins": 2.959088087081909,
"rewards/rejected": -4.441481590270996,
"step": 10
},
{
"epoch": 0.030476190476190476,
"grad_norm": 9.000961005160892,
"kl": 80.52793884277344,
"learning_rate": 4.995231654270726e-06,
"logits/chosen": -35507206.4,
"logits/rejected": -27073269.333333332,
"logps/chosen": -129.73853759765626,
"logps/rejected": -198.3338419596354,
"loss": 0.4853,
"rewards/chosen": -3.5513961791992186,
"rewards/margins": -2.7626913865407308,
"rewards/rejected": -0.7887047926584879,
"step": 11
},
{
"epoch": 0.03324675324675325,
"grad_norm": 6.379191198402483,
"kl": 106.77307891845703,
"learning_rate": 4.993772562876909e-06,
"logits/chosen": -41411072.0,
"logits/rejected": -30578227.2,
"logps/chosen": -189.07457386363637,
"logps/rejected": -137.128857421875,
"loss": 0.3923,
"rewards/chosen": 1.0351500077681108,
"rewards/margins": 1.3740568681196734,
"rewards/rejected": -0.3389068603515625,
"step": 12
},
{
"epoch": 0.03601731601731602,
"grad_norm": 6.452742017638695,
"kl": 122.21878814697266,
"learning_rate": 4.992119269449445e-06,
"logits/chosen": -43168708.571428575,
"logits/rejected": -131686030.22222222,
"logps/chosen": -225.06903948102678,
"logps/rejected": -224.36984592013889,
"loss": 0.3514,
"rewards/chosen": -0.09181928634643555,
"rewards/margins": 4.457288159264459,
"rewards/rejected": -4.549107445610894,
"step": 13
},
{
"epoch": 0.03878787878787879,
"grad_norm": 5.69756506879914,
"kl": 121.53662872314453,
"learning_rate": 4.990271902738223e-06,
"logits/chosen": -59036272.0,
"logits/rejected": -60129404.0,
"logps/chosen": -261.13958740234375,
"logps/rejected": -139.1809539794922,
"loss": 0.4055,
"rewards/chosen": -0.7429616451263428,
"rewards/margins": 2.4130918979644775,
"rewards/rejected": -3.1560535430908203,
"step": 14
},
{
"epoch": 0.04155844155844156,
"grad_norm": 6.128286045148822,
"kl": 135.92550659179688,
"learning_rate": 4.988230606606552e-06,
"logits/chosen": -68020558.22222222,
"logits/rejected": -159836653.7142857,
"logps/chosen": -279.03700086805554,
"logps/rejected": -178.57425362723214,
"loss": 0.3175,
"rewards/chosen": 1.3599602381388347,
"rewards/margins": 3.7635091599963957,
"rewards/rejected": -2.403548921857561,
"step": 15
},
{
"epoch": 0.04432900432900433,
"grad_norm": 6.677440912213255,
"kl": 200.06797790527344,
"learning_rate": 4.985995540019956e-06,
"logits/chosen": -64806084.0,
"logits/rejected": -43605824.0,
"logps/chosen": -188.4673309326172,
"logps/rejected": -151.33450317382812,
"loss": 0.3861,
"rewards/chosen": -2.2935891151428223,
"rewards/margins": 1.6028807163238525,
"rewards/rejected": -3.896469831466675,
"step": 16
},
{
"epoch": 0.0470995670995671,
"grad_norm": 5.53499338288777,
"kl": 132.95904541015625,
"learning_rate": 4.983566877033791e-06,
"logits/chosen": -58334624.0,
"logits/rejected": -47487511.27272727,
"logps/chosen": -269.74716796875,
"logps/rejected": -178.86970658735797,
"loss": 0.3605,
"rewards/chosen": -0.3949946403503418,
"rewards/margins": 4.419949887015602,
"rewards/rejected": -4.814944527365944,
"step": 17
},
{
"epoch": 0.04987012987012987,
"grad_norm": 5.996796684805764,
"kl": 208.39804077148438,
"learning_rate": 4.980944806779698e-06,
"logits/chosen": -80701216.0,
"logits/rejected": -29817596.8,
"logps/chosen": -366.2938639322917,
"logps/rejected": -123.7053955078125,
"loss": 0.388,
"rewards/chosen": 0.7954994837443033,
"rewards/margins": 2.894377581278483,
"rewards/rejected": -2.09887809753418,
"step": 18
},
{
"epoch": 0.05264069264069264,
"grad_norm": 5.298093207942763,
"kl": 171.8631591796875,
"learning_rate": 4.9781295334508664e-06,
"logits/chosen": -27342618.666666668,
"logits/rejected": -175808640.0,
"logps/chosen": -178.55368381076389,
"logps/rejected": -251.65311104910714,
"loss": 0.3068,
"rewards/chosen": -1.4788221783108182,
"rewards/margins": 5.431802734496101,
"rewards/rejected": -6.910624912806919,
"step": 19
},
{
"epoch": 0.05541125541125541,
"grad_norm": 5.5366402223371605,
"kl": 168.5543212890625,
"learning_rate": 4.975121276286136e-06,
"logits/chosen": -39478311.11111111,
"logits/rejected": -44926532.571428575,
"logps/chosen": -229.44447157118054,
"logps/rejected": -176.92333984375,
"loss": 0.3188,
"rewards/chosen": 0.9360362158881294,
"rewards/margins": 3.546793498690166,
"rewards/rejected": -2.610757282802037,
"step": 20
},
{
"epoch": 0.05818181818181818,
"grad_norm": 4.114450168170683,
"kl": 213.9116973876953,
"learning_rate": 4.9719202695529265e-06,
"logits/chosen": -30691328.0,
"logits/rejected": -33394152.727272727,
"logps/chosen": -149.3848876953125,
"logps/rejected": -156.1878329190341,
"loss": 0.3428,
"rewards/chosen": -1.1895624160766602,
"rewards/margins": 1.3192689548839223,
"rewards/rejected": -2.5088313709605825,
"step": 21
},
{
"epoch": 0.06095238095238095,
"grad_norm": 4.009166305344757,
"kl": 183.32760620117188,
"learning_rate": 4.968526762528988e-06,
"logits/chosen": -35098371.2,
"logits/rejected": 25460706.90909091,
"logps/chosen": -145.5945556640625,
"logps/rejected": -120.78593306107955,
"loss": 0.3241,
"rewards/chosen": -3.308795166015625,
"rewards/margins": 0.9302642822265623,
"rewards/rejected": -4.2390594482421875,
"step": 22
},
{
"epoch": 0.06372294372294372,
"grad_norm": 4.640987152057261,
"kl": 153.11923217773438,
"learning_rate": 4.964941019482995e-06,
"logits/chosen": -33867536.0,
"logits/rejected": -55536612.0,
"logps/chosen": -305.594970703125,
"logps/rejected": -207.89471435546875,
"loss": 0.3545,
"rewards/chosen": 0.747844934463501,
"rewards/margins": 4.4949328899383545,
"rewards/rejected": -3.7470879554748535,
"step": 23
},
{
"epoch": 0.0664935064935065,
"grad_norm": 4.069238916717589,
"kl": 125.38595581054688,
"learning_rate": 4.961163319653959e-06,
"logits/chosen": -47397428.0,
"logits/rejected": -82234298.66666667,
"logps/chosen": -178.1082763671875,
"logps/rejected": -169.91890462239584,
"loss": 0.3224,
"rewards/chosen": 0.6711676716804504,
"rewards/margins": 4.262869497140249,
"rewards/rejected": -3.5917018254597983,
"step": 24
},
{
"epoch": 0.06926406926406926,
"grad_norm": 4.859678347936459,
"kl": 130.6267852783203,
"learning_rate": 4.9571939572294914e-06,
"logits/chosen": -38077904.0,
"logits/rejected": -46673220.0,
"logps/chosen": -246.6427001953125,
"logps/rejected": -112.71556091308594,
"loss": 0.318,
"rewards/chosen": 0.7895539999008179,
"rewards/margins": 3.2466784715652466,
"rewards/rejected": -2.4571244716644287,
"step": 25
},
{
"epoch": 0.07203463203463203,
"grad_norm": 4.215905495865878,
"kl": 85.88780212402344,
"learning_rate": 4.953033241322887e-06,
"logits/chosen": -36872256.0,
"logits/rejected": -27889625.6,
"logps/chosen": -206.98484108664772,
"logps/rejected": -222.779833984375,
"loss": 0.3116,
"rewards/chosen": -0.3384280638261275,
"rewards/margins": 2.963332037492232,
"rewards/rejected": -3.3017601013183593,
"step": 26
},
{
"epoch": 0.0748051948051948,
"grad_norm": 5.17124531247104,
"kl": 132.2495880126953,
"learning_rate": 4.948681495949055e-06,
"logits/chosen": -50783381.333333336,
"logits/rejected": -15797090.0,
"logps/chosen": -282.1715087890625,
"logps/rejected": -155.00286865234375,
"loss": 0.3684,
"rewards/chosen": -0.16415607929229736,
"rewards/margins": 3.6816858053207397,
"rewards/rejected": -3.845841884613037,
"step": 27
},
{
"epoch": 0.07757575757575758,
"grad_norm": 5.45010267483556,
"kl": 99.80706787109375,
"learning_rate": 4.944139059999286e-06,
"logits/chosen": -48624998.4,
"logits/rejected": -14226288.0,
"logps/chosen": -418.8494140625,
"logps/rejected": -180.1219278971354,
"loss": 0.2844,
"rewards/chosen": 1.4669153213500976,
"rewards/margins": 8.812172126770019,
"rewards/rejected": -7.345256805419922,
"step": 28
},
{
"epoch": 0.08034632034632035,
"grad_norm": 4.572726104274604,
"kl": 121.2557601928711,
"learning_rate": 4.939406287214861e-06,
"logits/chosen": -75202816.0,
"logits/rejected": -52673325.71428572,
"logps/chosen": -256.0649685329861,
"logps/rejected": -171.17110770089286,
"loss": 0.3235,
"rewards/chosen": -0.08450836605495876,
"rewards/margins": 2.685517977154444,
"rewards/rejected": -2.7700263432094028,
"step": 29
},
{
"epoch": 0.08311688311688312,
"grad_norm": 4.6083614006549585,
"kl": 86.99708557128906,
"learning_rate": 4.9344835461595016e-06,
"logits/chosen": -36531722.666666664,
"logits/rejected": -29011876.57142857,
"logps/chosen": -172.84847005208334,
"logps/rejected": -144.96559361049108,
"loss": 0.3997,
"rewards/chosen": -0.7234950595431857,
"rewards/margins": 2.354658823164683,
"rewards/rejected": -3.0781538827078685,
"step": 30
},
{
"epoch": 0.08588744588744589,
"grad_norm": 4.872957501355809,
"kl": 108.07892608642578,
"learning_rate": 4.929371220190671e-06,
"logits/chosen": -23903181.714285713,
"logits/rejected": -30589084.444444444,
"logps/chosen": -177.80123465401786,
"logps/rejected": -171.34022352430554,
"loss": 0.3246,
"rewards/chosen": 0.4418588365827288,
"rewards/margins": 2.642532257806687,
"rewards/rejected": -2.2006734212239585,
"step": 31
},
{
"epoch": 0.08865800865800866,
"grad_norm": 4.353723125264067,
"kl": 189.1405029296875,
"learning_rate": 4.9240697074297205e-06,
"logits/chosen": -61281422.222222224,
"logits/rejected": -26948859.42857143,
"logps/chosen": -315.59071180555554,
"logps/rejected": -145.61966378348214,
"loss": 0.2904,
"rewards/chosen": 1.7190555996365018,
"rewards/margins": 6.484896402510385,
"rewards/rejected": -4.765840802873884,
"step": 32
},
{
"epoch": 0.09142857142857143,
"grad_norm": 3.9621961428795482,
"kl": 129.8131103515625,
"learning_rate": 4.918579420730884e-06,
"logits/chosen": -59562949.333333336,
"logits/rejected": -38947785.6,
"logps/chosen": -232.64375813802084,
"logps/rejected": -109.01717529296874,
"loss": 0.3206,
"rewards/chosen": 0.07605441411336263,
"rewards/margins": 2.483719857533773,
"rewards/rejected": -2.4076654434204103,
"step": 33
},
{
"epoch": 0.0941991341991342,
"grad_norm": 4.640088655848224,
"kl": 153.42471313476562,
"learning_rate": 4.912900787649124e-06,
"logits/chosen": -79684328.72727273,
"logits/rejected": -61416396.8,
"logps/chosen": -557.2211026278409,
"logps/rejected": -227.8122314453125,
"loss": 0.3035,
"rewards/chosen": 1.5112367109818892,
"rewards/margins": 5.1786668257279835,
"rewards/rejected": -3.667430114746094,
"step": 34
},
{
"epoch": 0.09696969696969697,
"grad_norm": 3.400632617608017,
"kl": 223.9918212890625,
"learning_rate": 4.907034250406846e-06,
"logits/chosen": -63817045.333333336,
"logits/rejected": -36983830.85714286,
"logps/chosen": -226.66205512152777,
"logps/rejected": -213.019775390625,
"loss": 0.3779,
"rewards/chosen": -0.03687206904093424,
"rewards/margins": 5.821811380840483,
"rewards/rejected": -5.8586834498814175,
"step": 35
},
{
"epoch": 0.09974025974025974,
"grad_norm": 4.880517140185325,
"kl": 106.79385375976562,
"learning_rate": 4.900980265859449e-06,
"logits/chosen": -36042773.333333336,
"logits/rejected": -33042774.85714286,
"logps/chosen": -207.92580837673611,
"logps/rejected": -164.6212158203125,
"loss": 0.2623,
"rewards/chosen": -0.6909873220655653,
"rewards/margins": 1.9688087115212094,
"rewards/rejected": -2.6597960335867747,
"step": 36
},
{
"epoch": 0.09974025974025974,
"eval_logits/chosen": -55713169.06666667,
"eval_logits/rejected": -40055004.68965517,
"eval_logps/chosen": -345.47955729166665,
"eval_logps/rejected": -197.1070177801724,
"eval_loss": 0.33400189876556396,
"eval_rewards/chosen": 1.3846696217854817,
"eval_rewards/margins": 5.02308683285768,
"eval_rewards/rejected": -3.6384172110721984,
"eval_runtime": 376.2441,
"eval_samples_per_second": 1.241,
"eval_steps_per_second": 0.157,
"kl": 890.2158813476562,
"step": 36
},
{
"epoch": 0.10251082251082251,
"grad_norm": 4.888307645224207,
"kl": 218.9296112060547,
"learning_rate": 4.894739305459754e-06,
"logits/chosen": -43587472.0,
"logits/rejected": -41952672.0,
"logps/chosen": -277.0217692057292,
"logps/rejected": -136.7241455078125,
"loss": 0.2681,
"rewards/chosen": -0.6008732318878174,
"rewards/margins": 2.065483808517456,
"rewards/rejected": -2.6663570404052734,
"step": 37
},
{
"epoch": 0.10528138528138528,
"grad_norm": 3.9930692633221185,
"kl": 173.95948791503906,
"learning_rate": 4.88831185522129e-06,
"logits/chosen": -42094129.777777776,
"logits/rejected": -102693074.28571428,
"logps/chosen": -383.2141384548611,
"logps/rejected": -281.09629603794644,
"loss": 0.3007,
"rewards/chosen": 1.5146233240763347,
"rewards/margins": 7.224462191263835,
"rewards/rejected": -5.7098388671875,
"step": 38
},
{
"epoch": 0.10805194805194805,
"grad_norm": 4.7795619675028815,
"kl": 210.2698211669922,
"learning_rate": 4.881698415680442e-06,
"logits/chosen": -55168146.28571428,
"logits/rejected": -61552739.55555555,
"logps/chosen": -279.0960693359375,
"logps/rejected": -206.74319118923611,
"loss": 0.3303,
"rewards/chosen": 0.6114097322736468,
"rewards/margins": 5.236137193346781,
"rewards/rejected": -4.6247274610731335,
"step": 39
},
{
"epoch": 0.11082251082251082,
"grad_norm": 4.316697115631673,
"kl": 160.36947631835938,
"learning_rate": 4.874899501857477e-06,
"logits/chosen": -48044475.428571425,
"logits/rejected": -110801429.33333333,
"logps/chosen": -183.30032784598214,
"logps/rejected": -277.29033745659723,
"loss": 0.2789,
"rewards/chosen": -1.4944199153355189,
"rewards/margins": 6.188857638646685,
"rewards/rejected": -7.6832775539822045,
"step": 40
},
{
"epoch": 0.11359307359307359,
"grad_norm": 3.752176207163753,
"kl": 255.1221466064453,
"learning_rate": 4.867915643216434e-06,
"logits/chosen": -47792854.85714286,
"logits/rejected": -61886236.44444445,
"logps/chosen": -237.83729771205358,
"logps/rejected": -266.69927300347223,
"loss": 0.3505,
"rewards/chosen": 0.7381483486720494,
"rewards/margins": 6.307764030638195,
"rewards/rejected": -5.5696156819661455,
"step": 41
},
{
"epoch": 0.11636363636363636,
"grad_norm": 4.00527705976467,
"kl": 229.1636199951172,
"learning_rate": 4.860747383623889e-06,
"logits/chosen": -67951502.22222222,
"logits/rejected": -32232422.85714286,
"logps/chosen": -215.768310546875,
"logps/rejected": -87.32527378627232,
"loss": 0.3736,
"rewards/chosen": 1.4013728035820856,
"rewards/margins": 1.9508999377962144,
"rewards/rejected": -0.5495271342141288,
"step": 42
},
{
"epoch": 0.11913419913419913,
"grad_norm": 4.135955907049656,
"kl": 246.27914428710938,
"learning_rate": 4.85339528130661e-06,
"logits/chosen": -68992011.63636364,
"logits/rejected": -29306707.2,
"logps/chosen": -250.64783824573863,
"logps/rejected": -138.613916015625,
"loss": 0.3449,
"rewards/chosen": 0.9984440370039507,
"rewards/margins": 6.130092672868209,
"rewards/rejected": -5.1316486358642575,
"step": 43
},
{
"epoch": 0.1219047619047619,
"grad_norm": 4.923417830289614,
"kl": 211.35293579101562,
"learning_rate": 4.845859908808074e-06,
"logits/chosen": -46779109.333333336,
"logits/rejected": -48145318.4,
"logps/chosen": -305.1178792317708,
"logps/rejected": -204.313037109375,
"loss": 0.3315,
"rewards/chosen": 1.7318267822265625,
"rewards/margins": 3.607950782775879,
"rewards/rejected": -1.8761240005493165,
"step": 44
},
{
"epoch": 0.12467532467532468,
"grad_norm": 4.2715927235424385,
"kl": 186.33251953125,
"learning_rate": 4.838141852943891e-06,
"logits/chosen": -38434194.28571428,
"logits/rejected": -33670951.11111111,
"logps/chosen": -271.14303152901783,
"logps/rejected": -149.29356553819446,
"loss": 0.2698,
"rewards/chosen": 1.1793903623308455,
"rewards/margins": 2.703519382174053,
"rewards/rejected": -1.5241290198432074,
"step": 45
},
{
"epoch": 0.12744588744588745,
"grad_norm": 4.6712246149702334,
"kl": 379.38775634765625,
"learning_rate": 4.830241714756099e-06,
"logits/chosen": -47082810.666666664,
"logits/rejected": -53669280.0,
"logps/chosen": -209.59720865885416,
"logps/rejected": -228.5864715576172,
"loss": 0.3081,
"rewards/chosen": -1.286256472269694,
"rewards/margins": 4.078855832417806,
"rewards/rejected": -5.3651123046875,
"step": 46
},
{
"epoch": 0.13021645021645023,
"grad_norm": 4.157335918962299,
"kl": 252.03436279296875,
"learning_rate": 4.822160109466361e-06,
"logits/chosen": -71515571.2,
"logits/rejected": -22340152.0,
"logps/chosen": -444.233447265625,
"logps/rejected": -128.06829833984375,
"loss": 0.2922,
"rewards/chosen": 3.7814491271972654,
"rewards/margins": 7.945715713500976,
"rewards/rejected": -4.164266586303711,
"step": 47
},
{
"epoch": 0.132987012987013,
"grad_norm": 4.614266751571361,
"kl": 209.5349578857422,
"learning_rate": 4.813897666428054e-06,
"logits/chosen": -44444301.71428572,
"logits/rejected": -104307093.33333333,
"logps/chosen": -344.1742466517857,
"logps/rejected": -157.80377875434027,
"loss": 0.3107,
"rewards/chosen": 4.24214472089495,
"rewards/margins": 10.676348398602197,
"rewards/rejected": -6.434203677707249,
"step": 48
},
{
"epoch": 0.13575757575757577,
"grad_norm": 3.963234963343962,
"kl": 283.381103515625,
"learning_rate": 4.805455029077255e-06,
"logits/chosen": -34125922.28571428,
"logits/rejected": -58880739.55555555,
"logps/chosen": -242.03897530691964,
"logps/rejected": -168.01375325520834,
"loss": 0.3119,
"rewards/chosen": 0.16667870112827846,
"rewards/margins": 5.258827799842471,
"rewards/rejected": -5.092149098714192,
"step": 49
},
{
"epoch": 0.13852813852813853,
"grad_norm": 4.655086695934908,
"kl": 234.0029754638672,
"learning_rate": 4.79683285488264e-06,
"logits/chosen": -60300384.0,
"logits/rejected": -33782124.8,
"logps/chosen": -330.35732014973956,
"logps/rejected": -118.8407958984375,
"loss": 0.3508,
"rewards/chosen": -4.03786055246989,
"rewards/margins": -0.5354275385538738,
"rewards/rejected": -3.502433013916016,
"step": 50
},
{
"epoch": 0.1412987012987013,
"grad_norm": 4.404029490858146,
"kl": 258.7068176269531,
"learning_rate": 4.788031815294282e-06,
"logits/chosen": -41171053.333333336,
"logits/rejected": -47690988.8,
"logps/chosen": -225.68290201822916,
"logps/rejected": -165.079248046875,
"loss": 0.3853,
"rewards/chosen": -3.4279282887776694,
"rewards/margins": -0.5250184377034506,
"rewards/rejected": -2.902909851074219,
"step": 51
},
{
"epoch": 0.14406926406926407,
"grad_norm": 4.502899514521724,
"kl": 234.19357299804688,
"learning_rate": 4.779052595691355e-06,
"logits/chosen": -48258922.666666664,
"logits/rejected": -41363040.0,
"logps/chosen": -226.1843465169271,
"logps/rejected": -213.36155700683594,
"loss": 0.3074,
"rewards/chosen": -0.007684846719106038,
"rewards/margins": 4.0709970990816755,
"rewards/rejected": -4.078681945800781,
"step": 52
},
{
"epoch": 0.14683982683982685,
"grad_norm": 4.564834903412173,
"kl": 273.16302490234375,
"learning_rate": 4.76989589532877e-06,
"logits/chosen": -58062768.0,
"logits/rejected": -28522993.230769232,
"logps/chosen": -288.6212158203125,
"logps/rejected": -137.31136380709134,
"loss": 0.3462,
"rewards/chosen": 2.7320359547932944,
"rewards/margins": 6.071770252325596,
"rewards/rejected": -3.339734297532302,
"step": 53
},
{
"epoch": 0.1496103896103896,
"grad_norm": 4.295499026061421,
"kl": 138.43191528320312,
"learning_rate": 4.7605624272827125e-06,
"logits/chosen": -77513848.0,
"logits/rejected": -133321173.33333333,
"logps/chosen": -210.3496856689453,
"logps/rejected": -126.23902384440105,
"loss": 0.3016,
"rewards/chosen": -3.524599075317383,
"rewards/margins": 2.062013626098633,
"rewards/rejected": -5.586612701416016,
"step": 54
},
{
"epoch": 0.1523809523809524,
"grad_norm": 3.7911522230413235,
"kl": 138.6194305419922,
"learning_rate": 4.75105291839512e-06,
"logits/chosen": -52138740.0,
"logits/rejected": -23599914.0,
"logps/chosen": -258.3819274902344,
"logps/rejected": -172.046630859375,
"loss": 0.335,
"rewards/chosen": 0.0711725652217865,
"rewards/margins": 5.18670067191124,
"rewards/rejected": -5.115528106689453,
"step": 55
},
{
"epoch": 0.15515151515151515,
"grad_norm": 3.473713440188282,
"kl": 212.0938720703125,
"learning_rate": 4.741368109217072e-06,
"logits/chosen": -55373659.428571425,
"logits/rejected": -26278688.0,
"logps/chosen": -308.01792689732144,
"logps/rejected": -207.99937608506946,
"loss": 0.2906,
"rewards/chosen": 4.009753363473075,
"rewards/margins": 8.175141682700506,
"rewards/rejected": -4.16538831922743,
"step": 56
},
{
"epoch": 0.15792207792207794,
"grad_norm": 4.254963226590674,
"kl": 171.899658203125,
"learning_rate": 4.7315087539511225e-06,
"logits/chosen": -47232265.14285714,
"logits/rejected": -35438577.777777776,
"logps/chosen": -169.33925083705358,
"logps/rejected": -180.327880859375,
"loss": 0.3628,
"rewards/chosen": -2.5366314479282925,
"rewards/margins": 1.3937638903421066,
"rewards/rejected": -3.930395338270399,
"step": 57
},
{
"epoch": 0.1606926406926407,
"grad_norm": 3.9659807050855505,
"kl": 254.7626953125,
"learning_rate": 4.721475620392567e-06,
"logits/chosen": -50043021.71428572,
"logits/rejected": -62206862.222222224,
"logps/chosen": -242.54567173549108,
"logps/rejected": -245.458984375,
"loss": 0.2694,
"rewards/chosen": 1.714186668395996,
"rewards/margins": 8.648983319600422,
"rewards/rejected": -6.934796651204427,
"step": 58
},
{
"epoch": 0.16346320346320348,
"grad_norm": 4.532754524040317,
"kl": 254.67286682128906,
"learning_rate": 4.711269489869654e-06,
"logits/chosen": -23121174.0,
"logits/rejected": -25245008.0,
"logps/chosen": -231.70849609375,
"logps/rejected": -182.4578857421875,
"loss": 0.3414,
"rewards/chosen": -0.8669279217720032,
"rewards/margins": 3.1747695803642273,
"rewards/rejected": -4.0416975021362305,
"step": 59
},
{
"epoch": 0.16623376623376623,
"grad_norm": 4.231371855524758,
"kl": 281.6844177246094,
"learning_rate": 4.700891157182729e-06,
"logits/chosen": -42404366.222222224,
"logits/rejected": -37054004.571428575,
"logps/chosen": -141.22482638888889,
"logps/rejected": -169.881591796875,
"loss": 0.4238,
"rewards/chosen": -2.689548068576389,
"rewards/margins": 2.9127321394663013,
"rewards/rejected": -5.60228020804269,
"step": 60
},
{
"epoch": 0.16900432900432902,
"grad_norm": 3.979048810041181,
"kl": 392.581298828125,
"learning_rate": 4.690341430542351e-06,
"logits/chosen": -82912292.57142857,
"logits/rejected": -52836330.666666664,
"logps/chosen": -509.1022251674107,
"logps/rejected": -204.47900390625,
"loss": 0.3324,
"rewards/chosen": 2.8874827793666293,
"rewards/margins": 9.94957036820669,
"rewards/rejected": -7.062087588840061,
"step": 61
},
{
"epoch": 0.17177489177489177,
"grad_norm": 4.714544408668155,
"kl": 215.25192260742188,
"learning_rate": 4.679621131506347e-06,
"logits/chosen": -38413403.428571425,
"logits/rejected": -51266595.55555555,
"logps/chosen": -234.61213030133928,
"logps/rejected": -182.70484754774304,
"loss": 0.3589,
"rewards/chosen": 0.2332275935581752,
"rewards/margins": 3.614927685449994,
"rewards/rejected": -3.3817000918918185,
"step": 62
},
{
"epoch": 0.17454545454545456,
"grad_norm": 4.623389807959627,
"kl": 335.26934814453125,
"learning_rate": 4.668731094915835e-06,
"logits/chosen": -35904576.0,
"logits/rejected": -65528533.333333336,
"logps/chosen": -289.337890625,
"logps/rejected": -275.28566487630206,
"loss": 0.3341,
"rewards/chosen": 1.6814540863037108,
"rewards/margins": 10.296795272827149,
"rewards/rejected": -8.615341186523438,
"step": 63
},
{
"epoch": 0.17731601731601732,
"grad_norm": 4.704432412085312,
"kl": 219.504150390625,
"learning_rate": 4.657672168830211e-06,
"logits/chosen": -73965184.0,
"logits/rejected": -135261056.0,
"logps/chosen": -360.76513671875,
"logps/rejected": -249.28709411621094,
"loss": 0.3758,
"rewards/chosen": 1.1860873699188232,
"rewards/margins": 7.718278646469116,
"rewards/rejected": -6.532191276550293,
"step": 64
},
{
"epoch": 0.1800865800865801,
"grad_norm": 3.4016250966473707,
"kl": 293.42034912109375,
"learning_rate": 4.646445214461105e-06,
"logits/chosen": -68569877.33333333,
"logits/rejected": -70185024.0,
"logps/chosen": -208.67692057291666,
"logps/rejected": -220.83232421875,
"loss": 0.2864,
"rewards/chosen": 1.106031099955241,
"rewards/margins": 8.20595251719157,
"rewards/rejected": -7.099921417236328,
"step": 65
},
{
"epoch": 0.18285714285714286,
"grad_norm": 4.424106473489455,
"kl": 359.9471435546875,
"learning_rate": 4.635051106105316e-06,
"logits/chosen": -53769763.55555555,
"logits/rejected": -45391094.85714286,
"logps/chosen": -188.59453667534723,
"logps/rejected": -173.94886997767858,
"loss": 0.3533,
"rewards/chosen": -2.5977306365966797,
"rewards/margins": 1.459943226405552,
"rewards/rejected": -4.057673863002232,
"step": 66
},
{
"epoch": 0.18562770562770564,
"grad_norm": 3.7471465185818924,
"kl": 283.2303466796875,
"learning_rate": 4.623490731076728e-06,
"logits/chosen": -44121778.28571428,
"logits/rejected": -35914748.44444445,
"logps/chosen": -223.49496023995536,
"logps/rejected": -194.11257595486111,
"loss": 0.2895,
"rewards/chosen": -1.5352796827043806,
"rewards/margins": 1.1657637641543435,
"rewards/rejected": -2.701043446858724,
"step": 67
},
{
"epoch": 0.1883982683982684,
"grad_norm": 4.286110449007166,
"kl": 249.0130615234375,
"learning_rate": 4.6117649896372055e-06,
"logits/chosen": -66104793.6,
"logits/rejected": -78641966.54545455,
"logps/chosen": -486.58681640625,
"logps/rejected": -216.8487215909091,
"loss": 0.3098,
"rewards/chosen": 2.7544036865234376,
"rewards/margins": 8.66450722434304,
"rewards/rejected": -5.9101035378196025,
"step": 68
},
{
"epoch": 0.19116883116883118,
"grad_norm": 4.19634882053574,
"kl": 295.2521667480469,
"learning_rate": 4.59987479492649e-06,
"logits/chosen": -49621673.6,
"logits/rejected": -103387168.0,
"logps/chosen": -292.345458984375,
"logps/rejected": -152.9824015299479,
"loss": 0.3605,
"rewards/chosen": 1.9187255859375,
"rewards/margins": 6.33223876953125,
"rewards/rejected": -4.41351318359375,
"step": 69
},
{
"epoch": 0.19393939393939394,
"grad_norm": 3.4689063655732606,
"kl": 295.6634216308594,
"learning_rate": 4.587821072891089e-06,
"logits/chosen": -85551232.0,
"logits/rejected": -44202093.333333336,
"logps/chosen": -415.968359375,
"logps/rejected": -180.08984375,
"loss": 0.2793,
"rewards/chosen": 3.5712188720703124,
"rewards/margins": 6.219875717163086,
"rewards/rejected": -2.6486568450927734,
"step": 70
},
{
"epoch": 0.19670995670995672,
"grad_norm": 4.285024920845038,
"kl": 258.05487060546875,
"learning_rate": 4.5756047622121665e-06,
"logits/chosen": -50422928.0,
"logits/rejected": -55744752.0,
"logps/chosen": -167.7302703857422,
"logps/rejected": -208.2799072265625,
"loss": 0.3164,
"rewards/chosen": -2.9284355640411377,
"rewards/margins": -0.29325294494628906,
"rewards/rejected": -2.6351826190948486,
"step": 71
},
{
"epoch": 0.19948051948051948,
"grad_norm": 3.6178248205419554,
"kl": 321.04461669921875,
"learning_rate": 4.563226814232444e-06,
"logits/chosen": -57370528.0,
"logits/rejected": -49400672.0,
"logps/chosen": -243.88953993055554,
"logps/rejected": -175.27256556919642,
"loss": 0.3222,
"rewards/chosen": 1.1517394383748372,
"rewards/margins": 4.414692787897019,
"rewards/rejected": -3.262953349522182,
"step": 72
},
{
"epoch": 0.19948051948051948,
"eval_logits/chosen": -61469499.733333334,
"eval_logits/rejected": -32503238.620689657,
"eval_logps/chosen": -344.1068359375,
"eval_logps/rejected": -209.9999158135776,
"eval_loss": 0.32733702659606934,
"eval_rewards/chosen": 1.5219429016113282,
"eval_rewards/margins": 6.449649389858903,
"eval_rewards/rejected": -4.927706488247575,
"eval_runtime": 376.0789,
"eval_samples_per_second": 1.242,
"eval_steps_per_second": 0.157,
"kl": 1189.544677734375,
"step": 72
},
{
"epoch": 0.20225108225108226,
"grad_norm": 4.29914119533165,
"kl": 282.59783935546875,
"learning_rate": 4.550688192882115e-06,
"logits/chosen": -45450176.0,
"logits/rejected": -62472713.14285714,
"logps/chosen": -270.63197157118054,
"logps/rejected": -223.25875418526786,
"loss": 0.308,
"rewards/chosen": 0.047760725021362305,
"rewards/margins": 5.480312517711094,
"rewards/rejected": -5.432551792689732,
"step": 73
},
{
"epoch": 0.20502164502164502,
"grad_norm": 4.0469021106657355,
"kl": 301.9462890625,
"learning_rate": 4.53798987460378e-06,
"logits/chosen": -61264512.0,
"logits/rejected": -70442304.0,
"logps/chosen": -208.86031087239584,
"logps/rejected": -239.4268798828125,
"loss": 0.3314,
"rewards/chosen": -0.09858560562133789,
"rewards/margins": 4.180919170379639,
"rewards/rejected": -4.279504776000977,
"step": 74
},
{
"epoch": 0.2077922077922078,
"grad_norm": 3.9451262684847106,
"kl": 291.61004638671875,
"learning_rate": 4.525132848276405e-06,
"logits/chosen": -51769896.0,
"logits/rejected": -33958032.0,
"logps/chosen": -296.2464599609375,
"logps/rejected": -154.05738830566406,
"loss": 0.2617,
"rewards/chosen": 0.972441554069519,
"rewards/margins": 5.835387110710144,
"rewards/rejected": -4.862945556640625,
"step": 75
},
{
"epoch": 0.21056277056277056,
"grad_norm": 3.6314514523940984,
"kl": 232.65203857421875,
"learning_rate": 4.512118115138315e-06,
"logits/chosen": -67941207.27272727,
"logits/rejected": -48613404.8,
"logps/chosen": -314.12986061789775,
"logps/rejected": -230.21904296875,
"loss": 0.3111,
"rewards/chosen": 3.0080684315074575,
"rewards/margins": 6.707305110584606,
"rewards/rejected": -3.6992366790771483,
"step": 76
},
{
"epoch": 0.21333333333333335,
"grad_norm": 4.06139882760236,
"kl": 281.50775146484375,
"learning_rate": 4.498946688709216e-06,
"logits/chosen": -53087360.0,
"logits/rejected": -53375168.0,
"logps/chosen": -192.54742431640625,
"logps/rejected": -188.837060546875,
"loss": 0.312,
"rewards/chosen": -2.0402072270711265,
"rewards/margins": 1.6422310511271156,
"rewards/rejected": -3.682438278198242,
"step": 77
},
{
"epoch": 0.2161038961038961,
"grad_norm": 3.8100857886583226,
"kl": 203.6710662841797,
"learning_rate": 4.485619594711278e-06,
"logits/chosen": -98739016.0,
"logits/rejected": -57291765.333333336,
"logps/chosen": -229.67535400390625,
"logps/rejected": -275.8813069661458,
"loss": 0.3199,
"rewards/chosen": 2.281548261642456,
"rewards/margins": 7.678904930750529,
"rewards/rejected": -5.397356669108073,
"step": 78
},
{
"epoch": 0.2188744588744589,
"grad_norm": 4.0430201951719384,
"kl": 237.18043518066406,
"learning_rate": 4.4721378709892475e-06,
"logits/chosen": -67218921.14285715,
"logits/rejected": -48233592.88888889,
"logps/chosen": -234.91176060267858,
"logps/rejected": -176.94509548611111,
"loss": 0.3595,
"rewards/chosen": 0.625354358128139,
"rewards/margins": 5.350802118816073,
"rewards/rejected": -4.725447760687934,
"step": 79
},
{
"epoch": 0.22164502164502164,
"grad_norm": 3.6673216672550777,
"kl": 213.51605224609375,
"learning_rate": 4.4585025674296315e-06,
"logits/chosen": -84983160.0,
"logits/rejected": -81099253.33333333,
"logps/chosen": -495.0953063964844,
"logps/rejected": -206.34517415364584,
"loss": 0.2647,
"rewards/chosen": 2.066316604614258,
"rewards/margins": 6.826260566711426,
"rewards/rejected": -4.759943962097168,
"step": 80
},
{
"epoch": 0.22441558441558443,
"grad_norm": 3.6668548319794882,
"kl": 149.62611389160156,
"learning_rate": 4.444714745878936e-06,
"logits/chosen": -53186656.0,
"logits/rejected": -110574705.77777778,
"logps/chosen": -203.04007393973214,
"logps/rejected": -216.20652940538196,
"loss": 0.2927,
"rewards/chosen": -0.9502920423235212,
"rewards/margins": 4.565778853401306,
"rewards/rejected": -5.516070895724827,
"step": 81
},
{
"epoch": 0.22718614718614719,
"grad_norm": 4.419648414760186,
"kl": 177.2197265625,
"learning_rate": 4.430775480060973e-06,
"logits/chosen": -67670118.4,
"logits/rejected": -32177362.666666668,
"logps/chosen": -336.2021728515625,
"logps/rejected": -181.74137369791666,
"loss": 0.3077,
"rewards/chosen": 2.1282627105712892,
"rewards/margins": 5.788774998982747,
"rewards/rejected": -3.6605122884114585,
"step": 82
},
{
"epoch": 0.22995670995670997,
"grad_norm": 3.649359422872617,
"kl": 165.82017517089844,
"learning_rate": 4.416685855493246e-06,
"logits/chosen": -34292243.2,
"logits/rejected": -70526666.66666667,
"logps/chosen": -187.4371826171875,
"logps/rejected": -280.88523356119794,
"loss": 0.3408,
"rewards/chosen": 0.32179970741271974,
"rewards/margins": 7.029301436742147,
"rewards/rejected": -6.707501729329427,
"step": 83
},
{
"epoch": 0.23272727272727273,
"grad_norm": 4.6924596982127635,
"kl": 287.5317687988281,
"learning_rate": 4.4024469694024194e-06,
"logits/chosen": -44100153.6,
"logits/rejected": -76765579.63636364,
"logps/chosen": -231.4888427734375,
"logps/rejected": -254.74940074573863,
"loss": 0.2934,
"rewards/chosen": -4.008915710449219,
"rewards/margins": 1.9597850452769885,
"rewards/rejected": -5.968700755726207,
"step": 84
},
{
"epoch": 0.2354978354978355,
"grad_norm": 3.103062765938464,
"kl": 158.9697723388672,
"learning_rate": 4.388059930638865e-06,
"logits/chosen": -41523650.666666664,
"logits/rejected": -91830340.92307693,
"logps/chosen": -135.86697387695312,
"logps/rejected": -196.91357421875,
"loss": 0.3093,
"rewards/chosen": -4.111440022786458,
"rewards/margins": -0.3413952558468547,
"rewards/rejected": -3.7700447669396033,
"step": 85
},
{
"epoch": 0.23826839826839827,
"grad_norm": 3.792643877482403,
"kl": 121.17691802978516,
"learning_rate": 4.373525859590313e-06,
"logits/chosen": -62676300.8,
"logits/rejected": -68556032.0,
"logps/chosen": -301.734521484375,
"logps/rejected": -153.32867431640625,
"loss": 0.2263,
"rewards/chosen": 1.7410552978515625,
"rewards/margins": 6.963522084554036,
"rewards/rejected": -5.222466786702474,
"step": 86
},
{
"epoch": 0.24103896103896105,
"grad_norm": 4.249459927433365,
"kl": 259.1443176269531,
"learning_rate": 4.358845888094607e-06,
"logits/chosen": -41253852.8,
"logits/rejected": -31193773.333333332,
"logps/chosen": -269.636181640625,
"logps/rejected": -107.13633219401042,
"loss": 0.2666,
"rewards/chosen": 0.29608802795410155,
"rewards/margins": 3.969274012247721,
"rewards/rejected": -3.6731859842936196,
"step": 87
},
{
"epoch": 0.2438095238095238,
"grad_norm": 4.075168086387706,
"kl": 334.02734375,
"learning_rate": 4.3440211593515556e-06,
"logits/chosen": -50688288.0,
"logits/rejected": -126751398.4,
"logps/chosen": -228.28106689453125,
"logps/rejected": -291.38564453125,
"loss": 0.3756,
"rewards/chosen": -0.46695244312286377,
"rewards/margins": 7.10467689037323,
"rewards/rejected": -7.571629333496094,
"step": 88
},
{
"epoch": 0.2465800865800866,
"grad_norm": 4.06016767816469,
"kl": 237.378662109375,
"learning_rate": 4.32905282783391e-06,
"logits/chosen": -64327445.333333336,
"logits/rejected": 108107328.0,
"logps/chosen": -239.98824055989584,
"logps/rejected": -162.29722900390624,
"loss": 0.3375,
"rewards/chosen": 0.16190481185913086,
"rewards/margins": 6.292160320281982,
"rewards/rejected": -6.130255508422851,
"step": 89
},
{
"epoch": 0.24935064935064935,
"grad_norm": 5.14470621992649,
"kl": 298.228515625,
"learning_rate": 4.313942059197457e-06,
"logits/chosen": -20689902.666666668,
"logits/rejected": -37410044.8,
"logps/chosen": -165.82305908203125,
"logps/rejected": -172.006689453125,
"loss": 0.3189,
"rewards/chosen": -1.7497021357218425,
"rewards/margins": 1.8894381841023764,
"rewards/rejected": -3.639140319824219,
"step": 90
},
{
"epoch": 0.25212121212121213,
"grad_norm": 4.584573892248193,
"kl": 270.75927734375,
"learning_rate": 4.298690030190247e-06,
"logits/chosen": -59297584.0,
"logits/rejected": -47739494.4,
"logps/chosen": -452.8969319661458,
"logps/rejected": -150.6827392578125,
"loss": 0.2776,
"rewards/chosen": 2.8319133122762046,
"rewards/margins": 7.6222434361775715,
"rewards/rejected": -4.790330123901367,
"step": 91
},
{
"epoch": 0.2548917748917749,
"grad_norm": 4.287301568077891,
"kl": 227.37271118164062,
"learning_rate": 4.283297928560951e-06,
"logits/chosen": -53382579.2,
"logits/rejected": -54202240.0,
"logps/chosen": -175.3722412109375,
"logps/rejected": -285.7312825520833,
"loss": 0.3621,
"rewards/chosen": -0.47195091247558596,
"rewards/margins": 4.055190658569336,
"rewards/rejected": -4.527141571044922,
"step": 92
},
{
"epoch": 0.25766233766233765,
"grad_norm": 4.220829228867934,
"kl": 398.3648681640625,
"learning_rate": 4.267766952966369e-06,
"logits/chosen": -63718946.90909091,
"logits/rejected": -42242217.6,
"logps/chosen": -335.96311257102275,
"logps/rejected": -149.02587890625,
"loss": 0.3372,
"rewards/chosen": 0.7265178507024591,
"rewards/margins": 3.0149881189519707,
"rewards/rejected": -2.2884702682495117,
"step": 93
},
{
"epoch": 0.26043290043290046,
"grad_norm": 4.358702143381746,
"kl": 264.9898681640625,
"learning_rate": 4.252098312878083e-06,
"logits/chosen": -79301941.33333333,
"logits/rejected": -43912708.0,
"logps/chosen": -302.7877604166667,
"logps/rejected": -173.28231811523438,
"loss": 0.3621,
"rewards/chosen": 0.2944912115732829,
"rewards/margins": 3.817267100016276,
"rewards/rejected": -3.522775888442993,
"step": 94
},
{
"epoch": 0.2632034632034632,
"grad_norm": 4.254924656611504,
"kl": 330.5307922363281,
"learning_rate": 4.236293228488267e-06,
"logits/chosen": -67833432.0,
"logits/rejected": -78841848.0,
"logps/chosen": -237.7548065185547,
"logps/rejected": -250.9872589111328,
"loss": 0.3272,
"rewards/chosen": 0.7279956340789795,
"rewards/margins": 5.013963460922241,
"rewards/rejected": -4.285967826843262,
"step": 95
},
{
"epoch": 0.265974025974026,
"grad_norm": 4.712066217533106,
"kl": 237.3966522216797,
"learning_rate": 4.220352930614672e-06,
"logits/chosen": -83831668.36363636,
"logits/rejected": -39652540.8,
"logps/chosen": -279.5663396661932,
"logps/rejected": -141.55482177734376,
"loss": 0.337,
"rewards/chosen": 1.9667802290482954,
"rewards/margins": 5.22833303971724,
"rewards/rejected": -3.2615528106689453,
"step": 96
},
{
"epoch": 0.26874458874458873,
"grad_norm": 3.756148033952096,
"kl": 363.8216247558594,
"learning_rate": 4.204278660604767e-06,
"logits/chosen": -42908544.0,
"logits/rejected": -70756288.0,
"logps/chosen": -437.5492350260417,
"logps/rejected": -170.53684645432693,
"loss": 0.3529,
"rewards/chosen": 3.5399929682413735,
"rewards/margins": 9.0660737600082,
"rewards/rejected": -5.526080791766827,
"step": 97
},
{
"epoch": 0.27151515151515154,
"grad_norm": 4.179831214257625,
"kl": 269.4178466796875,
"learning_rate": 4.1880716702390764e-06,
"logits/chosen": -70139897.6,
"logits/rejected": -164937994.66666666,
"logps/chosen": -314.740185546875,
"logps/rejected": -187.75248209635416,
"loss": 0.2662,
"rewards/chosen": 1.2780502319335938,
"rewards/margins": 2.9210824648539226,
"rewards/rejected": -1.6430322329203289,
"step": 98
},
{
"epoch": 0.2742857142857143,
"grad_norm": 4.691897896464608,
"kl": 446.2198486328125,
"learning_rate": 4.171733221633695e-06,
"logits/chosen": -42673925.333333336,
"logits/rejected": -22087344.0,
"logps/chosen": -203.5780029296875,
"logps/rejected": -194.28848266601562,
"loss": 0.3054,
"rewards/chosen": -0.28914421796798706,
"rewards/margins": 3.8305423855781555,
"rewards/rejected": -4.119686603546143,
"step": 99
},
{
"epoch": 0.27705627705627706,
"grad_norm": 4.964944191060874,
"kl": 455.47906494140625,
"learning_rate": 4.155264587142002e-06,
"logits/chosen": -65384675.55555555,
"logits/rejected": -41647456.0,
"logps/chosen": -266.32972547743054,
"logps/rejected": -211.07502092633928,
"loss": 0.3833,
"rewards/chosen": 0.7655517260233561,
"rewards/margins": 3.0700247855413525,
"rewards/rejected": -2.3044730595179965,
"step": 100
},
{
"epoch": 0.2798268398268398,
"grad_norm": 3.8792329085023534,
"kl": 308.05084228515625,
"learning_rate": 4.138667049255574e-06,
"logits/chosen": -21252804.8,
"logits/rejected": -42198394.18181818,
"logps/chosen": -239.7299072265625,
"logps/rejected": -214.3200017755682,
"loss": 0.3334,
"rewards/chosen": 2.0222194671630858,
"rewards/margins": 7.540434993397105,
"rewards/rejected": -5.51821552623402,
"step": 101
},
{
"epoch": 0.2825974025974026,
"grad_norm": 3.9057066201741293,
"kl": 234.1949462890625,
"learning_rate": 4.121941900504316e-06,
"logits/chosen": -54799564.0,
"logits/rejected": -59120932.0,
"logps/chosen": -195.52490234375,
"logps/rejected": -255.46969604492188,
"loss": 0.323,
"rewards/chosen": -2.307999610900879,
"rewards/margins": 5.062156677246094,
"rewards/rejected": -7.370156288146973,
"step": 102
},
{
"epoch": 0.2853679653679654,
"grad_norm": 3.4089811236807392,
"kl": 329.79876708984375,
"learning_rate": 4.105090443355801e-06,
"logits/chosen": -71939761.77777778,
"logits/rejected": -86891346.28571428,
"logps/chosen": -248.69297960069446,
"logps/rejected": -295.9298618861607,
"loss": 0.3506,
"rewards/chosen": -2.3155670166015625,
"rewards/margins": 4.571990966796875,
"rewards/rejected": -6.8875579833984375,
"step": 103
},
{
"epoch": 0.28813852813852814,
"grad_norm": 3.896313964551072,
"kl": 173.1775665283203,
"learning_rate": 4.088113990113846e-06,
"logits/chosen": -39651421.333333336,
"logits/rejected": -83678412.8,
"logps/chosen": -153.03709920247397,
"logps/rejected": -227.3532470703125,
"loss": 0.3179,
"rewards/chosen": 0.045263449350992836,
"rewards/margins": 5.96420914332072,
"rewards/rejected": -5.918945693969727,
"step": 104
},
{
"epoch": 0.2909090909090909,
"grad_norm": 4.60023712492313,
"kl": 248.22030639648438,
"learning_rate": 4.071013862816311e-06,
"logits/chosen": -33611790.222222224,
"logits/rejected": -75680219.42857143,
"logps/chosen": -215.91140407986111,
"logps/rejected": -202.12782505580358,
"loss": 0.2801,
"rewards/chosen": -3.120152791341146,
"rewards/margins": 4.2319252377464665,
"rewards/rejected": -7.352078029087612,
"step": 105
},
{
"epoch": 0.2936796536796537,
"grad_norm": 3.278250351800608,
"kl": 256.5851745605469,
"learning_rate": 4.0537913931321495e-06,
"logits/chosen": -61538093.71428572,
"logits/rejected": -32800387.555555556,
"logps/chosen": -244.89592633928572,
"logps/rejected": -191.21028645833334,
"loss": 0.3477,
"rewards/chosen": 0.4793204239436558,
"rewards/margins": 2.215007361911592,
"rewards/rejected": -1.7356869379679363,
"step": 106
},
{
"epoch": 0.29645021645021646,
"grad_norm": 3.346665787200658,
"kl": 281.5144348144531,
"learning_rate": 4.036447922257699e-06,
"logits/chosen": -63766696.0,
"logits/rejected": -166489008.0,
"logps/chosen": -335.6734924316406,
"logps/rejected": -220.5155792236328,
"loss": 0.291,
"rewards/chosen": 3.1485161781311035,
"rewards/margins": 8.604130268096924,
"rewards/rejected": -5.45561408996582,
"step": 107
},
{
"epoch": 0.2992207792207792,
"grad_norm": 3.9845639945750104,
"kl": 216.4145050048828,
"learning_rate": 4.018984800812248e-06,
"logits/chosen": -61535150.54545455,
"logits/rejected": -26694153.6,
"logps/chosen": -263.3902476917614,
"logps/rejected": -140.8936767578125,
"loss": 0.3798,
"rewards/chosen": -2.7628853537819604,
"rewards/margins": -0.11502139351584706,
"rewards/rejected": -2.6478639602661134,
"step": 108
},
{
"epoch": 0.2992207792207792,
"eval_logits/chosen": -63003302.4,
"eval_logits/rejected": -31597484.137931034,
"eval_logps/chosen": -343.75305989583336,
"eval_logps/rejected": -217.80376144935346,
"eval_loss": 0.3185268044471741,
"eval_rewards/chosen": 1.5573221842447917,
"eval_rewards/margins": 7.265413130836926,
"eval_rewards/rejected": -5.708090946592134,
"eval_runtime": 375.7756,
"eval_samples_per_second": 1.243,
"eval_steps_per_second": 0.157,
"kl": 955.4994506835938,
"step": 108
},
{
"epoch": 0.301991341991342,
"grad_norm": 2.6213775523227443,
"kl": 135.88255310058594,
"learning_rate": 4.001403388732842e-06,
"logits/chosen": -44565760.0,
"logits/rejected": -44044765.09090909,
"logps/chosen": -208.237841796875,
"logps/rejected": -185.643310546875,
"loss": 0.2488,
"rewards/chosen": -2.5330881118774413,
"rewards/margins": 3.4011062795465645,
"rewards/rejected": -5.934194391424006,
"step": 109
},
{
"epoch": 0.3047619047619048,
"grad_norm": 3.884074940842948,
"kl": 172.53640747070312,
"learning_rate": 3.983705055168391e-06,
"logits/chosen": -60829255.11111111,
"logits/rejected": -39536137.14285714,
"logps/chosen": -338.5453287760417,
"logps/rejected": -126.38860212053571,
"loss": 0.329,
"rewards/chosen": 3.6447732713487415,
"rewards/margins": 5.23764221251957,
"rewards/rejected": -1.5928689411708288,
"step": 110
},
{
"epoch": 0.30753246753246755,
"grad_norm": 3.1610714574942866,
"kl": 194.14366149902344,
"learning_rate": 3.965891178373038e-06,
"logits/chosen": -60445340.44444445,
"logits/rejected": -173474523.42857143,
"logps/chosen": -347.3582356770833,
"logps/rejected": -189.652587890625,
"loss": 0.3257,
"rewards/chosen": 2.0447090996636286,
"rewards/margins": 3.7555268454173256,
"rewards/rejected": -1.710817745753697,
"step": 111
},
{
"epoch": 0.3103030303030303,
"grad_norm": 4.309649858922247,
"kl": 251.73069763183594,
"learning_rate": 3.947963145598833e-06,
"logits/chosen": -51157600.0,
"logits/rejected": -30458390.85714286,
"logps/chosen": -367.60199652777777,
"logps/rejected": -172.21843610491072,
"loss": 0.3487,
"rewards/chosen": -1.202609380086263,
"rewards/margins": 2.800561723254976,
"rewards/rejected": -4.003171103341239,
"step": 112
},
{
"epoch": 0.31307359307359306,
"grad_norm": 3.682730545477106,
"kl": 298.24200439453125,
"learning_rate": 3.929922352987702e-06,
"logits/chosen": -75188368.0,
"logits/rejected": -38451584.0,
"logps/chosen": -226.4161580403646,
"logps/rejected": -158.42171630859374,
"loss": 0.3574,
"rewards/chosen": 3.0447769165039062,
"rewards/margins": 5.327561569213867,
"rewards/rejected": -2.282784652709961,
"step": 113
},
{
"epoch": 0.31584415584415587,
"grad_norm": 3.8491174271088955,
"kl": 233.29315185546875,
"learning_rate": 3.911770205462717e-06,
"logits/chosen": -36310968.88888889,
"logits/rejected": -142471716.57142857,
"logps/chosen": -199.12166341145834,
"logps/rejected": -214.38889857700892,
"loss": 0.3359,
"rewards/chosen": -4.300041198730469,
"rewards/margins": 0.7065179007393976,
"rewards/rejected": -5.006559099469866,
"step": 114
},
{
"epoch": 0.31861471861471863,
"grad_norm": 3.301544661387389,
"kl": 165.22003173828125,
"learning_rate": 3.8935081166186935e-06,
"logits/chosen": -64191436.8,
"logits/rejected": -44381066.666666664,
"logps/chosen": -166.80452880859374,
"logps/rejected": -209.62703450520834,
"loss": 0.3044,
"rewards/chosen": -3.046977233886719,
"rewards/margins": 1.0728163401285808,
"rewards/rejected": -4.1197935740153,
"step": 115
},
{
"epoch": 0.3213852813852814,
"grad_norm": 3.8281271473973297,
"kl": 377.83355712890625,
"learning_rate": 3.875137508612104e-06,
"logits/chosen": -35862681.6,
"logits/rejected": -123204629.33333333,
"logps/chosen": -174.58330078125,
"logps/rejected": -278.10194905598956,
"loss": 0.3275,
"rewards/chosen": 0.8950153350830078,
"rewards/margins": 5.497071202596028,
"rewards/rejected": -4.6020558675130205,
"step": 116
},
{
"epoch": 0.32415584415584414,
"grad_norm": 4.141911435738275,
"kl": 301.08831787109375,
"learning_rate": 3.856659812050328e-06,
"logits/chosen": -68912918.85714285,
"logits/rejected": -45419395.55555555,
"logps/chosen": -278.12374441964283,
"logps/rejected": -215.42805989583334,
"loss": 0.3665,
"rewards/chosen": -2.4060380118233815,
"rewards/margins": 1.0650618416922435,
"rewards/rejected": -3.471099853515625,
"step": 117
},
{
"epoch": 0.32692640692640695,
"grad_norm": 3.9655628188506165,
"kl": 279.1497802734375,
"learning_rate": 3.838076465880248e-06,
"logits/chosen": -91121265.77777778,
"logits/rejected": -51099254.85714286,
"logps/chosen": -402.9201388888889,
"logps/rejected": -193.41775948660714,
"loss": 0.3269,
"rewards/chosen": 3.7855517069498696,
"rewards/margins": 7.851486296880813,
"rewards/rejected": -4.065934589930943,
"step": 118
},
{
"epoch": 0.3296969696969697,
"grad_norm": 5.104200711024875,
"kl": 317.8056640625,
"learning_rate": 3.819388917276186e-06,
"logits/chosen": -57592173.71428572,
"logits/rejected": -96817816.0,
"logps/chosen": -305.020751953125,
"logps/rejected": -300.74017333984375,
"loss": 0.3944,
"rewards/chosen": 0.34333719526018414,
"rewards/margins": 7.173882620675223,
"rewards/rejected": -6.830545425415039,
"step": 119
},
{
"epoch": 0.33246753246753247,
"grad_norm": 4.431842355084121,
"kl": 320.4718017578125,
"learning_rate": 3.8005986215272056e-06,
"logits/chosen": -69471952.0,
"logits/rejected": -88022208.0,
"logps/chosen": -294.1892395019531,
"logps/rejected": -163.56695556640625,
"loss": 0.3068,
"rewards/chosen": -1.5611099004745483,
"rewards/margins": 1.6267324686050415,
"rewards/rejected": -3.18784236907959,
"step": 120
},
{
"epoch": 0.3352380952380952,
"grad_norm": 3.938866395899551,
"kl": 348.1075439453125,
"learning_rate": 3.7817070419237866e-06,
"logits/chosen": -45101536.0,
"logits/rejected": -64612682.666666664,
"logps/chosen": -181.74100341796876,
"logps/rejected": -277.8824055989583,
"loss": 0.3559,
"rewards/chosen": -3.265227508544922,
"rewards/margins": 2.3906384785970047,
"rewards/rejected": -5.655865987141927,
"step": 121
},
{
"epoch": 0.33800865800865804,
"grad_norm": 4.385482507170866,
"kl": 215.3175506591797,
"learning_rate": 3.7627156496438686e-06,
"logits/chosen": -59783303.11111111,
"logits/rejected": -68396731.42857143,
"logps/chosen": -289.76850043402777,
"logps/rejected": -293.2662876674107,
"loss": 0.2996,
"rewards/chosen": 0.33743249045477974,
"rewards/margins": 7.059532884567503,
"rewards/rejected": -6.722100394112723,
"step": 122
},
{
"epoch": 0.3407792207792208,
"grad_norm": 3.0031329772352704,
"kl": 279.9674987792969,
"learning_rate": 3.7436259236382797e-06,
"logits/chosen": -52341881.6,
"logits/rejected": -52982293.333333336,
"logps/chosen": -345.33056640625,
"logps/rejected": -214.87158203125,
"loss": 0.3117,
"rewards/chosen": 2.501374435424805,
"rewards/margins": 6.737306785583496,
"rewards/rejected": -4.235932350158691,
"step": 123
},
{
"epoch": 0.34354978354978355,
"grad_norm": 3.828989254174337,
"kl": 322.56781005859375,
"learning_rate": 3.7244393505155713e-06,
"logits/chosen": -55003168.0,
"logits/rejected": -38429700.571428575,
"logps/chosen": -319.0847439236111,
"logps/rejected": -220.10914829799108,
"loss": 0.3261,
"rewards/chosen": -4.397332509358724,
"rewards/margins": 3.908439454578219,
"rewards/rejected": -8.305771963936943,
"step": 124
},
{
"epoch": 0.3463203463203463,
"grad_norm": 3.3716001758196614,
"kl": 175.434326171875,
"learning_rate": 3.7051574244262412e-06,
"logits/chosen": -63195132.0,
"logits/rejected": -43530908.0,
"logps/chosen": -322.2129821777344,
"logps/rejected": -238.111572265625,
"loss": 0.2878,
"rewards/chosen": 3.3598997592926025,
"rewards/margins": 11.616735696792603,
"rewards/rejected": -8.2568359375,
"step": 125
},
{
"epoch": 0.3490909090909091,
"grad_norm": 3.520834555125108,
"kl": 336.7948303222656,
"learning_rate": 3.6857816469463806e-06,
"logits/chosen": -45862705.777777776,
"logits/rejected": -61132077.71428572,
"logps/chosen": -182.07706705729166,
"logps/rejected": -235.33091517857142,
"loss": 0.3024,
"rewards/chosen": -0.025241321987575956,
"rewards/margins": 6.8694152680654375,
"rewards/rejected": -6.894656590053013,
"step": 126
},
{
"epoch": 0.3518614718614719,
"grad_norm": 3.6860559156292867,
"kl": 245.18157958984375,
"learning_rate": 3.6663135269607413e-06,
"logits/chosen": -54326741.333333336,
"logits/rejected": -32279739.42857143,
"logps/chosen": -279.1733669704861,
"logps/rejected": -167.6710205078125,
"loss": 0.2751,
"rewards/chosen": -0.2571737501356337,
"rewards/margins": 3.168460906498016,
"rewards/rejected": -3.4256346566336497,
"step": 127
},
{
"epoch": 0.35463203463203463,
"grad_norm": 3.557862256362831,
"kl": 276.58953857421875,
"learning_rate": 3.6467545805452266e-06,
"logits/chosen": -58534702.54545455,
"logits/rejected": -44616534.4,
"logps/chosen": -288.982666015625,
"logps/rejected": -202.448095703125,
"loss": 0.3473,
"rewards/chosen": 1.2414755387739702,
"rewards/margins": 3.335013250871138,
"rewards/rejected": -2.093537712097168,
"step": 128
},
{
"epoch": 0.3574025974025974,
"grad_norm": 3.2589691814781454,
"kl": 309.430419921875,
"learning_rate": 3.6271063308488298e-06,
"logits/chosen": -50912728.0,
"logits/rejected": -39484352.0,
"logps/chosen": -172.81301879882812,
"logps/rejected": -150.67156982421875,
"loss": 0.2654,
"rewards/chosen": -0.07838886976242065,
"rewards/margins": 5.625042259693146,
"rewards/rejected": -5.703431129455566,
"step": 129
},
{
"epoch": 0.3601731601731602,
"grad_norm": 4.37989836230213,
"kl": 288.1458740234375,
"learning_rate": 3.6073703079750204e-06,
"logits/chosen": -86152466.28571428,
"logits/rejected": -94912405.33333333,
"logps/chosen": -428.32386997767856,
"logps/rejected": -225.74245876736111,
"loss": 0.2825,
"rewards/chosen": 6.9850311279296875,
"rewards/margins": 13.19218275282118,
"rewards/rejected": -6.207151624891493,
"step": 130
},
{
"epoch": 0.36294372294372296,
"grad_norm": 2.943643589164811,
"kl": 219.82888793945312,
"learning_rate": 3.5875480488625847e-06,
"logits/chosen": -61464201.14285714,
"logits/rejected": -75437624.8888889,
"logps/chosen": -230.37840053013392,
"logps/rejected": -146.04918077256946,
"loss": 0.3034,
"rewards/chosen": -2.431492396763393,
"rewards/margins": 2.6535274566165987,
"rewards/rejected": -5.0850198533799915,
"step": 131
},
{
"epoch": 0.3657142857142857,
"grad_norm": 2.5894854276111055,
"kl": 137.6978759765625,
"learning_rate": 3.5676410971659404e-06,
"logits/chosen": -48515029.333333336,
"logits/rejected": -53117555.2,
"logps/chosen": -259.1162923177083,
"logps/rejected": -156.87340087890624,
"loss": 0.3241,
"rewards/chosen": -2.674201329549154,
"rewards/margins": 3.1144594828287757,
"rewards/rejected": -5.7886608123779295,
"step": 132
},
{
"epoch": 0.36848484848484847,
"grad_norm": 3.4611805226691934,
"kl": 284.0462646484375,
"learning_rate": 3.547651003134921e-06,
"logits/chosen": -50902294.4,
"logits/rejected": -29886968.0,
"logps/chosen": -316.385986328125,
"logps/rejected": -143.97941080729166,
"loss": 0.2921,
"rewards/chosen": 1.024208927154541,
"rewards/margins": 4.237102095286051,
"rewards/rejected": -3.2128931681315103,
"step": 133
},
{
"epoch": 0.3712554112554113,
"grad_norm": 3.4173731166240695,
"kl": 223.8057098388672,
"learning_rate": 3.527579323494055e-06,
"logits/chosen": -55733544.0,
"logits/rejected": -58133440.0,
"logps/chosen": -282.2908935546875,
"logps/rejected": -204.7510223388672,
"loss": 0.2934,
"rewards/chosen": 2.8473093509674072,
"rewards/margins": 9.089063882827759,
"rewards/rejected": -6.241754531860352,
"step": 134
},
{
"epoch": 0.37402597402597404,
"grad_norm": 4.5075933419621395,
"kl": 226.37472534179688,
"learning_rate": 3.507427621321331e-06,
"logits/chosen": -44690326.85714286,
"logits/rejected": -40422179.55555555,
"logps/chosen": -197.55255998883928,
"logps/rejected": -173.03732638888889,
"loss": 0.2774,
"rewards/chosen": 0.318291323525565,
"rewards/margins": 3.7029942179483077,
"rewards/rejected": -3.384702894422743,
"step": 135
},
{
"epoch": 0.3767965367965368,
"grad_norm": 2.9825634868618396,
"kl": 225.9692840576172,
"learning_rate": 3.4871974659264786e-06,
"logits/chosen": -103750326.85714285,
"logits/rejected": -36922702.222222224,
"logps/chosen": -330.4085170200893,
"logps/rejected": -195.47115071614584,
"loss": 0.3326,
"rewards/chosen": 0.9742650985717773,
"rewards/margins": 7.700783517625597,
"rewards/rejected": -6.72651841905382,
"step": 136
},
{
"epoch": 0.37956709956709955,
"grad_norm": 3.4440138918363106,
"kl": 208.99081420898438,
"learning_rate": 3.466890432728754e-06,
"logits/chosen": -44583936.0,
"logits/rejected": -61040928.0,
"logps/chosen": -263.41937255859375,
"logps/rejected": -275.160888671875,
"loss": 0.3081,
"rewards/chosen": 0.680980920791626,
"rewards/margins": 7.2455079555511475,
"rewards/rejected": -6.5645270347595215,
"step": 137
},
{
"epoch": 0.38233766233766237,
"grad_norm": 4.447615914301488,
"kl": 231.37353515625,
"learning_rate": 3.446508103134259e-06,
"logits/chosen": -56082509.71428572,
"logits/rejected": -36391459.55555555,
"logps/chosen": -301.94363839285717,
"logps/rejected": -130.56571451822916,
"loss": 0.3302,
"rewards/chosen": 0.23018741607666016,
"rewards/margins": 5.488740815056695,
"rewards/rejected": -5.258553398980035,
"step": 138
},
{
"epoch": 0.3851082251082251,
"grad_norm": 3.227144245083403,
"kl": 110.66608428955078,
"learning_rate": 3.426052064412785e-06,
"logits/chosen": -61858517.333333336,
"logits/rejected": -84151936.0,
"logps/chosen": -326.2563883463542,
"logps/rejected": -106.5555419921875,
"loss": 0.349,
"rewards/chosen": -1.6193631490071614,
"rewards/margins": 1.0518550078074138,
"rewards/rejected": -2.671218156814575,
"step": 139
},
{
"epoch": 0.3878787878787879,
"grad_norm": 2.962046268391638,
"kl": 226.14247131347656,
"learning_rate": 3.4055239095742067e-06,
"logits/chosen": -31455502.222222224,
"logits/rejected": -69333769.14285715,
"logps/chosen": -165.690185546875,
"logps/rejected": -210.72841099330358,
"loss": 0.2861,
"rewards/chosen": -4.098465389675564,
"rewards/margins": 1.3641238742404518,
"rewards/rejected": -5.462589263916016,
"step": 140
},
{
"epoch": 0.39064935064935064,
"grad_norm": 3.437549771241675,
"kl": 123.592529296875,
"learning_rate": 3.3849252372444295e-06,
"logits/chosen": -76042393.6,
"logits/rejected": -40568626.666666664,
"logps/chosen": -320.3619140625,
"logps/rejected": -211.34452311197916,
"loss": 0.3302,
"rewards/chosen": 0.19550623893737792,
"rewards/margins": 6.736018323898316,
"rewards/rejected": -6.5405120849609375,
"step": 141
},
{
"epoch": 0.39341991341991345,
"grad_norm": 3.8510252801665685,
"kl": 131.01315307617188,
"learning_rate": 3.364257651540891e-06,
"logits/chosen": -62763580.0,
"logits/rejected": -93990400.0,
"logps/chosen": -323.82330322265625,
"logps/rejected": -222.23504638671875,
"loss": 0.3731,
"rewards/chosen": -1.7098395824432373,
"rewards/margins": 6.021216630935669,
"rewards/rejected": -7.731056213378906,
"step": 142
},
{
"epoch": 0.3961904761904762,
"grad_norm": 3.127952716350436,
"kl": 188.2672119140625,
"learning_rate": 3.343522761947646e-06,
"logits/chosen": -48637932.8,
"logits/rejected": -127404981.33333333,
"logps/chosen": -202.50628662109375,
"logps/rejected": -183.98612467447916,
"loss": 0.2592,
"rewards/chosen": -1.46456880569458,
"rewards/margins": 6.765842723846435,
"rewards/rejected": -8.230411529541016,
"step": 143
},
{
"epoch": 0.39896103896103896,
"grad_norm": 3.686618621599935,
"kl": 201.80601501464844,
"learning_rate": 3.322722183190025e-06,
"logits/chosen": -69373568.0,
"logits/rejected": -74651648.0,
"logps/chosen": -295.803564453125,
"logps/rejected": -173.57267252604166,
"loss": 0.3755,
"rewards/chosen": -1.9531005859375,
"rewards/margins": 4.711961364746093,
"rewards/rejected": -6.665061950683594,
"step": 144
},
{
"epoch": 0.39896103896103896,
"eval_logits/chosen": -63924428.8,
"eval_logits/rejected": -27711788.137931034,
"eval_logps/chosen": -350.41809895833336,
"eval_logps/rejected": -229.70920325969828,
"eval_loss": 0.30155444145202637,
"eval_rewards/chosen": 0.8908199310302735,
"eval_rewards/margins": 7.7894547429578065,
"eval_rewards/rejected": -6.898634811927533,
"eval_runtime": 375.8146,
"eval_samples_per_second": 1.243,
"eval_steps_per_second": 0.157,
"kl": 705.8950805664062,
"step": 144
},
{
"epoch": 0.4017316017316017,
"grad_norm": 3.4461177641035965,
"kl": 185.58932495117188,
"learning_rate": 3.3018575351088894e-06,
"logits/chosen": -48496684.8,
"logits/rejected": -53352704.0,
"logps/chosen": -207.49033203125,
"logps/rejected": -178.48878062855113,
"loss": 0.3813,
"rewards/chosen": -6.067684555053711,
"rewards/margins": -0.9541293404319076,
"rewards/rejected": -5.113555214621804,
"step": 145
},
{
"epoch": 0.40450216450216453,
"grad_norm": 2.9521823101965525,
"kl": 143.91859436035156,
"learning_rate": 3.280930442534486e-06,
"logits/chosen": -52825682.28571428,
"logits/rejected": -66124664.88888889,
"logps/chosen": -161.79621233258928,
"logps/rejected": -206.44997829861111,
"loss": 0.3756,
"rewards/chosen": -3.0689896174839566,
"rewards/margins": -1.2850553421747117,
"rewards/rejected": -1.7839342753092449,
"step": 146
},
{
"epoch": 0.4072727272727273,
"grad_norm": 4.456496036133048,
"kl": 124.10264587402344,
"learning_rate": 3.2599425351599136e-06,
"logits/chosen": -57379889.777777776,
"logits/rejected": -68506422.85714285,
"logps/chosen": -192.18386501736111,
"logps/rejected": -231.92295619419642,
"loss": 0.3121,
"rewards/chosen": -1.923858854505751,
"rewards/margins": 4.669862807743133,
"rewards/rejected": -6.593721662248884,
"step": 147
},
{
"epoch": 0.41004329004329004,
"grad_norm": 3.6744852920431432,
"kl": 282.42156982421875,
"learning_rate": 3.238895447414211e-06,
"logits/chosen": -50177578.666666664,
"logits/rejected": -54207213.71428572,
"logps/chosen": -243.22791883680554,
"logps/rejected": -223.31515066964286,
"loss": 0.3778,
"rewards/chosen": 3.0299839443630643,
"rewards/margins": 5.608355809771826,
"rewards/rejected": -2.5783718654087613,
"step": 148
},
{
"epoch": 0.4128138528138528,
"grad_norm": 3.6093960850994122,
"kl": 193.4736328125,
"learning_rate": 3.217790818335077e-06,
"logits/chosen": -41057161.6,
"logits/rejected": -60332170.666666664,
"logps/chosen": -195.6895263671875,
"logps/rejected": -261.66713460286456,
"loss": 0.3093,
"rewards/chosen": -3.0542125701904297,
"rewards/margins": 4.613312403361003,
"rewards/rejected": -7.667524973551433,
"step": 149
},
{
"epoch": 0.4155844155844156,
"grad_norm": 2.9227143339919324,
"kl": 328.6368103027344,
"learning_rate": 3.196630291441231e-06,
"logits/chosen": -46167304.0,
"logits/rejected": -39402808.0,
"logps/chosen": -200.86553955078125,
"logps/rejected": -197.71485900878906,
"loss": 0.3118,
"rewards/chosen": 0.24297916889190674,
"rewards/margins": 4.419718384742737,
"rewards/rejected": -4.17673921585083,
"step": 150
},
{
"epoch": 0.41835497835497837,
"grad_norm": 3.5809278836703617,
"kl": 287.126708984375,
"learning_rate": 3.175415514604422e-06,
"logits/chosen": -82066352.0,
"logits/rejected": -65108904.0,
"logps/chosen": -405.883056640625,
"logps/rejected": -179.24244689941406,
"loss": 0.3376,
"rewards/chosen": 3.3306524753570557,
"rewards/margins": 8.936949968338013,
"rewards/rejected": -5.606297492980957,
"step": 151
},
{
"epoch": 0.4211255411255411,
"grad_norm": 4.070395809564026,
"kl": 196.88804626464844,
"learning_rate": 3.154148139921102e-06,
"logits/chosen": -44551419.428571425,
"logits/rejected": -40765962.666666664,
"logps/chosen": -169.73423549107142,
"logps/rejected": -266.4632161458333,
"loss": 0.259,
"rewards/chosen": -1.6945086887904577,
"rewards/margins": 6.6088563828241265,
"rewards/rejected": -8.303365071614584,
"step": 152
},
{
"epoch": 0.4238961038961039,
"grad_norm": 4.302061820298429,
"kl": 134.61264038085938,
"learning_rate": 3.132829823583771e-06,
"logits/chosen": -67628433.45454545,
"logits/rejected": -65108883.2,
"logps/chosen": -268.54106001420456,
"logps/rejected": -281.208203125,
"loss": 0.3274,
"rewards/chosen": -1.7147686698219993,
"rewards/margins": 4.980678107521751,
"rewards/rejected": -6.69544677734375,
"step": 153
},
{
"epoch": 0.4266666666666667,
"grad_norm": 3.639837464165828,
"kl": 190.67416381835938,
"learning_rate": 3.1114622257520004e-06,
"logits/chosen": -50159235.2,
"logits/rejected": -213066624.0,
"logps/chosen": -301.3509521484375,
"logps/rejected": -239.82194010416666,
"loss": 0.3034,
"rewards/chosen": 0.7694726943969726,
"rewards/margins": 7.645719464619954,
"rewards/rejected": -6.8762467702229815,
"step": 154
},
{
"epoch": 0.42943722943722945,
"grad_norm": 3.7172883552273768,
"kl": 285.25189208984375,
"learning_rate": 3.0900470104231456e-06,
"logits/chosen": -46441233.777777776,
"logits/rejected": -26665755.42857143,
"logps/chosen": -325.4695638020833,
"logps/rejected": -101.84296526227679,
"loss": 0.3404,
"rewards/chosen": 0.1546611785888672,
"rewards/margins": 4.439751216343471,
"rewards/rejected": -4.285090037754604,
"step": 155
},
{
"epoch": 0.4322077922077922,
"grad_norm": 3.312339719329392,
"kl": 285.44769287109375,
"learning_rate": 3.0685858453027668e-06,
"logits/chosen": -49904411.428571425,
"logits/rejected": -44058030.222222224,
"logps/chosen": -140.95223563058036,
"logps/rejected": -162.76173231336804,
"loss": 0.2813,
"rewards/chosen": -1.3947293417794364,
"rewards/margins": 4.468493416195824,
"rewards/rejected": -5.863222757975261,
"step": 156
},
{
"epoch": 0.43497835497835496,
"grad_norm": 3.479887405954686,
"kl": 270.1009521484375,
"learning_rate": 3.047080401674754e-06,
"logits/chosen": -51652922.18181818,
"logits/rejected": -61749913.6,
"logps/chosen": -276.02401455965907,
"logps/rejected": -134.616015625,
"loss": 0.3498,
"rewards/chosen": 1.614875620061701,
"rewards/margins": 8.703042429143732,
"rewards/rejected": -7.088166809082031,
"step": 157
},
{
"epoch": 0.4377489177489178,
"grad_norm": 4.077398185150515,
"kl": 330.6680908203125,
"learning_rate": 3.0255323542711784e-06,
"logits/chosen": -60507984.0,
"logits/rejected": -11630900.0,
"logps/chosen": -268.957275390625,
"logps/rejected": -127.5375747680664,
"loss": 0.2849,
"rewards/chosen": 2.3366661071777344,
"rewards/margins": 7.807009220123291,
"rewards/rejected": -5.470343112945557,
"step": 158
},
{
"epoch": 0.44051948051948053,
"grad_norm": 3.2322970154513215,
"kl": 261.8209228515625,
"learning_rate": 3.00394338114187e-06,
"logits/chosen": -40481952.0,
"logits/rejected": -142613808.0,
"logps/chosen": -157.77676391601562,
"logps/rejected": -166.2552032470703,
"loss": 0.333,
"rewards/chosen": -0.08739203214645386,
"rewards/margins": 6.05664998292923,
"rewards/rejected": -6.144042015075684,
"step": 159
},
{
"epoch": 0.4432900432900433,
"grad_norm": 3.1517174776004455,
"kl": 177.34640502929688,
"learning_rate": 2.9823151635237424e-06,
"logits/chosen": -57420648.0,
"logits/rejected": -60590308.0,
"logps/chosen": -281.8450622558594,
"logps/rejected": -194.87185668945312,
"loss": 0.3901,
"rewards/chosen": 2.5421125888824463,
"rewards/margins": 5.507090330123901,
"rewards/rejected": -2.964977741241455,
"step": 160
},
{
"epoch": 0.44606060606060605,
"grad_norm": 3.7184656464192902,
"kl": 154.72317504882812,
"learning_rate": 2.9606493857098657e-06,
"logits/chosen": -40019485.333333336,
"logits/rejected": 303537024.0,
"logps/chosen": -218.24466959635416,
"logps/rejected": -148.39111328125,
"loss": 0.3214,
"rewards/chosen": -1.7011640866597493,
"rewards/margins": 5.837723890940349,
"rewards/rejected": -7.538887977600098,
"step": 161
},
{
"epoch": 0.44883116883116886,
"grad_norm": 3.857199942049942,
"kl": 182.81161499023438,
"learning_rate": 2.938947734918302e-06,
"logits/chosen": -47456576.0,
"logits/rejected": -56964508.44444445,
"logps/chosen": -213.33220563616072,
"logps/rejected": -128.02083333333334,
"loss": 0.2709,
"rewards/chosen": -3.7205652509416853,
"rewards/margins": -0.0823261321537081,
"rewards/rejected": -3.6382391187879772,
"step": 162
},
{
"epoch": 0.4516017316017316,
"grad_norm": 3.6554439186941736,
"kl": 197.32955932617188,
"learning_rate": 2.9172119011607153e-06,
"logits/chosen": -62697898.666666664,
"logits/rejected": -30507272.0,
"logps/chosen": -226.22090657552084,
"logps/rejected": -94.68756866455078,
"loss": 0.2987,
"rewards/chosen": -2.0916875203450522,
"rewards/margins": 2.4424230257670083,
"rewards/rejected": -4.5341105461120605,
"step": 163
},
{
"epoch": 0.45437229437229437,
"grad_norm": 4.608790075602045,
"kl": 213.34255981445312,
"learning_rate": 2.8954435771107604e-06,
"logits/chosen": -79102016.0,
"logits/rejected": -62801610.666666664,
"logps/chosen": -278.2896423339844,
"logps/rejected": -202.40938313802084,
"loss": 0.2914,
"rewards/chosen": -6.47105598449707,
"rewards/margins": -0.2049547831217451,
"rewards/rejected": -6.266101201375325,
"step": 164
},
{
"epoch": 0.45714285714285713,
"grad_norm": 3.3882159493387616,
"kl": 198.41207885742188,
"learning_rate": 2.8736444579722665e-06,
"logits/chosen": -78504037.33333333,
"logits/rejected": -73070310.4,
"logps/chosen": -327.8160400390625,
"logps/rejected": -222.9837890625,
"loss": 0.2741,
"rewards/chosen": 1.1677392323811848,
"rewards/margins": 8.064625676472982,
"rewards/rejected": -6.8968864440917965,
"step": 165
},
{
"epoch": 0.45991341991341994,
"grad_norm": 3.279972807646011,
"kl": 142.17318725585938,
"learning_rate": 2.8518162413472266e-06,
"logits/chosen": -85365942.85714285,
"logits/rejected": -70893845.33333333,
"logps/chosen": -279.30538504464283,
"logps/rejected": -182.84242078993054,
"loss": 0.3081,
"rewards/chosen": -2.3745449611118863,
"rewards/margins": 1.4273942614358566,
"rewards/rejected": -3.801939222547743,
"step": 166
},
{
"epoch": 0.4626839826839827,
"grad_norm": 3.6661575094256427,
"kl": 89.53360748291016,
"learning_rate": 2.8299606271035913e-06,
"logits/chosen": -52625970.28571428,
"logits/rejected": -72505472.0,
"logps/chosen": -301.35030691964283,
"logps/rejected": -205.73879665798611,
"loss": 0.3063,
"rewards/chosen": 1.8995137895856584,
"rewards/margins": 8.419966500902932,
"rewards/rejected": -6.520452711317274,
"step": 167
},
{
"epoch": 0.46545454545454545,
"grad_norm": 4.152823525128342,
"kl": 183.32223510742188,
"learning_rate": 2.8080793172428965e-06,
"logits/chosen": -73402512.0,
"logits/rejected": -142298960.0,
"logps/chosen": -396.2037353515625,
"logps/rejected": -394.7413330078125,
"loss": 0.3263,
"rewards/chosen": 1.2748676141103108,
"rewards/margins": 13.17820700009664,
"rewards/rejected": -11.903339385986328,
"step": 168
},
{
"epoch": 0.4682251082251082,
"grad_norm": 4.915619443451149,
"kl": 157.55738830566406,
"learning_rate": 2.786174015767721e-06,
"logits/chosen": -56088037.333333336,
"logits/rejected": -43772393.6,
"logps/chosen": -226.74296061197916,
"logps/rejected": -226.644873046875,
"loss": 0.255,
"rewards/chosen": -4.559232076009114,
"rewards/margins": 4.601092020670573,
"rewards/rejected": -9.160324096679688,
"step": 169
},
{
"epoch": 0.470995670995671,
"grad_norm": 5.3345479859735345,
"kl": 184.7256317138672,
"learning_rate": 2.764246428548983e-06,
"logits/chosen": -51022968.0,
"logits/rejected": -58412576.0,
"logps/chosen": -207.92623901367188,
"logps/rejected": -239.64891052246094,
"loss": 0.2828,
"rewards/chosen": -1.1218031644821167,
"rewards/margins": 8.50666630268097,
"rewards/rejected": -9.628469467163086,
"step": 170
},
{
"epoch": 0.4737662337662338,
"grad_norm": 4.13624571497418,
"kl": 177.11712646484375,
"learning_rate": 2.742298263193099e-06,
"logits/chosen": -99049797.81818181,
"logits/rejected": -67656134.4,
"logps/chosen": -280.09694602272725,
"logps/rejected": -271.40283203125,
"loss": 0.3186,
"rewards/chosen": -1.507522236217152,
"rewards/margins": 6.865571941028941,
"rewards/rejected": -8.373094177246093,
"step": 171
},
{
"epoch": 0.47653679653679654,
"grad_norm": 3.0904137276117103,
"kl": 200.4022216796875,
"learning_rate": 2.720331228909005e-06,
"logits/chosen": -42735584.0,
"logits/rejected": -45149984.0,
"logps/chosen": -211.05224609375,
"logps/rejected": -235.17949567522322,
"loss": 0.3305,
"rewards/chosen": -1.8013004726833768,
"rewards/margins": 1.051658206515842,
"rewards/rejected": -2.8529586791992188,
"step": 172
},
{
"epoch": 0.4793073593073593,
"grad_norm": 4.247838462733702,
"kl": 208.93966674804688,
"learning_rate": 2.6983470363750497e-06,
"logits/chosen": -52870764.8,
"logits/rejected": -93502741.33333333,
"logps/chosen": -247.8509521484375,
"logps/rejected": -176.8176472981771,
"loss": 0.2907,
"rewards/chosen": 0.4748857498168945,
"rewards/margins": 6.685143597920735,
"rewards/rejected": -6.210257848103841,
"step": 173
},
{
"epoch": 0.4820779220779221,
"grad_norm": 3.6922139188578385,
"kl": 199.78564453125,
"learning_rate": 2.6763473976057776e-06,
"logits/chosen": -78229719.27272727,
"logits/rejected": -64247590.4,
"logps/chosen": -343.76633522727275,
"logps/rejected": -289.5193359375,
"loss": 0.3302,
"rewards/chosen": 0.27789141915061255,
"rewards/margins": 6.072880051352761,
"rewards/rejected": -5.794988632202148,
"step": 174
},
{
"epoch": 0.48484848484848486,
"grad_norm": 2.1830098135367253,
"kl": 291.7579345703125,
"learning_rate": 2.6543340258186063e-06,
"logits/chosen": -61894516.0,
"logits/rejected": -88448416.0,
"logps/chosen": -175.04672241210938,
"logps/rejected": -153.3810577392578,
"loss": 0.2448,
"rewards/chosen": -1.9807024002075195,
"rewards/margins": 2.309774398803711,
"rewards/rejected": -4.2904767990112305,
"step": 175
},
{
"epoch": 0.4876190476190476,
"grad_norm": 4.218222845890689,
"kl": 119.53733825683594,
"learning_rate": 2.6323086353004077e-06,
"logits/chosen": -85618194.28571428,
"logits/rejected": -93672120.8888889,
"logps/chosen": -396.98486328125,
"logps/rejected": -197.43092176649304,
"loss": 0.2549,
"rewards/chosen": 0.570185661315918,
"rewards/margins": 5.8234213723076715,
"rewards/rejected": -5.253235710991754,
"step": 176
},
{
"epoch": 0.4903896103896104,
"grad_norm": 2.596784708198279,
"kl": 273.5486145019531,
"learning_rate": 2.610272941274012e-06,
"logits/chosen": -33087848.0,
"logits/rejected": -25035680.0,
"logps/chosen": -114.39351654052734,
"logps/rejected": -187.162353515625,
"loss": 0.2989,
"rewards/chosen": -1.1123628616333008,
"rewards/margins": 6.887551307678223,
"rewards/rejected": -7.999914169311523,
"step": 177
},
{
"epoch": 0.4931601731601732,
"grad_norm": 3.339051362841475,
"kl": 120.95160675048828,
"learning_rate": 2.588228659764632e-06,
"logits/chosen": -50194481.777777776,
"logits/rejected": -46198921.14285714,
"logps/chosen": -192.59000651041666,
"logps/rejected": -185.52493722098214,
"loss": 0.3189,
"rewards/chosen": -6.144053565131293,
"rewards/margins": -1.0134704832046753,
"rewards/rejected": -5.130583081926618,
"step": 178
},
{
"epoch": 0.49593073593073594,
"grad_norm": 3.4153821985604975,
"kl": 170.84121704101562,
"learning_rate": 2.5661775074662276e-06,
"logits/chosen": -130978457.6,
"logits/rejected": -44669677.333333336,
"logps/chosen": -237.725830078125,
"logps/rejected": -108.64566040039062,
"loss": 0.3417,
"rewards/chosen": -0.7145191192626953,
"rewards/margins": 3.991784159342448,
"rewards/rejected": -4.7063032786051435,
"step": 179
},
{
"epoch": 0.4987012987012987,
"grad_norm": 3.013896524570175,
"kl": 204.37509155273438,
"learning_rate": 2.544121201607822e-06,
"logits/chosen": -84088128.0,
"logits/rejected": -29717827.2,
"logps/chosen": -383.296875,
"logps/rejected": -157.595068359375,
"loss": 0.3454,
"rewards/chosen": 1.9199379285176594,
"rewards/margins": 7.402603562672932,
"rewards/rejected": -5.482665634155273,
"step": 180
},
{
"epoch": 0.4987012987012987,
"eval_logits/chosen": -67193476.26666667,
"eval_logits/rejected": -37107747.31034483,
"eval_logps/chosen": -344.8448893229167,
"eval_logps/rejected": -226.03357354525863,
"eval_loss": 0.30528655648231506,
"eval_rewards/chosen": 1.4481372833251953,
"eval_rewards/margins": 7.979209965672986,
"eval_rewards/rejected": -6.531072682347791,
"eval_runtime": 375.7871,
"eval_samples_per_second": 1.243,
"eval_steps_per_second": 0.157,
"kl": 836.632568359375,
"step": 180
},
{
"epoch": 0.5014718614718615,
"grad_norm": 4.356060168329763,
"kl": 191.42889404296875,
"learning_rate": 2.5220614598197708e-06,
"logits/chosen": -107889254.4,
"logits/rejected": -52258368.0,
"logps/chosen": -304.1372314453125,
"logps/rejected": -282.9250081380208,
"loss": 0.3333,
"rewards/chosen": -2.5827245712280273,
"rewards/margins": 6.893130302429199,
"rewards/rejected": -9.475854873657227,
"step": 181
},
{
"epoch": 0.5042424242424243,
"grad_norm": 4.2430092265377715,
"kl": 188.14871215820312,
"learning_rate": 2.5e-06,
"logits/chosen": -124263899.42857143,
"logits/rejected": -47703192.88888889,
"logps/chosen": -204.89937918526786,
"logps/rejected": -259.95594618055554,
"loss": 0.3226,
"rewards/chosen": -2.21267454964774,
"rewards/margins": 1.8116386353023466,
"rewards/rejected": -4.024313184950087,
"step": 182
},
{
"epoch": 0.507012987012987,
"grad_norm": 2.722752429679691,
"kl": 169.01434326171875,
"learning_rate": 2.477938540180231e-06,
"logits/chosen": -49152826.666666664,
"logits/rejected": -71773224.0,
"logps/chosen": -209.00362141927084,
"logps/rejected": -120.16709899902344,
"loss": 0.3903,
"rewards/chosen": -1.652681032816569,
"rewards/margins": 2.2883280118306475,
"rewards/rejected": -3.941009044647217,
"step": 183
},
{
"epoch": 0.5097835497835498,
"grad_norm": 3.4971096121656378,
"kl": 147.58242797851562,
"learning_rate": 2.455878798392179e-06,
"logits/chosen": -64680269.71428572,
"logits/rejected": -103894449.77777778,
"logps/chosen": -346.93739536830356,
"logps/rejected": -235.97618272569446,
"loss": 0.3303,
"rewards/chosen": 2.4380479540143694,
"rewards/margins": 12.841405565776522,
"rewards/rejected": -10.403357611762154,
"step": 184
},
{
"epoch": 0.5125541125541125,
"grad_norm": 3.538113732240373,
"kl": 341.8497314453125,
"learning_rate": 2.433822492533774e-06,
"logits/chosen": -39145449.14285714,
"logits/rejected": -84920298.66666667,
"logps/chosen": -229.3765869140625,
"logps/rejected": -230.70206705729166,
"loss": 0.3537,
"rewards/chosen": -5.895386832101004,
"rewards/margins": 1.4380838303338912,
"rewards/rejected": -7.3334706624348955,
"step": 185
},
{
"epoch": 0.5153246753246753,
"grad_norm": 2.9086188103636714,
"kl": 114.81712341308594,
"learning_rate": 2.411771340235369e-06,
"logits/chosen": -49792800.0,
"logits/rejected": -105182803.2,
"logps/chosen": -250.2506306966146,
"logps/rejected": -235.0851806640625,
"loss": 0.3312,
"rewards/chosen": -4.144415219624837,
"rewards/margins": 1.8368858973185223,
"rewards/rejected": -5.981301116943359,
"step": 186
},
{
"epoch": 0.518095238095238,
"grad_norm": 6.561660944265687,
"kl": 104.41949462890625,
"learning_rate": 2.389727058725989e-06,
"logits/chosen": -158027867.42857143,
"logits/rejected": -63834716.44444445,
"logps/chosen": -347.48318917410717,
"logps/rejected": -279.6531032986111,
"loss": 0.342,
"rewards/chosen": 2.9799742017473494,
"rewards/margins": 11.362875711350215,
"rewards/rejected": -8.382901509602865,
"step": 187
},
{
"epoch": 0.5208658008658009,
"grad_norm": 3.079713812899905,
"kl": 112.68702697753906,
"learning_rate": 2.3676913646995923e-06,
"logits/chosen": -87418666.66666667,
"logits/rejected": -104699712.0,
"logps/chosen": -482.70909288194446,
"logps/rejected": -167.60402134486608,
"loss": 0.2445,
"rewards/chosen": 4.543322245279948,
"rewards/margins": 11.19434828985305,
"rewards/rejected": -6.651026044573102,
"step": 188
},
{
"epoch": 0.5236363636363637,
"grad_norm": 3.384610966553019,
"kl": 159.68157958984375,
"learning_rate": 2.3456659741813945e-06,
"logits/chosen": -69022734.22222222,
"logits/rejected": -40000859.428571425,
"logps/chosen": -371.64873589409723,
"logps/rejected": -211.55555943080358,
"loss": 0.3456,
"rewards/chosen": 0.04310830434163412,
"rewards/margins": 7.334953353518531,
"rewards/rejected": -7.291845049176898,
"step": 189
},
{
"epoch": 0.5264069264069264,
"grad_norm": 3.8221129014555633,
"kl": 226.33648681640625,
"learning_rate": 2.3236526023942224e-06,
"logits/chosen": -64279296.0,
"logits/rejected": -94565772.8,
"logps/chosen": -300.8207341974432,
"logps/rejected": -186.9443603515625,
"loss": 0.3614,
"rewards/chosen": -3.407633001154119,
"rewards/margins": 2.492927759343928,
"rewards/rejected": -5.900560760498047,
"step": 190
},
{
"epoch": 0.5291774891774892,
"grad_norm": 3.7771126966424564,
"kl": 137.0946044921875,
"learning_rate": 2.301652963624951e-06,
"logits/chosen": -76113541.81818181,
"logits/rejected": -51425171.2,
"logps/chosen": -346.21928267045456,
"logps/rejected": -229.1968994140625,
"loss": 0.3241,
"rewards/chosen": -2.7908023487437856,
"rewards/margins": 1.7894470561634406,
"rewards/rejected": -4.580249404907226,
"step": 191
},
{
"epoch": 0.531948051948052,
"grad_norm": 1.8598401806559461,
"kl": 197.385986328125,
"learning_rate": 2.2796687710909966e-06,
"logits/chosen": -55358822.4,
"logits/rejected": -102614474.66666667,
"logps/chosen": -203.16923828125,
"logps/rejected": -193.5582275390625,
"loss": 0.3448,
"rewards/chosen": -4.700377655029297,
"rewards/margins": 0.5808376948038738,
"rewards/rejected": -5.281215349833171,
"step": 192
},
{
"epoch": 0.5347186147186147,
"grad_norm": 3.834495630642973,
"kl": 187.49928283691406,
"learning_rate": 2.2577017368069017e-06,
"logits/chosen": -108176749.71428572,
"logits/rejected": -47750257.777777776,
"logps/chosen": -469.99124581473217,
"logps/rejected": -177.22574869791666,
"loss": 0.2616,
"rewards/chosen": 1.035844189780099,
"rewards/margins": 8.450432270292252,
"rewards/rejected": -7.414588080512153,
"step": 193
},
{
"epoch": 0.5374891774891775,
"grad_norm": 3.186977461146233,
"kl": 90.20316314697266,
"learning_rate": 2.235753571451018e-06,
"logits/chosen": -72693686.85714285,
"logits/rejected": -53111786.666666664,
"logps/chosen": -186.34223284040178,
"logps/rejected": -159.53831651475696,
"loss": 0.3198,
"rewards/chosen": -0.7966529982430595,
"rewards/margins": 2.663518807244679,
"rewards/rejected": -3.4601718054877386,
"step": 194
},
{
"epoch": 0.5402597402597402,
"grad_norm": 4.255977101246861,
"kl": 178.103759765625,
"learning_rate": 2.2138259842322794e-06,
"logits/chosen": -94066453.33333333,
"logits/rejected": -80498636.8,
"logps/chosen": -396.0868733723958,
"logps/rejected": -196.08753662109376,
"loss": 0.2509,
"rewards/chosen": -0.41888968149820965,
"rewards/margins": 3.4590808232625325,
"rewards/rejected": -3.877970504760742,
"step": 195
},
{
"epoch": 0.5430303030303031,
"grad_norm": 3.6907369184366896,
"kl": 121.10801696777344,
"learning_rate": 2.191920682757104e-06,
"logits/chosen": -55915576.0,
"logits/rejected": -137895184.0,
"logps/chosen": -360.009765625,
"logps/rejected": -233.00111389160156,
"loss": 0.3048,
"rewards/chosen": -0.7659169435501099,
"rewards/margins": 6.7380160093307495,
"rewards/rejected": -7.503932952880859,
"step": 196
},
{
"epoch": 0.5458008658008658,
"grad_norm": 2.2704628703365444,
"kl": 160.8948974609375,
"learning_rate": 2.170039372896409e-06,
"logits/chosen": -96806220.8,
"logits/rejected": -139429973.33333334,
"logps/chosen": -165.101806640625,
"logps/rejected": -218.95597330729166,
"loss": 0.2765,
"rewards/chosen": -0.4397267818450928,
"rewards/margins": 5.83861387570699,
"rewards/rejected": -6.278340657552083,
"step": 197
},
{
"epoch": 0.5485714285714286,
"grad_norm": 3.3593101598446586,
"kl": 155.88333129882812,
"learning_rate": 2.148183758652774e-06,
"logits/chosen": -59763976.0,
"logits/rejected": -74410616.0,
"logps/chosen": -267.308837890625,
"logps/rejected": -214.98797607421875,
"loss": 0.2769,
"rewards/chosen": -1.0068299770355225,
"rewards/margins": 4.386720895767212,
"rewards/rejected": -5.393550872802734,
"step": 198
},
{
"epoch": 0.5513419913419914,
"grad_norm": 3.348634978846618,
"kl": 211.95455932617188,
"learning_rate": 2.126355542027734e-06,
"logits/chosen": -93054104.0,
"logits/rejected": -36135216.0,
"logps/chosen": -209.05706787109375,
"logps/rejected": -142.19302368164062,
"loss": 0.3478,
"rewards/chosen": -2.386659622192383,
"rewards/margins": 0.8848772048950195,
"rewards/rejected": -3.2715368270874023,
"step": 199
},
{
"epoch": 0.5541125541125541,
"grad_norm": 2.6185958603566855,
"kl": 133.95506286621094,
"learning_rate": 2.1045564228892404e-06,
"logits/chosen": -59569035.63636363,
"logits/rejected": -37804576.0,
"logps/chosen": -369.35862038352275,
"logps/rejected": -124.1267578125,
"loss": 0.2268,
"rewards/chosen": 1.7300832921808416,
"rewards/margins": 7.441509073430842,
"rewards/rejected": -5.71142578125,
"step": 200
},
{
"epoch": 0.5568831168831169,
"grad_norm": 3.1853022448974375,
"kl": 239.44927978515625,
"learning_rate": 2.0827880988392856e-06,
"logits/chosen": -84826208.0,
"logits/rejected": -94426064.0,
"logps/chosen": -268.79168701171875,
"logps/rejected": -142.93902587890625,
"loss": 0.274,
"rewards/chosen": 0.07495087385177612,
"rewards/margins": 3.469887435436249,
"rewards/rejected": -3.3949365615844727,
"step": 201
},
{
"epoch": 0.5596536796536796,
"grad_norm": 2.8423454716873917,
"kl": 168.17323303222656,
"learning_rate": 2.0610522650816985e-06,
"logits/chosen": -68017907.2,
"logits/rejected": -222357141.33333334,
"logps/chosen": -231.947509765625,
"logps/rejected": -189.9059855143229,
"loss": 0.3558,
"rewards/chosen": -0.16442975997924805,
"rewards/margins": 6.447909895579021,
"rewards/rejected": -6.6123396555582685,
"step": 202
},
{
"epoch": 0.5624242424242424,
"grad_norm": 3.7346854730125254,
"kl": 104.60269927978516,
"learning_rate": 2.0393506142901347e-06,
"logits/chosen": -60994228.36363637,
"logits/rejected": -107862118.4,
"logps/chosen": -260.94948508522725,
"logps/rejected": -245.2341552734375,
"loss": 0.274,
"rewards/chosen": 1.0195409601384944,
"rewards/margins": 9.791877538507634,
"rewards/rejected": -8.77233657836914,
"step": 203
},
{
"epoch": 0.5651948051948052,
"grad_norm": 2.8605120324430215,
"kl": 205.68209838867188,
"learning_rate": 2.017684836476258e-06,
"logits/chosen": -49999822.222222224,
"logits/rejected": -84995885.71428572,
"logps/chosen": -266.4399685329861,
"logps/rejected": -143.72217668805803,
"loss": 0.3008,
"rewards/chosen": 1.8411036597357855,
"rewards/margins": 8.940965894668821,
"rewards/rejected": -7.099862234933036,
"step": 204
},
{
"epoch": 0.567965367965368,
"grad_norm": 3.280948123262779,
"kl": 216.97950744628906,
"learning_rate": 1.9960566188581306e-06,
"logits/chosen": -46048905.14285714,
"logits/rejected": -57397656.88888889,
"logps/chosen": -184.09369768415178,
"logps/rejected": -143.51478407118054,
"loss": 0.343,
"rewards/chosen": -7.50201416015625,
"rewards/margins": -3.1174642774793835,
"rewards/rejected": -4.3845498826768665,
"step": 205
},
{
"epoch": 0.5707359307359308,
"grad_norm": 2.4558271657409336,
"kl": 286.4020080566406,
"learning_rate": 1.9744676457288225e-06,
"logits/chosen": -77790198.85714285,
"logits/rejected": -47510997.333333336,
"logps/chosen": -345.818603515625,
"logps/rejected": -197.7877197265625,
"loss": 0.3601,
"rewards/chosen": -1.7951624734061105,
"rewards/margins": 2.253208311777266,
"rewards/rejected": -4.048370785183376,
"step": 206
},
{
"epoch": 0.5735064935064935,
"grad_norm": 3.942002526387752,
"kl": 206.5150146484375,
"learning_rate": 1.952919598325247e-06,
"logits/chosen": -55335221.333333336,
"logits/rejected": -34377880.0,
"logps/chosen": -207.23506673177084,
"logps/rejected": -113.67945861816406,
"loss": 0.3738,
"rewards/chosen": -2.086402416229248,
"rewards/margins": 5.298608779907227,
"rewards/rejected": -7.385011196136475,
"step": 207
},
{
"epoch": 0.5762770562770563,
"grad_norm": 2.5572946277340742,
"kl": 279.0215148925781,
"learning_rate": 1.9314141546972345e-06,
"logits/chosen": -59309312.0,
"logits/rejected": -43267481.6,
"logps/chosen": -198.94559733072916,
"logps/rejected": -191.52454833984376,
"loss": 0.2513,
"rewards/chosen": -4.163699150085449,
"rewards/margins": 1.1932497024536133,
"rewards/rejected": -5.3569488525390625,
"step": 208
},
{
"epoch": 0.579047619047619,
"grad_norm": 3.792975813303454,
"kl": 169.12713623046875,
"learning_rate": 1.9099529895768552e-06,
"logits/chosen": -41098888.0,
"logits/rejected": -99659952.0,
"logps/chosen": -248.0914306640625,
"logps/rejected": -194.57171630859375,
"loss": 0.2727,
"rewards/chosen": 4.070443630218506,
"rewards/margins": 10.523594379425049,
"rewards/rejected": -6.453150749206543,
"step": 209
},
{
"epoch": 0.5818181818181818,
"grad_norm": 3.1456231702627186,
"kl": 189.19842529296875,
"learning_rate": 1.8885377742480005e-06,
"logits/chosen": -86137941.33333333,
"logits/rejected": -69529678.76923077,
"logps/chosen": -357.3465169270833,
"logps/rejected": -243.63138521634616,
"loss": 0.3118,
"rewards/chosen": 3.267169952392578,
"rewards/margins": 9.234573657696064,
"rewards/rejected": -5.967403705303486,
"step": 210
},
{
"epoch": 0.5845887445887445,
"grad_norm": 3.255048348416602,
"kl": 218.10623168945312,
"learning_rate": 1.8671701764162287e-06,
"logits/chosen": -59820352.0,
"logits/rejected": -43202088.0,
"logps/chosen": -235.07044982910156,
"logps/rejected": -230.73135375976562,
"loss": 0.2568,
"rewards/chosen": -0.05381596088409424,
"rewards/margins": 9.376083254814148,
"rewards/rejected": -9.429899215698242,
"step": 211
},
{
"epoch": 0.5873593073593074,
"grad_norm": 2.54679273461964,
"kl": 152.00927734375,
"learning_rate": 1.8458518600788988e-06,
"logits/chosen": -77701637.81818181,
"logits/rejected": -50945308.8,
"logps/chosen": -347.40553977272725,
"logps/rejected": -185.97353515625,
"loss": 0.277,
"rewards/chosen": 4.390115217729048,
"rewards/margins": 9.58318353132768,
"rewards/rejected": -5.193068313598633,
"step": 212
},
{
"epoch": 0.5901298701298702,
"grad_norm": 2.766794898118744,
"kl": 188.09686279296875,
"learning_rate": 1.8245844853955786e-06,
"logits/chosen": -61706139.428571425,
"logits/rejected": -98877696.0,
"logps/chosen": -208.24166434151786,
"logps/rejected": -238.23269314236111,
"loss": 0.2229,
"rewards/chosen": -0.4899448667253767,
"rewards/margins": 5.7206187929425925,
"rewards/rejected": -6.210563659667969,
"step": 213
},
{
"epoch": 0.5929004329004329,
"grad_norm": 3.077063238549823,
"kl": 192.8994140625,
"learning_rate": 1.8033697085587698e-06,
"logits/chosen": -44352933.333333336,
"logits/rejected": -61902777.6,
"logps/chosen": -228.66778564453125,
"logps/rejected": -204.53642578125,
"loss": 0.3359,
"rewards/chosen": -4.820536295572917,
"rewards/margins": 1.5907872517903643,
"rewards/rejected": -6.411323547363281,
"step": 214
},
{
"epoch": 0.5956709956709957,
"grad_norm": 3.355279316331082,
"kl": 348.7542724609375,
"learning_rate": 1.782209181664924e-06,
"logits/chosen": -79986400.0,
"logits/rejected": -69224832.0,
"logps/chosen": -353.9718017578125,
"logps/rejected": -196.55494689941406,
"loss": 0.2641,
"rewards/chosen": 0.0889444351196289,
"rewards/margins": 6.460943698883057,
"rewards/rejected": -6.371999263763428,
"step": 215
},
{
"epoch": 0.5984415584415584,
"grad_norm": 3.652340653227592,
"kl": 177.54664611816406,
"learning_rate": 1.7611045525857902e-06,
"logits/chosen": -77715251.2,
"logits/rejected": -51552048.0,
"logps/chosen": -340.14111328125,
"logps/rejected": -178.1352335611979,
"loss": 0.2633,
"rewards/chosen": 0.6218101501464843,
"rewards/margins": 7.701845041910808,
"rewards/rejected": -7.080034891764323,
"step": 216
},
{
"epoch": 0.5984415584415584,
"eval_logits/chosen": -68801646.93333334,
"eval_logits/rejected": -37986458.48275862,
"eval_logps/chosen": -343.46266276041666,
"eval_logps/rejected": -225.37659954202587,
"eval_loss": 0.30848661065101624,
"eval_rewards/chosen": 1.586359405517578,
"eval_rewards/margins": 8.051734582309065,
"eval_rewards/rejected": -6.465375176791487,
"eval_runtime": 375.7911,
"eval_samples_per_second": 1.243,
"eval_steps_per_second": 0.157,
"kl": 974.3778076171875,
"step": 216
},
{
"epoch": 0.6012121212121212,
"grad_norm": 2.6940300170171376,
"kl": 296.03472900390625,
"learning_rate": 1.740057464840088e-06,
"logits/chosen": -72538086.4,
"logits/rejected": -77256587.63636364,
"logps/chosen": -338.1424560546875,
"logps/rejected": -178.35451438210228,
"loss": 0.2693,
"rewards/chosen": 4.033015441894531,
"rewards/margins": 9.458120796897195,
"rewards/rejected": -5.425105355002663,
"step": 217
},
{
"epoch": 0.603982683982684,
"grad_norm": 5.880026752195334,
"kl": 285.2386169433594,
"learning_rate": 1.7190695574655147e-06,
"logits/chosen": -67694520.8888889,
"logits/rejected": -37378576.0,
"logps/chosen": -280.1799045138889,
"logps/rejected": -164.02565220424108,
"loss": 0.3916,
"rewards/chosen": -1.6728751924302843,
"rewards/margins": 3.780970240396166,
"rewards/rejected": -5.4538454328264505,
"step": 218
},
{
"epoch": 0.6067532467532467,
"grad_norm": 2.610552070109955,
"kl": 240.72793579101562,
"learning_rate": 1.6981424648911112e-06,
"logits/chosen": -41839498.666666664,
"logits/rejected": -50530960.0,
"logps/chosen": -212.16925048828125,
"logps/rejected": -200.35166015625,
"loss": 0.308,
"rewards/chosen": -0.23567096392313638,
"rewards/margins": 5.863442881902059,
"rewards/rejected": -6.099113845825196,
"step": 219
},
{
"epoch": 0.6095238095238096,
"grad_norm": 3.401876003409717,
"kl": 323.20635986328125,
"learning_rate": 1.677277816809975e-06,
"logits/chosen": -67701248.0,
"logits/rejected": -70669077.33333333,
"logps/chosen": -345.4431396484375,
"logps/rejected": -145.46737670898438,
"loss": 0.336,
"rewards/chosen": -4.474223327636719,
"rewards/margins": -2.6156206448872883,
"rewards/rejected": -1.8586026827494304,
"step": 220
},
{
"epoch": 0.6122943722943723,
"grad_norm": 3.915703434239461,
"kl": 216.21176147460938,
"learning_rate": 1.6564772380523546e-06,
"logits/chosen": -58600466.28571428,
"logits/rejected": -74421880.8888889,
"logps/chosen": -241.07268415178572,
"logps/rejected": -211.32619900173611,
"loss": 0.368,
"rewards/chosen": 1.955857821873256,
"rewards/margins": 9.50594394926041,
"rewards/rejected": -7.550086127387153,
"step": 221
},
{
"epoch": 0.6150649350649351,
"grad_norm": 3.6577888277996102,
"kl": 207.39048767089844,
"learning_rate": 1.635742348459109e-06,
"logits/chosen": -60414848.0,
"logits/rejected": -129543833.6,
"logps/chosen": -279.7829996744792,
"logps/rejected": -161.88243408203124,
"loss": 0.2651,
"rewards/chosen": 1.968168576558431,
"rewards/margins": 7.690291913350423,
"rewards/rejected": -5.722123336791992,
"step": 222
},
{
"epoch": 0.6178354978354978,
"grad_norm": 4.7884656461897,
"kl": 286.3252868652344,
"learning_rate": 1.6150747627555713e-06,
"logits/chosen": -74578944.0,
"logits/rejected": -49674577.777777776,
"logps/chosen": -284.42117745535717,
"logps/rejected": -222.95830620659723,
"loss": 0.3203,
"rewards/chosen": -3.2675579616001675,
"rewards/margins": 4.710056547134641,
"rewards/rejected": -7.977614508734809,
"step": 223
},
{
"epoch": 0.6206060606060606,
"grad_norm": 3.378925338056134,
"kl": 275.15087890625,
"learning_rate": 1.5944760904257944e-06,
"logits/chosen": -46858032.0,
"logits/rejected": -68605488.0,
"logps/chosen": -253.7530314127604,
"logps/rejected": -271.87481689453125,
"loss": 0.3245,
"rewards/chosen": -1.6568760871887207,
"rewards/margins": 7.310287952423096,
"rewards/rejected": -8.967164039611816,
"step": 224
},
{
"epoch": 0.6233766233766234,
"grad_norm": 5.2707958999962266,
"kl": 302.9314880371094,
"learning_rate": 1.5739479355872162e-06,
"logits/chosen": -47392557.71428572,
"logits/rejected": -34227840.0,
"logps/chosen": -175.62360491071428,
"logps/rejected": -144.90973578559027,
"loss": 0.3358,
"rewards/chosen": -2.6706837245396207,
"rewards/margins": 0.10068981231205054,
"rewards/rejected": -2.771373536851671,
"step": 225
},
{
"epoch": 0.6261471861471861,
"grad_norm": 3.9348746172309466,
"kl": 300.81634521484375,
"learning_rate": 1.5534918968657423e-06,
"logits/chosen": -88354311.1111111,
"logits/rejected": -88495917.71428572,
"logps/chosen": -483.0252278645833,
"logps/rejected": -221.13459123883928,
"loss": 0.2938,
"rewards/chosen": 1.3839975992838542,
"rewards/margins": 7.2059002830868675,
"rewards/rejected": -5.821902683803013,
"step": 226
},
{
"epoch": 0.6289177489177489,
"grad_norm": 3.5239597024615374,
"kl": 304.20819091796875,
"learning_rate": 1.5331095672712463e-06,
"logits/chosen": -59224921.6,
"logits/rejected": -51953280.0,
"logps/chosen": -307.050732421875,
"logps/rejected": -200.07796223958334,
"loss": 0.4037,
"rewards/chosen": -1.031832218170166,
"rewards/margins": 5.969323635101318,
"rewards/rejected": -7.001155853271484,
"step": 227
},
{
"epoch": 0.6316883116883117,
"grad_norm": 3.3758964781856307,
"kl": 177.49435424804688,
"learning_rate": 1.5128025340735223e-06,
"logits/chosen": -72111429.81818181,
"logits/rejected": -45238502.4,
"logps/chosen": -310.7448064630682,
"logps/rejected": -189.208349609375,
"loss": 0.3153,
"rewards/chosen": -3.234787334095348,
"rewards/margins": 1.2791733134876595,
"rewards/rejected": -4.513960647583008,
"step": 228
},
{
"epoch": 0.6344588744588745,
"grad_norm": 3.4930191805210673,
"kl": 288.73455810546875,
"learning_rate": 1.4925723786786691e-06,
"logits/chosen": -57290352.0,
"logits/rejected": -64807464.0,
"logps/chosen": -269.4695739746094,
"logps/rejected": -178.97718811035156,
"loss": 0.3023,
"rewards/chosen": -3.9210712909698486,
"rewards/margins": 1.226576566696167,
"rewards/rejected": -5.147647857666016,
"step": 229
},
{
"epoch": 0.6372294372294373,
"grad_norm": 3.1177672935847545,
"kl": 135.36849975585938,
"learning_rate": 1.4724206765059456e-06,
"logits/chosen": -54134994.28571428,
"logits/rejected": -111949852.44444445,
"logps/chosen": -204.20844377790178,
"logps/rejected": -233.01820203993054,
"loss": 0.2649,
"rewards/chosen": 0.520991427557809,
"rewards/margins": 6.7558784447019065,
"rewards/rejected": -6.234887017144097,
"step": 230
},
{
"epoch": 0.64,
"grad_norm": 2.7643709324145864,
"kl": 178.657958984375,
"learning_rate": 1.4523489968650795e-06,
"logits/chosen": -64459858.28571428,
"logits/rejected": -85256739.55555555,
"logps/chosen": -291.76614815848217,
"logps/rejected": -213.23220486111111,
"loss": 0.3014,
"rewards/chosen": 2.469794682094029,
"rewards/margins": 8.441367558070592,
"rewards/rejected": -5.9715728759765625,
"step": 231
},
{
"epoch": 0.6427705627705628,
"grad_norm": 3.712587969836102,
"kl": 143.9523162841797,
"learning_rate": 1.4323589028340598e-06,
"logits/chosen": -79929376.0,
"logits/rejected": -155408866.46153846,
"logps/chosen": -377.5170084635417,
"logps/rejected": -220.03947566105768,
"loss": 0.2185,
"rewards/chosen": -0.14797210693359375,
"rewards/margins": 6.270457341120793,
"rewards/rejected": -6.418429448054387,
"step": 232
},
{
"epoch": 0.6455411255411255,
"grad_norm": 2.6928811967907924,
"kl": 237.62255859375,
"learning_rate": 1.4124519511374158e-06,
"logits/chosen": -68120402.28571428,
"logits/rejected": -84057173.33333333,
"logps/chosen": -176.9976806640625,
"logps/rejected": -196.41248914930554,
"loss": 0.3564,
"rewards/chosen": 2.186638968331473e-05,
"rewards/margins": 8.546026267702619,
"rewards/rejected": -8.546004401312935,
"step": 233
},
{
"epoch": 0.6483116883116883,
"grad_norm": 3.3451720915869685,
"kl": 248.16729736328125,
"learning_rate": 1.3926296920249796e-06,
"logits/chosen": -69849329.77777778,
"logits/rejected": -48679707.428571425,
"logps/chosen": -240.88053385416666,
"logps/rejected": -153.94259207589286,
"loss": 0.2441,
"rewards/chosen": 2.785261789957682,
"rewards/margins": 9.85936500912621,
"rewards/rejected": -7.074103219168527,
"step": 234
},
{
"epoch": 0.651082251082251,
"grad_norm": 4.331228652019405,
"kl": 217.10455322265625,
"learning_rate": 1.3728936691511704e-06,
"logits/chosen": -53082216.72727273,
"logits/rejected": -73758624.0,
"logps/chosen": -257.35977450284093,
"logps/rejected": -336.2916015625,
"loss": 0.343,
"rewards/chosen": 0.690484263680198,
"rewards/margins": 6.507778193733909,
"rewards/rejected": -5.817293930053711,
"step": 235
},
{
"epoch": 0.6538528138528139,
"grad_norm": 3.7377097810384403,
"kl": 176.5330810546875,
"learning_rate": 1.3532454194547734e-06,
"logits/chosen": -114124352.0,
"logits/rejected": -52099942.4,
"logps/chosen": -380.898193359375,
"logps/rejected": -236.4220703125,
"loss": 0.2775,
"rewards/chosen": -0.5208327770233154,
"rewards/margins": 5.431049394607544,
"rewards/rejected": -5.951882171630859,
"step": 236
},
{
"epoch": 0.6566233766233767,
"grad_norm": 4.191306253129776,
"kl": 201.5948486328125,
"learning_rate": 1.3336864730392587e-06,
"logits/chosen": -98785644.8,
"logits/rejected": -63730141.09090909,
"logps/chosen": -268.1023193359375,
"logps/rejected": -166.88303444602272,
"loss": 0.3229,
"rewards/chosen": -0.36386423110961913,
"rewards/margins": 4.326531470905651,
"rewards/rejected": -4.69039570201527,
"step": 237
},
{
"epoch": 0.6593939393939394,
"grad_norm": 4.9884577158326415,
"kl": 275.1553955078125,
"learning_rate": 1.314218353053619e-06,
"logits/chosen": -50905499.428571425,
"logits/rejected": -108700174.22222222,
"logps/chosen": -168.27641950334822,
"logps/rejected": -248.65044487847223,
"loss": 0.374,
"rewards/chosen": -5.029791695731027,
"rewards/margins": -0.4220421806214345,
"rewards/rejected": -4.607749515109592,
"step": 238
},
{
"epoch": 0.6621645021645022,
"grad_norm": 2.908368881048658,
"kl": 235.3330078125,
"learning_rate": 1.2948425755737592e-06,
"logits/chosen": -48739173.333333336,
"logits/rejected": -37064664.0,
"logps/chosen": -217.1978759765625,
"logps/rejected": -134.1108856201172,
"loss": 0.3171,
"rewards/chosen": -0.37347551186879474,
"rewards/margins": 6.792223970095317,
"rewards/rejected": -7.165699481964111,
"step": 239
},
{
"epoch": 0.6649350649350649,
"grad_norm": 4.311065760657517,
"kl": 178.09713745117188,
"learning_rate": 1.2755606494844294e-06,
"logits/chosen": -59626680.0,
"logits/rejected": -64623596.0,
"logps/chosen": -316.20013427734375,
"logps/rejected": -231.9442901611328,
"loss": 0.2457,
"rewards/chosen": 1.917290449142456,
"rewards/margins": 9.681453466415405,
"rewards/rejected": -7.764163017272949,
"step": 240
},
{
"epoch": 0.6677056277056277,
"grad_norm": 3.0064005045422832,
"kl": 202.15634155273438,
"learning_rate": 1.2563740763617198e-06,
"logits/chosen": -62175616.0,
"logits/rejected": -42690928.0,
"logps/chosen": -308.9885919744318,
"logps/rejected": -176.0036865234375,
"loss": 0.2843,
"rewards/chosen": 1.3145668723366477,
"rewards/margins": 2.3579565741799096,
"rewards/rejected": -1.0433897018432616,
"step": 241
},
{
"epoch": 0.6704761904761904,
"grad_norm": 2.9232235110862885,
"kl": 170.14356994628906,
"learning_rate": 1.2372843503561318e-06,
"logits/chosen": -41083064.88888889,
"logits/rejected": -52004361.14285714,
"logps/chosen": -193.79656304253473,
"logps/rejected": -240.13106863839286,
"loss": 0.3154,
"rewards/chosen": 0.9401733610365126,
"rewards/margins": 7.140057503230988,
"rewards/rejected": -6.199884142194476,
"step": 242
},
{
"epoch": 0.6732467532467532,
"grad_norm": 3.2345169148154636,
"kl": 306.8934020996094,
"learning_rate": 1.218292958076213e-06,
"logits/chosen": -43112040.0,
"logits/rejected": -49439756.0,
"logps/chosen": -206.4788055419922,
"logps/rejected": -203.757080078125,
"loss": 0.342,
"rewards/chosen": 0.21420925855636597,
"rewards/margins": 6.295262038707733,
"rewards/rejected": -6.081052780151367,
"step": 243
},
{
"epoch": 0.6760173160173161,
"grad_norm": 3.5936976457100824,
"kl": 290.11920166015625,
"learning_rate": 1.1994013784727948e-06,
"logits/chosen": -61215940.0,
"logits/rejected": -35973080.0,
"logps/chosen": -332.1205139160156,
"logps/rejected": -155.35565185546875,
"loss": 0.352,
"rewards/chosen": 1.5890569686889648,
"rewards/margins": 5.8782877922058105,
"rewards/rejected": -4.289230823516846,
"step": 244
},
{
"epoch": 0.6787878787878788,
"grad_norm": 3.2106021999017913,
"kl": 253.48971557617188,
"learning_rate": 1.180611082723814e-06,
"logits/chosen": -71740049.45454545,
"logits/rejected": -95363673.6,
"logps/chosen": -374.18319424715907,
"logps/rejected": -203.96192626953126,
"loss": 0.3024,
"rewards/chosen": 0.7615150104869496,
"rewards/margins": 9.2805946003307,
"rewards/rejected": -8.51907958984375,
"step": 245
},
{
"epoch": 0.6815584415584416,
"grad_norm": 3.7801991502937464,
"kl": 188.4849853515625,
"learning_rate": 1.161923534119752e-06,
"logits/chosen": -50279622.4,
"logits/rejected": -42633162.666666664,
"logps/chosen": -254.7062744140625,
"logps/rejected": -164.59708658854166,
"loss": 0.3034,
"rewards/chosen": -1.0920855522155761,
"rewards/margins": 4.658198006947835,
"rewards/rejected": -5.750283559163411,
"step": 246
},
{
"epoch": 0.6843290043290043,
"grad_norm": 2.700192871685373,
"kl": 223.2760772705078,
"learning_rate": 1.1433401879496723e-06,
"logits/chosen": -72866929.77777778,
"logits/rejected": -46679538.28571428,
"logps/chosen": -280.45448133680554,
"logps/rejected": -190.09620884486608,
"loss": 0.2797,
"rewards/chosen": -1.5863361358642578,
"rewards/margins": 4.47297477722168,
"rewards/rejected": -6.0593109130859375,
"step": 247
},
{
"epoch": 0.6870995670995671,
"grad_norm": 3.7313916035101675,
"kl": 229.77716064453125,
"learning_rate": 1.1248624913878966e-06,
"logits/chosen": -90124640.0,
"logits/rejected": -49573136.0,
"logps/chosen": -392.1523132324219,
"logps/rejected": -185.72872924804688,
"loss": 0.3145,
"rewards/chosen": 2.3596444129943848,
"rewards/margins": 9.85338306427002,
"rewards/rejected": -7.493738651275635,
"step": 248
},
{
"epoch": 0.6898701298701299,
"grad_norm": 2.8892760595582017,
"kl": 330.8048095703125,
"learning_rate": 1.1064918833813073e-06,
"logits/chosen": -51090944.0,
"logits/rejected": -71529491.2,
"logps/chosen": -175.30206298828125,
"logps/rejected": -274.307275390625,
"loss": 0.3581,
"rewards/chosen": 0.6159710884094238,
"rewards/margins": 6.964090061187744,
"rewards/rejected": -6.3481189727783205,
"step": 249
},
{
"epoch": 0.6926406926406926,
"grad_norm": 3.5154507753168063,
"kl": 262.57794189453125,
"learning_rate": 1.088229794537283e-06,
"logits/chosen": -64860172.8,
"logits/rejected": -149981525.33333334,
"logps/chosen": -288.94384765625,
"logps/rejected": -265.95355224609375,
"loss": 0.316,
"rewards/chosen": 1.1107938766479493,
"rewards/margins": 10.878010876973471,
"rewards/rejected": -9.767217000325521,
"step": 250
},
{
"epoch": 0.6954112554112554,
"grad_norm": 2.9905382664133535,
"kl": 159.176513671875,
"learning_rate": 1.0700776470122981e-06,
"logits/chosen": -45575300.0,
"logits/rejected": -83707800.0,
"logps/chosen": -157.0391845703125,
"logps/rejected": -256.8617858886719,
"loss": 0.2896,
"rewards/chosen": -0.197882741689682,
"rewards/margins": 8.787455469369888,
"rewards/rejected": -8.98533821105957,
"step": 251
},
{
"epoch": 0.6981818181818182,
"grad_norm": 3.2115303535516118,
"kl": 212.965576171875,
"learning_rate": 1.0520368544011661e-06,
"logits/chosen": -56122581.333333336,
"logits/rejected": -132167224.0,
"logps/chosen": -262.070068359375,
"logps/rejected": -305.2275390625,
"loss": 0.2519,
"rewards/chosen": -0.16033987204233804,
"rewards/margins": 9.907416780789694,
"rewards/rejected": -10.067756652832031,
"step": 252
},
{
"epoch": 0.6981818181818182,
"eval_logits/chosen": -69407142.4,
"eval_logits/rejected": -34758311.72413793,
"eval_logps/chosen": -343.69078776041664,
"eval_logps/rejected": -225.02624932650863,
"eval_loss": 0.3109191656112671,
"eval_rewards/chosen": 1.5635496775309246,
"eval_rewards/margins": 7.993889622304632,
"eval_rewards/rejected": -6.430339944773707,
"eval_runtime": 375.3266,
"eval_samples_per_second": 1.244,
"eval_steps_per_second": 0.157,
"kl": 1106.7635498046875,
"step": 252
},
{
"epoch": 0.700952380952381,
"grad_norm": 3.408372034521783,
"kl": 155.97579956054688,
"learning_rate": 1.0341088216269625e-06,
"logits/chosen": -65252832.0,
"logits/rejected": -51106684.0,
"logps/chosen": -388.51385498046875,
"logps/rejected": -146.90829467773438,
"loss": 0.2696,
"rewards/chosen": 0.8967366218566895,
"rewards/margins": 6.216433048248291,
"rewards/rejected": -5.319696426391602,
"step": 253
},
{
"epoch": 0.7037229437229438,
"grad_norm": 4.383580312032361,
"kl": 341.5067443847656,
"learning_rate": 1.0162949448316089e-06,
"logits/chosen": -86413928.72727273,
"logits/rejected": -114341286.4,
"logps/chosen": -367.32177734375,
"logps/rejected": -190.59051513671875,
"loss": 0.3508,
"rewards/chosen": 1.0880462473089045,
"rewards/margins": 3.699661619013006,
"rewards/rejected": -2.6116153717041017,
"step": 254
},
{
"epoch": 0.7064935064935065,
"grad_norm": 3.055607123710733,
"kl": 195.12664794921875,
"learning_rate": 9.98596611267158e-07,
"logits/chosen": -76725646.22222222,
"logits/rejected": -64971469.71428572,
"logps/chosen": -337.21875,
"logps/rejected": -205.77462332589286,
"loss": 0.3384,
"rewards/chosen": -1.6779973771837022,
"rewards/margins": 5.715687010023329,
"rewards/rejected": -7.393684387207031,
"step": 255
},
{
"epoch": 0.7092640692640693,
"grad_norm": 2.3759956264666604,
"kl": 273.7037353515625,
"learning_rate": 9.81015199187753e-07,
"logits/chosen": -72682016.0,
"logits/rejected": -81808928.0,
"logps/chosen": -385.7765197753906,
"logps/rejected": -200.73654174804688,
"loss": 0.2374,
"rewards/chosen": 1.9999887943267822,
"rewards/margins": 8.917597532272339,
"rewards/rejected": -6.917608737945557,
"step": 256
},
{
"epoch": 0.712034632034632,
"grad_norm": 3.8057096837262425,
"kl": 188.0250244140625,
"learning_rate": 9.63552077742301e-07,
"logits/chosen": -67763000.0,
"logits/rejected": -90358536.0,
"logps/chosen": -238.7766876220703,
"logps/rejected": -257.0978698730469,
"loss": 0.3161,
"rewards/chosen": 1.9526047706604004,
"rewards/margins": 8.665544509887695,
"rewards/rejected": -6.712939739227295,
"step": 257
},
{
"epoch": 0.7148051948051948,
"grad_norm": 2.5951420032808077,
"kl": 182.75180053710938,
"learning_rate": 9.462086068678519e-07,
"logits/chosen": -82555717.33333333,
"logits/rejected": -83283424.0,
"logps/chosen": -274.6826171875,
"logps/rejected": -231.703662109375,
"loss": 0.2752,
"rewards/chosen": -4.324587504069011,
"rewards/margins": 2.9592534383138016,
"rewards/rejected": -7.283840942382812,
"step": 258
},
{
"epoch": 0.7175757575757575,
"grad_norm": 3.592536819677904,
"kl": 372.02880859375,
"learning_rate": 9.289861371836886e-07,
"logits/chosen": -33494551.272727273,
"logits/rejected": -43864403.2,
"logps/chosen": -210.74216530539772,
"logps/rejected": -146.0051513671875,
"loss": 0.3954,
"rewards/chosen": -2.606386184692383,
"rewards/margins": -0.9926284790039062,
"rewards/rejected": -1.6137577056884767,
"step": 259
},
{
"epoch": 0.7203463203463204,
"grad_norm": 3.611795674877251,
"kl": 144.7232208251953,
"learning_rate": 9.118860098861538e-07,
"logits/chosen": -27257533.714285713,
"logits/rejected": -92377756.44444445,
"logps/chosen": -106.76185825892857,
"logps/rejected": -172.93641493055554,
"loss": 0.2925,
"rewards/chosen": -4.36328969682966,
"rewards/margins": 1.0060645512172153,
"rewards/rejected": -5.369354248046875,
"step": 260
},
{
"epoch": 0.7231168831168832,
"grad_norm": 3.426912310848221,
"kl": 230.80148315429688,
"learning_rate": 8.949095566441985e-07,
"logits/chosen": -70854326.85714285,
"logits/rejected": -106736128.0,
"logps/chosen": -355.3104771205357,
"logps/rejected": -143.09120008680554,
"loss": 0.2628,
"rewards/chosen": 4.951349530901227,
"rewards/margins": 11.514222523522754,
"rewards/rejected": -6.562872992621528,
"step": 261
},
{
"epoch": 0.7258874458874459,
"grad_norm": 3.9222498849988217,
"kl": 275.5905456542969,
"learning_rate": 8.78058099495685e-07,
"logits/chosen": -63815808.0,
"logits/rejected": -43907032.0,
"logps/chosen": -221.944921875,
"logps/rejected": -152.33535766601562,
"loss": 0.3599,
"rewards/chosen": 1.3001568794250489,
"rewards/margins": 6.684520689646403,
"rewards/rejected": -5.3843638102213545,
"step": 262
},
{
"epoch": 0.7286580086580087,
"grad_norm": 4.081637856436443,
"kl": 205.53375244140625,
"learning_rate": 8.613329507444274e-07,
"logits/chosen": -70823014.4,
"logits/rejected": 51152917.333333336,
"logps/chosen": -416.84267578125,
"logps/rejected": -218.20318603515625,
"loss": 0.3067,
"rewards/chosen": 2.8227375030517576,
"rewards/margins": 11.075025685628255,
"rewards/rejected": -8.252288182576498,
"step": 263
},
{
"epoch": 0.7314285714285714,
"grad_norm": 3.0956087519734283,
"kl": 347.00433349609375,
"learning_rate": 8.44735412857999e-07,
"logits/chosen": -48914204.44444445,
"logits/rejected": -54545723.428571425,
"logps/chosen": -236.80449761284723,
"logps/rejected": -212.97638811383928,
"loss": 0.3157,
"rewards/chosen": -2.7551773918999567,
"rewards/margins": 2.7477452111622647,
"rewards/rejected": -5.502922603062221,
"step": 264
},
{
"epoch": 0.7341991341991342,
"grad_norm": 3.6798026423098853,
"kl": 378.4184265136719,
"learning_rate": 8.282667783663056e-07,
"logits/chosen": -47141145.6,
"logits/rejected": -69143936.0,
"logps/chosen": -230.9576171875,
"logps/rejected": -178.14546342329547,
"loss": 0.2581,
"rewards/chosen": 1.3340899467468261,
"rewards/margins": 5.900271476398815,
"rewards/rejected": -4.566181529651988,
"step": 265
},
{
"epoch": 0.7369696969696969,
"grad_norm": 2.818485572402302,
"kl": 211.65887451171875,
"learning_rate": 8.119283297609238e-07,
"logits/chosen": -63136617.14285714,
"logits/rejected": -55634499.55555555,
"logps/chosen": -260.20821707589283,
"logps/rejected": -183.45157877604166,
"loss": 0.2609,
"rewards/chosen": 2.2578698566981723,
"rewards/margins": 5.820503794957721,
"rewards/rejected": -3.5626339382595487,
"step": 266
},
{
"epoch": 0.7397402597402597,
"grad_norm": 3.1897322815284683,
"kl": 142.84185791015625,
"learning_rate": 7.957213393952335e-07,
"logits/chosen": -67597770.66666667,
"logits/rejected": -44832944.0,
"logps/chosen": -222.02327473958334,
"logps/rejected": -255.47900390625,
"loss": 0.2854,
"rewards/chosen": -4.233989079793294,
"rewards/margins": 4.390859285990397,
"rewards/rejected": -8.624848365783691,
"step": 267
},
{
"epoch": 0.7425108225108226,
"grad_norm": 3.3891834518638553,
"kl": 134.27809143066406,
"learning_rate": 7.796470693853281e-07,
"logits/chosen": -43992632.0,
"logits/rejected": -94291280.0,
"logps/chosen": -248.00100708007812,
"logps/rejected": -233.19149780273438,
"loss": 0.2848,
"rewards/chosen": 0.43076658248901367,
"rewards/margins": 6.797356128692627,
"rewards/rejected": -6.366589546203613,
"step": 268
},
{
"epoch": 0.7452813852813853,
"grad_norm": 3.3252557869988606,
"kl": 180.33831787109375,
"learning_rate": 7.637067715117327e-07,
"logits/chosen": -62336172.8,
"logits/rejected": -46789130.666666664,
"logps/chosen": -399.1670654296875,
"logps/rejected": -212.2073771158854,
"loss": 0.291,
"rewards/chosen": 1.414572048187256,
"rewards/margins": 7.036088593800862,
"rewards/rejected": -5.6215165456136065,
"step": 269
},
{
"epoch": 0.7480519480519481,
"grad_norm": 3.566170694431363,
"kl": 199.22201538085938,
"learning_rate": 7.479016871219174e-07,
"logits/chosen": -71036677.33333333,
"logits/rejected": -59511052.8,
"logps/chosen": -341.9608154296875,
"logps/rejected": -148.16932373046876,
"loss": 0.3713,
"rewards/chosen": -0.5459572474161783,
"rewards/margins": 2.417703183492025,
"rewards/rejected": -2.9636604309082033,
"step": 270
},
{
"epoch": 0.7508225108225108,
"grad_norm": 2.802819654299222,
"kl": 356.5174560546875,
"learning_rate": 7.322330470336314e-07,
"logits/chosen": -53457590.85714286,
"logits/rejected": -110052842.66666667,
"logps/chosen": -213.4609375,
"logps/rejected": -178.4493408203125,
"loss": 0.3042,
"rewards/chosen": 0.768862111227853,
"rewards/margins": 6.479752139439658,
"rewards/rejected": -5.710890028211805,
"step": 271
},
{
"epoch": 0.7535930735930736,
"grad_norm": 3.27403131274412,
"kl": 300.19671630859375,
"learning_rate": 7.167020714390502e-07,
"logits/chosen": -79985589.33333333,
"logits/rejected": -98586624.0,
"logps/chosen": -271.8570963541667,
"logps/rejected": -254.91591796875,
"loss": 0.2824,
"rewards/chosen": -0.687197208404541,
"rewards/margins": 5.884105587005616,
"rewards/rejected": -6.571302795410157,
"step": 272
},
{
"epoch": 0.7563636363636363,
"grad_norm": 4.796024627038505,
"kl": 172.873291015625,
"learning_rate": 7.013099698097539e-07,
"logits/chosen": -58375436.8,
"logits/rejected": -113968372.36363636,
"logps/chosen": -200.2075927734375,
"logps/rejected": -224.1153231534091,
"loss": 0.2744,
"rewards/chosen": 2.289317321777344,
"rewards/margins": 9.3487773548473,
"rewards/rejected": -7.059460033069957,
"step": 273
},
{
"epoch": 0.7591341991341991,
"grad_norm": 1.898707621563178,
"kl": 275.5303955078125,
"learning_rate": 6.860579408025436e-07,
"logits/chosen": -66903751.11111111,
"logits/rejected": -52988854.85714286,
"logps/chosen": -210.03946940104166,
"logps/rejected": -199.36795479910714,
"loss": 0.351,
"rewards/chosen": -4.433419969346788,
"rewards/margins": 2.752294752332899,
"rewards/rejected": -7.1857147216796875,
"step": 274
},
{
"epoch": 0.7619047619047619,
"grad_norm": 3.166222661314354,
"kl": 249.98538208007812,
"learning_rate": 6.709471721660904e-07,
"logits/chosen": -46733385.14285714,
"logits/rejected": -126729671.1111111,
"logps/chosen": -265.08956473214283,
"logps/rejected": -174.86515299479166,
"loss": 0.3114,
"rewards/chosen": -0.1910043443952288,
"rewards/margins": 4.7248930249895364,
"rewards/rejected": -4.915897369384766,
"step": 275
},
{
"epoch": 0.7646753246753247,
"grad_norm": 2.690930213427526,
"kl": 212.82666015625,
"learning_rate": 6.559788406484446e-07,
"logits/chosen": -48945336.0,
"logits/rejected": -61969256.0,
"logps/chosen": -328.7871398925781,
"logps/rejected": -133.97158813476562,
"loss": 0.2748,
"rewards/chosen": 3.2727508544921875,
"rewards/margins": 6.7245166301727295,
"rewards/rejected": -3.451765775680542,
"step": 276
},
{
"epoch": 0.7674458874458875,
"grad_norm": 2.7007228681678948,
"kl": 120.11971282958984,
"learning_rate": 6.41154111905393e-07,
"logits/chosen": -91537863.1111111,
"logits/rejected": -50415232.0,
"logps/chosen": -287.26361762152777,
"logps/rejected": -191.4361572265625,
"loss": 0.2997,
"rewards/chosen": 1.7716151343451605,
"rewards/margins": 9.348053493197002,
"rewards/rejected": -7.576438358851841,
"step": 277
},
{
"epoch": 0.7702164502164502,
"grad_norm": 2.072018372743311,
"kl": 345.71630859375,
"learning_rate": 6.264741404096875e-07,
"logits/chosen": -50276706.90909091,
"logits/rejected": -44993907.2,
"logps/chosen": -155.4117986505682,
"logps/rejected": -171.1499755859375,
"loss": 0.3447,
"rewards/chosen": -1.067693363536488,
"rewards/margins": 4.660452998768199,
"rewards/rejected": -5.728146362304687,
"step": 278
},
{
"epoch": 0.772987012987013,
"grad_norm": 2.9339434752000773,
"kl": 322.894287109375,
"learning_rate": 6.119400693611358e-07,
"logits/chosen": -70990665.14285715,
"logits/rejected": -43696771.55555555,
"logps/chosen": -285.5392368861607,
"logps/rejected": -176.08287217881946,
"loss": 0.2985,
"rewards/chosen": 2.104515620640346,
"rewards/margins": 7.515042320130362,
"rewards/rejected": -5.410526699490017,
"step": 279
},
{
"epoch": 0.7757575757575758,
"grad_norm": 3.562536584571455,
"kl": 177.15957641601562,
"learning_rate": 5.975530305975808e-07,
"logits/chosen": -103062884.57142857,
"logits/rejected": -50913994.666666664,
"logps/chosen": -339.17027064732144,
"logps/rejected": -184.08607313368054,
"loss": 0.282,
"rewards/chosen": 3.5254838126046315,
"rewards/margins": 7.8241237458728605,
"rewards/rejected": -4.2986399332682295,
"step": 280
},
{
"epoch": 0.7785281385281385,
"grad_norm": 4.0306130164614,
"kl": 167.3966064453125,
"learning_rate": 5.833141445067541e-07,
"logits/chosen": -100358954.66666667,
"logits/rejected": -57243862.85714286,
"logps/chosen": -231.44365776909723,
"logps/rejected": -176.75859723772322,
"loss": 0.2869,
"rewards/chosen": -1.6125593185424805,
"rewards/margins": 2.7696215765816827,
"rewards/rejected": -4.382180895124163,
"step": 281
},
{
"epoch": 0.7812987012987013,
"grad_norm": 3.44873803212302,
"kl": 348.943359375,
"learning_rate": 5.692245199390281e-07,
"logits/chosen": -68807224.0,
"logits/rejected": -69575264.0,
"logps/chosen": -225.96981811523438,
"logps/rejected": -246.62388610839844,
"loss": 0.3249,
"rewards/chosen": -0.17798352241516113,
"rewards/margins": 5.94242262840271,
"rewards/rejected": -6.120406150817871,
"step": 282
},
{
"epoch": 0.784069264069264,
"grad_norm": 2.894002118538045,
"kl": 241.6986083984375,
"learning_rate": 5.552852541210651e-07,
"logits/chosen": -47480483.55555555,
"logits/rejected": -41506720.0,
"logps/chosen": -136.6748046875,
"logps/rejected": -164.92328752790178,
"loss": 0.3215,
"rewards/chosen": -4.108927408854167,
"rewards/margins": 2.43545168922061,
"rewards/rejected": -6.544379098074777,
"step": 283
},
{
"epoch": 0.7868398268398269,
"grad_norm": 3.2824108049566894,
"kl": 170.58547973632812,
"learning_rate": 5.414974325703687e-07,
"logits/chosen": -58694203.428571425,
"logits/rejected": -85210695.1111111,
"logps/chosen": -287.21871512276783,
"logps/rejected": -216.19121636284723,
"loss": 0.2668,
"rewards/chosen": -1.4370292936052596,
"rewards/margins": 3.6460005593678306,
"rewards/rejected": -5.08302985297309,
"step": 284
},
{
"epoch": 0.7896103896103897,
"grad_norm": 3.738104349195781,
"kl": 140.03277587890625,
"learning_rate": 5.278621290107533e-07,
"logits/chosen": -48089952.0,
"logits/rejected": -68618922.66666667,
"logps/chosen": -210.25188337053572,
"logps/rejected": -225.17115614149304,
"loss": 0.3009,
"rewards/chosen": -2.186547006879534,
"rewards/margins": -1.190037031022329,
"rewards/rejected": -0.9965099758572049,
"step": 285
},
{
"epoch": 0.7923809523809524,
"grad_norm": 3.4587778119224875,
"kl": 338.2921142578125,
"learning_rate": 5.143804052887228e-07,
"logits/chosen": -87703588.57142857,
"logits/rejected": -32304408.888888888,
"logps/chosen": -481.19339425223217,
"logps/rejected": -205.55504014756946,
"loss": 0.3118,
"rewards/chosen": 2.911403111049107,
"rewards/margins": 8.781512245299325,
"rewards/rejected": -5.870109134250217,
"step": 286
},
{
"epoch": 0.7951515151515152,
"grad_norm": 4.206081190690971,
"kl": 144.6542510986328,
"learning_rate": 5.010533112907845e-07,
"logits/chosen": -35128680.0,
"logits/rejected": -64883232.0,
"logps/chosen": -138.93106079101562,
"logps/rejected": -158.321044921875,
"loss": 0.2868,
"rewards/chosen": 2.1601052284240723,
"rewards/margins": 8.834944248199463,
"rewards/rejected": -6.674839019775391,
"step": 287
},
{
"epoch": 0.7979220779220779,
"grad_norm": 3.386798752207688,
"kl": 239.18814086914062,
"learning_rate": 4.878818848616861e-07,
"logits/chosen": -80401216.0,
"logits/rejected": -90698474.66666667,
"logps/chosen": -238.15419224330358,
"logps/rejected": -262.73231336805554,
"loss": 0.2959,
"rewards/chosen": 1.4890550885881697,
"rewards/margins": 10.170775398375497,
"rewards/rejected": -8.681720309787327,
"step": 288
},
{
"epoch": 0.7979220779220779,
"eval_logits/chosen": -69444923.73333333,
"eval_logits/rejected": -36029797.51724138,
"eval_logps/chosen": -342.69557291666666,
"eval_logps/rejected": -230.78365773168105,
"eval_loss": 0.3033340275287628,
"eval_rewards/chosen": 1.6630688985188802,
"eval_rewards/margins": 8.669149262877717,
"eval_rewards/rejected": -7.006080364358836,
"eval_runtime": 375.9399,
"eval_samples_per_second": 1.242,
"eval_steps_per_second": 0.157,
"kl": 1082.5067138671875,
"step": 288
},
{
"epoch": 0.8006926406926407,
"grad_norm": 2.808240147886354,
"kl": 238.10203552246094,
"learning_rate": 4.748671517235948e-07,
"logits/chosen": -62291260.0,
"logits/rejected": -120199832.0,
"logps/chosen": -252.74928283691406,
"logps/rejected": -304.03900146484375,
"loss": 0.3472,
"rewards/chosen": -0.03838551044464111,
"rewards/margins": 8.531028628349304,
"rewards/rejected": -8.569414138793945,
"step": 289
},
{
"epoch": 0.8034632034632034,
"grad_norm": 4.382126448345103,
"kl": 272.16949462890625,
"learning_rate": 4.620101253962206e-07,
"logits/chosen": -41600490.666666664,
"logits/rejected": -59269593.6,
"logps/chosen": -191.1187744140625,
"logps/rejected": -142.35963134765626,
"loss": 0.2236,
"rewards/chosen": 0.45951588948567706,
"rewards/margins": 5.3680478413899735,
"rewards/rejected": -4.908531951904297,
"step": 290
},
{
"epoch": 0.8062337662337662,
"grad_norm": 3.351554288936717,
"kl": 270.4270935058594,
"learning_rate": 4.4931180711788537e-07,
"logits/chosen": -163745781.33333334,
"logits/rejected": -138789612.30769232,
"logps/chosen": -429.6186930338542,
"logps/rejected": -226.00240384615384,
"loss": 0.2312,
"rewards/chosen": 6.541826883951823,
"rewards/margins": 12.985393426357172,
"rewards/rejected": -6.443566542405349,
"step": 291
},
{
"epoch": 0.8090043290043291,
"grad_norm": 2.404015170456249,
"kl": 400.7691650390625,
"learning_rate": 4.3677318576755693e-07,
"logits/chosen": -60730896.0,
"logits/rejected": -195664464.0,
"logps/chosen": -279.2489929199219,
"logps/rejected": -246.13470458984375,
"loss": 0.3219,
"rewards/chosen": -3.9280588626861572,
"rewards/margins": 2.784353017807007,
"rewards/rejected": -6.712411880493164,
"step": 292
},
{
"epoch": 0.8117748917748918,
"grad_norm": 3.105770373370238,
"kl": 166.30032348632812,
"learning_rate": 4.243952377878338e-07,
"logits/chosen": -77011364.57142857,
"logits/rejected": -49117194.666666664,
"logps/chosen": -285.13291713169644,
"logps/rejected": -136.7578125,
"loss": 0.2972,
"rewards/chosen": -6.261838095528739,
"rewards/margins": 0.7438669961596291,
"rewards/rejected": -7.005705091688368,
"step": 293
},
{
"epoch": 0.8145454545454546,
"grad_norm": 4.000426081788337,
"kl": 267.60205078125,
"learning_rate": 4.1217892710891134e-07,
"logits/chosen": -113192118.85714285,
"logits/rejected": -43238922.666666664,
"logps/chosen": -452.5059291294643,
"logps/rejected": -208.38062879774304,
"loss": 0.2971,
"rewards/chosen": 0.20489086423601424,
"rewards/margins": 7.392902033669608,
"rewards/rejected": -7.188011169433594,
"step": 294
},
{
"epoch": 0.8173160173160173,
"grad_norm": 3.123408229018022,
"kl": 218.33502197265625,
"learning_rate": 4.001252050735102e-07,
"logits/chosen": -38402567.11111111,
"logits/rejected": -56519460.571428575,
"logps/chosen": -252.67822265625,
"logps/rejected": -268.72366768973217,
"loss": 0.3443,
"rewards/chosen": 0.5289336310492622,
"rewards/margins": 9.533072032625714,
"rewards/rejected": -9.004138401576451,
"step": 295
},
{
"epoch": 0.8200865800865801,
"grad_norm": 5.000927062881679,
"kl": 338.04443359375,
"learning_rate": 3.882350103627952e-07,
"logits/chosen": -58505769.14285714,
"logits/rejected": -57647619.55555555,
"logps/chosen": -247.9539794921875,
"logps/rejected": -206.06060112847223,
"loss": 0.3395,
"rewards/chosen": 3.2841903141566684,
"rewards/margins": 11.585580220298162,
"rewards/rejected": -8.301389906141493,
"step": 296
},
{
"epoch": 0.8228571428571428,
"grad_norm": 3.0362677210091022,
"kl": 173.23562622070312,
"learning_rate": 3.7650926892327297e-07,
"logits/chosen": -44870958.222222224,
"logits/rejected": -66984045.71428572,
"logps/chosen": -256.7326388888889,
"logps/rejected": -109.4141845703125,
"loss": 0.2963,
"rewards/chosen": -2.3694470723470054,
"rewards/margins": 3.4614811851864764,
"rewards/rejected": -5.830928257533482,
"step": 297
},
{
"epoch": 0.8256277056277056,
"grad_norm": 2.855189599099019,
"kl": 210.4012451171875,
"learning_rate": 3.649488938946844e-07,
"logits/chosen": -45783491.55555555,
"logits/rejected": -51585654.85714286,
"logps/chosen": -244.33824327256946,
"logps/rejected": -176.662841796875,
"loss": 0.2647,
"rewards/chosen": -2.904595692952474,
"rewards/margins": 3.5319571722121466,
"rewards/rejected": -6.436552865164621,
"step": 298
},
{
"epoch": 0.8283982683982684,
"grad_norm": 3.805562216682305,
"kl": 276.049072265625,
"learning_rate": 3.5355478553889626e-07,
"logits/chosen": -79455768.0,
"logits/rejected": 15600413.333333334,
"logps/chosen": -300.433349609375,
"logps/rejected": -213.483154296875,
"loss": 0.2934,
"rewards/chosen": 2.3578178882598877,
"rewards/margins": 7.926704963048299,
"rewards/rejected": -5.568887074788411,
"step": 299
},
{
"epoch": 0.8311688311688312,
"grad_norm": 2.8140310846836147,
"kl": 196.21664428710938,
"learning_rate": 3.4232783116978976e-07,
"logits/chosen": -72521104.0,
"logits/rejected": -60625048.0,
"logps/chosen": -243.73028564453125,
"logps/rejected": -262.26806640625,
"loss": 0.3485,
"rewards/chosen": -1.804718017578125,
"rewards/margins": 1.0916314125061035,
"rewards/rejected": -2.8963494300842285,
"step": 300
},
{
"epoch": 0.833939393939394,
"grad_norm": 3.9832279655691645,
"kl": 228.42916870117188,
"learning_rate": 3.312689050841658e-07,
"logits/chosen": -93826016.0,
"logits/rejected": -53009270.4,
"logps/chosen": -299.4957682291667,
"logps/rejected": -197.59713134765624,
"loss": 0.2369,
"rewards/chosen": -2.22921085357666,
"rewards/margins": 4.334420585632325,
"rewards/rejected": -6.563631439208985,
"step": 301
},
{
"epoch": 0.8367099567099567,
"grad_norm": 2.8538026273017025,
"kl": 175.7204132080078,
"learning_rate": 3.203788684936535e-07,
"logits/chosen": -73006393.6,
"logits/rejected": -61207109.81818182,
"logps/chosen": -242.75419921875,
"logps/rejected": -142.1380948153409,
"loss": 0.3186,
"rewards/chosen": 0.9609004974365234,
"rewards/margins": 4.359611615267667,
"rewards/rejected": -3.3987111178311435,
"step": 302
},
{
"epoch": 0.8394805194805195,
"grad_norm": 3.8683368268702805,
"kl": 301.593505859375,
"learning_rate": 3.096585694576498e-07,
"logits/chosen": -70421624.8888889,
"logits/rejected": -174282185.14285713,
"logps/chosen": -437.4618869357639,
"logps/rejected": -260.1992710658482,
"loss": 0.3247,
"rewards/chosen": 1.732922871907552,
"rewards/margins": 9.504617600213914,
"rewards/rejected": -7.771694728306362,
"step": 303
},
{
"epoch": 0.8422510822510823,
"grad_norm": 3.9680420344462726,
"kl": 251.5784912109375,
"learning_rate": 2.9910884281727225e-07,
"logits/chosen": -71067492.57142857,
"logits/rejected": -66364231.11111111,
"logps/chosen": -239.69499860491072,
"logps/rejected": -192.93386501736111,
"loss": 0.3226,
"rewards/chosen": 0.19936534336635045,
"rewards/margins": 6.042430393279545,
"rewards/rejected": -5.843065049913195,
"step": 304
},
{
"epoch": 0.845021645021645,
"grad_norm": 3.7746476943141687,
"kl": 325.1376953125,
"learning_rate": 2.8873051013034695e-07,
"logits/chosen": -42786944.0,
"logits/rejected": -62841930.666666664,
"logps/chosen": -141.4681884765625,
"logps/rejected": -199.77783203125,
"loss": 0.2805,
"rewards/chosen": -4.344178771972656,
"rewards/margins": 2.449849192301433,
"rewards/rejected": -6.794027964274089,
"step": 305
},
{
"epoch": 0.8477922077922078,
"grad_norm": 4.307089034749869,
"kl": 240.35658264160156,
"learning_rate": 2.785243796074333e-07,
"logits/chosen": -57018339.55555555,
"logits/rejected": -130973366.85714285,
"logps/chosen": -227.49403211805554,
"logps/rejected": -192.2506103515625,
"loss": 0.2939,
"rewards/chosen": -1.010747167799208,
"rewards/margins": 2.9281449242243687,
"rewards/rejected": -3.938892092023577,
"step": 306
},
{
"epoch": 0.8505627705627705,
"grad_norm": 2.1582717524376656,
"kl": 225.83633422851562,
"learning_rate": 2.6849124604887836e-07,
"logits/chosen": -45088565.333333336,
"logits/rejected": -65247955.2,
"logps/chosen": -155.26885986328125,
"logps/rejected": -231.8005859375,
"loss": 0.2921,
"rewards/chosen": -2.116344610850016,
"rewards/margins": 5.6510290463765465,
"rewards/rejected": -7.767373657226562,
"step": 307
},
{
"epoch": 0.8533333333333334,
"grad_norm": 2.996078110648648,
"kl": 217.82655334472656,
"learning_rate": 2.5863189078292913e-07,
"logits/chosen": -63356553.84615385,
"logits/rejected": -39525973.333333336,
"logps/chosen": -406.2390324519231,
"logps/rejected": -163.10069783528647,
"loss": 0.2882,
"rewards/chosen": 1.0046427800105169,
"rewards/margins": 3.491529758159931,
"rewards/rejected": -2.486886978149414,
"step": 308
},
{
"epoch": 0.8561038961038961,
"grad_norm": 2.9594486176139214,
"kl": 283.44134521484375,
"learning_rate": 2.489470816048806e-07,
"logits/chosen": -43881161.14285714,
"logits/rejected": -29412579.555555556,
"logps/chosen": -207.05925641741072,
"logps/rejected": -212.45526801215277,
"loss": 0.2987,
"rewards/chosen": -1.6313300813947404,
"rewards/margins": 5.493112125093974,
"rewards/rejected": -7.124442206488715,
"step": 309
},
{
"epoch": 0.8588744588744589,
"grad_norm": 3.3978016460694382,
"kl": 230.8690185546875,
"learning_rate": 2.3943757271728816e-07,
"logits/chosen": -66721024.0,
"logits/rejected": -92142896.0,
"logps/chosen": -311.16510009765625,
"logps/rejected": -228.64767456054688,
"loss": 0.3377,
"rewards/chosen": -0.45139995217323303,
"rewards/margins": 6.1784490048885345,
"rewards/rejected": -6.629848957061768,
"step": 310
},
{
"epoch": 0.8616450216450217,
"grad_norm": 2.8586263345070497,
"kl": 321.99847412109375,
"learning_rate": 2.30104104671231e-07,
"logits/chosen": -90843675.42857143,
"logits/rejected": -73359914.66666667,
"logps/chosen": -273.88846261160717,
"logps/rejected": -211.38840060763889,
"loss": 0.3031,
"rewards/chosen": 1.2099121638706751,
"rewards/margins": 7.697730987791031,
"rewards/rejected": -6.487818823920356,
"step": 311
},
{
"epoch": 0.8644155844155844,
"grad_norm": 3.629578412957439,
"kl": 352.22943115234375,
"learning_rate": 2.2094740430864569e-07,
"logits/chosen": -93567232.0,
"logits/rejected": -73416960.0,
"logps/chosen": -474.8711344401042,
"logps/rejected": -223.6056396484375,
"loss": 0.3594,
"rewards/chosen": 0.6456926663716634,
"rewards/margins": 5.024157174428304,
"rewards/rejected": -4.3784645080566404,
"step": 312
},
{
"epoch": 0.8671861471861472,
"grad_norm": 3.2593722533835514,
"kl": 101.73670959472656,
"learning_rate": 2.119681847057184e-07,
"logits/chosen": -49168621.71428572,
"logits/rejected": -64435868.44444445,
"logps/chosen": -223.25570242745536,
"logps/rejected": -224.44247775607639,
"loss": 0.2976,
"rewards/chosen": -1.141160488128662,
"rewards/margins": 3.5539769596523705,
"rewards/rejected": -4.695137447781033,
"step": 313
},
{
"epoch": 0.8699567099567099,
"grad_norm": 3.0561471018551187,
"kl": 238.22964477539062,
"learning_rate": 2.0316714511736002e-07,
"logits/chosen": -73805504.0,
"logits/rejected": -120474040.8888889,
"logps/chosen": -281.7974330357143,
"logps/rejected": -220.72686089409723,
"loss": 0.2737,
"rewards/chosen": 4.077202388218471,
"rewards/margins": 5.297303744724819,
"rewards/rejected": -1.2201013565063477,
"step": 314
},
{
"epoch": 0.8727272727272727,
"grad_norm": 4.05194504150993,
"kl": 356.24725341796875,
"learning_rate": 1.9454497092274565e-07,
"logits/chosen": -104081649.77777778,
"logits/rejected": -34870395.428571425,
"logps/chosen": -365.76576063368054,
"logps/rejected": -160.77207728794642,
"loss": 0.3133,
"rewards/chosen": 3.3444756401909723,
"rewards/margins": 9.537121121845548,
"rewards/rejected": -6.1926454816545755,
"step": 315
},
{
"epoch": 0.8754978354978356,
"grad_norm": 3.0299855668646036,
"kl": 274.89227294921875,
"learning_rate": 1.861023335719475e-07,
"logits/chosen": -54783534.54545455,
"logits/rejected": -23337324.8,
"logps/chosen": -257.8592640269886,
"logps/rejected": -76.99609375,
"loss": 0.3657,
"rewards/chosen": -1.5488752885298296,
"rewards/margins": 3.5257962660356,
"rewards/rejected": -5.0746715545654295,
"step": 316
},
{
"epoch": 0.8782683982683983,
"grad_norm": 3.4960756957856054,
"kl": 216.21316528320312,
"learning_rate": 1.7783989053363926e-07,
"logits/chosen": -102276416.0,
"logits/rejected": -56183384.0,
"logps/chosen": -436.3009033203125,
"logps/rejected": -253.29261779785156,
"loss": 0.256,
"rewards/chosen": -2.8814847469329834,
"rewards/margins": 3.035329580307007,
"rewards/rejected": -5.91681432723999,
"step": 317
},
{
"epoch": 0.8810389610389611,
"grad_norm": 3.255643273544522,
"kl": 170.55194091796875,
"learning_rate": 1.6975828524390116e-07,
"logits/chosen": -25597888.0,
"logits/rejected": -74746484.36363636,
"logps/chosen": -149.9252685546875,
"logps/rejected": -170.6590909090909,
"loss": 0.3,
"rewards/chosen": -3.4752635955810547,
"rewards/margins": 3.1817431016401807,
"rewards/rejected": -6.657006697221235,
"step": 318
},
{
"epoch": 0.8838095238095238,
"grad_norm": 3.211590528118705,
"kl": 225.81451416015625,
"learning_rate": 1.6185814705610926e-07,
"logits/chosen": -80230812.44444445,
"logits/rejected": -95820672.0,
"logps/chosen": -365.86924913194446,
"logps/rejected": -267.72134835379467,
"loss": 0.3225,
"rewards/chosen": -0.6288706461588541,
"rewards/margins": 9.722852071126303,
"rewards/rejected": -10.351722717285156,
"step": 319
},
{
"epoch": 0.8865800865800866,
"grad_norm": 4.968087897180119,
"kl": 318.9226379394531,
"learning_rate": 1.5414009119192635e-07,
"logits/chosen": -59935155.2,
"logits/rejected": -99951413.33333333,
"logps/chosen": -189.9381591796875,
"logps/rejected": -181.38252766927084,
"loss": 0.2767,
"rewards/chosen": -3.4159313201904298,
"rewards/margins": 4.187530899047852,
"rewards/rejected": -7.603462219238281,
"step": 320
},
{
"epoch": 0.8893506493506493,
"grad_norm": 2.710501799989454,
"kl": 182.2108154296875,
"learning_rate": 1.4660471869339056e-07,
"logits/chosen": -60232084.0,
"logits/rejected": -43643592.0,
"logps/chosen": -188.66204833984375,
"logps/rejected": -244.72280883789062,
"loss": 0.3119,
"rewards/chosen": -0.325054407119751,
"rewards/margins": 7.316741704940796,
"rewards/rejected": -7.641796112060547,
"step": 321
},
{
"epoch": 0.8921212121212121,
"grad_norm": 3.1125121761971637,
"kl": 231.7778778076172,
"learning_rate": 1.392526163761107e-07,
"logits/chosen": -58159717.333333336,
"logits/rejected": -44477772.8,
"logps/chosen": -325.116943359375,
"logps/rejected": -250.502734375,
"loss": 0.3591,
"rewards/chosen": -0.008535941441853842,
"rewards/margins": 5.7585439840952555,
"rewards/rejected": -5.767079925537109,
"step": 322
},
{
"epoch": 0.8948917748917748,
"grad_norm": 2.9951995159051656,
"kl": 321.8599853515625,
"learning_rate": 1.3208435678356612e-07,
"logits/chosen": -84027865.6,
"logits/rejected": -111909888.0,
"logps/chosen": -349.3338134765625,
"logps/rejected": -172.89811197916666,
"loss": 0.3352,
"rewards/chosen": -2.2276878356933594,
"rewards/margins": 3.500701268513997,
"rewards/rejected": -5.7283891042073565,
"step": 323
},
{
"epoch": 0.8976623376623377,
"grad_norm": 3.773179708058833,
"kl": 320.73931884765625,
"learning_rate": 1.2510049814252302e-07,
"logits/chosen": -52532563.2,
"logits/rejected": -46823627.63636363,
"logps/chosen": -191.76851806640624,
"logps/rejected": -189.61625532670453,
"loss": 0.2921,
"rewards/chosen": 0.10258646011352539,
"rewards/margins": 5.660441546006636,
"rewards/rejected": -5.55785508589311,
"step": 324
},
{
"epoch": 0.8976623376623377,
"eval_logits/chosen": -69711099.73333333,
"eval_logits/rejected": -35742644.96551724,
"eval_logps/chosen": -345.00416666666666,
"eval_logps/rejected": -236.56349339978448,
"eval_loss": 0.30222654342651367,
"eval_rewards/chosen": 1.4322120666503906,
"eval_rewards/margins": 9.016275892586544,
"eval_rewards/rejected": -7.584063825936153,
"eval_runtime": 376.6798,
"eval_samples_per_second": 1.24,
"eval_steps_per_second": 0.157,
"kl": 1047.622314453125,
"step": 324
},
{
"epoch": 0.9004329004329005,
"grad_norm": 3.225048381245606,
"kl": 162.58395385742188,
"learning_rate": 1.1830158431955841e-07,
"logits/chosen": -70002758.4,
"logits/rejected": -68099543.27272727,
"logps/chosen": -316.6424560546875,
"logps/rejected": -205.49928977272728,
"loss": 0.2455,
"rewards/chosen": 1.4882720947265624,
"rewards/margins": 9.835954700816762,
"rewards/rejected": -8.3476826060902,
"step": 325
},
{
"epoch": 0.9032034632034632,
"grad_norm": 7.020489831387403,
"kl": 154.04054260253906,
"learning_rate": 1.1168814477871132e-07,
"logits/chosen": -27315149.333333332,
"logits/rejected": -40221536.0,
"logps/chosen": -163.5007527669271,
"logps/rejected": -205.08662109375,
"loss": 0.3071,
"rewards/chosen": 1.7751677831013997,
"rewards/margins": 6.471413358052572,
"rewards/rejected": -4.696245574951172,
"step": 326
},
{
"epoch": 0.905974025974026,
"grad_norm": 3.9910790804312772,
"kl": 343.1492004394531,
"learning_rate": 1.0526069454024651e-07,
"logits/chosen": -69089873.45454545,
"logits/rejected": -25067128.0,
"logps/chosen": -184.99495072798297,
"logps/rejected": -145.78553466796876,
"loss": 0.4173,
"rewards/chosen": -0.5141198418357156,
"rewards/margins": 4.104613373496315,
"rewards/rejected": -4.618733215332031,
"step": 327
},
{
"epoch": 0.9087445887445887,
"grad_norm": 2.3379723695453842,
"kl": 200.3038787841797,
"learning_rate": 9.901973414055188e-08,
"logits/chosen": -43525616.0,
"logits/rejected": -88932544.0,
"logps/chosen": -189.66627502441406,
"logps/rejected": -226.9715118408203,
"loss": 0.2767,
"rewards/chosen": -0.33219343423843384,
"rewards/margins": 5.645094335079193,
"rewards/rejected": -5.977287769317627,
"step": 328
},
{
"epoch": 0.9115151515151515,
"grad_norm": 3.368306425906985,
"kl": 226.6565704345703,
"learning_rate": 9.296574959315464e-08,
"logits/chosen": -55839961.6,
"logits/rejected": -1148263.2727272727,
"logps/chosen": -202.15638427734376,
"logps/rejected": -215.63878284801137,
"loss": 0.2375,
"rewards/chosen": 2.4019466400146485,
"rewards/margins": 9.938241750543767,
"rewards/rejected": -7.536295110529119,
"step": 329
},
{
"epoch": 0.9142857142857143,
"grad_norm": 3.5086087623333375,
"kl": 212.34596252441406,
"learning_rate": 8.709921235087598e-08,
"logits/chosen": -115221869.71428572,
"logits/rejected": -43847939.55555555,
"logps/chosen": -409.99365234375,
"logps/rejected": -195.90090603298611,
"loss": 0.2772,
"rewards/chosen": 1.8871876852852958,
"rewards/margins": 8.114954781910729,
"rewards/rejected": -6.227767096625434,
"step": 330
},
{
"epoch": 0.917056277056277,
"grad_norm": 3.1912878671336515,
"kl": 276.59588623046875,
"learning_rate": 8.142057926911722e-08,
"logits/chosen": -54401212.44444445,
"logits/rejected": -86659062.85714285,
"logps/chosen": -228.07329644097223,
"logps/rejected": -178.87850516183036,
"loss": 0.247,
"rewards/chosen": -1.1525320476955838,
"rewards/margins": 4.903115666101849,
"rewards/rejected": -6.055647713797433,
"step": 331
},
{
"epoch": 0.9198268398268399,
"grad_norm": 3.4518885090842457,
"kl": 179.40066528320312,
"learning_rate": 7.593029257027956e-08,
"logits/chosen": -81397880.0,
"logits/rejected": -65295944.0,
"logps/chosen": -387.6104736328125,
"logps/rejected": -264.7943420410156,
"loss": 0.2784,
"rewards/chosen": 0.17824554443359375,
"rewards/margins": 7.819864273071289,
"rewards/rejected": -7.641618728637695,
"step": 332
},
{
"epoch": 0.9225974025974026,
"grad_norm": 3.240667722331457,
"kl": 273.7913818359375,
"learning_rate": 7.062877980932914e-08,
"logits/chosen": -69872288.0,
"logits/rejected": -134056736.0,
"logps/chosen": -258.5987548828125,
"logps/rejected": -230.22084045410156,
"loss": 0.2929,
"rewards/chosen": 0.8421229124069214,
"rewards/margins": 10.753296732902527,
"rewards/rejected": -9.911173820495605,
"step": 333
},
{
"epoch": 0.9253679653679654,
"grad_norm": 2.999389757174662,
"kl": 398.09234619140625,
"learning_rate": 6.551645384049898e-08,
"logits/chosen": -50461835.63636363,
"logits/rejected": -68490873.6,
"logps/chosen": -184.9519708806818,
"logps/rejected": -173.1832763671875,
"loss": 0.3399,
"rewards/chosen": 0.13307727466930042,
"rewards/margins": 7.3943916667591445,
"rewards/rejected": -7.261314392089844,
"step": 334
},
{
"epoch": 0.9281385281385282,
"grad_norm": 4.041013131462937,
"kl": 291.8245849609375,
"learning_rate": 6.059371278513942e-08,
"logits/chosen": -80652544.0,
"logits/rejected": -60337960.0,
"logps/chosen": -281.4577941894531,
"logps/rejected": -140.62286376953125,
"loss": 0.2597,
"rewards/chosen": 2.074185848236084,
"rewards/margins": 5.002105712890625,
"rewards/rejected": -2.927919864654541,
"step": 335
},
{
"epoch": 0.9309090909090909,
"grad_norm": 3.6881342907625374,
"kl": 281.2275085449219,
"learning_rate": 5.5860940000714016e-08,
"logits/chosen": -92601429.33333333,
"logits/rejected": -64497465.6,
"logps/chosen": -416.5006510416667,
"logps/rejected": -220.225244140625,
"loss": 0.334,
"rewards/chosen": 5.220906575520833,
"rewards/margins": 9.354856236775717,
"rewards/rejected": -4.133949661254883,
"step": 336
},
{
"epoch": 0.9336796536796537,
"grad_norm": 3.2732744144842303,
"kl": 135.52395629882812,
"learning_rate": 5.131850405094535e-08,
"logits/chosen": -66077589.333333336,
"logits/rejected": -41077076.0,
"logps/chosen": -265.8728434244792,
"logps/rejected": -235.4630889892578,
"loss": 0.2889,
"rewards/chosen": -3.7092259724934897,
"rewards/margins": 9.39453379313151,
"rewards/rejected": -13.103759765625,
"step": 337
},
{
"epoch": 0.9364502164502164,
"grad_norm": 4.262613045998365,
"kl": 127.82938385009766,
"learning_rate": 4.6966758677113865e-08,
"logits/chosen": -81822281.14285715,
"logits/rejected": -62049500.44444445,
"logps/chosen": -375.00167410714283,
"logps/rejected": -196.10477701822916,
"loss": 0.3371,
"rewards/chosen": -0.4809649331229074,
"rewards/margins": 6.648403319101485,
"rewards/rejected": -7.129368252224392,
"step": 338
},
{
"epoch": 0.9392207792207792,
"grad_norm": 2.7567661957637566,
"kl": 131.0496826171875,
"learning_rate": 4.280604277050932e-08,
"logits/chosen": -65746481.777777776,
"logits/rejected": -71039881.14285715,
"logps/chosen": -240.61138237847223,
"logps/rejected": -201.98568289620536,
"loss": 0.3823,
"rewards/chosen": -0.5426181687249078,
"rewards/margins": 1.088538616422623,
"rewards/rejected": -1.6311567851475306,
"step": 339
},
{
"epoch": 0.941991341991342,
"grad_norm": 3.810451018052984,
"kl": 188.156005859375,
"learning_rate": 3.88366803460416e-08,
"logits/chosen": -50966796.8,
"logits/rejected": -45116309.333333336,
"logps/chosen": -225.039013671875,
"logps/rejected": -156.35626220703125,
"loss": 0.3137,
"rewards/chosen": 2.0323509216308593,
"rewards/margins": 5.4068340301513675,
"rewards/rejected": -3.374483108520508,
"step": 340
},
{
"epoch": 0.9447619047619048,
"grad_norm": 4.023653516232581,
"kl": 261.86407470703125,
"learning_rate": 3.505898051700596e-08,
"logits/chosen": -60868138.666666664,
"logits/rejected": -103185574.4,
"logps/chosen": -187.67252604166666,
"logps/rejected": -203.42421875,
"loss": 0.2495,
"rewards/chosen": 0.37401819229125977,
"rewards/margins": 7.960676670074463,
"rewards/rejected": -7.586658477783203,
"step": 341
},
{
"epoch": 0.9475324675324676,
"grad_norm": 2.848154809605621,
"kl": 333.7825927734375,
"learning_rate": 3.147323747101222e-08,
"logits/chosen": -42793696.0,
"logits/rejected": -84218375.1111111,
"logps/chosen": -240.01213727678572,
"logps/rejected": -197.28348795572916,
"loss": 0.318,
"rewards/chosen": 3.4886975969587053,
"rewards/margins": 10.118460882277716,
"rewards/rejected": -6.629763285319011,
"step": 342
},
{
"epoch": 0.9503030303030303,
"grad_norm": 4.052057580524384,
"kl": 295.6441650390625,
"learning_rate": 2.8079730447073685e-08,
"logits/chosen": -74955200.0,
"logits/rejected": -61429477.333333336,
"logps/chosen": -338.2581787109375,
"logps/rejected": -154.8097941080729,
"loss": 0.27,
"rewards/chosen": -0.7783769607543946,
"rewards/margins": 2.1431963602701822,
"rewards/rejected": -2.9215733210245767,
"step": 343
},
{
"epoch": 0.9530735930735931,
"grad_norm": 2.588708817226156,
"kl": 252.98692321777344,
"learning_rate": 2.487872371386424e-08,
"logits/chosen": -71219725.71428572,
"logits/rejected": -135706026.66666666,
"logps/chosen": -215.78688267299108,
"logps/rejected": -177.41084798177084,
"loss": 0.2689,
"rewards/chosen": 0.5888588087899345,
"rewards/margins": 6.814543928418841,
"rewards/rejected": -6.225685119628906,
"step": 344
},
{
"epoch": 0.9558441558441558,
"grad_norm": 2.475512688880312,
"kl": 287.192626953125,
"learning_rate": 2.187046654913455e-08,
"logits/chosen": -48774666.666666664,
"logits/rejected": -141680844.8,
"logps/chosen": -129.57332356770834,
"logps/rejected": -232.22451171875,
"loss": 0.3501,
"rewards/chosen": -0.7337352434794108,
"rewards/margins": 7.57529567082723,
"rewards/rejected": -8.309030914306641,
"step": 345
},
{
"epoch": 0.9586147186147186,
"grad_norm": 2.345963196880215,
"kl": 178.36175537109375,
"learning_rate": 1.9055193220302582e-08,
"logits/chosen": -74306917.33333333,
"logits/rejected": -118739724.8,
"logps/chosen": -419.848876953125,
"logps/rejected": -203.6920166015625,
"loss": 0.2536,
"rewards/chosen": 4.156125386555989,
"rewards/margins": 12.88521931966146,
"rewards/rejected": -8.729093933105469,
"step": 346
},
{
"epoch": 0.9613852813852813,
"grad_norm": 3.535440511298158,
"kl": 232.6870574951172,
"learning_rate": 1.6433122966209303e-08,
"logits/chosen": -52279899.428571425,
"logits/rejected": -56559985.777777776,
"logps/chosen": -284.62510463169644,
"logps/rejected": -222.73961046006946,
"loss": 0.3007,
"rewards/chosen": -0.15411996841430664,
"rewards/margins": 8.343976921505398,
"rewards/rejected": -8.498096889919704,
"step": 347
},
{
"epoch": 0.9641558441558442,
"grad_norm": 2.7174709299005877,
"kl": 179.14004516601562,
"learning_rate": 1.4004459980045127e-08,
"logits/chosen": -51113212.44444445,
"logits/rejected": -70323492.57142857,
"logps/chosen": -293.8728841145833,
"logps/rejected": -163.85121372767858,
"loss": 0.2698,
"rewards/chosen": 0.6734237670898438,
"rewards/margins": 6.350269862583706,
"rewards/rejected": -5.676846095493862,
"step": 348
},
{
"epoch": 0.966926406926407,
"grad_norm": 2.7297787722044076,
"kl": 299.56707763671875,
"learning_rate": 1.1769393393448459e-08,
"logits/chosen": -51520972.0,
"logits/rejected": -111233944.0,
"logps/chosen": -255.2079620361328,
"logps/rejected": -269.553955078125,
"loss": 0.2803,
"rewards/chosen": -3.926908254623413,
"rewards/margins": 3.280689001083374,
"rewards/rejected": -7.207597255706787,
"step": 349
},
{
"epoch": 0.9696969696969697,
"grad_norm": 3.458726157837369,
"kl": 194.5485076904297,
"learning_rate": 9.728097261777202e-09,
"logits/chosen": -69388939.63636364,
"logits/rejected": -59349760.0,
"logps/chosen": -200.84548117897728,
"logps/rejected": -153.4105712890625,
"loss": 0.3073,
"rewards/chosen": -0.6452965302900835,
"rewards/margins": 6.0389731840653855,
"rewards/rejected": -6.684269714355469,
"step": 350
},
{
"epoch": 0.9724675324675325,
"grad_norm": 3.043597739972342,
"kl": 288.1514587402344,
"learning_rate": 7.88073055055516e-09,
"logits/chosen": -43270261.333333336,
"logits/rejected": -62705388.0,
"logps/chosen": -233.39241536458334,
"logps/rejected": -178.01548767089844,
"loss": 0.3206,
"rewards/chosen": -0.9711050192515055,
"rewards/margins": 4.855683883031209,
"rewards/rejected": -5.826788902282715,
"step": 351
},
{
"epoch": 0.9752380952380952,
"grad_norm": 3.3453162638571334,
"kl": 273.7534484863281,
"learning_rate": 6.2274371230905405e-09,
"logits/chosen": -51228424.0,
"logits/rejected": -43057528.0,
"logps/chosen": -257.15313720703125,
"logps/rejected": -185.47024536132812,
"loss": 0.2676,
"rewards/chosen": 1.785893440246582,
"rewards/margins": 11.457650184631348,
"rewards/rejected": -9.671756744384766,
"step": 352
},
{
"epoch": 0.978008658008658,
"grad_norm": 2.82927755492849,
"kl": 410.8206481933594,
"learning_rate": 4.7683457292743705e-09,
"logits/chosen": -95908460.8,
"logits/rejected": -64794293.333333336,
"logps/chosen": -242.651708984375,
"logps/rejected": -233.58734130859375,
"loss": 0.2688,
"rewards/chosen": 4.084451293945312,
"rewards/margins": 13.245009994506836,
"rewards/rejected": -9.160558700561523,
"step": 353
},
{
"epoch": 0.9807792207792208,
"grad_norm": 3.9253391250529686,
"kl": 298.74639892578125,
"learning_rate": 3.503569995554068e-09,
"logits/chosen": -42239523.2,
"logits/rejected": -21779062.666666668,
"logps/chosen": -217.557080078125,
"logps/rejected": -178.1170654296875,
"loss": 0.3393,
"rewards/chosen": 1.7639793395996093,
"rewards/margins": 7.103317642211914,
"rewards/rejected": -5.339338302612305,
"step": 354
},
{
"epoch": 0.9835497835497835,
"grad_norm": 2.4850995996925946,
"kl": 195.93548583984375,
"learning_rate": 2.4332084160835766e-09,
"logits/chosen": -47525056.0,
"logits/rejected": -50351388.0,
"logps/chosen": -262.4061279296875,
"logps/rejected": -249.47586059570312,
"loss": 0.3643,
"rewards/chosen": -1.0697911977767944,
"rewards/margins": 4.326021790504456,
"rewards/rejected": -5.39581298828125,
"step": 355
},
{
"epoch": 0.9863203463203464,
"grad_norm": 3.454830902434755,
"kl": 169.8162841796875,
"learning_rate": 1.5573443450545012e-09,
"logits/chosen": -66981845.333333336,
"logits/rejected": -78846729.14285715,
"logps/chosen": -282.89678276909723,
"logps/rejected": -214.99260602678572,
"loss": 0.3501,
"rewards/chosen": 0.8235403696695963,
"rewards/margins": 7.977521169753302,
"rewards/rejected": -7.153980800083706,
"step": 356
},
{
"epoch": 0.9890909090909091,
"grad_norm": 3.5602617502447202,
"kl": 369.330078125,
"learning_rate": 8.760459902037998e-10,
"logits/chosen": -86534136.8888889,
"logits/rejected": -49817129.14285714,
"logps/chosen": -335.5671115451389,
"logps/rejected": -184.69977678571428,
"loss": 0.2598,
"rewards/chosen": 2.4023797776963978,
"rewards/margins": 9.863903651161799,
"rewards/rejected": -7.461523873465402,
"step": 357
},
{
"epoch": 0.9918614718614719,
"grad_norm": 3.652592934041048,
"kl": 289.38031005859375,
"learning_rate": 3.8936640750358856e-10,
"logits/chosen": -53824308.36363637,
"logits/rejected": -23989609.6,
"logps/chosen": -296.4788707386364,
"logps/rejected": -154.59755859375,
"loss": 0.2426,
"rewards/chosen": -2.885924599387429,
"rewards/margins": 2.8186373970725316,
"rewards/rejected": -5.704561996459961,
"step": 358
},
{
"epoch": 0.9946320346320346,
"grad_norm": 4.316881838802012,
"kl": 284.5806884765625,
"learning_rate": 9.734349702722468e-11,
"logits/chosen": -87349336.0,
"logits/rejected": -81604216.0,
"logps/chosen": -296.7097473144531,
"logps/rejected": -270.17901611328125,
"loss": 0.3314,
"rewards/chosen": -4.627019882202148,
"rewards/margins": 2.630732536315918,
"rewards/rejected": -7.257752418518066,
"step": 359
},
{
"epoch": 0.9974025974025974,
"grad_norm": 2.621055829790202,
"kl": 262.32708740234375,
"learning_rate": 0.0,
"logits/chosen": -90589222.4,
"logits/rejected": -62416917.333333336,
"logps/chosen": -352.624951171875,
"logps/rejected": -277.27325439453125,
"loss": 0.3122,
"rewards/chosen": 0.6123371124267578,
"rewards/margins": 8.145875930786133,
"rewards/rejected": -7.533538818359375,
"step": 360
},
{
"epoch": 0.9974025974025974,
"eval_logits/chosen": -69679219.2,
"eval_logits/rejected": -34451756.137931034,
"eval_logps/chosen": -343.90514322916664,
"eval_logps/rejected": -233.7683694773707,
"eval_loss": 0.30314239859580994,
"eval_rewards/chosen": 1.5421129862467449,
"eval_rewards/margins": 8.846664735640603,
"eval_rewards/rejected": -7.304551749393858,
"eval_runtime": 375.3852,
"eval_samples_per_second": 1.244,
"eval_steps_per_second": 0.157,
"kl": 1080.3172607421875,
"step": 360
},
{
"epoch": 0.9974025974025974,
"step": 360,
"total_flos": 8.196772297546138e+16,
"train_loss": 0.31702631492581634,
"train_runtime": 54644.2785,
"train_samples_per_second": 0.845,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1,
"max_steps": 360,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 360,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.196772297546138e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}