FactAlign-gemma-2b-sft / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998706171561651,
"eval_steps": 200,
"global_step": 966,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010350627506792599,
"grad_norm": 36.896687952176364,
"kl": 0.006904316134750843,
"learning_rate": 2.126812117966759e-07,
"logps/chosen": -417.2011954066265,
"logps/rejected": -375.1744622564935,
"loss": 0.4997,
"rewards/chosen": -0.003417713455407016,
"rewards/margins": 0.002690252778750596,
"rewards/rejected": -0.006107966234157612,
"step": 10
},
{
"epoch": 0.020701255013585197,
"grad_norm": 36.344980205107255,
"kl": 0.004612588789314032,
"learning_rate": 3.096603651432316e-07,
"logps/chosen": -277.7969021267361,
"logps/rejected": -407.0654296875,
"loss": 0.4892,
"rewards/chosen": -0.04762052165137397,
"rewards/margins": 0.07216862355819856,
"rewards/rejected": -0.11978914520957253,
"step": 20
},
{
"epoch": 0.0310518825203778,
"grad_norm": 36.71259965236908,
"kl": 0.0,
"learning_rate": 3.602235071779947e-07,
"logps/chosen": -355.01975574712645,
"logps/rejected": -431.1890785530822,
"loss": 0.4788,
"rewards/chosen": -0.2371558485359981,
"rewards/margins": 0.25850485494016306,
"rewards/rejected": -0.49566070347616115,
"step": 30
},
{
"epoch": 0.041402510027170394,
"grad_norm": 38.99807359143627,
"kl": 0.016344498842954636,
"learning_rate": 3.9466076978545386e-07,
"logps/chosen": -347.33896998355266,
"logps/rejected": -404.498046875,
"loss": 0.4398,
"rewards/chosen": -0.5145087995027241,
"rewards/margins": 0.5702773885320601,
"rewards/rejected": -1.0847861880347842,
"step": 40
},
{
"epoch": 0.051753137533962996,
"grad_norm": 34.4366373643818,
"kl": 0.0,
"learning_rate": 4.208077428062608e-07,
"logps/chosen": -401.31200610632186,
"logps/rejected": -408.78579837328766,
"loss": 0.4906,
"rewards/chosen": -1.0610687431247754,
"rewards/margins": 0.6205490982038848,
"rewards/rejected": -1.6816178413286602,
"step": 50
},
{
"epoch": 0.0621037650407556,
"grad_norm": 40.6191164803455,
"kl": 0.0,
"learning_rate": 4.4189144263242994e-07,
"logps/chosen": -292.2215844131098,
"logps/rejected": -431.56860977564105,
"loss": 0.4649,
"rewards/chosen": -0.6987755007860137,
"rewards/margins": 0.5288097293321754,
"rewards/rejected": -1.227585230118189,
"step": 60
},
{
"epoch": 0.0724543925475482,
"grad_norm": 30.15586760876392,
"kl": 0.0,
"learning_rate": 4.5955828020052655e-07,
"logps/chosen": -355.0156035370879,
"logps/rejected": -401.8425045289855,
"loss": 0.4658,
"rewards/chosen": -0.5742165701729911,
"rewards/margins": 0.8813798008004576,
"rewards/rejected": -1.4555963709734487,
"step": 70
},
{
"epoch": 0.08280502005434079,
"grad_norm": 28.330817825249255,
"kl": 0.0,
"learning_rate": 4.7476282570257156e-07,
"logps/chosen": -394.8970209478022,
"logps/rejected": -387.24026268115944,
"loss": 0.4731,
"rewards/chosen": -0.7301217383080787,
"rewards/margins": 0.8021065933268852,
"rewards/rejected": -1.5322283316349639,
"step": 80
},
{
"epoch": 0.0931556475611334,
"grad_norm": 30.67181137678842,
"kl": 0.0,
"learning_rate": 4.881082258136016e-07,
"logps/chosen": -294.80318509615387,
"logps/rejected": -385.4679163490854,
"loss": 0.4613,
"rewards/chosen": -0.6603363232734876,
"rewards/margins": 0.5270779856001309,
"rewards/rejected": -1.1874143088736184,
"step": 90
},
{
"epoch": 0.10350627506792599,
"grad_norm": 31.541642718713373,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -356.5553466796875,
"logps/rejected": -427.236279296875,
"loss": 0.4466,
"rewards/chosen": -0.5922697067260743,
"rewards/margins": 0.8280625343322754,
"rewards/rejected": -1.4203322410583497,
"step": 100
},
{
"epoch": 0.11385690257471859,
"grad_norm": 28.188607938438196,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -349.71470997431504,
"logps/rejected": -431.29777298850576,
"loss": 0.445,
"rewards/chosen": -0.7172038457165025,
"rewards/margins": 0.5990428885518614,
"rewards/rejected": -1.316246734268364,
"step": 110
},
{
"epoch": 0.1242075300815112,
"grad_norm": 28.777259577988843,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -317.6811767578125,
"logps/rejected": -400.8,
"loss": 0.4577,
"rewards/chosen": -0.748396921157837,
"rewards/margins": 0.6573972225189209,
"rewards/rejected": -1.4057941436767578,
"step": 120
},
{
"epoch": 0.13455815758830378,
"grad_norm": 27.073111094683828,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -334.3945529513889,
"logps/rejected": -424.9839564732143,
"loss": 0.4654,
"rewards/chosen": -0.797715589735243,
"rewards/margins": 0.9449826437329489,
"rewards/rejected": -1.742698233468192,
"step": 130
},
{
"epoch": 0.1449087850950964,
"grad_norm": 29.19719970356803,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -368.06354166666665,
"logps/rejected": -338.69952566964287,
"loss": 0.487,
"rewards/chosen": -0.6288536071777344,
"rewards/margins": 0.4991338457380021,
"rewards/rejected": -1.1279874529157365,
"step": 140
},
{
"epoch": 0.155259412601889,
"grad_norm": 31.37561442050933,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -308.6735341061828,
"logps/rejected": -390.50953241604475,
"loss": 0.4775,
"rewards/chosen": -0.5661141180223034,
"rewards/margins": 0.7382167362512507,
"rewards/rejected": -1.304330854273554,
"step": 150
},
{
"epoch": 0.16561004010868158,
"grad_norm": 36.002916810630985,
"kl": 0.07424011081457138,
"learning_rate": 5e-07,
"logps/chosen": -395.24665850903614,
"logps/rejected": -402.71707589285717,
"loss": 0.4599,
"rewards/chosen": -0.5169859277196678,
"rewards/margins": 0.6465992892484227,
"rewards/rejected": -1.1635852169680905,
"step": 160
},
{
"epoch": 0.1759606676154742,
"grad_norm": 27.151915007789793,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -380.0856370192308,
"logps/rejected": -437.3331269054878,
"loss": 0.434,
"rewards/chosen": -0.5822516710330279,
"rewards/margins": 0.9586351846739081,
"rewards/rejected": -1.540886855706936,
"step": 170
},
{
"epoch": 0.1863112951222668,
"grad_norm": 22.70085892654007,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -352.72755281690144,
"logps/rejected": -438.0743504213483,
"loss": 0.4081,
"rewards/chosen": -0.6838695364938655,
"rewards/margins": 0.9942577951862609,
"rewards/rejected": -1.6781273316801264,
"step": 180
},
{
"epoch": 0.19666192262905938,
"grad_norm": 27.14667046044915,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -320.2009880514706,
"logps/rejected": -407.538046875,
"loss": 0.449,
"rewards/chosen": -1.0842503267176011,
"rewards/margins": 1.4791047758214615,
"rewards/rejected": -2.5633551025390626,
"step": 190
},
{
"epoch": 0.20701255013585199,
"grad_norm": 27.694967881656005,
"kl": 0.005686330609023571,
"learning_rate": 5e-07,
"logps/chosen": -337.79836856617646,
"logps/rejected": -427.5978645833333,
"loss": 0.4522,
"rewards/chosen": -0.9811132094439339,
"rewards/margins": 1.1809233302696076,
"rewards/rejected": -2.1620365397135415,
"step": 200
},
{
"epoch": 0.20701255013585199,
"eval_kl": 0.0010393437696620822,
"eval_logps/chosen": -345.2487181263858,
"eval_logps/rejected": -393.2139168432203,
"eval_loss": 0.44461360573768616,
"eval_rewards/chosen": -1.1207509516612388,
"eval_rewards/margins": 0.9445489068199584,
"eval_rewards/rejected": -2.0652998584811972,
"eval_runtime": 261.4133,
"eval_samples_per_second": 7.062,
"eval_steps_per_second": 3.531,
"step": 200
},
{
"epoch": 0.2173631776426446,
"grad_norm": 30.464898770807604,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -365.0289713541667,
"logps/rejected": -427.5989879261364,
"loss": 0.4016,
"rewards/chosen": -0.8537895944383409,
"rewards/margins": 1.3811903818689213,
"rewards/rejected": -2.234979976307262,
"step": 210
},
{
"epoch": 0.22771380514943718,
"grad_norm": 23.41466055625897,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -345.475933908046,
"logps/rejected": -393.412189640411,
"loss": 0.4443,
"rewards/chosen": -0.9509018645889458,
"rewards/margins": 1.2836219608605406,
"rewards/rejected": -2.2345238254494864,
"step": 220
},
{
"epoch": 0.23806443265622979,
"grad_norm": 26.98695760593119,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -323.52855282738096,
"logps/rejected": -349.73843544407896,
"loss": 0.4764,
"rewards/chosen": -0.9678686232793898,
"rewards/margins": 0.599584660733254,
"rewards/rejected": -1.5674532840126438,
"step": 230
},
{
"epoch": 0.2484150601630224,
"grad_norm": 26.51688363505412,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -339.72511772260276,
"logps/rejected": -392.6373024425287,
"loss": 0.4254,
"rewards/chosen": -0.7914297548058915,
"rewards/margins": 0.7201490352668456,
"rewards/rejected": -1.511578790072737,
"step": 240
},
{
"epoch": 0.258765687669815,
"grad_norm": 38.3631109147077,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -362.05623478084414,
"logps/rejected": -361.7480233433735,
"loss": 0.4123,
"rewards/chosen": -0.5259268129026735,
"rewards/margins": 1.3802443193803648,
"rewards/rejected": -1.9061711322830384,
"step": 250
},
{
"epoch": 0.26911631517660756,
"grad_norm": 28.783330131851603,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -318.40223672945206,
"logps/rejected": -472.43588362068965,
"loss": 0.4135,
"rewards/chosen": -0.8065869579576466,
"rewards/margins": 1.0660637146918686,
"rewards/rejected": -1.8726506726495151,
"step": 260
},
{
"epoch": 0.27946694268340017,
"grad_norm": 25.596794311830312,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -367.7001139322917,
"logps/rejected": -436.5064808238636,
"loss": 0.4093,
"rewards/chosen": -0.8103501001993815,
"rewards/margins": 1.0771059267448657,
"rewards/rejected": -1.887456026944247,
"step": 270
},
{
"epoch": 0.2898175701901928,
"grad_norm": 31.49395928287787,
"kl": 0.0206025131046772,
"learning_rate": 5e-07,
"logps/chosen": -456.68581081081084,
"logps/rejected": -399.91547056686045,
"loss": 0.4345,
"rewards/chosen": -1.0405741511164486,
"rewards/margins": 0.9193381711718871,
"rewards/rejected": -1.9599123222883357,
"step": 280
},
{
"epoch": 0.3001681976969854,
"grad_norm": 27.507156588516853,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -354.3474633487654,
"logps/rejected": -430.79647943037975,
"loss": 0.4246,
"rewards/chosen": -0.8167637954523534,
"rewards/margins": 1.6583199540531843,
"rewards/rejected": -2.4750837495055378,
"step": 290
},
{
"epoch": 0.310518825203778,
"grad_norm": 27.243120877089865,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -369.0854611280488,
"logps/rejected": -411.7598407451923,
"loss": 0.4515,
"rewards/chosen": -0.9885020372344226,
"rewards/margins": 1.07946980364849,
"rewards/rejected": -2.0679718408829126,
"step": 300
},
{
"epoch": 0.3208694527105706,
"grad_norm": 26.41738454716243,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -328.5754642210145,
"logps/rejected": -473.32679429945057,
"loss": 0.4,
"rewards/chosen": -0.8000211853911912,
"rewards/margins": 1.4340147676694202,
"rewards/rejected": -2.2340359530606113,
"step": 310
},
{
"epoch": 0.33122008021736316,
"grad_norm": 27.90947815196134,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -379.9532833614865,
"logps/rejected": -414.0909792877907,
"loss": 0.4206,
"rewards/chosen": -0.707832078675966,
"rewards/margins": 1.2218697929741975,
"rewards/rejected": -1.9297018716501635,
"step": 320
},
{
"epoch": 0.34157070772415576,
"grad_norm": 34.323093394556274,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -302.920654296875,
"logps/rejected": -365.2463107638889,
"loss": 0.4599,
"rewards/chosen": -0.8200391422618519,
"rewards/margins": 1.4710271334407303,
"rewards/rejected": -2.2910662757025824,
"step": 330
},
{
"epoch": 0.3519213352309484,
"grad_norm": 35.00194682599148,
"kl": 0.020750045776367188,
"learning_rate": 5e-07,
"logps/chosen": -370.7525414156627,
"logps/rejected": -407.6445819805195,
"loss": 0.4401,
"rewards/chosen": -0.4423764699912933,
"rewards/margins": 0.795768243571511,
"rewards/rejected": -1.2381447135628043,
"step": 340
},
{
"epoch": 0.362271962737741,
"grad_norm": 28.74449923281838,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -369.2329220655488,
"logps/rejected": -412.28390424679486,
"loss": 0.443,
"rewards/chosen": -0.5598751161156631,
"rewards/margins": 0.7612275152820732,
"rewards/rejected": -1.3211026313977363,
"step": 350
},
{
"epoch": 0.3726225902445336,
"grad_norm": 33.80893504974849,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -333.06757269965277,
"logps/rejected": -476.0582386363636,
"loss": 0.3942,
"rewards/chosen": -0.8373040093315972,
"rewards/margins": 1.5102612081200184,
"rewards/rejected": -2.3475652174516157,
"step": 360
},
{
"epoch": 0.3829732177513262,
"grad_norm": 23.093501234844034,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -344.985234375,
"logps/rejected": -478.50422794117645,
"loss": 0.3956,
"rewards/chosen": -1.264248046875,
"rewards/margins": 1.5773571059283087,
"rewards/rejected": -2.8416051528033086,
"step": 370
},
{
"epoch": 0.39332384525811875,
"grad_norm": 23.6165146626171,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -331.19694890202703,
"logps/rejected": -458.86123728197674,
"loss": 0.4153,
"rewards/chosen": -1.1533899049501162,
"rewards/margins": 1.4770645798563884,
"rewards/rejected": -2.6304544848065046,
"step": 380
},
{
"epoch": 0.40367447276491136,
"grad_norm": 24.677426766885045,
"kl": 0.045375823974609375,
"learning_rate": 5e-07,
"logps/chosen": -335.65542204483694,
"logps/rejected": -439.2108800551471,
"loss": 0.4565,
"rewards/chosen": -0.7980768784232761,
"rewards/margins": 1.7186397981765629,
"rewards/rejected": -2.516716676599839,
"step": 390
},
{
"epoch": 0.41402510027170397,
"grad_norm": 32.96461257238746,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -313.4358512581169,
"logps/rejected": -439.3407379518072,
"loss": 0.4056,
"rewards/chosen": -0.6841482187246347,
"rewards/margins": 1.2257567208579143,
"rewards/rejected": -1.909904939582549,
"step": 400
},
{
"epoch": 0.41402510027170397,
"eval_kl": 0.02226920612156391,
"eval_logps/chosen": -340.0967987804878,
"eval_logps/rejected": -387.47169623940675,
"eval_loss": 0.44011881947517395,
"eval_rewards/chosen": -0.6055575284090909,
"eval_rewards/margins": 0.885514011000999,
"eval_rewards/rejected": -1.49107153941009,
"eval_runtime": 260.8826,
"eval_samples_per_second": 7.076,
"eval_steps_per_second": 3.538,
"step": 400
},
{
"epoch": 0.4243757277784966,
"grad_norm": 26.751958968613145,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -372.82060185185185,
"logps/rejected": -399.93740110759495,
"loss": 0.4265,
"rewards/chosen": -0.5553302058467159,
"rewards/margins": 1.1236735458839013,
"rewards/rejected": -1.6790037517306171,
"step": 410
},
{
"epoch": 0.4347263552852892,
"grad_norm": 32.246235152731096,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -348.120418595679,
"logps/rejected": -402.0041287579114,
"loss": 0.4239,
"rewards/chosen": -0.7425044495382427,
"rewards/margins": 1.1616015940238618,
"rewards/rejected": -1.9041060435621044,
"step": 420
},
{
"epoch": 0.44507698279208174,
"grad_norm": 27.212254824473547,
"kl": 0.04713239520788193,
"learning_rate": 5e-07,
"logps/chosen": -330.6474880642361,
"logps/rejected": -389.7398792613636,
"loss": 0.3978,
"rewards/chosen": -0.9934198591444228,
"rewards/margins": 1.0819970525876441,
"rewards/rejected": -2.075416911732067,
"step": 430
},
{
"epoch": 0.45542761029887435,
"grad_norm": 24.894169784907362,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -376.64564344618054,
"logps/rejected": -515.4582297585227,
"loss": 0.3809,
"rewards/chosen": -0.8414801491631402,
"rewards/margins": 1.9854850094727796,
"rewards/rejected": -2.82696515863592,
"step": 440
},
{
"epoch": 0.46577823780566696,
"grad_norm": 33.54283688924568,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -373.34893120659723,
"logps/rejected": -440.4869495738636,
"loss": 0.4141,
"rewards/chosen": -1.090722295973036,
"rewards/margins": 1.4239928987291124,
"rewards/rejected": -2.5147151947021484,
"step": 450
},
{
"epoch": 0.47612886531245957,
"grad_norm": 25.873975620632326,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -297.38337725903614,
"logps/rejected": -368.12974330357144,
"loss": 0.4304,
"rewards/chosen": -0.642763620399567,
"rewards/margins": 1.4404461233478962,
"rewards/rejected": -2.0832097437474633,
"step": 460
},
{
"epoch": 0.4864794928192522,
"grad_norm": 31.22528359201901,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -364.6297576121795,
"logps/rejected": -414.5650247713415,
"loss": 0.397,
"rewards/chosen": -0.4758866138947316,
"rewards/margins": 1.4139596296147006,
"rewards/rejected": -1.889846243509432,
"step": 470
},
{
"epoch": 0.4968301203260448,
"grad_norm": 26.938362242757048,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -398.0412109375,
"logps/rejected": -487.195654296875,
"loss": 0.4055,
"rewards/chosen": -0.5108624458312988,
"rewards/margins": 1.3080674171447755,
"rewards/rejected": -1.8189298629760742,
"step": 480
},
{
"epoch": 0.5071807478328374,
"grad_norm": 29.10971517563742,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -390.874140625,
"logps/rejected": -379.295703125,
"loss": 0.4363,
"rewards/chosen": -0.9983409627278645,
"rewards/margins": 0.9311472754384957,
"rewards/rejected": -1.9294882381663603,
"step": 490
},
{
"epoch": 0.51753137533963,
"grad_norm": 27.404424128068055,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -330.4240828804348,
"logps/rejected": -394.64285714285717,
"loss": 0.3714,
"rewards/chosen": -0.724442468173262,
"rewards/margins": 1.7530021910493512,
"rewards/rejected": -2.477444659222613,
"step": 500
},
{
"epoch": 0.5278820028464226,
"grad_norm": 30.205347992720988,
"kl": 0.010777664370834827,
"learning_rate": 5e-07,
"logps/chosen": -371.4545238597973,
"logps/rejected": -425.5056776889535,
"loss": 0.4051,
"rewards/chosen": -0.7893987088590055,
"rewards/margins": 1.3341055749573099,
"rewards/rejected": -2.1235042838163154,
"step": 510
},
{
"epoch": 0.5382326303532151,
"grad_norm": 27.47044972378467,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -317.4935569324713,
"logps/rejected": -394.0045751284247,
"loss": 0.433,
"rewards/chosen": -0.9540053619735542,
"rewards/margins": 1.3108872161514458,
"rewards/rejected": -2.264892578125,
"step": 520
},
{
"epoch": 0.5485832578600077,
"grad_norm": 32.30343597091197,
"kl": 0.06133537366986275,
"learning_rate": 5e-07,
"logps/chosen": -377.94091796875,
"logps/rejected": -412.06171875,
"loss": 0.415,
"rewards/chosen": -0.42492337226867677,
"rewards/margins": 1.4800034999847413,
"rewards/rejected": -1.904926872253418,
"step": 530
},
{
"epoch": 0.5589338853668003,
"grad_norm": 34.615081184959564,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -326.7767721036585,
"logps/rejected": -424.0320012019231,
"loss": 0.4396,
"rewards/chosen": -0.4712153178889577,
"rewards/margins": 0.8699719880505454,
"rewards/rejected": -1.3411873059395032,
"step": 540
},
{
"epoch": 0.5692845128735929,
"grad_norm": 34.56564210195164,
"kl": 0.06511452049016953,
"learning_rate": 5e-07,
"logps/chosen": -407.66327617694805,
"logps/rejected": -371.1233998493976,
"loss": 0.375,
"rewards/chosen": -0.19861872784503096,
"rewards/margins": 1.8991040725361816,
"rewards/rejected": -2.0977228003812125,
"step": 550
},
{
"epoch": 0.5796351403803855,
"grad_norm": 26.904457572554023,
"kl": 0.14332695305347443,
"learning_rate": 5e-07,
"logps/chosen": -336.1937744140625,
"logps/rejected": -404.095556640625,
"loss": 0.4105,
"rewards/chosen": -0.39203429222106934,
"rewards/margins": 1.2247087955474854,
"rewards/rejected": -1.6167430877685547,
"step": 560
},
{
"epoch": 0.5899857678871782,
"grad_norm": 29.784323096457744,
"kl": 0.005803870968520641,
"learning_rate": 5e-07,
"logps/chosen": -300.69694346005156,
"logps/rejected": -407.36216517857144,
"loss": 0.4499,
"rewards/chosen": -0.4008376917888209,
"rewards/margins": 1.1861952923846664,
"rewards/rejected": -1.5870329841734871,
"step": 570
},
{
"epoch": 0.6003363953939708,
"grad_norm": 24.46799204128634,
"kl": 0.22002115845680237,
"learning_rate": 5e-07,
"logps/chosen": -347.5768229166667,
"logps/rejected": -443.0283717105263,
"loss": 0.4189,
"rewards/chosen": -0.3804002716427758,
"rewards/margins": 1.428496646403071,
"rewards/rejected": -1.808896918045847,
"step": 580
},
{
"epoch": 0.6106870229007634,
"grad_norm": 30.275312642751995,
"kl": 0.13701924681663513,
"learning_rate": 5e-07,
"logps/chosen": -355.59707919034093,
"logps/rejected": -430.7814670138889,
"loss": 0.4381,
"rewards/chosen": -0.5888070193204012,
"rewards/margins": 1.315426489319464,
"rewards/rejected": -1.9042335086398654,
"step": 590
},
{
"epoch": 0.621037650407556,
"grad_norm": 32.60832471668693,
"kl": 0.02227201499044895,
"learning_rate": 5e-07,
"logps/chosen": -350.7683919270833,
"logps/rejected": -444.4885896381579,
"loss": 0.4163,
"rewards/chosen": -0.5356872195289248,
"rewards/margins": 1.2915597977793605,
"rewards/rejected": -1.8272470173082853,
"step": 600
},
{
"epoch": 0.621037650407556,
"eval_kl": 0.009310548193752766,
"eval_logps/chosen": -339.2911238913525,
"eval_logps/rejected": -389.9666313559322,
"eval_loss": 0.418056845664978,
"eval_rewards/chosen": -0.5249900056623302,
"eval_rewards/margins": 1.2155782523322407,
"eval_rewards/rejected": -1.740568257994571,
"eval_runtime": 261.1582,
"eval_samples_per_second": 7.069,
"eval_steps_per_second": 3.534,
"step": 600
},
{
"epoch": 0.6313882779143486,
"grad_norm": 22.18017793377208,
"kl": 0.039247892796993256,
"learning_rate": 5e-07,
"logps/chosen": -383.04136439732144,
"logps/rejected": -429.54263466282896,
"loss": 0.4132,
"rewards/chosen": -0.4417642865862165,
"rewards/margins": 1.5167117298097539,
"rewards/rejected": -1.9584760163959705,
"step": 610
},
{
"epoch": 0.6417389054211412,
"grad_norm": 29.924696721027633,
"kl": 0.03644561767578125,
"learning_rate": 5e-07,
"logps/chosen": -382.4176720727848,
"logps/rejected": -498.68258101851853,
"loss": 0.3878,
"rewards/chosen": -0.47994140431850774,
"rewards/margins": 1.639255923095169,
"rewards/rejected": -2.119197327413677,
"step": 620
},
{
"epoch": 0.6520895329279337,
"grad_norm": 28.116353403382174,
"kl": 0.0513916015625,
"learning_rate": 5e-07,
"logps/chosen": -311.66650390625,
"logps/rejected": -419.412939453125,
"loss": 0.3978,
"rewards/chosen": -0.5402119159698486,
"rewards/margins": 1.4218003749847412,
"rewards/rejected": -1.9620122909545898,
"step": 630
},
{
"epoch": 0.6624401604347263,
"grad_norm": 30.136959971403833,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -367.5465806934931,
"logps/rejected": -451.5183638649425,
"loss": 0.4204,
"rewards/chosen": -1.145416991351402,
"rewards/margins": 0.9065032307885044,
"rewards/rejected": -2.0519202221399064,
"step": 640
},
{
"epoch": 0.6727907879415189,
"grad_norm": 25.55983506887128,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -331.976943597561,
"logps/rejected": -408.9071514423077,
"loss": 0.4119,
"rewards/chosen": -0.9971130185010957,
"rewards/margins": 1.7702796535241447,
"rewards/rejected": -2.7673926720252404,
"step": 650
},
{
"epoch": 0.6831414154483115,
"grad_norm": 25.82328491415139,
"kl": 0.008263682946562767,
"learning_rate": 5e-07,
"logps/chosen": -361.71830610795456,
"logps/rejected": -486.77197265625,
"loss": 0.4233,
"rewards/chosen": -0.7347448522394354,
"rewards/margins": 2.0849816678750392,
"rewards/rejected": -2.8197265201144748,
"step": 660
},
{
"epoch": 0.6934920429551041,
"grad_norm": 26.68136550686645,
"kl": 0.09514617919921875,
"learning_rate": 5e-07,
"logps/chosen": -355.8723958333333,
"logps/rejected": -486.3736672794118,
"loss": 0.3773,
"rewards/chosen": -0.6042455546061198,
"rewards/margins": 2.0892714347091377,
"rewards/rejected": -2.6935169893152575,
"step": 670
},
{
"epoch": 0.7038426704618967,
"grad_norm": 21.168784955584055,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -330.87958757267444,
"logps/rejected": -369.6824588260135,
"loss": 0.4207,
"rewards/chosen": -0.6469083830367687,
"rewards/margins": 1.6826272301521037,
"rewards/rejected": -2.3295356131888725,
"step": 680
},
{
"epoch": 0.7141932979686894,
"grad_norm": 31.900093457461022,
"kl": 0.08187294006347656,
"learning_rate": 5e-07,
"logps/chosen": -417.3225528492647,
"logps/rejected": -440.6748471467391,
"loss": 0.4112,
"rewards/chosen": -0.7309647728415096,
"rewards/margins": 1.2705956041965338,
"rewards/rejected": -2.0015603770380435,
"step": 690
},
{
"epoch": 0.724543925475482,
"grad_norm": 27.570529431002825,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -337.6109751506024,
"logps/rejected": -445.5989752435065,
"loss": 0.4132,
"rewards/chosen": -0.41143114021025506,
"rewards/margins": 1.2929300198698992,
"rewards/rejected": -1.7043611600801543,
"step": 700
},
{
"epoch": 0.7348945529822746,
"grad_norm": 25.994342612564424,
"kl": 0.004410457797348499,
"learning_rate": 5e-07,
"logps/chosen": -273.53585737179486,
"logps/rejected": -435.5107660060976,
"loss": 0.392,
"rewards/chosen": -0.5441466111403245,
"rewards/margins": 1.5854568910867144,
"rewards/rejected": -2.129603502227039,
"step": 710
},
{
"epoch": 0.7452451804890672,
"grad_norm": 28.362840310964046,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -332.836171875,
"logps/rejected": -474.0301470588235,
"loss": 0.3782,
"rewards/chosen": -0.7581790669759114,
"rewards/margins": 1.799709726969401,
"rewards/rejected": -2.5578887939453123,
"step": 720
},
{
"epoch": 0.7555958079958598,
"grad_norm": 21.561327202318306,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -275.46072571536143,
"logps/rejected": -412.22519277597405,
"loss": 0.3875,
"rewards/chosen": -0.5184578493417028,
"rewards/margins": 1.9156346847111583,
"rewards/rejected": -2.434092534052861,
"step": 730
},
{
"epoch": 0.7659464355026524,
"grad_norm": 18.54675354753111,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -368.63963607594934,
"logps/rejected": -425.9934895833333,
"loss": 0.4036,
"rewards/chosen": -0.8795772504202927,
"rewards/margins": 1.8751527858089703,
"rewards/rejected": -2.754730036229263,
"step": 740
},
{
"epoch": 0.7762970630094449,
"grad_norm": 27.548342849043514,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -395.18419471153845,
"logps/rejected": -404.6112804878049,
"loss": 0.4149,
"rewards/chosen": -0.8401767046023638,
"rewards/margins": 1.619423790526733,
"rewards/rejected": -2.459600495129097,
"step": 750
},
{
"epoch": 0.7866476905162375,
"grad_norm": 26.78674064602181,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -361.57657251602564,
"logps/rejected": -418.945693597561,
"loss": 0.4087,
"rewards/chosen": -0.6369634775015024,
"rewards/margins": 1.3416846384474304,
"rewards/rejected": -1.9786481159489329,
"step": 760
},
{
"epoch": 0.7969983180230301,
"grad_norm": 25.603994333749306,
"kl": 0.02446603775024414,
"learning_rate": 5e-07,
"logps/chosen": -319.98974609375,
"logps/rejected": -407.2808314732143,
"loss": 0.39,
"rewards/chosen": -0.4739310615941098,
"rewards/margins": 1.6105410496991381,
"rewards/rejected": -2.084472111293248,
"step": 770
},
{
"epoch": 0.8073489455298227,
"grad_norm": 26.587028848139315,
"kl": 0.04417114332318306,
"learning_rate": 5e-07,
"logps/chosen": -318.715,
"logps/rejected": -429.6086856617647,
"loss": 0.3786,
"rewards/chosen": -0.553302001953125,
"rewards/margins": 1.9189411836511947,
"rewards/rejected": -2.4722431856043197,
"step": 780
},
{
"epoch": 0.8176995730366153,
"grad_norm": 23.329347974769387,
"kl": 0.10457019507884979,
"learning_rate": 5e-07,
"logps/chosen": -382.36054180194805,
"logps/rejected": -415.20811370481925,
"loss": 0.3949,
"rewards/chosen": -0.9674345734831574,
"rewards/margins": 2.05568157274237,
"rewards/rejected": -3.023116146225527,
"step": 790
},
{
"epoch": 0.8280502005434079,
"grad_norm": 25.709767109519216,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -314.2330375339674,
"logps/rejected": -512.2797564338235,
"loss": 0.4158,
"rewards/chosen": -0.7708018759022588,
"rewards/margins": 2.3479348399754985,
"rewards/rejected": -3.1187367158777572,
"step": 800
},
{
"epoch": 0.8280502005434079,
"eval_kl": 0.006037264596670866,
"eval_logps/chosen": -341.35116407982264,
"eval_logps/rejected": -395.00337658898303,
"eval_loss": 0.41274696588516235,
"eval_rewards/chosen": -0.7309938418098669,
"eval_rewards/margins": 1.5132525602193967,
"eval_rewards/rejected": -2.2442464020292636,
"eval_runtime": 260.9097,
"eval_samples_per_second": 7.075,
"eval_steps_per_second": 3.538,
"step": 800
},
{
"epoch": 0.8384008280502006,
"grad_norm": 35.370868792942815,
"kl": 0.038701437413692474,
"learning_rate": 5e-07,
"logps/chosen": -300.92038143382354,
"logps/rejected": -437.9970833333333,
"loss": 0.4117,
"rewards/chosen": -0.6198445039636948,
"rewards/margins": 1.6185169055415134,
"rewards/rejected": -2.2383614095052082,
"step": 810
},
{
"epoch": 0.8487514555569932,
"grad_norm": 27.200562796310017,
"kl": 0.03499946743249893,
"learning_rate": 5e-07,
"logps/chosen": -420.0014134457237,
"logps/rejected": -474.64820498511904,
"loss": 0.3712,
"rewards/chosen": -0.21315298582378187,
"rewards/margins": 2.169883309749135,
"rewards/rejected": -2.3830362955729165,
"step": 820
},
{
"epoch": 0.8591020830637858,
"grad_norm": 22.838766986028332,
"kl": 0.08836288750171661,
"learning_rate": 5e-07,
"logps/chosen": -410.17025862068965,
"logps/rejected": -479.935466609589,
"loss": 0.4239,
"rewards/chosen": -0.5182619642937321,
"rewards/margins": 1.6473036309411782,
"rewards/rejected": -2.16556559523491,
"step": 830
},
{
"epoch": 0.8694527105705784,
"grad_norm": 31.190543721407206,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -392.1194540895062,
"logps/rejected": -380.65261570411394,
"loss": 0.4195,
"rewards/chosen": -0.5796352904519917,
"rewards/margins": 1.1813462089422924,
"rewards/rejected": -1.760981499394284,
"step": 840
},
{
"epoch": 0.879803338077371,
"grad_norm": 27.922649096371728,
"kl": 0.05376587063074112,
"learning_rate": 5e-07,
"logps/chosen": -342.7175263554217,
"logps/rejected": -405.21989143668833,
"loss": 0.3893,
"rewards/chosen": -0.3039788625326501,
"rewards/margins": 1.6599957309890625,
"rewards/rejected": -1.9639745935217126,
"step": 850
},
{
"epoch": 0.8901539655841635,
"grad_norm": 27.062117676313864,
"kl": 0.029529189690947533,
"learning_rate": 5e-07,
"logps/chosen": -339.9334415584416,
"logps/rejected": -389.23075112951807,
"loss": 0.4056,
"rewards/chosen": -0.7721986646776076,
"rewards/margins": 1.4165069634991618,
"rewards/rejected": -2.1887056281767694,
"step": 860
},
{
"epoch": 0.9005045930909561,
"grad_norm": 26.777659950643177,
"kl": 0.027013396844267845,
"learning_rate": 5e-07,
"logps/chosen": -359.30659239969134,
"logps/rejected": -432.05760482594934,
"loss": 0.4124,
"rewards/chosen": -1.0398042466905382,
"rewards/margins": 1.4762831525628244,
"rewards/rejected": -2.5160873992533626,
"step": 870
},
{
"epoch": 0.9108552205977487,
"grad_norm": 26.582657305921924,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -347.1331422483766,
"logps/rejected": -441.9758565512048,
"loss": 0.3819,
"rewards/chosen": -0.478736332484654,
"rewards/margins": 2.0877041119101123,
"rewards/rejected": -2.5664404443947664,
"step": 880
},
{
"epoch": 0.9212058481045413,
"grad_norm": 26.122755291889042,
"kl": 0.0009471893426962197,
"learning_rate": 5e-07,
"logps/chosen": -334.71205003955697,
"logps/rejected": -477.93663194444446,
"loss": 0.3907,
"rewards/chosen": -0.6639707058290892,
"rewards/margins": 2.118597389813754,
"rewards/rejected": -2.7825680956428434,
"step": 890
},
{
"epoch": 0.9315564756113339,
"grad_norm": 27.631179779669328,
"kl": 0.03726501390337944,
"learning_rate": 5e-07,
"logps/chosen": -354.04930971746575,
"logps/rejected": -398.39897629310343,
"loss": 0.3698,
"rewards/chosen": -0.6091255292500535,
"rewards/margins": 1.8384917494519042,
"rewards/rejected": -2.4476172787019577,
"step": 900
},
{
"epoch": 0.9419071031181265,
"grad_norm": 31.5668708111869,
"kl": 0.0027565001510083675,
"learning_rate": 5e-07,
"logps/chosen": -339.08727254746833,
"logps/rejected": -447.73466435185185,
"loss": 0.3717,
"rewards/chosen": -0.4057273864746094,
"rewards/margins": 2.1597686108247736,
"rewards/rejected": -2.565495997299383,
"step": 910
},
{
"epoch": 0.9522577306249191,
"grad_norm": 23.52230221185674,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -331.8610026041667,
"logps/rejected": -437.75386186079544,
"loss": 0.3801,
"rewards/chosen": -0.9904574288262261,
"rewards/margins": 1.9297606053978504,
"rewards/rejected": -2.9202180342240767,
"step": 920
},
{
"epoch": 0.9626083581317117,
"grad_norm": 21.084240235211357,
"kl": 0.005317878909409046,
"learning_rate": 5e-07,
"logps/chosen": -443.4196810787671,
"logps/rejected": -434.53286637931035,
"loss": 0.371,
"rewards/chosen": -0.6944470340258455,
"rewards/margins": 2.5923810180125884,
"rewards/rejected": -3.286828052038434,
"step": 930
},
{
"epoch": 0.9729589856385044,
"grad_norm": 20.680591232042584,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -323.59056991185895,
"logps/rejected": -446.6455792682927,
"loss": 0.3827,
"rewards/chosen": -0.837760729667468,
"rewards/margins": 2.667289185181046,
"rewards/rejected": -3.505049914848514,
"step": 940
},
{
"epoch": 0.983309613145297,
"grad_norm": 27.520713861963205,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -321.9452868009868,
"logps/rejected": -363.61830357142856,
"loss": 0.3648,
"rewards/chosen": -0.34084164468865646,
"rewards/margins": 1.999740703362869,
"rewards/rejected": -2.3405823480515253,
"step": 950
},
{
"epoch": 0.9936602406520896,
"grad_norm": 27.18091953431505,
"kl": 0.0,
"learning_rate": 5e-07,
"logps/chosen": -348.9962173655063,
"logps/rejected": -325.42737268518516,
"loss": 0.3878,
"rewards/chosen": -0.4670451200461086,
"rewards/margins": 1.8737544706415845,
"rewards/rejected": -2.340799590687693,
"step": 960
},
{
"epoch": 0.9998706171561651,
"step": 966,
"total_flos": 0.0,
"train_loss": 0.420091498218955,
"train_runtime": 6442.4359,
"train_samples_per_second": 2.399,
"train_steps_per_second": 0.15
}
],
"logging_steps": 10,
"max_steps": 966,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
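
For reference, a minimal Python sketch for inspecting the state above, assuming the JSON is saved locally as trainer_state.json (the filename and paths are assumptions, not part of the original file). It reads log_history, prints the eval checkpoints logged every 200 steps, and the end-of-training summary row; only the key names that actually appear in the JSON are used.

import json

# Load the trainer state (assumed to be saved as trainer_state.json in the working directory).
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-10-step training rows carry "loss"; periodic eval rows carry "eval_loss".
train_rows = [r for r in state["log_history"] if "loss" in r]
eval_rows = [r for r in state["log_history"] if "eval_loss" in r]

for r in eval_rows:
    print(f"step {r['step']:>4}: eval_loss={r['eval_loss']:.4f}, "
          f"eval_margin={r['eval_rewards/margins']:.3f}")

# The final entry in log_history is the end-of-training summary (train_loss, runtime, etc.).
final = state["log_history"][-1]
print(f"final: step={final['step']}, train_loss={final['train_loss']:.4f}, "
      f"runtime={final['train_runtime']:.1f}s")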