llama-3-8b-instruct-armorm-judge / trainer_state.json
simonycl's picture
Upload folder using huggingface_hub
f6eb862 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9979342973279136,
"eval_steps": 400,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002132338242153662,
"grad_norm": 4.209297607323075,
"learning_rate": 1.0638297872340425e-08,
"logits/chosen": -0.5133028626441956,
"logits/rejected": -0.44742655754089355,
"logps/chosen": -137.54428100585938,
"logps/rejected": -153.15798950195312,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010661691210768308,
"grad_norm": 3.882884920695959,
"learning_rate": 5.3191489361702123e-08,
"logits/chosen": -0.5493287444114685,
"logits/rejected": -0.5353066325187683,
"logps/chosen": -149.18487548828125,
"logps/rejected": -152.49844360351562,
"loss": 0.6933,
"rewards/accuracies": 0.3984375,
"rewards/chosen": 5.994962702970952e-05,
"rewards/margins": 0.00012374535435810685,
"rewards/rejected": -6.379572732839733e-05,
"step": 5
},
{
"epoch": 0.021323382421536616,
"grad_norm": 4.187160307696768,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -0.5537915229797363,
"logits/rejected": -0.49435940384864807,
"logps/chosen": -148.3112030029297,
"logps/rejected": -148.7774658203125,
"loss": 0.6931,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.00019158302166033536,
"rewards/margins": -4.862697096541524e-05,
"rewards/rejected": 0.0002402100944891572,
"step": 10
},
{
"epoch": 0.031985073632304926,
"grad_norm": 4.989515444992066,
"learning_rate": 1.5957446808510638e-07,
"logits/chosen": -0.641636312007904,
"logits/rejected": -0.5020254850387573,
"logps/chosen": -145.27212524414062,
"logps/rejected": -152.85948181152344,
"loss": 0.693,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -8.565824100514874e-05,
"rewards/margins": 2.2089341655373573e-05,
"rewards/rejected": -0.00010774763359222561,
"step": 15
},
{
"epoch": 0.04264676484307323,
"grad_norm": 4.2620983128908385,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.5836353898048401,
"logits/rejected": -0.513951301574707,
"logps/chosen": -136.21817016601562,
"logps/rejected": -140.47598266601562,
"loss": 0.693,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.00023734460410196334,
"rewards/margins": 0.0010727389017120004,
"rewards/rejected": -0.0008353941957466304,
"step": 20
},
{
"epoch": 0.05330845605384154,
"grad_norm": 4.037816980879404,
"learning_rate": 2.659574468085106e-07,
"logits/chosen": -0.5967869758605957,
"logits/rejected": -0.552001953125,
"logps/chosen": -164.9960174560547,
"logps/rejected": -159.42225646972656,
"loss": 0.693,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0004635151126421988,
"rewards/margins": 0.00024214605218730867,
"rewards/rejected": 0.0002213690459029749,
"step": 25
},
{
"epoch": 0.06397014726460985,
"grad_norm": 3.926673567554115,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.6094973683357239,
"logits/rejected": -0.5662705898284912,
"logps/chosen": -152.1143798828125,
"logps/rejected": -156.60104370117188,
"loss": 0.6927,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.00032081748940981925,
"rewards/margins": 0.001090517034754157,
"rewards/rejected": -0.0014113344950601459,
"step": 30
},
{
"epoch": 0.07463183847537816,
"grad_norm": 3.9457303398915133,
"learning_rate": 3.7234042553191484e-07,
"logits/chosen": -0.6680720448493958,
"logits/rejected": -0.5494471788406372,
"logps/chosen": -142.37496948242188,
"logps/rejected": -141.9342498779297,
"loss": 0.6925,
"rewards/accuracies": 0.5625,
"rewards/chosen": 7.808269583620131e-05,
"rewards/margins": 0.0020259125158190727,
"rewards/rejected": -0.0019478298490867019,
"step": 35
},
{
"epoch": 0.08529352968614647,
"grad_norm": 3.955885053818346,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.6476858854293823,
"logits/rejected": -0.5677641034126282,
"logps/chosen": -144.7392578125,
"logps/rejected": -150.64984130859375,
"loss": 0.692,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.0007050691056065261,
"rewards/margins": 0.0024024732410907745,
"rewards/rejected": -0.003107542172074318,
"step": 40
},
{
"epoch": 0.09595522089691477,
"grad_norm": 4.193445815112738,
"learning_rate": 4.787234042553192e-07,
"logits/chosen": -0.5770394206047058,
"logits/rejected": -0.496532142162323,
"logps/chosen": -150.3260498046875,
"logps/rejected": -146.87594604492188,
"loss": 0.6912,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.0027928086929023266,
"rewards/margins": 0.004344488959759474,
"rewards/rejected": -0.0071372976526618,
"step": 45
},
{
"epoch": 0.10661691210768308,
"grad_norm": 4.127676285889427,
"learning_rate": 4.999373573764186e-07,
"logits/chosen": -0.6215689182281494,
"logits/rejected": -0.509456992149353,
"logps/chosen": -156.27560424804688,
"logps/rejected": -164.31430053710938,
"loss": 0.6899,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.004007105715572834,
"rewards/margins": 0.008976086974143982,
"rewards/rejected": -0.012983192689716816,
"step": 50
},
{
"epoch": 0.11727860331845139,
"grad_norm": 3.9160373040316245,
"learning_rate": 4.995546550233241e-07,
"logits/chosen": -0.6048154234886169,
"logits/rejected": -0.5707536935806274,
"logps/chosen": -150.3190460205078,
"logps/rejected": -151.73095703125,
"loss": 0.6894,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.008075769990682602,
"rewards/margins": 0.006209281273186207,
"rewards/rejected": -0.014285050332546234,
"step": 55
},
{
"epoch": 0.1279402945292197,
"grad_norm": 4.077147191981429,
"learning_rate": 4.988245838331339e-07,
"logits/chosen": -0.5681597590446472,
"logits/rejected": -0.49796080589294434,
"logps/chosen": -156.58375549316406,
"logps/rejected": -162.067138671875,
"loss": 0.6874,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.015376018360257149,
"rewards/margins": 0.01199757494032383,
"rewards/rejected": -0.02737359330058098,
"step": 60
},
{
"epoch": 0.138601985739988,
"grad_norm": 4.03433737830531,
"learning_rate": 4.977481600320545e-07,
"logits/chosen": -0.6257273554801941,
"logits/rejected": -0.5969006419181824,
"logps/chosen": -153.68531799316406,
"logps/rejected": -153.58663940429688,
"loss": 0.6851,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.021818330511450768,
"rewards/margins": 0.014782066456973553,
"rewards/rejected": -0.036600399762392044,
"step": 65
},
{
"epoch": 0.14926367695075632,
"grad_norm": 4.603352859826981,
"learning_rate": 4.963268819535228e-07,
"logits/chosen": -0.6294292211532593,
"logits/rejected": -0.4907303750514984,
"logps/chosen": -154.8817901611328,
"logps/rejected": -154.6444549560547,
"loss": 0.6825,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.03498289734125137,
"rewards/margins": 0.0258068535476923,
"rewards/rejected": -0.06078975275158882,
"step": 70
},
{
"epoch": 0.1599253681615246,
"grad_norm": 4.546908129493846,
"learning_rate": 4.945627279525942e-07,
"logits/chosen": -0.5467456579208374,
"logits/rejected": -0.4726549983024597,
"logps/chosen": -142.3949432373047,
"logps/rejected": -150.24319458007812,
"loss": 0.6789,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.04987934231758118,
"rewards/margins": 0.025978704914450645,
"rewards/rejected": -0.07585804164409637,
"step": 75
},
{
"epoch": 0.17058705937229293,
"grad_norm": 4.609197926565239,
"learning_rate": 4.924581536521611e-07,
"logits/chosen": -0.6952506303787231,
"logits/rejected": -0.5784528851509094,
"logps/chosen": -157.6682891845703,
"logps/rejected": -166.3611297607422,
"loss": 0.6744,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.07680492848157883,
"rewards/margins": 0.0350065752863884,
"rewards/rejected": -0.11181151866912842,
"step": 80
},
{
"epoch": 0.18124875058306122,
"grad_norm": 5.2243481961036915,
"learning_rate": 4.900160885248362e-07,
"logits/chosen": -0.6026470065116882,
"logits/rejected": -0.5226645469665527,
"logps/chosen": -157.77618408203125,
"logps/rejected": -168.16580200195312,
"loss": 0.672,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.10381762683391571,
"rewards/margins": 0.04510679468512535,
"rewards/rejected": -0.14892444014549255,
"step": 85
},
{
"epoch": 0.19191044179382954,
"grad_norm": 5.426637382074236,
"learning_rate": 4.872399318152593e-07,
"logits/chosen": -0.5520131587982178,
"logits/rejected": -0.5407645106315613,
"logps/chosen": -169.11874389648438,
"logps/rejected": -171.12612915039062,
"loss": 0.6609,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.15968191623687744,
"rewards/margins": 0.05249009653925896,
"rewards/rejected": -0.2121720016002655,
"step": 90
},
{
"epoch": 0.20257213300459787,
"grad_norm": 5.259574320459566,
"learning_rate": 4.841335478085015e-07,
"logits/chosen": -0.602705180644989,
"logits/rejected": -0.5497740507125854,
"logps/chosen": -160.9965057373047,
"logps/rejected": -174.87588500976562,
"loss": 0.6548,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1887643039226532,
"rewards/margins": 0.09653543680906296,
"rewards/rejected": -0.28529977798461914,
"step": 95
},
{
"epoch": 0.21323382421536616,
"grad_norm": 5.216332894677162,
"learning_rate": 4.807012604511541e-07,
"logits/chosen": -0.5497472882270813,
"logits/rejected": -0.4645184576511383,
"logps/chosen": -184.96392822265625,
"logps/rejected": -191.80685424804688,
"loss": 0.6466,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2781105637550354,
"rewards/margins": 0.07820834219455719,
"rewards/rejected": -0.3563188910484314,
"step": 100
},
{
"epoch": 0.22389551542613448,
"grad_norm": 5.528705114015297,
"learning_rate": 4.769478473325907e-07,
"logits/chosen": -0.5338613986968994,
"logits/rejected": -0.51846843957901,
"logps/chosen": -192.13998413085938,
"logps/rejected": -201.9773406982422,
"loss": 0.6366,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3397431969642639,
"rewards/margins": 0.11168007552623749,
"rewards/rejected": -0.4514232575893402,
"step": 105
},
{
"epoch": 0.23455720663690277,
"grad_norm": 6.011226113877762,
"learning_rate": 4.7287853303477696e-07,
"logits/chosen": -0.5935484170913696,
"logits/rejected": -0.5163929462432861,
"logps/chosen": -191.00587463378906,
"logps/rejected": -213.3096466064453,
"loss": 0.6332,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.40704232454299927,
"rewards/margins": 0.1552998423576355,
"rewards/rejected": -0.5623422265052795,
"step": 110
},
{
"epoch": 0.2452188978476711,
"grad_norm": 6.207154465186514,
"learning_rate": 4.684989818598887e-07,
"logits/chosen": -0.5952309370040894,
"logits/rejected": -0.5275391340255737,
"logps/chosen": -197.7395782470703,
"logps/rejected": -218.7361602783203,
"loss": 0.6101,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.46388697624206543,
"rewards/margins": 0.22530913352966309,
"rewards/rejected": -0.6891961097717285,
"step": 115
},
{
"epoch": 0.2558805890584394,
"grad_norm": 6.116244737357764,
"learning_rate": 4.638152899458579e-07,
"logits/chosen": -0.5706356763839722,
"logits/rejected": -0.5397927165031433,
"logps/chosen": -216.757080078125,
"logps/rejected": -236.5631866455078,
"loss": 0.605,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6336530447006226,
"rewards/margins": 0.19573207199573517,
"rewards/rejected": -0.8293851613998413,
"step": 120
},
{
"epoch": 0.2665422802692077,
"grad_norm": 6.020581764532851,
"learning_rate": 4.588339767808238e-07,
"logits/chosen": -0.6469541788101196,
"logits/rejected": -0.5639356970787048,
"logps/chosen": -242.92431640625,
"logps/rejected": -269.52703857421875,
"loss": 0.5959,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7765325307846069,
"rewards/margins": 0.2900935709476471,
"rewards/rejected": -1.0666261911392212,
"step": 125
},
{
"epoch": 0.277203971479976,
"grad_norm": 6.365407871842816,
"learning_rate": 4.535619761282988e-07,
"logits/chosen": -0.5723987817764282,
"logits/rejected": -0.4914250373840332,
"logps/chosen": -244.1280517578125,
"logps/rejected": -277.622802734375,
"loss": 0.5884,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.8900648355484009,
"rewards/margins": 0.3473728597164154,
"rewards/rejected": -1.2374377250671387,
"step": 130
},
{
"epoch": 0.2878656626907443,
"grad_norm": 6.423134008068168,
"learning_rate": 4.480066263756821e-07,
"logits/chosen": -0.619472086429596,
"logits/rejected": -0.5424299836158752,
"logps/chosen": -261.1055908203125,
"logps/rejected": -299.7402648925781,
"loss": 0.5941,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.0646685361862183,
"rewards/margins": 0.34845179319381714,
"rewards/rejected": -1.4131202697753906,
"step": 135
},
{
"epoch": 0.29852735390151264,
"grad_norm": 7.786460978742034,
"learning_rate": 4.42175660319555e-07,
"logits/chosen": -0.5871402621269226,
"logits/rejected": -0.5535299181938171,
"logps/chosen": -250.9578094482422,
"logps/rejected": -283.0528869628906,
"loss": 0.5764,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.072014570236206,
"rewards/margins": 0.33194950222969055,
"rewards/rejected": -1.4039641618728638,
"step": 140
},
{
"epoch": 0.30918904511228096,
"grad_norm": 7.605992199638712,
"learning_rate": 4.360771944019766e-07,
"logits/chosen": -0.6613143682479858,
"logits/rejected": -0.5987201929092407,
"logps/chosen": -281.7815246582031,
"logps/rejected": -325.2270812988281,
"loss": 0.5605,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3026009798049927,
"rewards/margins": 0.40984100103378296,
"rewards/rejected": -1.7124418020248413,
"step": 145
},
{
"epoch": 0.3198507363230492,
"grad_norm": 8.64455245115027,
"learning_rate": 4.2971971741276185e-07,
"logits/chosen": -0.5969311594963074,
"logits/rejected": -0.547439694404602,
"logps/chosen": -311.7250061035156,
"logps/rejected": -360.2786560058594,
"loss": 0.572,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.515192985534668,
"rewards/margins": 0.5103713870048523,
"rewards/rejected": -2.025564432144165,
"step": 150
},
{
"epoch": 0.33051242753381754,
"grad_norm": 9.301843749121433,
"learning_rate": 4.2311207867346886e-07,
"logits/chosen": -0.5962031483650208,
"logits/rejected": -0.5288577675819397,
"logps/chosen": -320.295654296875,
"logps/rejected": -374.32415771484375,
"loss": 0.5477,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6180155277252197,
"rewards/margins": 0.5525304079055786,
"rewards/rejected": -2.170545816421509,
"step": 155
},
{
"epoch": 0.34117411874458586,
"grad_norm": 10.172478276638168,
"learning_rate": 4.162634757195417e-07,
"logits/chosen": -0.6355341076850891,
"logits/rejected": -0.5586315393447876,
"logps/chosen": -334.57391357421875,
"logps/rejected": -407.7463073730469,
"loss": 0.54,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7949445247650146,
"rewards/margins": 0.6658836007118225,
"rewards/rejected": -2.4608283042907715,
"step": 160
},
{
"epoch": 0.3518358099553542,
"grad_norm": 9.778904083871394,
"learning_rate": 4.0918344149775553e-07,
"logits/chosen": -0.630448579788208,
"logits/rejected": -0.5656327605247498,
"logps/chosen": -332.3238220214844,
"logps/rejected": -397.57818603515625,
"loss": 0.5483,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.7869739532470703,
"rewards/margins": 0.6199783086776733,
"rewards/rejected": -2.406952381134033,
"step": 165
},
{
"epoch": 0.36249750116612245,
"grad_norm": 8.962534096060436,
"learning_rate": 4.018818310967842e-07,
"logits/chosen": -0.6361697912216187,
"logits/rejected": -0.6125262379646301,
"logps/chosen": -340.560791015625,
"logps/rejected": -393.91943359375,
"loss": 0.5355,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.8884109258651733,
"rewards/margins": 0.5314295887947083,
"rewards/rejected": -2.4198403358459473,
"step": 170
},
{
"epoch": 0.37315919237689077,
"grad_norm": 9.199829374087676,
"learning_rate": 3.9436880802936067e-07,
"logits/chosen": -0.5958508253097534,
"logits/rejected": -0.5747475624084473,
"logps/chosen": -368.4230651855469,
"logps/rejected": -433.1883850097656,
"loss": 0.5325,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.0544705390930176,
"rewards/margins": 0.663877010345459,
"rewards/rejected": -2.7183475494384766,
"step": 175
},
{
"epoch": 0.3838208835876591,
"grad_norm": 10.70397755088912,
"learning_rate": 3.8665483008512536e-07,
"logits/chosen": -0.6373119950294495,
"logits/rejected": -0.5987192392349243,
"logps/chosen": -372.47662353515625,
"logps/rejected": -454.82769775390625,
"loss": 0.5195,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.168134927749634,
"rewards/margins": 0.7901454567909241,
"rewards/rejected": -2.958280324935913,
"step": 180
},
{
"epoch": 0.3944825747984274,
"grad_norm": 10.679642796728752,
"learning_rate": 3.787506347738538e-07,
"logits/chosen": -0.6430412530899048,
"logits/rejected": -0.6131948232650757,
"logps/chosen": -371.15625,
"logps/rejected": -464.52056884765625,
"loss": 0.5244,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.154017925262451,
"rewards/margins": 0.9343518018722534,
"rewards/rejected": -3.088369607925415,
"step": 185
},
{
"epoch": 0.40514426600919573,
"grad_norm": 10.016875889342252,
"learning_rate": 3.706672243793271e-07,
"logits/chosen": -0.7310691475868225,
"logits/rejected": -0.6220592856407166,
"logps/chosen": -401.1075439453125,
"logps/rejected": -499.98480224609375,
"loss": 0.4781,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.3723862171173096,
"rewards/margins": 0.9661838412284851,
"rewards/rejected": -3.3385701179504395,
"step": 190
},
{
"epoch": 0.415805957219964,
"grad_norm": 9.705697392058743,
"learning_rate": 3.624158506446484e-07,
"logits/chosen": -0.6986671686172485,
"logits/rejected": -0.6336062550544739,
"logps/chosen": -401.28143310546875,
"logps/rejected": -496.3516540527344,
"loss": 0.4983,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.4336769580841064,
"rewards/margins": 0.9512998461723328,
"rewards/rejected": -3.384976863861084,
"step": 195
},
{
"epoch": 0.4264676484307323,
"grad_norm": 11.1924041807584,
"learning_rate": 3.540079991103235e-07,
"logits/chosen": -0.6155360341072083,
"logits/rejected": -0.5795532464981079,
"logps/chosen": -420.99505615234375,
"logps/rejected": -510.29949951171875,
"loss": 0.4957,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.6494202613830566,
"rewards/margins": 0.9207509756088257,
"rewards/rejected": -3.570171356201172,
"step": 200
},
{
"epoch": 0.43712933964150064,
"grad_norm": 11.402392729818219,
"learning_rate": 3.4545537312690557e-07,
"logits/chosen": -0.6502883434295654,
"logits/rejected": -0.5817223787307739,
"logps/chosen": -393.88580322265625,
"logps/rejected": -484.9453125,
"loss": 0.5215,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.4964230060577393,
"rewards/margins": 0.8120796084403992,
"rewards/rejected": -3.308502197265625,
"step": 205
},
{
"epoch": 0.44779103085226896,
"grad_norm": 13.457423856796606,
"learning_rate": 3.367698775644589e-07,
"logits/chosen": -0.6476996541023254,
"logits/rejected": -0.5849811434745789,
"logps/chosen": -425.48779296875,
"logps/rejected": -523.0567626953125,
"loss": 0.5106,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.741436004638672,
"rewards/margins": 0.932793915271759,
"rewards/rejected": -3.674229860305786,
"step": 210
},
{
"epoch": 0.4584527220630373,
"grad_norm": 11.205830566067114,
"learning_rate": 3.279636022415158e-07,
"logits/chosen": -0.6362490653991699,
"logits/rejected": -0.5585761070251465,
"logps/chosen": -406.4499816894531,
"logps/rejected": -513.5891723632812,
"loss": 0.5183,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.5447230339050293,
"rewards/margins": 1.0839515924453735,
"rewards/rejected": -3.628674268722534,
"step": 215
},
{
"epoch": 0.46911441327380554,
"grad_norm": 11.16271599609029,
"learning_rate": 3.1904880509659394e-07,
"logits/chosen": -0.7270904183387756,
"logits/rejected": -0.6296547651290894,
"logps/chosen": -425.26409912109375,
"logps/rejected": -543.89013671875,
"loss": 0.4671,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.7184805870056152,
"rewards/margins": 1.107802152633667,
"rewards/rejected": -3.8262829780578613,
"step": 220
},
{
"epoch": 0.47977610448457386,
"grad_norm": 12.78514973294055,
"learning_rate": 3.100378951256981e-07,
"logits/chosen": -0.6514928340911865,
"logits/rejected": -0.617154061794281,
"logps/chosen": -455.8995056152344,
"logps/rejected": -549.4182739257812,
"loss": 0.5,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.9396634101867676,
"rewards/margins": 0.8971932530403137,
"rewards/rejected": -3.8368568420410156,
"step": 225
},
{
"epoch": 0.4904377956953422,
"grad_norm": 12.228823390494735,
"learning_rate": 3.0094341510955693e-07,
"logits/chosen": -0.6390506029129028,
"logits/rejected": -0.5878512263298035,
"logps/chosen": -386.823486328125,
"logps/rejected": -473.30084228515625,
"loss": 0.4944,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.460756778717041,
"rewards/margins": 0.869968593120575,
"rewards/rejected": -3.3307254314422607,
"step": 230
},
{
"epoch": 0.5010994869061105,
"grad_norm": 14.395964703992592,
"learning_rate": 2.917780241546371e-07,
"logits/chosen": -0.724800705909729,
"logits/rejected": -0.6312491297721863,
"logps/chosen": -440.82135009765625,
"logps/rejected": -563.3785400390625,
"loss": 0.5034,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.806361675262451,
"rewards/margins": 1.23042893409729,
"rewards/rejected": -4.036790370941162,
"step": 235
},
{
"epoch": 0.5117611781168788,
"grad_norm": 11.701162608624527,
"learning_rate": 2.825544800722376e-07,
"logits/chosen": -0.678560197353363,
"logits/rejected": -0.5992690324783325,
"logps/chosen": -410.61309814453125,
"logps/rejected": -517.0965576171875,
"loss": 0.5189,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.588188886642456,
"rewards/margins": 1.034708857536316,
"rewards/rejected": -3.6228981018066406,
"step": 240
},
{
"epoch": 0.5224228693276471,
"grad_norm": 10.666345842168104,
"learning_rate": 2.7328562162019057e-07,
"logits/chosen": -0.6738190650939941,
"logits/rejected": -0.6309827566146851,
"logps/chosen": -389.16461181640625,
"logps/rejected": -480.9994201660156,
"loss": 0.4932,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.460685968399048,
"rewards/margins": 0.908993124961853,
"rewards/rejected": -3.369678497314453,
"step": 245
},
{
"epoch": 0.5330845605384154,
"grad_norm": 12.712988877236741,
"learning_rate": 2.639843506318899e-07,
"logits/chosen": -0.6564850807189941,
"logits/rejected": -0.6236242055892944,
"logps/chosen": -404.89874267578125,
"logps/rejected": -493.0071716308594,
"loss": 0.5048,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.589948892593384,
"rewards/margins": 0.8368066549301147,
"rewards/rejected": -3.426755428314209,
"step": 250
},
{
"epoch": 0.5437462517491837,
"grad_norm": 10.503825338255721,
"learning_rate": 2.546636140575191e-07,
"logits/chosen": -0.7264063358306885,
"logits/rejected": -0.6483162641525269,
"logps/chosen": -374.58258056640625,
"logps/rejected": -508.2911071777344,
"loss": 0.5061,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.2691266536712646,
"rewards/margins": 1.2855161428451538,
"rewards/rejected": -3.554642915725708,
"step": 255
},
{
"epoch": 0.554407942959952,
"grad_norm": 11.560210733838508,
"learning_rate": 2.453363859424809e-07,
"logits/chosen": -0.6495022773742676,
"logits/rejected": -0.6398875713348389,
"logps/chosen": -407.90216064453125,
"logps/rejected": -494.3495178222656,
"loss": 0.4841,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.5238044261932373,
"rewards/margins": 0.9107331037521362,
"rewards/rejected": -3.434537410736084,
"step": 260
},
{
"epoch": 0.5650696341707203,
"grad_norm": 11.927908855427777,
"learning_rate": 2.3601564936811018e-07,
"logits/chosen": -0.7030381560325623,
"logits/rejected": -0.6639617085456848,
"logps/chosen": -383.1998291015625,
"logps/rejected": -495.85015869140625,
"loss": 0.4981,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.35768985748291,
"rewards/margins": 1.0254899263381958,
"rewards/rejected": -3.3831799030303955,
"step": 265
},
{
"epoch": 0.5757313253814886,
"grad_norm": 11.968314972913152,
"learning_rate": 2.267143783798094e-07,
"logits/chosen": -0.6847670674324036,
"logits/rejected": -0.6712856292724609,
"logps/chosen": -397.50421142578125,
"logps/rejected": -478.1363220214844,
"loss": 0.5166,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4988386631011963,
"rewards/margins": 0.8168338537216187,
"rewards/rejected": -3.3156723976135254,
"step": 270
},
{
"epoch": 0.586393016592257,
"grad_norm": 14.417251821790718,
"learning_rate": 2.1744551992776244e-07,
"logits/chosen": -0.6841756701469421,
"logits/rejected": -0.6490763425827026,
"logps/chosen": -423.22052001953125,
"logps/rejected": -527.6184692382812,
"loss": 0.5079,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.6250014305114746,
"rewards/margins": 0.9655100107192993,
"rewards/rejected": -3.5905113220214844,
"step": 275
},
{
"epoch": 0.5970547078030253,
"grad_norm": 11.936781827926813,
"learning_rate": 2.0822197584536287e-07,
"logits/chosen": -0.6951079368591309,
"logits/rejected": -0.6226946711540222,
"logps/chosen": -384.56121826171875,
"logps/rejected": -499.79095458984375,
"loss": 0.4781,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.4033970832824707,
"rewards/margins": 1.1390013694763184,
"rewards/rejected": -3.542397975921631,
"step": 280
},
{
"epoch": 0.6077163990137936,
"grad_norm": 16.620259019998883,
"learning_rate": 1.9905658489044307e-07,
"logits/chosen": -0.6553946137428284,
"logits/rejected": -0.6309363842010498,
"logps/chosen": -409.9910583496094,
"logps/rejected": -513.8627319335938,
"loss": 0.5193,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.5966861248016357,
"rewards/margins": 0.9828068614006042,
"rewards/rejected": -3.5794930458068848,
"step": 285
},
{
"epoch": 0.6183780902245619,
"grad_norm": 12.0304164250374,
"learning_rate": 1.899621048743019e-07,
"logits/chosen": -0.6362113952636719,
"logits/rejected": -0.6540666222572327,
"logps/chosen": -382.2437438964844,
"logps/rejected": -475.27130126953125,
"loss": 0.5006,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.3609213829040527,
"rewards/margins": 0.9484884142875671,
"rewards/rejected": -3.3094096183776855,
"step": 290
},
{
"epoch": 0.6290397814353301,
"grad_norm": 11.846669016651198,
"learning_rate": 1.8095119490340615e-07,
"logits/chosen": -0.6740937232971191,
"logits/rejected": -0.647221565246582,
"logps/chosen": -396.89208984375,
"logps/rejected": -510.3208923339844,
"loss": 0.4535,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.4332923889160156,
"rewards/margins": 1.1633473634719849,
"rewards/rejected": -3.596639633178711,
"step": 295
},
{
"epoch": 0.6397014726460984,
"grad_norm": 10.787540635532423,
"learning_rate": 1.7203639775848423e-07,
"logits/chosen": -0.6580207347869873,
"logits/rejected": -0.6252058744430542,
"logps/chosen": -402.5466003417969,
"logps/rejected": -513.3468017578125,
"loss": 0.4695,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.515665292739868,
"rewards/margins": 1.1106935739517212,
"rewards/rejected": -3.6263587474823,
"step": 300
},
{
"epoch": 0.6503631638568668,
"grad_norm": 13.5895433661477,
"learning_rate": 1.6323012243554106e-07,
"logits/chosen": -0.7108097672462463,
"logits/rejected": -0.6324597597122192,
"logps/chosen": -427.470458984375,
"logps/rejected": -526.4566650390625,
"loss": 0.4642,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.6723906993865967,
"rewards/margins": 0.9718725085258484,
"rewards/rejected": -3.6442630290985107,
"step": 305
},
{
"epoch": 0.6610248550676351,
"grad_norm": 10.69149258619884,
"learning_rate": 1.5454462687309444e-07,
"logits/chosen": -0.7408244013786316,
"logits/rejected": -0.6759353876113892,
"logps/chosen": -410.1529235839844,
"logps/rejected": -576.83984375,
"loss": 0.4667,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.559839963912964,
"rewards/margins": 1.6155951023101807,
"rewards/rejected": -4.1754350662231445,
"step": 310
},
{
"epoch": 0.6716865462784034,
"grad_norm": 13.414294575112912,
"learning_rate": 1.459920008896765e-07,
"logits/chosen": -0.713616669178009,
"logits/rejected": -0.7023594379425049,
"logps/chosen": -430.0381774902344,
"logps/rejected": -536.2015380859375,
"loss": 0.4657,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.690309762954712,
"rewards/margins": 1.0717895030975342,
"rewards/rejected": -3.762099027633667,
"step": 315
},
{
"epoch": 0.6823482374891717,
"grad_norm": 13.49512905939788,
"learning_rate": 1.3758414935535145e-07,
"logits/chosen": -0.6761552691459656,
"logits/rejected": -0.6508086919784546,
"logps/chosen": -397.4078674316406,
"logps/rejected": -503.7195739746094,
"loss": 0.4694,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.4770684242248535,
"rewards/margins": 1.089545488357544,
"rewards/rejected": -3.5666146278381348,
"step": 320
},
{
"epoch": 0.69300992869994,
"grad_norm": 11.06599328024594,
"learning_rate": 1.2933277562067288e-07,
"logits/chosen": -0.7036711573600769,
"logits/rejected": -0.6898130178451538,
"logps/chosen": -428.58563232421875,
"logps/rejected": -540.5661010742188,
"loss": 0.4988,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.6999242305755615,
"rewards/margins": 1.1091340780258179,
"rewards/rejected": -3.8090579509735107,
"step": 325
},
{
"epoch": 0.7036716199107084,
"grad_norm": 11.287294541718003,
"learning_rate": 1.212493652261462e-07,
"logits/chosen": -0.6557536721229553,
"logits/rejected": -0.598514974117279,
"logps/chosen": -411.374755859375,
"logps/rejected": -522.1329956054688,
"loss": 0.4572,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.67822003364563,
"rewards/margins": 1.0374858379364014,
"rewards/rejected": -3.715705394744873,
"step": 330
},
{
"epoch": 0.7143333111214767,
"grad_norm": 14.174123718877084,
"learning_rate": 1.1334516991487472e-07,
"logits/chosen": -0.6946598887443542,
"logits/rejected": -0.6765455603599548,
"logps/chosen": -420.7255859375,
"logps/rejected": -525.3436889648438,
"loss": 0.4903,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.735704183578491,
"rewards/margins": 1.0136221647262573,
"rewards/rejected": -3.749326229095459,
"step": 335
},
{
"epoch": 0.7249950023322449,
"grad_norm": 11.875650925559528,
"learning_rate": 1.0563119197063933e-07,
"logits/chosen": -0.6363321542739868,
"logits/rejected": -0.610489010810852,
"logps/chosen": -430.07135009765625,
"logps/rejected": -546.0662231445312,
"loss": 0.4911,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.6708502769470215,
"rewards/margins": 1.1354032754898071,
"rewards/rejected": -3.806253433227539,
"step": 340
},
{
"epoch": 0.7356566935430132,
"grad_norm": 12.158844690788914,
"learning_rate": 9.811816890321578e-08,
"logits/chosen": -0.6831678152084351,
"logits/rejected": -0.6622704863548279,
"logps/chosen": -421.2864685058594,
"logps/rejected": -587.9827880859375,
"loss": 0.4211,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.720494031906128,
"rewards/margins": 1.634281873703003,
"rewards/rejected": -4.354775905609131,
"step": 345
},
{
"epoch": 0.7463183847537815,
"grad_norm": 20.30568413490793,
"learning_rate": 9.081655850224449e-08,
"logits/chosen": -0.669623851776123,
"logits/rejected": -0.5724581480026245,
"logps/chosen": -424.1244201660156,
"logps/rejected": -568.143798828125,
"loss": 0.4225,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.7207565307617188,
"rewards/margins": 1.3297325372695923,
"rewards/rejected": -4.0504889488220215,
"step": 350
},
{
"epoch": 0.7569800759645499,
"grad_norm": 17.640008025150962,
"learning_rate": 8.37365242804583e-08,
"logits/chosen": -0.6978561878204346,
"logits/rejected": -0.6499579548835754,
"logps/chosen": -466.77337646484375,
"logps/rejected": -663.1705932617188,
"loss": 0.4564,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.1434326171875,
"rewards/margins": 1.8815462589263916,
"rewards/rejected": -5.024979114532471,
"step": 355
},
{
"epoch": 0.7676417671753182,
"grad_norm": 51.21482859691763,
"learning_rate": 7.68879213265311e-08,
"logits/chosen": -0.6487331986427307,
"logits/rejected": -0.5610198378562927,
"logps/chosen": -426.02532958984375,
"logps/rejected": -567.581787109375,
"loss": 0.4593,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.7650089263916016,
"rewards/margins": 1.389169454574585,
"rewards/rejected": -4.154178619384766,
"step": 360
},
{
"epoch": 0.7783034583860865,
"grad_norm": 12.312317231610301,
"learning_rate": 7.028028258723817e-08,
"logits/chosen": -0.6627609729766846,
"logits/rejected": -0.6275384426116943,
"logps/chosen": -420.10638427734375,
"logps/rejected": -541.6509399414062,
"loss": 0.4362,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.699690103530884,
"rewards/margins": 1.2001326084136963,
"rewards/rejected": -3.89982271194458,
"step": 365
},
{
"epoch": 0.7889651495968548,
"grad_norm": 14.824950870173511,
"learning_rate": 6.392280559802341e-08,
"logits/chosen": -0.6717527508735657,
"logits/rejected": -0.6606348752975464,
"logps/chosen": -445.54412841796875,
"logps/rejected": -588.8523559570312,
"loss": 0.461,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.92059588432312,
"rewards/margins": 1.460580825805664,
"rewards/rejected": -4.381176948547363,
"step": 370
},
{
"epoch": 0.7996268408076231,
"grad_norm": 13.6644249310921,
"learning_rate": 5.782433968044495e-08,
"logits/chosen": -0.6819238662719727,
"logits/rejected": -0.638149082660675,
"logps/chosen": -421.51031494140625,
"logps/rejected": -669.2894287109375,
"loss": 0.4489,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.7319085597991943,
"rewards/margins": 2.4759204387664795,
"rewards/rejected": -5.207829475402832,
"step": 375
},
{
"epoch": 0.8102885320183915,
"grad_norm": 13.811424824515617,
"learning_rate": 5.199337362431791e-08,
"logits/chosen": -0.6884719133377075,
"logits/rejected": -0.6233955025672913,
"logps/chosen": -438.57293701171875,
"logps/rejected": -597.1315307617188,
"loss": 0.4523,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.8267288208007812,
"rewards/margins": 1.5069990158081055,
"rewards/rejected": -4.333728313446045,
"step": 380
},
{
"epoch": 0.8209502232291597,
"grad_norm": 12.122140633447449,
"learning_rate": 4.643802387170117e-08,
"logits/chosen": -0.6704726815223694,
"logits/rejected": -0.6620519161224365,
"logps/chosen": -462.6454162597656,
"logps/rejected": -569.0838623046875,
"loss": 0.4772,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.02994966506958,
"rewards/margins": 1.0704432725906372,
"rewards/rejected": -4.100392818450928,
"step": 385
},
{
"epoch": 0.831611914439928,
"grad_norm": 13.510721862336407,
"learning_rate": 4.116602321917617e-08,
"logits/chosen": -0.6129786968231201,
"logits/rejected": -0.5863287448883057,
"logps/chosen": -459.5635681152344,
"logps/rejected": -604.2559814453125,
"loss": 0.4647,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.085102081298828,
"rewards/margins": 1.438297986984253,
"rewards/rejected": -4.52340030670166,
"step": 390
},
{
"epoch": 0.8422736056506963,
"grad_norm": 17.09993809361333,
"learning_rate": 3.6184710054142144e-08,
"logits/chosen": -0.69035804271698,
"logits/rejected": -0.6435971260070801,
"logps/chosen": -479.14935302734375,
"logps/rejected": -643.2648315429688,
"loss": 0.4602,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.1109519004821777,
"rewards/margins": 1.6468206644058228,
"rewards/rejected": -4.757771968841553,
"step": 395
},
{
"epoch": 0.8529352968614646,
"grad_norm": 14.20201406251709,
"learning_rate": 3.150101814011136e-08,
"logits/chosen": -0.6618126630783081,
"logits/rejected": -0.6529041528701782,
"logps/chosen": -463.82415771484375,
"logps/rejected": -575.7443237304688,
"loss": 0.5062,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.062734603881836,
"rewards/margins": 1.059270977973938,
"rewards/rejected": -4.122005462646484,
"step": 400
},
{
"epoch": 0.8529352968614646,
"eval_logits/chosen": -0.6989333629608154,
"eval_logits/rejected": -0.6484398245811462,
"eval_logps/chosen": -448.32574462890625,
"eval_logps/rejected": -597.7947998046875,
"eval_loss": 0.4833716154098511,
"eval_rewards/accuracies": 0.7759674191474915,
"eval_rewards/chosen": -2.957282066345215,
"eval_rewards/margins": 1.4963340759277344,
"eval_rewards/rejected": -4.453616142272949,
"eval_runtime": 356.4535,
"eval_samples_per_second": 5.504,
"eval_steps_per_second": 1.377,
"step": 400
},
{
"epoch": 0.863596988072233,
"grad_norm": 15.05986989475561,
"learning_rate": 2.712146696522305e-08,
"logits/chosen": -0.6149640083312988,
"logits/rejected": -0.5986669659614563,
"logps/chosen": -456.62615966796875,
"logps/rejected": -587.21630859375,
"loss": 0.4581,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.0605359077453613,
"rewards/margins": 1.240896463394165,
"rewards/rejected": -4.3014326095581055,
"step": 405
},
{
"epoch": 0.8742586792830013,
"grad_norm": 17.350625249654133,
"learning_rate": 2.3052152667409287e-08,
"logits/chosen": -0.6955739259719849,
"logits/rejected": -0.6243568658828735,
"logps/chosen": -449.71319580078125,
"logps/rejected": -593.1027221679688,
"loss": 0.4803,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.006499767303467,
"rewards/margins": 1.3734157085418701,
"rewards/rejected": -4.379915714263916,
"step": 410
},
{
"epoch": 0.8849203704937696,
"grad_norm": 14.462761406291644,
"learning_rate": 1.929873954884581e-08,
"logits/chosen": -0.6829768419265747,
"logits/rejected": -0.6062845587730408,
"logps/chosen": -418.482177734375,
"logps/rejected": -541.1483154296875,
"loss": 0.446,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.7707464694976807,
"rewards/margins": 1.1584304571151733,
"rewards/rejected": -3.9291768074035645,
"step": 415
},
{
"epoch": 0.8955820617045379,
"grad_norm": 14.240832321861784,
"learning_rate": 1.5866452191498486e-08,
"logits/chosen": -0.6378864049911499,
"logits/rejected": -0.5631311535835266,
"logps/chosen": -445.82489013671875,
"logps/rejected": -588.2786865234375,
"loss": 0.4542,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.9222068786621094,
"rewards/margins": 1.373407006263733,
"rewards/rejected": -4.295614242553711,
"step": 420
},
{
"epoch": 0.9062437529153062,
"grad_norm": 14.300875070246876,
"learning_rate": 1.2760068184740597e-08,
"logits/chosen": -0.6629018783569336,
"logits/rejected": -0.6413969993591309,
"logps/chosen": -447.8999938964844,
"logps/rejected": -604.1832275390625,
"loss": 0.4772,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.8662009239196777,
"rewards/margins": 1.5696779489517212,
"rewards/rejected": -4.435878753662109,
"step": 425
},
{
"epoch": 0.9169054441260746,
"grad_norm": 15.167998348057905,
"learning_rate": 9.983911475163725e-09,
"logits/chosen": -0.6861739754676819,
"logits/rejected": -0.6672254800796509,
"logps/chosen": -467.89996337890625,
"logps/rejected": -618.0469360351562,
"loss": 0.462,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.1228601932525635,
"rewards/margins": 1.485724687576294,
"rewards/rejected": -4.608585357666016,
"step": 430
},
{
"epoch": 0.9275671353368428,
"grad_norm": 14.071697017283158,
"learning_rate": 7.541846347838915e-09,
"logits/chosen": -0.6308220624923706,
"logits/rejected": -0.6001772880554199,
"logps/chosen": -477.44329833984375,
"logps/rejected": -596.0589599609375,
"loss": 0.4843,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.173348903656006,
"rewards/margins": 1.121080756187439,
"rewards/rejected": -4.294429779052734,
"step": 435
},
{
"epoch": 0.9382288265476111,
"grad_norm": 16.08993041357463,
"learning_rate": 5.437272047405711e-09,
"logits/chosen": -0.6392040252685547,
"logits/rejected": -0.6253064870834351,
"logps/chosen": -415.3251953125,
"logps/rejected": -527.3311767578125,
"loss": 0.4948,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.714038848876953,
"rewards/margins": 1.125810146331787,
"rewards/rejected": -3.8398489952087402,
"step": 440
},
{
"epoch": 0.9488905177583794,
"grad_norm": 13.509397089938123,
"learning_rate": 3.673118046477158e-09,
"logits/chosen": -0.6543330550193787,
"logits/rejected": -0.6509405374526978,
"logps/chosen": -456.007080078125,
"logps/rejected": -619.2344360351562,
"loss": 0.4768,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.979337453842163,
"rewards/margins": 1.5961627960205078,
"rewards/rejected": -4.575499534606934,
"step": 445
},
{
"epoch": 0.9595522089691477,
"grad_norm": 14.016464196588691,
"learning_rate": 2.251839967945535e-09,
"logits/chosen": -0.6727645397186279,
"logits/rejected": -0.5936774611473083,
"logps/chosen": -428.5733337402344,
"logps/rejected": -629.4482421875,
"loss": 0.4454,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.7794129848480225,
"rewards/margins": 2.001149892807007,
"rewards/rejected": -4.7805633544921875,
"step": 450
},
{
"epoch": 0.970213900179916,
"grad_norm": 14.602385442499152,
"learning_rate": 1.1754161668660612e-09,
"logits/chosen": -0.5893079042434692,
"logits/rejected": -0.5978569984436035,
"logps/chosen": -428.67218017578125,
"logps/rejected": -585.7625732421875,
"loss": 0.5192,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.863682985305786,
"rewards/margins": 1.5374996662139893,
"rewards/rejected": -4.401182174682617,
"step": 455
},
{
"epoch": 0.9808755913906844,
"grad_norm": 12.799904592616386,
"learning_rate": 4.453449766758932e-10,
"logits/chosen": -0.7284419536590576,
"logits/rejected": -0.6639989614486694,
"logps/chosen": -453.0536193847656,
"logps/rejected": -603.8924560546875,
"loss": 0.469,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.9768033027648926,
"rewards/margins": 1.4411671161651611,
"rewards/rejected": -4.417970657348633,
"step": 460
},
{
"epoch": 0.9915372826014527,
"grad_norm": 13.505994490232299,
"learning_rate": 6.264262358129935e-11,
"logits/chosen": -0.7227040529251099,
"logits/rejected": -0.6694614887237549,
"logps/chosen": -442.71307373046875,
"logps/rejected": -571.0550537109375,
"loss": 0.474,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.9378533363342285,
"rewards/margins": 1.2195371389389038,
"rewards/rejected": -4.15739107131958,
"step": 465
},
{
"epoch": 0.9979342973279136,
"step": 468,
"total_flos": 0.0,
"train_loss": 0.5391119274064007,
"train_runtime": 25974.5898,
"train_samples_per_second": 2.311,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}