llama-3-8b-instruct-agg-judge / trainer_state.json
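The JSON below is the state written by the Hugging Face Trainer during DPO training: "log_history" holds one entry every 5 optimizer steps (loss, grad_norm, learning_rate, logps and rewards for chosen/rejected) plus an "eval_*" entry every 400 steps. As a minimal sketch, not part of the original upload and assuming only this standard layout and the file name "trainer_state.json", one way to summarize the reward margins and eval accuracy:

    # Sketch: summarize the DPO training log from trainer_state.json.
    # Train entries carry "rewards/margins"; eval entries use "eval_"-prefixed keys.
    import json

    with open("trainer_state.json") as f:
        state = json.load(f)

    train_logs = [e for e in state["log_history"] if "rewards/margins" in e]
    eval_logs = [e for e in state["log_history"] if "eval_rewards/margins" in e]

    last = train_logs[-1]
    print(f"train entries: {len(train_logs)}, last step: {last['step']}")
    print(f"last train loss: {last['loss']:.4f}, "
          f"reward margin: {last['rewards/margins']:.4f}")
    for e in eval_logs:
        print(f"eval @ step {e['step']}: loss={e['eval_loss']:.4f}, "
              f"accuracy={e['eval_rewards/accuracies']:.3f}, "
              f"margin={e['eval_rewards/margins']:.4f}")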
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 400,
"global_step": 938,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010660980810234541,
"grad_norm": 5.5463102558146335,
"learning_rate": 5.3191489361702125e-09,
"logits/chosen": -0.48140522837638855,
"logits/rejected": -0.7986129522323608,
"logps/chosen": -160.70640563964844,
"logps/rejected": -136.7216033935547,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.005330490405117271,
"grad_norm": 5.944179098618821,
"learning_rate": 2.6595744680851062e-08,
"logits/chosen": -0.5258230566978455,
"logits/rejected": -0.640978991985321,
"logps/chosen": -143.9716033935547,
"logps/rejected": -130.26953125,
"loss": 0.6932,
"rewards/accuracies": 0.28125,
"rewards/chosen": -0.0004923552623949945,
"rewards/margins": -0.0009530532988719642,
"rewards/rejected": 0.00046069800737313926,
"step": 5
},
{
"epoch": 0.010660980810234541,
"grad_norm": 4.877499599443773,
"learning_rate": 5.3191489361702123e-08,
"logits/chosen": -0.45687875151634216,
"logits/rejected": -0.633367121219635,
"logps/chosen": -171.5751190185547,
"logps/rejected": -156.70230102539062,
"loss": 0.6934,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.0014754905132576823,
"rewards/margins": -0.0020646383054554462,
"rewards/rejected": 0.0005891475593671203,
"step": 10
},
{
"epoch": 0.015991471215351813,
"grad_norm": 4.928018921113954,
"learning_rate": 7.978723404255319e-08,
"logits/chosen": -0.5080267190933228,
"logits/rejected": -0.6690904498100281,
"logps/chosen": -168.29055786132812,
"logps/rejected": -155.68568420410156,
"loss": 0.6929,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.00017321776249445975,
"rewards/margins": 0.0004894703743048012,
"rewards/rejected": -0.00031625264091417193,
"step": 15
},
{
"epoch": 0.021321961620469083,
"grad_norm": 5.502864121859809,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -0.49741801619529724,
"logits/rejected": -0.6529160141944885,
"logps/chosen": -158.0856475830078,
"logps/rejected": -141.2250518798828,
"loss": 0.6934,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0007460988126695156,
"rewards/margins": 0.0009390910854563117,
"rewards/rejected": -0.00019299241830594838,
"step": 20
},
{
"epoch": 0.026652452025586353,
"grad_norm": 5.3161185644529905,
"learning_rate": 1.329787234042553e-07,
"logits/chosen": -0.46866098046302795,
"logits/rejected": -0.5745824575424194,
"logps/chosen": -156.7174835205078,
"logps/rejected": -144.2376251220703,
"loss": 0.6933,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.0008935723453760147,
"rewards/margins": -0.0009486509370617568,
"rewards/rejected": 5.507881360244937e-05,
"step": 25
},
{
"epoch": 0.031982942430703626,
"grad_norm": 5.272912501828491,
"learning_rate": 1.5957446808510638e-07,
"logits/chosen": -0.49024850130081177,
"logits/rejected": -0.6033456921577454,
"logps/chosen": -157.64395141601562,
"logps/rejected": -150.4394073486328,
"loss": 0.6928,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0018176069715991616,
"rewards/margins": 0.0021634683944284916,
"rewards/rejected": -0.0003458613937254995,
"step": 30
},
{
"epoch": 0.03731343283582089,
"grad_norm": 5.55755087436118,
"learning_rate": 1.8617021276595742e-07,
"logits/chosen": -0.5024099349975586,
"logits/rejected": -0.5742695927619934,
"logps/chosen": -162.9497528076172,
"logps/rejected": -156.5416259765625,
"loss": 0.6932,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.0004840154724661261,
"rewards/margins": 4.346743298810907e-05,
"rewards/rejected": 0.00044054799946025014,
"step": 35
},
{
"epoch": 0.042643923240938165,
"grad_norm": 5.050573104575282,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.4654630124568939,
"logits/rejected": -0.5773854851722717,
"logps/chosen": -153.1385955810547,
"logps/rejected": -147.40850830078125,
"loss": 0.6927,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.00016260957636404783,
"rewards/margins": 0.0010524257086217403,
"rewards/rejected": -0.0008898162050172687,
"step": 40
},
{
"epoch": 0.04797441364605544,
"grad_norm": 5.699569850833138,
"learning_rate": 2.393617021276596e-07,
"logits/chosen": -0.4928715229034424,
"logits/rejected": -0.6359135508537292,
"logps/chosen": -155.63232421875,
"logps/rejected": -143.79296875,
"loss": 0.6929,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0014595793327316642,
"rewards/margins": 0.001696806401014328,
"rewards/rejected": -0.00023722714104223996,
"step": 45
},
{
"epoch": 0.053304904051172705,
"grad_norm": 5.124728866824331,
"learning_rate": 2.659574468085106e-07,
"logits/chosen": -0.43387550115585327,
"logits/rejected": -0.5658468008041382,
"logps/chosen": -175.50062561035156,
"logps/rejected": -154.56787109375,
"loss": 0.6922,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0016532255103811622,
"rewards/margins": 0.0013198386877775192,
"rewards/rejected": 0.0003333869099151343,
"step": 50
},
{
"epoch": 0.05863539445628998,
"grad_norm": 5.56223084484684,
"learning_rate": 2.925531914893617e-07,
"logits/chosen": -0.5003554224967957,
"logits/rejected": -0.6052166223526001,
"logps/chosen": -151.86026000976562,
"logps/rejected": -144.47586059570312,
"loss": 0.6923,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0034173422027379274,
"rewards/margins": 0.0018555650021880865,
"rewards/rejected": 0.0015617769677191973,
"step": 55
},
{
"epoch": 0.06396588486140725,
"grad_norm": 5.412628405701925,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.42743635177612305,
"logits/rejected": -0.5739923715591431,
"logps/chosen": -147.76356506347656,
"logps/rejected": -135.29502868652344,
"loss": 0.6923,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.004091166891157627,
"rewards/margins": 0.0018434191588312387,
"rewards/rejected": 0.0022477474994957447,
"step": 60
},
{
"epoch": 0.06929637526652452,
"grad_norm": 5.442775471516293,
"learning_rate": 3.457446808510638e-07,
"logits/chosen": -0.522619366645813,
"logits/rejected": -0.6582551002502441,
"logps/chosen": -162.0552520751953,
"logps/rejected": -147.86856079101562,
"loss": 0.6914,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.005479807965457439,
"rewards/margins": 0.004093030467629433,
"rewards/rejected": 0.001386777381412685,
"step": 65
},
{
"epoch": 0.07462686567164178,
"grad_norm": 5.634942637913951,
"learning_rate": 3.7234042553191484e-07,
"logits/chosen": -0.5439807772636414,
"logits/rejected": -0.6844218969345093,
"logps/chosen": -156.98483276367188,
"logps/rejected": -140.76651000976562,
"loss": 0.6911,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0070900036953389645,
"rewards/margins": 0.00586737459525466,
"rewards/rejected": 0.0012226292164996266,
"step": 70
},
{
"epoch": 0.07995735607675906,
"grad_norm": 4.9084292553173166,
"learning_rate": 3.989361702127659e-07,
"logits/chosen": -0.4480295181274414,
"logits/rejected": -0.5396173596382141,
"logps/chosen": -149.27737426757812,
"logps/rejected": -138.79080200195312,
"loss": 0.6901,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0054114703088998795,
"rewards/margins": 0.004020148888230324,
"rewards/rejected": 0.0013913216535001993,
"step": 75
},
{
"epoch": 0.08528784648187633,
"grad_norm": 5.243856701834545,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.5261912941932678,
"logits/rejected": -0.6559049487113953,
"logps/chosen": -155.7753143310547,
"logps/rejected": -146.03860473632812,
"loss": 0.6908,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.007854573428630829,
"rewards/margins": 0.004827320575714111,
"rewards/rejected": 0.0030272528529167175,
"step": 80
},
{
"epoch": 0.0906183368869936,
"grad_norm": 5.506781468585061,
"learning_rate": 4.5212765957446806e-07,
"logits/chosen": -0.5435600876808167,
"logits/rejected": -0.6427361369132996,
"logps/chosen": -136.4017791748047,
"logps/rejected": -130.49105834960938,
"loss": 0.6897,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.00461820513010025,
"rewards/margins": 0.004016582854092121,
"rewards/rejected": 0.0006016212282702327,
"step": 85
},
{
"epoch": 0.09594882729211088,
"grad_norm": 5.811357371598748,
"learning_rate": 4.787234042553192e-07,
"logits/chosen": -0.5588937401771545,
"logits/rejected": -0.6763302087783813,
"logps/chosen": -157.134521484375,
"logps/rejected": -146.5879364013672,
"loss": 0.6895,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.008327952586114407,
"rewards/margins": 0.008614275604486465,
"rewards/rejected": -0.0002863233967218548,
"step": 90
},
{
"epoch": 0.10127931769722814,
"grad_norm": 5.698422927582689,
"learning_rate": 4.999982680938129e-07,
"logits/chosen": -0.5830127596855164,
"logits/rejected": -0.7297841310501099,
"logps/chosen": -165.22900390625,
"logps/rejected": -152.13014221191406,
"loss": 0.6876,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.01156298816204071,
"rewards/margins": 0.011255884543061256,
"rewards/rejected": 0.00030710286227986217,
"step": 95
},
{
"epoch": 0.10660980810234541,
"grad_norm": 5.540011917380718,
"learning_rate": 4.999376538968061e-07,
"logits/chosen": -0.5830188393592834,
"logits/rejected": -0.6362646222114563,
"logps/chosen": -160.86549377441406,
"logps/rejected": -151.5850372314453,
"loss": 0.6883,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.007453514728695154,
"rewards/margins": 0.0087841572239995,
"rewards/rejected": -0.0013306414475664496,
"step": 100
},
{
"epoch": 0.11194029850746269,
"grad_norm": 5.554459962070174,
"learning_rate": 4.997904683849418e-07,
"logits/chosen": -0.6047431826591492,
"logits/rejected": -0.7156568765640259,
"logps/chosen": -145.95703125,
"logps/rejected": -137.16812133789062,
"loss": 0.6872,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.003936653956770897,
"rewards/margins": 0.009741699323058128,
"rewards/rejected": -0.013678351417183876,
"step": 105
},
{
"epoch": 0.11727078891257996,
"grad_norm": 6.6635480922191395,
"learning_rate": 4.99556762539107e-07,
"logits/chosen": -0.5515817403793335,
"logits/rejected": -0.7226412296295166,
"logps/chosen": -163.9252166748047,
"logps/rejected": -151.83767700195312,
"loss": 0.6857,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0011928931344300508,
"rewards/margins": 0.012570838443934917,
"rewards/rejected": -0.013763731345534325,
"step": 110
},
{
"epoch": 0.12260127931769722,
"grad_norm": 5.957095370833089,
"learning_rate": 4.992366173083787e-07,
"logits/chosen": -0.586641788482666,
"logits/rejected": -0.7417147159576416,
"logps/chosen": -161.9275360107422,
"logps/rejected": -145.07772827148438,
"loss": 0.6843,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.003873241599649191,
"rewards/margins": 0.01244218461215496,
"rewards/rejected": -0.01631542667746544,
"step": 115
},
{
"epoch": 0.1279317697228145,
"grad_norm": 5.703633835475685,
"learning_rate": 4.988301435819852e-07,
"logits/chosen": -0.5778621435165405,
"logits/rejected": -0.6562256217002869,
"logps/chosen": -164.537353515625,
"logps/rejected": -152.1725616455078,
"loss": 0.6845,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.007811696734279394,
"rewards/margins": 0.015073996968567371,
"rewards/rejected": -0.022885693237185478,
"step": 120
},
{
"epoch": 0.13326226012793177,
"grad_norm": 7.3435341527621585,
"learning_rate": 4.983374821508973e-07,
"logits/chosen": -0.6186214685440063,
"logits/rejected": -0.7367585301399231,
"logps/chosen": -190.20452880859375,
"logps/rejected": -183.6043243408203,
"loss": 0.6813,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.008534837514162064,
"rewards/margins": 0.027956834062933922,
"rewards/rejected": -0.019421998411417007,
"step": 125
},
{
"epoch": 0.13859275053304904,
"grad_norm": 6.188992862485417,
"learning_rate": 4.977588036590624e-07,
"logits/chosen": -0.6698447465896606,
"logits/rejected": -0.7765822410583496,
"logps/chosen": -157.9294891357422,
"logps/rejected": -146.48617553710938,
"loss": 0.6833,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.013769884593784809,
"rewards/margins": 0.021815448999404907,
"rewards/rejected": -0.03558532893657684,
"step": 130
},
{
"epoch": 0.1439232409381663,
"grad_norm": 5.78355447268637,
"learning_rate": 4.970943085442984e-07,
"logits/chosen": -0.6052809953689575,
"logits/rejected": -0.768462061882019,
"logps/chosen": -156.30868530273438,
"logps/rejected": -149.22007751464844,
"loss": 0.6805,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02559695765376091,
"rewards/margins": 0.03413590043783188,
"rewards/rejected": -0.05973286181688309,
"step": 135
},
{
"epoch": 0.14925373134328357,
"grad_norm": 5.813443617152644,
"learning_rate": 4.96344226968867e-07,
"logits/chosen": -0.6367892026901245,
"logits/rejected": -0.7320101857185364,
"logps/chosen": -162.81149291992188,
"logps/rejected": -153.95095825195312,
"loss": 0.6829,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.03205486014485359,
"rewards/margins": 0.017323989421129227,
"rewards/rejected": -0.04937884956598282,
"step": 140
},
{
"epoch": 0.15458422174840086,
"grad_norm": 6.2002717305065,
"learning_rate": 4.955088187397534e-07,
"logits/chosen": -0.7039578557014465,
"logits/rejected": -0.8707769513130188,
"logps/chosen": -176.5757598876953,
"logps/rejected": -168.57083129882812,
"loss": 0.6787,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04023490846157074,
"rewards/margins": 0.030298087745904922,
"rewards/rejected": -0.07053300738334656,
"step": 145
},
{
"epoch": 0.15991471215351813,
"grad_norm": 5.905902869206233,
"learning_rate": 4.945883732186751e-07,
"logits/chosen": -0.6456910371780396,
"logits/rejected": -0.8251630067825317,
"logps/chosen": -141.7700653076172,
"logps/rejected": -129.72817993164062,
"loss": 0.6746,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0674080178141594,
"rewards/margins": 0.03635396808385849,
"rewards/rejected": -0.10376199334859848,
"step": 150
},
{
"epoch": 0.1652452025586354,
"grad_norm": 6.258969756632891,
"learning_rate": 4.935832092218558e-07,
"logits/chosen": -0.724746823310852,
"logits/rejected": -0.842291533946991,
"logps/chosen": -159.5133514404297,
"logps/rejected": -152.9370574951172,
"loss": 0.6748,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.04968777671456337,
"rewards/margins": 0.04536419361829758,
"rewards/rejected": -0.09505197405815125,
"step": 155
},
{
"epoch": 0.17057569296375266,
"grad_norm": 6.094426950794661,
"learning_rate": 4.924936749095969e-07,
"logits/chosen": -0.6918126344680786,
"logits/rejected": -0.7708092331886292,
"logps/chosen": -167.59994506835938,
"logps/rejected": -160.00057983398438,
"loss": 0.6703,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04972660169005394,
"rewards/margins": 0.05021023750305176,
"rewards/rejected": -0.0999368354678154,
"step": 160
},
{
"epoch": 0.17590618336886993,
"grad_norm": 6.3293548148521905,
"learning_rate": 4.913201476656838e-07,
"logits/chosen": -0.7461433410644531,
"logits/rejected": -0.8420252799987793,
"logps/chosen": -161.7245330810547,
"logps/rejected": -157.357177734375,
"loss": 0.6684,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.05842015892267227,
"rewards/margins": 0.06748794764280319,
"rewards/rejected": -0.12590810656547546,
"step": 165
},
{
"epoch": 0.1812366737739872,
"grad_norm": 6.5840976852855,
"learning_rate": 4.900630339666717e-07,
"logits/chosen": -0.7366148829460144,
"logits/rejected": -0.8815475702285767,
"logps/chosen": -183.88925170898438,
"logps/rejected": -176.22451782226562,
"loss": 0.6703,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.08390282094478607,
"rewards/margins": 0.05011892318725586,
"rewards/rejected": -0.13402177393436432,
"step": 170
},
{
"epoch": 0.1865671641791045,
"grad_norm": 6.234539356652731,
"learning_rate": 4.88722769241093e-07,
"logits/chosen": -0.6534587144851685,
"logits/rejected": -0.7359489798545837,
"logps/chosen": -156.6703338623047,
"logps/rejected": -150.93853759765625,
"loss": 0.6747,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.07284261286258698,
"rewards/margins": 0.05436049774289131,
"rewards/rejected": -0.127203106880188,
"step": 175
},
{
"epoch": 0.19189765458422176,
"grad_norm": 6.610956936672055,
"learning_rate": 4.872998177186375e-07,
"logits/chosen": -0.666496992111206,
"logits/rejected": -0.7403326034545898,
"logps/chosen": -160.12982177734375,
"logps/rejected": -157.0307159423828,
"loss": 0.6654,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11026358604431152,
"rewards/margins": 0.055197346955537796,
"rewards/rejected": -0.16546092927455902,
"step": 180
},
{
"epoch": 0.19722814498933902,
"grad_norm": 7.023716688535026,
"learning_rate": 4.857946722693566e-07,
"logits/chosen": -0.7221956849098206,
"logits/rejected": -0.9377690553665161,
"logps/chosen": -169.1468048095703,
"logps/rejected": -154.34695434570312,
"loss": 0.6676,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.09703753888607025,
"rewards/margins": 0.06947065889835358,
"rewards/rejected": -0.16650819778442383,
"step": 185
},
{
"epoch": 0.2025586353944563,
"grad_norm": 7.14685316792679,
"learning_rate": 4.842078542329463e-07,
"logits/chosen": -0.8102830648422241,
"logits/rejected": -0.892846941947937,
"logps/chosen": -161.26129150390625,
"logps/rejected": -156.14747619628906,
"loss": 0.6671,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.08616851270198822,
"rewards/margins": 0.06732877343893051,
"rewards/rejected": -0.15349729359149933,
"step": 190
},
{
"epoch": 0.20788912579957355,
"grad_norm": 6.752366375502548,
"learning_rate": 4.825399132381714e-07,
"logits/chosen": -0.6987568736076355,
"logits/rejected": -0.8175935745239258,
"logps/chosen": -171.1442108154297,
"logps/rejected": -166.8987579345703,
"loss": 0.6635,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.10287340730428696,
"rewards/margins": 0.05335085466504097,
"rewards/rejected": -0.15622428059577942,
"step": 195
},
{
"epoch": 0.21321961620469082,
"grad_norm": 7.347783152614852,
"learning_rate": 4.807914270124876e-07,
"logits/chosen": -0.7212746143341064,
"logits/rejected": -0.8661853671073914,
"logps/chosen": -154.50978088378906,
"logps/rejected": -151.14486694335938,
"loss": 0.6607,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.13088169693946838,
"rewards/margins": 0.07638157904148102,
"rewards/rejected": -0.2072632759809494,
"step": 200
},
{
"epoch": 0.21855010660980811,
"grad_norm": 7.628582115766163,
"learning_rate": 4.789630011819354e-07,
"logits/chosen": -0.8047178387641907,
"logits/rejected": -0.9235810041427612,
"logps/chosen": -172.83328247070312,
"logps/rejected": -167.29969787597656,
"loss": 0.6594,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.13691547513008118,
"rewards/margins": 0.08130868524312973,
"rewards/rejected": -0.21822413802146912,
"step": 205
},
{
"epoch": 0.22388059701492538,
"grad_norm": 8.158484143715567,
"learning_rate": 4.770552690613665e-07,
"logits/chosen": -0.6994116902351379,
"logits/rejected": -0.8224090337753296,
"logps/chosen": -165.53271484375,
"logps/rejected": -160.39566040039062,
"loss": 0.6632,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.15383288264274597,
"rewards/margins": 0.08487533777952194,
"rewards/rejected": -0.23870821297168732,
"step": 210
},
{
"epoch": 0.22921108742004265,
"grad_norm": 6.818497799203832,
"learning_rate": 4.750688914350824e-07,
"logits/chosen": -0.7993873357772827,
"logits/rejected": -0.8913162350654602,
"logps/chosen": -168.80471801757812,
"logps/rejected": -161.65472412109375,
"loss": 0.6635,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.175734743475914,
"rewards/margins": 0.0465971902012825,
"rewards/rejected": -0.2223319262266159,
"step": 215
},
{
"epoch": 0.2345415778251599,
"grad_norm": 6.951350944403186,
"learning_rate": 4.730045563279577e-07,
"logits/chosen": -0.8045557141304016,
"logits/rejected": -0.9918710589408875,
"logps/chosen": -166.38870239257812,
"logps/rejected": -160.87380981445312,
"loss": 0.6548,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.1676856279373169,
"rewards/margins": 0.08295993506908417,
"rewards/rejected": -0.25064557790756226,
"step": 220
},
{
"epoch": 0.23987206823027718,
"grad_norm": 8.045186231414235,
"learning_rate": 4.708629787671268e-07,
"logits/chosen": -0.7760337591171265,
"logits/rejected": -0.9154524803161621,
"logps/chosen": -176.33999633789062,
"logps/rejected": -174.62783813476562,
"loss": 0.6562,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.17208227515220642,
"rewards/margins": 0.07876059412956238,
"rewards/rejected": -0.2508428692817688,
"step": 225
},
{
"epoch": 0.24520255863539445,
"grad_norm": 8.063146353723733,
"learning_rate": 4.6864490053432e-07,
"logits/chosen": -0.8260966539382935,
"logits/rejected": -0.9618522524833679,
"logps/chosen": -181.78347778320312,
"logps/rejected": -169.95745849609375,
"loss": 0.6563,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.19365951418876648,
"rewards/margins": 0.0855235755443573,
"rewards/rejected": -0.2791830897331238,
"step": 230
},
{
"epoch": 0.2505330490405117,
"grad_norm": 8.049754139869536,
"learning_rate": 4.6635108990893033e-07,
"logits/chosen": -0.7784782648086548,
"logits/rejected": -0.9096555709838867,
"logps/chosen": -183.8284454345703,
"logps/rejected": -179.8243408203125,
"loss": 0.6471,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.22320905327796936,
"rewards/margins": 0.07317076623439789,
"rewards/rejected": -0.29637983441352844,
"step": 235
},
{
"epoch": 0.255863539445629,
"grad_norm": 7.983403287573849,
"learning_rate": 4.6398234140190413e-07,
"logits/chosen": -0.7042727470397949,
"logits/rejected": -0.8362523317337036,
"logps/chosen": -177.31149291992188,
"logps/rejected": -171.16531372070312,
"loss": 0.6495,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2963384687900543,
"rewards/margins": 0.054160721600055695,
"rewards/rejected": -0.3504992127418518,
"step": 240
},
{
"epoch": 0.26119402985074625,
"grad_norm": 8.675362509945584,
"learning_rate": 4.615394754805443e-07,
"logits/chosen": -0.802803635597229,
"logits/rejected": -0.8539141416549683,
"logps/chosen": -185.90289306640625,
"logps/rejected": -192.5127716064453,
"loss": 0.641,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.28103405237197876,
"rewards/margins": 0.13215723633766174,
"rewards/rejected": -0.4131912589073181,
"step": 245
},
{
"epoch": 0.26652452025586354,
"grad_norm": 8.803685601945949,
"learning_rate": 4.5902333828432416e-07,
"logits/chosen": -0.7753912210464478,
"logits/rejected": -0.9024080038070679,
"logps/chosen": -181.60507202148438,
"logps/rejected": -184.08474731445312,
"loss": 0.651,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.34952813386917114,
"rewards/margins": 0.10412784665822983,
"rewards/rejected": -0.4536559581756592,
"step": 250
},
{
"epoch": 0.27185501066098083,
"grad_norm": 8.599862072625585,
"learning_rate": 4.5643480133180855e-07,
"logits/chosen": -0.7215537428855896,
"logits/rejected": -0.8429878354072571,
"logps/chosen": -193.21871948242188,
"logps/rejected": -192.76535034179688,
"loss": 0.652,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.43663716316223145,
"rewards/margins": 0.09049404412508011,
"rewards/rejected": -0.527131199836731,
"step": 255
},
{
"epoch": 0.2771855010660981,
"grad_norm": 10.219200493862845,
"learning_rate": 4.537747612187848e-07,
"logits/chosen": -0.83184415102005,
"logits/rejected": -1.0026618242263794,
"logps/chosen": -216.73391723632812,
"logps/rejected": -207.87527465820312,
"loss": 0.6408,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.4637986123561859,
"rewards/margins": 0.050548046827316284,
"rewards/rejected": -0.5143465995788574,
"step": 260
},
{
"epoch": 0.28251599147121537,
"grad_norm": 9.931121908912841,
"learning_rate": 4.510441393077069e-07,
"logits/chosen": -0.8512382507324219,
"logits/rejected": -1.0560386180877686,
"logps/chosen": -200.3084259033203,
"logps/rejected": -195.31561279296875,
"loss": 0.6472,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4075559675693512,
"rewards/margins": 0.09888825565576553,
"rewards/rejected": -0.5064442157745361,
"step": 265
},
{
"epoch": 0.2878464818763326,
"grad_norm": 8.859652819595963,
"learning_rate": 4.4824388140856194e-07,
"logits/chosen": -0.8754502534866333,
"logits/rejected": -1.0318089723587036,
"logps/chosen": -198.41824340820312,
"logps/rejected": -189.89077758789062,
"loss": 0.6502,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3337915539741516,
"rewards/margins": 0.13341596722602844,
"rewards/rejected": -0.46720752120018005,
"step": 270
},
{
"epoch": 0.2931769722814499,
"grad_norm": 9.06466635441445,
"learning_rate": 4.453749574512685e-07,
"logits/chosen": -0.9197045564651489,
"logits/rejected": -1.0155677795410156,
"logps/chosen": -185.05616760253906,
"logps/rejected": -186.17381286621094,
"loss": 0.6518,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.36515626311302185,
"rewards/margins": 0.09796006977558136,
"rewards/rejected": -0.4631163477897644,
"step": 275
},
{
"epoch": 0.29850746268656714,
"grad_norm": 8.989140378395783,
"learning_rate": 4.4243836114972003e-07,
"logits/chosen": -0.8735504150390625,
"logits/rejected": -1.004237413406372,
"logps/chosen": -186.00570678710938,
"logps/rejected": -192.78012084960938,
"loss": 0.6409,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3774046003818512,
"rewards/margins": 0.15985320508480072,
"rewards/rejected": -0.5372577905654907,
"step": 280
},
{
"epoch": 0.30383795309168443,
"grad_norm": 8.593968658418248,
"learning_rate": 4.3943510965759113e-07,
"logits/chosen": -0.9259954690933228,
"logits/rejected": -1.008984088897705,
"logps/chosen": -196.23764038085938,
"logps/rejected": -200.01434326171875,
"loss": 0.6385,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3551446795463562,
"rewards/margins": 0.15665681660175323,
"rewards/rejected": -0.5118014812469482,
"step": 285
},
{
"epoch": 0.3091684434968017,
"grad_norm": 8.920829639132457,
"learning_rate": 4.3636624321602354e-07,
"logits/chosen": -0.9114233255386353,
"logits/rejected": -1.0022578239440918,
"logps/chosen": -199.855712890625,
"logps/rejected": -202.50045776367188,
"loss": 0.6372,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3673010468482971,
"rewards/margins": 0.11292729526758194,
"rewards/rejected": -0.48022833466529846,
"step": 290
},
{
"epoch": 0.31449893390191896,
"grad_norm": 9.061291327996578,
"learning_rate": 4.3323282479331713e-07,
"logits/chosen": -0.8595677614212036,
"logits/rejected": -0.9653046727180481,
"logps/chosen": -215.55239868164062,
"logps/rejected": -208.42080688476562,
"loss": 0.6506,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4611906409263611,
"rewards/margins": 0.046931833028793335,
"rewards/rejected": -0.508122444152832,
"step": 295
},
{
"epoch": 0.31982942430703626,
"grad_norm": 8.932259281056975,
"learning_rate": 4.300359397167469e-07,
"logits/chosen": -0.899543285369873,
"logits/rejected": -1.058935523033142,
"logps/chosen": -221.4969024658203,
"logps/rejected": -217.5653076171875,
"loss": 0.6415,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3796769082546234,
"rewards/margins": 0.15340924263000488,
"rewards/rejected": -0.5330861806869507,
"step": 300
},
{
"epoch": 0.3251599147121535,
"grad_norm": 10.69745472768232,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -0.7874996066093445,
"logits/rejected": -0.9376864433288574,
"logps/chosen": -175.2811279296875,
"logps/rejected": -174.26589965820312,
"loss": 0.6345,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.3251371383666992,
"rewards/margins": 0.14439386129379272,
"rewards/rejected": -0.46953099966049194,
"step": 305
},
{
"epoch": 0.3304904051172708,
"grad_norm": 9.502534309876182,
"learning_rate": 4.2345622044281914e-07,
"logits/chosen": -0.8365820646286011,
"logits/rejected": -0.9602219462394714,
"logps/chosen": -198.43325805664062,
"logps/rejected": -202.5500946044922,
"loss": 0.6393,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.44264811277389526,
"rewards/margins": 0.15877890586853027,
"rewards/rejected": -0.6014270186424255,
"step": 310
},
{
"epoch": 0.3358208955223881,
"grad_norm": 10.39334291807327,
"learning_rate": 4.200756652736115e-07,
"logits/chosen": -0.8717101812362671,
"logits/rejected": -0.9584082365036011,
"logps/chosen": -212.91275024414062,
"logps/rejected": -228.49215698242188,
"loss": 0.6423,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.48386502265930176,
"rewards/margins": 0.22382013499736786,
"rewards/rejected": -0.7076851725578308,
"step": 315
},
{
"epoch": 0.3411513859275053,
"grad_norm": 9.34889686698433,
"learning_rate": 4.1663620071744896e-07,
"logits/chosen": -0.8714283108711243,
"logits/rejected": -0.9042676091194153,
"logps/chosen": -184.66299438476562,
"logps/rejected": -192.072021484375,
"loss": 0.6433,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.47013959288597107,
"rewards/margins": 0.11798025667667389,
"rewards/rejected": -0.5881198644638062,
"step": 320
},
{
"epoch": 0.3464818763326226,
"grad_norm": 8.959638666781437,
"learning_rate": 4.131390181073076e-07,
"logits/chosen": -0.8877362012863159,
"logits/rejected": -1.0048226118087769,
"logps/chosen": -208.8312530517578,
"logps/rejected": -213.86831665039062,
"loss": 0.6312,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.397031307220459,
"rewards/margins": 0.15148170292377472,
"rewards/rejected": -0.5485130548477173,
"step": 325
},
{
"epoch": 0.35181236673773986,
"grad_norm": 9.592550539528885,
"learning_rate": 4.0958532876806036e-07,
"logits/chosen": -0.8785327076911926,
"logits/rejected": -0.9449760317802429,
"logps/chosen": -222.5094757080078,
"logps/rejected": -226.78274536132812,
"loss": 0.6337,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.5226814150810242,
"rewards/margins": 0.08113773167133331,
"rewards/rejected": -0.6038191914558411,
"step": 330
},
{
"epoch": 0.35714285714285715,
"grad_norm": 10.432646240734098,
"learning_rate": 4.0597636359690854e-07,
"logits/chosen": -0.927719235420227,
"logits/rejected": -1.0275365114212036,
"logps/chosen": -223.2044219970703,
"logps/rejected": -225.4947967529297,
"loss": 0.6146,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.5208097696304321,
"rewards/margins": 0.20672473311424255,
"rewards/rejected": -0.7275345325469971,
"step": 335
},
{
"epoch": 0.3624733475479744,
"grad_norm": 11.479351717352401,
"learning_rate": 4.023133726370341e-07,
"logits/chosen": -0.9192001223564148,
"logits/rejected": -1.064570426940918,
"logps/chosen": -218.15878295898438,
"logps/rejected": -227.5740966796875,
"loss": 0.6338,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5721922516822815,
"rewards/margins": 0.18816125392913818,
"rewards/rejected": -0.7603535056114197,
"step": 340
},
{
"epoch": 0.3678038379530917,
"grad_norm": 9.752374588486973,
"learning_rate": 3.9859762464461986e-07,
"logits/chosen": -0.9149691462516785,
"logits/rejected": -0.9972041845321655,
"logps/chosen": -226.1667938232422,
"logps/rejected": -231.6415557861328,
"loss": 0.6171,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.5891625285148621,
"rewards/margins": 0.15944533050060272,
"rewards/rejected": -0.7486079931259155,
"step": 345
},
{
"epoch": 0.373134328358209,
"grad_norm": 10.686589782660716,
"learning_rate": 3.9483040664938844e-07,
"logits/chosen": -0.9657170176506042,
"logits/rejected": -1.0521764755249023,
"logps/chosen": -222.5596160888672,
"logps/rejected": -228.71206665039062,
"loss": 0.6368,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5828268527984619,
"rewards/margins": 0.16033609211444855,
"rewards/rejected": -0.7431629300117493,
"step": 350
},
{
"epoch": 0.3784648187633262,
"grad_norm": 11.539842635377966,
"learning_rate": 3.910130235088118e-07,
"logits/chosen": -0.9302359819412231,
"logits/rejected": -0.9725440740585327,
"logps/chosen": -218.52835083007812,
"logps/rejected": -236.67294311523438,
"loss": 0.6251,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6089349389076233,
"rewards/margins": 0.1914242058992386,
"rewards/rejected": -0.8003591299057007,
"step": 355
},
{
"epoch": 0.3837953091684435,
"grad_norm": 11.169667795718752,
"learning_rate": 3.8714679745614556e-07,
"logits/chosen": -0.9223200082778931,
"logits/rejected": -1.0504696369171143,
"logps/chosen": -218.5863037109375,
"logps/rejected": -221.20297241210938,
"loss": 0.6276,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6130382418632507,
"rewards/margins": 0.17048679292201996,
"rewards/rejected": -0.7835251092910767,
"step": 360
},
{
"epoch": 0.38912579957356075,
"grad_norm": 11.400952450487356,
"learning_rate": 3.8323306764244445e-07,
"logits/chosen": -0.8188157081604004,
"logits/rejected": -0.9803248643875122,
"logps/chosen": -235.3826904296875,
"logps/rejected": -230.9711456298828,
"loss": 0.6338,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6885900497436523,
"rewards/margins": 0.15824738144874573,
"rewards/rejected": -0.8468375205993652,
"step": 365
},
{
"epoch": 0.39445628997867804,
"grad_norm": 12.170280653967604,
"learning_rate": 3.792731896727196e-07,
"logits/chosen": -0.9494584798812866,
"logits/rejected": -0.9925413131713867,
"logps/chosen": -215.47842407226562,
"logps/rejected": -234.8311767578125,
"loss": 0.6269,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6851028203964233,
"rewards/margins": 0.2173648625612259,
"rewards/rejected": -0.9024677276611328,
"step": 370
},
{
"epoch": 0.3997867803837953,
"grad_norm": 10.692294507184064,
"learning_rate": 3.752685351363937e-07,
"logits/chosen": -0.9608640670776367,
"logits/rejected": -1.0993045568466187,
"logps/chosen": -240.19479370117188,
"logps/rejected": -244.61849975585938,
"loss": 0.61,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.772241473197937,
"rewards/margins": 0.1992679387331009,
"rewards/rejected": -0.9715094566345215,
"step": 375
},
{
"epoch": 0.4051172707889126,
"grad_norm": 10.604201029450365,
"learning_rate": 3.712204911322228e-07,
"logits/chosen": -0.8940795660018921,
"logits/rejected": -1.0112766027450562,
"logps/chosen": -226.2286376953125,
"logps/rejected": -236.16824340820312,
"loss": 0.6243,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7305911779403687,
"rewards/margins": 0.17177362740039825,
"rewards/rejected": -0.9023649096488953,
"step": 380
},
{
"epoch": 0.41044776119402987,
"grad_norm": 11.725383803893557,
"learning_rate": 3.671304597878437e-07,
"logits/chosen": -0.8430676460266113,
"logits/rejected": -0.9990310668945312,
"logps/chosen": -226.9669189453125,
"logps/rejected": -233.7083740234375,
"loss": 0.6201,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7214430570602417,
"rewards/margins": 0.18212191760540009,
"rewards/rejected": -0.9035650491714478,
"step": 385
},
{
"epoch": 0.4157782515991471,
"grad_norm": 11.91435588205294,
"learning_rate": 3.629998577741174e-07,
"logits/chosen": -0.9615923762321472,
"logits/rejected": -1.166372537612915,
"logps/chosen": -227.1701202392578,
"logps/rejected": -231.6777801513672,
"loss": 0.6179,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6583755016326904,
"rewards/margins": 0.21044659614562988,
"rewards/rejected": -0.8688220977783203,
"step": 390
},
{
"epoch": 0.4211087420042644,
"grad_norm": 11.584436951407874,
"learning_rate": 3.588301158144338e-07,
"logits/chosen": -0.9084697961807251,
"logits/rejected": -0.9693692922592163,
"logps/chosen": -245.2818145751953,
"logps/rejected": -247.6303253173828,
"loss": 0.6356,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.822106659412384,
"rewards/margins": 0.17483489215373993,
"rewards/rejected": -0.9969415664672852,
"step": 395
},
{
"epoch": 0.42643923240938164,
"grad_norm": 12.023549452448254,
"learning_rate": 3.546226781891501e-07,
"logits/chosen": -0.8738770484924316,
"logits/rejected": -1.0339401960372925,
"logps/chosen": -238.5256805419922,
"logps/rejected": -247.3977813720703,
"loss": 0.6265,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.713554322719574,
"rewards/margins": 0.22547940909862518,
"rewards/rejected": -0.9390336871147156,
"step": 400
},
{
"epoch": 0.42643923240938164,
"eval_logits/chosen": -1.4091081619262695,
"eval_logits/rejected": -1.3678923845291138,
"eval_logps/chosen": -229.8961181640625,
"eval_logps/rejected": -245.27667236328125,
"eval_loss": 0.6455010771751404,
"eval_rewards/accuracies": 0.6504064798355103,
"eval_rewards/chosen": -0.7831487059593201,
"eval_rewards/margins": 0.165547713637352,
"eval_rewards/rejected": -0.9486963748931885,
"eval_runtime": 167.4485,
"eval_samples_per_second": 11.717,
"eval_steps_per_second": 1.469,
"step": 400
},
{
"epoch": 0.43176972281449894,
"grad_norm": 11.60850533610325,
"learning_rate": 3.5037900223533325e-07,
"logits/chosen": -0.9261396527290344,
"logits/rejected": -1.0748217105865479,
"logps/chosen": -221.2123565673828,
"logps/rejected": -228.6689910888672,
"loss": 0.5935,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6648604273796082,
"rewards/margins": 0.26066452264785767,
"rewards/rejected": -0.925524890422821,
"step": 405
},
{
"epoch": 0.43710021321961623,
"grad_norm": 11.18634804714827,
"learning_rate": 3.461005578419791e-07,
"logits/chosen": -0.8335205316543579,
"logits/rejected": -0.9212998151779175,
"logps/chosen": -244.4559783935547,
"logps/rejected": -249.6932830810547,
"loss": 0.6405,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8230158090591431,
"rewards/margins": 0.19074369966983795,
"rewards/rejected": -1.0137594938278198,
"step": 410
},
{
"epoch": 0.44243070362473347,
"grad_norm": 12.840173089928633,
"learning_rate": 3.4178882694088507e-07,
"logits/chosen": -0.9584044218063354,
"logits/rejected": -1.1188139915466309,
"logps/chosen": -230.2344970703125,
"logps/rejected": -227.662841796875,
"loss": 0.6385,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.7208009958267212,
"rewards/margins": 0.14610765874385834,
"rewards/rejected": -0.8669085502624512,
"step": 415
},
{
"epoch": 0.44776119402985076,
"grad_norm": 12.162616519674994,
"learning_rate": 3.374453029933509e-07,
"logits/chosen": -0.972398579120636,
"logits/rejected": -1.1250841617584229,
"logps/chosen": -232.1986541748047,
"logps/rejected": -250.6172332763672,
"loss": 0.6009,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.5975956320762634,
"rewards/margins": 0.4015510678291321,
"rewards/rejected": -0.9991466403007507,
"step": 420
},
{
"epoch": 0.453091684434968,
"grad_norm": 11.14473654749687,
"learning_rate": 3.3307149047288575e-07,
"logits/chosen": -0.9900253415107727,
"logits/rejected": -1.084149718284607,
"logps/chosen": -239.5063018798828,
"logps/rejected": -251.7197723388672,
"loss": 0.6014,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7624539732933044,
"rewards/margins": 0.22666020691394806,
"rewards/rejected": -0.9891141653060913,
"step": 425
},
{
"epoch": 0.4584221748400853,
"grad_norm": 12.058706301647955,
"learning_rate": 3.286689043441015e-07,
"logits/chosen": -0.9329894185066223,
"logits/rejected": -1.0619364976882935,
"logps/chosen": -246.6885223388672,
"logps/rejected": -256.63360595703125,
"loss": 0.6082,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8027140498161316,
"rewards/margins": 0.26076698303222656,
"rewards/rejected": -1.063481092453003,
"step": 430
},
{
"epoch": 0.46375266524520253,
"grad_norm": 13.080197907672602,
"learning_rate": 3.2423906953797207e-07,
"logits/chosen": -0.8946924209594727,
"logits/rejected": -0.9567023515701294,
"logps/chosen": -211.9040069580078,
"logps/rejected": -230.66299438476562,
"loss": 0.6036,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.698184609413147,
"rewards/margins": 0.28591758012771606,
"rewards/rejected": -0.9841020703315735,
"step": 435
},
{
"epoch": 0.4690831556503198,
"grad_norm": 10.950896035507196,
"learning_rate": 3.197835204236402e-07,
"logits/chosen": -1.0192838907241821,
"logits/rejected": -1.1130427122116089,
"logps/chosen": -241.8474578857422,
"logps/rejected": -262.25335693359375,
"loss": 0.6106,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.860467791557312,
"rewards/margins": 0.19941401481628418,
"rewards/rejected": -1.0598818063735962,
"step": 440
},
{
"epoch": 0.4744136460554371,
"grad_norm": 12.737513715694588,
"learning_rate": 3.153038002769558e-07,
"logits/chosen": -0.9327136874198914,
"logits/rejected": -1.030767560005188,
"logps/chosen": -248.5272979736328,
"logps/rejected": -258.75872802734375,
"loss": 0.6136,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8692724108695984,
"rewards/margins": 0.2143835574388504,
"rewards/rejected": -1.0836559534072876,
"step": 445
},
{
"epoch": 0.47974413646055436,
"grad_norm": 15.296734055580355,
"learning_rate": 3.1080146074592877e-07,
"logits/chosen": -0.9727839231491089,
"logits/rejected": -1.0751718282699585,
"logps/chosen": -241.76846313476562,
"logps/rejected": -251.7017364501953,
"loss": 0.6371,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8884710073471069,
"rewards/margins": 0.22023312747478485,
"rewards/rejected": -1.1087043285369873,
"step": 450
},
{
"epoch": 0.48507462686567165,
"grad_norm": 11.780615719126844,
"learning_rate": 3.0627806131328246e-07,
"logits/chosen": -0.9416291117668152,
"logits/rejected": -1.0612024068832397,
"logps/chosen": -235.45767211914062,
"logps/rejected": -247.3424530029297,
"loss": 0.6173,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8884525299072266,
"rewards/margins": 0.24100270867347717,
"rewards/rejected": -1.1294552087783813,
"step": 455
},
{
"epoch": 0.4904051172707889,
"grad_norm": 13.447809886194765,
"learning_rate": 3.017351687562928e-07,
"logits/chosen": -1.0132644176483154,
"logits/rejected": -1.1040401458740234,
"logps/chosen": -247.7209014892578,
"logps/rejected": -248.7446746826172,
"loss": 0.6152,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.9005931615829468,
"rewards/margins": 0.11596985161304474,
"rewards/rejected": -1.016563057899475,
"step": 460
},
{
"epoch": 0.4957356076759062,
"grad_norm": 13.620020353000207,
"learning_rate": 2.971743566041009e-07,
"logits/chosen": -1.0589954853057861,
"logits/rejected": -1.066146731376648,
"logps/chosen": -247.865478515625,
"logps/rejected": -260.4234313964844,
"loss": 0.607,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8577459454536438,
"rewards/margins": 0.17027950286865234,
"rewards/rejected": -1.0280256271362305,
"step": 465
},
{
"epoch": 0.5010660980810234,
"grad_norm": 12.958251619951241,
"learning_rate": 2.925972045926878e-07,
"logits/chosen": -0.9736588597297668,
"logits/rejected": -1.0960971117019653,
"logps/chosen": -220.01126098632812,
"logps/rejected": -241.4611358642578,
"loss": 0.6215,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7264107465744019,
"rewards/margins": 0.30126145482063293,
"rewards/rejected": -1.0276721715927124,
"step": 470
},
{
"epoch": 0.5063965884861408,
"grad_norm": 13.3140639310756,
"learning_rate": 2.880052981176979e-07,
"logits/chosen": -0.9312192797660828,
"logits/rejected": -1.0160053968429565,
"logps/chosen": -222.43057250976562,
"logps/rejected": -225.4202423095703,
"loss": 0.63,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8433539271354675,
"rewards/margins": 0.1893020123243332,
"rewards/rejected": -1.032655954360962,
"step": 475
},
{
"epoch": 0.511727078891258,
"grad_norm": 10.478988610995284,
"learning_rate": 2.83400227685304e-07,
"logits/chosen": -0.9926323890686035,
"logits/rejected": -1.1106306314468384,
"logps/chosen": -249.5770263671875,
"logps/rejected": -254.35794067382812,
"loss": 0.612,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.795128345489502,
"rewards/margins": 0.20585620403289795,
"rewards/rejected": -1.0009845495224,
"step": 480
},
{
"epoch": 0.5170575692963753,
"grad_norm": 13.398290532585486,
"learning_rate": 2.7878358836129984e-07,
"logits/chosen": -1.035072922706604,
"logits/rejected": -1.1353219747543335,
"logps/chosen": -231.4602508544922,
"logps/rejected": -246.7826690673828,
"loss": 0.6198,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7928118705749512,
"rewards/margins": 0.24832260608673096,
"rewards/rejected": -1.0411344766616821,
"step": 485
},
{
"epoch": 0.5223880597014925,
"grad_norm": 17.87280922700601,
"learning_rate": 2.7415697921861525e-07,
"logits/chosen": -0.9991563558578491,
"logits/rejected": -1.1898800134658813,
"logps/chosen": -280.6573181152344,
"logps/rejected": -277.5446472167969,
"loss": 0.6233,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.0797468423843384,
"rewards/margins": 0.14736375212669373,
"rewards/rejected": -1.2271106243133545,
"step": 490
},
{
"epoch": 0.5277185501066098,
"grad_norm": 14.03763690100759,
"learning_rate": 2.6952200278344253e-07,
"logits/chosen": -0.8905277252197266,
"logits/rejected": -1.083092451095581,
"logps/chosen": -234.2960662841797,
"logps/rejected": -247.935791015625,
"loss": 0.5984,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8255411982536316,
"rewards/margins": 0.22554175555706024,
"rewards/rejected": -1.051082968711853,
"step": 495
},
{
"epoch": 0.5330490405117271,
"grad_norm": 13.488186802243888,
"learning_rate": 2.6488026448016686e-07,
"logits/chosen": -1.000211238861084,
"logits/rejected": -1.137927770614624,
"logps/chosen": -260.8866882324219,
"logps/rejected": -279.22772216796875,
"loss": 0.6074,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9299663305282593,
"rewards/margins": 0.32078418135643005,
"rewards/rejected": -1.2507504224777222,
"step": 500
},
{
"epoch": 0.5383795309168443,
"grad_norm": 13.891894437147359,
"learning_rate": 2.602333720752927e-07,
"logits/chosen": -1.0730583667755127,
"logits/rejected": -1.1127971410751343,
"logps/chosen": -249.81387329101562,
"logps/rejected": -281.8894958496094,
"loss": 0.6043,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9583339691162109,
"rewards/margins": 0.47916507720947266,
"rewards/rejected": -1.4374990463256836,
"step": 505
},
{
"epoch": 0.5437100213219617,
"grad_norm": 13.78432594287455,
"learning_rate": 2.5558293512055923e-07,
"logits/chosen": -0.9855419397354126,
"logits/rejected": -1.0839966535568237,
"logps/chosen": -259.2463684082031,
"logps/rejected": -281.6179504394531,
"loss": 0.575,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9760101437568665,
"rewards/margins": 0.3508725166320801,
"rewards/rejected": -1.3268824815750122,
"step": 510
},
{
"epoch": 0.5490405117270789,
"grad_norm": 13.647127313830234,
"learning_rate": 2.509305643954369e-07,
"logits/chosen": -1.0535143613815308,
"logits/rejected": -1.2141129970550537,
"logps/chosen": -235.3438720703125,
"logps/rejected": -240.82583618164062,
"loss": 0.6014,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8737846612930298,
"rewards/margins": 0.20960083603858948,
"rewards/rejected": -1.0833853483200073,
"step": 515
},
{
"epoch": 0.5543710021321961,
"grad_norm": 12.039038135090694,
"learning_rate": 2.4627787134919946e-07,
"logits/chosen": -0.9818887710571289,
"logits/rejected": -1.1572598218917847,
"logps/chosen": -270.9326171875,
"logps/rejected": -290.0334167480469,
"loss": 0.5699,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9136640429496765,
"rewards/margins": 0.3413304388523102,
"rewards/rejected": -1.254994511604309,
"step": 520
},
{
"epoch": 0.5597014925373134,
"grad_norm": 12.903483427378369,
"learning_rate": 2.41626467542764e-07,
"logits/chosen": -0.9903634190559387,
"logits/rejected": -1.0747687816619873,
"logps/chosen": -246.91549682617188,
"logps/rejected": -270.34954833984375,
"loss": 0.5932,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9555915594100952,
"rewards/margins": 0.39527803659439087,
"rewards/rejected": -1.3508695363998413,
"step": 525
},
{
"epoch": 0.5650319829424307,
"grad_norm": 13.018513420276996,
"learning_rate": 2.369779640904909e-07,
"logits/chosen": -1.0167109966278076,
"logits/rejected": -1.1171668767929077,
"logps/chosen": -260.72833251953125,
"logps/rejected": -275.2940673828125,
"loss": 0.5986,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9983251690864563,
"rewards/margins": 0.2603410482406616,
"rewards/rejected": -1.2586661577224731,
"step": 530
},
{
"epoch": 0.570362473347548,
"grad_norm": 14.17080589691093,
"learning_rate": 2.3233397110214044e-07,
"logits/chosen": -1.114485740661621,
"logits/rejected": -1.2273226976394653,
"logps/chosen": -267.9627380371094,
"logps/rejected": -286.55926513671875,
"loss": 0.6196,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9473945498466492,
"rewards/margins": 0.29965347051620483,
"rewards/rejected": -1.247048020362854,
"step": 535
},
{
"epoch": 0.5756929637526652,
"grad_norm": 14.907759901462226,
"learning_rate": 2.2769609712517602e-07,
"logits/chosen": -1.051343560218811,
"logits/rejected": -1.112343668937683,
"logps/chosen": -282.5835266113281,
"logps/rejected": -286.35833740234375,
"loss": 0.6281,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.1061525344848633,
"rewards/margins": 0.1019170731306076,
"rewards/rejected": -1.2080695629119873,
"step": 540
},
{
"epoch": 0.5810234541577826,
"grad_norm": 14.332154011748669,
"learning_rate": 2.2306594858760898e-07,
"logits/chosen": -0.9886674880981445,
"logits/rejected": -1.1096317768096924,
"logps/chosen": -264.4873046875,
"logps/rejected": -288.09381103515625,
"loss": 0.6197,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.9765904545783997,
"rewards/margins": 0.43369174003601074,
"rewards/rejected": -1.4102822542190552,
"step": 545
},
{
"epoch": 0.5863539445628998,
"grad_norm": 14.364439575082633,
"learning_rate": 2.184451292415778e-07,
"logits/chosen": -1.0294235944747925,
"logits/rejected": -1.0585139989852905,
"logps/chosen": -229.28250122070312,
"logps/rejected": -251.99560546875,
"loss": 0.6132,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8699715733528137,
"rewards/margins": 0.3365539610385895,
"rewards/rejected": -1.2065255641937256,
"step": 550
},
{
"epoch": 0.591684434968017,
"grad_norm": 14.37697025208913,
"learning_rate": 2.1383523960785342e-07,
"logits/chosen": -1.0905894041061401,
"logits/rejected": -1.2189487218856812,
"logps/chosen": -245.4988250732422,
"logps/rejected": -253.37771606445312,
"loss": 0.611,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8295791745185852,
"rewards/margins": 0.25238728523254395,
"rewards/rejected": -1.0819664001464844,
"step": 555
},
{
"epoch": 0.5970149253731343,
"grad_norm": 13.273086730559621,
"learning_rate": 2.0923787642146434e-07,
"logits/chosen": -0.9458072781562805,
"logits/rejected": -1.0978261232376099,
"logps/chosen": -216.93496704101562,
"logps/rejected": -238.4340362548828,
"loss": 0.5878,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8222309947013855,
"rewards/margins": 0.3146277964115143,
"rewards/rejected": -1.1368588209152222,
"step": 560
},
{
"epoch": 0.6023454157782516,
"grad_norm": 12.275413565116182,
"learning_rate": 2.046546320786331e-07,
"logits/chosen": -1.0852059125900269,
"logits/rejected": -1.217184066772461,
"logps/chosen": -243.6894989013672,
"logps/rejected": -254.8353271484375,
"loss": 0.6099,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8728886842727661,
"rewards/margins": 0.2236359417438507,
"rewards/rejected": -1.0965244770050049,
"step": 565
},
{
"epoch": 0.6076759061833689,
"grad_norm": 13.385563460718537,
"learning_rate": 2.0008709408521507e-07,
"logits/chosen": -1.075157880783081,
"logits/rejected": -1.1313838958740234,
"logps/chosen": -230.10983276367188,
"logps/rejected": -256.946044921875,
"loss": 0.5973,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7323789000511169,
"rewards/margins": 0.291358083486557,
"rewards/rejected": -1.0237371921539307,
"step": 570
},
{
"epoch": 0.6130063965884861,
"grad_norm": 12.904686522939175,
"learning_rate": 1.9553684450683193e-07,
"logits/chosen": -1.092653512954712,
"logits/rejected": -1.1976040601730347,
"logps/chosen": -237.1493682861328,
"logps/rejected": -256.56890869140625,
"loss": 0.6163,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8681387901306152,
"rewards/margins": 0.29039478302001953,
"rewards/rejected": -1.1585334539413452,
"step": 575
},
{
"epoch": 0.6183368869936035,
"grad_norm": 18.21477968462917,
"learning_rate": 1.9100545942088848e-07,
"logits/chosen": -1.0292062759399414,
"logits/rejected": -1.1282401084899902,
"logps/chosen": -222.6328582763672,
"logps/rejected": -251.3138885498047,
"loss": 0.6092,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8912476301193237,
"rewards/margins": 0.3392987847328186,
"rewards/rejected": -1.2305463552474976,
"step": 580
},
{
"epoch": 0.6236673773987207,
"grad_norm": 13.078178032971321,
"learning_rate": 1.8649450837066444e-07,
"logits/chosen": -1.1086572408676147,
"logits/rejected": -1.2702162265777588,
"logps/chosen": -237.74673461914062,
"logps/rejected": -256.3179626464844,
"loss": 0.5969,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7724729776382446,
"rewards/margins": 0.34164559841156006,
"rewards/rejected": -1.1141188144683838,
"step": 585
},
{
"epoch": 0.6289978678038379,
"grad_norm": 14.54157536622139,
"learning_rate": 1.8200555382166898e-07,
"logits/chosen": -1.036029577255249,
"logits/rejected": -1.1076552867889404,
"logps/chosen": -257.90423583984375,
"logps/rejected": -276.80859375,
"loss": 0.5922,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9389132261276245,
"rewards/margins": 0.3674803078174591,
"rewards/rejected": -1.3063933849334717,
"step": 590
},
{
"epoch": 0.6343283582089553,
"grad_norm": 13.963862396350292,
"learning_rate": 1.775401506204472e-07,
"logits/chosen": -1.0365560054779053,
"logits/rejected": -1.111011266708374,
"logps/chosen": -249.289794921875,
"logps/rejected": -261.4861755371094,
"loss": 0.5923,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9952207803726196,
"rewards/margins": 0.24101737141609192,
"rewards/rejected": -1.2362381219863892,
"step": 595
},
{
"epoch": 0.6396588486140725,
"grad_norm": 15.013492823138591,
"learning_rate": 1.7309984545602528e-07,
"logits/chosen": -1.160706877708435,
"logits/rejected": -1.2153781652450562,
"logps/chosen": -283.8118896484375,
"logps/rejected": -309.1605529785156,
"loss": 0.6042,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2339740991592407,
"rewards/margins": 0.33963102102279663,
"rewards/rejected": -1.5736052989959717,
"step": 600
},
{
"epoch": 0.6449893390191898,
"grad_norm": 12.416383191561394,
"learning_rate": 1.6868617632418114e-07,
"logits/chosen": -1.1419028043746948,
"logits/rejected": -1.2748745679855347,
"logps/chosen": -278.46051025390625,
"logps/rejected": -305.10687255859375,
"loss": 0.5995,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.106767177581787,
"rewards/margins": 0.3402232229709625,
"rewards/rejected": -1.4469903707504272,
"step": 605
},
{
"epoch": 0.650319829424307,
"grad_norm": 13.316722698785181,
"learning_rate": 1.6430067199472657e-07,
"logits/chosen": -1.0973302125930786,
"logits/rejected": -1.1705577373504639,
"logps/chosen": -238.37857055664062,
"logps/rejected": -260.87811279296875,
"loss": 0.593,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9219633936882019,
"rewards/margins": 0.3023082911968231,
"rewards/rejected": -1.224271535873413,
"step": 610
},
{
"epoch": 0.6556503198294243,
"grad_norm": 14.96464401167362,
"learning_rate": 1.599448514819844e-07,
"logits/chosen": -1.1169979572296143,
"logits/rejected": -1.2400496006011963,
"logps/chosen": -251.90786743164062,
"logps/rejected": -275.16632080078125,
"loss": 0.6088,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9491860270500183,
"rewards/margins": 0.33866086602211,
"rewards/rejected": -1.2878468036651611,
"step": 615
},
{
"epoch": 0.6609808102345416,
"grad_norm": 14.725406377953089,
"learning_rate": 1.5562022351864534e-07,
"logits/chosen": -1.0977303981781006,
"logits/rejected": -1.129206657409668,
"logps/chosen": -240.2178955078125,
"logps/rejected": -280.0063171386719,
"loss": 0.5894,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.890995979309082,
"rewards/margins": 0.4513840079307556,
"rewards/rejected": -1.3423799276351929,
"step": 620
},
{
"epoch": 0.6663113006396588,
"grad_norm": 12.844371576008434,
"learning_rate": 1.5132828603318577e-07,
"logits/chosen": -1.0323293209075928,
"logits/rejected": -1.139953374862671,
"logps/chosen": -255.4688720703125,
"logps/rejected": -270.874755859375,
"loss": 0.5895,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.0522164106369019,
"rewards/margins": 0.21118326485157013,
"rewards/rejected": -1.2633997201919556,
"step": 625
},
{
"epoch": 0.6716417910447762,
"grad_norm": 17.421764111517,
"learning_rate": 1.4707052563102748e-07,
"logits/chosen": -1.0614488124847412,
"logits/rejected": -1.1523784399032593,
"logps/chosen": -246.06497192382812,
"logps/rejected": -260.67041015625,
"loss": 0.5892,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0199403762817383,
"rewards/margins": 0.25652509927749634,
"rewards/rejected": -1.2764654159545898,
"step": 630
},
{
"epoch": 0.6769722814498934,
"grad_norm": 11.820948981122326,
"learning_rate": 1.4284841707961987e-07,
"logits/chosen": -1.1177728176116943,
"logits/rejected": -1.2426093816757202,
"logps/chosen": -238.572509765625,
"logps/rejected": -274.74896240234375,
"loss": 0.5731,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9242167472839355,
"rewards/margins": 0.4471352994441986,
"rewards/rejected": -1.371351957321167,
"step": 635
},
{
"epoch": 0.6823027718550106,
"grad_norm": 16.810425880201876,
"learning_rate": 1.386634227976224e-07,
"logits/chosen": -1.0978498458862305,
"logits/rejected": -1.1455624103546143,
"logps/chosen": -252.0463409423828,
"logps/rejected": -267.16900634765625,
"loss": 0.6045,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.01809561252594,
"rewards/margins": 0.2102915495634079,
"rewards/rejected": -1.2283871173858643,
"step": 640
},
{
"epoch": 0.6876332622601279,
"grad_norm": 16.16875613964214,
"learning_rate": 1.345169923483642e-07,
"logits/chosen": -1.074209451675415,
"logits/rejected": -1.0901873111724854,
"logps/chosen": -242.7150421142578,
"logps/rejected": -255.5235595703125,
"loss": 0.6142,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9898387789726257,
"rewards/margins": 0.1817895770072937,
"rewards/rejected": -1.1716282367706299,
"step": 645
},
{
"epoch": 0.6929637526652452,
"grad_norm": 14.889847230913357,
"learning_rate": 1.3041056193775665e-07,
"logits/chosen": -1.1271008253097534,
"logits/rejected": -1.2666515111923218,
"logps/chosen": -270.30523681640625,
"logps/rejected": -305.54791259765625,
"loss": 0.5835,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0262703895568848,
"rewards/margins": 0.49466007947921753,
"rewards/rejected": -1.520930528640747,
"step": 650
},
{
"epoch": 0.6982942430703625,
"grad_norm": 17.172517125459027,
"learning_rate": 1.2634555391683188e-07,
"logits/chosen": -1.1146763563156128,
"logits/rejected": -1.1514074802398682,
"logps/chosen": -282.617431640625,
"logps/rejected": -305.576416015625,
"loss": 0.6102,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0780295133590698,
"rewards/margins": 0.341577410697937,
"rewards/rejected": -1.4196069240570068,
"step": 655
},
{
"epoch": 0.7036247334754797,
"grad_norm": 12.769747207532605,
"learning_rate": 1.2232337628908103e-07,
"logits/chosen": -1.005274772644043,
"logits/rejected": -1.120625376701355,
"logps/chosen": -270.090087890625,
"logps/rejected": -300.65325927734375,
"loss": 0.5804,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.084705114364624,
"rewards/margins": 0.36065369844436646,
"rewards/rejected": -1.4453589916229248,
"step": 660
},
{
"epoch": 0.7089552238805971,
"grad_norm": 13.848401050251594,
"learning_rate": 1.1834542222276206e-07,
"logits/chosen": -1.1191794872283936,
"logits/rejected": -1.2426977157592773,
"logps/chosen": -271.416015625,
"logps/rejected": -293.0040588378906,
"loss": 0.6129,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0729753971099854,
"rewards/margins": 0.35853153467178345,
"rewards/rejected": -1.4315071105957031,
"step": 665
},
{
"epoch": 0.7142857142857143,
"grad_norm": 15.508341962305703,
"learning_rate": 1.1441306956834504e-07,
"logits/chosen": -1.1978566646575928,
"logits/rejected": -1.2869031429290771,
"logps/chosen": -252.2648162841797,
"logps/rejected": -265.5303649902344,
"loss": 0.6299,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.888351559638977,
"rewards/margins": 0.23599569499492645,
"rewards/rejected": -1.1243473291397095,
"step": 670
},
{
"epoch": 0.7196162046908315,
"grad_norm": 12.326647139033,
"learning_rate": 1.1052768038126464e-07,
"logits/chosen": -1.0239012241363525,
"logits/rejected": -1.1444613933563232,
"logps/chosen": -273.5757751464844,
"logps/rejected": -296.2203369140625,
"loss": 0.594,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0173006057739258,
"rewards/margins": 0.30182453989982605,
"rewards/rejected": -1.3191251754760742,
"step": 675
},
{
"epoch": 0.7249466950959488,
"grad_norm": 18.71316397695756,
"learning_rate": 1.0669060045014214e-07,
"logits/chosen": -1.1475574970245361,
"logits/rejected": -1.2453533411026,
"logps/chosen": -267.3650207519531,
"logps/rejected": -284.7320861816406,
"loss": 0.6173,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.9687989354133606,
"rewards/margins": 0.274304062128067,
"rewards/rejected": -1.2431029081344604,
"step": 680
},
{
"epoch": 0.7302771855010661,
"grad_norm": 20.228539694090426,
"learning_rate": 1.0290315883064258e-07,
"logits/chosen": -1.0727207660675049,
"logits/rejected": -1.169166088104248,
"logps/chosen": -234.42538452148438,
"logps/rejected": -259.74664306640625,
"loss": 0.5982,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.879969596862793,
"rewards/margins": 0.3725913166999817,
"rewards/rejected": -1.2525609731674194,
"step": 685
},
{
"epoch": 0.7356076759061834,
"grad_norm": 13.518054635862288,
"learning_rate": 9.9166667385128e-08,
"logits/chosen": -1.0578858852386475,
"logits/rejected": -1.159432053565979,
"logps/chosen": -266.2313232421875,
"logps/rejected": -277.4366149902344,
"loss": 0.5997,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0338330268859863,
"rewards/margins": 0.32066407799720764,
"rewards/rejected": -1.3544971942901611,
"step": 690
},
{
"epoch": 0.7409381663113006,
"grad_norm": 13.588646234249182,
"learning_rate": 9.54824203282647e-08,
"logits/chosen": -1.1122428178787231,
"logits/rejected": -1.1691913604736328,
"logps/chosen": -292.1220703125,
"logps/rejected": -314.33013916015625,
"loss": 0.6016,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.232649564743042,
"rewards/margins": 0.3411175310611725,
"rewards/rejected": -1.573767066001892,
"step": 695
},
{
"epoch": 0.746268656716418,
"grad_norm": 13.39599747528393,
"learning_rate": 9.185169377874488e-08,
"logits/chosen": -1.0565317869186401,
"logits/rejected": -1.104970932006836,
"logps/chosen": -232.4084014892578,
"logps/rejected": -271.18511962890625,
"loss": 0.6054,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9469151496887207,
"rewards/margins": 0.4982013702392578,
"rewards/rejected": -1.445116639137268,
"step": 700
},
{
"epoch": 0.7515991471215352,
"grad_norm": 12.55265938358736,
"learning_rate": 8.827574531727452e-08,
"logits/chosen": -1.085356593132019,
"logits/rejected": -1.254529595375061,
"logps/chosen": -234.07498168945312,
"logps/rejected": -255.6736297607422,
"loss": 0.5812,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9192997217178345,
"rewards/margins": 0.39327552914619446,
"rewards/rejected": -1.3125752210617065,
"step": 705
},
{
"epoch": 0.7569296375266524,
"grad_norm": 16.169497185007014,
"learning_rate": 8.475581355098379e-08,
"logits/chosen": -1.1205322742462158,
"logits/rejected": -1.227104902267456,
"logps/chosen": -254.82998657226562,
"logps/rejected": -265.16961669921875,
"loss": 0.5946,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9393894076347351,
"rewards/margins": 0.3107382357120514,
"rewards/rejected": -1.2501277923583984,
"step": 710
},
{
"epoch": 0.7622601279317697,
"grad_norm": 12.716385801386352,
"learning_rate": 8.129311768440807e-08,
"logits/chosen": -1.0053701400756836,
"logits/rejected": -1.096592903137207,
"logps/chosen": -277.47540283203125,
"logps/rejected": -304.0431823730469,
"loss": 0.5821,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1990084648132324,
"rewards/margins": 0.3271089792251587,
"rewards/rejected": -1.5261173248291016,
"step": 715
},
{
"epoch": 0.767590618336887,
"grad_norm": 18.552637237088017,
"learning_rate": 7.788885709719033e-08,
"logits/chosen": -1.0947494506835938,
"logits/rejected": -1.1496310234069824,
"logps/chosen": -254.96630859375,
"logps/rejected": -285.97308349609375,
"loss": 0.6177,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0156748294830322,
"rewards/margins": 0.3178596496582031,
"rewards/rejected": -1.333534598350525,
"step": 720
},
{
"epoch": 0.7729211087420043,
"grad_norm": 12.746321182622895,
"learning_rate": 7.454421092865037e-08,
"logits/chosen": -1.0280872583389282,
"logits/rejected": -1.1245920658111572,
"logps/chosen": -273.39501953125,
"logps/rejected": -283.32110595703125,
"loss": 0.5903,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.220465898513794,
"rewards/margins": 0.208203986287117,
"rewards/rejected": -1.4286696910858154,
"step": 725
},
{
"epoch": 0.7782515991471215,
"grad_norm": 11.876891342705504,
"learning_rate": 7.126033766936365e-08,
"logits/chosen": -1.1646153926849365,
"logits/rejected": -1.2614471912384033,
"logps/chosen": -267.27734375,
"logps/rejected": -288.18731689453125,
"loss": 0.6068,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.021458387374878,
"rewards/margins": 0.3050800859928131,
"rewards/rejected": -1.3265384435653687,
"step": 730
},
{
"epoch": 0.7835820895522388,
"grad_norm": 13.39270512390593,
"learning_rate": 6.80383747598938e-08,
"logits/chosen": -1.1257355213165283,
"logits/rejected": -1.1825156211853027,
"logps/chosen": -269.59747314453125,
"logps/rejected": -300.54266357421875,
"loss": 0.5952,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0236284732818604,
"rewards/margins": 0.42881250381469727,
"rewards/rejected": -1.4524409770965576,
"step": 735
},
{
"epoch": 0.7889125799573561,
"grad_norm": 14.236670336488492,
"learning_rate": 6.487943819681488e-08,
"logits/chosen": -1.0966401100158691,
"logits/rejected": -1.1666018962860107,
"logps/chosen": -249.0513153076172,
"logps/rejected": -277.46051025390625,
"loss": 0.5756,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.015439748764038,
"rewards/margins": 0.33569568395614624,
"rewards/rejected": -1.3511353731155396,
"step": 740
},
{
"epoch": 0.7942430703624733,
"grad_norm": 13.439712780756642,
"learning_rate": 6.178462214616203e-08,
"logits/chosen": -1.0600165128707886,
"logits/rejected": -1.1609599590301514,
"logps/chosen": -253.85018920898438,
"logps/rejected": -286.5955810546875,
"loss": 0.5808,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0086755752563477,
"rewards/margins": 0.4383172392845154,
"rewards/rejected": -1.4469928741455078,
"step": 745
},
{
"epoch": 0.7995735607675906,
"grad_norm": 13.727935713255976,
"learning_rate": 5.875499856444358e-08,
"logits/chosen": -1.052286148071289,
"logits/rejected": -1.177433967590332,
"logps/chosen": -263.67333984375,
"logps/rejected": -291.416015625,
"loss": 0.5914,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.058933138847351,
"rewards/margins": 0.31677955389022827,
"rewards/rejected": -1.3757128715515137,
"step": 750
},
{
"epoch": 0.8049040511727079,
"grad_norm": 13.5774716097425,
"learning_rate": 5.5791616827345484e-08,
"logits/chosen": -1.1035162210464478,
"logits/rejected": -1.2241528034210205,
"logps/chosen": -258.1141357421875,
"logps/rejected": -289.6146545410156,
"loss": 0.5851,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0831810235977173,
"rewards/margins": 0.37278053164482117,
"rewards/rejected": -1.4559617042541504,
"step": 755
},
{
"epoch": 0.8102345415778252,
"grad_norm": 13.872602372334944,
"learning_rate": 5.289550336625731e-08,
"logits/chosen": -0.967927098274231,
"logits/rejected": -1.1444356441497803,
"logps/chosen": -245.47329711914062,
"logps/rejected": -270.84033203125,
"loss": 0.5823,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0041565895080566,
"rewards/margins": 0.34393635392189026,
"rewards/rejected": -1.348093032836914,
"step": 760
},
{
"epoch": 0.8155650319829424,
"grad_norm": 14.466168736500185,
"learning_rate": 5.006766131274559e-08,
"logits/chosen": -1.1071698665618896,
"logits/rejected": -1.1825703382492065,
"logps/chosen": -275.54388427734375,
"logps/rejected": -296.87689208984375,
"loss": 0.604,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1156272888183594,
"rewards/margins": 0.3059665858745575,
"rewards/rejected": -1.4215937852859497,
"step": 765
},
{
"epoch": 0.8208955223880597,
"grad_norm": 15.26501051880337,
"learning_rate": 4.730907015109759e-08,
"logits/chosen": -1.037107229232788,
"logits/rejected": -1.1411950588226318,
"logps/chosen": -266.26123046875,
"logps/rejected": -296.9294128417969,
"loss": 0.5551,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.0510127544403076,
"rewards/margins": 0.46340426802635193,
"rewards/rejected": -1.5144169330596924,
"step": 770
},
{
"epoch": 0.826226012793177,
"grad_norm": 10.911420515343652,
"learning_rate": 4.4620685379055584e-08,
"logits/chosen": -1.0774571895599365,
"logits/rejected": -1.2212311029434204,
"logps/chosen": -263.77813720703125,
"logps/rejected": -281.2510070800781,
"loss": 0.6032,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.093552589416504,
"rewards/margins": 0.22536174952983856,
"rewards/rejected": -1.318914532661438,
"step": 775
},
{
"epoch": 0.8315565031982942,
"grad_norm": 13.252940181725066,
"learning_rate": 4.200343817685981e-08,
"logits/chosen": -1.134172797203064,
"logits/rejected": -1.157869577407837,
"logps/chosen": -237.58920288085938,
"logps/rejected": -262.5208435058594,
"loss": 0.59,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9115175008773804,
"rewards/margins": 0.29485780000686646,
"rewards/rejected": -1.2063753604888916,
"step": 780
},
{
"epoch": 0.8368869936034116,
"grad_norm": 18.75722287778124,
"learning_rate": 3.945823508471352e-08,
"logits/chosen": -1.1293060779571533,
"logits/rejected": -1.2253621816635132,
"logps/chosen": -274.54595947265625,
"logps/rejected": -298.9117126464844,
"loss": 0.6294,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0663083791732788,
"rewards/margins": 0.33463555574417114,
"rewards/rejected": -1.4009437561035156,
"step": 785
},
{
"epoch": 0.8422174840085288,
"grad_norm": 12.55898038094129,
"learning_rate": 3.698595768878363e-08,
"logits/chosen": -1.0901148319244385,
"logits/rejected": -1.2076427936553955,
"logps/chosen": -242.2190704345703,
"logps/rejected": -263.9744567871094,
"loss": 0.5918,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9093745946884155,
"rewards/margins": 0.4039868712425232,
"rewards/rejected": -1.313361406326294,
"step": 790
},
{
"epoch": 0.847547974413646,
"grad_norm": 12.79776093588139,
"learning_rate": 3.458746231584414e-08,
"logits/chosen": -1.1291230916976929,
"logits/rejected": -1.2340444326400757,
"logps/chosen": -260.56195068359375,
"logps/rejected": -304.3238525390625,
"loss": 0.5823,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.992597222328186,
"rewards/margins": 0.5544020533561707,
"rewards/rejected": -1.5469990968704224,
"step": 795
},
{
"epoch": 0.8528784648187633,
"grad_norm": 13.33658395118552,
"learning_rate": 3.226357973666888e-08,
"logits/chosen": -1.10861074924469,
"logits/rejected": -1.2813326120376587,
"logps/chosen": -228.31155395507812,
"logps/rejected": -253.84207153320312,
"loss": 0.6053,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8997095227241516,
"rewards/margins": 0.36860379576683044,
"rewards/rejected": -1.2683132886886597,
"step": 800
},
{
"epoch": 0.8528784648187633,
"eval_logits/chosen": -1.526044487953186,
"eval_logits/rejected": -1.4904903173446655,
"eval_logps/chosen": -256.8968811035156,
"eval_logps/rejected": -280.7786560058594,
"eval_loss": 0.6389869451522827,
"eval_rewards/accuracies": 0.6056910753250122,
"eval_rewards/chosen": -1.0531564950942993,
"eval_rewards/margins": 0.25055956840515137,
"eval_rewards/rejected": -1.3037161827087402,
"eval_runtime": 165.6574,
"eval_samples_per_second": 11.844,
"eval_steps_per_second": 1.485,
"step": 800
},
{
"epoch": 0.8582089552238806,
"grad_norm": 13.707316978636483,
"learning_rate": 3.001511487827582e-08,
"logits/chosen": -1.117619514465332,
"logits/rejected": -1.1415525674819946,
"logps/chosen": -267.25225830078125,
"logps/rejected": -302.49298095703125,
"loss": 0.5991,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.0340323448181152,
"rewards/margins": 0.3930579125881195,
"rewards/rejected": -1.4270904064178467,
"step": 805
},
{
"epoch": 0.8635394456289979,
"grad_norm": 16.362755169612413,
"learning_rate": 2.7842846545123505e-08,
"logits/chosen": -1.0309226512908936,
"logits/rejected": -1.1349594593048096,
"logps/chosen": -256.92706298828125,
"logps/rejected": -267.79986572265625,
"loss": 0.6232,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9786995649337769,
"rewards/margins": 0.1820269525051117,
"rewards/rejected": -1.160726547241211,
"step": 810
},
{
"epoch": 0.8688699360341151,
"grad_norm": 13.309838462940968,
"learning_rate": 2.5747527149355018e-08,
"logits/chosen": -1.1667518615722656,
"logits/rejected": -1.2266581058502197,
"logps/chosen": -267.74517822265625,
"logps/rejected": -306.21563720703125,
"loss": 0.5618,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.044034719467163,
"rewards/margins": 0.49896711111068726,
"rewards/rejected": -1.5430018901824951,
"step": 815
},
{
"epoch": 0.8742004264392325,
"grad_norm": 12.846376086365586,
"learning_rate": 2.372988245018401e-08,
"logits/chosen": -1.05556321144104,
"logits/rejected": -1.1795189380645752,
"logps/chosen": -259.63458251953125,
"logps/rejected": -298.86761474609375,
"loss": 0.5695,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0314178466796875,
"rewards/margins": 0.522697925567627,
"rewards/rejected": -1.5541157722473145,
"step": 820
},
{
"epoch": 0.8795309168443497,
"grad_norm": 13.824707878509736,
"learning_rate": 2.1790611302512114e-08,
"logits/chosen": -1.1069999933242798,
"logits/rejected": -1.1457974910736084,
"logps/chosen": -283.31170654296875,
"logps/rejected": -302.56658935546875,
"loss": 0.5923,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.16603684425354,
"rewards/margins": 0.2864134609699249,
"rewards/rejected": -1.4524505138397217,
"step": 825
},
{
"epoch": 0.8848614072494669,
"grad_norm": 15.28607581049521,
"learning_rate": 1.9930385414865386e-08,
"logits/chosen": -1.0714858770370483,
"logits/rejected": -1.1165021657943726,
"logps/chosen": -269.0600280761719,
"logps/rejected": -298.10711669921875,
"loss": 0.5989,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1144940853118896,
"rewards/margins": 0.3277047276496887,
"rewards/rejected": -1.4421989917755127,
"step": 830
},
{
"epoch": 0.8901918976545842,
"grad_norm": 16.521551762750804,
"learning_rate": 1.8149849116733672e-08,
"logits/chosen": -1.0863420963287354,
"logits/rejected": -1.1994072198867798,
"logps/chosen": -260.0115661621094,
"logps/rejected": -284.62774658203125,
"loss": 0.597,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9503141641616821,
"rewards/margins": 0.32754647731781006,
"rewards/rejected": -1.2778605222702026,
"step": 835
},
{
"epoch": 0.8955223880597015,
"grad_norm": 18.673626539883045,
"learning_rate": 1.6449619135393084e-08,
"logits/chosen": -1.0423157215118408,
"logits/rejected": -1.1916964054107666,
"logps/chosen": -263.9278564453125,
"logps/rejected": -286.19940185546875,
"loss": 0.5925,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0104882717132568,
"rewards/margins": 0.30736953020095825,
"rewards/rejected": -1.3178579807281494,
"step": 840
},
{
"epoch": 0.9008528784648188,
"grad_norm": 11.823082792807151,
"learning_rate": 1.4830284382289144e-08,
"logits/chosen": -1.114751935005188,
"logits/rejected": -1.1412584781646729,
"logps/chosen": -269.94757080078125,
"logps/rejected": -289.11083984375,
"loss": 0.5786,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1332142353057861,
"rewards/margins": 0.24711325764656067,
"rewards/rejected": -1.3803274631500244,
"step": 845
},
{
"epoch": 0.906183368869936,
"grad_norm": 14.344497159976747,
"learning_rate": 1.329240574905452e-08,
"logits/chosen": -1.1762893199920654,
"logits/rejected": -1.265937328338623,
"logps/chosen": -289.95892333984375,
"logps/rejected": -309.7876281738281,
"loss": 0.6024,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0822083950042725,
"rewards/margins": 0.27944907546043396,
"rewards/rejected": -1.3616573810577393,
"step": 850
},
{
"epoch": 0.9115138592750534,
"grad_norm": 18.008488344780044,
"learning_rate": 1.1836515913232175e-08,
"logits/chosen": -1.1288697719573975,
"logits/rejected": -1.3069543838500977,
"logps/chosen": -264.71087646484375,
"logps/rejected": -277.7538757324219,
"loss": 0.5839,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0512837171554565,
"rewards/margins": 0.2792138457298279,
"rewards/rejected": -1.3304975032806396,
"step": 855
},
{
"epoch": 0.9168443496801706,
"grad_norm": 14.381654064223152,
"learning_rate": 1.0463119153770989e-08,
"logits/chosen": -1.143795132637024,
"logits/rejected": -1.2443573474884033,
"logps/chosen": -259.2528076171875,
"logps/rejected": -280.7781066894531,
"loss": 0.6172,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.0562851428985596,
"rewards/margins": 0.3439714312553406,
"rewards/rejected": -1.4002567529678345,
"step": 860
},
{
"epoch": 0.9221748400852878,
"grad_norm": 18.214838134208975,
"learning_rate": 9.172691176357633e-09,
"logits/chosen": -1.1904518604278564,
"logits/rejected": -1.3172063827514648,
"logps/chosen": -243.25830078125,
"logps/rejected": -253.4103240966797,
"loss": 0.5922,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.9157142639160156,
"rewards/margins": 0.22436395287513733,
"rewards/rejected": -1.1400783061981201,
"step": 865
},
{
"epoch": 0.9275053304904051,
"grad_norm": 13.634453891392663,
"learning_rate": 7.965678948645832e-09,
"logits/chosen": -1.1409590244293213,
"logits/rejected": -1.193704605102539,
"logps/chosen": -290.54986572265625,
"logps/rejected": -311.5206604003906,
"loss": 0.5871,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1318142414093018,
"rewards/margins": 0.27807727456092834,
"rewards/rejected": -1.4098914861679077,
"step": 870
},
{
"epoch": 0.9328358208955224,
"grad_norm": 16.785891539902277,
"learning_rate": 6.842500545439278e-09,
"logits/chosen": -1.189774751663208,
"logits/rejected": -1.1961729526519775,
"logps/chosen": -282.62677001953125,
"logps/rejected": -313.8055419921875,
"loss": 0.5976,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.145236611366272,
"rewards/margins": 0.3282146751880646,
"rewards/rejected": -1.4734513759613037,
"step": 875
},
{
"epoch": 0.9381663113006397,
"grad_norm": 12.489851575915692,
"learning_rate": 5.803545003882554e-09,
"logits/chosen": -1.0940172672271729,
"logits/rejected": -1.2261667251586914,
"logps/chosen": -262.1616516113281,
"logps/rejected": -284.6984558105469,
"loss": 0.5974,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0160671472549438,
"rewards/margins": 0.3600946366786957,
"rewards/rejected": -1.376161813735962,
"step": 880
},
{
"epoch": 0.9434968017057569,
"grad_norm": 13.817135678609631,
"learning_rate": 4.849172188709588e-09,
"logits/chosen": -1.124348521232605,
"logits/rejected": -1.223716139793396,
"logps/chosen": -275.583251953125,
"logps/rejected": -284.2818298339844,
"loss": 0.5971,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.0598669052124023,
"rewards/margins": 0.21211442351341248,
"rewards/rejected": -1.2719814777374268,
"step": 885
},
{
"epoch": 0.9488272921108742,
"grad_norm": 17.35628545613914,
"learning_rate": 3.979712667596669e-09,
"logits/chosen": -1.0675632953643799,
"logits/rejected": -1.1845059394836426,
"logps/chosen": -253.7633819580078,
"logps/rejected": -275.1048278808594,
"loss": 0.5955,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.99676513671875,
"rewards/margins": 0.29226452112197876,
"rewards/rejected": -1.2890297174453735,
"step": 890
},
{
"epoch": 0.9541577825159915,
"grad_norm": 16.142121068942174,
"learning_rate": 3.195467596663254e-09,
"logits/chosen": -1.131365180015564,
"logits/rejected": -1.242356538772583,
"logps/chosen": -240.4732208251953,
"logps/rejected": -275.12884521484375,
"loss": 0.5831,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9128969311714172,
"rewards/margins": 0.4684675335884094,
"rewards/rejected": -1.3813644647598267,
"step": 895
},
{
"epoch": 0.9594882729211087,
"grad_norm": 15.867103975451991,
"learning_rate": 2.4967086161600814e-09,
"logits/chosen": -1.082676649093628,
"logits/rejected": -1.2301782369613647,
"logps/chosen": -251.3439483642578,
"logps/rejected": -259.95037841796875,
"loss": 0.6056,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.056687355041504,
"rewards/margins": 0.2243305891752243,
"rewards/rejected": -1.2810180187225342,
"step": 900
},
{
"epoch": 0.964818763326226,
"grad_norm": 14.494532565146372,
"learning_rate": 1.8836777563805416e-09,
"logits/chosen": -1.1500489711761475,
"logits/rejected": -1.267773151397705,
"logps/chosen": -262.1219177246094,
"logps/rejected": -284.3946228027344,
"loss": 0.584,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9880873560905457,
"rewards/margins": 0.3360677659511566,
"rewards/rejected": -1.3241552114486694,
"step": 905
},
{
"epoch": 0.9701492537313433,
"grad_norm": 13.7532663379059,
"learning_rate": 1.3565873538283757e-09,
"logits/chosen": -1.0888932943344116,
"logits/rejected": -1.2862221002578735,
"logps/chosen": -281.6429443359375,
"logps/rejected": -287.7558898925781,
"loss": 0.5837,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1525847911834717,
"rewards/margins": 0.23554334044456482,
"rewards/rejected": -1.3881282806396484,
"step": 910
},
{
"epoch": 0.9754797441364605,
"grad_norm": 18.11310668134442,
"learning_rate": 9.156199776702567e-10,
"logits/chosen": -1.2114653587341309,
"logits/rejected": -1.2886050939559937,
"logps/chosen": -277.15283203125,
"logps/rejected": -297.81988525390625,
"loss": 0.6029,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1020017862319946,
"rewards/margins": 0.3052862286567688,
"rewards/rejected": -1.4072880744934082,
"step": 915
},
{
"epoch": 0.9808102345415778,
"grad_norm": 16.38268266725161,
"learning_rate": 5.609283664990693e-10,
"logits/chosen": -1.1473147869110107,
"logits/rejected": -1.2061867713928223,
"logps/chosen": -269.15692138671875,
"logps/rejected": -297.3847351074219,
"loss": 0.6185,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0159175395965576,
"rewards/margins": 0.310077965259552,
"rewards/rejected": -1.3259953260421753,
"step": 920
},
{
"epoch": 0.9861407249466951,
"grad_norm": 15.464029236617364,
"learning_rate": 2.926353754295896e-10,
"logits/chosen": -1.2007001638412476,
"logits/rejected": -1.333519458770752,
"logps/chosen": -266.4494934082031,
"logps/rejected": -296.6923522949219,
"loss": 0.5848,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0838963985443115,
"rewards/margins": 0.3357781171798706,
"rewards/rejected": -1.4196745157241821,
"step": 925
},
{
"epoch": 0.9914712153518124,
"grad_norm": 11.785459063580232,
"learning_rate": 1.1083393354488491e-10,
"logits/chosen": -1.0910792350769043,
"logits/rejected": -1.14475417137146,
"logps/chosen": -275.3228454589844,
"logps/rejected": -302.8422546386719,
"loss": 0.5652,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1003937721252441,
"rewards/margins": 0.42316898703575134,
"rewards/rejected": -1.5235626697540283,
"step": 930
},
{
"epoch": 0.9968017057569296,
"grad_norm": 12.18026896660238,
"learning_rate": 1.5587011708340092e-11,
"logits/chosen": -1.0970802307128906,
"logits/rejected": -1.167004108428955,
"logps/chosen": -295.5314636230469,
"logps/rejected": -330.5072937011719,
"loss": 0.5541,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1293576955795288,
"rewards/margins": 0.5504059791564941,
"rewards/rejected": -1.6797635555267334,
"step": 935
},
{
"epoch": 1.0,
"step": 938,
"total_flos": 0.0,
"train_loss": 0.6256769998495513,
"train_runtime": 22377.6313,
"train_samples_per_second": 2.683,
"train_steps_per_second": 0.042
}
],
"logging_steps": 5,
"max_steps": 938,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}