llama-3-8b-instruct-agg-judge / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999333733093477,
"eval_steps": 400,
"global_step": 469,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021320541008728097,
"grad_norm": 4.17070478980581,
"learning_rate": 1.0638297872340425e-08,
"logits/chosen": -0.4388880133628845,
"logits/rejected": -0.6813962459564209,
"logps/chosen": -137.1171112060547,
"logps/rejected": -114.13969421386719,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010660270504364048,
"grad_norm": 3.7299717491618436,
"learning_rate": 5.3191489361702123e-08,
"logits/chosen": -0.4889238774776459,
"logits/rejected": -0.6665000319480896,
"logps/chosen": -169.8695068359375,
"logps/rejected": -153.95947265625,
"loss": 0.6932,
"rewards/accuracies": 0.3671875,
"rewards/chosen": 0.00029664667090401053,
"rewards/margins": -0.00023018479987513274,
"rewards/rejected": 0.0005268314271233976,
"step": 5
},
{
"epoch": 0.021320541008728097,
"grad_norm": 3.95978205732512,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -0.46806925535202026,
"logits/rejected": -0.6404483318328857,
"logps/chosen": -160.8107147216797,
"logps/rejected": -149.25921630859375,
"loss": 0.6928,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.0006372839561663568,
"rewards/margins": 0.0015358469681814313,
"rewards/rejected": -0.0008985629538074136,
"step": 10
},
{
"epoch": 0.03198081151309214,
"grad_norm": 4.070738919050114,
"learning_rate": 1.5957446808510638e-07,
"logits/chosen": -0.5198644399642944,
"logits/rejected": -0.7026724219322205,
"logps/chosen": -148.3934783935547,
"logps/rejected": -137.8568878173828,
"loss": 0.6932,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.00037692085606977344,
"rewards/margins": 9.87994353636168e-05,
"rewards/rejected": 0.00027812132611870766,
"step": 15
},
{
"epoch": 0.04264108201745619,
"grad_norm": 4.076698141198564,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.5080031156539917,
"logits/rejected": -0.6844709515571594,
"logps/chosen": -163.26565551757812,
"logps/rejected": -144.93130493164062,
"loss": 0.6929,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.0008511164924129844,
"rewards/margins": 0.0010705896420404315,
"rewards/rejected": -0.00021947314962744713,
"step": 20
},
{
"epoch": 0.05330135252182024,
"grad_norm": 4.091883356232605,
"learning_rate": 2.659574468085106e-07,
"logits/chosen": -0.45363473892211914,
"logits/rejected": -0.6415150761604309,
"logps/chosen": -160.65203857421875,
"logps/rejected": -139.57582092285156,
"loss": 0.6925,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0009880407014861703,
"rewards/margins": 0.0012083369074389338,
"rewards/rejected": -0.00022029613319318742,
"step": 25
},
{
"epoch": 0.06396162302618429,
"grad_norm": 4.4267622202574675,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.5177901983261108,
"logits/rejected": -0.6321993470191956,
"logps/chosen": -165.01699829101562,
"logps/rejected": -151.71261596679688,
"loss": 0.6921,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0023814309388399124,
"rewards/margins": 0.002116392133757472,
"rewards/rejected": 0.0002650389797054231,
"step": 30
},
{
"epoch": 0.07462189353054834,
"grad_norm": 4.269424985466007,
"learning_rate": 3.7234042553191484e-07,
"logits/chosen": -0.4782675802707672,
"logits/rejected": -0.7104529738426208,
"logps/chosen": -163.6421356201172,
"logps/rejected": -143.2295379638672,
"loss": 0.6913,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.004739758092910051,
"rewards/margins": 0.0038230004720389843,
"rewards/rejected": 0.000916757620871067,
"step": 35
},
{
"epoch": 0.08528216403491239,
"grad_norm": 4.2880363073067365,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.5303796529769897,
"logits/rejected": -0.7106837630271912,
"logps/chosen": -174.71463012695312,
"logps/rejected": -153.29507446289062,
"loss": 0.6903,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.008925501257181168,
"rewards/margins": 0.006593695841729641,
"rewards/rejected": 0.0023318054154515266,
"step": 40
},
{
"epoch": 0.09594243453927644,
"grad_norm": 4.016438849908063,
"learning_rate": 4.787234042553192e-07,
"logits/chosen": -0.522494375705719,
"logits/rejected": -0.7226734757423401,
"logps/chosen": -165.866455078125,
"logps/rejected": -144.34194946289062,
"loss": 0.6886,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.010274471715092659,
"rewards/margins": 0.011223495937883854,
"rewards/rejected": -0.0009490237571299076,
"step": 45
},
{
"epoch": 0.10660270504364049,
"grad_norm": 4.3216596095930235,
"learning_rate": 4.999376538968061e-07,
"logits/chosen": -0.5761003494262695,
"logits/rejected": -0.7390087842941284,
"logps/chosen": -161.60655212402344,
"logps/rejected": -144.6966552734375,
"loss": 0.6868,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.009824760258197784,
"rewards/margins": 0.014007952995598316,
"rewards/rejected": -0.004183194134384394,
"step": 50
},
{
"epoch": 0.11726297554800454,
"grad_norm": 4.305829979355763,
"learning_rate": 4.99556762539107e-07,
"logits/chosen": -0.5275800824165344,
"logits/rejected": -0.7155976891517639,
"logps/chosen": -172.5618133544922,
"logps/rejected": -159.7906494140625,
"loss": 0.6842,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.007245404180139303,
"rewards/margins": 0.016996894031763077,
"rewards/rejected": -0.009751489385962486,
"step": 55
},
{
"epoch": 0.12792324605236857,
"grad_norm": 3.919812332975093,
"learning_rate": 4.988301435819852e-07,
"logits/chosen": -0.528161883354187,
"logits/rejected": -0.7242938280105591,
"logps/chosen": -163.2517547607422,
"logps/rejected": -152.65904235839844,
"loss": 0.6833,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -4.5745400711894035e-05,
"rewards/margins": 0.017660435289144516,
"rewards/rejected": -0.01770617999136448,
"step": 60
},
{
"epoch": 0.13858351655673262,
"grad_norm": 4.26787115297138,
"learning_rate": 4.977588036590624e-07,
"logits/chosen": -0.6125078797340393,
"logits/rejected": -0.7909122109413147,
"logps/chosen": -157.07858276367188,
"logps/rejected": -142.1239776611328,
"loss": 0.6787,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.011157763190567493,
"rewards/margins": 0.029583096504211426,
"rewards/rejected": -0.04074086248874664,
"step": 65
},
{
"epoch": 0.14924378706109667,
"grad_norm": 4.32141025222622,
"learning_rate": 4.96344226968867e-07,
"logits/chosen": -0.6417307257652283,
"logits/rejected": -0.8415061235427856,
"logps/chosen": -177.39974975585938,
"logps/rejected": -156.98171997070312,
"loss": 0.6761,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.018069323152303696,
"rewards/margins": 0.04366481304168701,
"rewards/rejected": -0.061734139919281006,
"step": 70
},
{
"epoch": 0.15990405756546072,
"grad_norm": 4.745633736375277,
"learning_rate": 4.945883732186751e-07,
"logits/chosen": -0.6420779824256897,
"logits/rejected": -0.8456922769546509,
"logps/chosen": -175.96359252929688,
"logps/rejected": -160.39553833007812,
"loss": 0.6753,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.049303699284791946,
"rewards/margins": 0.04190283641219139,
"rewards/rejected": -0.09120653569698334,
"step": 75
},
{
"epoch": 0.17056432806982477,
"grad_norm": 4.4046157142215705,
"learning_rate": 4.924936749095969e-07,
"logits/chosen": -0.6506496071815491,
"logits/rejected": -0.8331305384635925,
"logps/chosen": -170.9277801513672,
"logps/rejected": -157.8987579345703,
"loss": 0.6764,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.07082077115774155,
"rewards/margins": 0.044193871319293976,
"rewards/rejected": -0.11501463502645493,
"step": 80
},
{
"epoch": 0.18122459857418882,
"grad_norm": 5.024858873122934,
"learning_rate": 4.900630339666717e-07,
"logits/chosen": -0.6046501994132996,
"logits/rejected": -0.879498302936554,
"logps/chosen": -172.4420928955078,
"logps/rejected": -155.1177215576172,
"loss": 0.6708,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.08710388094186783,
"rewards/margins": 0.05091012641787529,
"rewards/rejected": -0.13801398873329163,
"step": 85
},
{
"epoch": 0.19188486907855287,
"grad_norm": 4.906760943250142,
"learning_rate": 4.872998177186375e-07,
"logits/chosen": -0.6804112195968628,
"logits/rejected": -0.9185736775398254,
"logps/chosen": -173.2130126953125,
"logps/rejected": -157.01849365234375,
"loss": 0.6656,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.09927495568990707,
"rewards/margins": 0.056527040898799896,
"rewards/rejected": -0.15580201148986816,
"step": 90
},
{
"epoch": 0.20254513958291692,
"grad_norm": 4.854322224106784,
"learning_rate": 4.842078542329463e-07,
"logits/chosen": -0.6420129537582397,
"logits/rejected": -0.8440741300582886,
"logps/chosen": -172.54263305664062,
"logps/rejected": -160.012939453125,
"loss": 0.6636,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11956344544887543,
"rewards/margins": 0.0651877298951149,
"rewards/rejected": -0.18475116789340973,
"step": 95
},
{
"epoch": 0.21320541008728097,
"grad_norm": 5.020847639274401,
"learning_rate": 4.807914270124876e-07,
"logits/chosen": -0.6584053635597229,
"logits/rejected": -0.8369486927986145,
"logps/chosen": -158.8271484375,
"logps/rejected": -151.04791259765625,
"loss": 0.6622,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.13495273888111115,
"rewards/margins": 0.06916390359401703,
"rewards/rejected": -0.20411665737628937,
"step": 100
},
{
"epoch": 0.22386568059164502,
"grad_norm": 5.1518931973507875,
"learning_rate": 4.770552690613665e-07,
"logits/chosen": -0.7008846998214722,
"logits/rejected": -0.9158443212509155,
"logps/chosen": -181.6995391845703,
"logps/rejected": -168.43638610839844,
"loss": 0.6531,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.14559721946716309,
"rewards/margins": 0.08520212024450302,
"rewards/rejected": -0.2307993471622467,
"step": 105
},
{
"epoch": 0.23452595109600907,
"grad_norm": 4.93222468686984,
"learning_rate": 4.730045563279577e-07,
"logits/chosen": -0.7327751517295837,
"logits/rejected": -0.9426084756851196,
"logps/chosen": -184.8527069091797,
"logps/rejected": -169.2633056640625,
"loss": 0.6536,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.18423308432102203,
"rewards/margins": 0.08043086528778076,
"rewards/rejected": -0.2646639347076416,
"step": 110
},
{
"epoch": 0.24518622160037312,
"grad_norm": 5.321285521863998,
"learning_rate": 4.6864490053432e-07,
"logits/chosen": -0.7645201683044434,
"logits/rejected": -0.9136350750923157,
"logps/chosen": -184.50399780273438,
"logps/rejected": -182.33792114257812,
"loss": 0.6467,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1797805279493332,
"rewards/margins": 0.10915856063365936,
"rewards/rejected": -0.28893908858299255,
"step": 115
},
{
"epoch": 0.25584649210473714,
"grad_norm": 5.62424898876036,
"learning_rate": 4.6398234140190413e-07,
"logits/chosen": -0.7312062978744507,
"logits/rejected": -0.9342387318611145,
"logps/chosen": -189.24227905273438,
"logps/rejected": -181.2150115966797,
"loss": 0.6404,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.22928175330162048,
"rewards/margins": 0.1005432978272438,
"rewards/rejected": -0.3298250436782837,
"step": 120
},
{
"epoch": 0.2665067626091012,
"grad_norm": 5.848008736661893,
"learning_rate": 4.5902333828432416e-07,
"logits/chosen": -0.7402585744857788,
"logits/rejected": -0.9469724893569946,
"logps/chosen": -188.2518768310547,
"logps/rejected": -183.68360900878906,
"loss": 0.6314,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.2475469410419464,
"rewards/margins": 0.15488557517528534,
"rewards/rejected": -0.40243250131607056,
"step": 125
},
{
"epoch": 0.27716703311346524,
"grad_norm": 5.62435510068984,
"learning_rate": 4.537747612187848e-07,
"logits/chosen": -0.6827915906906128,
"logits/rejected": -0.9053131341934204,
"logps/chosen": -176.27835083007812,
"logps/rejected": -177.09768676757812,
"loss": 0.6331,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2656404376029968,
"rewards/margins": 0.14400802552700043,
"rewards/rejected": -0.40964850783348083,
"step": 130
},
{
"epoch": 0.2878273036178293,
"grad_norm": 5.883733263408107,
"learning_rate": 4.4824388140856194e-07,
"logits/chosen": -0.813726544380188,
"logits/rejected": -0.9863494634628296,
"logps/chosen": -193.75765991210938,
"logps/rejected": -192.6829833984375,
"loss": 0.6258,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.32872524857521057,
"rewards/margins": 0.16848836839199066,
"rewards/rejected": -0.49721360206604004,
"step": 135
},
{
"epoch": 0.29848757412219334,
"grad_norm": 6.222829798884928,
"learning_rate": 4.4243836114972003e-07,
"logits/chosen": -0.7957421541213989,
"logits/rejected": -0.9675641059875488,
"logps/chosen": -185.958251953125,
"logps/rejected": -190.2810516357422,
"loss": 0.6259,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.36352983117103577,
"rewards/margins": 0.1679573506116867,
"rewards/rejected": -0.5314871072769165,
"step": 140
},
{
"epoch": 0.3091478446265574,
"grad_norm": 6.026406045285321,
"learning_rate": 4.3636624321602354e-07,
"logits/chosen": -0.7669280171394348,
"logits/rejected": -1.0013420581817627,
"logps/chosen": -199.62496948242188,
"logps/rejected": -198.5312957763672,
"loss": 0.6139,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.41982731223106384,
"rewards/margins": 0.1919022500514984,
"rewards/rejected": -0.611729621887207,
"step": 145
},
{
"epoch": 0.31980811513092144,
"grad_norm": 6.938366915650047,
"learning_rate": 4.300359397167469e-07,
"logits/chosen": -0.78579181432724,
"logits/rejected": -1.0266155004501343,
"logps/chosen": -190.5222625732422,
"logps/rejected": -191.94302368164062,
"loss": 0.6191,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4288663864135742,
"rewards/margins": 0.1750030219554901,
"rewards/rejected": -0.6038694381713867,
"step": 150
},
{
"epoch": 0.3304683856352855,
"grad_norm": 6.503433628260907,
"learning_rate": 4.2345622044281914e-07,
"logits/chosen": -0.7738896608352661,
"logits/rejected": -0.9923878908157349,
"logps/chosen": -201.4437255859375,
"logps/rejected": -201.36099243164062,
"loss": 0.6073,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.46533137559890747,
"rewards/margins": 0.18831129372119904,
"rewards/rejected": -0.6536425948143005,
"step": 155
},
{
"epoch": 0.34112865613964954,
"grad_norm": 6.951278659773283,
"learning_rate": 4.1663620071744896e-07,
"logits/chosen": -0.8082219958305359,
"logits/rejected": -1.0701286792755127,
"logps/chosen": -221.80789184570312,
"logps/rejected": -220.5237274169922,
"loss": 0.6108,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.5697073340415955,
"rewards/margins": 0.196958988904953,
"rewards/rejected": -0.7666663527488708,
"step": 160
},
{
"epoch": 0.35178892664401357,
"grad_norm": 7.107245594085975,
"learning_rate": 4.0958532876806036e-07,
"logits/chosen": -0.9068414568901062,
"logits/rejected": -1.0665959119796753,
"logps/chosen": -223.1608428955078,
"logps/rejected": -228.6382598876953,
"loss": 0.6007,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.6051439046859741,
"rewards/margins": 0.22736486792564392,
"rewards/rejected": -0.8325088620185852,
"step": 165
},
{
"epoch": 0.36244919714837764,
"grad_norm": 7.5558158008023355,
"learning_rate": 4.023133726370341e-07,
"logits/chosen": -0.7768110036849976,
"logits/rejected": -1.023694634437561,
"logps/chosen": -230.20028686523438,
"logps/rejected": -237.296630859375,
"loss": 0.6005,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6818786859512329,
"rewards/margins": 0.2647910714149475,
"rewards/rejected": -0.9466696977615356,
"step": 170
},
{
"epoch": 0.37310946765274167,
"grad_norm": 7.748401207711855,
"learning_rate": 3.9483040664938844e-07,
"logits/chosen": -0.8651229739189148,
"logits/rejected": -1.1080349683761597,
"logps/chosen": -239.4313201904297,
"logps/rejected": -245.35641479492188,
"loss": 0.5827,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7178173065185547,
"rewards/margins": 0.29743796586990356,
"rewards/rejected": -1.015255331993103,
"step": 175
},
{
"epoch": 0.38376973815710574,
"grad_norm": 7.833168702083219,
"learning_rate": 3.8714679745614556e-07,
"logits/chosen": -0.9112879633903503,
"logits/rejected": -1.1001932621002197,
"logps/chosen": -251.1482391357422,
"logps/rejected": -257.7167053222656,
"loss": 0.5869,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8083968162536621,
"rewards/margins": 0.26524096727371216,
"rewards/rejected": -1.073637843132019,
"step": 180
},
{
"epoch": 0.39443000866146977,
"grad_norm": 7.402036456357543,
"learning_rate": 3.792731896727196e-07,
"logits/chosen": -0.8897370100021362,
"logits/rejected": -1.091963768005371,
"logps/chosen": -246.6190948486328,
"logps/rejected": -268.6842041015625,
"loss": 0.5851,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8738805651664734,
"rewards/margins": 0.3643074929714203,
"rewards/rejected": -1.2381881475448608,
"step": 185
},
{
"epoch": 0.40509027916583384,
"grad_norm": 7.32634230041485,
"learning_rate": 3.712204911322228e-07,
"logits/chosen": -0.8557780981063843,
"logits/rejected": -1.057023286819458,
"logps/chosen": -217.1138916015625,
"logps/rejected": -232.2842254638672,
"loss": 0.5838,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7771707773208618,
"rewards/margins": 0.2797245681285858,
"rewards/rejected": -1.05689537525177,
"step": 190
},
{
"epoch": 0.41575054967019787,
"grad_norm": 9.45088347010784,
"learning_rate": 3.629998577741174e-07,
"logits/chosen": -0.8742257952690125,
"logits/rejected": -1.0490225553512573,
"logps/chosen": -240.11489868164062,
"logps/rejected": -265.6509094238281,
"loss": 0.5864,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8606696128845215,
"rewards/margins": 0.3593491315841675,
"rewards/rejected": -1.2200186252593994,
"step": 195
},
{
"epoch": 0.42641082017456194,
"grad_norm": 8.652861206718594,
"learning_rate": 3.546226781891501e-07,
"logits/chosen": -0.8858518600463867,
"logits/rejected": -1.0868691205978394,
"logps/chosen": -266.2615051269531,
"logps/rejected": -285.27703857421875,
"loss": 0.5821,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.975814938545227,
"rewards/margins": 0.4038930833339691,
"rewards/rejected": -1.3797080516815186,
"step": 200
},
{
"epoch": 0.43707109067892597,
"grad_norm": 9.648919264403354,
"learning_rate": 3.461005578419791e-07,
"logits/chosen": -0.8321302533149719,
"logits/rejected": -1.0552650690078735,
"logps/chosen": -253.7904815673828,
"logps/rejected": -272.8400573730469,
"loss": 0.588,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9785162210464478,
"rewards/margins": 0.3188565969467163,
"rewards/rejected": -1.297372817993164,
"step": 205
},
{
"epoch": 0.44773136118329004,
"grad_norm": 8.305774901520081,
"learning_rate": 3.374453029933509e-07,
"logits/chosen": -0.9058141708374023,
"logits/rejected": -1.0458682775497437,
"logps/chosen": -258.77069091796875,
"logps/rejected": -279.82977294921875,
"loss": 0.5823,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9745637774467468,
"rewards/margins": 0.3414529263973236,
"rewards/rejected": -1.3160169124603271,
"step": 210
},
{
"epoch": 0.45839163168765407,
"grad_norm": 8.730250055075079,
"learning_rate": 3.286689043441015e-07,
"logits/chosen": -0.8889232873916626,
"logits/rejected": -1.12659752368927,
"logps/chosen": -264.6424255371094,
"logps/rejected": -273.76092529296875,
"loss": 0.5905,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9881819486618042,
"rewards/margins": 0.31245288252830505,
"rewards/rejected": -1.3006350994110107,
"step": 215
},
{
"epoch": 0.46905190219201814,
"grad_norm": 9.464259902697126,
"learning_rate": 3.197835204236402e-07,
"logits/chosen": -0.9472643136978149,
"logits/rejected": -1.142138123512268,
"logps/chosen": -279.47662353515625,
"logps/rejected": -311.5118103027344,
"loss": 0.5629,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.133866548538208,
"rewards/margins": 0.4763459265232086,
"rewards/rejected": -1.6102125644683838,
"step": 220
},
{
"epoch": 0.47971217269638217,
"grad_norm": 9.53110205637003,
"learning_rate": 3.1080146074592877e-07,
"logits/chosen": -0.8609586954116821,
"logits/rejected": -1.1460800170898438,
"logps/chosen": -280.66595458984375,
"logps/rejected": -307.8553771972656,
"loss": 0.5514,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1233617067337036,
"rewards/margins": 0.49458152055740356,
"rewards/rejected": -1.6179431676864624,
"step": 225
},
{
"epoch": 0.49037244320074624,
"grad_norm": 10.766670968073823,
"learning_rate": 3.017351687562928e-07,
"logits/chosen": -0.869361400604248,
"logits/rejected": -1.071195125579834,
"logps/chosen": -287.5640869140625,
"logps/rejected": -315.25347900390625,
"loss": 0.5665,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.2507811784744263,
"rewards/margins": 0.4507381319999695,
"rewards/rejected": -1.7015190124511719,
"step": 230
},
{
"epoch": 0.5010327137051103,
"grad_norm": 8.57346401837084,
"learning_rate": 2.925972045926878e-07,
"logits/chosen": -0.9069381952285767,
"logits/rejected": -1.0885123014450073,
"logps/chosen": -276.06878662109375,
"logps/rejected": -302.81072998046875,
"loss": 0.5677,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1936795711517334,
"rewards/margins": 0.44402870535850525,
"rewards/rejected": -1.6377084255218506,
"step": 235
},
{
"epoch": 0.5116929842094743,
"grad_norm": 8.335769499664682,
"learning_rate": 2.83400227685304e-07,
"logits/chosen": -0.926740288734436,
"logits/rejected": -1.188207983970642,
"logps/chosen": -272.0440979003906,
"logps/rejected": -291.0050964355469,
"loss": 0.5609,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1271604299545288,
"rewards/margins": 0.37117230892181396,
"rewards/rejected": -1.4983327388763428,
"step": 240
},
{
"epoch": 0.5223532547138383,
"grad_norm": 8.95305553011223,
"learning_rate": 2.7415697921861525e-07,
"logits/chosen": -0.8435291051864624,
"logits/rejected": -1.072458028793335,
"logps/chosen": -263.8363952636719,
"logps/rejected": -289.58270263671875,
"loss": 0.552,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.0684736967086792,
"rewards/margins": 0.43612685799598694,
"rewards/rejected": -1.5046006441116333,
"step": 245
},
{
"epoch": 0.5330135252182024,
"grad_norm": 10.305199478555215,
"learning_rate": 2.6488026448016686e-07,
"logits/chosen": -0.9254539608955383,
"logits/rejected": -1.1660327911376953,
"logps/chosen": -287.7872009277344,
"logps/rejected": -306.3985290527344,
"loss": 0.5594,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1574687957763672,
"rewards/margins": 0.37755414843559265,
"rewards/rejected": -1.5350229740142822,
"step": 250
},
{
"epoch": 0.5436737957225665,
"grad_norm": 9.11035884736237,
"learning_rate": 2.5558293512055923e-07,
"logits/chosen": -0.8859409093856812,
"logits/rejected": -1.1229826211929321,
"logps/chosen": -278.84051513671875,
"logps/rejected": -311.79669189453125,
"loss": 0.5571,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2464487552642822,
"rewards/margins": 0.48425453901290894,
"rewards/rejected": -1.730703353881836,
"step": 255
},
{
"epoch": 0.5543340662269305,
"grad_norm": 9.443455019352353,
"learning_rate": 2.4627787134919946e-07,
"logits/chosen": -0.8607537150382996,
"logits/rejected": -1.067083716392517,
"logps/chosen": -306.5609130859375,
"logps/rejected": -340.9252014160156,
"loss": 0.559,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.4955613613128662,
"rewards/margins": 0.5148967504501343,
"rewards/rejected": -2.01045823097229,
"step": 260
},
{
"epoch": 0.5649943367312945,
"grad_norm": 10.020105882711649,
"learning_rate": 2.369779640904909e-07,
"logits/chosen": -0.9872435331344604,
"logits/rejected": -1.1790921688079834,
"logps/chosen": -301.1463928222656,
"logps/rejected": -326.53509521484375,
"loss": 0.5522,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.46715247631073,
"rewards/margins": 0.45322275161743164,
"rewards/rejected": -1.9203754663467407,
"step": 265
},
{
"epoch": 0.5756546072356586,
"grad_norm": 9.230369920285517,
"learning_rate": 2.2769609712517602e-07,
"logits/chosen": -0.9972273707389832,
"logits/rejected": -1.139904499053955,
"logps/chosen": -310.1788635253906,
"logps/rejected": -328.85455322265625,
"loss": 0.5693,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3879780769348145,
"rewards/margins": 0.4023415446281433,
"rewards/rejected": -1.7903196811676025,
"step": 270
},
{
"epoch": 0.5863148777400227,
"grad_norm": 9.773551123939216,
"learning_rate": 2.184451292415778e-07,
"logits/chosen": -0.9245126843452454,
"logits/rejected": -1.0917091369628906,
"logps/chosen": -265.5910949707031,
"logps/rejected": -292.25726318359375,
"loss": 0.5625,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.123450517654419,
"rewards/margins": 0.4249204099178314,
"rewards/rejected": -1.5483709573745728,
"step": 275
},
{
"epoch": 0.5969751482443867,
"grad_norm": 9.944866138311095,
"learning_rate": 2.0923787642146434e-07,
"logits/chosen": -0.8810575604438782,
"logits/rejected": -1.0941672325134277,
"logps/chosen": -280.61279296875,
"logps/rejected": -312.9557800292969,
"loss": 0.552,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2670402526855469,
"rewards/margins": 0.519837498664856,
"rewards/rejected": -1.7868778705596924,
"step": 280
},
{
"epoch": 0.6076354187487507,
"grad_norm": 9.880910925618455,
"learning_rate": 2.0008709408521507e-07,
"logits/chosen": -0.9383381009101868,
"logits/rejected": -1.1827994585037231,
"logps/chosen": -295.6000671386719,
"logps/rejected": -324.3331604003906,
"loss": 0.5407,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2447686195373535,
"rewards/margins": 0.5489395260810852,
"rewards/rejected": -1.793708086013794,
"step": 285
},
{
"epoch": 0.6182956892531148,
"grad_norm": 10.071491320024812,
"learning_rate": 1.9100545942088848e-07,
"logits/chosen": -0.9224274754524231,
"logits/rejected": -1.1538960933685303,
"logps/chosen": -289.017578125,
"logps/rejected": -325.94952392578125,
"loss": 0.5457,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.2537972927093506,
"rewards/margins": 0.5672923922538757,
"rewards/rejected": -1.821089744567871,
"step": 290
},
{
"epoch": 0.6289559597574789,
"grad_norm": 11.845857689113707,
"learning_rate": 1.8200555382166898e-07,
"logits/chosen": -0.9387105107307434,
"logits/rejected": -1.1250282526016235,
"logps/chosen": -318.4964294433594,
"logps/rejected": -338.69696044921875,
"loss": 0.5696,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.5140787363052368,
"rewards/margins": 0.4427851140499115,
"rewards/rejected": -1.9568637609481812,
"step": 295
},
{
"epoch": 0.6396162302618429,
"grad_norm": 10.971903527074975,
"learning_rate": 1.7309984545602528e-07,
"logits/chosen": -0.9286500215530396,
"logits/rejected": -1.1137937307357788,
"logps/chosen": -279.747802734375,
"logps/rejected": -307.8285217285156,
"loss": 0.5376,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.323687195777893,
"rewards/margins": 0.48056259751319885,
"rewards/rejected": -1.8042497634887695,
"step": 300
},
{
"epoch": 0.6502765007662069,
"grad_norm": 10.964118734413244,
"learning_rate": 1.6430067199472657e-07,
"logits/chosen": -0.9661188125610352,
"logits/rejected": -1.1719661951065063,
"logps/chosen": -294.7871398925781,
"logps/rejected": -329.8990783691406,
"loss": 0.5342,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3090574741363525,
"rewards/margins": 0.5292733907699585,
"rewards/rejected": -1.838330864906311,
"step": 305
},
{
"epoch": 0.660936771270571,
"grad_norm": 11.086382549521785,
"learning_rate": 1.5562022351864534e-07,
"logits/chosen": -0.9217275381088257,
"logits/rejected": -1.1163594722747803,
"logps/chosen": -266.56402587890625,
"logps/rejected": -306.4192810058594,
"loss": 0.5437,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1430429220199585,
"rewards/margins": 0.5940698981285095,
"rewards/rejected": -1.7371127605438232,
"step": 310
},
{
"epoch": 0.6715970417749351,
"grad_norm": 10.957109584007643,
"learning_rate": 1.4707052563102748e-07,
"logits/chosen": -0.8743804097175598,
"logits/rejected": -1.0983814001083374,
"logps/chosen": -285.22607421875,
"logps/rejected": -317.2628173828125,
"loss": 0.5298,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3059532642364502,
"rewards/margins": 0.5242554545402527,
"rewards/rejected": -1.8302087783813477,
"step": 315
},
{
"epoch": 0.6822573122792991,
"grad_norm": 10.507330109558843,
"learning_rate": 1.386634227976224e-07,
"logits/chosen": -0.9597967863082886,
"logits/rejected": -1.124963402748108,
"logps/chosen": -286.6432189941406,
"logps/rejected": -315.79937744140625,
"loss": 0.5378,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.3352241516113281,
"rewards/margins": 0.4382667541503906,
"rewards/rejected": -1.7734909057617188,
"step": 320
},
{
"epoch": 0.6929175827836631,
"grad_norm": 9.804790546339078,
"learning_rate": 1.3041056193775665e-07,
"logits/chosen": -0.888710618019104,
"logits/rejected": -1.0851693153381348,
"logps/chosen": -311.01544189453125,
"logps/rejected": -332.7283020019531,
"loss": 0.5475,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.5570933818817139,
"rewards/margins": 0.4053064286708832,
"rewards/rejected": -1.9623997211456299,
"step": 325
},
{
"epoch": 0.7035778532880271,
"grad_norm": 9.630550808372668,
"learning_rate": 1.2232337628908103e-07,
"logits/chosen": -0.9582077264785767,
"logits/rejected": -1.1537044048309326,
"logps/chosen": -326.71221923828125,
"logps/rejected": -377.6993713378906,
"loss": 0.5435,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4935967922210693,
"rewards/margins": 0.7231054902076721,
"rewards/rejected": -2.2167022228240967,
"step": 330
},
{
"epoch": 0.7142381237923913,
"grad_norm": 9.172032682717258,
"learning_rate": 1.1441306956834504e-07,
"logits/chosen": -0.9413734674453735,
"logits/rejected": -1.1069329977035522,
"logps/chosen": -306.80218505859375,
"logps/rejected": -357.0929870605469,
"loss": 0.5238,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4035927057266235,
"rewards/margins": 0.6626663208007812,
"rewards/rejected": -2.0662589073181152,
"step": 335
},
{
"epoch": 0.7248983942967553,
"grad_norm": 10.907598822157487,
"learning_rate": 1.0669060045014214e-07,
"logits/chosen": -1.0222991704940796,
"logits/rejected": -1.228389024734497,
"logps/chosen": -316.627197265625,
"logps/rejected": -357.66229248046875,
"loss": 0.5388,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.4493268728256226,
"rewards/margins": 0.5827343463897705,
"rewards/rejected": -2.0320611000061035,
"step": 340
},
{
"epoch": 0.7355586648011193,
"grad_norm": 10.97300975462713,
"learning_rate": 9.9166667385128e-08,
"logits/chosen": -0.963638186454773,
"logits/rejected": -1.1757190227508545,
"logps/chosen": -304.3102722167969,
"logps/rejected": -354.2998962402344,
"loss": 0.5432,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4618219137191772,
"rewards/margins": 0.7080960273742676,
"rewards/rejected": -2.1699178218841553,
"step": 345
},
{
"epoch": 0.7462189353054833,
"grad_norm": 9.89897013382996,
"learning_rate": 9.185169377874488e-08,
"logits/chosen": -0.9903243780136108,
"logits/rejected": -1.1469306945800781,
"logps/chosen": -312.1212158203125,
"logps/rejected": -346.9307861328125,
"loss": 0.5252,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5106861591339111,
"rewards/margins": 0.49892768263816833,
"rewards/rejected": -2.0096137523651123,
"step": 350
},
{
"epoch": 0.7568792058098475,
"grad_norm": 10.018680833325265,
"learning_rate": 8.475581355098379e-08,
"logits/chosen": -0.9698395729064941,
"logits/rejected": -1.1572554111480713,
"logps/chosen": -304.4853820800781,
"logps/rejected": -342.16827392578125,
"loss": 0.5462,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.4320096969604492,
"rewards/margins": 0.5366055965423584,
"rewards/rejected": -1.968615174293518,
"step": 355
},
{
"epoch": 0.7675394763142115,
"grad_norm": 11.03385142626086,
"learning_rate": 7.788885709719033e-08,
"logits/chosen": -0.9215399622917175,
"logits/rejected": -1.1144723892211914,
"logps/chosen": -316.9365234375,
"logps/rejected": -359.6341857910156,
"loss": 0.5392,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.580185890197754,
"rewards/margins": 0.564557671546936,
"rewards/rejected": -2.1447434425354004,
"step": 360
},
{
"epoch": 0.7781997468185755,
"grad_norm": 9.523737016870674,
"learning_rate": 7.126033766936365e-08,
"logits/chosen": -0.9409270286560059,
"logits/rejected": -1.124208688735962,
"logps/chosen": -311.7746276855469,
"logps/rejected": -355.46343994140625,
"loss": 0.536,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.5002214908599854,
"rewards/margins": 0.5499864816665649,
"rewards/rejected": -2.05020809173584,
"step": 365
},
{
"epoch": 0.7888600173229395,
"grad_norm": 11.210638577879926,
"learning_rate": 6.487943819681488e-08,
"logits/chosen": -0.9616110920906067,
"logits/rejected": -1.0974061489105225,
"logps/chosen": -315.260009765625,
"logps/rejected": -357.67059326171875,
"loss": 0.5533,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.509570837020874,
"rewards/margins": 0.537238597869873,
"rewards/rejected": -2.046809434890747,
"step": 370
},
{
"epoch": 0.7995202878273037,
"grad_norm": 9.781063018210089,
"learning_rate": 5.875499856444358e-08,
"logits/chosen": -0.9564340710639954,
"logits/rejected": -1.1133265495300293,
"logps/chosen": -314.17535400390625,
"logps/rejected": -351.45001220703125,
"loss": 0.5458,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.493622064590454,
"rewards/margins": 0.5427702069282532,
"rewards/rejected": -2.0363922119140625,
"step": 375
},
{
"epoch": 0.8101805583316677,
"grad_norm": 11.983119955061767,
"learning_rate": 5.289550336625731e-08,
"logits/chosen": -1.0206782817840576,
"logits/rejected": -1.2104320526123047,
"logps/chosen": -327.4963684082031,
"logps/rejected": -353.74603271484375,
"loss": 0.5474,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.506259560585022,
"rewards/margins": 0.49152374267578125,
"rewards/rejected": -1.9977830648422241,
"step": 380
},
{
"epoch": 0.8208408288360317,
"grad_norm": 10.83148544527409,
"learning_rate": 4.730907015109759e-08,
"logits/chosen": -0.9245961308479309,
"logits/rejected": -1.1795787811279297,
"logps/chosen": -309.1303405761719,
"logps/rejected": -346.46051025390625,
"loss": 0.5403,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.5297610759735107,
"rewards/margins": 0.5533354878425598,
"rewards/rejected": -2.083096742630005,
"step": 385
},
{
"epoch": 0.8315010993403957,
"grad_norm": 9.500539654945461,
"learning_rate": 4.200343817685981e-08,
"logits/chosen": -0.9566155672073364,
"logits/rejected": -1.0963544845581055,
"logps/chosen": -313.0601501464844,
"logps/rejected": -343.36773681640625,
"loss": 0.547,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.5300524234771729,
"rewards/margins": 0.4933779835700989,
"rewards/rejected": -2.023430347442627,
"step": 390
},
{
"epoch": 0.8421613698447599,
"grad_norm": 9.955855605589283,
"learning_rate": 3.698595768878363e-08,
"logits/chosen": -0.9913743734359741,
"logits/rejected": -1.180884599685669,
"logps/chosen": -311.83636474609375,
"logps/rejected": -356.932373046875,
"loss": 0.5178,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.429694414138794,
"rewards/margins": 0.6187530755996704,
"rewards/rejected": -2.048447370529175,
"step": 395
},
{
"epoch": 0.8528216403491239,
"grad_norm": 11.149747005186983,
"learning_rate": 3.226357973666888e-08,
"logits/chosen": -1.0238213539123535,
"logits/rejected": -1.1811949014663696,
"logps/chosen": -332.1514587402344,
"logps/rejected": -359.03167724609375,
"loss": 0.5505,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.6280012130737305,
"rewards/margins": 0.43937546014785767,
"rewards/rejected": -2.0673766136169434,
"step": 400
},
{
"epoch": 0.8528216403491239,
"eval_logits/chosen": -0.9705477356910706,
"eval_logits/rejected": -1.165926456451416,
"eval_logps/chosen": -307.21051025390625,
"eval_logps/rejected": -356.52508544921875,
"eval_loss": 0.5049245953559875,
"eval_rewards/accuracies": 0.7932573556900024,
"eval_rewards/chosen": -1.4455755949020386,
"eval_rewards/margins": 0.6763937473297119,
"eval_rewards/rejected": -2.12196946144104,
"eval_runtime": 11441.6179,
"eval_samples_per_second": 5.247,
"eval_steps_per_second": 1.312,
"step": 400
},
{
"epoch": 0.8634819108534879,
"grad_norm": 9.468787134199466,
"learning_rate": 2.7842846545123505e-08,
"logits/chosen": -0.9555789232254028,
"logits/rejected": -1.1705703735351562,
"logps/chosen": -289.6531677246094,
"logps/rejected": -345.7925720214844,
"loss": 0.5233,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.3922350406646729,
"rewards/margins": 0.6980171203613281,
"rewards/rejected": -2.090252161026001,
"step": 405
},
{
"epoch": 0.8741421813578519,
"grad_norm": 10.178761020491258,
"learning_rate": 2.372988245018401e-08,
"logits/chosen": -0.9851318597793579,
"logits/rejected": -1.1668522357940674,
"logps/chosen": -316.6786193847656,
"logps/rejected": -362.8905944824219,
"loss": 0.5423,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.598661184310913,
"rewards/margins": 0.608306884765625,
"rewards/rejected": -2.206967830657959,
"step": 410
},
{
"epoch": 0.884802451862216,
"grad_norm": 9.329485481095736,
"learning_rate": 1.9930385414865386e-08,
"logits/chosen": -1.0145405530929565,
"logits/rejected": -1.2289698123931885,
"logps/chosen": -336.15087890625,
"logps/rejected": -373.11309814453125,
"loss": 0.5293,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.558721899986267,
"rewards/margins": 0.6198412775993347,
"rewards/rejected": -2.178563356399536,
"step": 415
},
{
"epoch": 0.8954627223665801,
"grad_norm": 9.690686562397088,
"learning_rate": 1.6449619135393084e-08,
"logits/chosen": -0.9239746928215027,
"logits/rejected": -1.1881077289581299,
"logps/chosen": -296.87200927734375,
"logps/rejected": -329.9718017578125,
"loss": 0.5513,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.472847580909729,
"rewards/margins": 0.5113754868507385,
"rewards/rejected": -1.9842230081558228,
"step": 420
},
{
"epoch": 0.9061229928709441,
"grad_norm": 10.862769817255897,
"learning_rate": 1.329240574905452e-08,
"logits/chosen": -0.9023639559745789,
"logits/rejected": -1.0890004634857178,
"logps/chosen": -324.7179260253906,
"logps/rejected": -374.7180480957031,
"loss": 0.5149,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5423232316970825,
"rewards/margins": 0.6671528816223145,
"rewards/rejected": -2.2094759941101074,
"step": 425
},
{
"epoch": 0.9167832633753081,
"grad_norm": 11.35977235393007,
"learning_rate": 1.0463119153770989e-08,
"logits/chosen": -0.9444347620010376,
"logits/rejected": -1.1702197790145874,
"logps/chosen": -298.4215393066406,
"logps/rejected": -328.64215087890625,
"loss": 0.5404,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.4311974048614502,
"rewards/margins": 0.5026859045028687,
"rewards/rejected": -1.9338833093643188,
"step": 430
},
{
"epoch": 0.9274435338796722,
"grad_norm": 10.068213055827782,
"learning_rate": 7.965678948645832e-09,
"logits/chosen": -0.9912747144699097,
"logits/rejected": -1.2084077596664429,
"logps/chosen": -336.46929931640625,
"logps/rejected": -379.56640625,
"loss": 0.538,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.6182082891464233,
"rewards/margins": 0.6836891174316406,
"rewards/rejected": -2.3018975257873535,
"step": 435
},
{
"epoch": 0.9381038043840363,
"grad_norm": 12.790282190393167,
"learning_rate": 5.803545003882554e-09,
"logits/chosen": -0.9938758015632629,
"logits/rejected": -1.17817223072052,
"logps/chosen": -326.2915954589844,
"logps/rejected": -371.28631591796875,
"loss": 0.5377,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.5600776672363281,
"rewards/margins": 0.5917671918869019,
"rewards/rejected": -2.1518447399139404,
"step": 440
},
{
"epoch": 0.9487640748884003,
"grad_norm": 9.050016131957404,
"learning_rate": 3.979712667596669e-09,
"logits/chosen": -0.9720270037651062,
"logits/rejected": -1.1488044261932373,
"logps/chosen": -304.312255859375,
"logps/rejected": -351.5962219238281,
"loss": 0.5199,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.4655094146728516,
"rewards/margins": 0.6790416240692139,
"rewards/rejected": -2.1445512771606445,
"step": 445
},
{
"epoch": 0.9594243453927643,
"grad_norm": 13.159010993827899,
"learning_rate": 2.4967086161600814e-09,
"logits/chosen": -0.994873046875,
"logits/rejected": -1.1672512292861938,
"logps/chosen": -314.894287109375,
"logps/rejected": -354.23223876953125,
"loss": 0.5276,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5018284320831299,
"rewards/margins": 0.5567340850830078,
"rewards/rejected": -2.0585622787475586,
"step": 450
},
{
"epoch": 0.9700846158971284,
"grad_norm": 9.906738715572994,
"learning_rate": 1.3565873538283757e-09,
"logits/chosen": -0.9630732536315918,
"logits/rejected": -1.1276707649230957,
"logps/chosen": -306.04345703125,
"logps/rejected": -351.21099853515625,
"loss": 0.5208,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.395446538925171,
"rewards/margins": 0.6138492822647095,
"rewards/rejected": -2.009295701980591,
"step": 455
},
{
"epoch": 0.9807448864014925,
"grad_norm": 10.687835024200046,
"learning_rate": 5.609283664990693e-10,
"logits/chosen": -0.9506285786628723,
"logits/rejected": -1.20163094997406,
"logps/chosen": -323.80657958984375,
"logps/rejected": -370.2672424316406,
"loss": 0.5199,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.5296146869659424,
"rewards/margins": 0.6610507369041443,
"rewards/rejected": -2.1906654834747314,
"step": 460
},
{
"epoch": 0.9914051569058565,
"grad_norm": 11.797447945184583,
"learning_rate": 1.1083393354488491e-10,
"logits/chosen": -0.9356955289840698,
"logits/rejected": -1.1217402219772339,
"logps/chosen": -326.0872497558594,
"logps/rejected": -382.658203125,
"loss": 0.5263,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.588428020477295,
"rewards/margins": 0.7401828169822693,
"rewards/rejected": -2.328610897064209,
"step": 465
},
{
"epoch": 0.9999333733093477,
"step": 469,
"total_flos": 0.0,
"train_loss": 0.5891387982409138,
"train_runtime": 37343.5856,
"train_samples_per_second": 1.608,
"train_steps_per_second": 0.013
}
],
"logging_steps": 5,
"max_steps": 469,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
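
A minimal sketch (not part of the original file) of how one might read this `trainer_state.json` and pull the logged metrics out of `log_history` for inspection or plotting. The local path is an assumption; everything else uses only keys that appear above (`loss`, `rewards/margins`, `eval_loss`, `step`).

```python
import json

# Hypothetical local path to the file shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries are logged every 5 steps ("logging_steps": 5) and carry a
# "loss" key; the single eval entry at step 400 carries "eval_*" keys instead,
# and the final summary entry carries "train_loss" / runtime totals.
train_rows = [e for e in state["log_history"] if "loss" in e]
eval_rows = [e for e in state["log_history"] if "eval_loss" in e]

steps = [e["step"] for e in train_rows]
loss = [e["loss"] for e in train_rows]
margins = [e["rewards/margins"] for e in train_rows]

print(f"last logged step: {steps[-1]}, loss: {loss[-1]:.4f}, reward margin: {margins[-1]:.4f}")
if eval_rows:
    print(f"eval loss at step {eval_rows[0]['step']}: {eval_rows[0]['eval_loss']:.4f}")
```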