Meta-Llama-3-8B-Base-MI-2e-5 / trainer_state.json
tengxiao1
TX
b9462d9
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010468463752944255,
"grad_norm": 33.21111681571131,
"learning_rate": 2.0833333333333334e-06,
"logits/chosen": -0.4980102479457855,
"logits/rejected": -0.5135027170181274,
"logps/chosen": -1.1746745109558105,
"logps/rejected": -1.3606590032577515,
"loss": 2.1734,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1746745109558105,
"rewards/margins": 0.1859845519065857,
"rewards/rejected": -1.3606590032577515,
"step": 5
},
{
"epoch": 0.02093692750588851,
"grad_norm": 21.16742924169967,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": -0.5296765565872192,
"logits/rejected": -0.5027884244918823,
"logps/chosen": -1.1314122676849365,
"logps/rejected": -1.2633330821990967,
"loss": 2.1306,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -1.1314122676849365,
"rewards/margins": 0.13192060589790344,
"rewards/rejected": -1.2633330821990967,
"step": 10
},
{
"epoch": 0.031405391258832765,
"grad_norm": 18.622273155389507,
"learning_rate": 6.25e-06,
"logits/chosen": -0.45581430196762085,
"logits/rejected": -0.42932063341140747,
"logps/chosen": -1.1560032367706299,
"logps/rejected": -1.4923290014266968,
"loss": 2.0523,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1560032367706299,
"rewards/margins": 0.3363257944583893,
"rewards/rejected": -1.4923290014266968,
"step": 15
},
{
"epoch": 0.04187385501177702,
"grad_norm": 25.84825355543498,
"learning_rate": 8.333333333333334e-06,
"logits/chosen": -0.6032270789146423,
"logits/rejected": -0.5604568719863892,
"logps/chosen": -1.2145692110061646,
"logps/rejected": -1.5209157466888428,
"loss": 2.101,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.2145692110061646,
"rewards/margins": 0.30634641647338867,
"rewards/rejected": -1.5209157466888428,
"step": 20
},
{
"epoch": 0.05234231876472128,
"grad_norm": 10.051572353875851,
"learning_rate": 1.0416666666666668e-05,
"logits/chosen": -0.7330023646354675,
"logits/rejected": -0.6652411222457886,
"logps/chosen": -1.3188468217849731,
"logps/rejected": -1.643450140953064,
"loss": 2.0473,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.3188468217849731,
"rewards/margins": 0.32460346817970276,
"rewards/rejected": -1.643450140953064,
"step": 25
},
{
"epoch": 0.06281078251766553,
"grad_norm": 12.836119680084701,
"learning_rate": 1.25e-05,
"logits/chosen": -0.7389785051345825,
"logits/rejected": -0.7157658338546753,
"logps/chosen": -1.2610353231430054,
"logps/rejected": -1.5368638038635254,
"loss": 2.1476,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.2610353231430054,
"rewards/margins": 0.27582842111587524,
"rewards/rejected": -1.5368638038635254,
"step": 30
},
{
"epoch": 0.07327924627060979,
"grad_norm": 10.916144118237353,
"learning_rate": 1.4583333333333333e-05,
"logits/chosen": -0.6624680757522583,
"logits/rejected": -0.5841827392578125,
"logps/chosen": -1.3438886404037476,
"logps/rejected": -1.5585218667984009,
"loss": 2.1252,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.3438886404037476,
"rewards/margins": 0.2146332710981369,
"rewards/rejected": -1.5585218667984009,
"step": 35
},
{
"epoch": 0.08374771002355404,
"grad_norm": 7.904056592473059,
"learning_rate": 1.6666666666666667e-05,
"logits/chosen": -0.8896552920341492,
"logits/rejected": -0.7669180631637573,
"logps/chosen": -1.3083586692810059,
"logps/rejected": -1.7862266302108765,
"loss": 2.0664,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.3083586692810059,
"rewards/margins": 0.47786790132522583,
"rewards/rejected": -1.7862266302108765,
"step": 40
},
{
"epoch": 0.0942161737764983,
"grad_norm": 13.316362762434997,
"learning_rate": 1.8750000000000002e-05,
"logits/chosen": -0.7929601669311523,
"logits/rejected": -0.752467930316925,
"logps/chosen": -1.2723389863967896,
"logps/rejected": -1.6567331552505493,
"loss": 2.0997,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.2723389863967896,
"rewards/margins": 0.38439422845840454,
"rewards/rejected": -1.6567331552505493,
"step": 45
},
{
"epoch": 0.10468463752944256,
"grad_norm": 16.98219621825263,
"learning_rate": 1.9998927475076107e-05,
"logits/chosen": -0.3519185483455658,
"logits/rejected": -0.30840247869491577,
"logps/chosen": -1.275742769241333,
"logps/rejected": -1.7419742345809937,
"loss": 2.1089,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.275742769241333,
"rewards/margins": 0.4662315845489502,
"rewards/rejected": -1.7419742345809937,
"step": 50
},
{
"epoch": 0.11515310128238682,
"grad_norm": 7.79923910311647,
"learning_rate": 1.998686421164407e-05,
"logits/chosen": -0.13412383198738098,
"logits/rejected": -0.06430118530988693,
"logps/chosen": -1.3077303171157837,
"logps/rejected": -1.7474453449249268,
"loss": 2.0751,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.3077303171157837,
"rewards/margins": 0.43971508741378784,
"rewards/rejected": -1.7474453449249268,
"step": 55
},
{
"epoch": 0.12562156503533106,
"grad_norm": 7.005978339387122,
"learning_rate": 1.9961413253717214e-05,
"logits/chosen": -0.4779301583766937,
"logits/rejected": -0.4137405455112457,
"logps/chosen": -1.3986326456069946,
"logps/rejected": -1.6036043167114258,
"loss": 2.1009,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.3986326456069946,
"rewards/margins": 0.20497193932533264,
"rewards/rejected": -1.6036043167114258,
"step": 60
},
{
"epoch": 0.1360900287882753,
"grad_norm": 6.982350763810334,
"learning_rate": 1.9922608719076874e-05,
"logits/chosen": -0.267805278301239,
"logits/rejected": -0.1766107976436615,
"logps/chosen": -1.2244327068328857,
"logps/rejected": -2.0722804069519043,
"loss": 2.0512,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2244327068328857,
"rewards/margins": 0.8478477597236633,
"rewards/rejected": -2.0722804069519043,
"step": 65
},
{
"epoch": 0.14655849254121958,
"grad_norm": 10.132344121190984,
"learning_rate": 1.9870502626379127e-05,
"logits/chosen": -0.35906368494033813,
"logits/rejected": -0.33368802070617676,
"logps/chosen": -1.450307846069336,
"logps/rejected": -1.7906442880630493,
"loss": 2.1396,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.450307846069336,
"rewards/margins": 0.34033653140068054,
"rewards/rejected": -1.7906442880630493,
"step": 70
},
{
"epoch": 0.15702695629416383,
"grad_norm": 13.919238096447465,
"learning_rate": 1.980516482542224e-05,
"logits/chosen": -0.6731249094009399,
"logits/rejected": -0.6837888956069946,
"logps/chosen": -1.2502187490463257,
"logps/rejected": -1.7363303899765015,
"loss": 2.078,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2502187490463257,
"rewards/margins": 0.4861116409301758,
"rewards/rejected": -1.7363303899765015,
"step": 75
},
{
"epoch": 0.16749542004710807,
"grad_norm": 8.714846434947708,
"learning_rate": 1.972668290351084e-05,
"logits/chosen": -0.8093172311782837,
"logits/rejected": -0.8910678029060364,
"logps/chosen": -1.3465213775634766,
"logps/rejected": -1.8265445232391357,
"loss": 2.1277,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3465213775634766,
"rewards/margins": 0.480023056268692,
"rewards/rejected": -1.8265445232391357,
"step": 80
},
{
"epoch": 0.17796388380005235,
"grad_norm": 34.11378786664511,
"learning_rate": 1.9635162068042547e-05,
"logits/chosen": -0.6499379873275757,
"logits/rejected": -0.6738103628158569,
"logps/chosen": -1.2838003635406494,
"logps/rejected": -1.6559721231460571,
"loss": 2.1205,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.2838003635406494,
"rewards/margins": 0.37217170000076294,
"rewards/rejected": -1.6559721231460571,
"step": 85
},
{
"epoch": 0.1884323475529966,
"grad_norm": 6.6135900227176,
"learning_rate": 1.9530725005474195e-05,
"logits/chosen": -0.1341579109430313,
"logits/rejected": -0.1497870236635208,
"logps/chosen": -1.3539865016937256,
"logps/rejected": -1.7489871978759766,
"loss": 2.0639,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.3539865016937256,
"rewards/margins": 0.39500072598457336,
"rewards/rejected": -1.7489871978759766,
"step": 90
},
{
"epoch": 0.19890081130594087,
"grad_norm": 7.360183689725487,
"learning_rate": 1.9413511716856973e-05,
"logits/chosen": -0.12092798948287964,
"logits/rejected": -0.07471726834774017,
"logps/chosen": -1.3030513525009155,
"logps/rejected": -1.8159534931182861,
"loss": 2.0725,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.3030513525009155,
"rewards/margins": 0.512902021408081,
"rewards/rejected": -1.8159534931182861,
"step": 95
},
{
"epoch": 0.2093692750588851,
"grad_norm": 7.942085572142913,
"learning_rate": 1.9283679330160726e-05,
"logits/chosen": 0.026644444093108177,
"logits/rejected": 0.05488858371973038,
"logps/chosen": -1.3510897159576416,
"logps/rejected": -1.8336998224258423,
"loss": 2.0911,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.3510897159576416,
"rewards/margins": 0.482610285282135,
"rewards/rejected": -1.8336998224258423,
"step": 100
},
{
"epoch": 0.21983773881182936,
"grad_norm": 10.38696834889441,
"learning_rate": 1.9141401889639167e-05,
"logits/chosen": 0.12454743683338165,
"logits/rejected": 0.1521395593881607,
"logps/chosen": -1.308062195777893,
"logps/rejected": -1.873884916305542,
"loss": 2.029,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.308062195777893,
"rewards/margins": 0.5658227205276489,
"rewards/rejected": -1.873884916305542,
"step": 105
},
{
"epoch": 0.23030620256477363,
"grad_norm": 14.10578954487523,
"learning_rate": 1.898687012251826e-05,
"logits/chosen": -0.14296935498714447,
"logits/rejected": -0.08335347473621368,
"logps/chosen": -1.3113409280776978,
"logps/rejected": -1.7755804061889648,
"loss": 2.0509,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3113409280776978,
"rewards/margins": 0.4642394483089447,
"rewards/rejected": -1.7755804061889648,
"step": 110
},
{
"epoch": 0.24077466631771788,
"grad_norm": 7.096253964138347,
"learning_rate": 1.8820291183320602e-05,
"logits/chosen": -0.20576635003089905,
"logits/rejected": -0.1285274177789688,
"logps/chosen": -1.2730509042739868,
"logps/rejected": -1.8806079626083374,
"loss": 2.0506,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.2730509042739868,
"rewards/margins": 0.6075571179389954,
"rewards/rejected": -1.8806079626083374,
"step": 115
},
{
"epoch": 0.2512431300706621,
"grad_norm": 8.976667698418499,
"learning_rate": 1.8641888376168483e-05,
"logits/chosen": -0.10974551737308502,
"logits/rejected": -0.07689039409160614,
"logps/chosen": -1.442338466644287,
"logps/rejected": -1.90249764919281,
"loss": 2.1387,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.442338466644287,
"rewards/margins": 0.46015921235084534,
"rewards/rejected": -1.90249764919281,
"step": 120
},
{
"epoch": 0.26171159382360637,
"grad_norm": 11.84902364416852,
"learning_rate": 1.845190085543795e-05,
"logits/chosen": 0.1279464215040207,
"logits/rejected": 0.1599569022655487,
"logps/chosen": -1.29521906375885,
"logps/rejected": -1.5512622594833374,
"loss": 2.0874,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.29521906375885,
"rewards/margins": 0.2560431957244873,
"rewards/rejected": -1.5512622594833374,
"step": 125
},
{
"epoch": 0.2721800575765506,
"grad_norm": 12.665702263586864,
"learning_rate": 1.8250583305165098e-05,
"logits/chosen": 0.10071973502635956,
"logits/rejected": 0.114678755402565,
"logps/chosen": -1.3293492794036865,
"logps/rejected": -1.6235164403915405,
"loss": 2.105,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.3293492794036865,
"rewards/margins": 0.29416733980178833,
"rewards/rejected": -1.6235164403915405,
"step": 130
},
{
"epoch": 0.2826485213294949,
"grad_norm": 11.168921888078422,
"learning_rate": 1.8038205597634392e-05,
"logits/chosen": -0.2312246859073639,
"logits/rejected": -0.13947580754756927,
"logps/chosen": -1.3103423118591309,
"logps/rejected": -1.973184585571289,
"loss": 2.0983,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3103423118591309,
"rewards/margins": 0.662842333316803,
"rewards/rejected": -1.973184585571289,
"step": 135
},
{
"epoch": 0.29311698508243916,
"grad_norm": 15.647952191048011,
"learning_rate": 1.7815052431606702e-05,
"logits/chosen": -0.27144142985343933,
"logits/rejected": -0.2118106335401535,
"logps/chosen": -1.3751564025878906,
"logps/rejected": -2.03005051612854,
"loss": 2.0429,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3751564025878906,
"rewards/margins": 0.6548939943313599,
"rewards/rejected": -2.03005051612854,
"step": 140
},
{
"epoch": 0.3035854488353834,
"grad_norm": 7.242840211884775,
"learning_rate": 1.7581422950671942e-05,
"logits/chosen": -0.19757069647312164,
"logits/rejected": -0.1668623834848404,
"logps/chosen": -1.3345425128936768,
"logps/rejected": -1.8127644062042236,
"loss": 2.0876,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3345425128936768,
"rewards/margins": 0.4782216548919678,
"rewards/rejected": -1.8127644062042236,
"step": 145
},
{
"epoch": 0.31405391258832765,
"grad_norm": 8.615419856166682,
"learning_rate": 1.733763034223804e-05,
"logits/chosen": -0.21767687797546387,
"logits/rejected": -0.21838533878326416,
"logps/chosen": -1.2229845523834229,
"logps/rejected": -1.660559058189392,
"loss": 2.0294,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.2229845523834229,
"rewards/margins": 0.43757471442222595,
"rewards/rejected": -1.660559058189392,
"step": 150
},
{
"epoch": 0.3245223763412719,
"grad_norm": 11.4467193943517,
"learning_rate": 1.7084001417693702e-05,
"logits/chosen": -0.17819705605506897,
"logits/rejected": -0.1267833411693573,
"logps/chosen": -1.389460563659668,
"logps/rejected": -1.8192943334579468,
"loss": 2.084,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.389460563659668,
"rewards/margins": 0.4298337399959564,
"rewards/rejected": -1.8192943334579468,
"step": 155
},
{
"epoch": 0.33499084009421615,
"grad_norm": 8.12988482306829,
"learning_rate": 1.682087617430782e-05,
"logits/chosen": -0.12651406228542328,
"logits/rejected": -0.04694231227040291,
"logps/chosen": -1.318313479423523,
"logps/rejected": -1.8056846857070923,
"loss": 2.0818,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.318313479423523,
"rewards/margins": 0.4873710572719574,
"rewards/rejected": -1.8056846857070923,
"step": 160
},
{
"epoch": 0.34545930384716045,
"grad_norm": 6.762373951814964,
"learning_rate": 1.6548607339452853e-05,
"logits/chosen": -0.10803677886724472,
"logits/rejected": -0.048906028270721436,
"logps/chosen": -1.25996994972229,
"logps/rejected": -1.8231735229492188,
"loss": 2.0354,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.25996994972229,
"rewards/margins": 0.5632035732269287,
"rewards/rejected": -1.8231735229492188,
"step": 165
},
{
"epoch": 0.3559277676001047,
"grad_norm": 11.050433248891123,
"learning_rate": 1.626755989776303e-05,
"logits/chosen": -0.1651381254196167,
"logits/rejected": -0.04633602499961853,
"logps/chosen": -1.4237867593765259,
"logps/rejected": -2.101548671722412,
"loss": 2.0616,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4237867593765259,
"rewards/margins": 0.6777619123458862,
"rewards/rejected": -2.101548671722412,
"step": 170
},
{
"epoch": 0.36639623135304894,
"grad_norm": 7.0268134591588245,
"learning_rate": 1.5978110601861408e-05,
"logits/chosen": -0.12373347580432892,
"logits/rejected": -0.0877654105424881,
"logps/chosen": -1.3757156133651733,
"logps/rejected": -1.7569023370742798,
"loss": 2.072,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.3757156133651733,
"rewards/margins": 0.3811867833137512,
"rewards/rejected": -1.7569023370742798,
"step": 175
},
{
"epoch": 0.3768646951059932,
"grad_norm": 12.761833101713918,
"learning_rate": 1.568064746731156e-05,
"logits/chosen": -0.11597935855388641,
"logits/rejected": -0.1455441117286682,
"logps/chosen": -1.374710202217102,
"logps/rejected": -1.7882392406463623,
"loss": 2.0783,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.374710202217102,
"rewards/margins": 0.4135288596153259,
"rewards/rejected": -1.7882392406463623,
"step": 180
},
{
"epoch": 0.38733315885893743,
"grad_norm": 7.06376095255915,
"learning_rate": 1.5375569252470897e-05,
"logits/chosen": -0.16596433520317078,
"logits/rejected": -0.026778871193528175,
"logps/chosen": -1.3514513969421387,
"logps/rejected": -2.0585455894470215,
"loss": 2.0175,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3514513969421387,
"rewards/margins": 0.7070940732955933,
"rewards/rejected": -2.0585455894470215,
"step": 185
},
{
"epoch": 0.39780162261188173,
"grad_norm": 8.565174444883986,
"learning_rate": 1.506328492394303e-05,
"logits/chosen": -0.1680208444595337,
"logits/rejected": -0.10585353523492813,
"logps/chosen": -1.3384554386138916,
"logps/rejected": -1.7696669101715088,
"loss": 2.1269,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.3384554386138916,
"rewards/margins": 0.43121138215065,
"rewards/rejected": -1.7696669101715088,
"step": 190
},
{
"epoch": 0.408270086364826,
"grad_norm": 8.66730844602624,
"learning_rate": 1.4744213108345605e-05,
"logits/chosen": -0.18466773629188538,
"logits/rejected": -0.03730706498026848,
"logps/chosen": -1.3560011386871338,
"logps/rejected": -1.740012764930725,
"loss": 2.0877,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3560011386871338,
"rewards/margins": 0.3840116560459137,
"rewards/rejected": -1.740012764930725,
"step": 195
},
{
"epoch": 0.4187385501177702,
"grad_norm": 7.228970778278195,
"learning_rate": 1.4418781531128636e-05,
"logits/chosen": -0.0062202452681958675,
"logits/rejected": 0.13902577757835388,
"logps/chosen": -1.3838578462600708,
"logps/rejected": -1.9292205572128296,
"loss": 2.0565,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3838578462600708,
"rewards/margins": 0.5453627705574036,
"rewards/rejected": -1.9292205572128296,
"step": 200
},
{
"epoch": 0.42920701387071447,
"grad_norm": 9.000136464867888,
"learning_rate": 1.4087426443195549e-05,
"logits/chosen": 0.13322147727012634,
"logits/rejected": 0.31764692068099976,
"logps/chosen": -1.2240257263183594,
"logps/rejected": -1.729107141494751,
"loss": 2.0412,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.2240257263183594,
"rewards/margins": 0.5050811171531677,
"rewards/rejected": -1.729107141494751,
"step": 205
},
{
"epoch": 0.4396754776236587,
"grad_norm": 9.154719134241198,
"learning_rate": 1.375059203609562e-05,
"logits/chosen": 0.16319788992404938,
"logits/rejected": 0.3346864581108093,
"logps/chosen": -1.4042994976043701,
"logps/rejected": -1.8507611751556396,
"loss": 2.1446,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.4042994976043701,
"rewards/margins": 0.44646158814430237,
"rewards/rejected": -1.8507611751556396,
"step": 210
},
{
"epoch": 0.45014394137660296,
"grad_norm": 6.0840861971289115,
"learning_rate": 1.3408729846571716e-05,
"logits/chosen": 0.09617350250482559,
"logits/rejected": 0.3308163285255432,
"logps/chosen": -1.284582495689392,
"logps/rejected": -1.9165366888046265,
"loss": 2.0609,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.284582495689392,
"rewards/margins": 0.6319543123245239,
"rewards/rejected": -1.9165366888046265,
"step": 215
},
{
"epoch": 0.46061240512954726,
"grad_norm": 8.00813078143392,
"learning_rate": 1.3062298151261592e-05,
"logits/chosen": 0.044529713690280914,
"logits/rejected": 0.3042605519294739,
"logps/chosen": -1.358139157295227,
"logps/rejected": -1.929091215133667,
"loss": 2.0609,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.358139157295227,
"rewards/margins": 0.5709521770477295,
"rewards/rejected": -1.929091215133667,
"step": 220
},
{
"epoch": 0.4710808688824915,
"grad_norm": 7.246210126735172,
"learning_rate": 1.2711761352364172e-05,
"logits/chosen": 0.03733745217323303,
"logits/rejected": 0.2569599449634552,
"logps/chosen": -1.2875401973724365,
"logps/rejected": -2.0033631324768066,
"loss": 1.9734,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2875401973724365,
"rewards/margins": 0.7158228754997253,
"rewards/rejected": -2.0033631324768066,
"step": 225
},
{
"epoch": 0.48154933263543576,
"grad_norm": 9.113546871891296,
"learning_rate": 1.2357589355094275e-05,
"logits/chosen": 0.014212149195373058,
"logits/rejected": 0.32842034101486206,
"logps/chosen": -1.3232357501983643,
"logps/rejected": -2.133357524871826,
"loss": 2.0056,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.3232357501983643,
"rewards/margins": 0.8101218342781067,
"rewards/rejected": -2.133357524871826,
"step": 230
},
{
"epoch": 0.49201779638838,
"grad_norm": 8.416270817600122,
"learning_rate": 1.2000256937760446e-05,
"logits/chosen": 0.1557755172252655,
"logits/rejected": 0.4363393187522888,
"logps/chosen": -1.2627068758010864,
"logps/rejected": -1.9027000665664673,
"loss": 2.0341,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2627068758010864,
"rewards/margins": 0.6399933099746704,
"rewards/rejected": -1.9027000665664673,
"step": 235
},
{
"epoch": 0.5024862601413242,
"grad_norm": 8.509607776673544,
"learning_rate": 1.1640243115310219e-05,
"logits/chosen": 0.1399160474538803,
"logits/rejected": 0.41544660925865173,
"logps/chosen": -1.2238709926605225,
"logps/rejected": -1.8552753925323486,
"loss": 2.043,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2238709926605225,
"rewards/margins": 0.6314042210578918,
"rewards/rejected": -1.8552753925323486,
"step": 240
},
{
"epoch": 0.5129547238942685,
"grad_norm": 7.675820182938368,
"learning_rate": 1.127803049719605e-05,
"logits/chosen": 0.1161346435546875,
"logits/rejected": 0.3014758825302124,
"logps/chosen": -1.3739269971847534,
"logps/rejected": -1.896535873413086,
"loss": 2.0516,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3739269971847534,
"rewards/margins": 0.5226086378097534,
"rewards/rejected": -1.896535873413086,
"step": 245
},
{
"epoch": 0.5234231876472127,
"grad_norm": 8.105295156815169,
"learning_rate": 1.091410464042268e-05,
"logits/chosen": 0.13467064499855042,
"logits/rejected": 0.20409516990184784,
"logps/chosen": -1.283080816268921,
"logps/rejected": -1.9848954677581787,
"loss": 2.0297,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.283080816268921,
"rewards/margins": 0.701814591884613,
"rewards/rejected": -1.9848954677581787,
"step": 250
},
{
"epoch": 0.533891651400157,
"grad_norm": 8.059542189088054,
"learning_rate": 1.0548953398643276e-05,
"logits/chosen": 0.16154329478740692,
"logits/rejected": 0.32910025119781494,
"logps/chosen": -1.3794437646865845,
"logps/rejected": -2.083132743835449,
"loss": 2.0164,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.3794437646865845,
"rewards/margins": 0.70368891954422,
"rewards/rejected": -2.083132743835449,
"step": 255
},
{
"epoch": 0.5443601151531012,
"grad_norm": 6.4999384094249955,
"learning_rate": 1.0183066268176775e-05,
"logits/chosen": 0.6510103940963745,
"logits/rejected": 1.0612311363220215,
"logps/chosen": -1.3178019523620605,
"logps/rejected": -2.1374964714050293,
"loss": 2.0503,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.3178019523620605,
"rewards/margins": 0.8196946978569031,
"rewards/rejected": -2.1374964714050293,
"step": 260
},
{
"epoch": 0.5548285789060455,
"grad_norm": 7.313206322781419,
"learning_rate": 9.81693373182323e-06,
"logits/chosen": 0.484092652797699,
"logits/rejected": 0.6402750015258789,
"logps/chosen": -1.3769603967666626,
"logps/rejected": -1.7423893213272095,
"loss": 2.0132,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.3769603967666626,
"rewards/margins": 0.365428626537323,
"rewards/rejected": -1.7423893213272095,
"step": 265
},
{
"epoch": 0.5652970426589898,
"grad_norm": 8.756069730601565,
"learning_rate": 9.451046601356725e-06,
"logits/chosen": 0.13520203530788422,
"logits/rejected": 0.3411861062049866,
"logps/chosen": -1.3921834230422974,
"logps/rejected": -1.8440046310424805,
"loss": 2.0618,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3921834230422974,
"rewards/margins": 0.4518211781978607,
"rewards/rejected": -1.8440046310424805,
"step": 270
},
{
"epoch": 0.575765506411934,
"grad_norm": 6.597441645495227,
"learning_rate": 9.085895359577324e-06,
"logits/chosen": -0.17431692779064178,
"logits/rejected": 0.03041163645684719,
"logps/chosen": -1.317625641822815,
"logps/rejected": -2.10213041305542,
"loss": 2.1302,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.317625641822815,
"rewards/margins": 0.7845045328140259,
"rewards/rejected": -2.10213041305542,
"step": 275
},
{
"epoch": 0.5862339701648783,
"grad_norm": 7.014450023153696,
"learning_rate": 8.721969502803954e-06,
"logits/chosen": -0.2185564786195755,
"logits/rejected": -0.061094462871551514,
"logps/chosen": -1.2835057973861694,
"logps/rejected": -1.7011182308197021,
"loss": 2.0526,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2835057973861694,
"rewards/margins": 0.41761231422424316,
"rewards/rejected": -1.7011182308197021,
"step": 280
},
{
"epoch": 0.5967024339178225,
"grad_norm": 8.176491560294703,
"learning_rate": 8.359756884689785e-06,
"logits/chosen": -0.24773511290550232,
"logits/rejected": -0.19381779432296753,
"logps/chosen": -1.4067161083221436,
"logps/rejected": -2.0575575828552246,
"loss": 2.0134,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.4067161083221436,
"rewards/margins": 0.6508415341377258,
"rewards/rejected": -2.0575575828552246,
"step": 285
},
{
"epoch": 0.6071708976707668,
"grad_norm": 7.876329412436082,
"learning_rate": 7.999743062239557e-06,
"logits/chosen": -0.4009264409542084,
"logits/rejected": -0.22962765395641327,
"logps/chosen": -1.3704339265823364,
"logps/rejected": -2.099834680557251,
"loss": 2.0007,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3704339265823364,
"rewards/margins": 0.7294005155563354,
"rewards/rejected": -2.099834680557251,
"step": 290
},
{
"epoch": 0.6176393614237111,
"grad_norm": 7.662219802711994,
"learning_rate": 7.642410644905726e-06,
"logits/chosen": -0.4107929766178131,
"logits/rejected": -0.22412636876106262,
"logps/chosen": -1.3308082818984985,
"logps/rejected": -2.054591655731201,
"loss": 2.0538,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3308082818984985,
"rewards/margins": 0.7237831354141235,
"rewards/rejected": -2.054591655731201,
"step": 295
},
{
"epoch": 0.6281078251766553,
"grad_norm": 8.408469077357054,
"learning_rate": 7.2882386476358304e-06,
"logits/chosen": -0.4748775064945221,
"logits/rejected": -0.3870747983455658,
"logps/chosen": -1.3256374597549438,
"logps/rejected": -1.8613688945770264,
"loss": 2.0223,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.3256374597549438,
"rewards/margins": 0.5357314348220825,
"rewards/rejected": -1.8613688945770264,
"step": 300
},
{
"epoch": 0.6385762889295996,
"grad_norm": 9.20767722080512,
"learning_rate": 6.937701848738407e-06,
"logits/chosen": -0.5532702803611755,
"logits/rejected": -0.517833411693573,
"logps/chosen": -1.3089849948883057,
"logps/rejected": -1.9914848804473877,
"loss": 2.0097,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3089849948883057,
"rewards/margins": 0.6825000047683716,
"rewards/rejected": -1.9914848804473877,
"step": 305
},
{
"epoch": 0.6490447526825438,
"grad_norm": 9.168807609346384,
"learning_rate": 6.591270153428288e-06,
"logits/chosen": -0.5921626687049866,
"logits/rejected": -0.5593982934951782,
"logps/chosen": -1.233320951461792,
"logps/rejected": -1.824541449546814,
"loss": 2.0114,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.233320951461792,
"rewards/margins": 0.5912207365036011,
"rewards/rejected": -1.824541449546814,
"step": 310
},
{
"epoch": 0.6595132164354881,
"grad_norm": 11.74928248778955,
"learning_rate": 6.249407963904381e-06,
"logits/chosen": -0.5820972323417664,
"logits/rejected": -0.37617072463035583,
"logps/chosen": -1.308586835861206,
"logps/rejected": -2.1290369033813477,
"loss": 2.0377,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.308586835861206,
"rewards/margins": 0.8204501271247864,
"rewards/rejected": -2.1290369033813477,
"step": 315
},
{
"epoch": 0.6699816801884323,
"grad_norm": 7.446497900716036,
"learning_rate": 5.912573556804453e-06,
"logits/chosen": -0.4366278648376465,
"logits/rejected": -0.34356969594955444,
"logps/chosen": -1.3265436887741089,
"logps/rejected": -1.9333693981170654,
"loss": 1.9563,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.3265436887741089,
"rewards/margins": 0.6068258285522461,
"rewards/rejected": -1.9333693981170654,
"step": 320
},
{
"epoch": 0.6804501439413766,
"grad_norm": 7.834867022615392,
"learning_rate": 5.581218468871365e-06,
"logits/chosen": -0.42373937368392944,
"logits/rejected": -0.1534721851348877,
"logps/chosen": -1.178399682044983,
"logps/rejected": -1.9840328693389893,
"loss": 1.9479,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.178399682044983,
"rewards/margins": 0.8056330680847168,
"rewards/rejected": -1.9840328693389893,
"step": 325
},
{
"epoch": 0.6909186076943209,
"grad_norm": 7.323036569812089,
"learning_rate": 5.2557868916543996e-06,
"logits/chosen": -0.24924680590629578,
"logits/rejected": 0.03496779128909111,
"logps/chosen": -1.226240873336792,
"logps/rejected": -1.8618109226226807,
"loss": 1.9899,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.226240873336792,
"rewards/margins": 0.6355697512626648,
"rewards/rejected": -1.8618109226226807,
"step": 330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 7.704436038290053,
"learning_rate": 4.9367150760569746e-06,
"logits/chosen": -0.289539635181427,
"logits/rejected": 0.08722052723169327,
"logps/chosen": -1.2469284534454346,
"logps/rejected": -2.0381367206573486,
"loss": 1.9837,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2469284534454346,
"rewards/margins": 0.7912081480026245,
"rewards/rejected": -2.0381367206573486,
"step": 335
},
{
"epoch": 0.7118555352002094,
"grad_norm": 8.947529662445875,
"learning_rate": 4.6244307475291025e-06,
"logits/chosen": -0.18107546865940094,
"logits/rejected": 0.22904996573925018,
"logps/chosen": -1.446345329284668,
"logps/rejected": -2.1773736476898193,
"loss": 2.0338,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.446345329284668,
"rewards/margins": 0.7310282588005066,
"rewards/rejected": -2.1773736476898193,
"step": 340
},
{
"epoch": 0.7223239989531536,
"grad_norm": 9.583549007245818,
"learning_rate": 4.319352532688444e-06,
"logits/chosen": -0.29274436831474304,
"logits/rejected": 0.00033287107362411916,
"logps/chosen": -1.2861610651016235,
"logps/rejected": -1.9985120296478271,
"loss": 2.0307,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.2861610651016235,
"rewards/margins": 0.7123511433601379,
"rewards/rejected": -1.9985120296478271,
"step": 345
},
{
"epoch": 0.7327924627060979,
"grad_norm": 6.266721042983619,
"learning_rate": 4.0218893981385935e-06,
"logits/chosen": -0.28006237745285034,
"logits/rejected": -0.14952346682548523,
"logps/chosen": -1.2466567754745483,
"logps/rejected": -1.7666349411010742,
"loss": 2.0448,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2466567754745483,
"rewards/margins": 0.5199781656265259,
"rewards/rejected": -1.7666349411010742,
"step": 350
},
{
"epoch": 0.7432609264590422,
"grad_norm": 8.529189010191509,
"learning_rate": 3.732440102236975e-06,
"logits/chosen": -0.38612329959869385,
"logits/rejected": -0.1318252980709076,
"logps/chosen": -1.1369296312332153,
"logps/rejected": -1.9028323888778687,
"loss": 1.9434,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.1369296312332153,
"rewards/margins": 0.7659028172492981,
"rewards/rejected": -1.9028323888778687,
"step": 355
},
{
"epoch": 0.7537293902119864,
"grad_norm": 8.98691100719935,
"learning_rate": 3.4513926605471504e-06,
"logits/chosen": -0.2708672881126404,
"logits/rejected": -0.002604148583486676,
"logps/chosen": -1.2128788232803345,
"logps/rejected": -1.881731629371643,
"loss": 1.9188,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2128788232803345,
"rewards/margins": 0.6688528060913086,
"rewards/rejected": -1.881731629371643,
"step": 360
},
{
"epoch": 0.7641978539649307,
"grad_norm": 8.427363605011601,
"learning_rate": 3.1791238256921785e-06,
"logits/chosen": -0.2245834320783615,
"logits/rejected": 0.03710466995835304,
"logps/chosen": -1.3998098373413086,
"logps/rejected": -2.1002440452575684,
"loss": 2.0448,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3998098373413086,
"rewards/margins": 0.7004340887069702,
"rewards/rejected": -2.1002440452575684,
"step": 365
},
{
"epoch": 0.7746663177178749,
"grad_norm": 8.84465057372313,
"learning_rate": 2.9159985823062997e-06,
"logits/chosen": -0.35639292001724243,
"logits/rejected": -0.17423222959041595,
"logps/chosen": -1.3097021579742432,
"logps/rejected": -2.0959980487823486,
"loss": 1.9799,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.3097021579742432,
"rewards/margins": 0.7862957715988159,
"rewards/rejected": -2.0959980487823486,
"step": 370
},
{
"epoch": 0.7851347814708192,
"grad_norm": 7.000990509721392,
"learning_rate": 2.662369657761963e-06,
"logits/chosen": -0.32897457480430603,
"logits/rejected": -0.3297235369682312,
"logps/chosen": -1.2792903184890747,
"logps/rejected": -1.8416297435760498,
"loss": 2.0065,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2792903184890747,
"rewards/margins": 0.5623396635055542,
"rewards/rejected": -1.8416297435760498,
"step": 375
},
{
"epoch": 0.7956032452237635,
"grad_norm": 8.146523706243913,
"learning_rate": 2.418577049328058e-06,
"logits/chosen": -0.3611428439617157,
"logits/rejected": -0.2454133927822113,
"logps/chosen": -1.266564130783081,
"logps/rejected": -1.7843055725097656,
"loss": 1.9381,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.266564130783081,
"rewards/margins": 0.517741322517395,
"rewards/rejected": -1.7843055725097656,
"step": 380
},
{
"epoch": 0.8060717089767077,
"grad_norm": 8.167524524758202,
"learning_rate": 2.1849475683932996e-06,
"logits/chosen": -0.3831802010536194,
"logits/rejected": -0.2638740539550781,
"logps/chosen": -1.2620943784713745,
"logps/rejected": -1.9156465530395508,
"loss": 1.9611,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.2620943784713745,
"rewards/margins": 0.6535523533821106,
"rewards/rejected": -1.9156465530395508,
"step": 385
},
{
"epoch": 0.816540172729652,
"grad_norm": 8.30770659075244,
"learning_rate": 1.961794402365611e-06,
"logits/chosen": -0.3085532486438751,
"logits/rejected": -0.14951160550117493,
"logps/chosen": -1.2462884187698364,
"logps/rejected": -2.1334927082061768,
"loss": 1.9555,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.2462884187698364,
"rewards/margins": 0.8872040510177612,
"rewards/rejected": -2.1334927082061768,
"step": 390
},
{
"epoch": 0.8270086364825961,
"grad_norm": 7.7825890292689905,
"learning_rate": 1.7494166948349057e-06,
"logits/chosen": -0.267643541097641,
"logits/rejected": 0.02370324358344078,
"logps/chosen": -1.3041934967041016,
"logps/rejected": -2.0585289001464844,
"loss": 1.9722,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3041934967041016,
"rewards/margins": 0.7543356418609619,
"rewards/rejected": -2.0585289001464844,
"step": 395
},
{
"epoch": 0.8374771002355405,
"grad_norm": 11.47946040263076,
"learning_rate": 1.5480991445620541e-06,
"logits/chosen": -0.2226092368364334,
"logits/rejected": 0.03486952185630798,
"logps/chosen": -1.237866759300232,
"logps/rejected": -1.9466907978057861,
"loss": 1.9589,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.237866759300232,
"rewards/margins": 0.7088239192962646,
"rewards/rejected": -1.9466907978057861,
"step": 400
},
{
"epoch": 0.8479455639884846,
"grad_norm": 8.663615848996892,
"learning_rate": 1.3581116238315194e-06,
"logits/chosen": -0.2045535147190094,
"logits/rejected": 0.006712320260703564,
"logps/chosen": -1.3987605571746826,
"logps/rejected": -2.1295289993286133,
"loss": 2.0021,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.3987605571746826,
"rewards/margins": 0.7307685017585754,
"rewards/rejected": -2.1295289993286133,
"step": 405
},
{
"epoch": 0.8584140277414289,
"grad_norm": 8.186995321334237,
"learning_rate": 1.1797088166794002e-06,
"logits/chosen": -0.20332176983356476,
"logits/rejected": 0.029474016278982162,
"logps/chosen": -1.172918677330017,
"logps/rejected": -1.7614654302597046,
"loss": 1.9558,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.172918677330017,
"rewards/margins": 0.5885466933250427,
"rewards/rejected": -1.7614654302597046,
"step": 410
},
{
"epoch": 0.8688824914943732,
"grad_norm": 6.525083767762586,
"learning_rate": 1.013129877481741e-06,
"logits/chosen": -0.2240767925977707,
"logits/rejected": 0.07820748537778854,
"logps/chosen": -1.20893132686615,
"logps/rejected": -1.91985285282135,
"loss": 1.9579,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.20893132686615,
"rewards/margins": 0.7109212875366211,
"rewards/rejected": -1.91985285282135,
"step": 415
},
{
"epoch": 0.8793509552473174,
"grad_norm": 9.083137976581597,
"learning_rate": 8.585981103608343e-07,
"logits/chosen": -0.11082730442285538,
"logits/rejected": 0.09507735818624496,
"logps/chosen": -1.1996517181396484,
"logps/rejected": -1.8896563053131104,
"loss": 2.0042,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1996517181396484,
"rewards/margins": 0.6900044679641724,
"rewards/rejected": -1.8896563053131104,
"step": 420
},
{
"epoch": 0.8898194190002617,
"grad_norm": 6.873945114472838,
"learning_rate": 7.163206698392744e-07,
"logits/chosen": -0.10862954705953598,
"logits/rejected": 0.1946602761745453,
"logps/chosen": -1.3608647584915161,
"logps/rejected": -1.970487356185913,
"loss": 1.9911,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.3608647584915161,
"rewards/margins": 0.6096227169036865,
"rewards/rejected": -1.970487356185913,
"step": 425
},
{
"epoch": 0.9002878827532059,
"grad_norm": 7.542928376234922,
"learning_rate": 5.864882831430274e-07,
"logits/chosen": -0.16213567554950714,
"logits/rejected": 0.21448484063148499,
"logps/chosen": -1.3169506788253784,
"logps/rejected": -2.048978328704834,
"loss": 1.956,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3169506788253784,
"rewards/margins": 0.7320275902748108,
"rewards/rejected": -2.048978328704834,
"step": 430
},
{
"epoch": 0.9107563465061502,
"grad_norm": 10.100813855786358,
"learning_rate": 4.6927499452580574e-07,
"logits/chosen": -0.12442419677972794,
"logits/rejected": 0.06503897905349731,
"logps/chosen": -1.288496732711792,
"logps/rejected": -2.066117286682129,
"loss": 1.9804,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.288496732711792,
"rewards/margins": 0.777620792388916,
"rewards/rejected": -2.066117286682129,
"step": 435
},
{
"epoch": 0.9212248102590945,
"grad_norm": 10.273579675957365,
"learning_rate": 3.6483793195745686e-07,
"logits/chosen": -0.04678087681531906,
"logits/rejected": 0.3355256915092468,
"logps/chosen": -1.2764372825622559,
"logps/rejected": -2.03460955619812,
"loss": 1.9929,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2764372825622559,
"rewards/margins": 0.7581723928451538,
"rewards/rejected": -2.03460955619812,
"step": 440
},
{
"epoch": 0.9316932740120387,
"grad_norm": 8.126486085847516,
"learning_rate": 2.733170964891607e-07,
"logits/chosen": -0.17456679046154022,
"logits/rejected": 0.09745622426271439,
"logps/chosen": -1.2472385168075562,
"logps/rejected": -1.892249345779419,
"loss": 1.9962,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2472385168075562,
"rewards/margins": 0.6450108289718628,
"rewards/rejected": -1.892249345779419,
"step": 445
},
{
"epoch": 0.942161737764983,
"grad_norm": 8.866488191256463,
"learning_rate": 1.9483517457776436e-07,
"logits/chosen": -0.05351231247186661,
"logits/rejected": 0.1783636510372162,
"logps/chosen": -1.2550338506698608,
"logps/rejected": -1.8777573108673096,
"loss": 1.9585,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2550338506698608,
"rewards/margins": 0.6227231621742249,
"rewards/rejected": -1.8777573108673096,
"step": 450
},
{
"epoch": 0.9526302015179272,
"grad_norm": 8.43900399314022,
"learning_rate": 1.2949737362087156e-07,
"logits/chosen": -0.09251005947589874,
"logits/rejected": 0.2792736291885376,
"logps/chosen": -1.381317377090454,
"logps/rejected": -1.868950605392456,
"loss": 1.9791,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.381317377090454,
"rewards/margins": 0.48763322830200195,
"rewards/rejected": -1.868950605392456,
"step": 455
},
{
"epoch": 0.9630986652708715,
"grad_norm": 7.3135362806868365,
"learning_rate": 7.73912809231292e-08,
"logits/chosen": -0.15108491480350494,
"logits/rejected": 0.16171926259994507,
"logps/chosen": -1.2007992267608643,
"logps/rejected": -2.045020341873169,
"loss": 1.9082,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2007992267608643,
"rewards/margins": 0.8442209362983704,
"rewards/rejected": -2.045020341873169,
"step": 460
},
{
"epoch": 0.9735671290238157,
"grad_norm": 8.496120629902867,
"learning_rate": 3.858674628278825e-08,
"logits/chosen": -0.16963747143745422,
"logits/rejected": 0.33284881711006165,
"logps/chosen": -1.3178845643997192,
"logps/rejected": -2.077279806137085,
"loss": 1.9421,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3178845643997192,
"rewards/margins": 0.7593953013420105,
"rewards/rejected": -2.077279806137085,
"step": 465
},
{
"epoch": 0.98403559277676,
"grad_norm": 8.29474797261466,
"learning_rate": 1.3135788355934652e-08,
"logits/chosen": -0.18520286679267883,
"logits/rejected": 0.14646300673484802,
"logps/chosen": -1.2585632801055908,
"logps/rejected": -1.9155442714691162,
"loss": 1.9995,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2585632801055908,
"rewards/margins": 0.6569809317588806,
"rewards/rejected": -1.9155442714691162,
"step": 470
},
{
"epoch": 0.9945040565297043,
"grad_norm": 8.421060788269113,
"learning_rate": 1.0725249238940916e-09,
"logits/chosen": -0.2087993174791336,
"logits/rejected": 0.3205938935279846,
"logps/chosen": -1.2470612525939941,
"logps/rejected": -1.9542922973632812,
"loss": 1.985,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.2470612525939941,
"rewards/margins": 0.7072311639785767,
"rewards/rejected": -1.9542922973632812,
"step": 475
},
{
"epoch": 0.998691442030882,
"step": 477,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 4.0963,
"train_samples_per_second": 14924.323,
"train_steps_per_second": 116.446
}
],
"logging_steps": 5,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}