Safetensors
llama
Llama3-SimPO / trainer_state.json
sabersaleh's picture
Upload folder using huggingface_hub
26cfb89 verified
raw
history blame
51.8 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 400,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010468463752944255,
"grad_norm": 89.9968305873071,
"learning_rate": 6.25e-08,
"logits/chosen": -0.7388366460800171,
"logits/rejected": -0.7827404141426086,
"logps/chosen": -1.15103280544281,
"logps/rejected": -1.2909390926361084,
"loss": 1.2935,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.30206561088562,
"rewards/margins": 0.2798125147819519,
"rewards/rejected": -2.581878185272217,
"step": 5
},
{
"epoch": 0.02093692750588851,
"grad_norm": 24.705919418070632,
"learning_rate": 1.25e-07,
"logits/chosen": -0.7937806844711304,
"logits/rejected": -0.8651958703994751,
"logps/chosen": -1.1529361009597778,
"logps/rejected": -1.3611778020858765,
"loss": 1.314,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.3058722019195557,
"rewards/margins": 0.41648340225219727,
"rewards/rejected": -2.722355604171753,
"step": 10
},
{
"epoch": 0.031405391258832765,
"grad_norm": 27.735520006717728,
"learning_rate": 1.875e-07,
"logits/chosen": -0.7491501569747925,
"logits/rejected": -0.8338179588317871,
"logps/chosen": -1.1712462902069092,
"logps/rejected": -1.270825743675232,
"loss": 1.2667,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.3424925804138184,
"rewards/margins": 0.19915875792503357,
"rewards/rejected": -2.541651487350464,
"step": 15
},
{
"epoch": 0.04187385501177702,
"grad_norm": 22.322171681204715,
"learning_rate": 2.5e-07,
"logits/chosen": -0.7619983553886414,
"logits/rejected": -0.9046538472175598,
"logps/chosen": -1.1294901371002197,
"logps/rejected": -1.2941240072250366,
"loss": 1.2696,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2589802742004395,
"rewards/margins": 0.32926779985427856,
"rewards/rejected": -2.5882480144500732,
"step": 20
},
{
"epoch": 0.05234231876472128,
"grad_norm": 12.849323230827375,
"learning_rate": 3.125e-07,
"logits/chosen": -0.772399365901947,
"logits/rejected": -0.8519186973571777,
"logps/chosen": -1.077214002609253,
"logps/rejected": -1.2762653827667236,
"loss": 1.2362,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.154428005218506,
"rewards/margins": 0.39810293912887573,
"rewards/rejected": -2.5525307655334473,
"step": 25
},
{
"epoch": 0.06281078251766553,
"grad_norm": 84.84769866542291,
"learning_rate": 3.75e-07,
"logits/chosen": -0.7909184694290161,
"logits/rejected": -0.8215691447257996,
"logps/chosen": -1.059594988822937,
"logps/rejected": -1.0990025997161865,
"loss": 1.2897,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.119189977645874,
"rewards/margins": 0.0788152664899826,
"rewards/rejected": -2.198005199432373,
"step": 30
},
{
"epoch": 0.07327924627060979,
"grad_norm": 12.477109087394112,
"learning_rate": 4.3749999999999994e-07,
"logits/chosen": -0.7678741216659546,
"logits/rejected": -0.8405346870422363,
"logps/chosen": -0.9820269346237183,
"logps/rejected": -1.2532163858413696,
"loss": 1.2497,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9640538692474365,
"rewards/margins": 0.5423787236213684,
"rewards/rejected": -2.5064327716827393,
"step": 35
},
{
"epoch": 0.08374771002355404,
"grad_norm": 10.85962784004132,
"learning_rate": 5e-07,
"logits/chosen": -0.7665027379989624,
"logits/rejected": -0.8336607217788696,
"logps/chosen": -0.9715523719787598,
"logps/rejected": -1.1505324840545654,
"loss": 1.2359,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9431047439575195,
"rewards/margins": 0.35795995593070984,
"rewards/rejected": -2.301064968109131,
"step": 40
},
{
"epoch": 0.0942161737764983,
"grad_norm": 10.414385637292323,
"learning_rate": 5.625e-07,
"logits/chosen": -0.7420114874839783,
"logits/rejected": -0.8339902758598328,
"logps/chosen": -0.9872716665267944,
"logps/rejected": -1.1155823469161987,
"loss": 1.2267,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.9745433330535889,
"rewards/margins": 0.2566211223602295,
"rewards/rejected": -2.2311646938323975,
"step": 45
},
{
"epoch": 0.10468463752944256,
"grad_norm": 9.651448839940226,
"learning_rate": 5.999678242522831e-07,
"logits/chosen": -0.7927948832511902,
"logits/rejected": -0.8290635943412781,
"logps/chosen": -0.9459100961685181,
"logps/rejected": -1.2578647136688232,
"loss": 1.2207,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8918201923370361,
"rewards/margins": 0.6239093542098999,
"rewards/rejected": -2.5157294273376465,
"step": 50
},
{
"epoch": 0.11515310128238682,
"grad_norm": 43.90472722310407,
"learning_rate": 5.996059263493219e-07,
"logits/chosen": -0.7944079637527466,
"logits/rejected": -0.9001775979995728,
"logps/chosen": -1.072819471359253,
"logps/rejected": -1.181773066520691,
"loss": 1.2551,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.145638942718506,
"rewards/margins": 0.21790704131126404,
"rewards/rejected": -2.363546133041382,
"step": 55
},
{
"epoch": 0.12562156503533106,
"grad_norm": 13.393066662370963,
"learning_rate": 5.988423976115163e-07,
"logits/chosen": -0.7826106548309326,
"logits/rejected": -0.8369284868240356,
"logps/chosen": -1.0628390312194824,
"logps/rejected": -1.2253072261810303,
"loss": 1.2246,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.125678062438965,
"rewards/margins": 0.32493603229522705,
"rewards/rejected": -2.4506144523620605,
"step": 60
},
{
"epoch": 0.1360900287882753,
"grad_norm": 26.206483702491475,
"learning_rate": 5.976782615723061e-07,
"logits/chosen": -0.7975456714630127,
"logits/rejected": -0.8562803268432617,
"logps/chosen": -1.0680768489837646,
"logps/rejected": -1.2204017639160156,
"loss": 1.2268,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.1361536979675293,
"rewards/margins": 0.3046496510505676,
"rewards/rejected": -2.4408035278320312,
"step": 65
},
{
"epoch": 0.14655849254121958,
"grad_norm": 13.41584537004533,
"learning_rate": 5.961150787913738e-07,
"logits/chosen": -0.8376196622848511,
"logits/rejected": -0.9019572138786316,
"logps/chosen": -1.0893644094467163,
"logps/rejected": -1.2784545421600342,
"loss": 1.1754,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.1787288188934326,
"rewards/margins": 0.37818047404289246,
"rewards/rejected": -2.5569090843200684,
"step": 70
},
{
"epoch": 0.15702695629416383,
"grad_norm": 32.22425187362688,
"learning_rate": 5.941549447626671e-07,
"logits/chosen": -0.804112434387207,
"logits/rejected": -0.845563530921936,
"logps/chosen": -1.0805425643920898,
"logps/rejected": -1.3212538957595825,
"loss": 1.209,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.1610851287841797,
"rewards/margins": 0.4814226031303406,
"rewards/rejected": -2.642507791519165,
"step": 75
},
{
"epoch": 0.16749542004710807,
"grad_norm": 8.981853488976475,
"learning_rate": 5.918004871053251e-07,
"logits/chosen": -0.7968226671218872,
"logits/rejected": -0.8211067318916321,
"logps/chosen": -1.026604413986206,
"logps/rejected": -1.3631267547607422,
"loss": 1.1624,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.053208827972412,
"rewards/margins": 0.673044741153717,
"rewards/rejected": -2.7262535095214844,
"step": 80
},
{
"epoch": 0.17796388380005235,
"grad_norm": 17.367470137588203,
"learning_rate": 5.890548620412763e-07,
"logits/chosen": -0.8126602172851562,
"logits/rejected": -0.8794834017753601,
"logps/chosen": -1.0674957036972046,
"logps/rejected": -1.3523355722427368,
"loss": 1.1625,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.134991407394409,
"rewards/margins": 0.5696790814399719,
"rewards/rejected": -2.7046711444854736,
"step": 85
},
{
"epoch": 0.1884323475529966,
"grad_norm": 17.833322868673477,
"learning_rate": 5.859217501642258e-07,
"logits/chosen": -0.840762734413147,
"logits/rejected": -0.9274584054946899,
"logps/chosen": -1.1602346897125244,
"logps/rejected": -1.5290915966033936,
"loss": 1.1734,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.320469379425049,
"rewards/margins": 0.7377143502235413,
"rewards/rejected": -3.058183193206787,
"step": 90
},
{
"epoch": 0.19890081130594087,
"grad_norm": 22.98307788140464,
"learning_rate": 5.824053515057091e-07,
"logits/chosen": -0.8092079162597656,
"logits/rejected": -0.8328098058700562,
"logps/chosen": -1.133385419845581,
"logps/rejected": -1.4298288822174072,
"loss": 1.1919,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.266770839691162,
"rewards/margins": 0.5928869247436523,
"rewards/rejected": -2.8596577644348145,
"step": 95
},
{
"epoch": 0.2093692750588851,
"grad_norm": 11.026437481785171,
"learning_rate": 5.785103799048218e-07,
"logits/chosen": -0.8240598440170288,
"logits/rejected": -0.8689464330673218,
"logps/chosen": -1.147385835647583,
"logps/rejected": -1.3535184860229492,
"loss": 1.2131,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.294771671295166,
"rewards/margins": 0.41226544976234436,
"rewards/rejected": -2.7070369720458984,
"step": 100
},
{
"epoch": 0.21983773881182936,
"grad_norm": 9.837343506686455,
"learning_rate": 5.742420566891749e-07,
"logits/chosen": -0.7966706156730652,
"logits/rejected": -0.878908634185791,
"logps/chosen": -1.1871858835220337,
"logps/rejected": -1.4869831800460815,
"loss": 1.1062,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.3743717670440674,
"rewards/margins": 0.5995948314666748,
"rewards/rejected": -2.973966360092163,
"step": 105
},
{
"epoch": 0.23030620256477363,
"grad_norm": 19.01097451640794,
"learning_rate": 5.696061036755478e-07,
"logits/chosen": -0.7402995228767395,
"logits/rejected": -0.8451690673828125,
"logps/chosen": -1.0870535373687744,
"logps/rejected": -1.3536127805709839,
"loss": 1.1368,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.174107074737549,
"rewards/margins": 0.5331184267997742,
"rewards/rejected": -2.7072255611419678,
"step": 110
},
{
"epoch": 0.24077466631771788,
"grad_norm": 89.427421788791,
"learning_rate": 5.64608735499618e-07,
"logits/chosen": -0.833459734916687,
"logits/rejected": -0.829018235206604,
"logps/chosen": -1.150940179824829,
"logps/rejected": -1.287229061126709,
"loss": 1.1596,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.301880359649658,
"rewards/margins": 0.2725774943828583,
"rewards/rejected": -2.574458122253418,
"step": 115
},
{
"epoch": 0.2512431300706621,
"grad_norm": 31.745365051153907,
"learning_rate": 5.592566512850545e-07,
"logits/chosen": -0.79100501537323,
"logits/rejected": -0.8663417100906372,
"logps/chosen": -1.0571635961532593,
"logps/rejected": -1.4087059497833252,
"loss": 1.1752,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.1143271923065186,
"rewards/margins": 0.703084409236908,
"rewards/rejected": -2.8174118995666504,
"step": 120
},
{
"epoch": 0.26171159382360637,
"grad_norm": 14.496796822119729,
"learning_rate": 5.535570256631384e-07,
"logits/chosen": -0.798068642616272,
"logits/rejected": -0.7694944143295288,
"logps/chosen": -1.171478271484375,
"logps/rejected": -1.5117442607879639,
"loss": 1.1603,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.34295654296875,
"rewards/margins": 0.6805320978164673,
"rewards/rejected": -3.0234885215759277,
"step": 125
},
{
"epoch": 0.2721800575765506,
"grad_norm": 11.15517991690276,
"learning_rate": 5.475174991549528e-07,
"logits/chosen": -0.7599740624427795,
"logits/rejected": -0.8051120638847351,
"logps/chosen": -1.1963175535202026,
"logps/rejected": -1.5290193557739258,
"loss": 1.1204,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.3926351070404053,
"rewards/margins": 0.6654035449028015,
"rewards/rejected": -3.0580387115478516,
"step": 130
},
{
"epoch": 0.2826485213294949,
"grad_norm": 13.030746243741968,
"learning_rate": 5.411461679290317e-07,
"logits/chosen": -0.7586075663566589,
"logits/rejected": -0.7899220585823059,
"logps/chosen": -1.0880517959594727,
"logps/rejected": -1.4661823511123657,
"loss": 1.1668,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1761035919189453,
"rewards/margins": 0.7562611103057861,
"rewards/rejected": -2.9323647022247314,
"step": 135
},
{
"epoch": 0.29311698508243916,
"grad_norm": 12.738817253337984,
"learning_rate": 5.34451572948201e-07,
"logits/chosen": -0.8128818273544312,
"logits/rejected": -0.842110812664032,
"logps/chosen": -1.2075114250183105,
"logps/rejected": -1.4238183498382568,
"loss": 1.2141,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.415022850036621,
"rewards/margins": 0.4326140284538269,
"rewards/rejected": -2.8476366996765137,
"step": 140
},
{
"epoch": 0.3035854488353834,
"grad_norm": 24.983190739092922,
"learning_rate": 5.274426885201582e-07,
"logits/chosen": -0.7843077778816223,
"logits/rejected": -0.8767129182815552,
"logps/chosen": -1.1461377143859863,
"logps/rejected": -1.5009636878967285,
"loss": 1.1207,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.2922754287719727,
"rewards/margins": 0.7096518278121948,
"rewards/rejected": -3.001927375793457,
"step": 145
},
{
"epoch": 0.31405391258832765,
"grad_norm": 23.74860585722539,
"learning_rate": 5.201289102671411e-07,
"logits/chosen": -0.8561376333236694,
"logits/rejected": -0.8589056134223938,
"logps/chosen": -1.1982135772705078,
"logps/rejected": -1.5201013088226318,
"loss": 1.1476,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.3964271545410156,
"rewards/margins": 0.6437759399414062,
"rewards/rejected": -3.0402026176452637,
"step": 150
},
{
"epoch": 0.3245223763412719,
"grad_norm": 15.03909875634319,
"learning_rate": 5.12520042530811e-07,
"logits/chosen": -0.7681445479393005,
"logits/rejected": -0.8174452781677246,
"logps/chosen": -1.2068870067596436,
"logps/rejected": -1.6613304615020752,
"loss": 1.1179,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.413774013519287,
"rewards/margins": 0.908886730670929,
"rewards/rejected": -3.3226609230041504,
"step": 155
},
{
"epoch": 0.33499084009421615,
"grad_norm": 14.191169695059497,
"learning_rate": 5.046262852292346e-07,
"logits/chosen": -0.8029179573059082,
"logits/rejected": -0.8746109008789062,
"logps/chosen": -1.1898596286773682,
"logps/rejected": -1.6815717220306396,
"loss": 1.1138,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.3797192573547363,
"rewards/margins": 0.9834240674972534,
"rewards/rejected": -3.3631434440612793,
"step": 160
},
{
"epoch": 0.34545930384716045,
"grad_norm": 35.93680907186828,
"learning_rate": 4.964582201835856e-07,
"logits/chosen": -0.7598133087158203,
"logits/rejected": -0.7828689813613892,
"logps/chosen": -1.1410859823226929,
"logps/rejected": -1.5104478597640991,
"loss": 1.1132,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.2821719646453857,
"rewards/margins": 0.7387233972549438,
"rewards/rejected": -3.0208957195281982,
"step": 165
},
{
"epoch": 0.3559277676001047,
"grad_norm": 33.280459458949075,
"learning_rate": 4.880267969328908e-07,
"logits/chosen": -0.7489741444587708,
"logits/rejected": -0.8511075973510742,
"logps/chosen": -1.2344070672988892,
"logps/rejected": -1.6722608804702759,
"loss": 1.1051,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.4688141345977783,
"rewards/margins": 0.8757076263427734,
"rewards/rejected": -3.3445217609405518,
"step": 170
},
{
"epoch": 0.36639623135304894,
"grad_norm": 13.559524548726696,
"learning_rate": 4.793433180558423e-07,
"logits/chosen": -0.7471566796302795,
"logits/rejected": -0.8381919860839844,
"logps/chosen": -1.1587435007095337,
"logps/rejected": -1.5522888898849487,
"loss": 1.133,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.3174870014190674,
"rewards/margins": 0.7870910167694092,
"rewards/rejected": -3.1045777797698975,
"step": 175
},
{
"epoch": 0.3768646951059932,
"grad_norm": 19.60609504538111,
"learning_rate": 4.704194240193467e-07,
"logits/chosen": -0.7779995203018188,
"logits/rejected": -0.8208974599838257,
"logps/chosen": -1.1914243698120117,
"logps/rejected": -1.6478986740112305,
"loss": 1.0991,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.3828487396240234,
"rewards/margins": 0.9129486083984375,
"rewards/rejected": -3.295797348022461,
"step": 180
},
{
"epoch": 0.38733315885893743,
"grad_norm": 15.986798312827595,
"learning_rate": 4.6126707757412686e-07,
"logits/chosen": -0.7536464333534241,
"logits/rejected": -0.836445152759552,
"logps/chosen": -1.18105149269104,
"logps/rejected": -1.5753639936447144,
"loss": 1.0801,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.36210298538208,
"rewards/margins": 0.788625180721283,
"rewards/rejected": -3.1507279872894287,
"step": 185
},
{
"epoch": 0.39780162261188173,
"grad_norm": 11.085659412542848,
"learning_rate": 4.5189854771829086e-07,
"logits/chosen": -0.7779768705368042,
"logits/rejected": -0.860378623008728,
"logps/chosen": -1.174264907836914,
"logps/rejected": -1.5782097578048706,
"loss": 1.0897,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.348529815673828,
"rewards/margins": 0.8078898191452026,
"rewards/rejected": -3.156419515609741,
"step": 190
},
{
"epoch": 0.408270086364826,
"grad_norm": 19.478521042945726,
"learning_rate": 4.4232639325036807e-07,
"logits/chosen": -0.8138440251350403,
"logits/rejected": -0.888975977897644,
"logps/chosen": -1.1923892498016357,
"logps/rejected": -1.6592342853546143,
"loss": 1.1171,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.3847784996032715,
"rewards/margins": 0.933690071105957,
"rewards/rejected": -3.3184685707092285,
"step": 195
},
{
"epoch": 0.4187385501177702,
"grad_norm": 12.673420292445082,
"learning_rate": 4.32563445933859e-07,
"logits/chosen": -0.7443628311157227,
"logits/rejected": -0.7802754044532776,
"logps/chosen": -1.211715579032898,
"logps/rejected": -1.5577033758163452,
"loss": 1.0631,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.423431158065796,
"rewards/margins": 0.6919752955436707,
"rewards/rejected": -3.1154067516326904,
"step": 200
},
{
"epoch": 0.42920701387071447,
"grad_norm": 18.156036717162227,
"learning_rate": 4.226227932958664e-07,
"logits/chosen": -0.8596774935722351,
"logits/rejected": -0.8864806294441223,
"logps/chosen": -1.2197387218475342,
"logps/rejected": -1.706209421157837,
"loss": 1.0695,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.4394774436950684,
"rewards/margins": 0.9729412794113159,
"rewards/rejected": -3.412418842315674,
"step": 205
},
{
"epoch": 0.4396754776236587,
"grad_norm": 18.614311057711063,
"learning_rate": 4.1251776108286854e-07,
"logits/chosen": -0.7632856965065002,
"logits/rejected": -0.7707933187484741,
"logps/chosen": -1.2796884775161743,
"logps/rejected": -1.6428205966949463,
"loss": 1.1264,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.5593769550323486,
"rewards/margins": 0.7262641191482544,
"rewards/rejected": -3.2856411933898926,
"step": 210
},
{
"epoch": 0.45014394137660296,
"grad_norm": 19.070261616595026,
"learning_rate": 4.022618953971514e-07,
"logits/chosen": -0.7568240761756897,
"logits/rejected": -0.8358641862869263,
"logps/chosen": -1.308774709701538,
"logps/rejected": -1.6738483905792236,
"loss": 1.1102,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.617549419403076,
"rewards/margins": 0.7301470041275024,
"rewards/rejected": -3.3476967811584473,
"step": 215
},
{
"epoch": 0.46061240512954726,
"grad_norm": 13.160800920164423,
"learning_rate": 3.918689445378477e-07,
"logits/chosen": -0.7660185098648071,
"logits/rejected": -0.8393454551696777,
"logps/chosen": -1.2900028228759766,
"logps/rejected": -1.7106046676635742,
"loss": 1.0429,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.580005645751953,
"rewards/margins": 0.8412036895751953,
"rewards/rejected": -3.4212093353271484,
"step": 220
},
{
"epoch": 0.4710808688824915,
"grad_norm": 15.467772988868518,
"learning_rate": 3.813528405709251e-07,
"logits/chosen": -0.7320618629455566,
"logits/rejected": -0.7756307125091553,
"logps/chosen": -1.3943421840667725,
"logps/rejected": -1.8419634103775024,
"loss": 1.084,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.788684368133545,
"rewards/margins": 0.8952423334121704,
"rewards/rejected": -3.683926820755005,
"step": 225
},
{
"epoch": 0.48154933263543576,
"grad_norm": 23.599162652169078,
"learning_rate": 3.707276806528282e-07,
"logits/chosen": -0.7983018159866333,
"logits/rejected": -0.8536737561225891,
"logps/chosen": -1.3397753238677979,
"logps/rejected": -1.8982980251312256,
"loss": 1.0107,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.6795506477355957,
"rewards/margins": 1.1170451641082764,
"rewards/rejected": -3.796596050262451,
"step": 230
},
{
"epoch": 0.49201779638838,
"grad_norm": 22.745006961113983,
"learning_rate": 3.6000770813281334e-07,
"logits/chosen": -0.7526620626449585,
"logits/rejected": -0.7841376066207886,
"logps/chosen": -1.3173251152038574,
"logps/rejected": -1.6973741054534912,
"loss": 1.096,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.634650230407715,
"rewards/margins": 0.7600980401039124,
"rewards/rejected": -3.3947482109069824,
"step": 235
},
{
"epoch": 0.5024862601413242,
"grad_norm": 17.29631229132808,
"learning_rate": 3.4920729345930654e-07,
"logits/chosen": -0.8024924993515015,
"logits/rejected": -0.8705514669418335,
"logps/chosen": -1.3106586933135986,
"logps/rejected": -1.8416321277618408,
"loss": 1.0622,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.6213173866271973,
"rewards/margins": 1.0619468688964844,
"rewards/rejected": -3.6832642555236816,
"step": 240
},
{
"epoch": 0.5129547238942685,
"grad_norm": 15.697390709369445,
"learning_rate": 3.383409149158814e-07,
"logits/chosen": -0.8013178110122681,
"logits/rejected": -0.8261008262634277,
"logps/chosen": -1.2374125719070435,
"logps/rejected": -1.8463026285171509,
"loss": 1.0412,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.474825143814087,
"rewards/margins": 1.2177798748016357,
"rewards/rejected": -3.6926052570343018,
"step": 245
},
{
"epoch": 0.5234231876472127,
"grad_norm": 38.568029652024805,
"learning_rate": 3.2742313921268035e-07,
"logits/chosen": -0.7440148591995239,
"logits/rejected": -0.8371674418449402,
"logps/chosen": -1.3792295455932617,
"logps/rejected": -1.996372938156128,
"loss": 1.0533,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.7584590911865234,
"rewards/margins": 1.234286904335022,
"rewards/rejected": -3.992745876312256,
"step": 250
},
{
"epoch": 0.533891651400157,
"grad_norm": 31.29600689027817,
"learning_rate": 3.1646860195929825e-07,
"logits/chosen": -0.798254132270813,
"logits/rejected": -0.819698691368103,
"logps/chosen": -1.4148808717727661,
"logps/rejected": -1.9883480072021484,
"loss": 1.1126,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.8297617435455322,
"rewards/margins": 1.1469345092773438,
"rewards/rejected": -3.976696014404297,
"step": 255
},
{
"epoch": 0.5443601151531012,
"grad_norm": 21.255043892106038,
"learning_rate": 3.054919880453032e-07,
"logits/chosen": -0.8065778017044067,
"logits/rejected": -0.8200203776359558,
"logps/chosen": -1.3674335479736328,
"logps/rejected": -1.8728046417236328,
"loss": 1.0948,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.7348670959472656,
"rewards/margins": 1.0107421875,
"rewards/rejected": -3.7456092834472656,
"step": 260
},
{
"epoch": 0.5548285789060455,
"grad_norm": 15.283609874940026,
"learning_rate": 2.9450801195469686e-07,
"logits/chosen": -0.7686730027198792,
"logits/rejected": -0.7811926603317261,
"logps/chosen": -1.3809654712677002,
"logps/rejected": -1.8307151794433594,
"loss": 1.0502,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.7619309425354004,
"rewards/margins": 0.8994992971420288,
"rewards/rejected": -3.6614303588867188,
"step": 265
},
{
"epoch": 0.5652970426589898,
"grad_norm": 34.69673151716839,
"learning_rate": 2.835313980407017e-07,
"logits/chosen": -0.8522397875785828,
"logits/rejected": -0.8554953336715698,
"logps/chosen": -1.4796664714813232,
"logps/rejected": -1.868570327758789,
"loss": 1.11,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.9593329429626465,
"rewards/margins": 0.7778076529502869,
"rewards/rejected": -3.737140655517578,
"step": 270
},
{
"epoch": 0.575765506411934,
"grad_norm": 12.490257980809535,
"learning_rate": 2.7257686078731973e-07,
"logits/chosen": -0.8593546748161316,
"logits/rejected": -0.8926668167114258,
"logps/chosen": -1.2937114238739014,
"logps/rejected": -2.0442328453063965,
"loss": 0.9612,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.5874228477478027,
"rewards/margins": 1.5010432004928589,
"rewards/rejected": -4.088465690612793,
"step": 275
},
{
"epoch": 0.5862339701648783,
"grad_norm": 17.66798289482467,
"learning_rate": 2.6165908508411857e-07,
"logits/chosen": -0.7889951467514038,
"logits/rejected": -0.8469230532646179,
"logps/chosen": -1.3164467811584473,
"logps/rejected": -1.873552680015564,
"loss": 1.0829,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.6328935623168945,
"rewards/margins": 1.114211916923523,
"rewards/rejected": -3.747105360031128,
"step": 280
},
{
"epoch": 0.5967024339178225,
"grad_norm": 25.798144103608532,
"learning_rate": 2.5079270654069354e-07,
"logits/chosen": -0.7999380230903625,
"logits/rejected": -0.8465052843093872,
"logps/chosen": -1.4005292654037476,
"logps/rejected": -1.9563087224960327,
"loss": 1.0559,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.801058530807495,
"rewards/margins": 1.1115590333938599,
"rewards/rejected": -3.9126174449920654,
"step": 285
},
{
"epoch": 0.6071708976707668,
"grad_norm": 26.70646393830588,
"learning_rate": 2.399922918671867e-07,
"logits/chosen": -0.8188889622688293,
"logits/rejected": -0.8326479196548462,
"logps/chosen": -1.4042682647705078,
"logps/rejected": -1.8107773065567017,
"loss": 1.0877,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.8085365295410156,
"rewards/margins": 0.8130179643630981,
"rewards/rejected": -3.6215546131134033,
"step": 290
},
{
"epoch": 0.6176393614237111,
"grad_norm": 23.426122701316096,
"learning_rate": 2.2927231934717176e-07,
"logits/chosen": -0.8667086362838745,
"logits/rejected": -0.87919682264328,
"logps/chosen": -1.4516851902008057,
"logps/rejected": -1.7210047245025635,
"loss": 1.0425,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.9033703804016113,
"rewards/margins": 0.5386390686035156,
"rewards/rejected": -3.442009449005127,
"step": 295
},
{
"epoch": 0.6281078251766553,
"grad_norm": 26.456279591360094,
"learning_rate": 2.1864715942907487e-07,
"logits/chosen": -0.8121633529663086,
"logits/rejected": -0.8183205723762512,
"logps/chosen": -1.4428894519805908,
"logps/rejected": -1.9755233526229858,
"loss": 1.0841,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.8857789039611816,
"rewards/margins": 1.0652679204940796,
"rewards/rejected": -3.9510467052459717,
"step": 300
},
{
"epoch": 0.6385762889295996,
"grad_norm": 31.787422608248555,
"learning_rate": 2.081310554621522e-07,
"logits/chosen": -0.812918484210968,
"logits/rejected": -0.848720371723175,
"logps/chosen": -1.3704057931900024,
"logps/rejected": -1.7566410303115845,
"loss": 1.0211,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.740811586380005,
"rewards/margins": 0.7724703550338745,
"rewards/rejected": -3.513282060623169,
"step": 305
},
{
"epoch": 0.6490447526825438,
"grad_norm": 20.39803180345373,
"learning_rate": 1.9773810460284862e-07,
"logits/chosen": -0.7991079092025757,
"logits/rejected": -0.8711285591125488,
"logps/chosen": -1.4278900623321533,
"logps/rejected": -2.05625581741333,
"loss": 0.9925,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.8557801246643066,
"rewards/margins": 1.256731629371643,
"rewards/rejected": -4.11251163482666,
"step": 310
},
{
"epoch": 0.6595132164354881,
"grad_norm": 22.590110789535018,
"learning_rate": 1.874822389171314e-07,
"logits/chosen": -0.8574708700180054,
"logits/rejected": -0.9009912610054016,
"logps/chosen": -1.545143723487854,
"logps/rejected": -2.0895230770111084,
"loss": 1.0237,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -3.090287446975708,
"rewards/margins": 1.088758945465088,
"rewards/rejected": -4.179046154022217,
"step": 315
},
{
"epoch": 0.6699816801884323,
"grad_norm": 19.0520960322845,
"learning_rate": 1.7737720670413356e-07,
"logits/chosen": -0.8097273111343384,
"logits/rejected": -0.8335424661636353,
"logps/chosen": -1.5219576358795166,
"logps/rejected": -2.0950403213500977,
"loss": 1.0412,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.043915271759033,
"rewards/margins": 1.1461658477783203,
"rewards/rejected": -4.190080642700195,
"step": 320
},
{
"epoch": 0.6804501439413766,
"grad_norm": 28.978881064657845,
"learning_rate": 1.6743655406614095e-07,
"logits/chosen": -0.8851544260978699,
"logits/rejected": -0.8812357187271118,
"logps/chosen": -1.505824089050293,
"logps/rejected": -2.034778118133545,
"loss": 1.0881,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.011648178100586,
"rewards/margins": 1.0579078197479248,
"rewards/rejected": -4.06955623626709,
"step": 325
},
{
"epoch": 0.6909186076943209,
"grad_norm": 23.915843277630973,
"learning_rate": 1.5767360674963198e-07,
"logits/chosen": -0.870714008808136,
"logits/rejected": -0.8971943855285645,
"logps/chosen": -1.3601343631744385,
"logps/rejected": -2.0130364894866943,
"loss": 1.0087,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.720268726348877,
"rewards/margins": 1.3058046102523804,
"rewards/rejected": -4.026072978973389,
"step": 330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 26.514246744997322,
"learning_rate": 1.4810145228170922e-07,
"logits/chosen": -0.8225549459457397,
"logits/rejected": -0.8689346313476562,
"logps/chosen": -1.4374722242355347,
"logps/rejected": -1.9102426767349243,
"loss": 1.052,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.8749444484710693,
"rewards/margins": 0.9455404281616211,
"rewards/rejected": -3.8204853534698486,
"step": 335
},
{
"epoch": 0.7118555352002094,
"grad_norm": 27.7004551617753,
"learning_rate": 1.3873292242587306e-07,
"logits/chosen": -0.8165398836135864,
"logits/rejected": -0.9100580215454102,
"logps/chosen": -1.461507797241211,
"logps/rejected": -2.0511550903320312,
"loss": 1.0709,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.923015594482422,
"rewards/margins": 1.1792947053909302,
"rewards/rejected": -4.1023101806640625,
"step": 340
},
{
"epoch": 0.7223239989531536,
"grad_norm": 24.617595975995133,
"learning_rate": 1.295805759806533e-07,
"logits/chosen": -0.8566834330558777,
"logits/rejected": -0.8978926539421082,
"logps/chosen": -1.5079203844070435,
"logps/rejected": -2.044774293899536,
"loss": 1.0388,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.015840768814087,
"rewards/margins": 1.073707938194275,
"rewards/rejected": -4.089548587799072,
"step": 345
},
{
"epoch": 0.7327924627060979,
"grad_norm": 22.995198881906134,
"learning_rate": 1.2065668194415777e-07,
"logits/chosen": -0.8893098831176758,
"logits/rejected": -0.9465163946151733,
"logps/chosen": -1.5923842191696167,
"logps/rejected": -2.066089153289795,
"loss": 0.9896,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -3.1847684383392334,
"rewards/margins": 0.947409987449646,
"rewards/rejected": -4.13217830657959,
"step": 350
},
{
"epoch": 0.7432609264590422,
"grad_norm": 22.355221430364576,
"learning_rate": 1.1197320306710923e-07,
"logits/chosen": -0.8776585459709167,
"logits/rejected": -0.9053448438644409,
"logps/chosen": -1.5153396129608154,
"logps/rejected": -2.0724828243255615,
"loss": 1.0507,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.030679225921631,
"rewards/margins": 1.1142865419387817,
"rewards/rejected": -4.144965648651123,
"step": 355
},
{
"epoch": 0.7537293902119864,
"grad_norm": 23.090030368869293,
"learning_rate": 1.035417798164145e-07,
"logits/chosen": -0.8465662002563477,
"logits/rejected": -0.9114416837692261,
"logps/chosen": -1.5818672180175781,
"logps/rejected": -2.124342441558838,
"loss": 1.0082,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.1637344360351562,
"rewards/margins": 1.0849504470825195,
"rewards/rejected": -4.248684883117676,
"step": 360
},
{
"epoch": 0.7641978539649307,
"grad_norm": 23.156782358223225,
"learning_rate": 9.537371477076535e-08,
"logits/chosen": -0.8677560687065125,
"logits/rejected": -0.9061796069145203,
"logps/chosen": -1.5915837287902832,
"logps/rejected": -2.287815570831299,
"loss": 0.9867,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.1831674575805664,
"rewards/margins": 1.3924639225006104,
"rewards/rejected": -4.575631141662598,
"step": 365
},
{
"epoch": 0.7746663177178749,
"grad_norm": 20.328637763728924,
"learning_rate": 8.747995746918898e-08,
"logits/chosen": -0.8234347105026245,
"logits/rejected": -0.8825669288635254,
"logps/chosen": -1.5265567302703857,
"logps/rejected": -2.1997315883636475,
"loss": 0.9162,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.0531134605407715,
"rewards/margins": 1.3463497161865234,
"rewards/rejected": -4.399463176727295,
"step": 370
},
{
"epoch": 0.7851347814708192,
"grad_norm": 38.958615879066635,
"learning_rate": 7.987108973285888e-08,
"logits/chosen": -0.8697785139083862,
"logits/rejected": -0.8908045887947083,
"logps/chosen": -1.508302927017212,
"logps/rejected": -2.1442337036132812,
"loss": 1.0045,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -3.016605854034424,
"rewards/margins": 1.2718614339828491,
"rewards/rejected": -4.2884674072265625,
"step": 375
},
{
"epoch": 0.7956032452237635,
"grad_norm": 40.64376807024019,
"learning_rate": 7.255731147984174e-08,
"logits/chosen": -0.8699348568916321,
"logits/rejected": -0.9192712903022766,
"logps/chosen": -1.5248959064483643,
"logps/rejected": -2.057331085205078,
"loss": 1.0402,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -3.0497918128967285,
"rewards/margins": 1.064869999885559,
"rewards/rejected": -4.114662170410156,
"step": 380
},
{
"epoch": 0.8060717089767077,
"grad_norm": 31.79789174489367,
"learning_rate": 6.554842705179898e-08,
"logits/chosen": -0.8611375093460083,
"logits/rejected": -0.8788291215896606,
"logps/chosen": -1.4700887203216553,
"logps/rejected": -2.0618722438812256,
"loss": 1.0386,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.9401774406433105,
"rewards/margins": 1.183566927909851,
"rewards/rejected": -4.123744487762451,
"step": 385
},
{
"epoch": 0.816540172729652,
"grad_norm": 27.699401276090363,
"learning_rate": 5.885383207096832e-08,
"logits/chosen": -0.8817920684814453,
"logits/rejected": -0.9167042970657349,
"logps/chosen": -1.5808578729629517,
"logps/rejected": -2.0726354122161865,
"loss": 1.0164,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -3.1617157459259033,
"rewards/margins": 0.9835556149482727,
"rewards/rejected": -4.145270824432373,
"step": 390
},
{
"epoch": 0.8270086364825961,
"grad_norm": 22.291806094067294,
"learning_rate": 5.2482500845047165e-08,
"logits/chosen": -0.8046171069145203,
"logits/rejected": -0.8632856607437134,
"logps/chosen": -1.474746823310852,
"logps/rejected": -2.074794292449951,
"loss": 1.0014,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.949493646621704,
"rewards/margins": 1.2000950574874878,
"rewards/rejected": -4.149588584899902,
"step": 395
},
{
"epoch": 0.8374771002355405,
"grad_norm": 32.14293789219742,
"learning_rate": 4.644297433686162e-08,
"logits/chosen": -0.8459577560424805,
"logits/rejected": -0.8775212168693542,
"logps/chosen": -1.5837218761444092,
"logps/rejected": -2.0384469032287598,
"loss": 1.0682,
"rewards/accuracies": 0.65625,
"rewards/chosen": -3.1674437522888184,
"rewards/margins": 0.9094497561454773,
"rewards/rejected": -4.0768938064575195,
"step": 400
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -0.9671933650970459,
"eval_logits/rejected": -0.9966414570808411,
"eval_logps/chosen": -1.536142349243164,
"eval_logps/rejected": -2.0912911891937256,
"eval_loss": 0.9954066276550293,
"eval_rewards/accuracies": 0.7279999852180481,
"eval_rewards/chosen": -3.072284698486328,
"eval_rewards/margins": 1.1102983951568604,
"eval_rewards/rejected": -4.182582378387451,
"eval_runtime": 45.9263,
"eval_samples_per_second": 43.548,
"eval_steps_per_second": 2.722,
"step": 400
},
{
"epoch": 0.8479455639884846,
"grad_norm": 23.581025931041157,
"learning_rate": 4.074334871494558e-08,
"logits/chosen": -0.8318978548049927,
"logits/rejected": -0.9007453918457031,
"logps/chosen": -1.597597360610962,
"logps/rejected": -2.2467799186706543,
"loss": 0.9898,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.195194721221924,
"rewards/margins": 1.2983646392822266,
"rewards/rejected": -4.493559837341309,
"step": 405
},
{
"epoch": 0.8584140277414289,
"grad_norm": 37.34203846776795,
"learning_rate": 3.5391264500382e-08,
"logits/chosen": -0.8569322824478149,
"logits/rejected": -0.8944110870361328,
"logps/chosen": -1.6689296960830688,
"logps/rejected": -2.2536518573760986,
"loss": 0.9821,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -3.3378593921661377,
"rewards/margins": 1.16944420337677,
"rewards/rejected": -4.507303714752197,
"step": 410
},
{
"epoch": 0.8688824914943732,
"grad_norm": 25.448649440851888,
"learning_rate": 3.0393896324452226e-08,
"logits/chosen": -0.8548834919929504,
"logits/rejected": -0.8898690938949585,
"logps/chosen": -1.6892175674438477,
"logps/rejected": -2.1383655071258545,
"loss": 1.0282,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -3.3784351348876953,
"rewards/margins": 0.8982963562011719,
"rewards/rejected": -4.276731014251709,
"step": 415
},
{
"epoch": 0.8793509552473174,
"grad_norm": 22.81456603203954,
"learning_rate": 2.5757943310825026e-08,
"logits/chosen": -0.8120086789131165,
"logits/rejected": -0.8377026319503784,
"logps/chosen": -1.5306228399276733,
"logps/rejected": -2.244910478591919,
"loss": 0.9802,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.0612456798553467,
"rewards/margins": 1.428574800491333,
"rewards/rejected": -4.489820957183838,
"step": 420
},
{
"epoch": 0.8898194190002617,
"grad_norm": 37.81119467654555,
"learning_rate": 2.148962009517823e-08,
"logits/chosen": -0.8621734380722046,
"logits/rejected": -0.9295539855957031,
"logps/chosen": -1.594923973083496,
"logps/rejected": -2.202113389968872,
"loss": 0.9772,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.189847946166992,
"rewards/margins": 1.2143787145614624,
"rewards/rejected": -4.404226779937744,
"step": 425
},
{
"epoch": 0.9002878827532059,
"grad_norm": 23.35609170503276,
"learning_rate": 1.759464849429082e-08,
"logits/chosen": -0.8409427404403687,
"logits/rejected": -0.8790140151977539,
"logps/chosen": -1.6252171993255615,
"logps/rejected": -2.1690993309020996,
"loss": 0.9766,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -3.250434398651123,
"rewards/margins": 1.087764024734497,
"rewards/rejected": -4.338198661804199,
"step": 430
},
{
"epoch": 0.9107563465061502,
"grad_norm": 42.92976213914578,
"learning_rate": 1.4078249835774169e-08,
"logits/chosen": -0.8287452459335327,
"logits/rejected": -0.8296720385551453,
"logps/chosen": -1.493123173713684,
"logps/rejected": -2.055771827697754,
"loss": 1.0029,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.986246347427368,
"rewards/margins": 1.1252974271774292,
"rewards/rejected": -4.111543655395508,
"step": 435
},
{
"epoch": 0.9212248102590945,
"grad_norm": 32.7360124305529,
"learning_rate": 1.0945137958723705e-08,
"logits/chosen": -0.8666203618049622,
"logits/rejected": -0.9023343920707703,
"logps/chosen": -1.6795040369033813,
"logps/rejected": -2.055238962173462,
"loss": 1.0619,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -3.3590080738067627,
"rewards/margins": 0.7514694929122925,
"rewards/rejected": -4.110477924346924,
"step": 440
},
{
"epoch": 0.9316932740120387,
"grad_norm": 25.809975837885126,
"learning_rate": 8.19951289467482e-09,
"logits/chosen": -0.8226273655891418,
"logits/rejected": -0.8915680646896362,
"logps/chosen": -1.6063209772109985,
"logps/rejected": -2.2188549041748047,
"loss": 1.0036,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.212641954421997,
"rewards/margins": 1.2250680923461914,
"rewards/rejected": -4.437709808349609,
"step": 445
},
{
"epoch": 0.942161737764983,
"grad_norm": 27.907394126837357,
"learning_rate": 5.84505523733293e-09,
"logits/chosen": -0.8590003848075867,
"logits/rejected": -0.9254142642021179,
"logps/chosen": -1.5489723682403564,
"logps/rejected": -2.138707160949707,
"loss": 1.0026,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.097944736480713,
"rewards/margins": 1.1794699430465698,
"rewards/rejected": -4.277414321899414,
"step": 450
},
{
"epoch": 0.9526302015179272,
"grad_norm": 26.194546776590737,
"learning_rate": 3.8849212086261466e-09,
"logits/chosen": -0.8426074981689453,
"logits/rejected": -0.8449162244796753,
"logps/chosen": -1.5749680995941162,
"logps/rejected": -2.065624475479126,
"loss": 1.0628,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -3.1499361991882324,
"rewards/margins": 0.9813130497932434,
"rewards/rejected": -4.131248950958252,
"step": 455
},
{
"epoch": 0.9630986652708715,
"grad_norm": 28.647656191366522,
"learning_rate": 2.3217384276938756e-09,
"logits/chosen": -0.7687999606132507,
"logits/rejected": -0.8947674036026001,
"logps/chosen": -1.4748101234436035,
"logps/rejected": -2.2467246055603027,
"loss": 1.0081,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.949620246887207,
"rewards/margins": 1.5438289642333984,
"rewards/rejected": -4.4934492111206055,
"step": 460
},
{
"epoch": 0.9735671290238157,
"grad_norm": 25.297804062883948,
"learning_rate": 1.1576023884836472e-09,
"logits/chosen": -0.8351796269416809,
"logits/rejected": -0.8887630701065063,
"logps/chosen": -1.5146936178207397,
"logps/rejected": -2.2188751697540283,
"loss": 0.9987,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.0293872356414795,
"rewards/margins": 1.4083633422851562,
"rewards/rejected": -4.437750339508057,
"step": 465
},
{
"epoch": 0.98403559277676,
"grad_norm": 19.986270660762962,
"learning_rate": 3.940736506780395e-10,
"logits/chosen": -0.7743644118309021,
"logits/rejected": -0.788620114326477,
"logps/chosen": -1.4425890445709229,
"logps/rejected": -2.27103853225708,
"loss": 1.0166,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.8851780891418457,
"rewards/margins": 1.6568992137908936,
"rewards/rejected": -4.54207706451416,
"step": 470
},
{
"epoch": 0.9945040565297043,
"grad_norm": 26.273630707088135,
"learning_rate": 3.2175747716822744e-11,
"logits/chosen": -0.8468500971794128,
"logits/rejected": -0.9172460436820984,
"logps/chosen": -1.5344510078430176,
"logps/rejected": -2.111969470977783,
"loss": 0.9858,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.068902015686035,
"rewards/margins": 1.1550369262695312,
"rewards/rejected": -4.223938941955566,
"step": 475
}
],
"logging_steps": 5,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 225,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}