{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.272, "eval_steps": 100, "global_step": 56, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6153846153846154, "grad_norm": 127.33006286621094, "learning_rate": 5e-07, "logits/chosen": 0.19238418340682983, "logits/rejected": 0.21956193447113037, "logps/chosen": -58.537498474121094, "logps/rejected": -66.73164367675781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 1.2307692307692308, "grad_norm": 136.6772918701172, "learning_rate": 4.752422169756047e-07, "logits/chosen": 0.22914010286331177, "logits/rejected": 0.28378042578697205, "logps/chosen": -76.12860870361328, "logps/rejected": -70.64468383789062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 1.8461538461538463, "grad_norm": 143.6078338623047, "learning_rate": 4.058724504646834e-07, "logits/chosen": 0.24467766284942627, "logits/rejected": 0.21774037182331085, "logps/chosen": -55.543739318847656, "logps/rejected": -77.82548522949219, "loss": 0.7227, "rewards/accuracies": 0.5625, "rewards/chosen": 0.029082417488098145, "rewards/margins": -0.0368955135345459, "rewards/rejected": 0.06597793102264404, "step": 3 }, { "epoch": 2.4615384615384617, "grad_norm": 131.76895141601562, "learning_rate": 3.056302334890786e-07, "logits/chosen": 0.261148065328598, "logits/rejected": 0.2774331569671631, "logps/chosen": -66.54498291015625, "logps/rejected": -66.6764907836914, "loss": 0.6577, "rewards/accuracies": 0.5, "rewards/chosen": 0.008320659399032593, "rewards/margins": 0.04801854491233826, "rewards/rejected": -0.039697885513305664, "step": 4 }, { "epoch": 3.076923076923077, "grad_norm": 126.15006256103516, "learning_rate": 1.9436976651092142e-07, "logits/chosen": 0.36693644523620605, "logits/rejected": 0.3742007613182068, "logps/chosen": -95.5995864868164, "logps/rejected": -113.8344955444336, "loss": 0.597, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10893827676773071, "rewards/margins": 0.25116902589797974, "rewards/rejected": -0.14223074913024902, "step": 5 }, { "epoch": 3.6923076923076925, "grad_norm": 100.18375396728516, "learning_rate": 9.412754953531663e-08, "logits/chosen": 0.24576663970947266, "logits/rejected": 0.25556063652038574, "logps/chosen": -80.51493072509766, "logps/rejected": -84.49187469482422, "loss": 0.5787, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17812098562717438, "rewards/margins": 0.3286281228065491, "rewards/rejected": -0.1505071520805359, "step": 6 }, { "epoch": 4.3076923076923075, "grad_norm": 119.61666870117188, "learning_rate": 2.475778302439524e-08, "logits/chosen": 0.21196235716342926, "logits/rejected": 0.2644736170768738, "logps/chosen": -69.17505645751953, "logps/rejected": -66.96580505371094, "loss": 0.563, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16142475605010986, "rewards/margins": 0.3076605200767517, "rewards/rejected": -0.14623576402664185, "step": 7 }, { "epoch": 4.923076923076923, "grad_norm": 96.98345184326172, "learning_rate": 0.0, "logits/chosen": 0.318248987197876, "logits/rejected": 0.2900750935077667, "logps/chosen": -55.142822265625, "logps/rejected": -67.91665649414062, "loss": 0.5369, "rewards/accuracies": 0.75, "rewards/chosen": 0.07582578808069229, "rewards/margins": 0.24877893924713135, "rewards/rejected": -0.17295315861701965, "step": 8 }, { "epoch": 1.256, "grad_norm": 84.79383087158203, "learning_rate": 4.955718126821722e-07, "logits/chosen": 0.29836222529411316, "logits/rejected": 0.32582682371139526, "logps/chosen": -83.86153411865234, "logps/rejected": -77.13251495361328, "loss": 0.6714, "rewards/accuracies": 0.625, "rewards/chosen": 0.023729726672172546, "rewards/margins": 0.05816943943500519, "rewards/rejected": -0.03443971276283264, "step": 9 }, { "epoch": 1.384, "grad_norm": 88.73702239990234, "learning_rate": 4.921457902821578e-07, "logits/chosen": 0.21312400698661804, "logits/rejected": 0.23579223453998566, "logps/chosen": -68.02887725830078, "logps/rejected": -76.00659942626953, "loss": 0.6785, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005893569439649582, "rewards/margins": 0.031381912529468536, "rewards/rejected": -0.025488346815109253, "step": 10 }, { "epoch": 1.512, "grad_norm": 96.98503875732422, "learning_rate": 4.877641290737883e-07, "logits/chosen": 0.2715354561805725, "logits/rejected": 0.27474918961524963, "logps/chosen": -66.76045227050781, "logps/rejected": -78.57473754882812, "loss": 0.6883, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02488572895526886, "rewards/margins": 0.004496380686759949, "rewards/rejected": -0.02938210964202881, "step": 11 }, { "epoch": 1.6400000000000001, "grad_norm": 82.92364501953125, "learning_rate": 4.824441214720628e-07, "logits/chosen": 0.2345450520515442, "logits/rejected": 0.2685388922691345, "logps/chosen": -71.50077056884766, "logps/rejected": -66.5575942993164, "loss": 0.6724, "rewards/accuracies": 0.625, "rewards/chosen": 0.06529319286346436, "rewards/margins": 0.09461906552314758, "rewards/rejected": -0.029325872659683228, "step": 12 }, { "epoch": 1.768, "grad_norm": 90.68313598632812, "learning_rate": 4.762067631165049e-07, "logits/chosen": 0.3173472583293915, "logits/rejected": 0.31548872590065, "logps/chosen": -62.33905792236328, "logps/rejected": -69.90167236328125, "loss": 0.6621, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0690992921590805, "rewards/margins": 0.08333452045917511, "rewards/rejected": -0.014235228300094604, "step": 13 }, { "epoch": 1.896, "grad_norm": 91.8805923461914, "learning_rate": 4.6907667001096585e-07, "logits/chosen": 0.24450257420539856, "logits/rejected": 0.27835142612457275, "logps/chosen": -75.39544677734375, "logps/rejected": -92.54512786865234, "loss": 0.6806, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11982224881649017, "rewards/margins": 0.14259418845176697, "rewards/rejected": -0.022771939635276794, "step": 14 }, { "epoch": 2.024, "grad_norm": 93.57877349853516, "learning_rate": 4.6108198137550377e-07, "logits/chosen": 0.2691981792449951, "logits/rejected": 0.29418689012527466, "logps/chosen": -65.36813354492188, "logps/rejected": -86.02149963378906, "loss": 0.6694, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0014134570956230164, "rewards/margins": 0.08445831388235092, "rewards/rejected": -0.08587177097797394, "step": 15 }, { "epoch": 2.152, "grad_norm": 90.0985336303711, "learning_rate": 4.5225424859373684e-07, "logits/chosen": 0.29637423157691956, "logits/rejected": 0.3497394323348999, "logps/chosen": -78.20895385742188, "logps/rejected": -65.3874282836914, "loss": 0.6953, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05982813239097595, "rewards/margins": 0.08022981882095337, "rewards/rejected": -0.020401686429977417, "step": 16 }, { "epoch": 2.2800000000000002, "grad_norm": 91.4386978149414, "learning_rate": 4.426283106939473e-07, "logits/chosen": 0.3221435546875, "logits/rejected": 0.34331709146499634, "logps/chosen": -73.48678588867188, "logps/rejected": -81.2340087890625, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": 0.06735238432884216, "rewards/margins": 0.05368679761886597, "rewards/rejected": 0.013665586709976196, "step": 17 }, { "epoch": 2.408, "grad_norm": 98.76863098144531, "learning_rate": 4.3224215685535287e-07, "logits/chosen": 0.2613396942615509, "logits/rejected": 0.2849021553993225, "logps/chosen": -89.08374786376953, "logps/rejected": -69.78533172607422, "loss": 0.6688, "rewards/accuracies": 0.40625, "rewards/chosen": -0.02113175392150879, "rewards/margins": 0.001418381929397583, "rewards/rejected": -0.022550135850906372, "step": 18 }, { "epoch": 2.536, "grad_norm": 81.9192123413086, "learning_rate": 4.2113677648217216e-07, "logits/chosen": 0.22890335321426392, "logits/rejected": 0.23874913156032562, "logps/chosen": -68.76072692871094, "logps/rejected": -66.74049377441406, "loss": 0.6763, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09503498673439026, "rewards/margins": 0.086346834897995, "rewards/rejected": 0.008688151836395264, "step": 19 }, { "epoch": 2.664, "grad_norm": 95.22876739501953, "learning_rate": 4.0935599743717244e-07, "logits/chosen": 0.32450735569000244, "logits/rejected": 0.33199459314346313, "logps/chosen": -91.43396759033203, "logps/rejected": -83.63076782226562, "loss": 0.6491, "rewards/accuracies": 0.75, "rewards/chosen": 0.05055028200149536, "rewards/margins": 0.13369867205619812, "rewards/rejected": -0.08314839005470276, "step": 20 }, { "epoch": 2.792, "grad_norm": 92.8243408203125, "learning_rate": 3.9694631307311825e-07, "logits/chosen": 0.3116016983985901, "logits/rejected": 0.3494156002998352, "logps/chosen": -67.47573852539062, "logps/rejected": -71.52774047851562, "loss": 0.6736, "rewards/accuracies": 0.5, "rewards/chosen": 0.03751923143863678, "rewards/margins": 0.048606112599372864, "rewards/rejected": -0.011086881160736084, "step": 21 }, { "epoch": 2.92, "grad_norm": 89.23714447021484, "learning_rate": 3.839566987447491e-07, "logits/chosen": 0.22846412658691406, "logits/rejected": 0.21796303987503052, "logps/chosen": -65.1306381225586, "logps/rejected": -71.10429382324219, "loss": 0.6992, "rewards/accuracies": 0.53125, "rewards/chosen": -0.009614139795303345, "rewards/margins": -0.01433388888835907, "rewards/rejected": 0.004719749093055725, "step": 22 }, { "epoch": 3.048, "grad_norm": 87.72737884521484, "learning_rate": 3.704384185254288e-07, "logits/chosen": 0.2647473216056824, "logits/rejected": 0.2934381663799286, "logps/chosen": -63.67654037475586, "logps/rejected": -62.632781982421875, "loss": 0.6676, "rewards/accuracies": 0.6875, "rewards/chosen": 0.013809099793434143, "rewards/margins": 0.06607498228549957, "rewards/rejected": -0.05226588249206543, "step": 23 }, { "epoch": 3.176, "grad_norm": 90.60627746582031, "learning_rate": 3.5644482289126813e-07, "logits/chosen": 0.32062453031539917, "logits/rejected": 0.2993485927581787, "logps/chosen": -68.11253356933594, "logps/rejected": -99.29121398925781, "loss": 0.6716, "rewards/accuracies": 0.625, "rewards/chosen": 0.008185192942619324, "rewards/margins": 0.056953445076942444, "rewards/rejected": -0.04876825213432312, "step": 24 }, { "epoch": 3.304, "grad_norm": 94.25776672363281, "learning_rate": 3.4203113817116953e-07, "logits/chosen": 0.2894556522369385, "logits/rejected": 0.29452645778656006, "logps/chosen": -64.85166931152344, "logps/rejected": -63.267059326171875, "loss": 0.6803, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01005951315164566, "rewards/margins": 0.05620530992746353, "rewards/rejected": -0.04614579677581787, "step": 25 }, { "epoch": 3.432, "grad_norm": 95.0955810546875, "learning_rate": 3.272542485937368e-07, "logits/chosen": 0.2319055050611496, "logits/rejected": 0.287945032119751, "logps/chosen": -72.57554626464844, "logps/rejected": -66.64920806884766, "loss": 0.6887, "rewards/accuracies": 0.53125, "rewards/chosen": 0.027968034148216248, "rewards/margins": 0.0742889791727066, "rewards/rejected": -0.046320945024490356, "step": 26 }, { "epoch": 3.56, "grad_norm": 88.33543395996094, "learning_rate": 3.121724717912138e-07, "logits/chosen": 0.309038907289505, "logits/rejected": 0.321429967880249, "logps/chosen": -97.70095825195312, "logps/rejected": -82.71894073486328, "loss": 0.659, "rewards/accuracies": 0.53125, "rewards/chosen": 0.08497677743434906, "rewards/margins": 0.1083778589963913, "rewards/rejected": -0.023401081562042236, "step": 27 }, { "epoch": 3.6879999999999997, "grad_norm": 93.63185119628906, "learning_rate": 2.968453286464312e-07, "logits/chosen": 0.2761862277984619, "logits/rejected": 0.27546417713165283, "logps/chosen": -75.79278564453125, "logps/rejected": -75.79965209960938, "loss": 0.6746, "rewards/accuracies": 0.59375, "rewards/chosen": 0.01948818564414978, "rewards/margins": 0.04068872332572937, "rewards/rejected": -0.02120053768157959, "step": 28 }, { "epoch": 3.816, "grad_norm": 90.8388671875, "learning_rate": 2.8133330839107604e-07, "logits/chosen": 0.280830055475235, "logits/rejected": 0.2866876423358917, "logps/chosen": -66.83413696289062, "logps/rejected": -67.01375579833984, "loss": 0.6725, "rewards/accuracies": 0.46875, "rewards/chosen": 0.013940572738647461, "rewards/margins": 0.014010876417160034, "rewards/rejected": -7.030367851257324e-05, "step": 29 }, { "epoch": 3.944, "grad_norm": 92.31433868408203, "learning_rate": 2.6569762988232837e-07, "logits/chosen": 0.2993810474872589, "logits/rejected": 0.29364442825317383, "logps/chosen": -63.468109130859375, "logps/rejected": -77.49847412109375, "loss": 0.6981, "rewards/accuracies": 0.5, "rewards/chosen": 0.010666653513908386, "rewards/margins": 0.03114195168018341, "rewards/rejected": -0.020475298166275024, "step": 30 }, { "epoch": 4.072, "grad_norm": 85.3602294921875, "learning_rate": 2.5e-07, "logits/chosen": 0.26220929622650146, "logits/rejected": 0.23664042353630066, "logps/chosen": -69.07573699951172, "logps/rejected": -72.7073974609375, "loss": 0.6799, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0597347617149353, "rewards/margins": 0.12319907546043396, "rewards/rejected": -0.06346431374549866, "step": 31 }, { "epoch": 4.2, "grad_norm": 89.2237548828125, "learning_rate": 2.3430237011767164e-07, "logits/chosen": 0.2281663417816162, "logits/rejected": 0.24119029939174652, "logps/chosen": -75.16613006591797, "logps/rejected": -64.49757385253906, "loss": 0.6633, "rewards/accuracies": 0.5625, "rewards/chosen": 0.040758922696113586, "rewards/margins": 0.05312100052833557, "rewards/rejected": -0.012362077832221985, "step": 32 }, { "epoch": 4.328, "grad_norm": 95.59449768066406, "learning_rate": 2.1866669160892389e-07, "logits/chosen": 0.3119271993637085, "logits/rejected": 0.30429312586784363, "logps/chosen": -67.04680633544922, "logps/rejected": -76.78421020507812, "loss": 0.674, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04746510088443756, "rewards/margins": 0.08129900693893433, "rewards/rejected": -0.033833906054496765, "step": 33 }, { "epoch": 4.456, "grad_norm": 103.24285888671875, "learning_rate": 2.0315467135356878e-07, "logits/chosen": 0.28600916266441345, "logits/rejected": 0.30370771884918213, "logps/chosen": -92.84146118164062, "logps/rejected": -109.2697982788086, "loss": 0.694, "rewards/accuracies": 0.46875, "rewards/chosen": -0.03957655280828476, "rewards/margins": -0.023300133645534515, "rewards/rejected": -0.016276419162750244, "step": 34 }, { "epoch": 4.584, "grad_norm": 92.40351104736328, "learning_rate": 1.8782752820878633e-07, "logits/chosen": 0.25603896379470825, "logits/rejected": 0.2662765681743622, "logps/chosen": -72.62451171875, "logps/rejected": -60.2940559387207, "loss": 0.6891, "rewards/accuracies": 0.53125, "rewards/chosen": 0.02680887281894684, "rewards/margins": 0.04961217939853668, "rewards/rejected": -0.022803306579589844, "step": 35 }, { "epoch": 4.712, "grad_norm": 84.56185913085938, "learning_rate": 1.7274575140626315e-07, "logits/chosen": 0.3107318878173828, "logits/rejected": 0.33106040954589844, "logps/chosen": -86.22938537597656, "logps/rejected": -76.20439910888672, "loss": 0.6649, "rewards/accuracies": 0.46875, "rewards/chosen": 0.027996808290481567, "rewards/margins": 0.06501305848360062, "rewards/rejected": -0.03701625019311905, "step": 36 }, { "epoch": 4.84, "grad_norm": 87.72008514404297, "learning_rate": 1.579688618288305e-07, "logits/chosen": 0.3074452877044678, "logits/rejected": 0.31057560443878174, "logps/chosen": -77.69036865234375, "logps/rejected": -68.51107025146484, "loss": 0.6563, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0846804529428482, "rewards/margins": 0.1669072061777115, "rewards/rejected": -0.08222675323486328, "step": 37 }, { "epoch": 4.968, "grad_norm": 78.48970031738281, "learning_rate": 1.4355517710873182e-07, "logits/chosen": 0.27899622917175293, "logits/rejected": 0.2892475724220276, "logps/chosen": -68.76217651367188, "logps/rejected": -72.50349426269531, "loss": 0.6612, "rewards/accuracies": 0.5625, "rewards/chosen": 0.03485181927680969, "rewards/margins": 0.05903954803943634, "rewards/rejected": -0.024187728762626648, "step": 38 }, { "epoch": 5.096, "grad_norm": 102.67794036865234, "learning_rate": 1.2956158147457114e-07, "logits/chosen": 0.3237246870994568, "logits/rejected": 0.34282439947128296, "logps/chosen": -80.13423156738281, "logps/rejected": -74.20858764648438, "loss": 0.6689, "rewards/accuracies": 0.625, "rewards/chosen": 0.04920327663421631, "rewards/margins": 0.1548747569322586, "rewards/rejected": -0.1056714802980423, "step": 39 }, { "epoch": 5.224, "grad_norm": 90.3107681274414, "learning_rate": 1.1604330125525078e-07, "logits/chosen": 0.29913192987442017, "logits/rejected": 0.2973610460758209, "logps/chosen": -81.41338348388672, "logps/rejected": -78.10675048828125, "loss": 0.6817, "rewards/accuracies": 0.5625, "rewards/chosen": 0.08691957592964172, "rewards/margins": 0.09112322330474854, "rewards/rejected": -0.0042036473751068115, "step": 40 }, { "epoch": 5.352, "grad_norm": 86.05548858642578, "learning_rate": 1.0305368692688174e-07, "logits/chosen": 0.26672640442848206, "logits/rejected": 0.2698957026004791, "logps/chosen": -82.20582580566406, "logps/rejected": -72.3929443359375, "loss": 0.6793, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004676908254623413, "rewards/margins": 0.028314650058746338, "rewards/rejected": -0.03299155831336975, "step": 41 }, { "epoch": 5.48, "grad_norm": 86.79624938964844, "learning_rate": 9.064400256282755e-08, "logits/chosen": 0.3021017014980316, "logits/rejected": 0.29037410020828247, "logps/chosen": -60.563438415527344, "logps/rejected": -72.60798645019531, "loss": 0.6688, "rewards/accuracies": 0.625, "rewards/chosen": 0.015075430274009705, "rewards/margins": 0.07287518680095673, "rewards/rejected": -0.05779975652694702, "step": 42 }, { "epoch": 5.608, "grad_norm": 92.15886688232422, "learning_rate": 7.886322351782782e-08, "logits/chosen": 0.26732951402664185, "logits/rejected": 0.30227866768836975, "logps/chosen": -73.15177917480469, "logps/rejected": -78.50798797607422, "loss": 0.6841, "rewards/accuracies": 0.625, "rewards/chosen": 0.02876923978328705, "rewards/margins": 0.06183256208896637, "rewards/rejected": -0.03306332230567932, "step": 43 }, { "epoch": 5.736, "grad_norm": 89.39374542236328, "learning_rate": 6.775784314464716e-08, "logits/chosen": 0.25305798649787903, "logits/rejected": 0.2594181001186371, "logps/chosen": -86.95756530761719, "logps/rejected": -77.09736633300781, "loss": 0.6691, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0014192461967468262, "rewards/margins": 0.11444368958473206, "rewards/rejected": -0.11302444338798523, "step": 44 }, { "epoch": 5.864, "grad_norm": 87.49380493164062, "learning_rate": 5.737168930605271e-08, "logits/chosen": 0.3325170874595642, "logits/rejected": 0.32772064208984375, "logps/chosen": -72.07937622070312, "logps/rejected": -83.23653411865234, "loss": 0.6907, "rewards/accuracies": 0.71875, "rewards/chosen": 0.033388733863830566, "rewards/margins": 0.23375508189201355, "rewards/rejected": -0.20036634802818298, "step": 45 }, { "epoch": 5.992, "grad_norm": 94.16134643554688, "learning_rate": 4.774575140626316e-08, "logits/chosen": 0.28719452023506165, "logits/rejected": 0.31415650248527527, "logps/chosen": -80.78883361816406, "logps/rejected": -83.40714263916016, "loss": 0.6694, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0021561384201049805, "rewards/margins": 0.0118083655834198, "rewards/rejected": -0.00965222716331482, "step": 46 }, { "epoch": 6.12, "grad_norm": 89.89035034179688, "learning_rate": 3.8918018624496286e-08, "logits/chosen": 0.2381378412246704, "logits/rejected": 0.24997369945049286, "logps/chosen": -66.21188354492188, "logps/rejected": -67.53558349609375, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": 0.003077469766139984, "rewards/margins": 0.0005584284663200378, "rewards/rejected": 0.0025190412998199463, "step": 47 }, { "epoch": 6.248, "grad_norm": 84.92794036865234, "learning_rate": 3.092332998903416e-08, "logits/chosen": 0.2564837634563446, "logits/rejected": 0.28156182169914246, "logps/chosen": -72.13143157958984, "logps/rejected": -85.50643157958984, "loss": 0.6771, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03621651977300644, "rewards/margins": 0.013661496341228485, "rewards/rejected": -0.049878016114234924, "step": 48 }, { "epoch": 6.376, "grad_norm": 92.3100357055664, "learning_rate": 2.379323688349516e-08, "logits/chosen": 0.2702118158340454, "logits/rejected": 0.2811765968799591, "logps/chosen": -80.61731719970703, "logps/rejected": -95.37781524658203, "loss": 0.6677, "rewards/accuracies": 0.625, "rewards/chosen": 0.03306543827056885, "rewards/margins": 0.09397777915000916, "rewards/rejected": -0.06091234087944031, "step": 49 }, { "epoch": 6.504, "grad_norm": 83.64833068847656, "learning_rate": 1.7555878527937163e-08, "logits/chosen": 0.26621848344802856, "logits/rejected": 0.2580479383468628, "logps/chosen": -61.17379379272461, "logps/rejected": -70.72584533691406, "loss": 0.6766, "rewards/accuracies": 0.46875, "rewards/chosen": 0.011742278933525085, "rewards/margins": 0.03917151689529419, "rewards/rejected": -0.027429237961769104, "step": 50 }, { "epoch": 6.632, "grad_norm": 94.73871612548828, "learning_rate": 1.2235870926211616e-08, "logits/chosen": 0.21123512089252472, "logits/rejected": 0.21980169415473938, "logps/chosen": -55.80116271972656, "logps/rejected": -61.21021270751953, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03383632004261017, "rewards/margins": 0.027702882885932922, "rewards/rejected": -0.06153920292854309, "step": 51 }, { "epoch": 6.76, "grad_norm": 90.2896499633789, "learning_rate": 7.85420971784223e-09, "logits/chosen": 0.28194403648376465, "logits/rejected": 0.314169704914093, "logps/chosen": -83.96858978271484, "logps/rejected": -84.47467803955078, "loss": 0.687, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06711554527282715, "rewards/margins": 0.007981911301612854, "rewards/rejected": 0.059133633971214294, "step": 52 }, { "epoch": 6.888, "grad_norm": 86.9649887084961, "learning_rate": 4.4281873178278475e-09, "logits/chosen": 0.2565808892250061, "logits/rejected": 0.26864296197891235, "logps/chosen": -60.76387405395508, "logps/rejected": -57.71691131591797, "loss": 0.6633, "rewards/accuracies": 0.5625, "rewards/chosen": 0.027324259281158447, "rewards/margins": 0.03995504975318909, "rewards/rejected": -0.01263079047203064, "step": 53 }, { "epoch": 7.016, "grad_norm": 91.96407318115234, "learning_rate": 1.9713246713805587e-09, "logits/chosen": 0.2563447952270508, "logits/rejected": 0.23585036396980286, "logps/chosen": -64.38143157958984, "logps/rejected": -73.07710266113281, "loss": 0.701, "rewards/accuracies": 0.5, "rewards/chosen": 0.0030507892370224, "rewards/margins": 0.019449278712272644, "rewards/rejected": -0.016398489475250244, "step": 54 }, { "epoch": 7.144, "grad_norm": 92.31555938720703, "learning_rate": 4.933178929321102e-10, "logits/chosen": 0.27761310338974, "logits/rejected": 0.28139054775238037, "logps/chosen": -86.69955444335938, "logps/rejected": -84.17654418945312, "loss": 0.6593, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09960392117500305, "rewards/margins": 0.13018175959587097, "rewards/rejected": -0.03057783842086792, "step": 55 }, { "epoch": 7.272, "grad_norm": 88.12805938720703, "learning_rate": 0.0, "logits/chosen": 0.28197741508483887, "logits/rejected": 0.3240779936313629, "logps/chosen": -61.40129852294922, "logps/rejected": -71.11226654052734, "loss": 0.6692, "rewards/accuracies": 0.5, "rewards/chosen": -0.014597773551940918, "rewards/margins": 0.04788690805435181, "rewards/rejected": -0.062484681606292725, "step": 56 }, { "epoch": 7.272, "step": 56, "total_flos": 0.0, "train_loss": 0.5793062054685184, "train_runtime": 6908.77, "train_samples_per_second": 1.158, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 56, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }