{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.964444444444444, "eval_steps": 100, "global_step": 1344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005925925925925926, "grad_norm": 54.73865774156835, "learning_rate": 3.7037037037037036e-09, "logits/chosen": -1.6551780700683594, "logits/rejected": -1.6470587253570557, "logps/chosen": -42.52139663696289, "logps/rejected": -48.890506744384766, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.011851851851851851, "grad_norm": 50.35685724767536, "learning_rate": 7.407407407407407e-09, "logits/chosen": -1.1584198474884033, "logits/rejected": -1.2945518493652344, "logps/chosen": -42.262428283691406, "logps/rejected": -47.62751007080078, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.017777777777777778, "grad_norm": 54.59371328811371, "learning_rate": 1.111111111111111e-08, "logits/chosen": -1.4449834823608398, "logits/rejected": -1.347031831741333, "logps/chosen": -38.551475524902344, "logps/rejected": -56.001258850097656, "loss": 0.7056, "rewards/accuracies": 0.5, "rewards/chosen": -0.04485452175140381, "rewards/margins": -0.044418931007385254, "rewards/rejected": -0.0004355907440185547, "step": 3 }, { "epoch": 0.023703703703703703, "grad_norm": 51.501597470917936, "learning_rate": 1.4814814814814814e-08, "logits/chosen": -1.69174063205719, "logits/rejected": -1.577598214149475, "logps/chosen": -30.320907592773438, "logps/rejected": -46.535797119140625, "loss": 0.7038, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0048062801361083984, "rewards/margins": 0.05189502239227295, "rewards/rejected": -0.04708874225616455, "step": 4 }, { "epoch": 0.02962962962962963, "grad_norm": 51.34455089675247, "learning_rate": 1.8518518518518518e-08, "logits/chosen": -1.6288294792175293, "logits/rejected": -1.5976742506027222, "logps/chosen": -33.35110855102539, "logps/rejected": -48.16716766357422, "loss": 0.7014, "rewards/accuracies": 0.5, "rewards/chosen": -0.00041840970516204834, "rewards/margins": -0.032079800963401794, "rewards/rejected": 0.031661391258239746, "step": 5 }, { "epoch": 0.035555555555555556, "grad_norm": 47.841287211454, "learning_rate": 2.222222222222222e-08, "logits/chosen": -1.4972864389419556, "logits/rejected": -1.5760784149169922, "logps/chosen": -35.15846252441406, "logps/rejected": -35.15956497192383, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": 0.0043003857135772705, "rewards/margins": -0.031109660863876343, "rewards/rejected": 0.03541004657745361, "step": 6 }, { "epoch": 0.04148148148148148, "grad_norm": 48.17008791092014, "learning_rate": 2.5925925925925923e-08, "logits/chosen": -1.2874022722244263, "logits/rejected": -1.339775562286377, "logps/chosen": -38.65494155883789, "logps/rejected": -44.154571533203125, "loss": 0.6844, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02324095368385315, "rewards/margins": 0.07515916228294373, "rewards/rejected": -0.051918208599090576, "step": 7 }, { "epoch": 0.047407407407407405, "grad_norm": 53.97607510739969, "learning_rate": 2.962962962962963e-08, "logits/chosen": -1.6390717029571533, "logits/rejected": -1.5105955600738525, "logps/chosen": -34.070068359375, "logps/rejected": -48.21985626220703, "loss": 0.7044, "rewards/accuracies": 0.4375, "rewards/chosen": 0.015645623207092285, "rewards/margins": 0.03352612257003784, "rewards/rejected": -0.017880499362945557, "step": 8 }, { "epoch": 0.05333333333333334, "grad_norm": 50.44575738510241, "learning_rate": 3.3333333333333334e-08, "logits/chosen": -1.9325859546661377, "logits/rejected": -1.77297043800354, "logps/chosen": -54.47149658203125, "logps/rejected": -57.60348892211914, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": -0.00028821825981140137, "rewards/margins": 0.010739117860794067, "rewards/rejected": -0.011027336120605469, "step": 9 }, { "epoch": 0.05925925925925926, "grad_norm": 51.58385642006147, "learning_rate": 3.7037037037037036e-08, "logits/chosen": -1.6119937896728516, "logits/rejected": -1.7191580533981323, "logps/chosen": -41.67599105834961, "logps/rejected": -42.40376663208008, "loss": 0.7032, "rewards/accuracies": 0.5625, "rewards/chosen": 0.013362348079681396, "rewards/margins": -0.01079791784286499, "rewards/rejected": 0.024160265922546387, "step": 10 }, { "epoch": 0.06518518518518518, "grad_norm": 54.37783035721775, "learning_rate": 4.0740740740740745e-08, "logits/chosen": -1.2692227363586426, "logits/rejected": -1.355187177658081, "logps/chosen": -43.08770751953125, "logps/rejected": -46.87504577636719, "loss": 0.6969, "rewards/accuracies": 0.375, "rewards/chosen": -0.028767406940460205, "rewards/margins": -0.06381511688232422, "rewards/rejected": 0.035047709941864014, "step": 11 }, { "epoch": 0.07111111111111111, "grad_norm": 56.74473973643056, "learning_rate": 4.444444444444444e-08, "logits/chosen": -1.8102182149887085, "logits/rejected": -1.938857078552246, "logps/chosen": -43.0614013671875, "logps/rejected": -44.25722122192383, "loss": 0.7345, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0229855477809906, "rewards/margins": -0.07008519768714905, "rewards/rejected": 0.04709964990615845, "step": 12 }, { "epoch": 0.07703703703703704, "grad_norm": 46.45521014700028, "learning_rate": 4.814814814814814e-08, "logits/chosen": -1.4651801586151123, "logits/rejected": -1.4712345600128174, "logps/chosen": -32.00542449951172, "logps/rejected": -37.7706413269043, "loss": 0.6979, "rewards/accuracies": 0.5625, "rewards/chosen": -0.019961148500442505, "rewards/margins": 0.01969766616821289, "rewards/rejected": -0.039658814668655396, "step": 13 }, { "epoch": 0.08296296296296296, "grad_norm": 53.72462131673607, "learning_rate": 5.1851851851851846e-08, "logits/chosen": -1.7651007175445557, "logits/rejected": -1.5879353284835815, "logps/chosen": -37.73487091064453, "logps/rejected": -59.109642028808594, "loss": 0.7008, "rewards/accuracies": 0.5625, "rewards/chosen": -0.019394323229789734, "rewards/margins": -0.01382051408290863, "rewards/rejected": -0.0055738091468811035, "step": 14 }, { "epoch": 0.08888888888888889, "grad_norm": 57.44517565738861, "learning_rate": 5.555555555555555e-08, "logits/chosen": -1.8541333675384521, "logits/rejected": -1.7781238555908203, "logps/chosen": -33.13742446899414, "logps/rejected": -45.20591354370117, "loss": 0.6984, "rewards/accuracies": 0.3125, "rewards/chosen": -0.016418397426605225, "rewards/margins": -0.0519789457321167, "rewards/rejected": 0.035560548305511475, "step": 15 }, { "epoch": 0.09481481481481481, "grad_norm": 51.361031686541665, "learning_rate": 5.925925925925926e-08, "logits/chosen": -1.5105748176574707, "logits/rejected": -1.3466538190841675, "logps/chosen": -38.21760177612305, "logps/rejected": -50.58500289916992, "loss": 0.7051, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03620937466621399, "rewards/margins": -0.012676209211349487, "rewards/rejected": -0.023533165454864502, "step": 16 }, { "epoch": 0.10074074074074074, "grad_norm": 54.99448299996414, "learning_rate": 6.296296296296296e-08, "logits/chosen": -1.2743877172470093, "logits/rejected": -1.5392966270446777, "logps/chosen": -45.76844787597656, "logps/rejected": -44.51301574707031, "loss": 0.7044, "rewards/accuracies": 0.625, "rewards/chosen": 0.02413499355316162, "rewards/margins": 0.03405413031578064, "rewards/rejected": -0.009919136762619019, "step": 17 }, { "epoch": 0.10666666666666667, "grad_norm": 52.90769744924133, "learning_rate": 6.666666666666667e-08, "logits/chosen": -1.394054651260376, "logits/rejected": -1.660745620727539, "logps/chosen": -51.191001892089844, "logps/rejected": -39.66780090332031, "loss": 0.688, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04243624210357666, "rewards/margins": -0.002468883991241455, "rewards/rejected": 0.044905126094818115, "step": 18 }, { "epoch": 0.11259259259259259, "grad_norm": 52.73976531396199, "learning_rate": 7.037037037037038e-08, "logits/chosen": -1.643120527267456, "logits/rejected": -1.6542155742645264, "logps/chosen": -42.54930114746094, "logps/rejected": -46.6912956237793, "loss": 0.6981, "rewards/accuracies": 0.5625, "rewards/chosen": -0.037486732006073, "rewards/margins": 0.02916562557220459, "rewards/rejected": -0.06665235757827759, "step": 19 }, { "epoch": 0.11851851851851852, "grad_norm": 51.77338754692824, "learning_rate": 7.407407407407407e-08, "logits/chosen": -1.9419344663619995, "logits/rejected": -1.7904196977615356, "logps/chosen": -33.6644287109375, "logps/rejected": -50.46963882446289, "loss": 0.6912, "rewards/accuracies": 0.3125, "rewards/chosen": 0.003351449966430664, "rewards/margins": -0.023046374320983887, "rewards/rejected": 0.02639782428741455, "step": 20 }, { "epoch": 0.12444444444444444, "grad_norm": 55.682792202937584, "learning_rate": 7.777777777777778e-08, "logits/chosen": -1.6269031763076782, "logits/rejected": -1.5598448514938354, "logps/chosen": -44.19285202026367, "logps/rejected": -54.80866241455078, "loss": 0.6673, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0722590982913971, "rewards/margins": 0.0795658528804779, "rewards/rejected": -0.0073067545890808105, "step": 21 }, { "epoch": 0.13037037037037036, "grad_norm": 54.26723284603444, "learning_rate": 8.148148148148149e-08, "logits/chosen": -1.5746511220932007, "logits/rejected": -1.4856189489364624, "logps/chosen": -31.291728973388672, "logps/rejected": -37.666141510009766, "loss": 0.7061, "rewards/accuracies": 0.375, "rewards/chosen": -0.011329293251037598, "rewards/margins": -0.0033955276012420654, "rewards/rejected": -0.007933765649795532, "step": 22 }, { "epoch": 0.1362962962962963, "grad_norm": 49.26655807512015, "learning_rate": 8.518518518518517e-08, "logits/chosen": -1.3974454402923584, "logits/rejected": -1.2355865240097046, "logps/chosen": -33.44588851928711, "logps/rejected": -44.378822326660156, "loss": 0.7064, "rewards/accuracies": 0.75, "rewards/chosen": 0.016000211238861084, "rewards/margins": 0.05508500337600708, "rewards/rejected": -0.039084792137145996, "step": 23 }, { "epoch": 0.14222222222222222, "grad_norm": 53.748442869248805, "learning_rate": 8.888888888888888e-08, "logits/chosen": -2.1273233890533447, "logits/rejected": -1.962372899055481, "logps/chosen": -43.61072540283203, "logps/rejected": -52.419864654541016, "loss": 0.7231, "rewards/accuracies": 0.5, "rewards/chosen": 0.054617494344711304, "rewards/margins": -0.019975215196609497, "rewards/rejected": 0.0745927095413208, "step": 24 }, { "epoch": 0.14814814814814814, "grad_norm": 48.38445431031237, "learning_rate": 9.259259259259258e-08, "logits/chosen": -1.7700583934783936, "logits/rejected": -1.822185754776001, "logps/chosen": -38.442745208740234, "logps/rejected": -49.97007751464844, "loss": 0.6872, "rewards/accuracies": 0.625, "rewards/chosen": 0.014503806829452515, "rewards/margins": 0.08013251423835754, "rewards/rejected": -0.06562870740890503, "step": 25 }, { "epoch": 0.15407407407407409, "grad_norm": 53.61488969779955, "learning_rate": 9.629629629629629e-08, "logits/chosen": -1.6693568229675293, "logits/rejected": -1.666528582572937, "logps/chosen": -43.25126266479492, "logps/rejected": -53.571266174316406, "loss": 0.7053, "rewards/accuracies": 0.5625, "rewards/chosen": 0.022118449211120605, "rewards/margins": 0.0469089150428772, "rewards/rejected": -0.024790465831756592, "step": 26 }, { "epoch": 0.16, "grad_norm": 59.47709615901772, "learning_rate": 1e-07, "logits/chosen": -1.3913928270339966, "logits/rejected": -1.6422309875488281, "logps/chosen": -48.43421936035156, "logps/rejected": -44.25364685058594, "loss": 0.6948, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02788090705871582, "rewards/margins": 0.027790188789367676, "rewards/rejected": 9.071826934814453e-05, "step": 27 }, { "epoch": 0.16592592592592592, "grad_norm": 51.38601767812816, "learning_rate": 1.0370370370370369e-07, "logits/chosen": -1.5396907329559326, "logits/rejected": -1.4416035413742065, "logps/chosen": -34.06419372558594, "logps/rejected": -43.13043975830078, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": -0.027874916791915894, "rewards/margins": -0.0011770427227020264, "rewards/rejected": -0.026697874069213867, "step": 28 }, { "epoch": 0.17185185185185184, "grad_norm": 49.75849658704746, "learning_rate": 1.074074074074074e-07, "logits/chosen": -1.2683113813400269, "logits/rejected": -0.9843475222587585, "logps/chosen": -25.81869888305664, "logps/rejected": -43.70813751220703, "loss": 0.7049, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03348975256085396, "rewards/margins": -0.026481706649065018, "rewards/rejected": -0.00700804591178894, "step": 29 }, { "epoch": 0.17777777777777778, "grad_norm": 52.49502678815258, "learning_rate": 1.111111111111111e-07, "logits/chosen": -1.413278341293335, "logits/rejected": -1.4029860496520996, "logps/chosen": -37.33210754394531, "logps/rejected": -52.769691467285156, "loss": 0.6962, "rewards/accuracies": 0.5625, "rewards/chosen": -0.013176381587982178, "rewards/margins": -0.05075275897979736, "rewards/rejected": 0.037576377391815186, "step": 30 }, { "epoch": 0.1837037037037037, "grad_norm": 55.82883153656641, "learning_rate": 1.148148148148148e-07, "logits/chosen": -1.6182873249053955, "logits/rejected": -1.5029940605163574, "logps/chosen": -38.19645309448242, "logps/rejected": -54.64601135253906, "loss": 0.6943, "rewards/accuracies": 0.625, "rewards/chosen": 0.04735572636127472, "rewards/margins": 0.0014861971139907837, "rewards/rejected": 0.045869529247283936, "step": 31 }, { "epoch": 0.18962962962962962, "grad_norm": 51.73567756412023, "learning_rate": 1.1851851851851851e-07, "logits/chosen": -1.3902158737182617, "logits/rejected": -1.4902468919754028, "logps/chosen": -44.617671966552734, "logps/rejected": -50.17817687988281, "loss": 0.7021, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0382617712020874, "rewards/margins": 0.020143568515777588, "rewards/rejected": 0.018118202686309814, "step": 32 }, { "epoch": 0.19555555555555557, "grad_norm": 51.46866040359476, "learning_rate": 1.2222222222222222e-07, "logits/chosen": -1.5765653848648071, "logits/rejected": -1.7073959112167358, "logps/chosen": -43.03544998168945, "logps/rejected": -42.10237121582031, "loss": 0.6699, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0025068819522857666, "rewards/margins": 0.0735115110874176, "rewards/rejected": -0.07100462913513184, "step": 33 }, { "epoch": 0.20148148148148148, "grad_norm": 52.04875175615269, "learning_rate": 1.2592592592592592e-07, "logits/chosen": -0.9567450284957886, "logits/rejected": -0.9443216919898987, "logps/chosen": -38.146095275878906, "logps/rejected": -36.97870635986328, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.03759458661079407, "rewards/margins": -0.012229889631271362, "rewards/rejected": 0.04982447624206543, "step": 34 }, { "epoch": 0.2074074074074074, "grad_norm": 52.37961759952867, "learning_rate": 1.2962962962962961e-07, "logits/chosen": -1.5861988067626953, "logits/rejected": -1.3526108264923096, "logps/chosen": -40.725303649902344, "logps/rejected": -53.843467712402344, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": -0.04395633935928345, "rewards/margins": -0.035573214292526245, "rewards/rejected": -0.008383125066757202, "step": 35 }, { "epoch": 0.21333333333333335, "grad_norm": 52.20405363764235, "learning_rate": 1.3333333333333334e-07, "logits/chosen": -2.0409719944000244, "logits/rejected": -1.8356863260269165, "logps/chosen": -36.4237174987793, "logps/rejected": -54.58771514892578, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01013416051864624, "rewards/margins": 0.023032546043395996, "rewards/rejected": -0.012898385524749756, "step": 36 }, { "epoch": 0.21925925925925926, "grad_norm": 49.83353701641285, "learning_rate": 1.3703703703703703e-07, "logits/chosen": -1.9610185623168945, "logits/rejected": -1.8862241506576538, "logps/chosen": -32.727813720703125, "logps/rejected": -46.04048156738281, "loss": 0.68, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0113593190908432, "rewards/margins": 0.03532211482524872, "rewards/rejected": -0.023962795734405518, "step": 37 }, { "epoch": 0.22518518518518518, "grad_norm": 91.63451568348084, "learning_rate": 1.4074074074074075e-07, "logits/chosen": -2.103301525115967, "logits/rejected": -2.1240031719207764, "logps/chosen": -43.680843353271484, "logps/rejected": -43.4760627746582, "loss": 0.6831, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02546370029449463, "rewards/margins": 0.09499198198318481, "rewards/rejected": -0.06952828168869019, "step": 38 }, { "epoch": 0.2311111111111111, "grad_norm": 50.029380495348924, "learning_rate": 1.4444444444444442e-07, "logits/chosen": -1.7226446866989136, "logits/rejected": -1.6855897903442383, "logps/chosen": -37.30487060546875, "logps/rejected": -39.31515121459961, "loss": 0.6942, "rewards/accuracies": 0.5625, "rewards/chosen": -0.011799216270446777, "rewards/margins": 0.03693026304244995, "rewards/rejected": -0.04872947931289673, "step": 39 }, { "epoch": 0.23703703703703705, "grad_norm": 51.79236934572246, "learning_rate": 1.4814814814814815e-07, "logits/chosen": -1.6022825241088867, "logits/rejected": -1.5007712841033936, "logps/chosen": -43.124755859375, "logps/rejected": -60.236854553222656, "loss": 0.7037, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03802824020385742, "rewards/margins": -0.013065189123153687, "rewards/rejected": -0.024963051080703735, "step": 40 }, { "epoch": 0.24296296296296296, "grad_norm": 51.34662690579377, "learning_rate": 1.5185185185185184e-07, "logits/chosen": -1.7133663892745972, "logits/rejected": -1.7276082038879395, "logps/chosen": -44.17156219482422, "logps/rejected": -56.69810485839844, "loss": 0.6964, "rewards/accuracies": 0.625, "rewards/chosen": -0.0171157568693161, "rewards/margins": 0.04664464294910431, "rewards/rejected": -0.06376039981842041, "step": 41 }, { "epoch": 0.24888888888888888, "grad_norm": 52.11523781421188, "learning_rate": 1.5555555555555556e-07, "logits/chosen": -1.4053561687469482, "logits/rejected": -1.3741445541381836, "logps/chosen": -44.58378601074219, "logps/rejected": -50.028602600097656, "loss": 0.679, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002070978283882141, "rewards/margins": 0.0352911502122879, "rewards/rejected": -0.03322017192840576, "step": 42 }, { "epoch": 0.2548148148148148, "grad_norm": 49.32915916600631, "learning_rate": 1.5925925925925926e-07, "logits/chosen": -1.3316234350204468, "logits/rejected": -1.2330265045166016, "logps/chosen": -27.901004791259766, "logps/rejected": -31.064517974853516, "loss": 0.7082, "rewards/accuracies": 0.625, "rewards/chosen": 0.009749919176101685, "rewards/margins": -0.010031551122665405, "rewards/rejected": 0.01978147029876709, "step": 43 }, { "epoch": 0.2607407407407407, "grad_norm": 56.48104490330988, "learning_rate": 1.6296296296296298e-07, "logits/chosen": -1.796046495437622, "logits/rejected": -1.5567198991775513, "logps/chosen": -32.16002655029297, "logps/rejected": -47.925880432128906, "loss": 0.6863, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0326349139213562, "rewards/margins": 0.0439448356628418, "rewards/rejected": -0.011309921741485596, "step": 44 }, { "epoch": 0.26666666666666666, "grad_norm": 52.28508627480772, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -1.5154943466186523, "logits/rejected": -1.4815261363983154, "logps/chosen": -37.8524055480957, "logps/rejected": -54.14366149902344, "loss": 0.6693, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0024260282516479492, "rewards/margins": 0.07158929109573364, "rewards/rejected": -0.07401531934738159, "step": 45 }, { "epoch": 0.2725925925925926, "grad_norm": 49.33789452340116, "learning_rate": 1.7037037037037035e-07, "logits/chosen": -1.1807670593261719, "logits/rejected": -1.257714033126831, "logps/chosen": -47.12394714355469, "logps/rejected": -49.35055160522461, "loss": 0.6868, "rewards/accuracies": 0.625, "rewards/chosen": 0.018887341022491455, "rewards/margins": 0.025706887245178223, "rewards/rejected": -0.006819546222686768, "step": 46 }, { "epoch": 0.2785185185185185, "grad_norm": 49.24012566537143, "learning_rate": 1.7407407407407407e-07, "logits/chosen": -1.6047768592834473, "logits/rejected": -1.6896839141845703, "logps/chosen": -55.357322692871094, "logps/rejected": -58.79747009277344, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": 0.016460120677947998, "rewards/margins": 0.10610747337341309, "rewards/rejected": -0.08964735269546509, "step": 47 }, { "epoch": 0.28444444444444444, "grad_norm": 55.92737374580854, "learning_rate": 1.7777777777777776e-07, "logits/chosen": -1.314180612564087, "logits/rejected": -1.2767497301101685, "logps/chosen": -40.34666442871094, "logps/rejected": -48.74937438964844, "loss": 0.6834, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0430338978767395, "rewards/margins": -0.010425090789794922, "rewards/rejected": -0.03260880708694458, "step": 48 }, { "epoch": 0.2903703703703704, "grad_norm": 49.01432924086448, "learning_rate": 1.8148148148148149e-07, "logits/chosen": -1.3731322288513184, "logits/rejected": -1.4195034503936768, "logps/chosen": -34.62800598144531, "logps/rejected": -43.32075500488281, "loss": 0.6605, "rewards/accuracies": 0.625, "rewards/chosen": -0.00801500678062439, "rewards/margins": 0.04836231470108032, "rewards/rejected": -0.05637732148170471, "step": 49 }, { "epoch": 0.2962962962962963, "grad_norm": 48.181313971977815, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -1.6057178974151611, "logits/rejected": -1.5365872383117676, "logps/chosen": -25.938697814941406, "logps/rejected": -43.2042350769043, "loss": 0.6415, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03604808449745178, "rewards/margins": 0.07843932509422302, "rewards/rejected": -0.04239124059677124, "step": 50 }, { "epoch": 0.3022222222222222, "grad_norm": 49.376325941861374, "learning_rate": 1.8888888888888888e-07, "logits/chosen": -1.6562511920928955, "logits/rejected": -1.6392525434494019, "logps/chosen": -47.340354919433594, "logps/rejected": -57.06756591796875, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": -0.007433861494064331, "rewards/margins": 0.07025310397148132, "rewards/rejected": -0.07768696546554565, "step": 51 }, { "epoch": 0.30814814814814817, "grad_norm": 47.893215154872685, "learning_rate": 1.9259259259259257e-07, "logits/chosen": -1.4644725322723389, "logits/rejected": -1.3114018440246582, "logps/chosen": -30.613296508789062, "logps/rejected": -42.53533172607422, "loss": 0.6799, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04141402244567871, "rewards/margins": 0.0248582661151886, "rewards/rejected": -0.06627228856086731, "step": 52 }, { "epoch": 0.31407407407407406, "grad_norm": 48.83713343476518, "learning_rate": 1.962962962962963e-07, "logits/chosen": -1.1819308996200562, "logits/rejected": -1.2800672054290771, "logps/chosen": -32.65019989013672, "logps/rejected": -32.47792053222656, "loss": 0.6795, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05936972796916962, "rewards/margins": 0.005213424563407898, "rewards/rejected": -0.06458315253257751, "step": 53 }, { "epoch": 0.32, "grad_norm": 49.333664814053236, "learning_rate": 2e-07, "logits/chosen": -1.4133021831512451, "logits/rejected": -1.3248672485351562, "logps/chosen": -39.12198257446289, "logps/rejected": -60.79273223876953, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": -0.0566544234752655, "rewards/margins": 0.06260296702384949, "rewards/rejected": -0.11925739049911499, "step": 54 }, { "epoch": 0.32592592592592595, "grad_norm": 52.153743932483735, "learning_rate": 2.0370370370370369e-07, "logits/chosen": -1.989043951034546, "logits/rejected": -1.7848541736602783, "logps/chosen": -35.482582092285156, "logps/rejected": -52.01215362548828, "loss": 0.6858, "rewards/accuracies": 0.625, "rewards/chosen": -0.03922635316848755, "rewards/margins": 0.014412403106689453, "rewards/rejected": -0.053638756275177, "step": 55 }, { "epoch": 0.33185185185185184, "grad_norm": 51.7443158402984, "learning_rate": 2.0740740740740738e-07, "logits/chosen": -1.6254875659942627, "logits/rejected": -1.689445972442627, "logps/chosen": -54.14313507080078, "logps/rejected": -59.49906921386719, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": -0.04046362638473511, "rewards/margins": 0.05603635311126709, "rewards/rejected": -0.0964999794960022, "step": 56 }, { "epoch": 0.3377777777777778, "grad_norm": 50.651540848490434, "learning_rate": 2.111111111111111e-07, "logits/chosen": -1.3982229232788086, "logits/rejected": -1.4037715196609497, "logps/chosen": -43.407264709472656, "logps/rejected": -44.814266204833984, "loss": 0.6784, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06774739176034927, "rewards/margins": 0.04890603572130203, "rewards/rejected": 0.01884135603904724, "step": 57 }, { "epoch": 0.3437037037037037, "grad_norm": 48.03187108978971, "learning_rate": 2.148148148148148e-07, "logits/chosen": -1.647029161453247, "logits/rejected": -1.6315536499023438, "logps/chosen": -39.03837585449219, "logps/rejected": -47.67170333862305, "loss": 0.6386, "rewards/accuracies": 0.8125, "rewards/chosen": -0.009533137083053589, "rewards/margins": 0.11656391620635986, "rewards/rejected": -0.12609705328941345, "step": 58 }, { "epoch": 0.3496296296296296, "grad_norm": 47.820881352326055, "learning_rate": 2.1851851851851852e-07, "logits/chosen": -0.7058523893356323, "logits/rejected": -0.7873870730400085, "logps/chosen": -40.064552307128906, "logps/rejected": -42.425132751464844, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005445778369903564, "rewards/margins": 0.09737929701805115, "rewards/rejected": -0.09683471918106079, "step": 59 }, { "epoch": 0.35555555555555557, "grad_norm": 52.192096809034254, "learning_rate": 2.222222222222222e-07, "logits/chosen": -1.3768231868743896, "logits/rejected": -1.2123544216156006, "logps/chosen": -35.475467681884766, "logps/rejected": -55.790775299072266, "loss": 0.6618, "rewards/accuracies": 0.75, "rewards/chosen": -0.012328773736953735, "rewards/margins": 0.1337939202785492, "rewards/rejected": -0.14612269401550293, "step": 60 }, { "epoch": 0.36148148148148146, "grad_norm": 47.41837940906366, "learning_rate": 2.2592592592592591e-07, "logits/chosen": -1.6038029193878174, "logits/rejected": -1.5517463684082031, "logps/chosen": -31.51889419555664, "logps/rejected": -42.58946990966797, "loss": 0.6697, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08359864354133606, "rewards/margins": 0.0298750102519989, "rewards/rejected": -0.11347365379333496, "step": 61 }, { "epoch": 0.3674074074074074, "grad_norm": 48.790002508106205, "learning_rate": 2.296296296296296e-07, "logits/chosen": -1.6775469779968262, "logits/rejected": -1.626863956451416, "logps/chosen": -35.4007568359375, "logps/rejected": -45.62629699707031, "loss": 0.6594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.033218562602996826, "rewards/margins": 0.03804764151573181, "rewards/rejected": -0.07126620411872864, "step": 62 }, { "epoch": 0.37333333333333335, "grad_norm": 48.66526101519147, "learning_rate": 2.3333333333333333e-07, "logits/chosen": -1.521484136581421, "logits/rejected": -1.5792737007141113, "logps/chosen": -39.2108039855957, "logps/rejected": -46.792152404785156, "loss": 0.6432, "rewards/accuracies": 0.875, "rewards/chosen": 0.034676894545555115, "rewards/margins": 0.20874853432178497, "rewards/rejected": -0.17407163977622986, "step": 63 }, { "epoch": 0.37925925925925924, "grad_norm": 50.80353184657783, "learning_rate": 2.3703703703703703e-07, "logits/chosen": -1.6936514377593994, "logits/rejected": -1.5413093566894531, "logps/chosen": -34.11452865600586, "logps/rejected": -52.953857421875, "loss": 0.6335, "rewards/accuracies": 0.6875, "rewards/chosen": -0.016986578702926636, "rewards/margins": 0.07408609986305237, "rewards/rejected": -0.091072678565979, "step": 64 }, { "epoch": 0.3851851851851852, "grad_norm": 48.49871450799722, "learning_rate": 2.407407407407407e-07, "logits/chosen": -1.500447392463684, "logits/rejected": -1.4313578605651855, "logps/chosen": -38.88074493408203, "logps/rejected": -50.81879425048828, "loss": 0.6497, "rewards/accuracies": 0.75, "rewards/chosen": -0.05385851860046387, "rewards/margins": 0.1640552282333374, "rewards/rejected": -0.21791374683380127, "step": 65 }, { "epoch": 0.39111111111111113, "grad_norm": 47.16678228397842, "learning_rate": 2.4444444444444445e-07, "logits/chosen": -1.55772066116333, "logits/rejected": -1.4572783708572388, "logps/chosen": -39.041255950927734, "logps/rejected": -43.814151763916016, "loss": 0.631, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08624696731567383, "rewards/margins": 0.068337082862854, "rewards/rejected": -0.15458405017852783, "step": 66 }, { "epoch": 0.397037037037037, "grad_norm": 52.21526548282161, "learning_rate": 2.4814814814814814e-07, "logits/chosen": -1.925466775894165, "logits/rejected": -1.8819047212600708, "logps/chosen": -34.829891204833984, "logps/rejected": -41.185691833496094, "loss": 0.6401, "rewards/accuracies": 0.625, "rewards/chosen": -0.0699830949306488, "rewards/margins": 0.05667153000831604, "rewards/rejected": -0.12665462493896484, "step": 67 }, { "epoch": 0.40296296296296297, "grad_norm": 46.527865628613704, "learning_rate": 2.5185185185185184e-07, "logits/chosen": -1.6865217685699463, "logits/rejected": -1.4329313039779663, "logps/chosen": -28.436674118041992, "logps/rejected": -50.398094177246094, "loss": 0.6594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03219631314277649, "rewards/margins": 0.21057480573654175, "rewards/rejected": -0.24277111887931824, "step": 68 }, { "epoch": 0.4088888888888889, "grad_norm": 52.777421185335456, "learning_rate": 2.5555555555555553e-07, "logits/chosen": -1.4382909536361694, "logits/rejected": -1.316765308380127, "logps/chosen": -33.29138946533203, "logps/rejected": -40.392578125, "loss": 0.6415, "rewards/accuracies": 0.75, "rewards/chosen": -0.00170879065990448, "rewards/margins": 0.1503458172082901, "rewards/rejected": -0.15205460786819458, "step": 69 }, { "epoch": 0.4148148148148148, "grad_norm": 48.73012974262115, "learning_rate": 2.5925925925925923e-07, "logits/chosen": -1.265784740447998, "logits/rejected": -1.2845590114593506, "logps/chosen": -41.45735168457031, "logps/rejected": -47.93992614746094, "loss": 0.6418, "rewards/accuracies": 0.625, "rewards/chosen": 0.022552520036697388, "rewards/margins": 0.18044358491897583, "rewards/rejected": -0.15789106488227844, "step": 70 }, { "epoch": 0.42074074074074075, "grad_norm": 47.468267237095105, "learning_rate": 2.629629629629629e-07, "logits/chosen": -2.099151134490967, "logits/rejected": -1.9303994178771973, "logps/chosen": -35.98270034790039, "logps/rejected": -59.536529541015625, "loss": 0.6412, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019120603799819946, "rewards/margins": 0.0822630226612091, "rewards/rejected": -0.10138362646102905, "step": 71 }, { "epoch": 0.4266666666666667, "grad_norm": 44.72180715869609, "learning_rate": 2.6666666666666667e-07, "logits/chosen": -1.867506980895996, "logits/rejected": -1.9733811616897583, "logps/chosen": -45.759056091308594, "logps/rejected": -49.942962646484375, "loss": 0.61, "rewards/accuracies": 0.75, "rewards/chosen": -0.044030994176864624, "rewards/margins": 0.20099851489067078, "rewards/rejected": -0.2450295090675354, "step": 72 }, { "epoch": 0.4325925925925926, "grad_norm": 45.529088721861704, "learning_rate": 2.7037037037037037e-07, "logits/chosen": -1.7743897438049316, "logits/rejected": -1.6937413215637207, "logps/chosen": -31.361610412597656, "logps/rejected": -46.95793914794922, "loss": 0.6292, "rewards/accuracies": 0.625, "rewards/chosen": -0.07771135866641998, "rewards/margins": 0.14026887714862823, "rewards/rejected": -0.21798023581504822, "step": 73 }, { "epoch": 0.43851851851851853, "grad_norm": 45.588644884343736, "learning_rate": 2.7407407407407406e-07, "logits/chosen": -1.773651361465454, "logits/rejected": -1.704150915145874, "logps/chosen": -35.21525955200195, "logps/rejected": -47.804935455322266, "loss": 0.6131, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09824433922767639, "rewards/margins": 0.1910991370677948, "rewards/rejected": -0.2893434762954712, "step": 74 }, { "epoch": 0.4444444444444444, "grad_norm": 45.93590071651665, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -1.6096124649047852, "logits/rejected": -1.6175916194915771, "logps/chosen": -39.06000900268555, "logps/rejected": -54.238365173339844, "loss": 0.6101, "rewards/accuracies": 0.625, "rewards/chosen": -0.115651935338974, "rewards/margins": 0.15163597464561462, "rewards/rejected": -0.2672879099845886, "step": 75 }, { "epoch": 0.45037037037037037, "grad_norm": 47.95259419432817, "learning_rate": 2.814814814814815e-07, "logits/chosen": -1.5724971294403076, "logits/rejected": -1.5149195194244385, "logps/chosen": -37.35981750488281, "logps/rejected": -48.432159423828125, "loss": 0.6448, "rewards/accuracies": 0.625, "rewards/chosen": -0.11759337782859802, "rewards/margins": 0.18581095337867737, "rewards/rejected": -0.3034043312072754, "step": 76 }, { "epoch": 0.4562962962962963, "grad_norm": 47.09881943830815, "learning_rate": 2.851851851851852e-07, "logits/chosen": -1.4347920417785645, "logits/rejected": -1.369720458984375, "logps/chosen": -42.203067779541016, "logps/rejected": -49.453102111816406, "loss": 0.5943, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07791465520858765, "rewards/margins": 0.3344114422798157, "rewards/rejected": -0.4123260974884033, "step": 77 }, { "epoch": 0.4622222222222222, "grad_norm": 44.2801384081521, "learning_rate": 2.8888888888888885e-07, "logits/chosen": -1.570386290550232, "logits/rejected": -1.4483294486999512, "logps/chosen": -35.98171615600586, "logps/rejected": -49.303775787353516, "loss": 0.5816, "rewards/accuracies": 0.875, "rewards/chosen": -0.062459707260131836, "rewards/margins": 0.3032795786857605, "rewards/rejected": -0.36573928594589233, "step": 78 }, { "epoch": 0.46814814814814815, "grad_norm": 43.35068917072876, "learning_rate": 2.9259259259259254e-07, "logits/chosen": -1.4911473989486694, "logits/rejected": -1.4859997034072876, "logps/chosen": -43.731712341308594, "logps/rejected": -51.12042236328125, "loss": 0.6045, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20656388998031616, "rewards/margins": 0.26457029581069946, "rewards/rejected": -0.4711341857910156, "step": 79 }, { "epoch": 0.4740740740740741, "grad_norm": 47.72342589223719, "learning_rate": 2.962962962962963e-07, "logits/chosen": -1.602846384048462, "logits/rejected": -1.3540472984313965, "logps/chosen": -40.95463943481445, "logps/rejected": -56.518924713134766, "loss": 0.6044, "rewards/accuracies": 0.75, "rewards/chosen": -0.07146379351615906, "rewards/margins": 0.26500311493873596, "rewards/rejected": -0.336466908454895, "step": 80 }, { "epoch": 0.48, "grad_norm": 44.13792564031615, "learning_rate": 3e-07, "logits/chosen": -1.9351346492767334, "logits/rejected": -1.9528157711029053, "logps/chosen": -45.20112609863281, "logps/rejected": -47.75102233886719, "loss": 0.5828, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1623789370059967, "rewards/margins": 0.123433917760849, "rewards/rejected": -0.2858128547668457, "step": 81 }, { "epoch": 0.48592592592592593, "grad_norm": 45.1081090387189, "learning_rate": 3.037037037037037e-07, "logits/chosen": -1.35515296459198, "logits/rejected": -1.2480462789535522, "logps/chosen": -41.036842346191406, "logps/rejected": -52.20813751220703, "loss": 0.5904, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16542187333106995, "rewards/margins": 0.3270244598388672, "rewards/rejected": -0.49244633316993713, "step": 82 }, { "epoch": 0.4918518518518519, "grad_norm": 43.25852049728235, "learning_rate": 3.074074074074074e-07, "logits/chosen": -2.026505708694458, "logits/rejected": -1.778357744216919, "logps/chosen": -40.66118621826172, "logps/rejected": -53.48126983642578, "loss": 0.5743, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12560813128948212, "rewards/margins": 0.23484648764133453, "rewards/rejected": -0.36045461893081665, "step": 83 }, { "epoch": 0.49777777777777776, "grad_norm": 46.12076558091472, "learning_rate": 3.111111111111111e-07, "logits/chosen": -1.7639347314834595, "logits/rejected": -1.836035966873169, "logps/chosen": -39.40026092529297, "logps/rejected": -47.0621223449707, "loss": 0.5967, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1895102560520172, "rewards/margins": 0.31577515602111816, "rewards/rejected": -0.505285382270813, "step": 84 }, { "epoch": 0.5037037037037037, "grad_norm": 40.673959765790045, "learning_rate": 3.148148148148148e-07, "logits/chosen": -1.6891648769378662, "logits/rejected": -1.4950172901153564, "logps/chosen": -39.709754943847656, "logps/rejected": -51.628135681152344, "loss": 0.5301, "rewards/accuracies": 0.875, "rewards/chosen": -0.19943200051784515, "rewards/margins": 0.4749172031879425, "rewards/rejected": -0.6743491888046265, "step": 85 }, { "epoch": 0.5096296296296297, "grad_norm": 41.51680039866243, "learning_rate": 3.185185185185185e-07, "logits/chosen": -1.5121097564697266, "logits/rejected": -1.367327094078064, "logps/chosen": -42.37049102783203, "logps/rejected": -57.95114517211914, "loss": 0.5417, "rewards/accuracies": 0.875, "rewards/chosen": -0.09324803948402405, "rewards/margins": 0.4947849214076996, "rewards/rejected": -0.5880329608917236, "step": 86 }, { "epoch": 0.5155555555555555, "grad_norm": 43.23322172437616, "learning_rate": 3.222222222222222e-07, "logits/chosen": -1.559330701828003, "logits/rejected": -1.2308735847473145, "logps/chosen": -39.38884735107422, "logps/rejected": -53.98082733154297, "loss": 0.5726, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17123621702194214, "rewards/margins": 0.3211599588394165, "rewards/rejected": -0.49239617586135864, "step": 87 }, { "epoch": 0.5214814814814814, "grad_norm": 44.62237450482671, "learning_rate": 3.2592592592592596e-07, "logits/chosen": -1.4969669580459595, "logits/rejected": -1.1791250705718994, "logps/chosen": -30.64980125427246, "logps/rejected": -47.99778747558594, "loss": 0.586, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13224159181118011, "rewards/margins": 0.3085125684738159, "rewards/rejected": -0.4407541751861572, "step": 88 }, { "epoch": 0.5274074074074074, "grad_norm": 45.82415006543612, "learning_rate": 3.296296296296296e-07, "logits/chosen": -1.7968785762786865, "logits/rejected": -1.8232977390289307, "logps/chosen": -45.06230926513672, "logps/rejected": -49.47294235229492, "loss": 0.5492, "rewards/accuracies": 0.8125, "rewards/chosen": -0.27738267183303833, "rewards/margins": 0.3488852381706238, "rewards/rejected": -0.6262679100036621, "step": 89 }, { "epoch": 0.5333333333333333, "grad_norm": 45.46414191234741, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.6212067604064941, "logits/rejected": -1.634657859802246, "logps/chosen": -43.29548645019531, "logps/rejected": -46.447914123535156, "loss": 0.5794, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20608562231063843, "rewards/margins": 0.4118134379386902, "rewards/rejected": -0.6178990602493286, "step": 90 }, { "epoch": 0.5392592592592592, "grad_norm": 43.39793711865867, "learning_rate": 3.37037037037037e-07, "logits/chosen": -1.5805771350860596, "logits/rejected": -1.5447726249694824, "logps/chosen": -39.61695861816406, "logps/rejected": -47.621971130371094, "loss": 0.5694, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19198113679885864, "rewards/margins": 0.45119503140449524, "rewards/rejected": -0.6431761980056763, "step": 91 }, { "epoch": 0.5451851851851852, "grad_norm": 41.97640066399846, "learning_rate": 3.407407407407407e-07, "logits/chosen": -1.6754682064056396, "logits/rejected": -1.4996973276138306, "logps/chosen": -35.48243713378906, "logps/rejected": -46.71401596069336, "loss": 0.5619, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10602587461471558, "rewards/margins": 0.4457920789718628, "rewards/rejected": -0.5518179535865784, "step": 92 }, { "epoch": 0.5511111111111111, "grad_norm": 44.228366108054274, "learning_rate": 3.4444444444444444e-07, "logits/chosen": -1.4117846488952637, "logits/rejected": -1.4865397214889526, "logps/chosen": -42.30179214477539, "logps/rejected": -43.16421890258789, "loss": 0.6, "rewards/accuracies": 0.5625, "rewards/chosen": -0.27544260025024414, "rewards/margins": 0.013427436351776123, "rewards/rejected": -0.28887003660202026, "step": 93 }, { "epoch": 0.557037037037037, "grad_norm": 40.85074060393982, "learning_rate": 3.4814814814814814e-07, "logits/chosen": -1.7078298330307007, "logits/rejected": -1.8396275043487549, "logps/chosen": -43.45661163330078, "logps/rejected": -36.45011520385742, "loss": 0.524, "rewards/accuracies": 0.75, "rewards/chosen": -0.18786099553108215, "rewards/margins": 0.4306405782699585, "rewards/rejected": -0.6185015439987183, "step": 94 }, { "epoch": 0.562962962962963, "grad_norm": 42.13291872541957, "learning_rate": 3.5185185185185183e-07, "logits/chosen": -1.4014463424682617, "logits/rejected": -1.314795970916748, "logps/chosen": -35.93153381347656, "logps/rejected": -47.03495788574219, "loss": 0.5286, "rewards/accuracies": 0.75, "rewards/chosen": -0.1643732786178589, "rewards/margins": 0.5755838751792908, "rewards/rejected": -0.7399571537971497, "step": 95 }, { "epoch": 0.5688888888888889, "grad_norm": 42.917662210162185, "learning_rate": 3.5555555555555553e-07, "logits/chosen": -1.393978476524353, "logits/rejected": -1.306774377822876, "logps/chosen": -47.09367370605469, "logps/rejected": -61.688514709472656, "loss": 0.5234, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31930363178253174, "rewards/margins": 0.44419366121292114, "rewards/rejected": -0.7634972929954529, "step": 96 }, { "epoch": 0.5748148148148148, "grad_norm": 41.93635144096363, "learning_rate": 3.592592592592593e-07, "logits/chosen": -1.4364768266677856, "logits/rejected": -1.138240098953247, "logps/chosen": -43.779056549072266, "logps/rejected": -50.88738250732422, "loss": 0.5004, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25555330514907837, "rewards/margins": 0.5455037355422974, "rewards/rejected": -0.8010570406913757, "step": 97 }, { "epoch": 0.5807407407407408, "grad_norm": 48.44364004513561, "learning_rate": 3.6296296296296297e-07, "logits/chosen": -1.3962883949279785, "logits/rejected": -1.345455527305603, "logps/chosen": -41.076805114746094, "logps/rejected": -50.62786865234375, "loss": 0.567, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18818557262420654, "rewards/margins": 0.32878220081329346, "rewards/rejected": -0.5169677734375, "step": 98 }, { "epoch": 0.5866666666666667, "grad_norm": 40.709285626891656, "learning_rate": 3.666666666666666e-07, "logits/chosen": -1.6217023134231567, "logits/rejected": -1.455048680305481, "logps/chosen": -27.7904052734375, "logps/rejected": -50.464454650878906, "loss": 0.5517, "rewards/accuracies": 0.75, "rewards/chosen": -0.12327791750431061, "rewards/margins": 0.31492501497268677, "rewards/rejected": -0.4382029175758362, "step": 99 }, { "epoch": 0.5925925925925926, "grad_norm": 44.49791847791134, "learning_rate": 3.703703703703703e-07, "logits/chosen": -1.3246082067489624, "logits/rejected": -1.341948390007019, "logps/chosen": -45.43359375, "logps/rejected": -64.56365203857422, "loss": 0.5368, "rewards/accuracies": 0.875, "rewards/chosen": -0.2600909173488617, "rewards/margins": 0.6141197681427002, "rewards/rejected": -0.8742106556892395, "step": 100 }, { "epoch": 0.5985185185185186, "grad_norm": 42.97756774543605, "learning_rate": 3.7407407407407406e-07, "logits/chosen": -1.2443385124206543, "logits/rejected": -1.0570563077926636, "logps/chosen": -36.11662673950195, "logps/rejected": -52.12215805053711, "loss": 0.5246, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32490265369415283, "rewards/margins": 0.6514842510223389, "rewards/rejected": -0.9763869047164917, "step": 101 }, { "epoch": 0.6044444444444445, "grad_norm": 42.33008530948347, "learning_rate": 3.7777777777777775e-07, "logits/chosen": -1.480468988418579, "logits/rejected": -1.4824107885360718, "logps/chosen": -32.58100891113281, "logps/rejected": -38.337615966796875, "loss": 0.5293, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23031967878341675, "rewards/margins": 0.4455646872520447, "rewards/rejected": -0.6758843660354614, "step": 102 }, { "epoch": 0.6103703703703703, "grad_norm": 37.653216263070966, "learning_rate": 3.8148148148148145e-07, "logits/chosen": -1.830120325088501, "logits/rejected": -1.9020617008209229, "logps/chosen": -35.33047866821289, "logps/rejected": -38.83496856689453, "loss": 0.5366, "rewards/accuracies": 0.875, "rewards/chosen": -0.2223891317844391, "rewards/margins": 0.4304327070713043, "rewards/rejected": -0.6528218388557434, "step": 103 }, { "epoch": 0.6162962962962963, "grad_norm": 37.93219958276944, "learning_rate": 3.8518518518518515e-07, "logits/chosen": -1.184201717376709, "logits/rejected": -1.1568915843963623, "logps/chosen": -41.511451721191406, "logps/rejected": -51.12090301513672, "loss": 0.4447, "rewards/accuracies": 0.875, "rewards/chosen": -0.16868919134140015, "rewards/margins": 1.0149188041687012, "rewards/rejected": -1.1836079359054565, "step": 104 }, { "epoch": 0.6222222222222222, "grad_norm": 44.46824215729115, "learning_rate": 3.888888888888889e-07, "logits/chosen": -1.642662525177002, "logits/rejected": -1.373224139213562, "logps/chosen": -37.704383850097656, "logps/rejected": -57.575584411621094, "loss": 0.559, "rewards/accuracies": 0.625, "rewards/chosen": -0.39903172850608826, "rewards/margins": 0.5329186916351318, "rewards/rejected": -0.9319504499435425, "step": 105 }, { "epoch": 0.6281481481481481, "grad_norm": 40.24407413315748, "learning_rate": 3.925925925925926e-07, "logits/chosen": -1.6562988758087158, "logits/rejected": -1.6213304996490479, "logps/chosen": -42.33479309082031, "logps/rejected": -46.03807830810547, "loss": 0.5085, "rewards/accuracies": 0.75, "rewards/chosen": -0.3010629415512085, "rewards/margins": 0.7453230023384094, "rewards/rejected": -1.0463860034942627, "step": 106 }, { "epoch": 0.6340740740740741, "grad_norm": 38.972579462755846, "learning_rate": 3.962962962962963e-07, "logits/chosen": -1.3949552774429321, "logits/rejected": -1.2594126462936401, "logps/chosen": -40.719146728515625, "logps/rejected": -60.80188751220703, "loss": 0.441, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32348203659057617, "rewards/margins": 0.8667979836463928, "rewards/rejected": -1.1902799606323242, "step": 107 }, { "epoch": 0.64, "grad_norm": 36.39536195986827, "learning_rate": 4e-07, "logits/chosen": -2.0106091499328613, "logits/rejected": -1.8416554927825928, "logps/chosen": -39.52052307128906, "logps/rejected": -52.07545471191406, "loss": 0.4314, "rewards/accuracies": 0.875, "rewards/chosen": -0.18241432309150696, "rewards/margins": 0.9339324235916138, "rewards/rejected": -1.116346836090088, "step": 108 }, { "epoch": 0.6459259259259259, "grad_norm": 44.98515264402161, "learning_rate": 4.0370370370370373e-07, "logits/chosen": -1.7714054584503174, "logits/rejected": -1.7249916791915894, "logps/chosen": -39.17259216308594, "logps/rejected": -56.0714111328125, "loss": 0.524, "rewards/accuracies": 0.875, "rewards/chosen": -0.29726728796958923, "rewards/margins": 1.1248726844787598, "rewards/rejected": -1.4221400022506714, "step": 109 }, { "epoch": 0.6518518518518519, "grad_norm": 44.824508106883286, "learning_rate": 4.0740740740740737e-07, "logits/chosen": -1.0125787258148193, "logits/rejected": -0.9560598134994507, "logps/chosen": -42.5981330871582, "logps/rejected": -50.66206359863281, "loss": 0.5808, "rewards/accuracies": 0.625, "rewards/chosen": -0.5798179507255554, "rewards/margins": 0.5451233386993408, "rewards/rejected": -1.124941349029541, "step": 110 }, { "epoch": 0.6577777777777778, "grad_norm": 38.90156797225312, "learning_rate": 4.1111111111111107e-07, "logits/chosen": -1.1894886493682861, "logits/rejected": -1.1232939958572388, "logps/chosen": -37.06401062011719, "logps/rejected": -47.91145324707031, "loss": 0.4815, "rewards/accuracies": 0.75, "rewards/chosen": -0.3007606565952301, "rewards/margins": 0.7585300207138062, "rewards/rejected": -1.0592906475067139, "step": 111 }, { "epoch": 0.6637037037037037, "grad_norm": 49.6237498585263, "learning_rate": 4.1481481481481476e-07, "logits/chosen": -1.2994275093078613, "logits/rejected": -1.5018212795257568, "logps/chosen": -55.228172302246094, "logps/rejected": -54.287559509277344, "loss": 0.598, "rewards/accuracies": 0.8125, "rewards/chosen": -0.679509162902832, "rewards/margins": 0.5942882299423218, "rewards/rejected": -1.2737973928451538, "step": 112 }, { "epoch": 0.6696296296296296, "grad_norm": 43.2646193758696, "learning_rate": 4.185185185185185e-07, "logits/chosen": -1.492136836051941, "logits/rejected": -1.4424383640289307, "logps/chosen": -35.74744415283203, "logps/rejected": -43.59991455078125, "loss": 0.555, "rewards/accuracies": 0.75, "rewards/chosen": -0.33821651339530945, "rewards/margins": 0.5227227210998535, "rewards/rejected": -0.8609392046928406, "step": 113 }, { "epoch": 0.6755555555555556, "grad_norm": 41.14792424525068, "learning_rate": 4.222222222222222e-07, "logits/chosen": -1.5096982717514038, "logits/rejected": -1.4714512825012207, "logps/chosen": -38.27312469482422, "logps/rejected": -44.96509552001953, "loss": 0.4681, "rewards/accuracies": 0.75, "rewards/chosen": -0.4304978847503662, "rewards/margins": 0.5347405076026917, "rewards/rejected": -0.9652383923530579, "step": 114 }, { "epoch": 0.6814814814814815, "grad_norm": 40.18814061966553, "learning_rate": 4.259259259259259e-07, "logits/chosen": -1.5355191230773926, "logits/rejected": -1.524069905281067, "logps/chosen": -43.97163391113281, "logps/rejected": -55.98404312133789, "loss": 0.4432, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2595410943031311, "rewards/margins": 1.4614770412445068, "rewards/rejected": -1.7210183143615723, "step": 115 }, { "epoch": 0.6874074074074074, "grad_norm": 34.642341791275506, "learning_rate": 4.296296296296296e-07, "logits/chosen": -1.4036482572555542, "logits/rejected": -1.3584758043289185, "logps/chosen": -39.086036682128906, "logps/rejected": -52.415191650390625, "loss": 0.4359, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47538018226623535, "rewards/margins": 0.8739761710166931, "rewards/rejected": -1.3493562936782837, "step": 116 }, { "epoch": 0.6933333333333334, "grad_norm": 35.64480707741049, "learning_rate": 4.3333333333333335e-07, "logits/chosen": -1.396484613418579, "logits/rejected": -1.3210082054138184, "logps/chosen": -44.036865234375, "logps/rejected": -54.3887939453125, "loss": 0.4008, "rewards/accuracies": 0.875, "rewards/chosen": -0.5203036069869995, "rewards/margins": 1.411538004875183, "rewards/rejected": -1.9318416118621826, "step": 117 }, { "epoch": 0.6992592592592592, "grad_norm": 38.24792362559405, "learning_rate": 4.3703703703703704e-07, "logits/chosen": -1.3976020812988281, "logits/rejected": -1.3027665615081787, "logps/chosen": -34.96048355102539, "logps/rejected": -47.282859802246094, "loss": 0.4459, "rewards/accuracies": 0.875, "rewards/chosen": -0.35238882899284363, "rewards/margins": 1.0079915523529053, "rewards/rejected": -1.3603804111480713, "step": 118 }, { "epoch": 0.7051851851851851, "grad_norm": 37.84290162399573, "learning_rate": 4.4074074074074074e-07, "logits/chosen": -1.3033220767974854, "logits/rejected": -1.2730566263198853, "logps/chosen": -37.476531982421875, "logps/rejected": -48.643516540527344, "loss": 0.4149, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3667222857475281, "rewards/margins": 1.1727958917617798, "rewards/rejected": -1.5395182371139526, "step": 119 }, { "epoch": 0.7111111111111111, "grad_norm": 37.878664188795796, "learning_rate": 4.444444444444444e-07, "logits/chosen": -0.9400476217269897, "logits/rejected": -0.7184149622917175, "logps/chosen": -31.205936431884766, "logps/rejected": -47.984012603759766, "loss": 0.4405, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4675554037094116, "rewards/margins": 0.7965797781944275, "rewards/rejected": -1.2641352415084839, "step": 120 }, { "epoch": 0.717037037037037, "grad_norm": 40.88887819751045, "learning_rate": 4.4814814814814813e-07, "logits/chosen": -1.7406163215637207, "logits/rejected": -1.7047609090805054, "logps/chosen": -42.533084869384766, "logps/rejected": -57.01613998413086, "loss": 0.4841, "rewards/accuracies": 0.75, "rewards/chosen": -0.7570239305496216, "rewards/margins": 0.736762523651123, "rewards/rejected": -1.493786334991455, "step": 121 }, { "epoch": 0.7229629629629629, "grad_norm": 35.35115029467827, "learning_rate": 4.5185185185185183e-07, "logits/chosen": -1.2737648487091064, "logits/rejected": -1.1840261220932007, "logps/chosen": -37.37749099731445, "logps/rejected": -52.0399055480957, "loss": 0.4191, "rewards/accuracies": 0.875, "rewards/chosen": -0.6177048683166504, "rewards/margins": 1.4750514030456543, "rewards/rejected": -2.0927562713623047, "step": 122 }, { "epoch": 0.7288888888888889, "grad_norm": 38.5026450550663, "learning_rate": 4.555555555555555e-07, "logits/chosen": -1.5902702808380127, "logits/rejected": -1.5567035675048828, "logps/chosen": -44.0450553894043, "logps/rejected": -52.94071960449219, "loss": 0.4411, "rewards/accuracies": 0.75, "rewards/chosen": -0.4093371033668518, "rewards/margins": 1.0345042943954468, "rewards/rejected": -1.4438413381576538, "step": 123 }, { "epoch": 0.7348148148148148, "grad_norm": 30.293538787855763, "learning_rate": 4.592592592592592e-07, "logits/chosen": -1.4934203624725342, "logits/rejected": -1.3282454013824463, "logps/chosen": -39.16450500488281, "logps/rejected": -56.53828048706055, "loss": 0.3579, "rewards/accuracies": 0.875, "rewards/chosen": -0.35691025853157043, "rewards/margins": 1.0708496570587158, "rewards/rejected": -1.4277598857879639, "step": 124 }, { "epoch": 0.7407407407407407, "grad_norm": 47.64124797635846, "learning_rate": 4.6296296296296297e-07, "logits/chosen": -1.4489716291427612, "logits/rejected": -1.516164779663086, "logps/chosen": -49.20244598388672, "logps/rejected": -52.79627990722656, "loss": 0.5055, "rewards/accuracies": 0.75, "rewards/chosen": -0.8552089929580688, "rewards/margins": 0.5824382305145264, "rewards/rejected": -1.4376472234725952, "step": 125 }, { "epoch": 0.7466666666666667, "grad_norm": 40.037576352632705, "learning_rate": 4.6666666666666666e-07, "logits/chosen": -1.3147563934326172, "logits/rejected": -1.3571640253067017, "logps/chosen": -44.24326705932617, "logps/rejected": -52.24501037597656, "loss": 0.4446, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7837151885032654, "rewards/margins": 1.0028468370437622, "rewards/rejected": -1.7865620851516724, "step": 126 }, { "epoch": 0.7525925925925926, "grad_norm": 33.59225363588131, "learning_rate": 4.7037037037037036e-07, "logits/chosen": -1.8176665306091309, "logits/rejected": -1.6222167015075684, "logps/chosen": -28.441362380981445, "logps/rejected": -46.46739959716797, "loss": 0.3895, "rewards/accuracies": 0.875, "rewards/chosen": -0.18460707366466522, "rewards/margins": 1.4571757316589355, "rewards/rejected": -1.6417827606201172, "step": 127 }, { "epoch": 0.7585185185185185, "grad_norm": 39.646954284486725, "learning_rate": 4.7407407407407405e-07, "logits/chosen": -1.2093782424926758, "logits/rejected": -1.2084648609161377, "logps/chosen": -38.774654388427734, "logps/rejected": -52.121620178222656, "loss": 0.4106, "rewards/accuracies": 0.875, "rewards/chosen": -0.5091949701309204, "rewards/margins": 1.171380877494812, "rewards/rejected": -1.6805758476257324, "step": 128 }, { "epoch": 0.7644444444444445, "grad_norm": 38.70377586651715, "learning_rate": 4.777777777777778e-07, "logits/chosen": -1.777665376663208, "logits/rejected": -1.5932849645614624, "logps/chosen": -43.61308288574219, "logps/rejected": -53.84217834472656, "loss": 0.4068, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3957401514053345, "rewards/margins": 1.026474118232727, "rewards/rejected": -1.4222142696380615, "step": 129 }, { "epoch": 0.7703703703703704, "grad_norm": 41.30978823426115, "learning_rate": 4.814814814814814e-07, "logits/chosen": -1.340857982635498, "logits/rejected": -1.3845839500427246, "logps/chosen": -41.08652114868164, "logps/rejected": -50.04093551635742, "loss": 0.4386, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6982749700546265, "rewards/margins": 0.5795612931251526, "rewards/rejected": -1.2778363227844238, "step": 130 }, { "epoch": 0.7762962962962963, "grad_norm": 46.879219229092406, "learning_rate": 4.851851851851852e-07, "logits/chosen": -1.7075390815734863, "logits/rejected": -1.839177131652832, "logps/chosen": -52.101966857910156, "logps/rejected": -49.733314514160156, "loss": 0.4896, "rewards/accuracies": 0.6875, "rewards/chosen": -0.981723427772522, "rewards/margins": 0.5867584943771362, "rewards/rejected": -1.5684819221496582, "step": 131 }, { "epoch": 0.7822222222222223, "grad_norm": 35.95083543821181, "learning_rate": 4.888888888888889e-07, "logits/chosen": -2.1524219512939453, "logits/rejected": -2.247002601623535, "logps/chosen": -46.40180206298828, "logps/rejected": -46.5478515625, "loss": 0.4148, "rewards/accuracies": 0.75, "rewards/chosen": -0.5037150979042053, "rewards/margins": 1.0674867630004883, "rewards/rejected": -1.5712018013000488, "step": 132 }, { "epoch": 0.7881481481481482, "grad_norm": 32.260601736979446, "learning_rate": 4.925925925925926e-07, "logits/chosen": -1.435699701309204, "logits/rejected": -1.4596015214920044, "logps/chosen": -41.34964370727539, "logps/rejected": -56.690032958984375, "loss": 0.3289, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5615026950836182, "rewards/margins": 1.8352725505828857, "rewards/rejected": -2.396775245666504, "step": 133 }, { "epoch": 0.794074074074074, "grad_norm": 41.73394354567568, "learning_rate": 4.962962962962963e-07, "logits/chosen": -0.9744957089424133, "logits/rejected": -0.8672415018081665, "logps/chosen": -33.1612663269043, "logps/rejected": -42.50749206542969, "loss": 0.4965, "rewards/accuracies": 1.0, "rewards/chosen": -0.4336245059967041, "rewards/margins": 1.558875560760498, "rewards/rejected": -1.9925000667572021, "step": 134 }, { "epoch": 0.8, "grad_norm": 38.10147271008432, "learning_rate": 5e-07, "logits/chosen": -1.3705247640609741, "logits/rejected": -1.5914721488952637, "logps/chosen": -47.50928497314453, "logps/rejected": -50.321815490722656, "loss": 0.4227, "rewards/accuracies": 0.875, "rewards/chosen": -0.7019506096839905, "rewards/margins": 1.1958553791046143, "rewards/rejected": -1.89780592918396, "step": 135 }, { "epoch": 0.8059259259259259, "grad_norm": 50.33738386164938, "learning_rate": 4.999991559718872e-07, "logits/chosen": -1.0675755739212036, "logits/rejected": -0.9794799089431763, "logps/chosen": -47.514888763427734, "logps/rejected": -66.84017944335938, "loss": 0.499, "rewards/accuracies": 0.875, "rewards/chosen": -0.6851686239242554, "rewards/margins": 1.2223974466323853, "rewards/rejected": -1.9075660705566406, "step": 136 }, { "epoch": 0.8118518518518518, "grad_norm": 45.70334700227829, "learning_rate": 4.999966238932478e-07, "logits/chosen": -1.2879647016525269, "logits/rejected": -1.3816440105438232, "logps/chosen": -47.61471176147461, "logps/rejected": -46.95405197143555, "loss": 0.4767, "rewards/accuracies": 0.625, "rewards/chosen": -0.6745498180389404, "rewards/margins": 0.8887066841125488, "rewards/rejected": -1.5632563829421997, "step": 137 }, { "epoch": 0.8177777777777778, "grad_norm": 39.634286154920076, "learning_rate": 4.999924037811792e-07, "logits/chosen": -1.9803012609481812, "logits/rejected": -1.9392635822296143, "logps/chosen": -45.58927917480469, "logps/rejected": -66.4295883178711, "loss": 0.4753, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5923961400985718, "rewards/margins": 1.6612188816070557, "rewards/rejected": -2.253614902496338, "step": 138 }, { "epoch": 0.8237037037037037, "grad_norm": 37.260534789973065, "learning_rate": 4.999864956641761e-07, "logits/chosen": -1.2599807977676392, "logits/rejected": -1.1629146337509155, "logps/chosen": -32.816200256347656, "logps/rejected": -40.76869583129883, "loss": 0.4061, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4357667863368988, "rewards/margins": 1.3744035959243774, "rewards/rejected": -1.810170292854309, "step": 139 }, { "epoch": 0.8296296296296296, "grad_norm": 40.00541881208066, "learning_rate": 4.99978899582132e-07, "logits/chosen": -1.776047945022583, "logits/rejected": -1.7421143054962158, "logps/chosen": -41.147705078125, "logps/rejected": -52.4171142578125, "loss": 0.4588, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5953139066696167, "rewards/margins": 1.7757620811462402, "rewards/rejected": -2.3710761070251465, "step": 140 }, { "epoch": 0.8355555555555556, "grad_norm": 42.7543024702637, "learning_rate": 4.999696155863368e-07, "logits/chosen": -1.578580379486084, "logits/rejected": -1.4482641220092773, "logps/chosen": -31.513076782226562, "logps/rejected": -43.173744201660156, "loss": 0.5108, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33485445380210876, "rewards/margins": 1.1695399284362793, "rewards/rejected": -1.5043944120407104, "step": 141 }, { "epoch": 0.8414814814814815, "grad_norm": 38.96998757756861, "learning_rate": 4.999586437394786e-07, "logits/chosen": -1.481893539428711, "logits/rejected": -1.5493358373641968, "logps/chosen": -40.78139877319336, "logps/rejected": -48.02816390991211, "loss": 0.4458, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35072338581085205, "rewards/margins": 1.8705483675003052, "rewards/rejected": -2.2212717533111572, "step": 142 }, { "epoch": 0.8474074074074074, "grad_norm": 37.872628732664225, "learning_rate": 4.999459841156414e-07, "logits/chosen": -1.2435798645019531, "logits/rejected": -1.1472171545028687, "logps/chosen": -31.227603912353516, "logps/rejected": -40.37028503417969, "loss": 0.4717, "rewards/accuracies": 0.875, "rewards/chosen": -0.36621564626693726, "rewards/margins": 1.2962467670440674, "rewards/rejected": -1.6624623537063599, "step": 143 }, { "epoch": 0.8533333333333334, "grad_norm": 38.01721990825646, "learning_rate": 4.999316368003061e-07, "logits/chosen": -1.5544506311416626, "logits/rejected": -1.362791657447815, "logps/chosen": -51.45790100097656, "logps/rejected": -60.38456726074219, "loss": 0.4668, "rewards/accuracies": 0.75, "rewards/chosen": -0.9039173126220703, "rewards/margins": 1.306924819946289, "rewards/rejected": -2.2108421325683594, "step": 144 }, { "epoch": 0.8592592592592593, "grad_norm": 47.57106677206415, "learning_rate": 4.999156018903489e-07, "logits/chosen": -1.6596348285675049, "logits/rejected": -1.6843326091766357, "logps/chosen": -47.816925048828125, "logps/rejected": -50.309932708740234, "loss": 0.5428, "rewards/accuracies": 0.5, "rewards/chosen": -0.5581059455871582, "rewards/margins": 0.2512357831001282, "rewards/rejected": -0.8093417286872864, "step": 145 }, { "epoch": 0.8651851851851852, "grad_norm": 35.787352860041, "learning_rate": 4.998978794940411e-07, "logits/chosen": -1.1359593868255615, "logits/rejected": -1.2077702283859253, "logps/chosen": -47.62373733520508, "logps/rejected": -45.07956314086914, "loss": 0.394, "rewards/accuracies": 0.875, "rewards/chosen": -0.30160394310951233, "rewards/margins": 1.6038706302642822, "rewards/rejected": -1.9054745435714722, "step": 146 }, { "epoch": 0.8711111111111111, "grad_norm": 42.18929063216021, "learning_rate": 4.998784697310482e-07, "logits/chosen": -1.1696794033050537, "logits/rejected": -1.4276272058486938, "logps/chosen": -49.076904296875, "logps/rejected": -47.925071716308594, "loss": 0.4435, "rewards/accuracies": 0.75, "rewards/chosen": -0.44560256600379944, "rewards/margins": 1.376593828201294, "rewards/rejected": -1.822196364402771, "step": 147 }, { "epoch": 0.8770370370370371, "grad_norm": 36.04716697331485, "learning_rate": 4.998573727324294e-07, "logits/chosen": -1.4678583145141602, "logits/rejected": -1.1020907163619995, "logps/chosen": -39.27702331542969, "logps/rejected": -70.841552734375, "loss": 0.4088, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4344024658203125, "rewards/margins": 2.001018524169922, "rewards/rejected": -2.4354209899902344, "step": 148 }, { "epoch": 0.882962962962963, "grad_norm": 34.61517859729992, "learning_rate": 4.998345886406365e-07, "logits/chosen": -1.5456968545913696, "logits/rejected": -1.5202821493148804, "logps/chosen": -36.613433837890625, "logps/rejected": -42.71227264404297, "loss": 0.4214, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25451546907424927, "rewards/margins": 0.7433053255081177, "rewards/rejected": -0.9978208541870117, "step": 149 }, { "epoch": 0.8888888888888888, "grad_norm": 37.1698250651998, "learning_rate": 4.998101176095128e-07, "logits/chosen": -1.3398644924163818, "logits/rejected": -1.3869026899337769, "logps/chosen": -40.0852165222168, "logps/rejected": -55.97785186767578, "loss": 0.4379, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5003520846366882, "rewards/margins": 1.8009438514709473, "rewards/rejected": -2.3012959957122803, "step": 150 }, { "epoch": 0.8948148148148148, "grad_norm": 36.86123772415135, "learning_rate": 4.997839598042919e-07, "logits/chosen": -2.002145290374756, "logits/rejected": -1.9517096281051636, "logps/chosen": -41.4112663269043, "logps/rejected": -51.83033752441406, "loss": 0.3932, "rewards/accuracies": 0.875, "rewards/chosen": -0.4211091101169586, "rewards/margins": 1.41811203956604, "rewards/rejected": -1.8392211198806763, "step": 151 }, { "epoch": 0.9007407407407407, "grad_norm": 44.396493964573885, "learning_rate": 4.997561154015975e-07, "logits/chosen": -1.787272334098816, "logits/rejected": -1.758475422859192, "logps/chosen": -36.319149017333984, "logps/rejected": -44.939476013183594, "loss": 0.5036, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45524048805236816, "rewards/margins": 0.7919211387634277, "rewards/rejected": -1.247161626815796, "step": 152 }, { "epoch": 0.9066666666666666, "grad_norm": 36.380787063292495, "learning_rate": 4.997265845894411e-07, "logits/chosen": -1.5731865167617798, "logits/rejected": -1.6401747465133667, "logps/chosen": -48.35523986816406, "logps/rejected": -41.2987174987793, "loss": 0.3815, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23987555503845215, "rewards/margins": 1.1017296314239502, "rewards/rejected": -1.3416051864624023, "step": 153 }, { "epoch": 0.9125925925925926, "grad_norm": 39.813495349904564, "learning_rate": 4.996953675672213e-07, "logits/chosen": -1.2417954206466675, "logits/rejected": -1.1722739934921265, "logps/chosen": -39.34113693237305, "logps/rejected": -47.16643524169922, "loss": 0.4529, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45460134744644165, "rewards/margins": 0.8995037078857422, "rewards/rejected": -1.354104995727539, "step": 154 }, { "epoch": 0.9185185185185185, "grad_norm": 33.59176857625763, "learning_rate": 4.996624645457227e-07, "logits/chosen": -1.565160870552063, "logits/rejected": -1.8368042707443237, "logps/chosen": -48.77574157714844, "logps/rejected": -49.07792663574219, "loss": 0.3968, "rewards/accuracies": 0.875, "rewards/chosen": -0.27005040645599365, "rewards/margins": 1.148226261138916, "rewards/rejected": -1.4182766675949097, "step": 155 }, { "epoch": 0.9244444444444444, "grad_norm": 32.73969782640533, "learning_rate": 4.996278757471138e-07, "logits/chosen": -1.9420874118804932, "logits/rejected": -1.9092347621917725, "logps/chosen": -36.45257568359375, "logps/rejected": -46.10245895385742, "loss": 0.3288, "rewards/accuracies": 1.0, "rewards/chosen": -0.2543891966342926, "rewards/margins": 2.042929172515869, "rewards/rejected": -2.297318458557129, "step": 156 }, { "epoch": 0.9303703703703704, "grad_norm": 37.40978956784082, "learning_rate": 4.995916014049461e-07, "logits/chosen": -1.2158647775650024, "logits/rejected": -1.3647968769073486, "logps/chosen": -55.8748779296875, "logps/rejected": -57.6270751953125, "loss": 0.4138, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7079750895500183, "rewards/margins": 0.7428067922592163, "rewards/rejected": -1.4507818222045898, "step": 157 }, { "epoch": 0.9362962962962963, "grad_norm": 42.46766006074291, "learning_rate": 4.995536417641517e-07, "logits/chosen": -1.6255074739456177, "logits/rejected": -1.534214973449707, "logps/chosen": -37.175628662109375, "logps/rejected": -48.801025390625, "loss": 0.4694, "rewards/accuracies": 0.75, "rewards/chosen": -0.2790091633796692, "rewards/margins": 1.2557103633880615, "rewards/rejected": -1.534719467163086, "step": 158 }, { "epoch": 0.9422222222222222, "grad_norm": 35.427791373582686, "learning_rate": 4.99513997081043e-07, "logits/chosen": -1.4098460674285889, "logits/rejected": -1.2685893774032593, "logps/chosen": -38.03111267089844, "logps/rejected": -54.027000427246094, "loss": 0.3986, "rewards/accuracies": 0.875, "rewards/chosen": -0.5155774354934692, "rewards/margins": 1.0906703472137451, "rewards/rejected": -1.606247901916504, "step": 159 }, { "epoch": 0.9481481481481482, "grad_norm": 39.60980028499697, "learning_rate": 4.994726676233097e-07, "logits/chosen": -0.8466954231262207, "logits/rejected": -0.7026901245117188, "logps/chosen": -49.040225982666016, "logps/rejected": -59.15562438964844, "loss": 0.3939, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6523393392562866, "rewards/margins": 0.9394184350967407, "rewards/rejected": -1.5917577743530273, "step": 160 }, { "epoch": 0.9540740740740741, "grad_norm": 42.71341893793309, "learning_rate": 4.994296536700177e-07, "logits/chosen": -1.6049985885620117, "logits/rejected": -1.6918436288833618, "logps/chosen": -47.31741714477539, "logps/rejected": -60.05731964111328, "loss": 0.4019, "rewards/accuracies": 0.875, "rewards/chosen": -0.4310283064842224, "rewards/margins": 1.8012323379516602, "rewards/rejected": -2.2322607040405273, "step": 161 }, { "epoch": 0.96, "grad_norm": 35.9928869034378, "learning_rate": 4.993849555116066e-07, "logits/chosen": -1.5866928100585938, "logits/rejected": -1.4496181011199951, "logps/chosen": -29.258974075317383, "logps/rejected": -39.82886505126953, "loss": 0.3986, "rewards/accuracies": 0.75, "rewards/chosen": -0.02396818995475769, "rewards/margins": 0.8717716932296753, "rewards/rejected": -0.8957399129867554, "step": 162 }, { "epoch": 0.965925925925926, "grad_norm": 25.847460352007364, "learning_rate": 4.993385734498887e-07, "logits/chosen": -1.755631446838379, "logits/rejected": -1.581370234489441, "logps/chosen": -35.742820739746094, "logps/rejected": -57.367156982421875, "loss": 0.2876, "rewards/accuracies": 1.0, "rewards/chosen": -0.23074620962142944, "rewards/margins": 2.2658233642578125, "rewards/rejected": -2.4965696334838867, "step": 163 }, { "epoch": 0.9718518518518519, "grad_norm": 42.879579981836365, "learning_rate": 4.992905077980461e-07, "logits/chosen": -1.7979357242584229, "logits/rejected": -1.8016386032104492, "logps/chosen": -48.4314079284668, "logps/rejected": -55.05060577392578, "loss": 0.4501, "rewards/accuracies": 0.75, "rewards/chosen": -0.49184557795524597, "rewards/margins": 1.798371434211731, "rewards/rejected": -2.2902169227600098, "step": 164 }, { "epoch": 0.9777777777777777, "grad_norm": 40.09049915958416, "learning_rate": 4.992407588806287e-07, "logits/chosen": -1.562727451324463, "logits/rejected": -1.552638292312622, "logps/chosen": -36.466468811035156, "logps/rejected": -49.57290267944336, "loss": 0.4016, "rewards/accuracies": 0.875, "rewards/chosen": -0.43339627981185913, "rewards/margins": 1.6709353923797607, "rewards/rejected": -2.1043317317962646, "step": 165 }, { "epoch": 0.9837037037037037, "grad_norm": 33.4817708291202, "learning_rate": 4.991893270335525e-07, "logits/chosen": -1.164905309677124, "logits/rejected": -1.1139239072799683, "logps/chosen": -31.748462677001953, "logps/rejected": -54.49275588989258, "loss": 0.3344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.49971821904182434, "rewards/margins": 1.8249340057373047, "rewards/rejected": -2.3246521949768066, "step": 166 }, { "epoch": 0.9896296296296296, "grad_norm": 36.48838262716427, "learning_rate": 4.991362126040969e-07, "logits/chosen": -1.425585150718689, "logits/rejected": -1.3379764556884766, "logps/chosen": -32.85935974121094, "logps/rejected": -50.20701217651367, "loss": 0.4296, "rewards/accuracies": 0.6875, "rewards/chosen": 0.024861067533493042, "rewards/margins": 0.9732530117034912, "rewards/rejected": -0.9483919143676758, "step": 167 }, { "epoch": 0.9955555555555555, "grad_norm": 34.12565710662784, "learning_rate": 4.990814159509024e-07, "logits/chosen": -1.4324623346328735, "logits/rejected": -1.4954330921173096, "logps/chosen": -42.89631652832031, "logps/rejected": -38.344970703125, "loss": 0.3861, "rewards/accuracies": 0.875, "rewards/chosen": -0.4475001394748688, "rewards/margins": 1.3485389947891235, "rewards/rejected": -1.7960389852523804, "step": 168 }, { "epoch": 1.0014814814814814, "grad_norm": 38.32760108321501, "learning_rate": 4.990249374439684e-07, "logits/chosen": -1.5158495903015137, "logits/rejected": -1.534022331237793, "logps/chosen": -31.372514724731445, "logps/rejected": -46.01219177246094, "loss": 0.3269, "rewards/accuracies": 1.0, "rewards/chosen": -0.012081414461135864, "rewards/margins": 1.969221830368042, "rewards/rejected": -1.9813032150268555, "step": 169 }, { "epoch": 1.0074074074074073, "grad_norm": 34.96121150467421, "learning_rate": 4.989667774646505e-07, "logits/chosen": -1.1385910511016846, "logits/rejected": -1.3404728174209595, "logps/chosen": -46.99889373779297, "logps/rejected": -44.828121185302734, "loss": 0.381, "rewards/accuracies": 0.75, "rewards/chosen": -0.45912593603134155, "rewards/margins": 1.4480329751968384, "rewards/rejected": -1.9071589708328247, "step": 170 }, { "epoch": 1.0133333333333334, "grad_norm": 38.09115794530134, "learning_rate": 4.989069364056579e-07, "logits/chosen": -1.1056678295135498, "logits/rejected": -1.3960037231445312, "logps/chosen": -42.210548400878906, "logps/rejected": -36.17823791503906, "loss": 0.3765, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5556241273880005, "rewards/margins": 1.0674997568130493, "rewards/rejected": -1.6231238842010498, "step": 171 }, { "epoch": 1.0192592592592593, "grad_norm": 25.250747618417908, "learning_rate": 4.98845414671051e-07, "logits/chosen": -1.8973287343978882, "logits/rejected": -1.8729844093322754, "logps/chosen": -38.14885711669922, "logps/rejected": -49.52397155761719, "loss": 0.2812, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33738794922828674, "rewards/margins": 1.8423973321914673, "rewards/rejected": -2.1797852516174316, "step": 172 }, { "epoch": 1.0251851851851852, "grad_norm": 24.64598403479892, "learning_rate": 4.987822126762382e-07, "logits/chosen": -1.416272521018982, "logits/rejected": -1.329006552696228, "logps/chosen": -43.28013610839844, "logps/rejected": -55.639705657958984, "loss": 0.2011, "rewards/accuracies": 1.0, "rewards/chosen": -0.20702561736106873, "rewards/margins": 2.411742687225342, "rewards/rejected": -2.6187682151794434, "step": 173 }, { "epoch": 1.031111111111111, "grad_norm": 28.853682463289353, "learning_rate": 4.987173308479737e-07, "logits/chosen": -1.2722222805023193, "logits/rejected": -1.28517484664917, "logps/chosen": -39.79129409790039, "logps/rejected": -56.42878723144531, "loss": 0.2956, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14098718762397766, "rewards/margins": 2.1605212688446045, "rewards/rejected": -2.3015084266662598, "step": 174 }, { "epoch": 1.037037037037037, "grad_norm": 31.102293680811048, "learning_rate": 4.986507696243543e-07, "logits/chosen": -1.4579617977142334, "logits/rejected": -1.309941291809082, "logps/chosen": -37.566322326660156, "logps/rejected": -54.842105865478516, "loss": 0.3111, "rewards/accuracies": 0.875, "rewards/chosen": -0.3267005681991577, "rewards/margins": 2.0936899185180664, "rewards/rejected": -2.4203903675079346, "step": 175 }, { "epoch": 1.0429629629629629, "grad_norm": 33.13459931935171, "learning_rate": 4.985825294548162e-07, "logits/chosen": -1.4088029861450195, "logits/rejected": -1.4467169046401978, "logps/chosen": -47.83537292480469, "logps/rejected": -52.668922424316406, "loss": 0.3269, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2606503367424011, "rewards/margins": 2.440474033355713, "rewards/rejected": -2.7011241912841797, "step": 176 }, { "epoch": 1.048888888888889, "grad_norm": 35.121879505889765, "learning_rate": 4.985126108001323e-07, "logits/chosen": -1.266465425491333, "logits/rejected": -1.1826586723327637, "logps/chosen": -39.725948333740234, "logps/rejected": -57.177398681640625, "loss": 0.3317, "rewards/accuracies": 1.0, "rewards/chosen": -0.37341824173927307, "rewards/margins": 2.5008201599121094, "rewards/rejected": -2.8742384910583496, "step": 177 }, { "epoch": 1.0548148148148149, "grad_norm": 37.244016043977574, "learning_rate": 4.984410141324092e-07, "logits/chosen": -2.1158571243286133, "logits/rejected": -1.9198758602142334, "logps/chosen": -43.933040618896484, "logps/rejected": -52.21183395385742, "loss": 0.3642, "rewards/accuracies": 0.875, "rewards/chosen": -0.28503745794296265, "rewards/margins": 1.6764382123947144, "rewards/rejected": -1.9614756107330322, "step": 178 }, { "epoch": 1.0607407407407408, "grad_norm": 27.827943454765062, "learning_rate": 4.983677399350838e-07, "logits/chosen": -1.3472518920898438, "logits/rejected": -1.069368600845337, "logps/chosen": -37.077884674072266, "logps/rejected": -61.32925033569336, "loss": 0.2883, "rewards/accuracies": 1.0, "rewards/chosen": -0.5079047679901123, "rewards/margins": 2.298504590988159, "rewards/rejected": -2.8064093589782715, "step": 179 }, { "epoch": 1.0666666666666667, "grad_norm": 35.482747269625385, "learning_rate": 4.982927887029197e-07, "logits/chosen": -1.5738056898117065, "logits/rejected": -1.5168110132217407, "logps/chosen": -40.75530242919922, "logps/rejected": -53.785465240478516, "loss": 0.3638, "rewards/accuracies": 0.8125, "rewards/chosen": -0.196524977684021, "rewards/margins": 1.446781039237976, "rewards/rejected": -1.643306016921997, "step": 180 }, { "epoch": 1.0725925925925925, "grad_norm": 33.393710024973245, "learning_rate": 4.982161609420047e-07, "logits/chosen": -1.8916293382644653, "logits/rejected": -1.541512131690979, "logps/chosen": -39.467071533203125, "logps/rejected": -70.18623352050781, "loss": 0.28, "rewards/accuracies": 0.875, "rewards/chosen": -0.2709190845489502, "rewards/margins": 2.419748306274414, "rewards/rejected": -2.6906673908233643, "step": 181 }, { "epoch": 1.0785185185185184, "grad_norm": 27.059206764761488, "learning_rate": 4.981378571697466e-07, "logits/chosen": -1.5622146129608154, "logits/rejected": -1.5354870557785034, "logps/chosen": -38.11195755004883, "logps/rejected": -46.43415451049805, "loss": 0.3158, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3464301526546478, "rewards/margins": 1.3460371494293213, "rewards/rejected": -1.692467212677002, "step": 182 }, { "epoch": 1.0844444444444445, "grad_norm": 37.99289989125244, "learning_rate": 4.980578779148702e-07, "logits/chosen": -1.675439476966858, "logits/rejected": -1.599168062210083, "logps/chosen": -35.517608642578125, "logps/rejected": -49.9504508972168, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": 0.1147196888923645, "rewards/margins": 2.2086353302001953, "rewards/rejected": -2.0939157009124756, "step": 183 }, { "epoch": 1.0903703703703704, "grad_norm": 29.913772956316752, "learning_rate": 4.979762237174131e-07, "logits/chosen": -2.1450929641723633, "logits/rejected": -2.026754856109619, "logps/chosen": -36.079044342041016, "logps/rejected": -56.236167907714844, "loss": 0.2973, "rewards/accuracies": 0.9375, "rewards/chosen": -0.17820218205451965, "rewards/margins": 1.8444724082946777, "rewards/rejected": -2.022674560546875, "step": 184 }, { "epoch": 1.0962962962962963, "grad_norm": 30.99605878821647, "learning_rate": 4.978928951287232e-07, "logits/chosen": -1.5684118270874023, "logits/rejected": -1.4883699417114258, "logps/chosen": -51.95133972167969, "logps/rejected": -68.82379150390625, "loss": 0.275, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3810669183731079, "rewards/margins": 2.429225444793701, "rewards/rejected": -2.8102922439575195, "step": 185 }, { "epoch": 1.1022222222222222, "grad_norm": 27.655362320953017, "learning_rate": 4.978078927114535e-07, "logits/chosen": -1.1995656490325928, "logits/rejected": -1.1525938510894775, "logps/chosen": -31.533584594726562, "logps/rejected": -43.609130859375, "loss": 0.2864, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19067591428756714, "rewards/margins": 1.482248306274414, "rewards/rejected": -1.672924280166626, "step": 186 }, { "epoch": 1.108148148148148, "grad_norm": 34.748921344170064, "learning_rate": 4.977212170395597e-07, "logits/chosen": -1.595931887626648, "logits/rejected": -1.460189938545227, "logps/chosen": -41.28904724121094, "logps/rejected": -53.500274658203125, "loss": 0.3198, "rewards/accuracies": 0.75, "rewards/chosen": -0.2814917266368866, "rewards/margins": 1.909599781036377, "rewards/rejected": -2.191091537475586, "step": 187 }, { "epoch": 1.114074074074074, "grad_norm": 33.03290755090235, "learning_rate": 4.976328686982954e-07, "logits/chosen": -1.4642574787139893, "logits/rejected": -1.4906011819839478, "logps/chosen": -36.64849090576172, "logps/rejected": -45.48463439941406, "loss": 0.3067, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2857409417629242, "rewards/margins": 1.9576942920684814, "rewards/rejected": -2.2434353828430176, "step": 188 }, { "epoch": 1.12, "grad_norm": 29.67228463711039, "learning_rate": 4.975428482842082e-07, "logits/chosen": -2.017104148864746, "logits/rejected": -1.6186161041259766, "logps/chosen": -37.26373291015625, "logps/rejected": -61.593345642089844, "loss": 0.3102, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1307264268398285, "rewards/margins": 1.523173213005066, "rewards/rejected": -1.6538997888565063, "step": 189 }, { "epoch": 1.125925925925926, "grad_norm": 26.614271321646097, "learning_rate": 4.974511564051367e-07, "logits/chosen": -1.2072570323944092, "logits/rejected": -1.2738533020019531, "logps/chosen": -35.82682800292969, "logps/rejected": -47.37385940551758, "loss": 0.2503, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11302122473716736, "rewards/margins": 1.6395896673202515, "rewards/rejected": -1.7526109218597412, "step": 190 }, { "epoch": 1.1318518518518519, "grad_norm": 35.09137385840197, "learning_rate": 4.973577936802046e-07, "logits/chosen": -1.2088086605072021, "logits/rejected": -1.0792959928512573, "logps/chosen": -40.28418731689453, "logps/rejected": -49.54509735107422, "loss": 0.3661, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4909687936306, "rewards/margins": 1.4177656173706055, "rewards/rejected": -1.9087340831756592, "step": 191 }, { "epoch": 1.1377777777777778, "grad_norm": 30.378844905181477, "learning_rate": 4.972627607398182e-07, "logits/chosen": -1.9133532047271729, "logits/rejected": -1.7833731174468994, "logps/chosen": -48.8740348815918, "logps/rejected": -58.93412780761719, "loss": 0.3391, "rewards/accuracies": 0.875, "rewards/chosen": -0.2783634662628174, "rewards/margins": 1.4361963272094727, "rewards/rejected": -1.71455979347229, "step": 192 }, { "epoch": 1.1437037037037037, "grad_norm": 32.45747667996871, "learning_rate": 4.971660582256614e-07, "logits/chosen": -1.4115142822265625, "logits/rejected": -1.4700024127960205, "logps/chosen": -36.24808120727539, "logps/rejected": -39.952327728271484, "loss": 0.2917, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4845938980579376, "rewards/margins": 1.4569411277770996, "rewards/rejected": -1.9415351152420044, "step": 193 }, { "epoch": 1.1496296296296296, "grad_norm": 33.14067947652267, "learning_rate": 4.970676867906911e-07, "logits/chosen": -1.3633407354354858, "logits/rejected": -1.257892370223999, "logps/chosen": -42.489227294921875, "logps/rejected": -60.33306121826172, "loss": 0.2577, "rewards/accuracies": 1.0, "rewards/chosen": -0.31740912795066833, "rewards/margins": 2.823535680770874, "rewards/rejected": -3.140944719314575, "step": 194 }, { "epoch": 1.1555555555555554, "grad_norm": 22.44088258779626, "learning_rate": 4.969676470991335e-07, "logits/chosen": -1.6855206489562988, "logits/rejected": -1.5853474140167236, "logps/chosen": -40.9166259765625, "logps/rejected": -58.58363342285156, "loss": 0.2222, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3480229079723358, "rewards/margins": 2.3396825790405273, "rewards/rejected": -2.6877055168151855, "step": 195 }, { "epoch": 1.1614814814814816, "grad_norm": 30.19027524330909, "learning_rate": 4.96865939826479e-07, "logits/chosen": -1.4181514978408813, "logits/rejected": -1.352044701576233, "logps/chosen": -47.38768005371094, "logps/rejected": -50.862403869628906, "loss": 0.2625, "rewards/accuracies": 0.9375, "rewards/chosen": -0.32805755734443665, "rewards/margins": 2.48689341545105, "rewards/rejected": -2.814950942993164, "step": 196 }, { "epoch": 1.1674074074074074, "grad_norm": 33.63843769566052, "learning_rate": 4.967625656594781e-07, "logits/chosen": -1.059614896774292, "logits/rejected": -1.1472609043121338, "logps/chosen": -46.473411560058594, "logps/rejected": -47.90194320678711, "loss": 0.3335, "rewards/accuracies": 0.875, "rewards/chosen": -0.664130449295044, "rewards/margins": 1.3695776462554932, "rewards/rejected": -2.033708095550537, "step": 197 }, { "epoch": 1.1733333333333333, "grad_norm": 25.93318598058472, "learning_rate": 4.966575252961365e-07, "logits/chosen": -0.9099617004394531, "logits/rejected": -1.086199402809143, "logps/chosen": -41.337745666503906, "logps/rejected": -47.416107177734375, "loss": 0.2587, "rewards/accuracies": 0.875, "rewards/chosen": -0.344178169965744, "rewards/margins": 2.067307472229004, "rewards/rejected": -2.4114856719970703, "step": 198 }, { "epoch": 1.1792592592592592, "grad_norm": 26.412082223409303, "learning_rate": 4.9655081944571e-07, "logits/chosen": -1.1102447509765625, "logits/rejected": -1.0874698162078857, "logps/chosen": -37.87491989135742, "logps/rejected": -44.73563003540039, "loss": 0.259, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09453117847442627, "rewards/margins": 1.5228742361068726, "rewards/rejected": -1.6174055337905884, "step": 199 }, { "epoch": 1.1851851851851851, "grad_norm": 30.713958497949275, "learning_rate": 4.964424488287009e-07, "logits/chosen": -1.3750660419464111, "logits/rejected": -1.5156819820404053, "logps/chosen": -41.877891540527344, "logps/rejected": -48.9142951965332, "loss": 0.3177, "rewards/accuracies": 0.875, "rewards/chosen": -0.21863913536071777, "rewards/margins": 1.9450185298919678, "rewards/rejected": -2.1636576652526855, "step": 200 }, { "epoch": 1.1911111111111112, "grad_norm": 31.035229497524096, "learning_rate": 4.963324141768518e-07, "logits/chosen": -1.5514556169509888, "logits/rejected": -1.433958649635315, "logps/chosen": -45.017242431640625, "logps/rejected": -59.65779113769531, "loss": 0.2587, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7274824976921082, "rewards/margins": 1.8945391178131104, "rewards/rejected": -2.6220216751098633, "step": 201 }, { "epoch": 1.1970370370370371, "grad_norm": 25.89090435294919, "learning_rate": 4.962207162331414e-07, "logits/chosen": -1.4618628025054932, "logits/rejected": -1.3739463090896606, "logps/chosen": -34.74220275878906, "logps/rejected": -49.502899169921875, "loss": 0.245, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1882825791835785, "rewards/margins": 2.2607545852661133, "rewards/rejected": -2.4490370750427246, "step": 202 }, { "epoch": 1.202962962962963, "grad_norm": 32.42192756007002, "learning_rate": 4.961073557517792e-07, "logits/chosen": -1.6199530363082886, "logits/rejected": -1.6937488317489624, "logps/chosen": -33.535919189453125, "logps/rejected": -38.97394561767578, "loss": 0.3269, "rewards/accuracies": 0.75, "rewards/chosen": -0.28588587045669556, "rewards/margins": 1.5164083242416382, "rewards/rejected": -1.8022942543029785, "step": 203 }, { "epoch": 1.208888888888889, "grad_norm": 40.287715082939165, "learning_rate": 4.95992333498201e-07, "logits/chosen": -1.2144708633422852, "logits/rejected": -1.1612203121185303, "logps/chosen": -41.554542541503906, "logps/rejected": -54.438270568847656, "loss": 0.4155, "rewards/accuracies": 0.625, "rewards/chosen": -0.6244699954986572, "rewards/margins": 1.4721695184707642, "rewards/rejected": -2.096639633178711, "step": 204 }, { "epoch": 1.2148148148148148, "grad_norm": 31.12088690243942, "learning_rate": 4.958756502490626e-07, "logits/chosen": -1.4652817249298096, "logits/rejected": -1.3764071464538574, "logps/chosen": -36.79831314086914, "logps/rejected": -59.886383056640625, "loss": 0.2911, "rewards/accuracies": 0.9375, "rewards/chosen": -0.30859121680259705, "rewards/margins": 2.757361888885498, "rewards/rejected": -3.065952777862549, "step": 205 }, { "epoch": 1.2207407407407407, "grad_norm": 30.460892608308736, "learning_rate": 4.957573067922359e-07, "logits/chosen": -1.7502586841583252, "logits/rejected": -1.4709583520889282, "logps/chosen": -36.0119514465332, "logps/rejected": -51.99536895751953, "loss": 0.3284, "rewards/accuracies": 0.875, "rewards/chosen": -0.36890193819999695, "rewards/margins": 1.593143343925476, "rewards/rejected": -1.9620450735092163, "step": 206 }, { "epoch": 1.2266666666666666, "grad_norm": 31.368572219016052, "learning_rate": 4.956373039268021e-07, "logits/chosen": -0.9687676429748535, "logits/rejected": -1.1372796297073364, "logps/chosen": -40.27385711669922, "logps/rejected": -53.913265228271484, "loss": 0.3021, "rewards/accuracies": 0.875, "rewards/chosen": -0.39367496967315674, "rewards/margins": 3.2817864418029785, "rewards/rejected": -3.6754612922668457, "step": 207 }, { "epoch": 1.2325925925925927, "grad_norm": 28.33249165288491, "learning_rate": 4.955156424630479e-07, "logits/chosen": -1.431084394454956, "logits/rejected": -1.2873094081878662, "logps/chosen": -38.09033203125, "logps/rejected": -54.86888122558594, "loss": 0.268, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4929051399230957, "rewards/margins": 2.4378890991210938, "rewards/rejected": -2.9307942390441895, "step": 208 }, { "epoch": 1.2385185185185186, "grad_norm": 32.08230351680351, "learning_rate": 4.953923232224586e-07, "logits/chosen": -1.2549279928207397, "logits/rejected": -1.2246358394622803, "logps/chosen": -37.1239128112793, "logps/rejected": -43.20243453979492, "loss": 0.3344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21321409940719604, "rewards/margins": 1.9452736377716064, "rewards/rejected": -2.1584877967834473, "step": 209 }, { "epoch": 1.2444444444444445, "grad_norm": 22.865961224797655, "learning_rate": 4.952673470377137e-07, "logits/chosen": -1.3631465435028076, "logits/rejected": -1.389848232269287, "logps/chosen": -34.06418991088867, "logps/rejected": -60.43558120727539, "loss": 0.208, "rewards/accuracies": 0.875, "rewards/chosen": 0.04666715860366821, "rewards/margins": 2.7804617881774902, "rewards/rejected": -2.733794689178467, "step": 210 }, { "epoch": 1.2503703703703704, "grad_norm": 33.18444614021853, "learning_rate": 4.951407147526803e-07, "logits/chosen": -1.26889169216156, "logits/rejected": -1.262638807296753, "logps/chosen": -44.48564910888672, "logps/rejected": -50.11359405517578, "loss": 0.322, "rewards/accuracies": 0.75, "rewards/chosen": -0.3982480466365814, "rewards/margins": 2.0993640422821045, "rewards/rejected": -2.4976119995117188, "step": 211 }, { "epoch": 1.2562962962962962, "grad_norm": 32.15101957433149, "learning_rate": 4.950124272224082e-07, "logits/chosen": -1.0472859144210815, "logits/rejected": -0.9838506579399109, "logps/chosen": -44.68108367919922, "logps/rejected": -52.78573226928711, "loss": 0.3157, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46195846796035767, "rewards/margins": 1.9016549587249756, "rewards/rejected": -2.3636133670806885, "step": 212 }, { "epoch": 1.2622222222222224, "grad_norm": 27.576004007465258, "learning_rate": 4.948824853131236e-07, "logits/chosen": -0.8029367923736572, "logits/rejected": -0.7601336240768433, "logps/chosen": -37.77496337890625, "logps/rejected": -43.76077651977539, "loss": 0.2837, "rewards/accuracies": 1.0, "rewards/chosen": -0.45464155077934265, "rewards/margins": 1.4948298931121826, "rewards/rejected": -1.9494714736938477, "step": 213 }, { "epoch": 1.268148148148148, "grad_norm": 31.02135986952652, "learning_rate": 4.947508899022234e-07, "logits/chosen": -1.3477468490600586, "logits/rejected": -1.4308537244796753, "logps/chosen": -32.83580017089844, "logps/rejected": -36.56342315673828, "loss": 0.2527, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05743288993835449, "rewards/margins": 1.7676852941513062, "rewards/rejected": -1.7102524042129517, "step": 214 }, { "epoch": 1.2740740740740741, "grad_norm": 34.88556914925436, "learning_rate": 4.946176418782698e-07, "logits/chosen": -2.1566824913024902, "logits/rejected": -2.090806484222412, "logps/chosen": -46.413124084472656, "logps/rejected": -66.00543975830078, "loss": 0.2969, "rewards/accuracies": 0.75, "rewards/chosen": -0.8992799520492554, "rewards/margins": 2.2176101207733154, "rewards/rejected": -3.1168899536132812, "step": 215 }, { "epoch": 1.28, "grad_norm": 35.18732477543249, "learning_rate": 4.944827421409829e-07, "logits/chosen": -1.181545376777649, "logits/rejected": -1.3015544414520264, "logps/chosen": -49.637779235839844, "logps/rejected": -57.909244537353516, "loss": 0.326, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8887189030647278, "rewards/margins": 1.9327178001403809, "rewards/rejected": -2.821436643600464, "step": 216 }, { "epoch": 1.285925925925926, "grad_norm": 24.591401843178655, "learning_rate": 4.943461916012363e-07, "logits/chosen": -1.474562168121338, "logits/rejected": -1.4195574522018433, "logps/chosen": -43.157752990722656, "logps/rejected": -63.577606201171875, "loss": 0.1874, "rewards/accuracies": 1.0, "rewards/chosen": -0.028524503111839294, "rewards/margins": 3.577744483947754, "rewards/rejected": -3.606269359588623, "step": 217 }, { "epoch": 1.2918518518518518, "grad_norm": 33.2216535916157, "learning_rate": 4.9420799118105e-07, "logits/chosen": -1.2040375471115112, "logits/rejected": -1.3402186632156372, "logps/chosen": -40.10552215576172, "logps/rejected": -47.71405792236328, "loss": 0.3211, "rewards/accuracies": 0.875, "rewards/chosen": -0.4178282916545868, "rewards/margins": 2.3589730262756348, "rewards/rejected": -2.776801347732544, "step": 218 }, { "epoch": 1.2977777777777777, "grad_norm": 24.75619541002914, "learning_rate": 4.940681418135843e-07, "logits/chosen": -1.5321645736694336, "logits/rejected": -1.4269739389419556, "logps/chosen": -30.008689880371094, "logps/rejected": -57.94854736328125, "loss": 0.2077, "rewards/accuracies": 0.875, "rewards/chosen": -0.2954857349395752, "rewards/margins": 3.1439101696014404, "rewards/rejected": -3.4393959045410156, "step": 219 }, { "epoch": 1.3037037037037038, "grad_norm": 32.2244205728105, "learning_rate": 4.939266444431335e-07, "logits/chosen": -1.1552932262420654, "logits/rejected": -0.8443745374679565, "logps/chosen": -37.77070617675781, "logps/rejected": -65.06275177001953, "loss": 0.3124, "rewards/accuracies": 0.9375, "rewards/chosen": -0.43534815311431885, "rewards/margins": 2.6500864028930664, "rewards/rejected": -3.0854344367980957, "step": 220 }, { "epoch": 1.3096296296296297, "grad_norm": 34.31075867463383, "learning_rate": 4.937835000251197e-07, "logits/chosen": -1.6441656351089478, "logits/rejected": -1.4223228693008423, "logps/chosen": -39.33687210083008, "logps/rejected": -60.138710021972656, "loss": 0.3128, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6635284423828125, "rewards/margins": 2.1143033504486084, "rewards/rejected": -2.777831792831421, "step": 221 }, { "epoch": 1.3155555555555556, "grad_norm": 24.691079671097242, "learning_rate": 4.936387095260863e-07, "logits/chosen": -1.527470350265503, "logits/rejected": -1.349807620048523, "logps/chosen": -31.314828872680664, "logps/rejected": -60.759063720703125, "loss": 0.2295, "rewards/accuracies": 1.0, "rewards/chosen": -0.4836583435535431, "rewards/margins": 3.3463735580444336, "rewards/rejected": -3.8300321102142334, "step": 222 }, { "epoch": 1.3214814814814815, "grad_norm": 29.27341357309857, "learning_rate": 4.934922739236912e-07, "logits/chosen": -1.265206217765808, "logits/rejected": -1.2071069478988647, "logps/chosen": -33.34834289550781, "logps/rejected": -54.05493927001953, "loss": 0.2658, "rewards/accuracies": 0.875, "rewards/chosen": -0.175077885389328, "rewards/margins": 2.510103940963745, "rewards/rejected": -2.6851820945739746, "step": 223 }, { "epoch": 1.3274074074074074, "grad_norm": 33.68278541187267, "learning_rate": 4.933441942067006e-07, "logits/chosen": -1.1925909519195557, "logits/rejected": -0.9682923555374146, "logps/chosen": -51.606163024902344, "logps/rejected": -62.01679611206055, "loss": 0.2812, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20239922404289246, "rewards/margins": 2.0104899406433105, "rewards/rejected": -2.2128894329071045, "step": 224 }, { "epoch": 1.3333333333333333, "grad_norm": 25.970662302058052, "learning_rate": 4.93194471374982e-07, "logits/chosen": -1.2032102346420288, "logits/rejected": -1.178951621055603, "logps/chosen": -39.75410461425781, "logps/rejected": -51.909507751464844, "loss": 0.2617, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3341366946697235, "rewards/margins": 1.715659260749817, "rewards/rejected": -2.0497961044311523, "step": 225 }, { "epoch": 1.3392592592592591, "grad_norm": 28.630664569506944, "learning_rate": 4.930431064394976e-07, "logits/chosen": -1.2886648178100586, "logits/rejected": -1.1532554626464844, "logps/chosen": -46.30451202392578, "logps/rejected": -48.568546295166016, "loss": 0.2757, "rewards/accuracies": 0.75, "rewards/chosen": -0.7058642506599426, "rewards/margins": 1.6998100280761719, "rewards/rejected": -2.405674457550049, "step": 226 }, { "epoch": 1.3451851851851853, "grad_norm": 29.29936360176106, "learning_rate": 4.928901004222977e-07, "logits/chosen": -1.8237268924713135, "logits/rejected": -1.9156745672225952, "logps/chosen": -39.91328430175781, "logps/rejected": -49.987911224365234, "loss": 0.251, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7427995204925537, "rewards/margins": 2.478236198425293, "rewards/rejected": -3.221035957336426, "step": 227 }, { "epoch": 1.3511111111111112, "grad_norm": 25.1014676322794, "learning_rate": 4.92735454356513e-07, "logits/chosen": -1.5877453088760376, "logits/rejected": -1.4968990087509155, "logps/chosen": -45.305973052978516, "logps/rejected": -64.76545715332031, "loss": 0.2078, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6092345118522644, "rewards/margins": 2.52042293548584, "rewards/rejected": -3.129657506942749, "step": 228 }, { "epoch": 1.357037037037037, "grad_norm": 30.078462760535523, "learning_rate": 4.925791692863488e-07, "logits/chosen": -1.226888656616211, "logits/rejected": -1.1548047065734863, "logps/chosen": -32.65156173706055, "logps/rejected": -45.28959655761719, "loss": 0.2925, "rewards/accuracies": 0.9375, "rewards/chosen": -0.459149032831192, "rewards/margins": 2.1847825050354004, "rewards/rejected": -2.6439313888549805, "step": 229 }, { "epoch": 1.362962962962963, "grad_norm": 39.86584108808501, "learning_rate": 4.924212462670768e-07, "logits/chosen": -1.27255380153656, "logits/rejected": -1.3597698211669922, "logps/chosen": -45.05531311035156, "logps/rejected": -54.719696044921875, "loss": 0.3644, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3324463665485382, "rewards/margins": 2.581584930419922, "rewards/rejected": -2.914031505584717, "step": 230 }, { "epoch": 1.3688888888888888, "grad_norm": 22.696681021450235, "learning_rate": 4.922616863650289e-07, "logits/chosen": -1.656007170677185, "logits/rejected": -1.5683889389038086, "logps/chosen": -41.425514221191406, "logps/rejected": -66.38223266601562, "loss": 0.2261, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3960660398006439, "rewards/margins": 3.071995735168457, "rewards/rejected": -3.468061685562134, "step": 231 }, { "epoch": 1.374814814814815, "grad_norm": 29.486281634858543, "learning_rate": 4.921004906575896e-07, "logits/chosen": -1.4675967693328857, "logits/rejected": -1.393191933631897, "logps/chosen": -43.084381103515625, "logps/rejected": -52.339630126953125, "loss": 0.2716, "rewards/accuracies": 0.9375, "rewards/chosen": -0.27437475323677063, "rewards/margins": 2.7827978134155273, "rewards/rejected": -3.0571727752685547, "step": 232 }, { "epoch": 1.3807407407407408, "grad_norm": 31.144179117519332, "learning_rate": 4.919376602331883e-07, "logits/chosen": -1.2110486030578613, "logits/rejected": -1.4307780265808105, "logps/chosen": -47.4427604675293, "logps/rejected": -56.62528991699219, "loss": 0.2436, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2791571617126465, "rewards/margins": 4.268290996551514, "rewards/rejected": -4.54744815826416, "step": 233 }, { "epoch": 1.3866666666666667, "grad_norm": 29.17828921084767, "learning_rate": 4.917731961912926e-07, "logits/chosen": -0.8598974347114563, "logits/rejected": -0.7685071229934692, "logps/chosen": -38.05088806152344, "logps/rejected": -50.185630798339844, "loss": 0.2472, "rewards/accuracies": 0.875, "rewards/chosen": -0.3008832037448883, "rewards/margins": 2.4341988563537598, "rewards/rejected": -2.7350821495056152, "step": 234 }, { "epoch": 1.3925925925925926, "grad_norm": 29.70725398562355, "learning_rate": 4.91607099642401e-07, "logits/chosen": -1.569394826889038, "logits/rejected": -1.5405265092849731, "logps/chosen": -40.34491729736328, "logps/rejected": -49.547454833984375, "loss": 0.302, "rewards/accuracies": 0.875, "rewards/chosen": -0.32715070247650146, "rewards/margins": 2.5119659900665283, "rewards/rejected": -2.8391168117523193, "step": 235 }, { "epoch": 1.3985185185185185, "grad_norm": 25.496602616107623, "learning_rate": 4.914393717080346e-07, "logits/chosen": -1.1362640857696533, "logits/rejected": -1.0989569425582886, "logps/chosen": -31.34090232849121, "logps/rejected": -44.264793395996094, "loss": 0.2358, "rewards/accuracies": 0.875, "rewards/chosen": 0.08524125814437866, "rewards/margins": 2.406437873840332, "rewards/rejected": -2.3211965560913086, "step": 236 }, { "epoch": 1.4044444444444444, "grad_norm": 28.575948207640465, "learning_rate": 4.9127001352073e-07, "logits/chosen": -1.339735746383667, "logits/rejected": -1.2221708297729492, "logps/chosen": -39.16889190673828, "logps/rejected": -60.40363311767578, "loss": 0.21, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5930743217468262, "rewards/margins": 2.652825355529785, "rewards/rejected": -3.2458996772766113, "step": 237 }, { "epoch": 1.4103703703703703, "grad_norm": 24.369471042771703, "learning_rate": 4.910990262240321e-07, "logits/chosen": -1.8492693901062012, "logits/rejected": -1.8319913148880005, "logps/chosen": -36.13838195800781, "logps/rejected": -47.36172103881836, "loss": 0.1971, "rewards/accuracies": 0.9375, "rewards/chosen": -0.33304405212402344, "rewards/margins": 2.4353842735290527, "rewards/rejected": -2.768428325653076, "step": 238 }, { "epoch": 1.4162962962962964, "grad_norm": 25.91992533520253, "learning_rate": 4.909264109724852e-07, "logits/chosen": -1.5908434391021729, "logits/rejected": -1.4340860843658447, "logps/chosen": -30.060483932495117, "logps/rejected": -50.15409469604492, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 0.035857439041137695, "rewards/margins": 2.761453628540039, "rewards/rejected": -2.7255964279174805, "step": 239 }, { "epoch": 1.4222222222222223, "grad_norm": 28.00265276561218, "learning_rate": 4.907521689316265e-07, "logits/chosen": -1.2963168621063232, "logits/rejected": -0.9083112478256226, "logps/chosen": -31.37090301513672, "logps/rejected": -67.65018463134766, "loss": 0.2784, "rewards/accuracies": 0.875, "rewards/chosen": -0.04646921157836914, "rewards/margins": 2.4311680793762207, "rewards/rejected": -2.477637529373169, "step": 240 }, { "epoch": 1.4281481481481482, "grad_norm": 22.845123828242635, "learning_rate": 4.905763012779775e-07, "logits/chosen": -2.0449070930480957, "logits/rejected": -1.8327888250350952, "logps/chosen": -47.79774475097656, "logps/rejected": -66.09587097167969, "loss": 0.2008, "rewards/accuracies": 1.0, "rewards/chosen": -0.43650582432746887, "rewards/margins": 4.076416492462158, "rewards/rejected": -4.512922286987305, "step": 241 }, { "epoch": 1.434074074074074, "grad_norm": 20.347183374436156, "learning_rate": 4.90398809199036e-07, "logits/chosen": -1.5851411819458008, "logits/rejected": -1.7745072841644287, "logps/chosen": -46.445831298828125, "logps/rejected": -54.314292907714844, "loss": 0.1838, "rewards/accuracies": 1.0, "rewards/chosen": -0.21882963180541992, "rewards/margins": 3.393106698989868, "rewards/rejected": -3.611936569213867, "step": 242 }, { "epoch": 1.44, "grad_norm": 32.344226506103965, "learning_rate": 4.902196938932685e-07, "logits/chosen": -1.8756588697433472, "logits/rejected": -1.8360813856124878, "logps/chosen": -36.15525436401367, "logps/rejected": -44.08958053588867, "loss": 0.2719, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1032625138759613, "rewards/margins": 2.351353645324707, "rewards/rejected": -2.454616069793701, "step": 243 }, { "epoch": 1.445925925925926, "grad_norm": 28.242539134271052, "learning_rate": 4.90038956570102e-07, "logits/chosen": -1.1855772733688354, "logits/rejected": -1.3019590377807617, "logps/chosen": -54.570953369140625, "logps/rejected": -50.146095275878906, "loss": 0.2221, "rewards/accuracies": 0.875, "rewards/chosen": -0.9502605199813843, "rewards/margins": 1.6242974996566772, "rewards/rejected": -2.5745580196380615, "step": 244 }, { "epoch": 1.4518518518518517, "grad_norm": 24.65166740001487, "learning_rate": 4.898565984499153e-07, "logits/chosen": -1.5563501119613647, "logits/rejected": -1.2845768928527832, "logps/chosen": -32.572696685791016, "logps/rejected": -62.32068634033203, "loss": 0.2219, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4941897392272949, "rewards/margins": 3.1560323238372803, "rewards/rejected": -3.650221824645996, "step": 245 }, { "epoch": 1.4577777777777778, "grad_norm": 26.176561390267167, "learning_rate": 4.896726207640314e-07, "logits/chosen": -1.284088134765625, "logits/rejected": -1.530550241470337, "logps/chosen": -49.36870574951172, "logps/rejected": -45.64020538330078, "loss": 0.2343, "rewards/accuracies": 0.9375, "rewards/chosen": -0.45776939392089844, "rewards/margins": 2.273721694946289, "rewards/rejected": -2.7314910888671875, "step": 246 }, { "epoch": 1.4637037037037037, "grad_norm": 32.31060258996922, "learning_rate": 4.894870247547093e-07, "logits/chosen": -1.6879163980484009, "logits/rejected": -1.5221493244171143, "logps/chosen": -30.046092987060547, "logps/rejected": -55.25277328491211, "loss": 0.3011, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42848262190818787, "rewards/margins": 2.895785093307495, "rewards/rejected": -3.324267864227295, "step": 247 }, { "epoch": 1.4696296296296296, "grad_norm": 24.438337032328114, "learning_rate": 4.892998116751348e-07, "logits/chosen": -1.3243341445922852, "logits/rejected": -1.3782624006271362, "logps/chosen": -32.396087646484375, "logps/rejected": -41.859649658203125, "loss": 0.2315, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05424916744232178, "rewards/margins": 1.7480883598327637, "rewards/rejected": -1.693839192390442, "step": 248 }, { "epoch": 1.4755555555555555, "grad_norm": 25.80505112713608, "learning_rate": 4.891109827894127e-07, "logits/chosen": -1.5409799814224243, "logits/rejected": -1.6533753871917725, "logps/chosen": -60.424072265625, "logps/rejected": -66.90180206298828, "loss": 0.2631, "rewards/accuracies": 1.0, "rewards/chosen": -0.5530689358711243, "rewards/margins": 3.336688280105591, "rewards/rejected": -3.8897571563720703, "step": 249 }, { "epoch": 1.4814814814814814, "grad_norm": 26.021296278280122, "learning_rate": 4.889205393725583e-07, "logits/chosen": -1.2817881107330322, "logits/rejected": -1.060404658317566, "logps/chosen": -32.70816421508789, "logps/rejected": -52.11760711669922, "loss": 0.2183, "rewards/accuracies": 1.0, "rewards/chosen": 0.21378082036972046, "rewards/margins": 3.265263080596924, "rewards/rejected": -3.0514822006225586, "step": 250 }, { "epoch": 1.4874074074074075, "grad_norm": 26.001434100122033, "learning_rate": 4.887284827104881e-07, "logits/chosen": -2.0695719718933105, "logits/rejected": -1.9940532445907593, "logps/chosen": -28.80967140197754, "logps/rejected": -78.68373107910156, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": 0.015644848346710205, "rewards/margins": 4.127260684967041, "rewards/rejected": -4.111616134643555, "step": 251 }, { "epoch": 1.4933333333333334, "grad_norm": 31.73693517564944, "learning_rate": 4.885348141000122e-07, "logits/chosen": -1.0371400117874146, "logits/rejected": -1.0298850536346436, "logps/chosen": -34.962913513183594, "logps/rejected": -53.1561279296875, "loss": 0.2894, "rewards/accuracies": 0.875, "rewards/chosen": -0.4044491946697235, "rewards/margins": 3.485771417617798, "rewards/rejected": -3.8902206420898438, "step": 252 }, { "epoch": 1.4992592592592593, "grad_norm": 27.137864591631498, "learning_rate": 4.883395348488243e-07, "logits/chosen": -1.3027697801589966, "logits/rejected": -1.3290810585021973, "logps/chosen": -47.35820007324219, "logps/rejected": -53.30695343017578, "loss": 0.2341, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4950598478317261, "rewards/margins": 3.308154344558716, "rewards/rejected": -3.8032140731811523, "step": 253 }, { "epoch": 1.5051851851851852, "grad_norm": 27.529883497460066, "learning_rate": 4.88142646275494e-07, "logits/chosen": -1.4786378145217896, "logits/rejected": -1.4781079292297363, "logps/chosen": -35.87552261352539, "logps/rejected": -50.91257095336914, "loss": 0.2294, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06936764717102051, "rewards/margins": 2.9450628757476807, "rewards/rejected": -3.014430522918701, "step": 254 }, { "epoch": 1.511111111111111, "grad_norm": 27.659117607442656, "learning_rate": 4.879441497094572e-07, "logits/chosen": -1.3110840320587158, "logits/rejected": -1.2133020162582397, "logps/chosen": -35.17852783203125, "logps/rejected": -45.25739288330078, "loss": 0.242, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07239916920661926, "rewards/margins": 2.228761672973633, "rewards/rejected": -2.3011608123779297, "step": 255 }, { "epoch": 1.5170370370370372, "grad_norm": 31.289246246516008, "learning_rate": 4.877440464910073e-07, "logits/chosen": -1.564308762550354, "logits/rejected": -1.2593793869018555, "logps/chosen": -36.79103469848633, "logps/rejected": -52.30302047729492, "loss": 0.2747, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7653313279151917, "rewards/margins": 2.011465311050415, "rewards/rejected": -2.776796340942383, "step": 256 }, { "epoch": 1.5229629629629629, "grad_norm": 29.211097710250886, "learning_rate": 4.875423379712864e-07, "logits/chosen": -1.4866360425949097, "logits/rejected": -1.2372266054153442, "logps/chosen": -37.493309020996094, "logps/rejected": -67.01297760009766, "loss": 0.2303, "rewards/accuracies": 1.0, "rewards/chosen": -0.5150955319404602, "rewards/margins": 3.707171678543091, "rewards/rejected": -4.222267150878906, "step": 257 }, { "epoch": 1.528888888888889, "grad_norm": 26.21514812235197, "learning_rate": 4.873390255122756e-07, "logits/chosen": -1.276237964630127, "logits/rejected": -1.4946424961090088, "logps/chosen": -38.75402069091797, "logps/rejected": -52.27590560913086, "loss": 0.1998, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3489813804626465, "rewards/margins": 3.6085846424102783, "rewards/rejected": -3.957566022872925, "step": 258 }, { "epoch": 1.5348148148148149, "grad_norm": 29.630098087292826, "learning_rate": 4.871341104867864e-07, "logits/chosen": -1.614687442779541, "logits/rejected": -1.4839171171188354, "logps/chosen": -43.47966003417969, "logps/rejected": -53.94084167480469, "loss": 0.2513, "rewards/accuracies": 0.875, "rewards/chosen": -0.4233679175376892, "rewards/margins": 2.405550241470337, "rewards/rejected": -2.828917980194092, "step": 259 }, { "epoch": 1.5407407407407407, "grad_norm": 28.18645966963501, "learning_rate": 4.869275942784511e-07, "logits/chosen": -1.4025224447250366, "logits/rejected": -1.199758768081665, "logps/chosen": -30.40345001220703, "logps/rejected": -47.12421417236328, "loss": 0.2201, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3111732304096222, "rewards/margins": 2.2125093936920166, "rewards/rejected": -2.5236825942993164, "step": 260 }, { "epoch": 1.5466666666666666, "grad_norm": 33.643647259370184, "learning_rate": 4.867194782817137e-07, "logits/chosen": -1.0420281887054443, "logits/rejected": -1.054417610168457, "logps/chosen": -40.79283142089844, "logps/rejected": -43.291046142578125, "loss": 0.3138, "rewards/accuracies": 0.75, "rewards/chosen": -0.007962435483932495, "rewards/margins": 2.214482307434082, "rewards/rejected": -2.222445011138916, "step": 261 }, { "epoch": 1.5525925925925925, "grad_norm": 29.30111226931869, "learning_rate": 4.865097639018202e-07, "logits/chosen": -1.2452741861343384, "logits/rejected": -1.1993070840835571, "logps/chosen": -42.200904846191406, "logps/rejected": -56.703468322753906, "loss": 0.2458, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7176186442375183, "rewards/margins": 2.2573556900024414, "rewards/rejected": -2.9749743938446045, "step": 262 }, { "epoch": 1.5585185185185186, "grad_norm": 25.912554895976097, "learning_rate": 4.862984525548091e-07, "logits/chosen": -1.1260875463485718, "logits/rejected": -1.1260864734649658, "logps/chosen": -33.189659118652344, "logps/rejected": -43.486534118652344, "loss": 0.2525, "rewards/accuracies": 0.875, "rewards/chosen": -0.013395726680755615, "rewards/margins": 2.880253791809082, "rewards/rejected": -2.8936493396759033, "step": 263 }, { "epoch": 1.5644444444444443, "grad_norm": 25.587208623838446, "learning_rate": 4.860855456675024e-07, "logits/chosen": -1.5094867944717407, "logits/rejected": -1.4383167028427124, "logps/chosen": -34.93250274658203, "logps/rejected": -62.74409484863281, "loss": 0.1984, "rewards/accuracies": 1.0, "rewards/chosen": -0.5001842379570007, "rewards/margins": 3.726599931716919, "rewards/rejected": -4.226784706115723, "step": 264 }, { "epoch": 1.5703703703703704, "grad_norm": 27.07883301946322, "learning_rate": 4.85871044677495e-07, "logits/chosen": -1.2766215801239014, "logits/rejected": -1.2287697792053223, "logps/chosen": -43.3912239074707, "logps/rejected": -58.06782150268555, "loss": 0.2165, "rewards/accuracies": 1.0, "rewards/chosen": -0.05057030916213989, "rewards/margins": 3.6173009872436523, "rewards/rejected": -3.6678709983825684, "step": 265 }, { "epoch": 1.5762962962962963, "grad_norm": 28.18716416057205, "learning_rate": 4.856549510331461e-07, "logits/chosen": -1.9891289472579956, "logits/rejected": -1.8930578231811523, "logps/chosen": -39.6988639831543, "logps/rejected": -50.377994537353516, "loss": 0.247, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3518238663673401, "rewards/margins": 2.627807140350342, "rewards/rejected": -2.979630947113037, "step": 266 }, { "epoch": 1.5822222222222222, "grad_norm": 25.21634175596078, "learning_rate": 4.854372661935684e-07, "logits/chosen": -1.3575857877731323, "logits/rejected": -1.2936060428619385, "logps/chosen": -39.070228576660156, "logps/rejected": -49.42529296875, "loss": 0.253, "rewards/accuracies": 0.875, "rewards/chosen": -0.28435996174812317, "rewards/margins": 2.0522220134735107, "rewards/rejected": -2.3365821838378906, "step": 267 }, { "epoch": 1.5881481481481483, "grad_norm": 25.067139870537627, "learning_rate": 4.852179916286189e-07, "logits/chosen": -1.6648176908493042, "logits/rejected": -1.3902709484100342, "logps/chosen": -37.70637893676758, "logps/rejected": -55.506195068359375, "loss": 0.236, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2536647319793701, "rewards/margins": 2.993668556213379, "rewards/rejected": -3.247333288192749, "step": 268 }, { "epoch": 1.594074074074074, "grad_norm": 27.770666704811592, "learning_rate": 4.849971288188889e-07, "logits/chosen": -1.361600637435913, "logits/rejected": -1.1852885484695435, "logps/chosen": -33.70117950439453, "logps/rejected": -57.27318572998047, "loss": 0.2575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07503369450569153, "rewards/margins": 2.6005759239196777, "rewards/rejected": -2.5255422592163086, "step": 269 }, { "epoch": 1.6, "grad_norm": 34.25142945752442, "learning_rate": 4.847746792556936e-07, "logits/chosen": -1.7486881017684937, "logits/rejected": -1.783259630203247, "logps/chosen": -45.490604400634766, "logps/rejected": -50.73610305786133, "loss": 0.3175, "rewards/accuracies": 0.875, "rewards/chosen": -0.502980649471283, "rewards/margins": 3.7213191986083984, "rewards/rejected": -4.224299907684326, "step": 270 }, { "epoch": 1.605925925925926, "grad_norm": 20.275807912086314, "learning_rate": 4.845506444410626e-07, "logits/chosen": -1.2842497825622559, "logits/rejected": -1.4023914337158203, "logps/chosen": -36.809024810791016, "logps/rejected": -41.20471954345703, "loss": 0.2032, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5635099411010742, "rewards/margins": 2.691178560256958, "rewards/rejected": -3.254688262939453, "step": 271 }, { "epoch": 1.6118518518518519, "grad_norm": 24.468859454275407, "learning_rate": 4.843250258877294e-07, "logits/chosen": -1.632100224494934, "logits/rejected": -1.7789125442504883, "logps/chosen": -40.27824401855469, "logps/rejected": -42.3206672668457, "loss": 0.2082, "rewards/accuracies": 0.9375, "rewards/chosen": -0.21913456916809082, "rewards/margins": 2.017493486404419, "rewards/rejected": -2.2366278171539307, "step": 272 }, { "epoch": 1.6177777777777778, "grad_norm": 35.329644939940636, "learning_rate": 4.840978251191211e-07, "logits/chosen": -1.4649416208267212, "logits/rejected": -1.5622587203979492, "logps/chosen": -44.524383544921875, "logps/rejected": -47.37981414794922, "loss": 0.3464, "rewards/accuracies": 0.6875, "rewards/chosen": -0.726753294467926, "rewards/margins": 2.2740654945373535, "rewards/rejected": -3.0008187294006348, "step": 273 }, { "epoch": 1.6237037037037036, "grad_norm": 41.38773432746113, "learning_rate": 4.838690436693483e-07, "logits/chosen": -1.4717864990234375, "logits/rejected": -1.5824425220489502, "logps/chosen": -62.04584884643555, "logps/rejected": -64.39595031738281, "loss": 0.3574, "rewards/accuracies": 0.9375, "rewards/chosen": -0.29022544622421265, "rewards/margins": 3.477273941040039, "rewards/rejected": -3.7674996852874756, "step": 274 }, { "epoch": 1.6296296296296298, "grad_norm": 28.53036915990997, "learning_rate": 4.836386830831951e-07, "logits/chosen": -1.4235268831253052, "logits/rejected": -1.3757199048995972, "logps/chosen": -33.49037170410156, "logps/rejected": -52.49199676513672, "loss": 0.2304, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2958151698112488, "rewards/margins": 2.777085304260254, "rewards/rejected": -3.0729002952575684, "step": 275 }, { "epoch": 1.6355555555555554, "grad_norm": 18.64876805274949, "learning_rate": 4.834067449161077e-07, "logits/chosen": -1.503778100013733, "logits/rejected": -1.242067575454712, "logps/chosen": -41.87370300292969, "logps/rejected": -67.22463989257812, "loss": 0.1729, "rewards/accuracies": 0.875, "rewards/chosen": -0.38197654485702515, "rewards/margins": 3.5509443283081055, "rewards/rejected": -3.9329206943511963, "step": 276 }, { "epoch": 1.6414814814814815, "grad_norm": 18.696944468106953, "learning_rate": 4.83173230734185e-07, "logits/chosen": -1.8584768772125244, "logits/rejected": -1.5482829809188843, "logps/chosen": -37.62712097167969, "logps/rejected": -68.63243103027344, "loss": 0.1572, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19617757201194763, "rewards/margins": 2.8529129028320312, "rewards/rejected": -3.0490903854370117, "step": 277 }, { "epoch": 1.6474074074074074, "grad_norm": 26.003574415992446, "learning_rate": 4.829381421141671e-07, "logits/chosen": -1.407435417175293, "logits/rejected": -1.323309302330017, "logps/chosen": -35.42864227294922, "logps/rejected": -50.165164947509766, "loss": 0.2221, "rewards/accuracies": 0.875, "rewards/chosen": -0.4789162874221802, "rewards/margins": 2.547053813934326, "rewards/rejected": -3.025969982147217, "step": 278 }, { "epoch": 1.6533333333333333, "grad_norm": 26.39467431573978, "learning_rate": 4.827014806434253e-07, "logits/chosen": -1.535237431526184, "logits/rejected": -1.4902827739715576, "logps/chosen": -57.465370178222656, "logps/rejected": -78.98246765136719, "loss": 0.1981, "rewards/accuracies": 0.8125, "rewards/chosen": -0.563177227973938, "rewards/margins": 3.7371535301208496, "rewards/rejected": -4.300330638885498, "step": 279 }, { "epoch": 1.6592592592592592, "grad_norm": 32.49048736438936, "learning_rate": 4.824632479199511e-07, "logits/chosen": -1.4649295806884766, "logits/rejected": -1.4040541648864746, "logps/chosen": -44.324127197265625, "logps/rejected": -55.108619689941406, "loss": 0.2692, "rewards/accuracies": 1.0, "rewards/chosen": -0.22248147428035736, "rewards/margins": 2.6846671104431152, "rewards/rejected": -2.907148838043213, "step": 280 }, { "epoch": 1.665185185185185, "grad_norm": 29.020989526396118, "learning_rate": 4.822234455523453e-07, "logits/chosen": -1.7030525207519531, "logits/rejected": -1.6422662734985352, "logps/chosen": -34.391170501708984, "logps/rejected": -58.28746795654297, "loss": 0.2213, "rewards/accuracies": 0.9375, "rewards/chosen": -1.007947564125061, "rewards/margins": 2.7358577251434326, "rewards/rejected": -3.743805170059204, "step": 281 }, { "epoch": 1.6711111111111112, "grad_norm": 20.797914036041025, "learning_rate": 4.819820751598076e-07, "logits/chosen": -1.2752454280853271, "logits/rejected": -1.3115514516830444, "logps/chosen": -34.01559829711914, "logps/rejected": -46.77418899536133, "loss": 0.1823, "rewards/accuracies": 1.0, "rewards/chosen": -0.3145977854728699, "rewards/margins": 2.5336179733276367, "rewards/rejected": -2.8482160568237305, "step": 282 }, { "epoch": 1.6770370370370369, "grad_norm": 26.774632112271266, "learning_rate": 4.817391383721249e-07, "logits/chosen": -1.5755228996276855, "logits/rejected": -1.2880446910858154, "logps/chosen": -40.601654052734375, "logps/rejected": -66.74862670898438, "loss": 0.1948, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3186114430427551, "rewards/margins": 3.0807223320007324, "rewards/rejected": -3.399333953857422, "step": 283 }, { "epoch": 1.682962962962963, "grad_norm": 31.762544002402134, "learning_rate": 4.814946368296616e-07, "logits/chosen": -1.195683240890503, "logits/rejected": -1.124356746673584, "logps/chosen": -34.632972717285156, "logps/rejected": -49.069664001464844, "loss": 0.2904, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5742506384849548, "rewards/margins": 2.327057361602783, "rewards/rejected": -2.901308059692383, "step": 284 }, { "epoch": 1.6888888888888889, "grad_norm": 33.560122086582936, "learning_rate": 4.812485721833464e-07, "logits/chosen": -1.8307454586029053, "logits/rejected": -1.5758129358291626, "logps/chosen": -48.31635665893555, "logps/rejected": -76.99293518066406, "loss": 0.242, "rewards/accuracies": 1.0, "rewards/chosen": -0.42657211422920227, "rewards/margins": 3.085447072982788, "rewards/rejected": -3.512019157409668, "step": 285 }, { "epoch": 1.6948148148148148, "grad_norm": 31.833892095022023, "learning_rate": 4.810009460946635e-07, "logits/chosen": -1.3257019519805908, "logits/rejected": -1.190606713294983, "logps/chosen": -38.6641845703125, "logps/rejected": -52.16994094848633, "loss": 0.2515, "rewards/accuracies": 0.875, "rewards/chosen": -0.23393523693084717, "rewards/margins": 2.993025541305542, "rewards/rejected": -3.226961135864258, "step": 286 }, { "epoch": 1.7007407407407409, "grad_norm": 24.602658233645577, "learning_rate": 4.8075176023564e-07, "logits/chosen": -1.226719617843628, "logits/rejected": -1.0526630878448486, "logps/chosen": -37.76171112060547, "logps/rejected": -55.442710876464844, "loss": 0.2353, "rewards/accuracies": 0.875, "rewards/chosen": 0.017082542181015015, "rewards/margins": 2.848991632461548, "rewards/rejected": -2.8319091796875, "step": 287 }, { "epoch": 1.7066666666666666, "grad_norm": 27.23670345528548, "learning_rate": 4.805010162888346e-07, "logits/chosen": -1.474751353263855, "logits/rejected": -1.519294023513794, "logps/chosen": -44.07060623168945, "logps/rejected": -49.262245178222656, "loss": 0.2091, "rewards/accuracies": 1.0, "rewards/chosen": -0.1540902554988861, "rewards/margins": 2.2316653728485107, "rewards/rejected": -2.3857555389404297, "step": 288 }, { "epoch": 1.7125925925925927, "grad_norm": 30.319908739216153, "learning_rate": 4.802487159473271e-07, "logits/chosen": -1.283259630203247, "logits/rejected": -1.2373545169830322, "logps/chosen": -41.3947868347168, "logps/rejected": -57.63949203491211, "loss": 0.2539, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4889850616455078, "rewards/margins": 3.9944164752960205, "rewards/rejected": -4.483401298522949, "step": 289 }, { "epoch": 1.7185185185185186, "grad_norm": 23.991939953835697, "learning_rate": 4.799948609147061e-07, "logits/chosen": -1.344802737236023, "logits/rejected": -1.218867301940918, "logps/chosen": -39.203948974609375, "logps/rejected": -55.3688850402832, "loss": 0.2034, "rewards/accuracies": 0.875, "rewards/chosen": -0.4728401303291321, "rewards/margins": 2.995742082595825, "rewards/rejected": -3.4685819149017334, "step": 290 }, { "epoch": 1.7244444444444444, "grad_norm": 18.676355499326434, "learning_rate": 4.797394529050577e-07, "logits/chosen": -1.393309235572815, "logits/rejected": -1.3101699352264404, "logps/chosen": -46.854278564453125, "logps/rejected": -56.5775146484375, "loss": 0.1496, "rewards/accuracies": 0.875, "rewards/chosen": -0.3527151048183441, "rewards/margins": 3.0831854343414307, "rewards/rejected": -3.4359004497528076, "step": 291 }, { "epoch": 1.7303703703703703, "grad_norm": 24.923664319626276, "learning_rate": 4.794824936429543e-07, "logits/chosen": -1.4183282852172852, "logits/rejected": -1.319956660270691, "logps/chosen": -32.210166931152344, "logps/rejected": -40.73335647583008, "loss": 0.192, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2939870357513428, "rewards/margins": 3.1537084579467773, "rewards/rejected": -2.8597211837768555, "step": 292 }, { "epoch": 1.7362962962962962, "grad_norm": 22.354248418795603, "learning_rate": 4.792239848634426e-07, "logits/chosen": -1.1553823947906494, "logits/rejected": -1.1823606491088867, "logps/chosen": -44.60945129394531, "logps/rejected": -52.90985870361328, "loss": 0.2075, "rewards/accuracies": 0.9375, "rewards/chosen": -0.29689791798591614, "rewards/margins": 3.1230454444885254, "rewards/rejected": -3.4199435710906982, "step": 293 }, { "epoch": 1.7422222222222223, "grad_norm": 20.407211978062318, "learning_rate": 4.789639283120322e-07, "logits/chosen": -1.777173638343811, "logits/rejected": -1.6720659732818604, "logps/chosen": -30.502058029174805, "logps/rejected": -55.912635803222656, "loss": 0.194, "rewards/accuracies": 1.0, "rewards/chosen": 0.023857399821281433, "rewards/margins": 4.332409858703613, "rewards/rejected": -4.3085527420043945, "step": 294 }, { "epoch": 1.748148148148148, "grad_norm": 22.108322034964687, "learning_rate": 4.787023257446832e-07, "logits/chosen": -1.170305609703064, "logits/rejected": -1.1386812925338745, "logps/chosen": -46.53575134277344, "logps/rejected": -56.994422912597656, "loss": 0.1593, "rewards/accuracies": 1.0, "rewards/chosen": 0.07963424921035767, "rewards/margins": 3.4485249519348145, "rewards/rejected": -3.3688905239105225, "step": 295 }, { "epoch": 1.7540740740740741, "grad_norm": 25.145388894157914, "learning_rate": 4.784391789277952e-07, "logits/chosen": -1.4293248653411865, "logits/rejected": -1.3704808950424194, "logps/chosen": -33.885250091552734, "logps/rejected": -45.85121154785156, "loss": 0.2342, "rewards/accuracies": 0.875, "rewards/chosen": -0.2255295366048813, "rewards/margins": 2.5380513668060303, "rewards/rejected": -2.763580799102783, "step": 296 }, { "epoch": 1.76, "grad_norm": 25.641234909948803, "learning_rate": 4.781744896381944e-07, "logits/chosen": -1.3408482074737549, "logits/rejected": -1.2661770582199097, "logps/chosen": -50.95426940917969, "logps/rejected": -62.9360466003418, "loss": 0.2135, "rewards/accuracies": 0.875, "rewards/chosen": -0.606664776802063, "rewards/margins": 2.8116254806518555, "rewards/rejected": -3.418290376663208, "step": 297 }, { "epoch": 1.765925925925926, "grad_norm": 17.698843100494624, "learning_rate": 4.779082596631226e-07, "logits/chosen": -1.2342387437820435, "logits/rejected": -0.7951022386550903, "logps/chosen": -36.144840240478516, "logps/rejected": -62.39626693725586, "loss": 0.1044, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04224780201911926, "rewards/margins": 4.0136003494262695, "rewards/rejected": -4.055848121643066, "step": 298 }, { "epoch": 1.771851851851852, "grad_norm": 24.352998739418076, "learning_rate": 4.776404908002245e-07, "logits/chosen": -1.4755496978759766, "logits/rejected": -1.259360909461975, "logps/chosen": -35.81382751464844, "logps/rejected": -53.95246124267578, "loss": 0.2259, "rewards/accuracies": 0.8125, "rewards/chosen": -0.28839975595474243, "rewards/margins": 2.211383819580078, "rewards/rejected": -2.499783515930176, "step": 299 }, { "epoch": 1.7777777777777777, "grad_norm": 33.803527524668894, "learning_rate": 4.773711848575356e-07, "logits/chosen": -1.5609314441680908, "logits/rejected": -1.7283426523208618, "logps/chosen": -49.594703674316406, "logps/rejected": -48.518550872802734, "loss": 0.257, "rewards/accuracies": 0.875, "rewards/chosen": -0.7066521644592285, "rewards/margins": 2.6102638244628906, "rewards/rejected": -3.316915988922119, "step": 300 }, { "epoch": 1.7837037037037038, "grad_norm": 22.088311553708106, "learning_rate": 4.771003436534702e-07, "logits/chosen": -1.8934781551361084, "logits/rejected": -1.519129753112793, "logps/chosen": -35.62154006958008, "logps/rejected": -61.455875396728516, "loss": 0.1709, "rewards/accuracies": 1.0, "rewards/chosen": 0.06477588415145874, "rewards/margins": 3.86716628074646, "rewards/rejected": -3.8023905754089355, "step": 301 }, { "epoch": 1.7896296296296297, "grad_norm": 31.474291026302936, "learning_rate": 4.7682796901680906e-07, "logits/chosen": -0.7919188141822815, "logits/rejected": -0.667506754398346, "logps/chosen": -42.68754196166992, "logps/rejected": -61.21702194213867, "loss": 0.263, "rewards/accuracies": 0.875, "rewards/chosen": -0.5648635625839233, "rewards/margins": 3.7736103534698486, "rewards/rejected": -4.338474273681641, "step": 302 }, { "epoch": 1.7955555555555556, "grad_norm": 25.861532042981946, "learning_rate": 4.765540627866869e-07, "logits/chosen": -1.824802041053772, "logits/rejected": -1.9499473571777344, "logps/chosen": -51.77365493774414, "logps/rejected": -50.19049072265625, "loss": 0.2147, "rewards/accuracies": 0.875, "rewards/chosen": -0.6279614567756653, "rewards/margins": 2.7504382133483887, "rewards/rejected": -3.37839937210083, "step": 303 }, { "epoch": 1.8014814814814815, "grad_norm": 27.50126926128136, "learning_rate": 4.7627862681258027e-07, "logits/chosen": -1.2844971418380737, "logits/rejected": -1.365068793296814, "logps/chosen": -37.15514373779297, "logps/rejected": -42.242286682128906, "loss": 0.251, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05502760410308838, "rewards/margins": 2.6144795417785645, "rewards/rejected": -2.6695070266723633, "step": 304 }, { "epoch": 1.8074074074074074, "grad_norm": 29.03711415892969, "learning_rate": 4.7600166295429476e-07, "logits/chosen": -0.9044826030731201, "logits/rejected": -0.8295137882232666, "logps/chosen": -34.650047302246094, "logps/rejected": -51.922096252441406, "loss": 0.2514, "rewards/accuracies": 0.875, "rewards/chosen": -0.9077940583229065, "rewards/margins": 2.484612464904785, "rewards/rejected": -3.3924062252044678, "step": 305 }, { "epoch": 1.8133333333333335, "grad_norm": 24.14727003294785, "learning_rate": 4.7572317308195276e-07, "logits/chosen": -1.8832111358642578, "logits/rejected": -1.620338797569275, "logps/chosen": -38.12425231933594, "logps/rejected": -56.99446105957031, "loss": 0.1977, "rewards/accuracies": 0.875, "rewards/chosen": -0.22710034251213074, "rewards/margins": 2.965156078338623, "rewards/rejected": -3.192256450653076, "step": 306 }, { "epoch": 1.8192592592592591, "grad_norm": 28.260474836345267, "learning_rate": 4.7544315907598034e-07, "logits/chosen": -1.2670873403549194, "logits/rejected": -1.1744232177734375, "logps/chosen": -36.84714126586914, "logps/rejected": -44.011810302734375, "loss": 0.2284, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37556207180023193, "rewards/margins": 2.0585269927978516, "rewards/rejected": -2.434089183807373, "step": 307 }, { "epoch": 1.8251851851851852, "grad_norm": 30.677307239625325, "learning_rate": 4.7516162282709515e-07, "logits/chosen": -2.0330681800842285, "logits/rejected": -1.9144965410232544, "logps/chosen": -38.90164566040039, "logps/rejected": -51.84284210205078, "loss": 0.2654, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3513489365577698, "rewards/margins": 2.5029258728027344, "rewards/rejected": -2.8542749881744385, "step": 308 }, { "epoch": 1.8311111111111111, "grad_norm": 25.485483260448667, "learning_rate": 4.748785662362932e-07, "logits/chosen": -1.3988568782806396, "logits/rejected": -1.1845300197601318, "logps/chosen": -47.51979064941406, "logps/rejected": -69.07670593261719, "loss": 0.2036, "rewards/accuracies": 0.875, "rewards/chosen": 0.1303098201751709, "rewards/margins": 3.500807285308838, "rewards/rejected": -3.370497226715088, "step": 309 }, { "epoch": 1.837037037037037, "grad_norm": 25.256216032781886, "learning_rate": 4.7459399121483634e-07, "logits/chosen": -1.3516114950180054, "logits/rejected": -1.4268112182617188, "logps/chosen": -48.036685943603516, "logps/rejected": -57.18891906738281, "loss": 0.2178, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3574332594871521, "rewards/margins": 3.844609260559082, "rewards/rejected": -4.202042579650879, "step": 310 }, { "epoch": 1.842962962962963, "grad_norm": 24.369536690397222, "learning_rate": 4.74307899684239e-07, "logits/chosen": -1.069244623184204, "logits/rejected": -1.054039478302002, "logps/chosen": -45.125831604003906, "logps/rejected": -57.4384651184082, "loss": 0.2188, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5561109185218811, "rewards/margins": 3.588425636291504, "rewards/rejected": -4.144536972045898, "step": 311 }, { "epoch": 1.8488888888888888, "grad_norm": 31.092403944743825, "learning_rate": 4.7402029357625563e-07, "logits/chosen": -1.3228671550750732, "logits/rejected": -1.3702974319458008, "logps/chosen": -45.36236572265625, "logps/rejected": -53.258853912353516, "loss": 0.2624, "rewards/accuracies": 1.0, "rewards/chosen": 0.0017190277576446533, "rewards/margins": 3.654135227203369, "rewards/rejected": -3.652416229248047, "step": 312 }, { "epoch": 1.854814814814815, "grad_norm": 36.08547561166675, "learning_rate": 4.737311748328673e-07, "logits/chosen": -1.4084701538085938, "logits/rejected": -1.2140371799468994, "logps/chosen": -40.802616119384766, "logps/rejected": -60.195823669433594, "loss": 0.2686, "rewards/accuracies": 0.75, "rewards/chosen": -0.7584589719772339, "rewards/margins": 3.5438449382781982, "rewards/rejected": -4.302303791046143, "step": 313 }, { "epoch": 1.8607407407407406, "grad_norm": 23.310186468345425, "learning_rate": 4.7344054540626887e-07, "logits/chosen": -1.0628677606582642, "logits/rejected": -1.2128937244415283, "logps/chosen": -29.777320861816406, "logps/rejected": -42.853614807128906, "loss": 0.193, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09455633163452148, "rewards/margins": 3.388399124145508, "rewards/rejected": -3.2938427925109863, "step": 314 }, { "epoch": 1.8666666666666667, "grad_norm": 23.41430379058325, "learning_rate": 4.731484072588555e-07, "logits/chosen": -1.1477389335632324, "logits/rejected": -1.1705048084259033, "logps/chosen": -39.31995391845703, "logps/rejected": -56.09278869628906, "loss": 0.1586, "rewards/accuracies": 0.875, "rewards/chosen": -0.46085304021835327, "rewards/margins": 3.9288570880889893, "rewards/rejected": -4.389710426330566, "step": 315 }, { "epoch": 1.8725925925925926, "grad_norm": 32.46581149348618, "learning_rate": 4.7285476236320976e-07, "logits/chosen": -1.323777198791504, "logits/rejected": -1.2052544355392456, "logps/chosen": -36.882781982421875, "logps/rejected": -58.216888427734375, "loss": 0.2763, "rewards/accuracies": 0.9375, "rewards/chosen": -0.299294114112854, "rewards/margins": 2.8449900150299072, "rewards/rejected": -3.144284248352051, "step": 316 }, { "epoch": 1.8785185185185185, "grad_norm": 26.73276893926149, "learning_rate": 4.725596127020879e-07, "logits/chosen": -1.9721529483795166, "logits/rejected": -1.777452826499939, "logps/chosen": -44.33803176879883, "logps/rejected": -61.64508056640625, "loss": 0.2086, "rewards/accuracies": 1.0, "rewards/chosen": 0.06346356868743896, "rewards/margins": 2.4611425399780273, "rewards/rejected": -2.397678852081299, "step": 317 }, { "epoch": 1.8844444444444446, "grad_norm": 32.452202726241005, "learning_rate": 4.7226296026840686e-07, "logits/chosen": -1.0877735614776611, "logits/rejected": -1.0793946981430054, "logps/chosen": -39.05781936645508, "logps/rejected": -49.42589569091797, "loss": 0.2663, "rewards/accuracies": 0.875, "rewards/chosen": -0.8096023797988892, "rewards/margins": 1.485464096069336, "rewards/rejected": -2.2950665950775146, "step": 318 }, { "epoch": 1.8903703703703703, "grad_norm": 34.12614874092215, "learning_rate": 4.7196480706523066e-07, "logits/chosen": -1.5061382055282593, "logits/rejected": -1.2875308990478516, "logps/chosen": -39.065887451171875, "logps/rejected": -52.43286895751953, "loss": 0.2586, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11397549510002136, "rewards/margins": 2.9681153297424316, "rewards/rejected": -3.0820908546447754, "step": 319 }, { "epoch": 1.8962962962962964, "grad_norm": 21.572252361390387, "learning_rate": 4.716651551057567e-07, "logits/chosen": -1.606571912765503, "logits/rejected": -1.3566153049468994, "logps/chosen": -42.1258659362793, "logps/rejected": -60.3515510559082, "loss": 0.1804, "rewards/accuracies": 1.0, "rewards/chosen": -0.23937255144119263, "rewards/margins": 3.642030715942383, "rewards/rejected": -3.8814032077789307, "step": 320 }, { "epoch": 1.9022222222222223, "grad_norm": 23.09776497168057, "learning_rate": 4.7136400641330245e-07, "logits/chosen": -1.7957206964492798, "logits/rejected": -1.8467280864715576, "logps/chosen": -34.64049530029297, "logps/rejected": -55.07294464111328, "loss": 0.2012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3333526849746704, "rewards/margins": 3.44081711769104, "rewards/rejected": -3.774169921875, "step": 321 }, { "epoch": 1.9081481481481481, "grad_norm": 22.774321177776685, "learning_rate": 4.710613630212916e-07, "logits/chosen": -1.482938289642334, "logits/rejected": -1.5811318159103394, "logps/chosen": -45.22616195678711, "logps/rejected": -60.64653015136719, "loss": 0.1673, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5251885652542114, "rewards/margins": 4.717442512512207, "rewards/rejected": -5.242630958557129, "step": 322 }, { "epoch": 1.914074074074074, "grad_norm": 27.731235068985775, "learning_rate": 4.707572269732404e-07, "logits/chosen": -1.5442708730697632, "logits/rejected": -1.4787030220031738, "logps/chosen": -37.137718200683594, "logps/rejected": -54.83098602294922, "loss": 0.2146, "rewards/accuracies": 0.875, "rewards/chosen": 0.15929245948791504, "rewards/margins": 3.0805439949035645, "rewards/rejected": -2.9212515354156494, "step": 323 }, { "epoch": 1.92, "grad_norm": 24.64035908249009, "learning_rate": 4.7045160032274376e-07, "logits/chosen": -1.3611880540847778, "logits/rejected": -1.240279197692871, "logps/chosen": -46.88545608520508, "logps/rejected": -69.9762191772461, "loss": 0.1878, "rewards/accuracies": 1.0, "rewards/chosen": -0.9051852226257324, "rewards/margins": 3.8201780319213867, "rewards/rejected": -4.725363254547119, "step": 324 }, { "epoch": 1.925925925925926, "grad_norm": 25.51618342591243, "learning_rate": 4.701444851334617e-07, "logits/chosen": -1.103247880935669, "logits/rejected": -1.1200182437896729, "logps/chosen": -38.013099670410156, "logps/rejected": -40.656707763671875, "loss": 0.1805, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09585386514663696, "rewards/margins": 2.9236769676208496, "rewards/rejected": -3.019530773162842, "step": 325 }, { "epoch": 1.9318518518518517, "grad_norm": 23.951817662174715, "learning_rate": 4.698358834791051e-07, "logits/chosen": -1.2133781909942627, "logits/rejected": -1.1840780973434448, "logps/chosen": -37.65591812133789, "logps/rejected": -55.01872253417969, "loss": 0.1766, "rewards/accuracies": 1.0, "rewards/chosen": -0.5142883062362671, "rewards/margins": 3.075425386428833, "rewards/rejected": -3.5897135734558105, "step": 326 }, { "epoch": 1.9377777777777778, "grad_norm": 25.13824963787546, "learning_rate": 4.695257974434215e-07, "logits/chosen": -1.3680187463760376, "logits/rejected": -1.4396944046020508, "logps/chosen": -46.585350036621094, "logps/rejected": -49.38848114013672, "loss": 0.2593, "rewards/accuracies": 0.9375, "rewards/chosen": -0.672561526298523, "rewards/margins": 2.856922149658203, "rewards/rejected": -3.5294833183288574, "step": 327 }, { "epoch": 1.9437037037037037, "grad_norm": 20.347354135425682, "learning_rate": 4.6921422912018174e-07, "logits/chosen": -1.5289157629013062, "logits/rejected": -1.3484441041946411, "logps/chosen": -33.157310485839844, "logps/rejected": -60.96978759765625, "loss": 0.1445, "rewards/accuracies": 1.0, "rewards/chosen": -0.4831308424472809, "rewards/margins": 3.873037099838257, "rewards/rejected": -4.356168270111084, "step": 328 }, { "epoch": 1.9496296296296296, "grad_norm": 23.106778896782263, "learning_rate": 4.689011806131651e-07, "logits/chosen": -1.5771872997283936, "logits/rejected": -1.5938518047332764, "logps/chosen": -44.27512741088867, "logps/rejected": -46.58651351928711, "loss": 0.1959, "rewards/accuracies": 0.875, "rewards/chosen": -0.38986605405807495, "rewards/margins": 2.192875623703003, "rewards/rejected": -2.5827417373657227, "step": 329 }, { "epoch": 1.9555555555555557, "grad_norm": 30.558215857635854, "learning_rate": 4.685866540361455e-07, "logits/chosen": -1.098953127861023, "logits/rejected": -0.836959958076477, "logps/chosen": -35.18102264404297, "logps/rejected": -60.627655029296875, "loss": 0.2649, "rewards/accuracies": 1.0, "rewards/chosen": 0.03618580102920532, "rewards/margins": 4.475597381591797, "rewards/rejected": -4.439411640167236, "step": 330 }, { "epoch": 1.9614814814814814, "grad_norm": 26.406377528441052, "learning_rate": 4.6827065151287726e-07, "logits/chosen": -1.6159999370574951, "logits/rejected": -1.5996906757354736, "logps/chosen": -42.592185974121094, "logps/rejected": -61.75004196166992, "loss": 0.2316, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3190871477127075, "rewards/margins": 2.9828944206237793, "rewards/rejected": -3.3019816875457764, "step": 331 }, { "epoch": 1.9674074074074075, "grad_norm": 29.63360426993791, "learning_rate": 4.6795317517708037e-07, "logits/chosen": -1.3993651866912842, "logits/rejected": -1.431746244430542, "logps/chosen": -40.97235107421875, "logps/rejected": -51.440406799316406, "loss": 0.2577, "rewards/accuracies": 1.0, "rewards/chosen": 0.15230146050453186, "rewards/margins": 3.651387929916382, "rewards/rejected": -3.499086380004883, "step": 332 }, { "epoch": 1.9733333333333334, "grad_norm": 23.887681678446096, "learning_rate": 4.676342271724265e-07, "logits/chosen": -2.0206501483917236, "logits/rejected": -1.9321112632751465, "logps/chosen": -33.036163330078125, "logps/rejected": -52.85546112060547, "loss": 0.2217, "rewards/accuracies": 1.0, "rewards/chosen": -0.3907713294029236, "rewards/margins": 3.7629597187042236, "rewards/rejected": -4.153730869293213, "step": 333 }, { "epoch": 1.9792592592592593, "grad_norm": 31.05342628187582, "learning_rate": 4.673138096525243e-07, "logits/chosen": -0.9585044384002686, "logits/rejected": -0.7881700992584229, "logps/chosen": -41.761714935302734, "logps/rejected": -57.95026779174805, "loss": 0.2294, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3382699191570282, "rewards/margins": 2.7629165649414062, "rewards/rejected": -3.1011862754821777, "step": 334 }, { "epoch": 1.9851851851851852, "grad_norm": 23.832470475974606, "learning_rate": 4.6699192478090495e-07, "logits/chosen": -1.7739732265472412, "logits/rejected": -1.647121787071228, "logps/chosen": -36.00649642944336, "logps/rejected": -56.30244445800781, "loss": 0.1638, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5453657507896423, "rewards/margins": 3.5081372261047363, "rewards/rejected": -4.053503036499023, "step": 335 }, { "epoch": 1.991111111111111, "grad_norm": 23.724299564441143, "learning_rate": 4.666685747310074e-07, "logits/chosen": -1.0021092891693115, "logits/rejected": -0.9383841753005981, "logps/chosen": -48.95524597167969, "logps/rejected": -65.27790832519531, "loss": 0.1646, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2525467276573181, "rewards/margins": 4.3653106689453125, "rewards/rejected": -4.617857456207275, "step": 336 }, { "epoch": 1.9970370370370372, "grad_norm": 26.482303767274114, "learning_rate": 4.663437616861641e-07, "logits/chosen": -1.426429271697998, "logits/rejected": -1.4977319240570068, "logps/chosen": -38.05354309082031, "logps/rejected": -45.32928466796875, "loss": 0.1916, "rewards/accuracies": 0.875, "rewards/chosen": -0.18445011973381042, "rewards/margins": 3.021121025085449, "rewards/rejected": -3.205570697784424, "step": 337 }, { "epoch": 2.002962962962963, "grad_norm": 16.43992788407079, "learning_rate": 4.660174878395855e-07, "logits/chosen": -1.5523961782455444, "logits/rejected": -1.3887999057769775, "logps/chosen": -42.240272521972656, "logps/rejected": -56.8974723815918, "loss": 0.1446, "rewards/accuracies": 0.9375, "rewards/chosen": -0.46149742603302, "rewards/margins": 4.195261001586914, "rewards/rejected": -4.656758785247803, "step": 338 }, { "epoch": 2.008888888888889, "grad_norm": 11.567648071422163, "learning_rate": 4.6568975539434624e-07, "logits/chosen": -1.6070256233215332, "logits/rejected": -1.5450011491775513, "logps/chosen": -30.87481689453125, "logps/rejected": -46.38202667236328, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 0.001979619264602661, "rewards/margins": 2.454421281814575, "rewards/rejected": -2.452441692352295, "step": 339 }, { "epoch": 2.0148148148148146, "grad_norm": 10.578338909286192, "learning_rate": 4.653605665633694e-07, "logits/chosen": -1.3266496658325195, "logits/rejected": -1.1827898025512695, "logps/chosen": -46.79537582397461, "logps/rejected": -70.38484191894531, "loss": 0.1019, "rewards/accuracies": 1.0, "rewards/chosen": -0.8711462020874023, "rewards/margins": 3.143320083618164, "rewards/rejected": -4.014466285705566, "step": 340 }, { "epoch": 2.0207407407407407, "grad_norm": 12.217181224347149, "learning_rate": 4.6502992356941193e-07, "logits/chosen": -1.832109808921814, "logits/rejected": -1.8823586702346802, "logps/chosen": -41.478126525878906, "logps/rejected": -60.84067916870117, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": -0.2918078899383545, "rewards/margins": 3.8748106956481934, "rewards/rejected": -4.166618824005127, "step": 341 }, { "epoch": 2.026666666666667, "grad_norm": 12.075752437338522, "learning_rate": 4.6469782864504993e-07, "logits/chosen": -1.3877170085906982, "logits/rejected": -1.3782953023910522, "logps/chosen": -44.92713165283203, "logps/rejected": -60.12208557128906, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": -0.3753489553928375, "rewards/margins": 3.521347999572754, "rewards/rejected": -3.8966970443725586, "step": 342 }, { "epoch": 2.0325925925925925, "grad_norm": 10.663089210825552, "learning_rate": 4.643642840326627e-07, "logits/chosen": -1.2021465301513672, "logits/rejected": -0.9962760210037231, "logps/chosen": -27.163143157958984, "logps/rejected": -56.33610534667969, "loss": 0.087, "rewards/accuracies": 1.0, "rewards/chosen": -0.251925528049469, "rewards/margins": 3.977407455444336, "rewards/rejected": -4.22933292388916, "step": 343 }, { "epoch": 2.0385185185185186, "grad_norm": 13.245834881134732, "learning_rate": 4.6402929198441876e-07, "logits/chosen": -1.2417113780975342, "logits/rejected": -1.3178651332855225, "logps/chosen": -40.25837326049805, "logps/rejected": -54.30005645751953, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": 0.05435517430305481, "rewards/margins": 4.211629867553711, "rewards/rejected": -4.1572747230529785, "step": 344 }, { "epoch": 2.0444444444444443, "grad_norm": 10.571718543433656, "learning_rate": 4.6369285476225953e-07, "logits/chosen": -1.4555777311325073, "logits/rejected": -1.3790762424468994, "logps/chosen": -28.030969619750977, "logps/rejected": -49.054386138916016, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": -0.06195509433746338, "rewards/margins": 3.6887340545654297, "rewards/rejected": -3.7506890296936035, "step": 345 }, { "epoch": 2.0503703703703704, "grad_norm": 12.624900961599034, "learning_rate": 4.6335497463788497e-07, "logits/chosen": -1.6352788209915161, "logits/rejected": -1.643273115158081, "logps/chosen": -49.78592300415039, "logps/rejected": -65.03792572021484, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": -0.6293268799781799, "rewards/margins": 4.604469299316406, "rewards/rejected": -5.2337965965271, "step": 346 }, { "epoch": 2.0562962962962965, "grad_norm": 8.46422535980816, "learning_rate": 4.6301565389273755e-07, "logits/chosen": -1.5873762369155884, "logits/rejected": -1.4620087146759033, "logps/chosen": -37.836265563964844, "logps/rejected": -47.346160888671875, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": 0.030138731002807617, "rewards/margins": 3.4804584980010986, "rewards/rejected": -3.450319766998291, "step": 347 }, { "epoch": 2.062222222222222, "grad_norm": 12.580939295603663, "learning_rate": 4.6267489481798736e-07, "logits/chosen": -1.265205979347229, "logits/rejected": -1.074588656425476, "logps/chosen": -42.781517028808594, "logps/rejected": -64.60198974609375, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": -0.2199668288230896, "rewards/margins": 5.077633380889893, "rewards/rejected": -5.297599792480469, "step": 348 }, { "epoch": 2.0681481481481483, "grad_norm": 17.749133036624073, "learning_rate": 4.6233269971451627e-07, "logits/chosen": -1.1558200120925903, "logits/rejected": -1.1175287961959839, "logps/chosen": -47.773460388183594, "logps/rejected": -61.29571533203125, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": -0.3092385530471802, "rewards/margins": 3.6212716102600098, "rewards/rejected": -3.9305102825164795, "step": 349 }, { "epoch": 2.074074074074074, "grad_norm": 14.385543975965255, "learning_rate": 4.619890708929025e-07, "logits/chosen": -1.5604002475738525, "logits/rejected": -1.399170160293579, "logps/chosen": -40.24705505371094, "logps/rejected": -54.07878875732422, "loss": 0.1248, "rewards/accuracies": 1.0, "rewards/chosen": -0.07989335060119629, "rewards/margins": 2.8104934692382812, "rewards/rejected": -2.8903868198394775, "step": 350 }, { "epoch": 2.08, "grad_norm": 8.82921299232262, "learning_rate": 4.6164401067340526e-07, "logits/chosen": -1.6723850965499878, "logits/rejected": -1.534740686416626, "logps/chosen": -40.84113311767578, "logps/rejected": -51.969539642333984, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": -0.15863922238349915, "rewards/margins": 3.5295114517211914, "rewards/rejected": -3.688150405883789, "step": 351 }, { "epoch": 2.0859259259259257, "grad_norm": 11.642298858178583, "learning_rate": 4.612975213859487e-07, "logits/chosen": -1.572222113609314, "logits/rejected": -1.4436020851135254, "logps/chosen": -43.44025802612305, "logps/rejected": -70.96936798095703, "loss": 0.0749, "rewards/accuracies": 1.0, "rewards/chosen": -0.4972277283668518, "rewards/margins": 4.664485931396484, "rewards/rejected": -5.161713600158691, "step": 352 }, { "epoch": 2.091851851851852, "grad_norm": 9.739329509065628, "learning_rate": 4.609496053701064e-07, "logits/chosen": -1.2559152841567993, "logits/rejected": -1.027353048324585, "logps/chosen": -38.480262756347656, "logps/rejected": -63.803558349609375, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": -0.8633000254631042, "rewards/margins": 5.508317470550537, "rewards/rejected": -6.371617317199707, "step": 353 }, { "epoch": 2.097777777777778, "grad_norm": 11.068841998119069, "learning_rate": 4.606002649750855e-07, "logits/chosen": -1.7230266332626343, "logits/rejected": -1.7567236423492432, "logps/chosen": -41.075870513916016, "logps/rejected": -57.1865234375, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": -0.2410399317741394, "rewards/margins": 4.128354549407959, "rewards/rejected": -4.369394302368164, "step": 354 }, { "epoch": 2.1037037037037036, "grad_norm": 10.99841662995637, "learning_rate": 4.6024950255971106e-07, "logits/chosen": -1.9418741464614868, "logits/rejected": -1.7188175916671753, "logps/chosen": -41.123714447021484, "logps/rejected": -60.509952545166016, "loss": 0.0736, "rewards/accuracies": 0.9375, "rewards/chosen": -0.25673192739486694, "rewards/margins": 3.7385413646698, "rewards/rejected": -3.9952731132507324, "step": 355 }, { "epoch": 2.1096296296296297, "grad_norm": 12.220073717808429, "learning_rate": 4.598973204924097e-07, "logits/chosen": -1.5152227878570557, "logits/rejected": -1.2851159572601318, "logps/chosen": -34.16219711303711, "logps/rejected": -58.27785873413086, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 0.058980196714401245, "rewards/margins": 4.199907302856445, "rewards/rejected": -4.140926837921143, "step": 356 }, { "epoch": 2.1155555555555554, "grad_norm": 11.959716813475435, "learning_rate": 4.5954372115119395e-07, "logits/chosen": -1.0060768127441406, "logits/rejected": -0.8774399161338806, "logps/chosen": -38.00749206542969, "logps/rejected": -55.84514617919922, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 0.07694879174232483, "rewards/margins": 4.28627347946167, "rewards/rejected": -4.209324836730957, "step": 357 }, { "epoch": 2.1214814814814815, "grad_norm": 13.030300625065772, "learning_rate": 4.5918870692364606e-07, "logits/chosen": -1.5811065435409546, "logits/rejected": -1.4525682926177979, "logps/chosen": -42.92179489135742, "logps/rejected": -65.98393249511719, "loss": 0.1022, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11530932784080505, "rewards/margins": 4.738978385925293, "rewards/rejected": -4.854287624359131, "step": 358 }, { "epoch": 2.127407407407407, "grad_norm": 11.182842417441053, "learning_rate": 4.5883228020690204e-07, "logits/chosen": -1.7302151918411255, "logits/rejected": -1.6962320804595947, "logps/chosen": -46.37171173095703, "logps/rejected": -77.80236053466797, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": -0.49540960788726807, "rewards/margins": 5.423673152923584, "rewards/rejected": -5.9190826416015625, "step": 359 }, { "epoch": 2.1333333333333333, "grad_norm": 12.012153556810627, "learning_rate": 4.5847444340763516e-07, "logits/chosen": -2.043246269226074, "logits/rejected": -1.7466188669204712, "logps/chosen": -36.21939468383789, "logps/rejected": -70.36598205566406, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": -0.5813055038452148, "rewards/margins": 4.620772361755371, "rewards/rejected": -5.202077865600586, "step": 360 }, { "epoch": 2.1392592592592594, "grad_norm": 10.866062973581458, "learning_rate": 4.5811519894204e-07, "logits/chosen": -1.3014001846313477, "logits/rejected": -1.369365930557251, "logps/chosen": -37.454872131347656, "logps/rejected": -48.353370666503906, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": -0.7284749746322632, "rewards/margins": 3.3361270427703857, "rewards/rejected": -4.064601898193359, "step": 361 }, { "epoch": 2.145185185185185, "grad_norm": 10.984013926118568, "learning_rate": 4.577545492358159e-07, "logits/chosen": -1.3029181957244873, "logits/rejected": -1.2786986827850342, "logps/chosen": -34.83509826660156, "logps/rejected": -43.100284576416016, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": -0.24407219886779785, "rewards/margins": 3.1360340118408203, "rewards/rejected": -3.3801064491271973, "step": 362 }, { "epoch": 2.151111111111111, "grad_norm": 15.321427963115688, "learning_rate": 4.573924967241509e-07, "logits/chosen": -1.6620972156524658, "logits/rejected": -1.6337506771087646, "logps/chosen": -41.56214904785156, "logps/rejected": -55.2154655456543, "loss": 0.0901, "rewards/accuracies": 1.0, "rewards/chosen": 0.0873856246471405, "rewards/margins": 3.668139696121216, "rewards/rejected": -3.580754041671753, "step": 363 }, { "epoch": 2.157037037037037, "grad_norm": 13.335824438416493, "learning_rate": 4.5702904385170495e-07, "logits/chosen": -1.4828283786773682, "logits/rejected": -1.3530676364898682, "logps/chosen": -37.79193878173828, "logps/rejected": -55.125858306884766, "loss": 0.0921, "rewards/accuracies": 1.0, "rewards/chosen": -0.4327038824558258, "rewards/margins": 3.8411686420440674, "rewards/rejected": -4.273872375488281, "step": 364 }, { "epoch": 2.162962962962963, "grad_norm": 10.513080158547776, "learning_rate": 4.566641930725935e-07, "logits/chosen": -0.9346736669540405, "logits/rejected": -0.8324167728424072, "logps/chosen": -38.22724533081055, "logps/rejected": -60.57867431640625, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": -0.40388479828834534, "rewards/margins": 3.9846858978271484, "rewards/rejected": -4.388570785522461, "step": 365 }, { "epoch": 2.168888888888889, "grad_norm": 17.76026800342366, "learning_rate": 4.5629794685037125e-07, "logits/chosen": -1.5361857414245605, "logits/rejected": -1.4024276733398438, "logps/chosen": -41.96015548706055, "logps/rejected": -64.81401824951172, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": -0.6312123537063599, "rewards/margins": 4.2823896408081055, "rewards/rejected": -4.913601875305176, "step": 366 }, { "epoch": 2.1748148148148148, "grad_norm": 14.5609677400579, "learning_rate": 4.5593030765801493e-07, "logits/chosen": -1.497239589691162, "logits/rejected": -1.434057354927063, "logps/chosen": -37.64501953125, "logps/rejected": -54.797393798828125, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": -0.6892575025558472, "rewards/margins": 4.99005651473999, "rewards/rejected": -5.679313659667969, "step": 367 }, { "epoch": 2.180740740740741, "grad_norm": 8.065156807391954, "learning_rate": 4.555612779779071e-07, "logits/chosen": -1.3586572408676147, "logits/rejected": -1.0452971458435059, "logps/chosen": -43.58069610595703, "logps/rejected": -63.918155670166016, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": -0.6702809929847717, "rewards/margins": 4.295268535614014, "rewards/rejected": -4.965549468994141, "step": 368 }, { "epoch": 2.1866666666666665, "grad_norm": 9.937243299148612, "learning_rate": 4.551908603018191e-07, "logits/chosen": -1.537664771080017, "logits/rejected": -1.4797343015670776, "logps/chosen": -40.95267868041992, "logps/rejected": -59.62392807006836, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": -0.8830336928367615, "rewards/margins": 4.300657272338867, "rewards/rejected": -5.183691024780273, "step": 369 }, { "epoch": 2.1925925925925926, "grad_norm": 12.149045017374526, "learning_rate": 4.548190571308944e-07, "logits/chosen": -1.8814219236373901, "logits/rejected": -1.5982637405395508, "logps/chosen": -40.04777145385742, "logps/rejected": -68.85643768310547, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": -0.5396410226821899, "rewards/margins": 4.6829514503479, "rewards/rejected": -5.222592353820801, "step": 370 }, { "epoch": 2.1985185185185183, "grad_norm": 16.77656741404385, "learning_rate": 4.5444587097563166e-07, "logits/chosen": -1.4021185636520386, "logits/rejected": -1.355046033859253, "logps/chosen": -41.689239501953125, "logps/rejected": -54.89845657348633, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": -0.5340372323989868, "rewards/margins": 3.982806444168091, "rewards/rejected": -4.516843795776367, "step": 371 }, { "epoch": 2.2044444444444444, "grad_norm": 11.063737609204612, "learning_rate": 4.540713043558677e-07, "logits/chosen": -1.2235300540924072, "logits/rejected": -1.3220264911651611, "logps/chosen": -47.1357421875, "logps/rejected": -69.21366882324219, "loss": 0.0872, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04345780611038208, "rewards/margins": 5.431947708129883, "rewards/rejected": -5.388490676879883, "step": 372 }, { "epoch": 2.2103703703703705, "grad_norm": 11.275739717839482, "learning_rate": 4.536953598007607e-07, "logits/chosen": -1.437028169631958, "logits/rejected": -1.5387985706329346, "logps/chosen": -53.654788970947266, "logps/rejected": -53.855712890625, "loss": 0.0746, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4302152693271637, "rewards/margins": 4.380431175231934, "rewards/rejected": -4.8106465339660645, "step": 373 }, { "epoch": 2.216296296296296, "grad_norm": 10.838669499719408, "learning_rate": 4.533180398487726e-07, "logits/chosen": -1.4005894660949707, "logits/rejected": -1.527348279953003, "logps/chosen": -53.5523681640625, "logps/rejected": -60.16189956665039, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": -0.6702936887741089, "rewards/margins": 5.403100967407227, "rewards/rejected": -6.073394775390625, "step": 374 }, { "epoch": 2.2222222222222223, "grad_norm": 11.602629913479493, "learning_rate": 4.529393470476528e-07, "logits/chosen": -1.31447172164917, "logits/rejected": -1.3682016134262085, "logps/chosen": -38.680519104003906, "logps/rejected": -41.53880310058594, "loss": 0.0792, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2111378014087677, "rewards/margins": 3.4360599517822266, "rewards/rejected": -3.6471974849700928, "step": 375 }, { "epoch": 2.228148148148148, "grad_norm": 13.311358584472384, "learning_rate": 4.525592839544202e-07, "logits/chosen": -1.437548041343689, "logits/rejected": -1.2081105709075928, "logps/chosen": -34.22154235839844, "logps/rejected": -64.00574493408203, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": -1.184605598449707, "rewards/margins": 4.785362720489502, "rewards/rejected": -5.969968795776367, "step": 376 }, { "epoch": 2.234074074074074, "grad_norm": 10.754664912428062, "learning_rate": 4.521778531353462e-07, "logits/chosen": -1.645902156829834, "logits/rejected": -1.533874750137329, "logps/chosen": -41.23374938964844, "logps/rejected": -60.57324981689453, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": -0.9864581227302551, "rewards/margins": 4.8595967292785645, "rewards/rejected": -5.846055030822754, "step": 377 }, { "epoch": 2.24, "grad_norm": 10.277846832273466, "learning_rate": 4.517950571659376e-07, "logits/chosen": -1.3001539707183838, "logits/rejected": -1.2223761081695557, "logps/chosen": -31.6280574798584, "logps/rejected": -57.09972381591797, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": -0.4209780693054199, "rewards/margins": 4.4373779296875, "rewards/rejected": -4.858355522155762, "step": 378 }, { "epoch": 2.245925925925926, "grad_norm": 11.884820901078847, "learning_rate": 4.5141089863091876e-07, "logits/chosen": -1.5289418697357178, "logits/rejected": -1.4325823783874512, "logps/chosen": -38.5244140625, "logps/rejected": -60.34238052368164, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": -0.4513339400291443, "rewards/margins": 4.745321273803711, "rewards/rejected": -5.1966552734375, "step": 379 }, { "epoch": 2.251851851851852, "grad_norm": 11.294048882773792, "learning_rate": 4.5102538012421463e-07, "logits/chosen": -1.715264081954956, "logits/rejected": -1.6054749488830566, "logps/chosen": -31.455442428588867, "logps/rejected": -51.70906066894531, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 0.01587429642677307, "rewards/margins": 4.770445823669434, "rewards/rejected": -4.754571914672852, "step": 380 }, { "epoch": 2.2577777777777777, "grad_norm": 11.72778067934763, "learning_rate": 4.506385042489328e-07, "logits/chosen": -1.2105664014816284, "logits/rejected": -1.2829630374908447, "logps/chosen": -41.66542434692383, "logps/rejected": -58.2675895690918, "loss": 0.094, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11428321897983551, "rewards/margins": 4.219393730163574, "rewards/rejected": -4.333676815032959, "step": 381 }, { "epoch": 2.2637037037037038, "grad_norm": 11.88382896794105, "learning_rate": 4.5025027361734613e-07, "logits/chosen": -1.5602684020996094, "logits/rejected": -1.506664514541626, "logps/chosen": -32.05914306640625, "logps/rejected": -60.42582702636719, "loss": 0.0806, "rewards/accuracies": 1.0, "rewards/chosen": -0.41543838381767273, "rewards/margins": 5.331958293914795, "rewards/rejected": -5.747396469116211, "step": 382 }, { "epoch": 2.2696296296296294, "grad_norm": 12.79526415048381, "learning_rate": 4.498606908508753e-07, "logits/chosen": -1.3425393104553223, "logits/rejected": -1.3030786514282227, "logps/chosen": -34.17608642578125, "logps/rejected": -55.656944274902344, "loss": 0.0939, "rewards/accuracies": 1.0, "rewards/chosen": -0.24564027786254883, "rewards/margins": 5.182816505432129, "rewards/rejected": -5.428457260131836, "step": 383 }, { "epoch": 2.2755555555555556, "grad_norm": 8.607081975884558, "learning_rate": 4.4946975858007064e-07, "logits/chosen": -1.390102505683899, "logits/rejected": -1.2515252828598022, "logps/chosen": -30.730792999267578, "logps/rejected": -53.722434997558594, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": -0.39248955249786377, "rewards/margins": 5.531357765197754, "rewards/rejected": -5.923847675323486, "step": 384 }, { "epoch": 2.2814814814814817, "grad_norm": 16.021871038921365, "learning_rate": 4.4907747944459484e-07, "logits/chosen": -1.4583913087844849, "logits/rejected": -1.394193172454834, "logps/chosen": -48.586524963378906, "logps/rejected": -57.825904846191406, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": -0.9691762924194336, "rewards/margins": 3.857024669647217, "rewards/rejected": -4.82620096206665, "step": 385 }, { "epoch": 2.2874074074074073, "grad_norm": 12.513747923528404, "learning_rate": 4.486838560932048e-07, "logits/chosen": -1.4857195615768433, "logits/rejected": -1.4608405828475952, "logps/chosen": -39.87062072753906, "logps/rejected": -54.67584228515625, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": -0.5097848176956177, "rewards/margins": 3.999152183532715, "rewards/rejected": -4.508936882019043, "step": 386 }, { "epoch": 2.2933333333333334, "grad_norm": 9.060948338933432, "learning_rate": 4.4828889118373394e-07, "logits/chosen": -1.5972692966461182, "logits/rejected": -1.5197217464447021, "logps/chosen": -46.44121551513672, "logps/rejected": -63.70100402832031, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": -0.3537145256996155, "rewards/margins": 5.512208938598633, "rewards/rejected": -5.865923881530762, "step": 387 }, { "epoch": 2.299259259259259, "grad_norm": 12.21618451799351, "learning_rate": 4.4789258738307413e-07, "logits/chosen": -1.712825059890747, "logits/rejected": -1.6520627737045288, "logps/chosen": -37.02870178222656, "logps/rejected": -57.479644775390625, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": -0.015223681926727295, "rewards/margins": 4.26918888092041, "rewards/rejected": -4.284412860870361, "step": 388 }, { "epoch": 2.3051851851851852, "grad_norm": 11.212588488728503, "learning_rate": 4.474949473671578e-07, "logits/chosen": -1.6660000085830688, "logits/rejected": -1.5088659524917603, "logps/chosen": -34.143104553222656, "logps/rejected": -53.65770721435547, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 0.29158514738082886, "rewards/margins": 5.010606288909912, "rewards/rejected": -4.719020843505859, "step": 389 }, { "epoch": 2.311111111111111, "grad_norm": 8.774018598386073, "learning_rate": 4.4709597382093976e-07, "logits/chosen": -1.3193117380142212, "logits/rejected": -1.144613265991211, "logps/chosen": -36.79685592651367, "logps/rejected": -60.21685028076172, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": -0.01098334789276123, "rewards/margins": 4.089751720428467, "rewards/rejected": -4.100735187530518, "step": 390 }, { "epoch": 2.317037037037037, "grad_norm": 12.868024033007977, "learning_rate": 4.4669566943837916e-07, "logits/chosen": -1.4795200824737549, "logits/rejected": -1.2550170421600342, "logps/chosen": -38.5240478515625, "logps/rejected": -57.04312515258789, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": -0.6837342381477356, "rewards/margins": 4.087418079376221, "rewards/rejected": -4.771152496337891, "step": 391 }, { "epoch": 2.322962962962963, "grad_norm": 9.209291618824878, "learning_rate": 4.462940369224212e-07, "logits/chosen": -1.875885009765625, "logits/rejected": -1.897498607635498, "logps/chosen": -39.49676513671875, "logps/rejected": -59.790672302246094, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -0.7308411598205566, "rewards/margins": 5.174574375152588, "rewards/rejected": -5.9054155349731445, "step": 392 }, { "epoch": 2.328888888888889, "grad_norm": 9.72538897552064, "learning_rate": 4.4589107898497885e-07, "logits/chosen": -1.1584303379058838, "logits/rejected": -1.0515251159667969, "logps/chosen": -41.24639129638672, "logps/rejected": -58.28783416748047, "loss": 0.0606, "rewards/accuracies": 1.0, "rewards/chosen": -0.7245338559150696, "rewards/margins": 5.01975154876709, "rewards/rejected": -5.744285583496094, "step": 393 }, { "epoch": 2.334814814814815, "grad_norm": 10.129377936987572, "learning_rate": 4.454867983469148e-07, "logits/chosen": -1.7846736907958984, "logits/rejected": -1.6749004125595093, "logps/chosen": -38.921051025390625, "logps/rejected": -52.41502380371094, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": -0.06997081637382507, "rewards/margins": 3.646679639816284, "rewards/rejected": -3.7166504859924316, "step": 394 }, { "epoch": 2.3407407407407406, "grad_norm": 10.256827451169878, "learning_rate": 4.4508119773802294e-07, "logits/chosen": -1.655611515045166, "logits/rejected": -1.486838936805725, "logps/chosen": -31.4853515625, "logps/rejected": -52.45492172241211, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": -0.4950629770755768, "rewards/margins": 4.72844123840332, "rewards/rejected": -5.223504066467285, "step": 395 }, { "epoch": 2.3466666666666667, "grad_norm": 14.641010278912901, "learning_rate": 4.4467427989700967e-07, "logits/chosen": -1.8286457061767578, "logits/rejected": -1.8515902757644653, "logps/chosen": -52.1026496887207, "logps/rejected": -67.81857299804688, "loss": 0.104, "rewards/accuracies": 0.9375, "rewards/chosen": -0.699891209602356, "rewards/margins": 5.736815452575684, "rewards/rejected": -6.43670654296875, "step": 396 }, { "epoch": 2.3525925925925923, "grad_norm": 13.932979253909581, "learning_rate": 4.442660475714758e-07, "logits/chosen": -1.407354474067688, "logits/rejected": -1.4280579090118408, "logps/chosen": -48.2438850402832, "logps/rejected": -62.31871032714844, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": -0.9111906290054321, "rewards/margins": 4.790341377258301, "rewards/rejected": -5.701531887054443, "step": 397 }, { "epoch": 2.3585185185185185, "grad_norm": 6.215058288543995, "learning_rate": 4.438565035178979e-07, "logits/chosen": -1.617882490158081, "logits/rejected": -1.507187008857727, "logps/chosen": -39.37698745727539, "logps/rejected": -54.16233825683594, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -0.2572103440761566, "rewards/margins": 4.302726745605469, "rewards/rejected": -4.559937000274658, "step": 398 }, { "epoch": 2.3644444444444446, "grad_norm": 9.899082388032094, "learning_rate": 4.434456505016094e-07, "logits/chosen": -1.5732052326202393, "logits/rejected": -1.5536649227142334, "logps/chosen": -32.2484130859375, "logps/rejected": -49.28650665283203, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": -0.1696593165397644, "rewards/margins": 3.8268446922302246, "rewards/rejected": -3.996504306793213, "step": 399 }, { "epoch": 2.3703703703703702, "grad_norm": 10.019608057983106, "learning_rate": 4.430334912967823e-07, "logits/chosen": -1.7139465808868408, "logits/rejected": -1.4648141860961914, "logps/chosen": -38.92392349243164, "logps/rejected": -59.63561248779297, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": -0.45068129897117615, "rewards/margins": 4.821527004241943, "rewards/rejected": -5.2722086906433105, "step": 400 }, { "epoch": 2.3762962962962964, "grad_norm": 6.146789861935184, "learning_rate": 4.4262002868640826e-07, "logits/chosen": -1.6599886417388916, "logits/rejected": -1.843379259109497, "logps/chosen": -53.537452697753906, "logps/rejected": -63.20209503173828, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -0.8360182046890259, "rewards/margins": 4.479837894439697, "rewards/rejected": -5.315855979919434, "step": 401 }, { "epoch": 2.3822222222222225, "grad_norm": 8.120322997880242, "learning_rate": 4.422052654622799e-07, "logits/chosen": -1.338700532913208, "logits/rejected": -1.5274639129638672, "logps/chosen": -46.13654327392578, "logps/rejected": -60.73787307739258, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -1.356442928314209, "rewards/margins": 5.699449062347412, "rewards/rejected": -7.055891990661621, "step": 402 }, { "epoch": 2.388148148148148, "grad_norm": 17.358175977582217, "learning_rate": 4.417892044249716e-07, "logits/chosen": -1.484145998954773, "logits/rejected": -1.342088222503662, "logps/chosen": -39.93052673339844, "logps/rejected": -59.5439453125, "loss": 0.116, "rewards/accuracies": 1.0, "rewards/chosen": -0.8260771036148071, "rewards/margins": 4.9727325439453125, "rewards/rejected": -5.79880952835083, "step": 403 }, { "epoch": 2.3940740740740742, "grad_norm": 8.567071041959487, "learning_rate": 4.4137184838382125e-07, "logits/chosen": -1.9984219074249268, "logits/rejected": -1.9365501403808594, "logps/chosen": -46.297367095947266, "logps/rejected": -62.80537414550781, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": -0.19484835863113403, "rewards/margins": 5.696597099304199, "rewards/rejected": -5.891445636749268, "step": 404 }, { "epoch": 2.4, "grad_norm": 9.265199389793064, "learning_rate": 4.409532001569105e-07, "logits/chosen": -1.2973332405090332, "logits/rejected": -1.4657697677612305, "logps/chosen": -37.72175598144531, "logps/rejected": -57.419185638427734, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": -1.0432603359222412, "rewards/margins": 5.144648551940918, "rewards/rejected": -6.187908172607422, "step": 405 }, { "epoch": 2.405925925925926, "grad_norm": 17.31160313347824, "learning_rate": 4.405332625710465e-07, "logits/chosen": -1.5048401355743408, "logits/rejected": -1.5686776638031006, "logps/chosen": -41.25992202758789, "logps/rejected": -51.92019271850586, "loss": 0.0932, "rewards/accuracies": 1.0, "rewards/chosen": -0.7663825750350952, "rewards/margins": 3.883291006088257, "rewards/rejected": -4.6496734619140625, "step": 406 }, { "epoch": 2.4118518518518517, "grad_norm": 13.886785554067659, "learning_rate": 4.401120384617423e-07, "logits/chosen": -1.3854293823242188, "logits/rejected": -1.417116641998291, "logps/chosen": -49.49497604370117, "logps/rejected": -62.962364196777344, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": -1.0801547765731812, "rewards/margins": 5.861600875854492, "rewards/rejected": -6.941755294799805, "step": 407 }, { "epoch": 2.417777777777778, "grad_norm": 13.366585938571683, "learning_rate": 4.396895306731977e-07, "logits/chosen": -1.0745258331298828, "logits/rejected": -1.209951400756836, "logps/chosen": -42.42633819580078, "logps/rejected": -50.970062255859375, "loss": 0.0778, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09873640537261963, "rewards/margins": 4.979592800140381, "rewards/rejected": -5.078329086303711, "step": 408 }, { "epoch": 2.423703703703704, "grad_norm": 10.608096407712143, "learning_rate": 4.3926574205828037e-07, "logits/chosen": -1.454784631729126, "logits/rejected": -1.3318755626678467, "logps/chosen": -28.561241149902344, "logps/rejected": -51.40495300292969, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": -0.3801489770412445, "rewards/margins": 4.299707412719727, "rewards/rejected": -4.679856300354004, "step": 409 }, { "epoch": 2.4296296296296296, "grad_norm": 10.042327426623421, "learning_rate": 4.388406754785063e-07, "logits/chosen": -1.4973795413970947, "logits/rejected": -1.3662190437316895, "logps/chosen": -34.50919723510742, "logps/rejected": -56.294063568115234, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": -0.7284948825836182, "rewards/margins": 4.622704029083252, "rewards/rejected": -5.351198673248291, "step": 410 }, { "epoch": 2.4355555555555557, "grad_norm": 12.535369572053135, "learning_rate": 4.3841433380402073e-07, "logits/chosen": -1.4382065534591675, "logits/rejected": -1.2739676237106323, "logps/chosen": -40.13460922241211, "logps/rejected": -68.07673645019531, "loss": 0.0749, "rewards/accuracies": 1.0, "rewards/chosen": -0.0914524495601654, "rewards/margins": 5.701619625091553, "rewards/rejected": -5.79307222366333, "step": 411 }, { "epoch": 2.4414814814814814, "grad_norm": 9.584609693394986, "learning_rate": 4.379867199135785e-07, "logits/chosen": -1.039376974105835, "logits/rejected": -0.6864200234413147, "logps/chosen": -36.3159294128418, "logps/rejected": -68.66885375976562, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": -0.8507220149040222, "rewards/margins": 6.0718865394592285, "rewards/rejected": -6.922608375549316, "step": 412 }, { "epoch": 2.4474074074074075, "grad_norm": 10.158194545720395, "learning_rate": 4.375578366945246e-07, "logits/chosen": -1.2954965829849243, "logits/rejected": -1.2651395797729492, "logps/chosen": -41.173309326171875, "logps/rejected": -54.60121154785156, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": -1.059409737586975, "rewards/margins": 4.310122966766357, "rewards/rejected": -5.369532585144043, "step": 413 }, { "epoch": 2.453333333333333, "grad_norm": 15.06897747431, "learning_rate": 4.3712768704277524e-07, "logits/chosen": -1.5561192035675049, "logits/rejected": -1.5117237567901611, "logps/chosen": -35.58319091796875, "logps/rejected": -50.53232192993164, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": -0.16081687808036804, "rewards/margins": 4.436723709106445, "rewards/rejected": -4.597539901733398, "step": 414 }, { "epoch": 2.4592592592592593, "grad_norm": 16.494448887111883, "learning_rate": 4.366962738627975e-07, "logits/chosen": -1.8110413551330566, "logits/rejected": -1.7763352394104004, "logps/chosen": -27.89055633544922, "logps/rejected": -52.024383544921875, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": -0.5048554539680481, "rewards/margins": 4.789825439453125, "rewards/rejected": -5.294680595397949, "step": 415 }, { "epoch": 2.4651851851851854, "grad_norm": 10.39957564348003, "learning_rate": 4.3626360006759016e-07, "logits/chosen": -1.3889925479888916, "logits/rejected": -1.4343739748001099, "logps/chosen": -44.50629806518555, "logps/rejected": -59.46120071411133, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": 0.5878319144248962, "rewards/margins": 5.08259391784668, "rewards/rejected": -4.494762420654297, "step": 416 }, { "epoch": 2.471111111111111, "grad_norm": 7.962756765530989, "learning_rate": 4.3582966857866397e-07, "logits/chosen": -1.805254340171814, "logits/rejected": -1.5298397541046143, "logps/chosen": -35.40469741821289, "logps/rejected": -59.09782409667969, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 0.034172892570495605, "rewards/margins": 5.298932075500488, "rewards/rejected": -5.264759063720703, "step": 417 }, { "epoch": 2.477037037037037, "grad_norm": 9.729500240098128, "learning_rate": 4.353944823260221e-07, "logits/chosen": -1.2123661041259766, "logits/rejected": -1.0601003170013428, "logps/chosen": -35.60749053955078, "logps/rejected": -62.845333099365234, "loss": 0.0706, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5825104713439941, "rewards/margins": 4.653536796569824, "rewards/rejected": -5.23604679107666, "step": 418 }, { "epoch": 2.482962962962963, "grad_norm": 13.078064168370302, "learning_rate": 4.3495804424813986e-07, "logits/chosen": -1.7815532684326172, "logits/rejected": -1.5699641704559326, "logps/chosen": -36.4069938659668, "logps/rejected": -56.897377014160156, "loss": 0.0903, "rewards/accuracies": 1.0, "rewards/chosen": 0.07872982323169708, "rewards/margins": 4.7484540939331055, "rewards/rejected": -4.669724464416504, "step": 419 }, { "epoch": 2.488888888888889, "grad_norm": 13.298324800536891, "learning_rate": 4.3452035729194534e-07, "logits/chosen": -1.675660490989685, "logits/rejected": -1.5707687139511108, "logps/chosen": -36.95539093017578, "logps/rejected": -63.85231018066406, "loss": 0.0895, "rewards/accuracies": 1.0, "rewards/chosen": 0.03855517506599426, "rewards/margins": 6.0861358642578125, "rewards/rejected": -6.047580718994141, "step": 420 }, { "epoch": 2.4948148148148146, "grad_norm": 10.091782491568125, "learning_rate": 4.340814244127993e-07, "logits/chosen": -1.6767897605895996, "logits/rejected": -1.5757229328155518, "logps/chosen": -40.57682800292969, "logps/rejected": -56.93830871582031, "loss": 0.0705, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6066794395446777, "rewards/margins": 3.8079683780670166, "rewards/rejected": -4.414648056030273, "step": 421 }, { "epoch": 2.5007407407407407, "grad_norm": 7.613026313203136, "learning_rate": 4.3364124857447525e-07, "logits/chosen": -1.4799020290374756, "logits/rejected": -1.1713473796844482, "logps/chosen": -44.87807083129883, "logps/rejected": -65.74991607666016, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": -0.8942358493804932, "rewards/margins": 5.720493316650391, "rewards/rejected": -6.614729404449463, "step": 422 }, { "epoch": 2.506666666666667, "grad_norm": 7.9345951349843435, "learning_rate": 4.331998327491395e-07, "logits/chosen": -1.092266321182251, "logits/rejected": -1.1247848272323608, "logps/chosen": -42.70750045776367, "logps/rejected": -59.23059844970703, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -0.8865154385566711, "rewards/margins": 5.208459854125977, "rewards/rejected": -6.094975471496582, "step": 423 }, { "epoch": 2.5125925925925925, "grad_norm": 13.270734370115317, "learning_rate": 4.3275717991733097e-07, "logits/chosen": -2.118969440460205, "logits/rejected": -1.9935420751571655, "logps/chosen": -35.17277908325195, "logps/rejected": -52.8382568359375, "loss": 0.0795, "rewards/accuracies": 1.0, "rewards/chosen": -0.34134382009506226, "rewards/margins": 3.8893234729766846, "rewards/rejected": -4.2306671142578125, "step": 424 }, { "epoch": 2.5185185185185186, "grad_norm": 10.153066540239442, "learning_rate": 4.3231329306794106e-07, "logits/chosen": -1.8174488544464111, "logits/rejected": -1.8481026887893677, "logps/chosen": -40.095062255859375, "logps/rejected": -60.66204071044922, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": -0.58811354637146, "rewards/margins": 5.898916244506836, "rewards/rejected": -6.487029552459717, "step": 425 }, { "epoch": 2.5244444444444447, "grad_norm": 12.327594132627251, "learning_rate": 4.3186817519819365e-07, "logits/chosen": -1.3380463123321533, "logits/rejected": -1.233199119567871, "logps/chosen": -43.25425720214844, "logps/rejected": -63.275611877441406, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": -0.8061537742614746, "rewards/margins": 5.740635871887207, "rewards/rejected": -6.54679012298584, "step": 426 }, { "epoch": 2.5303703703703704, "grad_norm": 9.443105985819049, "learning_rate": 4.314218293136247e-07, "logits/chosen": -1.3254871368408203, "logits/rejected": -1.3607250452041626, "logps/chosen": -30.99878692626953, "logps/rejected": -45.84544372558594, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": -0.09832948446273804, "rewards/margins": 4.39568567276001, "rewards/rejected": -4.494015216827393, "step": 427 }, { "epoch": 2.536296296296296, "grad_norm": 15.015344786901764, "learning_rate": 4.30974258428062e-07, "logits/chosen": -1.9797377586364746, "logits/rejected": -2.0408339500427246, "logps/chosen": -47.02696990966797, "logps/rejected": -50.319114685058594, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": -1.1645985841751099, "rewards/margins": 3.7737340927124023, "rewards/rejected": -4.938333034515381, "step": 428 }, { "epoch": 2.542222222222222, "grad_norm": 9.349974145456377, "learning_rate": 4.3052546556360486e-07, "logits/chosen": -1.687772512435913, "logits/rejected": -1.6783831119537354, "logps/chosen": -32.642303466796875, "logps/rejected": -47.195770263671875, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": -0.3790569603443146, "rewards/margins": 3.810107469558716, "rewards/rejected": -4.189164638519287, "step": 429 }, { "epoch": 2.5481481481481483, "grad_norm": 7.215406846905479, "learning_rate": 4.300754537506036e-07, "logits/chosen": -1.4609354734420776, "logits/rejected": -1.5548583269119263, "logps/chosen": -41.346309661865234, "logps/rejected": -52.50380325317383, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": -0.40993720293045044, "rewards/margins": 5.032467842102051, "rewards/rejected": -5.442404747009277, "step": 430 }, { "epoch": 2.554074074074074, "grad_norm": 10.222814355312524, "learning_rate": 4.2962422602763925e-07, "logits/chosen": -1.1078249216079712, "logits/rejected": -0.9215734004974365, "logps/chosen": -33.152610778808594, "logps/rejected": -57.324745178222656, "loss": 0.0858, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7816075086593628, "rewards/margins": 4.221255302429199, "rewards/rejected": -5.002862453460693, "step": 431 }, { "epoch": 2.56, "grad_norm": 18.265248426638163, "learning_rate": 4.2917178544150284e-07, "logits/chosen": -1.9573190212249756, "logits/rejected": -1.8439496755599976, "logps/chosen": -36.08943176269531, "logps/rejected": -57.182159423828125, "loss": 0.0873, "rewards/accuracies": 0.875, "rewards/chosen": -1.192732572555542, "rewards/margins": 4.874904632568359, "rewards/rejected": -6.0676374435424805, "step": 432 }, { "epoch": 2.565925925925926, "grad_norm": 10.235583078032512, "learning_rate": 4.2871813504717497e-07, "logits/chosen": -1.5011374950408936, "logits/rejected": -1.2769317626953125, "logps/chosen": -39.9639892578125, "logps/rejected": -61.80725860595703, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": -0.6877691745758057, "rewards/margins": 5.699342727661133, "rewards/rejected": -6.387112140655518, "step": 433 }, { "epoch": 2.571851851851852, "grad_norm": 9.62877392648817, "learning_rate": 4.2826327790780505e-07, "logits/chosen": -1.1425065994262695, "logits/rejected": -1.1360677480697632, "logps/chosen": -43.025978088378906, "logps/rejected": -61.36090087890625, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": -0.4637770354747772, "rewards/margins": 5.920599460601807, "rewards/rejected": -6.384376525878906, "step": 434 }, { "epoch": 2.5777777777777775, "grad_norm": 9.890311360978767, "learning_rate": 4.278072170946909e-07, "logits/chosen": -1.4360768795013428, "logits/rejected": -1.4959397315979004, "logps/chosen": -46.09064483642578, "logps/rejected": -56.41389465332031, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": -0.707612931728363, "rewards/margins": 5.068720817565918, "rewards/rejected": -5.776333332061768, "step": 435 }, { "epoch": 2.5837037037037036, "grad_norm": 10.392170104491372, "learning_rate": 4.273499556872576e-07, "logits/chosen": -2.2950620651245117, "logits/rejected": -2.092170238494873, "logps/chosen": -36.24480438232422, "logps/rejected": -64.73004150390625, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": -0.4665120542049408, "rewards/margins": 5.585385322570801, "rewards/rejected": -6.051897048950195, "step": 436 }, { "epoch": 2.5896296296296297, "grad_norm": 10.12765358633514, "learning_rate": 4.2689149677303716e-07, "logits/chosen": -1.0707876682281494, "logits/rejected": -1.1080553531646729, "logps/chosen": -43.415000915527344, "logps/rejected": -51.25993347167969, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": -0.4414452314376831, "rewards/margins": 4.333120346069336, "rewards/rejected": -4.774565696716309, "step": 437 }, { "epoch": 2.5955555555555554, "grad_norm": 11.403233126394678, "learning_rate": 4.264318434476472e-07, "logits/chosen": -0.9671777486801147, "logits/rejected": -1.0228941440582275, "logps/chosen": -43.18418884277344, "logps/rejected": -61.22423553466797, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": -0.750145435333252, "rewards/margins": 5.196172714233398, "rewards/rejected": -5.94631814956665, "step": 438 }, { "epoch": 2.6014814814814815, "grad_norm": 23.740794064578566, "learning_rate": 4.2597099881477017e-07, "logits/chosen": -1.4594378471374512, "logits/rejected": -1.2035342454910278, "logps/chosen": -35.04090118408203, "logps/rejected": -48.212501525878906, "loss": 0.1412, "rewards/accuracies": 0.875, "rewards/chosen": -0.5435713529586792, "rewards/margins": 2.9230170249938965, "rewards/rejected": -3.4665884971618652, "step": 439 }, { "epoch": 2.6074074074074076, "grad_norm": 7.22571057177582, "learning_rate": 4.2550896598613297e-07, "logits/chosen": -1.6149311065673828, "logits/rejected": -1.366624355316162, "logps/chosen": -26.589153289794922, "logps/rejected": -60.616371154785156, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 0.10370810329914093, "rewards/margins": 5.520005226135254, "rewards/rejected": -5.41629695892334, "step": 440 }, { "epoch": 2.6133333333333333, "grad_norm": 8.984715683706641, "learning_rate": 4.25045748081485e-07, "logits/chosen": -1.4854509830474854, "logits/rejected": -1.4200009107589722, "logps/chosen": -34.8680305480957, "logps/rejected": -62.0745735168457, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -0.705161452293396, "rewards/margins": 5.211274147033691, "rewards/rejected": -5.916436195373535, "step": 441 }, { "epoch": 2.6192592592592594, "grad_norm": 9.126681254113208, "learning_rate": 4.2458134822857774e-07, "logits/chosen": -1.644370198249817, "logits/rejected": -1.582041621208191, "logps/chosen": -36.514915466308594, "logps/rejected": -67.11909484863281, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": -0.7975553870201111, "rewards/margins": 6.119021892547607, "rewards/rejected": -6.916577339172363, "step": 442 }, { "epoch": 2.625185185185185, "grad_norm": 13.885847569564467, "learning_rate": 4.241157695631435e-07, "logits/chosen": -1.5854181051254272, "logits/rejected": -1.6152215003967285, "logps/chosen": -38.029273986816406, "logps/rejected": -62.7104377746582, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": -0.16970673203468323, "rewards/margins": 6.10245943069458, "rewards/rejected": -6.2721662521362305, "step": 443 }, { "epoch": 2.631111111111111, "grad_norm": 10.006195702980698, "learning_rate": 4.2364901522887416e-07, "logits/chosen": -1.1892244815826416, "logits/rejected": -1.2867357730865479, "logps/chosen": -36.74848937988281, "logps/rejected": -57.911075592041016, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": -0.27545565366744995, "rewards/margins": 5.799798488616943, "rewards/rejected": -6.075253963470459, "step": 444 }, { "epoch": 2.637037037037037, "grad_norm": 17.203886073678117, "learning_rate": 4.2318108837739986e-07, "logits/chosen": -1.2790842056274414, "logits/rejected": -1.226332664489746, "logps/chosen": -36.43606948852539, "logps/rejected": -49.779937744140625, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": -0.7447434663772583, "rewards/margins": 4.716348648071289, "rewards/rejected": -5.461092948913574, "step": 445 }, { "epoch": 2.642962962962963, "grad_norm": 10.069009996513447, "learning_rate": 4.22711992168268e-07, "logits/chosen": -1.6787967681884766, "logits/rejected": -1.6690788269042969, "logps/chosen": -44.772674560546875, "logps/rejected": -56.2176399230957, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": -0.09006957709789276, "rewards/margins": 4.575125694274902, "rewards/rejected": -4.665195465087891, "step": 446 }, { "epoch": 2.648888888888889, "grad_norm": 11.658795202407136, "learning_rate": 4.2224172976892166e-07, "logits/chosen": -1.3170833587646484, "logits/rejected": -1.157299518585205, "logps/chosen": -47.767242431640625, "logps/rejected": -72.54170227050781, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": -0.17752492427825928, "rewards/margins": 6.256413459777832, "rewards/rejected": -6.433938980102539, "step": 447 }, { "epoch": 2.6548148148148147, "grad_norm": 5.940086524497116, "learning_rate": 4.217703043546783e-07, "logits/chosen": -1.2264058589935303, "logits/rejected": -1.195594310760498, "logps/chosen": -43.642913818359375, "logps/rejected": -60.73713684082031, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -1.053247094154358, "rewards/margins": 6.064220428466797, "rewards/rejected": -7.117467880249023, "step": 448 }, { "epoch": 2.660740740740741, "grad_norm": 7.247831097507863, "learning_rate": 4.2129771910870845e-07, "logits/chosen": -1.4685695171356201, "logits/rejected": -1.2932255268096924, "logps/chosen": -39.4880256652832, "logps/rejected": -72.38164520263672, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -0.781300961971283, "rewards/margins": 6.027366638183594, "rewards/rejected": -6.80866813659668, "step": 449 }, { "epoch": 2.6666666666666665, "grad_norm": 8.142661646420805, "learning_rate": 4.2082397722201385e-07, "logits/chosen": -1.6290993690490723, "logits/rejected": -1.3875850439071655, "logps/chosen": -27.86062240600586, "logps/rejected": -60.022274017333984, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": -0.38442179560661316, "rewards/margins": 5.5896830558776855, "rewards/rejected": -5.974104881286621, "step": 450 }, { "epoch": 2.6725925925925926, "grad_norm": 12.094207410074398, "learning_rate": 4.2034908189340634e-07, "logits/chosen": -1.3270450830459595, "logits/rejected": -1.184122920036316, "logps/chosen": -36.17488479614258, "logps/rejected": -62.60739517211914, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": -0.7593768239021301, "rewards/margins": 5.755254745483398, "rewards/rejected": -6.514631748199463, "step": 451 }, { "epoch": 2.6785185185185183, "grad_norm": 10.502777697396722, "learning_rate": 4.19873036329486e-07, "logits/chosen": -1.6183356046676636, "logits/rejected": -1.8080652952194214, "logps/chosen": -39.19137954711914, "logps/rejected": -61.41072082519531, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": -0.3307141363620758, "rewards/margins": 5.200984954833984, "rewards/rejected": -5.531699180603027, "step": 452 }, { "epoch": 2.6844444444444444, "grad_norm": 5.2366059114266035, "learning_rate": 4.1939584374461943e-07, "logits/chosen": -1.7235251665115356, "logits/rejected": -1.756578803062439, "logps/chosen": -33.519996643066406, "logps/rejected": -48.20545959472656, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 0.2523934543132782, "rewards/margins": 4.641357421875, "rewards/rejected": -4.3889641761779785, "step": 453 }, { "epoch": 2.6903703703703705, "grad_norm": 6.793456123885119, "learning_rate": 4.189175073609184e-07, "logits/chosen": -1.768607258796692, "logits/rejected": -1.6957839727401733, "logps/chosen": -42.76543426513672, "logps/rejected": -55.93104934692383, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 0.22528111934661865, "rewards/margins": 4.591314792633057, "rewards/rejected": -4.366034030914307, "step": 454 }, { "epoch": 2.696296296296296, "grad_norm": 10.416935814267305, "learning_rate": 4.184380304082177e-07, "logits/chosen": -1.3785486221313477, "logits/rejected": -1.3025366067886353, "logps/chosen": -37.827213287353516, "logps/rejected": -52.29517364501953, "loss": 0.0686, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8132541179656982, "rewards/margins": 4.752249717712402, "rewards/rejected": -5.5655035972595215, "step": 455 }, { "epoch": 2.7022222222222223, "grad_norm": 13.940426481316408, "learning_rate": 4.179574161240536e-07, "logits/chosen": -2.09922456741333, "logits/rejected": -1.9302072525024414, "logps/chosen": -30.62826156616211, "logps/rejected": -48.425559997558594, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": -0.37107735872268677, "rewards/margins": 3.9561352729797363, "rewards/rejected": -4.327212333679199, "step": 456 }, { "epoch": 2.7081481481481484, "grad_norm": 14.616095915339471, "learning_rate": 4.1747566775364175e-07, "logits/chosen": -1.7652040719985962, "logits/rejected": -1.5786316394805908, "logps/chosen": -28.41473388671875, "logps/rejected": -60.02101135253906, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": -0.0990707278251648, "rewards/margins": 5.42191219329834, "rewards/rejected": -5.5209832191467285, "step": 457 }, { "epoch": 2.714074074074074, "grad_norm": 11.634526933904187, "learning_rate": 4.169927885498556e-07, "logits/chosen": -1.509355068206787, "logits/rejected": -1.5713496208190918, "logps/chosen": -44.71356964111328, "logps/rejected": -59.13399124145508, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -0.6387603878974915, "rewards/margins": 6.12332820892334, "rewards/rejected": -6.762088775634766, "step": 458 }, { "epoch": 2.7199999999999998, "grad_norm": 12.6955399694262, "learning_rate": 4.16508781773204e-07, "logits/chosen": -1.034393548965454, "logits/rejected": -1.0755641460418701, "logps/chosen": -50.58789825439453, "logps/rejected": -65.17190551757812, "loss": 0.0744, "rewards/accuracies": 1.0, "rewards/chosen": -0.7241175174713135, "rewards/margins": 6.459036350250244, "rewards/rejected": -7.183154106140137, "step": 459 }, { "epoch": 2.725925925925926, "grad_norm": 10.566702546539224, "learning_rate": 4.1602365069180976e-07, "logits/chosen": -1.2953882217407227, "logits/rejected": -1.4143340587615967, "logps/chosen": -48.79972839355469, "logps/rejected": -62.423553466796875, "loss": 0.067, "rewards/accuracies": 1.0, "rewards/chosen": -1.5391722917556763, "rewards/margins": 5.307343482971191, "rewards/rejected": -6.846515655517578, "step": 460 }, { "epoch": 2.731851851851852, "grad_norm": 7.496698557060088, "learning_rate": 4.155373985813868e-07, "logits/chosen": -1.5388610363006592, "logits/rejected": -1.4691227674484253, "logps/chosen": -34.53229522705078, "logps/rejected": -45.744293212890625, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": -0.47361868619918823, "rewards/margins": 4.789101600646973, "rewards/rejected": -5.262720584869385, "step": 461 }, { "epoch": 2.7377777777777776, "grad_norm": 11.142058376362055, "learning_rate": 4.150500287252189e-07, "logits/chosen": -1.1989514827728271, "logits/rejected": -1.2030577659606934, "logps/chosen": -42.25239944458008, "logps/rejected": -57.260189056396484, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": -0.7515776753425598, "rewards/margins": 5.3145246505737305, "rewards/rejected": -6.066102981567383, "step": 462 }, { "epoch": 2.7437037037037038, "grad_norm": 11.051939576284227, "learning_rate": 4.145615444141369e-07, "logits/chosen": -1.109269618988037, "logits/rejected": -1.1979848146438599, "logps/chosen": -46.74151611328125, "logps/rejected": -53.64583206176758, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": -0.73493492603302, "rewards/margins": 5.678466320037842, "rewards/rejected": -6.4134016036987305, "step": 463 }, { "epoch": 2.74962962962963, "grad_norm": 7.400402446401093, "learning_rate": 4.1407194894649677e-07, "logits/chosen": -2.147449493408203, "logits/rejected": -2.0689783096313477, "logps/chosen": -37.51029968261719, "logps/rejected": -69.01571655273438, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": -0.8444314002990723, "rewards/margins": 6.505341529846191, "rewards/rejected": -7.349773406982422, "step": 464 }, { "epoch": 2.7555555555555555, "grad_norm": 17.234630076573417, "learning_rate": 4.135812456281571e-07, "logits/chosen": -1.9983713626861572, "logits/rejected": -1.5781863927841187, "logps/chosen": -47.452430725097656, "logps/rejected": -84.41545104980469, "loss": 0.0975, "rewards/accuracies": 0.875, "rewards/chosen": -1.7086008787155151, "rewards/margins": 5.433979034423828, "rewards/rejected": -7.142579555511475, "step": 465 }, { "epoch": 2.7614814814814816, "grad_norm": 11.722950079081206, "learning_rate": 4.1308943777245717e-07, "logits/chosen": -1.4446229934692383, "logits/rejected": -1.5490970611572266, "logps/chosen": -33.173927307128906, "logps/rejected": -53.479957580566406, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": -0.5102207660675049, "rewards/margins": 6.010803699493408, "rewards/rejected": -6.521024703979492, "step": 466 }, { "epoch": 2.7674074074074073, "grad_norm": 13.810856571727623, "learning_rate": 4.1259652870019426e-07, "logits/chosen": -1.1215626001358032, "logits/rejected": -1.2170990705490112, "logps/chosen": -44.314571380615234, "logps/rejected": -55.65526580810547, "loss": 0.0819, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8959226012229919, "rewards/margins": 5.7853522300720215, "rewards/rejected": -6.681274890899658, "step": 467 }, { "epoch": 2.7733333333333334, "grad_norm": 7.703802667245408, "learning_rate": 4.121025217396011e-07, "logits/chosen": -0.8404613733291626, "logits/rejected": -0.8951901197433472, "logps/chosen": -39.737998962402344, "logps/rejected": -52.829063415527344, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": -1.0370070934295654, "rewards/margins": 5.2985029220581055, "rewards/rejected": -6.33551025390625, "step": 468 }, { "epoch": 2.779259259259259, "grad_norm": 11.663820751466071, "learning_rate": 4.1160742022632395e-07, "logits/chosen": -1.3555892705917358, "logits/rejected": -1.2612375020980835, "logps/chosen": -36.32597732543945, "logps/rejected": -56.69281005859375, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -0.7630610466003418, "rewards/margins": 4.628640651702881, "rewards/rejected": -5.391702175140381, "step": 469 }, { "epoch": 2.785185185185185, "grad_norm": 5.80144294575508, "learning_rate": 4.1111122750339945e-07, "logits/chosen": -1.330127239227295, "logits/rejected": -1.1901962757110596, "logps/chosen": -41.21522903442383, "logps/rejected": -63.96897888183594, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -0.2805338501930237, "rewards/margins": 7.6646647453308105, "rewards/rejected": -7.945198059082031, "step": 470 }, { "epoch": 2.7911111111111113, "grad_norm": 9.363333254370783, "learning_rate": 4.106139469212326e-07, "logits/chosen": -1.337806224822998, "logits/rejected": -1.2989773750305176, "logps/chosen": -46.482181549072266, "logps/rejected": -63.64010238647461, "loss": 0.0568, "rewards/accuracies": 1.0, "rewards/chosen": -0.9635290503501892, "rewards/margins": 5.768977165222168, "rewards/rejected": -6.73250675201416, "step": 471 }, { "epoch": 2.797037037037037, "grad_norm": 14.879444811599774, "learning_rate": 4.1011558183757374e-07, "logits/chosen": -1.7458575963974, "logits/rejected": -1.6218409538269043, "logps/chosen": -30.842025756835938, "logps/rejected": -56.45557403564453, "loss": 0.0776, "rewards/accuracies": 1.0, "rewards/chosen": -0.7571108341217041, "rewards/margins": 5.208766937255859, "rewards/rejected": -5.965878486633301, "step": 472 }, { "epoch": 2.802962962962963, "grad_norm": 12.457885607362718, "learning_rate": 4.0961613561749585e-07, "logits/chosen": -2.1387579441070557, "logits/rejected": -1.9743508100509644, "logps/chosen": -45.234222412109375, "logps/rejected": -69.26448059082031, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -1.0624377727508545, "rewards/margins": 5.491796016693115, "rewards/rejected": -6.554234027862549, "step": 473 }, { "epoch": 2.8088888888888888, "grad_norm": 11.785711167501036, "learning_rate": 4.091156116333723e-07, "logits/chosen": -1.527976632118225, "logits/rejected": -1.4141886234283447, "logps/chosen": -45.122650146484375, "logps/rejected": -62.66756820678711, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": -1.8391146659851074, "rewards/margins": 4.945567607879639, "rewards/rejected": -6.784682273864746, "step": 474 }, { "epoch": 2.814814814814815, "grad_norm": 6.438450018764331, "learning_rate": 4.086140132648534e-07, "logits/chosen": -1.6443370580673218, "logits/rejected": -1.5842254161834717, "logps/chosen": -40.13809585571289, "logps/rejected": -73.64363098144531, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": -0.7049580812454224, "rewards/margins": 6.288333415985107, "rewards/rejected": -6.99329137802124, "step": 475 }, { "epoch": 2.8207407407407405, "grad_norm": 10.33277822610733, "learning_rate": 4.081113438988443e-07, "logits/chosen": -1.9966257810592651, "logits/rejected": -1.940199851989746, "logps/chosen": -40.505191802978516, "logps/rejected": -59.13007736206055, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": -0.38005155324935913, "rewards/margins": 5.394841194152832, "rewards/rejected": -5.774892807006836, "step": 476 }, { "epoch": 2.8266666666666667, "grad_norm": 9.375606930837032, "learning_rate": 4.076076069294816e-07, "logits/chosen": -1.5683081150054932, "logits/rejected": -1.3896409273147583, "logps/chosen": -41.28034973144531, "logps/rejected": -68.79712677001953, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": -0.7804399728775024, "rewards/margins": 4.933218955993652, "rewards/rejected": -5.713658809661865, "step": 477 }, { "epoch": 2.8325925925925928, "grad_norm": 9.043547249851873, "learning_rate": 4.071028057581105e-07, "logits/chosen": -1.6767199039459229, "logits/rejected": -1.9358806610107422, "logps/chosen": -66.44622039794922, "logps/rejected": -67.9832763671875, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": -2.0683860778808594, "rewards/margins": 5.570001602172852, "rewards/rejected": -7.638387680053711, "step": 478 }, { "epoch": 2.8385185185185184, "grad_norm": 5.908586113770876, "learning_rate": 4.065969437932622e-07, "logits/chosen": -1.6558722257614136, "logits/rejected": -1.7621960639953613, "logps/chosen": -52.626182556152344, "logps/rejected": -61.46979522705078, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -1.2955846786499023, "rewards/margins": 4.890224933624268, "rewards/rejected": -6.185809135437012, "step": 479 }, { "epoch": 2.8444444444444446, "grad_norm": 12.445167440486212, "learning_rate": 4.0609002445063036e-07, "logits/chosen": -1.5896965265274048, "logits/rejected": -1.4875233173370361, "logps/chosen": -40.567039489746094, "logps/rejected": -54.923274993896484, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": -0.5689337849617004, "rewards/margins": 5.234133720397949, "rewards/rejected": -5.803068161010742, "step": 480 }, { "epoch": 2.85037037037037, "grad_norm": 12.247510070974629, "learning_rate": 4.0558205115304846e-07, "logits/chosen": -2.029125690460205, "logits/rejected": -1.977358102798462, "logps/chosen": -46.63209533691406, "logps/rejected": -69.29682922363281, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": -0.12471213936805725, "rewards/margins": 5.987627983093262, "rewards/rejected": -6.112339973449707, "step": 481 }, { "epoch": 2.8562962962962963, "grad_norm": 10.08015012343844, "learning_rate": 4.050730273304663e-07, "logits/chosen": -1.6675320863723755, "logits/rejected": -1.3868141174316406, "logps/chosen": -39.18746566772461, "logps/rejected": -65.97308349609375, "loss": 0.072, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7474696636199951, "rewards/margins": 6.0732421875, "rewards/rejected": -6.820712089538574, "step": 482 }, { "epoch": 2.862222222222222, "grad_norm": 6.181099423214191, "learning_rate": 4.045629564199273e-07, "logits/chosen": -2.1505188941955566, "logits/rejected": -2.0988426208496094, "logps/chosen": -49.55622863769531, "logps/rejected": -69.73926544189453, "loss": 0.0325, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3313867747783661, "rewards/margins": 6.1561055183410645, "rewards/rejected": -6.487491607666016, "step": 483 }, { "epoch": 2.868148148148148, "grad_norm": 13.024076592466916, "learning_rate": 4.04051841865545e-07, "logits/chosen": -1.7880933284759521, "logits/rejected": -2.0355589389801025, "logps/chosen": -45.80509948730469, "logps/rejected": -41.591346740722656, "loss": 0.0756, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5180366039276123, "rewards/margins": 3.310197114944458, "rewards/rejected": -3.8282337188720703, "step": 484 }, { "epoch": 2.8740740740740742, "grad_norm": 5.834739744406833, "learning_rate": 4.0353968711847974e-07, "logits/chosen": -1.5249016284942627, "logits/rejected": -1.4942580461502075, "logps/chosen": -47.616737365722656, "logps/rejected": -65.72685241699219, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": -0.47097060084342957, "rewards/margins": 5.679997444152832, "rewards/rejected": -6.150968074798584, "step": 485 }, { "epoch": 2.88, "grad_norm": 13.420831403558674, "learning_rate": 4.030264956369157e-07, "logits/chosen": -1.1555256843566895, "logits/rejected": -1.1296708583831787, "logps/chosen": -50.24407958984375, "logps/rejected": -61.94266891479492, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": -0.4866521954536438, "rewards/margins": 5.132786750793457, "rewards/rejected": -5.619439125061035, "step": 486 }, { "epoch": 2.885925925925926, "grad_norm": 10.70741309743604, "learning_rate": 4.02512270886037e-07, "logits/chosen": -1.7210756540298462, "logits/rejected": -1.7596194744110107, "logps/chosen": -52.329383850097656, "logps/rejected": -49.94835662841797, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": -0.6579025983810425, "rewards/margins": 4.657275199890137, "rewards/rejected": -5.315177917480469, "step": 487 }, { "epoch": 2.891851851851852, "grad_norm": 11.033154443356473, "learning_rate": 4.01997016338005e-07, "logits/chosen": -1.3694206476211548, "logits/rejected": -1.2944746017456055, "logps/chosen": -42.40999221801758, "logps/rejected": -62.894813537597656, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": -1.0166794061660767, "rewards/margins": 6.320663928985596, "rewards/rejected": -7.337343215942383, "step": 488 }, { "epoch": 2.897777777777778, "grad_norm": 13.768219520367577, "learning_rate": 4.014807354719342e-07, "logits/chosen": -1.243717908859253, "logits/rejected": -1.4224070310592651, "logps/chosen": -41.15166091918945, "logps/rejected": -46.17465591430664, "loss": 0.07, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2711668610572815, "rewards/margins": 5.416727066040039, "rewards/rejected": -5.687893867492676, "step": 489 }, { "epoch": 2.9037037037037035, "grad_norm": 12.055706759326277, "learning_rate": 4.00963431773869e-07, "logits/chosen": -1.600014567375183, "logits/rejected": -1.4893999099731445, "logps/chosen": -37.52910232543945, "logps/rejected": -56.09258270263672, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": -0.5928289890289307, "rewards/margins": 4.855260848999023, "rewards/rejected": -5.448089599609375, "step": 490 }, { "epoch": 2.9096296296296296, "grad_norm": 6.6663303072591695, "learning_rate": 4.0044510873676043e-07, "logits/chosen": -1.4263795614242554, "logits/rejected": -1.3732199668884277, "logps/chosen": -47.48921203613281, "logps/rejected": -64.12741088867188, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -0.6781390905380249, "rewards/margins": 5.753025054931641, "rewards/rejected": -6.431163787841797, "step": 491 }, { "epoch": 2.9155555555555557, "grad_norm": 9.203786828995497, "learning_rate": 3.9992576986044223e-07, "logits/chosen": -1.6113818883895874, "logits/rejected": -1.4018325805664062, "logps/chosen": -42.585716247558594, "logps/rejected": -72.791259765625, "loss": 0.0424, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0629472732543945, "rewards/margins": 6.943647384643555, "rewards/rejected": -8.00659465789795, "step": 492 }, { "epoch": 2.9214814814814813, "grad_norm": 13.268694442559015, "learning_rate": 3.9940541865160726e-07, "logits/chosen": -1.9327130317687988, "logits/rejected": -1.9189391136169434, "logps/chosen": -44.672420501708984, "logps/rejected": -55.77782440185547, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": -0.5316133499145508, "rewards/margins": 4.759648323059082, "rewards/rejected": -5.291261672973633, "step": 493 }, { "epoch": 2.9274074074074075, "grad_norm": 8.631688599659915, "learning_rate": 3.9888405862378395e-07, "logits/chosen": -1.689211368560791, "logits/rejected": -1.6311895847320557, "logps/chosen": -50.45976257324219, "logps/rejected": -59.6866569519043, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": -0.5894455909729004, "rewards/margins": 5.608089923858643, "rewards/rejected": -6.197535514831543, "step": 494 }, { "epoch": 2.9333333333333336, "grad_norm": 14.665098491526825, "learning_rate": 3.983616932973124e-07, "logits/chosen": -1.5946894884109497, "logits/rejected": -1.6317996978759766, "logps/chosen": -38.38591766357422, "logps/rejected": -51.410438537597656, "loss": 0.0811, "rewards/accuracies": 0.875, "rewards/chosen": -0.6509155035018921, "rewards/margins": 4.341098785400391, "rewards/rejected": -4.992014408111572, "step": 495 }, { "epoch": 2.9392592592592592, "grad_norm": 9.56403557410635, "learning_rate": 3.9783832619932076e-07, "logits/chosen": -1.5779447555541992, "logits/rejected": -1.5724250078201294, "logps/chosen": -38.29379653930664, "logps/rejected": -56.06721496582031, "loss": 0.0543, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6153882741928101, "rewards/margins": 5.206683158874512, "rewards/rejected": -5.822071552276611, "step": 496 }, { "epoch": 2.9451851851851854, "grad_norm": 8.708843532666638, "learning_rate": 3.973139608637015e-07, "logits/chosen": -1.666991114616394, "logits/rejected": -1.704276204109192, "logps/chosen": -44.54426574707031, "logps/rejected": -64.68223571777344, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": -0.8914271593093872, "rewards/margins": 5.089811325073242, "rewards/rejected": -5.98123836517334, "step": 497 }, { "epoch": 2.951111111111111, "grad_norm": 9.740568197362656, "learning_rate": 3.9678860083108713e-07, "logits/chosen": -1.3851922750473022, "logits/rejected": -1.2133703231811523, "logps/chosen": -35.292724609375, "logps/rejected": -56.05168151855469, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": -0.12854641675949097, "rewards/margins": 4.927215576171875, "rewards/rejected": -5.05576229095459, "step": 498 }, { "epoch": 2.957037037037037, "grad_norm": 6.6252979700182335, "learning_rate": 3.9626224964882685e-07, "logits/chosen": -1.1749128103256226, "logits/rejected": -1.0022330284118652, "logps/chosen": -37.999114990234375, "logps/rejected": -53.09870910644531, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 0.06034252047538757, "rewards/margins": 5.425458908081055, "rewards/rejected": -5.365116596221924, "step": 499 }, { "epoch": 2.962962962962963, "grad_norm": 9.100591870121587, "learning_rate": 3.957349108709623e-07, "logits/chosen": -1.2385791540145874, "logits/rejected": -1.1235514879226685, "logps/chosen": -41.54240417480469, "logps/rejected": -57.841407775878906, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -0.4359692931175232, "rewards/margins": 4.188076019287109, "rewards/rejected": -4.624044895172119, "step": 500 }, { "epoch": 2.968888888888889, "grad_norm": 8.00234767245306, "learning_rate": 3.9520658805820335e-07, "logits/chosen": -1.8589203357696533, "logits/rejected": -1.9100127220153809, "logps/chosen": -47.3678092956543, "logps/rejected": -68.48341369628906, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": -0.45701107382774353, "rewards/margins": 7.444866180419922, "rewards/rejected": -7.901876449584961, "step": 501 }, { "epoch": 2.974814814814815, "grad_norm": 10.099174335269522, "learning_rate": 3.946772847779045e-07, "logits/chosen": -1.275445580482483, "logits/rejected": -1.436569333076477, "logps/chosen": -39.363243103027344, "logps/rejected": -48.795955657958984, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": -0.5841267704963684, "rewards/margins": 5.275768280029297, "rewards/rejected": -5.859894752502441, "step": 502 }, { "epoch": 2.9807407407407407, "grad_norm": 8.064642083628808, "learning_rate": 3.941470046040406e-07, "logits/chosen": -1.7252798080444336, "logits/rejected": -1.6667922735214233, "logps/chosen": -45.00448226928711, "logps/rejected": -53.28717803955078, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -0.4511069059371948, "rewards/margins": 5.3886237144470215, "rewards/rejected": -5.839731216430664, "step": 503 }, { "epoch": 2.986666666666667, "grad_norm": 9.662113041163051, "learning_rate": 3.936157511171826e-07, "logits/chosen": -1.906599521636963, "logits/rejected": -1.7287859916687012, "logps/chosen": -34.12775421142578, "logps/rejected": -59.56098556518555, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": -0.7677432894706726, "rewards/margins": 5.709640979766846, "rewards/rejected": -6.477384567260742, "step": 504 }, { "epoch": 2.9925925925925925, "grad_norm": 11.592574124839727, "learning_rate": 3.9308352790447354e-07, "logits/chosen": -1.1439859867095947, "logits/rejected": -1.0030274391174316, "logps/chosen": -41.51778030395508, "logps/rejected": -55.89512252807617, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": -0.8742501139640808, "rewards/margins": 5.307973861694336, "rewards/rejected": -6.182224273681641, "step": 505 }, { "epoch": 2.9985185185185186, "grad_norm": 8.429500412566364, "learning_rate": 3.9255033855960414e-07, "logits/chosen": -1.666643500328064, "logits/rejected": -1.315439224243164, "logps/chosen": -35.58273696899414, "logps/rejected": -75.78899383544922, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -1.2345635890960693, "rewards/margins": 5.818065643310547, "rewards/rejected": -7.052628517150879, "step": 506 }, { "epoch": 3.0044444444444443, "grad_norm": 7.9465699446302835, "learning_rate": 3.920161866827889e-07, "logits/chosen": -1.6582579612731934, "logits/rejected": -1.4823225736618042, "logps/chosen": -37.8082389831543, "logps/rejected": -64.94297790527344, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -0.8594337105751038, "rewards/margins": 5.952652931213379, "rewards/rejected": -6.812087059020996, "step": 507 }, { "epoch": 3.0103703703703704, "grad_norm": 2.6939179253705063, "learning_rate": 3.914810758807414e-07, "logits/chosen": -1.1479514837265015, "logits/rejected": -0.9574017524719238, "logps/chosen": -34.60985565185547, "logps/rejected": -61.0933952331543, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -0.8293716907501221, "rewards/margins": 5.337098121643066, "rewards/rejected": -6.166469573974609, "step": 508 }, { "epoch": 3.0162962962962965, "grad_norm": 3.2817462401951696, "learning_rate": 3.9094500976665025e-07, "logits/chosen": -1.9915450811386108, "logits/rejected": -1.973534107208252, "logps/chosen": -40.481937408447266, "logps/rejected": -61.4949951171875, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -0.6554736495018005, "rewards/margins": 5.948192596435547, "rewards/rejected": -6.603665351867676, "step": 509 }, { "epoch": 3.022222222222222, "grad_norm": 3.489297028949427, "learning_rate": 3.904079919601542e-07, "logits/chosen": -1.9627009630203247, "logits/rejected": -1.7277880907058716, "logps/chosen": -40.081199645996094, "logps/rejected": -68.22596740722656, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -0.9112627506256104, "rewards/margins": 8.041915893554688, "rewards/rejected": -8.953178405761719, "step": 510 }, { "epoch": 3.0281481481481483, "grad_norm": 3.2704990299983976, "learning_rate": 3.898700260873182e-07, "logits/chosen": -2.022371292114258, "logits/rejected": -1.998417854309082, "logps/chosen": -35.742862701416016, "logps/rejected": -48.1036376953125, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 0.3673231899738312, "rewards/margins": 4.890713691711426, "rewards/rejected": -4.523390769958496, "step": 511 }, { "epoch": 3.034074074074074, "grad_norm": 4.390628410073519, "learning_rate": 3.893311157806091e-07, "logits/chosen": -1.4293193817138672, "logits/rejected": -1.4505712985992432, "logps/chosen": -48.03293991088867, "logps/rejected": -60.35232162475586, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": -1.2736024856567383, "rewards/margins": 5.292821884155273, "rewards/rejected": -6.56642484664917, "step": 512 }, { "epoch": 3.04, "grad_norm": 2.5706286347658893, "learning_rate": 3.887912646788703e-07, "logits/chosen": -1.5962660312652588, "logits/rejected": -1.4947378635406494, "logps/chosen": -37.679420471191406, "logps/rejected": -67.93458557128906, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -0.7557663917541504, "rewards/margins": 6.053064823150635, "rewards/rejected": -6.808831691741943, "step": 513 }, { "epoch": 3.0459259259259257, "grad_norm": 2.6847904429490503, "learning_rate": 3.882504764272979e-07, "logits/chosen": -1.7403340339660645, "logits/rejected": -1.5254353284835815, "logps/chosen": -39.930538177490234, "logps/rejected": -72.91048431396484, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -0.5932849645614624, "rewards/margins": 5.84938907623291, "rewards/rejected": -6.44267463684082, "step": 514 }, { "epoch": 3.051851851851852, "grad_norm": 2.305116393194017, "learning_rate": 3.8770875467741577e-07, "logits/chosen": -1.658508062362671, "logits/rejected": -1.5594358444213867, "logps/chosen": -41.339759826660156, "logps/rejected": -74.74822998046875, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.5647907257080078, "rewards/margins": 6.762681007385254, "rewards/rejected": -7.327471733093262, "step": 515 }, { "epoch": 3.057777777777778, "grad_norm": 2.445276515915435, "learning_rate": 3.871661030870511e-07, "logits/chosen": -1.4913989305496216, "logits/rejected": -1.3115919828414917, "logps/chosen": -48.97815704345703, "logps/rejected": -78.59329986572266, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -1.327409267425537, "rewards/margins": 8.095717430114746, "rewards/rejected": -9.423127174377441, "step": 516 }, { "epoch": 3.0637037037037036, "grad_norm": 3.8785229521156293, "learning_rate": 3.866225253203093e-07, "logits/chosen": -1.665648102760315, "logits/rejected": -1.6093999147415161, "logps/chosen": -44.661468505859375, "logps/rejected": -66.47560119628906, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -0.9593856334686279, "rewards/margins": 7.025257110595703, "rewards/rejected": -7.984643936157227, "step": 517 }, { "epoch": 3.0696296296296297, "grad_norm": 4.002989049596063, "learning_rate": 3.8607802504754984e-07, "logits/chosen": -1.787219524383545, "logits/rejected": -1.6592282056808472, "logps/chosen": -39.96990966796875, "logps/rejected": -63.611854553222656, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -0.06485319137573242, "rewards/margins": 6.901584625244141, "rewards/rejected": -6.966438293457031, "step": 518 }, { "epoch": 3.0755555555555554, "grad_norm": 2.9675791319527955, "learning_rate": 3.85532605945361e-07, "logits/chosen": -1.6534134149551392, "logits/rejected": -1.783852458000183, "logps/chosen": -50.092247009277344, "logps/rejected": -62.32521438598633, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -0.8201130628585815, "rewards/margins": 6.162082672119141, "rewards/rejected": -6.9821953773498535, "step": 519 }, { "epoch": 3.0814814814814815, "grad_norm": 1.7844151140655473, "learning_rate": 3.849862716965352e-07, "logits/chosen": -1.269382357597351, "logits/rejected": -1.2153847217559814, "logps/chosen": -45.336570739746094, "logps/rejected": -78.79979705810547, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.9207139015197754, "rewards/margins": 8.384785652160645, "rewards/rejected": -10.305500030517578, "step": 520 }, { "epoch": 3.0874074074074076, "grad_norm": 2.1034172372012776, "learning_rate": 3.8443902599004406e-07, "logits/chosen": -1.8327618837356567, "logits/rejected": -1.6914877891540527, "logps/chosen": -34.10930633544922, "logps/rejected": -58.51155471801758, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -0.6798341274261475, "rewards/margins": 6.230722427368164, "rewards/rejected": -6.910556316375732, "step": 521 }, { "epoch": 3.0933333333333333, "grad_norm": 3.2968154046385667, "learning_rate": 3.8389087252101395e-07, "logits/chosen": -1.264432430267334, "logits/rejected": -1.2380956411361694, "logps/chosen": -41.07152557373047, "logps/rejected": -62.34565353393555, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -2.5138745307922363, "rewards/margins": 6.157859802246094, "rewards/rejected": -8.671734809875488, "step": 522 }, { "epoch": 3.0992592592592594, "grad_norm": 2.2221442473212, "learning_rate": 3.833418149907001e-07, "logits/chosen": -1.3116705417633057, "logits/rejected": -1.46858811378479, "logps/chosen": -56.232017517089844, "logps/rejected": -68.88365173339844, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -1.6804299354553223, "rewards/margins": 8.152290344238281, "rewards/rejected": -9.832719802856445, "step": 523 }, { "epoch": 3.105185185185185, "grad_norm": 3.7570593059357664, "learning_rate": 3.827918571064626e-07, "logits/chosen": -0.9017548561096191, "logits/rejected": -0.8780984878540039, "logps/chosen": -41.97267150878906, "logps/rejected": -57.89922332763672, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -0.7553677558898926, "rewards/margins": 5.701758861541748, "rewards/rejected": -6.457126617431641, "step": 524 }, { "epoch": 3.111111111111111, "grad_norm": 3.3754159307101665, "learning_rate": 3.822410025817406e-07, "logits/chosen": -1.785221815109253, "logits/rejected": -1.7697774171829224, "logps/chosen": -40.88723373413086, "logps/rejected": -57.72883605957031, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -1.4020781517028809, "rewards/margins": 5.520038604736328, "rewards/rejected": -6.922116756439209, "step": 525 }, { "epoch": 3.117037037037037, "grad_norm": 5.658194893720724, "learning_rate": 3.816892551360279e-07, "logits/chosen": -2.2358968257904053, "logits/rejected": -2.079282522201538, "logps/chosen": -50.923828125, "logps/rejected": -102.37371826171875, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": -1.3834398984909058, "rewards/margins": 10.244373321533203, "rewards/rejected": -11.627814292907715, "step": 526 }, { "epoch": 3.122962962962963, "grad_norm": 1.6516866779926083, "learning_rate": 3.8113661849484723e-07, "logits/chosen": -1.627011775970459, "logits/rejected": -1.5414108037948608, "logps/chosen": -43.25103759765625, "logps/rejected": -64.36841583251953, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.902656078338623, "rewards/margins": 5.974774360656738, "rewards/rejected": -6.8774309158325195, "step": 527 }, { "epoch": 3.128888888888889, "grad_norm": 2.946008061171455, "learning_rate": 3.805830963897256e-07, "logits/chosen": -1.742790937423706, "logits/rejected": -1.3985412120819092, "logps/chosen": -42.32114791870117, "logps/rejected": -102.3118896484375, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4717878103256226, "rewards/margins": 10.213966369628906, "rewards/rejected": -11.685752868652344, "step": 528 }, { "epoch": 3.1348148148148147, "grad_norm": 2.4632882994990672, "learning_rate": 3.8002869255816873e-07, "logits/chosen": -1.5484261512756348, "logits/rejected": -1.6879433393478394, "logps/chosen": -56.2159309387207, "logps/rejected": -70.41197967529297, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.9030942916870117, "rewards/margins": 7.6704864501953125, "rewards/rejected": -9.573579788208008, "step": 529 }, { "epoch": 3.140740740740741, "grad_norm": 3.1788918790550964, "learning_rate": 3.7947341074363593e-07, "logits/chosen": -1.720942497253418, "logits/rejected": -1.575656771659851, "logps/chosen": -45.5122184753418, "logps/rejected": -68.554931640625, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -1.585146427154541, "rewards/margins": 7.759179592132568, "rewards/rejected": -9.34432601928711, "step": 530 }, { "epoch": 3.1466666666666665, "grad_norm": 6.875646057259051, "learning_rate": 3.7891725469551485e-07, "logits/chosen": -1.036989450454712, "logits/rejected": -0.9969202280044556, "logps/chosen": -31.839096069335938, "logps/rejected": -55.27592086791992, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.72756028175354, "rewards/margins": 7.08872127532959, "rewards/rejected": -7.816281795501709, "step": 531 }, { "epoch": 3.1525925925925926, "grad_norm": 2.3835006340562814, "learning_rate": 3.783602281690963e-07, "logits/chosen": -1.673425316810608, "logits/rejected": -1.5485360622406006, "logps/chosen": -35.52006912231445, "logps/rejected": -68.11849975585938, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.3884926438331604, "rewards/margins": 6.947432994842529, "rewards/rejected": -7.335925102233887, "step": 532 }, { "epoch": 3.1585185185185187, "grad_norm": 3.989692171252318, "learning_rate": 3.7780233492554856e-07, "logits/chosen": -1.7301304340362549, "logits/rejected": -1.6668508052825928, "logps/chosen": -33.42741394042969, "logps/rejected": -58.77648162841797, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -0.36675456166267395, "rewards/margins": 7.16362190246582, "rewards/rejected": -7.530376434326172, "step": 533 }, { "epoch": 3.1644444444444444, "grad_norm": 2.1314519227467335, "learning_rate": 3.7724357873189244e-07, "logits/chosen": -1.3821113109588623, "logits/rejected": -1.401893973350525, "logps/chosen": -41.42967987060547, "logps/rejected": -58.070919036865234, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.3712562322616577, "rewards/margins": 6.296813011169434, "rewards/rejected": -7.668068885803223, "step": 534 }, { "epoch": 3.1703703703703705, "grad_norm": 1.9872916286881108, "learning_rate": 3.766839633609753e-07, "logits/chosen": -1.4474170207977295, "logits/rejected": -1.5491206645965576, "logps/chosen": -40.968833923339844, "logps/rejected": -56.61018753051758, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -0.8881934285163879, "rewards/margins": 6.965273857116699, "rewards/rejected": -7.8534674644470215, "step": 535 }, { "epoch": 3.176296296296296, "grad_norm": 2.2679168350553076, "learning_rate": 3.761234925914459e-07, "logits/chosen": -1.2111544609069824, "logits/rejected": -1.0226666927337646, "logps/chosen": -44.167083740234375, "logps/rejected": -65.37567138671875, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -0.7891241312026978, "rewards/margins": 7.499638557434082, "rewards/rejected": -8.288763046264648, "step": 536 }, { "epoch": 3.1822222222222223, "grad_norm": 2.9464081811606833, "learning_rate": 3.755621702077292e-07, "logits/chosen": -1.2695435285568237, "logits/rejected": -1.2304050922393799, "logps/chosen": -43.53666305541992, "logps/rejected": -68.55162048339844, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -1.62175452709198, "rewards/margins": 7.167821407318115, "rewards/rejected": -8.789575576782227, "step": 537 }, { "epoch": 3.188148148148148, "grad_norm": 2.577399784863567, "learning_rate": 3.75e-07, "logits/chosen": -1.3915406465530396, "logits/rejected": -1.3516476154327393, "logps/chosen": -36.945255279541016, "logps/rejected": -62.853023529052734, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -1.697420597076416, "rewards/margins": 8.28250503540039, "rewards/rejected": -9.979926109313965, "step": 538 }, { "epoch": 3.194074074074074, "grad_norm": 2.4302332490814877, "learning_rate": 3.7443698576415795e-07, "logits/chosen": -1.3171603679656982, "logits/rejected": -1.2523666620254517, "logps/chosen": -57.70205307006836, "logps/rejected": -60.83525085449219, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -1.0728650093078613, "rewards/margins": 6.117823600769043, "rewards/rejected": -7.190688610076904, "step": 539 }, { "epoch": 3.2, "grad_norm": 1.7579619325838043, "learning_rate": 3.738731313018019e-07, "logits/chosen": -1.5011435747146606, "logits/rejected": -1.496269702911377, "logps/chosen": -42.36829376220703, "logps/rejected": -60.10231018066406, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.6206650733947754, "rewards/margins": 6.9321393966674805, "rewards/rejected": -7.552803993225098, "step": 540 }, { "epoch": 3.205925925925926, "grad_norm": 2.539448968573661, "learning_rate": 3.7330844042020384e-07, "logits/chosen": -1.5449732542037964, "logits/rejected": -1.5869890451431274, "logps/chosen": -42.37911605834961, "logps/rejected": -61.17539978027344, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -0.2428092062473297, "rewards/margins": 6.85809850692749, "rewards/rejected": -7.100907325744629, "step": 541 }, { "epoch": 3.211851851851852, "grad_norm": 1.6571223721636208, "learning_rate": 3.727429169322837e-07, "logits/chosen": -1.9273267984390259, "logits/rejected": -1.9184643030166626, "logps/chosen": -34.810516357421875, "logps/rejected": -58.148075103759766, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -1.5371243953704834, "rewards/margins": 6.692122459411621, "rewards/rejected": -8.229247093200684, "step": 542 }, { "epoch": 3.2177777777777776, "grad_norm": 2.3470121776227626, "learning_rate": 3.721765646565833e-07, "logits/chosen": -1.3661789894104004, "logits/rejected": -1.1353942155838013, "logps/chosen": -44.39642333984375, "logps/rejected": -76.97927856445312, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -1.3597421646118164, "rewards/margins": 8.367069244384766, "rewards/rejected": -9.726812362670898, "step": 543 }, { "epoch": 3.2237037037037037, "grad_norm": 2.1307972412032377, "learning_rate": 3.7160938741724057e-07, "logits/chosen": -1.4313926696777344, "logits/rejected": -1.4233020544052124, "logps/chosen": -42.53575134277344, "logps/rejected": -58.79235076904297, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -1.8175709247589111, "rewards/margins": 7.115467071533203, "rewards/rejected": -8.933037757873535, "step": 544 }, { "epoch": 3.2296296296296294, "grad_norm": 2.3490068166225244, "learning_rate": 3.7104138904396374e-07, "logits/chosen": -2.0127451419830322, "logits/rejected": -2.0704212188720703, "logps/chosen": -52.262203216552734, "logps/rejected": -67.50675964355469, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -1.5701168775558472, "rewards/margins": 7.017406940460205, "rewards/rejected": -8.587523460388184, "step": 545 }, { "epoch": 3.2355555555555555, "grad_norm": 4.119175048227472, "learning_rate": 3.704725733720055e-07, "logits/chosen": -1.7419947385787964, "logits/rejected": -1.4981815814971924, "logps/chosen": -46.47538757324219, "logps/rejected": -87.71017456054688, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -1.745780110359192, "rewards/margins": 8.732891082763672, "rewards/rejected": -10.478671073913574, "step": 546 }, { "epoch": 3.2414814814814816, "grad_norm": 2.3392522055872553, "learning_rate": 3.699029442421374e-07, "logits/chosen": -1.5528309345245361, "logits/rejected": -1.6342136859893799, "logps/chosen": -44.845664978027344, "logps/rejected": -68.08673095703125, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.0458078384399414, "rewards/margins": 7.162102222442627, "rewards/rejected": -8.20790958404541, "step": 547 }, { "epoch": 3.2474074074074073, "grad_norm": 2.0232840255336706, "learning_rate": 3.693325055006232e-07, "logits/chosen": -2.193101167678833, "logits/rejected": -2.054701805114746, "logps/chosen": -34.14925003051758, "logps/rejected": -62.127071380615234, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -0.9216932654380798, "rewards/margins": 6.902035236358643, "rewards/rejected": -7.823729038238525, "step": 548 }, { "epoch": 3.2533333333333334, "grad_norm": 2.022216893496469, "learning_rate": 3.6876126099919373e-07, "logits/chosen": -1.370928168296814, "logits/rejected": -1.2814081907272339, "logps/chosen": -32.92713165283203, "logps/rejected": -61.8518180847168, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.1010417938232422, "rewards/margins": 7.316898822784424, "rewards/rejected": -8.417941093444824, "step": 549 }, { "epoch": 3.259259259259259, "grad_norm": 2.899770523123044, "learning_rate": 3.681892145950203e-07, "logits/chosen": -1.481421709060669, "logits/rejected": -1.3554790019989014, "logps/chosen": -39.57270812988281, "logps/rejected": -65.8409194946289, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -1.1680896282196045, "rewards/margins": 6.664047718048096, "rewards/rejected": -7.832137584686279, "step": 550 }, { "epoch": 3.265185185185185, "grad_norm": 2.315306748673688, "learning_rate": 3.6761637015068893e-07, "logits/chosen": -1.265090823173523, "logits/rejected": -1.2061020135879517, "logps/chosen": -49.136653900146484, "logps/rejected": -82.91927337646484, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -2.704127073287964, "rewards/margins": 10.218045234680176, "rewards/rejected": -12.922172546386719, "step": 551 }, { "epoch": 3.2711111111111113, "grad_norm": 3.497832278225285, "learning_rate": 3.67042731534174e-07, "logits/chosen": -1.5351362228393555, "logits/rejected": -1.425960898399353, "logps/chosen": -45.82518768310547, "logps/rejected": -72.26417541503906, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -2.3638558387756348, "rewards/margins": 8.301955223083496, "rewards/rejected": -10.665811538696289, "step": 552 }, { "epoch": 3.277037037037037, "grad_norm": 2.4080638985142384, "learning_rate": 3.6646830261881263e-07, "logits/chosen": -1.958190679550171, "logits/rejected": -1.7889118194580078, "logps/chosen": -53.166595458984375, "logps/rejected": -86.16584777832031, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.144944429397583, "rewards/margins": 8.708757400512695, "rewards/rejected": -9.853702545166016, "step": 553 }, { "epoch": 3.282962962962963, "grad_norm": 1.773594008636619, "learning_rate": 3.6589308728327797e-07, "logits/chosen": -1.5912288427352905, "logits/rejected": -1.6438047885894775, "logps/chosen": -51.71526336669922, "logps/rejected": -74.97248840332031, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -1.5826064348220825, "rewards/margins": 8.281431198120117, "rewards/rejected": -9.86403751373291, "step": 554 }, { "epoch": 3.2888888888888888, "grad_norm": 2.9348832139784267, "learning_rate": 3.653170894115533e-07, "logits/chosen": -2.006295680999756, "logits/rejected": -1.864182472229004, "logps/chosen": -41.15538024902344, "logps/rejected": -59.866580963134766, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -0.9665942192077637, "rewards/margins": 6.412576198577881, "rewards/rejected": -7.3791704177856445, "step": 555 }, { "epoch": 3.294814814814815, "grad_norm": 2.726239917687131, "learning_rate": 3.6474031289290586e-07, "logits/chosen": -1.9200947284698486, "logits/rejected": -1.7760218381881714, "logps/chosen": -39.4498176574707, "logps/rejected": -66.42085266113281, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -1.2401189804077148, "rewards/margins": 6.8181610107421875, "rewards/rejected": -8.058279991149902, "step": 556 }, { "epoch": 3.300740740740741, "grad_norm": 3.9297578507818494, "learning_rate": 3.641627616218603e-07, "logits/chosen": -1.6229515075683594, "logits/rejected": -1.7213314771652222, "logps/chosen": -43.064247131347656, "logps/rejected": -52.334712982177734, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -0.9696689248085022, "rewards/margins": 5.829996585845947, "rewards/rejected": -6.799665927886963, "step": 557 }, { "epoch": 3.3066666666666666, "grad_norm": 2.5631539966075465, "learning_rate": 3.6358443949817283e-07, "logits/chosen": -1.0429606437683105, "logits/rejected": -1.2104207277297974, "logps/chosen": -62.048316955566406, "logps/rejected": -65.05219268798828, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -2.2899022102355957, "rewards/margins": 6.154999256134033, "rewards/rejected": -8.444901466369629, "step": 558 }, { "epoch": 3.3125925925925928, "grad_norm": 1.3823868358140514, "learning_rate": 3.630053504268046e-07, "logits/chosen": -1.3712897300720215, "logits/rejected": -1.4772628545761108, "logps/chosen": -51.50799560546875, "logps/rejected": -56.672271728515625, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.256461262702942, "rewards/margins": 6.195413112640381, "rewards/rejected": -7.451874256134033, "step": 559 }, { "epoch": 3.3185185185185184, "grad_norm": 3.197742548961152, "learning_rate": 3.62425498317895e-07, "logits/chosen": -1.6012769937515259, "logits/rejected": -1.5433298349380493, "logps/chosen": -43.341773986816406, "logps/rejected": -67.37931823730469, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -1.6422319412231445, "rewards/margins": 7.589832782745361, "rewards/rejected": -9.232064247131348, "step": 560 }, { "epoch": 3.3244444444444445, "grad_norm": 1.2358853384087398, "learning_rate": 3.6184488708673597e-07, "logits/chosen": -1.2647123336791992, "logits/rejected": -1.1407551765441895, "logps/chosen": -44.367061614990234, "logps/rejected": -72.44467163085938, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.255622625350952, "rewards/margins": 7.809691905975342, "rewards/rejected": -10.065313339233398, "step": 561 }, { "epoch": 3.33037037037037, "grad_norm": 1.8475498854849877, "learning_rate": 3.6126352065374517e-07, "logits/chosen": -1.470711350440979, "logits/rejected": -1.288089632987976, "logps/chosen": -47.34082794189453, "logps/rejected": -72.96466827392578, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -1.1720082759857178, "rewards/margins": 7.697314262390137, "rewards/rejected": -8.869322776794434, "step": 562 }, { "epoch": 3.3362962962962963, "grad_norm": 1.9803556762007266, "learning_rate": 3.6068140294443943e-07, "logits/chosen": -1.3812202215194702, "logits/rejected": -1.2750273942947388, "logps/chosen": -43.31711959838867, "logps/rejected": -63.418060302734375, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.6750633716583252, "rewards/margins": 6.8236236572265625, "rewards/rejected": -8.498686790466309, "step": 563 }, { "epoch": 3.3422222222222224, "grad_norm": 2.8447867177205497, "learning_rate": 3.6009853788940856e-07, "logits/chosen": -1.3081015348434448, "logits/rejected": -1.373008131980896, "logps/chosen": -41.99599838256836, "logps/rejected": -54.60331726074219, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -1.2790483236312866, "rewards/margins": 6.154098033905029, "rewards/rejected": -7.433146953582764, "step": 564 }, { "epoch": 3.348148148148148, "grad_norm": 2.611575108505304, "learning_rate": 3.595149294242884e-07, "logits/chosen": -1.4728150367736816, "logits/rejected": -1.578336238861084, "logps/chosen": -39.75627899169922, "logps/rejected": -61.03871154785156, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -0.8853597640991211, "rewards/margins": 6.816481590270996, "rewards/rejected": -7.701841354370117, "step": 565 }, { "epoch": 3.354074074074074, "grad_norm": 1.4157122583137127, "learning_rate": 3.589305814897346e-07, "logits/chosen": -1.7357616424560547, "logits/rejected": -1.8931334018707275, "logps/chosen": -43.53925323486328, "logps/rejected": -71.56130981445312, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -1.1073122024536133, "rewards/margins": 8.838522911071777, "rewards/rejected": -9.94583511352539, "step": 566 }, { "epoch": 3.36, "grad_norm": 2.635191240095001, "learning_rate": 3.5834549803139586e-07, "logits/chosen": -1.142876386642456, "logits/rejected": -1.163784384727478, "logps/chosen": -39.930816650390625, "logps/rejected": -53.69756317138672, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -1.7858086824417114, "rewards/margins": 6.6770148277282715, "rewards/rejected": -8.462823867797852, "step": 567 }, { "epoch": 3.365925925925926, "grad_norm": 1.9424680925260813, "learning_rate": 3.5775968299988725e-07, "logits/chosen": -2.3536765575408936, "logits/rejected": -2.014406681060791, "logps/chosen": -38.420127868652344, "logps/rejected": -81.54600524902344, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.818911612033844, "rewards/margins": 9.348888397216797, "rewards/rejected": -10.167799949645996, "step": 568 }, { "epoch": 3.3718518518518517, "grad_norm": 2.2845030472585948, "learning_rate": 3.571731403507635e-07, "logits/chosen": -1.2182214260101318, "logits/rejected": -1.2486618757247925, "logps/chosen": -37.355804443359375, "logps/rejected": -62.415714263916016, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -0.916069746017456, "rewards/margins": 7.980484485626221, "rewards/rejected": -8.896553993225098, "step": 569 }, { "epoch": 3.3777777777777778, "grad_norm": 7.950874380857367, "learning_rate": 3.565858740444927e-07, "logits/chosen": -1.2446577548980713, "logits/rejected": -1.2009145021438599, "logps/chosen": -34.67760467529297, "logps/rejected": -50.92575454711914, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -1.5964220762252808, "rewards/margins": 5.911198616027832, "rewards/rejected": -7.507620811462402, "step": 570 }, { "epoch": 3.383703703703704, "grad_norm": 1.9717151883082555, "learning_rate": 3.559978880464289e-07, "logits/chosen": -1.235534429550171, "logits/rejected": -1.3239576816558838, "logps/chosen": -39.85081481933594, "logps/rejected": -57.22991180419922, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -0.38666263222694397, "rewards/margins": 7.964129447937012, "rewards/rejected": -8.350791931152344, "step": 571 }, { "epoch": 3.3896296296296295, "grad_norm": 2.2224780083074216, "learning_rate": 3.5540918632678583e-07, "logits/chosen": -1.7430646419525146, "logits/rejected": -1.7376810312271118, "logps/chosen": -47.81946563720703, "logps/rejected": -70.83491516113281, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -1.5437856912612915, "rewards/margins": 8.201151847839355, "rewards/rejected": -9.744937896728516, "step": 572 }, { "epoch": 3.3955555555555557, "grad_norm": 3.4318865366613247, "learning_rate": 3.5481977286060995e-07, "logits/chosen": -1.4170291423797607, "logits/rejected": -1.5433483123779297, "logps/chosen": -47.669212341308594, "logps/rejected": -77.21953582763672, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -2.699216604232788, "rewards/margins": 8.496070861816406, "rewards/rejected": -11.195287704467773, "step": 573 }, { "epoch": 3.4014814814814813, "grad_norm": 1.9972289407606707, "learning_rate": 3.542296516277535e-07, "logits/chosen": -1.040672779083252, "logits/rejected": -1.0020769834518433, "logps/chosen": -47.71010971069336, "logps/rejected": -65.93901824951172, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -1.5380032062530518, "rewards/margins": 7.058391094207764, "rewards/rejected": -8.596394538879395, "step": 574 }, { "epoch": 3.4074074074074074, "grad_norm": 3.0477572554006946, "learning_rate": 3.5363882661284767e-07, "logits/chosen": -1.711016058921814, "logits/rejected": -1.6152186393737793, "logps/chosen": -37.883445739746094, "logps/rejected": -53.224891662597656, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -1.2196171283721924, "rewards/margins": 6.921237945556641, "rewards/rejected": -8.140854835510254, "step": 575 }, { "epoch": 3.413333333333333, "grad_norm": 2.1287963617619745, "learning_rate": 3.53047301805276e-07, "logits/chosen": -1.5620815753936768, "logits/rejected": -1.5769602060317993, "logps/chosen": -53.81451416015625, "logps/rejected": -64.60205841064453, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -0.9155237674713135, "rewards/margins": 7.080358028411865, "rewards/rejected": -7.995882034301758, "step": 576 }, { "epoch": 3.419259259259259, "grad_norm": 3.119553571905832, "learning_rate": 3.5245508119914683e-07, "logits/chosen": -1.545169472694397, "logits/rejected": -1.5148056745529175, "logps/chosen": -44.71506881713867, "logps/rejected": -68.02455139160156, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -1.8168060779571533, "rewards/margins": 8.291409492492676, "rewards/rejected": -10.10821533203125, "step": 577 }, { "epoch": 3.4251851851851853, "grad_norm": 2.1741179080277244, "learning_rate": 3.518621687932671e-07, "logits/chosen": -1.4934191703796387, "logits/rejected": -1.4217729568481445, "logps/chosen": -44.57159423828125, "logps/rejected": -68.36115264892578, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.414076328277588, "rewards/margins": 8.635876655578613, "rewards/rejected": -10.049952507019043, "step": 578 }, { "epoch": 3.431111111111111, "grad_norm": 3.5979935916648906, "learning_rate": 3.5126856859111464e-07, "logits/chosen": -1.3749088048934937, "logits/rejected": -1.2176498174667358, "logps/chosen": -43.616153717041016, "logps/rejected": -71.89463806152344, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.36016517877578735, "rewards/margins": 9.268863677978516, "rewards/rejected": -9.6290283203125, "step": 579 }, { "epoch": 3.437037037037037, "grad_norm": 4.438241469697319, "learning_rate": 3.5067428460081157e-07, "logits/chosen": -1.0625383853912354, "logits/rejected": -1.0654343366622925, "logps/chosen": -34.844093322753906, "logps/rejected": -53.295997619628906, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -0.308157354593277, "rewards/margins": 7.436243057250977, "rewards/rejected": -7.744400501251221, "step": 580 }, { "epoch": 3.442962962962963, "grad_norm": 1.639309679332784, "learning_rate": 3.5007932083509687e-07, "logits/chosen": -1.6686345338821411, "logits/rejected": -1.4766108989715576, "logps/chosen": -44.36106872558594, "logps/rejected": -79.30110168457031, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -0.979895830154419, "rewards/margins": 9.241266250610352, "rewards/rejected": -10.221162796020508, "step": 581 }, { "epoch": 3.448888888888889, "grad_norm": 1.9840382447613365, "learning_rate": 3.494836813112998e-07, "logits/chosen": -1.3479125499725342, "logits/rejected": -1.352927565574646, "logps/chosen": -46.45930862426758, "logps/rejected": -60.15100860595703, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -1.6770470142364502, "rewards/margins": 8.113531112670898, "rewards/rejected": -9.790578842163086, "step": 582 }, { "epoch": 3.454814814814815, "grad_norm": 2.367955115764623, "learning_rate": 3.488873700513124e-07, "logits/chosen": -1.7887319326400757, "logits/rejected": -1.5854450464248657, "logps/chosen": -43.808101654052734, "logps/rejected": -76.43734741210938, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -1.833268165588379, "rewards/margins": 9.030920028686523, "rewards/rejected": -10.864188194274902, "step": 583 }, { "epoch": 3.4607407407407407, "grad_norm": 1.0365638614829655, "learning_rate": 3.482903910815625e-07, "logits/chosen": -1.5932282209396362, "logits/rejected": -1.48412024974823, "logps/chosen": -37.209903717041016, "logps/rejected": -77.74060821533203, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.3066656589508057, "rewards/margins": 8.648603439331055, "rewards/rejected": -9.955268859863281, "step": 584 }, { "epoch": 3.466666666666667, "grad_norm": 2.3646988217750335, "learning_rate": 3.476927484329862e-07, "logits/chosen": -1.5596637725830078, "logits/rejected": -1.6348111629486084, "logps/chosen": -42.485225677490234, "logps/rejected": -53.519187927246094, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.168082356452942, "rewards/margins": 6.6856207847595215, "rewards/rejected": -7.853703022003174, "step": 585 }, { "epoch": 3.4725925925925925, "grad_norm": 3.430393631502938, "learning_rate": 3.4709444614100113e-07, "logits/chosen": -1.573261022567749, "logits/rejected": -1.584892749786377, "logps/chosen": -40.92796325683594, "logps/rejected": -58.3778076171875, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -0.7870615124702454, "rewards/margins": 7.023894309997559, "rewards/rejected": -7.810956001281738, "step": 586 }, { "epoch": 3.4785185185185186, "grad_norm": 3.106079011723257, "learning_rate": 3.46495488245479e-07, "logits/chosen": -1.4963181018829346, "logits/rejected": -1.3813966512680054, "logps/chosen": -32.22932434082031, "logps/rejected": -63.854183197021484, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -0.8944528102874756, "rewards/margins": 9.122952461242676, "rewards/rejected": -10.017404556274414, "step": 587 }, { "epoch": 3.4844444444444447, "grad_norm": 0.9107948624763849, "learning_rate": 3.4589587879071814e-07, "logits/chosen": -1.522757887840271, "logits/rejected": -1.4110288619995117, "logps/chosen": -31.915634155273438, "logps/rejected": -72.63192749023438, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -1.4233702421188354, "rewards/margins": 10.482945442199707, "rewards/rejected": -11.906315803527832, "step": 588 }, { "epoch": 3.4903703703703703, "grad_norm": 0.7654845847244033, "learning_rate": 3.452956218254165e-07, "logits/chosen": -0.7181179523468018, "logits/rejected": -0.9414831399917603, "logps/chosen": -60.01945114135742, "logps/rejected": -79.92008209228516, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -2.5449278354644775, "rewards/margins": 8.45158863067627, "rewards/rejected": -10.996516227722168, "step": 589 }, { "epoch": 3.4962962962962965, "grad_norm": 2.7343597937414907, "learning_rate": 3.44694721402644e-07, "logits/chosen": -1.83599853515625, "logits/rejected": -1.9086142778396606, "logps/chosen": -42.17871856689453, "logps/rejected": -66.1895751953125, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -1.6002362966537476, "rewards/margins": 9.458833694458008, "rewards/rejected": -11.059069633483887, "step": 590 }, { "epoch": 3.502222222222222, "grad_norm": 1.115662564068377, "learning_rate": 3.440931815798156e-07, "logits/chosen": -1.4597891569137573, "logits/rejected": -1.5365428924560547, "logps/chosen": -38.84928512573242, "logps/rejected": -55.871421813964844, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -1.3517990112304688, "rewards/margins": 7.835026741027832, "rewards/rejected": -9.186826705932617, "step": 591 }, { "epoch": 3.5081481481481482, "grad_norm": 1.631199912161018, "learning_rate": 3.434910064186633e-07, "logits/chosen": -1.519524097442627, "logits/rejected": -1.2287867069244385, "logps/chosen": -54.680198669433594, "logps/rejected": -81.27238464355469, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -1.0984071493148804, "rewards/margins": 9.239324569702148, "rewards/rejected": -10.337732315063477, "step": 592 }, { "epoch": 3.514074074074074, "grad_norm": 2.2319395068778145, "learning_rate": 3.428881999852093e-07, "logits/chosen": -1.8822609186172485, "logits/rejected": -1.9900339841842651, "logps/chosen": -60.23460388183594, "logps/rejected": -66.42163848876953, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -3.2803685665130615, "rewards/margins": 7.103879928588867, "rewards/rejected": -10.384248733520508, "step": 593 }, { "epoch": 3.52, "grad_norm": 1.4189293626243713, "learning_rate": 3.4228476634973836e-07, "logits/chosen": -1.3194860219955444, "logits/rejected": -1.3229904174804688, "logps/chosen": -35.27375793457031, "logps/rejected": -47.984230041503906, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -1.1222331523895264, "rewards/margins": 6.366780757904053, "rewards/rejected": -7.489014148712158, "step": 594 }, { "epoch": 3.525925925925926, "grad_norm": 1.4373583401932304, "learning_rate": 3.4168070958676985e-07, "logits/chosen": -1.0970607995986938, "logits/rejected": -0.8358435034751892, "logps/chosen": -32.3553352355957, "logps/rejected": -66.28813934326172, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.2889058589935303, "rewards/margins": 7.470686435699463, "rewards/rejected": -9.759592056274414, "step": 595 }, { "epoch": 3.531851851851852, "grad_norm": 1.6877804430201413, "learning_rate": 3.41076033775031e-07, "logits/chosen": -1.2642066478729248, "logits/rejected": -1.1834553480148315, "logps/chosen": -45.60649108886719, "logps/rejected": -72.86331176757812, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -1.6096386909484863, "rewards/margins": 8.637361526489258, "rewards/rejected": -10.246999740600586, "step": 596 }, { "epoch": 3.537777777777778, "grad_norm": 1.8759383611475728, "learning_rate": 3.404707429974289e-07, "logits/chosen": -1.5445129871368408, "logits/rejected": -1.601426362991333, "logps/chosen": -44.18944549560547, "logps/rejected": -64.6243667602539, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -2.5349621772766113, "rewards/margins": 7.849261283874512, "rewards/rejected": -10.384223937988281, "step": 597 }, { "epoch": 3.5437037037037036, "grad_norm": 2.6280084496105967, "learning_rate": 3.3986484134102294e-07, "logits/chosen": -1.6996586322784424, "logits/rejected": -1.57733154296875, "logps/chosen": -32.68741226196289, "logps/rejected": -51.48966979980469, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -1.0923914909362793, "rewards/margins": 7.8780293464660645, "rewards/rejected": -8.970420837402344, "step": 598 }, { "epoch": 3.5496296296296297, "grad_norm": 2.0022463955320684, "learning_rate": 3.392583328969975e-07, "logits/chosen": -1.815554141998291, "logits/rejected": -1.819061517715454, "logps/chosen": -43.40631866455078, "logps/rejected": -59.122802734375, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.6866397857666016, "rewards/margins": 7.3134284019470215, "rewards/rejected": -9.000067710876465, "step": 599 }, { "epoch": 3.5555555555555554, "grad_norm": 1.6284522933445447, "learning_rate": 3.3865122176063385e-07, "logits/chosen": -1.4640413522720337, "logits/rejected": -1.4328960180282593, "logps/chosen": -64.9375, "logps/rejected": -82.05753326416016, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.3941292762756348, "rewards/margins": 8.559239387512207, "rewards/rejected": -11.953369140625, "step": 600 }, { "epoch": 3.5614814814814815, "grad_norm": 2.5895176046146102, "learning_rate": 3.380435120312831e-07, "logits/chosen": -2.206264019012451, "logits/rejected": -1.858229160308838, "logps/chosen": -32.72801208496094, "logps/rejected": -79.77110290527344, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8773959875106812, "rewards/margins": 9.627907752990723, "rewards/rejected": -10.505304336547852, "step": 601 }, { "epoch": 3.5674074074074076, "grad_norm": 1.864701579440055, "learning_rate": 3.374352078123379e-07, "logits/chosen": -1.914277195930481, "logits/rejected": -1.7512773275375366, "logps/chosen": -47.07988739013672, "logps/rejected": -86.24842834472656, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.883514165878296, "rewards/margins": 9.177932739257812, "rewards/rejected": -12.061447143554688, "step": 602 }, { "epoch": 3.5733333333333333, "grad_norm": 4.481967030457234, "learning_rate": 3.36826313211205e-07, "logits/chosen": -1.933127999305725, "logits/rejected": -1.8415874242782593, "logps/chosen": -40.879615783691406, "logps/rejected": -72.97755432128906, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -1.9781012535095215, "rewards/margins": 8.083052635192871, "rewards/rejected": -10.061153411865234, "step": 603 }, { "epoch": 3.5792592592592594, "grad_norm": 3.978013520120644, "learning_rate": 3.36216832339278e-07, "logits/chosen": -1.7329645156860352, "logits/rejected": -1.6681479215621948, "logps/chosen": -56.697105407714844, "logps/rejected": -81.9181900024414, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.683328151702881, "rewards/margins": 10.081719398498535, "rewards/rejected": -12.765048027038574, "step": 604 }, { "epoch": 3.585185185185185, "grad_norm": 1.4428693502250116, "learning_rate": 3.3560676931190866e-07, "logits/chosen": -1.5751123428344727, "logits/rejected": -1.5669260025024414, "logps/chosen": -60.73744201660156, "logps/rejected": -90.38732147216797, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -1.3062713146209717, "rewards/margins": 10.367473602294922, "rewards/rejected": -11.673744201660156, "step": 605 }, { "epoch": 3.591111111111111, "grad_norm": 2.0391085171572185, "learning_rate": 3.3499612824837976e-07, "logits/chosen": -1.114881157875061, "logits/rejected": -0.9926152229309082, "logps/chosen": -44.00819396972656, "logps/rejected": -70.94087219238281, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -1.272582769393921, "rewards/margins": 9.082612037658691, "rewards/rejected": -10.355194091796875, "step": 606 }, { "epoch": 3.597037037037037, "grad_norm": 1.057804941519028, "learning_rate": 3.343849132718771e-07, "logits/chosen": -1.5145657062530518, "logits/rejected": -1.4814441204071045, "logps/chosen": -41.45111083984375, "logps/rejected": -62.060455322265625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -1.5824816226959229, "rewards/margins": 6.7437310218811035, "rewards/rejected": -8.326212882995605, "step": 607 }, { "epoch": 3.602962962962963, "grad_norm": 2.626600924225498, "learning_rate": 3.337731285094616e-07, "logits/chosen": -1.8049207925796509, "logits/rejected": -1.74497652053833, "logps/chosen": -40.572940826416016, "logps/rejected": -63.675750732421875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -2.2193665504455566, "rewards/margins": 6.881660461425781, "rewards/rejected": -9.10102653503418, "step": 608 }, { "epoch": 3.608888888888889, "grad_norm": 1.2907642289410493, "learning_rate": 3.3316077809204163e-07, "logits/chosen": -1.744649052619934, "logits/rejected": -1.6019152402877808, "logps/chosen": -52.210235595703125, "logps/rejected": -74.92243194580078, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -1.9966752529144287, "rewards/margins": 9.754047393798828, "rewards/rejected": -11.750722885131836, "step": 609 }, { "epoch": 3.6148148148148147, "grad_norm": 2.2152999887221654, "learning_rate": 3.3254786615434495e-07, "logits/chosen": -1.7057602405548096, "logits/rejected": -1.7353460788726807, "logps/chosen": -33.89598083496094, "logps/rejected": -52.0216064453125, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.5950571298599243, "rewards/margins": 6.827630996704102, "rewards/rejected": -7.422687530517578, "step": 610 }, { "epoch": 3.620740740740741, "grad_norm": 2.6009366158084894, "learning_rate": 3.319343968348908e-07, "logits/chosen": -1.6717418432235718, "logits/rejected": -1.541961431503296, "logps/chosen": -45.757110595703125, "logps/rejected": -77.61996459960938, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.8870725631713867, "rewards/margins": 9.506970405578613, "rewards/rejected": -12.394044876098633, "step": 611 }, { "epoch": 3.626666666666667, "grad_norm": 1.2297638615351605, "learning_rate": 3.3132037427596186e-07, "logits/chosen": -1.7202712297439575, "logits/rejected": -1.6844654083251953, "logps/chosen": -31.21420669555664, "logps/rejected": -62.227325439453125, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -1.343228816986084, "rewards/margins": 9.035632133483887, "rewards/rejected": -10.378860473632812, "step": 612 }, { "epoch": 3.6325925925925926, "grad_norm": 2.3969901798812137, "learning_rate": 3.3070580262357676e-07, "logits/chosen": -0.8491813540458679, "logits/rejected": -0.882433295249939, "logps/chosen": -47.235923767089844, "logps/rejected": -62.53350830078125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -0.8392652273178101, "rewards/margins": 7.459800720214844, "rewards/rejected": -8.299065589904785, "step": 613 }, { "epoch": 3.6385185185185183, "grad_norm": 1.5139951967012626, "learning_rate": 3.3009068602746135e-07, "logits/chosen": -1.6768100261688232, "logits/rejected": -1.4328532218933105, "logps/chosen": -50.028465270996094, "logps/rejected": -88.61032104492188, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.902674436569214, "rewards/margins": 10.083459854125977, "rewards/rejected": -12.986133575439453, "step": 614 }, { "epoch": 3.6444444444444444, "grad_norm": 3.876731182363171, "learning_rate": 3.294750286410213e-07, "logits/chosen": -1.7702586650848389, "logits/rejected": -1.7367758750915527, "logps/chosen": -41.51939392089844, "logps/rejected": -65.0497817993164, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -1.2305309772491455, "rewards/margins": 8.569968223571777, "rewards/rejected": -9.800498962402344, "step": 615 }, { "epoch": 3.6503703703703705, "grad_norm": 5.985446773820635, "learning_rate": 3.288588346213139e-07, "logits/chosen": -1.6211884021759033, "logits/rejected": -1.6923515796661377, "logps/chosen": -47.695152282714844, "logps/rejected": -63.015865325927734, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.490757942199707, "rewards/margins": 8.131473541259766, "rewards/rejected": -9.622231483459473, "step": 616 }, { "epoch": 3.656296296296296, "grad_norm": 2.232883236877547, "learning_rate": 3.282421081290195e-07, "logits/chosen": -1.707023024559021, "logits/rejected": -1.5826444625854492, "logps/chosen": -47.43241882324219, "logps/rejected": -76.33192443847656, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -1.0400581359863281, "rewards/margins": 7.394491672515869, "rewards/rejected": -8.434549331665039, "step": 617 }, { "epoch": 3.6622222222222223, "grad_norm": 2.0024865182396723, "learning_rate": 3.2762485332841404e-07, "logits/chosen": -1.7938523292541504, "logits/rejected": -1.6399613618850708, "logps/chosen": -33.98335266113281, "logps/rejected": -55.67890167236328, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -0.4796392321586609, "rewards/margins": 6.540268421173096, "rewards/rejected": -7.019907474517822, "step": 618 }, { "epoch": 3.6681481481481484, "grad_norm": 2.1922455464812303, "learning_rate": 3.27007074387341e-07, "logits/chosen": -1.6379787921905518, "logits/rejected": -1.5925250053405762, "logps/chosen": -47.66102600097656, "logps/rejected": -62.999855041503906, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -1.8236360549926758, "rewards/margins": 7.007270812988281, "rewards/rejected": -8.830906867980957, "step": 619 }, { "epoch": 3.674074074074074, "grad_norm": 3.044208794209425, "learning_rate": 3.2638877547718263e-07, "logits/chosen": -1.7888829708099365, "logits/rejected": -1.5564265251159668, "logps/chosen": -41.539024353027344, "logps/rejected": -66.69727325439453, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -2.7005274295806885, "rewards/margins": 8.028372764587402, "rewards/rejected": -10.728899955749512, "step": 620 }, { "epoch": 3.68, "grad_norm": 2.5233284063470625, "learning_rate": 3.2576996077283217e-07, "logits/chosen": -0.9492640495300293, "logits/rejected": -0.8763157725334167, "logps/chosen": -43.827125549316406, "logps/rejected": -68.0890884399414, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.6933960914611816, "rewards/margins": 9.143272399902344, "rewards/rejected": -11.836669921875, "step": 621 }, { "epoch": 3.685925925925926, "grad_norm": 1.3687699276618075, "learning_rate": 3.251506344526658e-07, "logits/chosen": -1.7169603109359741, "logits/rejected": -1.5893797874450684, "logps/chosen": -40.559478759765625, "logps/rejected": -72.53877258300781, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.8258345127105713, "rewards/margins": 8.606887817382812, "rewards/rejected": -10.432722091674805, "step": 622 }, { "epoch": 3.691851851851852, "grad_norm": 1.811447056426145, "learning_rate": 3.2453080069851403e-07, "logits/chosen": -1.3575303554534912, "logits/rejected": -1.4146182537078857, "logps/chosen": -50.56128692626953, "logps/rejected": -69.76527404785156, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -2.2796287536621094, "rewards/margins": 8.239351272583008, "rewards/rejected": -10.518980026245117, "step": 623 }, { "epoch": 3.6977777777777776, "grad_norm": 3.3947370038677263, "learning_rate": 3.239104636956337e-07, "logits/chosen": -1.722593069076538, "logits/rejected": -1.583823323249817, "logps/chosen": -51.618125915527344, "logps/rejected": -80.45478820800781, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.1515426635742188, "rewards/margins": 9.869569778442383, "rewards/rejected": -11.021112442016602, "step": 624 }, { "epoch": 3.7037037037037037, "grad_norm": 0.7776787350074231, "learning_rate": 3.2328962763267993e-07, "logits/chosen": -1.3223106861114502, "logits/rejected": -1.3556057214736938, "logps/chosen": -45.37212371826172, "logps/rejected": -70.70166015625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.9760671854019165, "rewards/margins": 8.622611999511719, "rewards/rejected": -10.598679542541504, "step": 625 }, { "epoch": 3.70962962962963, "grad_norm": 2.5724293932549016, "learning_rate": 3.2266829670167736e-07, "logits/chosen": -1.8816306591033936, "logits/rejected": -1.727679967880249, "logps/chosen": -43.26377868652344, "logps/rejected": -87.9742660522461, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.6621125936508179, "rewards/margins": 10.484230041503906, "rewards/rejected": -12.146342277526855, "step": 626 }, { "epoch": 3.7155555555555555, "grad_norm": 2.743832235404827, "learning_rate": 3.2204647509799216e-07, "logits/chosen": -1.9375836849212646, "logits/rejected": -1.9025050401687622, "logps/chosen": -62.01106643676758, "logps/rejected": -70.475830078125, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -2.905947208404541, "rewards/margins": 7.062991142272949, "rewards/rejected": -9.968938827514648, "step": 627 }, { "epoch": 3.7214814814814816, "grad_norm": 1.7788380759300608, "learning_rate": 3.2142416702030365e-07, "logits/chosen": -1.6858139038085938, "logits/rejected": -1.46574068069458, "logps/chosen": -34.817596435546875, "logps/rejected": -68.91419982910156, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -1.0730873346328735, "rewards/margins": 8.742530822753906, "rewards/rejected": -9.815618515014648, "step": 628 }, { "epoch": 3.7274074074074073, "grad_norm": 2.0130515415798644, "learning_rate": 3.2080137667057595e-07, "logits/chosen": -1.6360899209976196, "logits/rejected": -1.587815523147583, "logps/chosen": -36.161739349365234, "logps/rejected": -51.88941192626953, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.8041471242904663, "rewards/margins": 7.299750804901123, "rewards/rejected": -8.103898048400879, "step": 629 }, { "epoch": 3.7333333333333334, "grad_norm": 3.822746078893138, "learning_rate": 3.201781082540297e-07, "logits/chosen": -1.9967753887176514, "logits/rejected": -1.8232698440551758, "logps/chosen": -31.268970489501953, "logps/rejected": -60.96702194213867, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -0.2652736306190491, "rewards/margins": 7.545168876647949, "rewards/rejected": -7.810442924499512, "step": 630 }, { "epoch": 3.739259259259259, "grad_norm": 3.4510801637868753, "learning_rate": 3.1955436597911315e-07, "logits/chosen": -1.2564302682876587, "logits/rejected": -1.2304657697677612, "logps/chosen": -44.99286651611328, "logps/rejected": -61.647193908691406, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -1.33876633644104, "rewards/margins": 8.576675415039062, "rewards/rejected": -9.915441513061523, "step": 631 }, { "epoch": 3.745185185185185, "grad_norm": 2.702360678099268, "learning_rate": 3.1893015405747467e-07, "logits/chosen": -1.3866653442382812, "logits/rejected": -1.3158271312713623, "logps/chosen": -37.21662521362305, "logps/rejected": -61.110347747802734, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -1.9909217357635498, "rewards/margins": 10.208771705627441, "rewards/rejected": -12.19969367980957, "step": 632 }, { "epoch": 3.7511111111111113, "grad_norm": 1.307284115251361, "learning_rate": 3.183054767039333e-07, "logits/chosen": -1.290824294090271, "logits/rejected": -1.5284253358840942, "logps/chosen": -60.69805145263672, "logps/rejected": -65.60289001464844, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.2406201362609863, "rewards/margins": 7.582151412963867, "rewards/rejected": -9.822771072387695, "step": 633 }, { "epoch": 3.757037037037037, "grad_norm": 1.5871167186944344, "learning_rate": 3.176803381364512e-07, "logits/chosen": -2.3787388801574707, "logits/rejected": -2.057800769805908, "logps/chosen": -46.9011344909668, "logps/rejected": -81.9096908569336, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9250102043151855, "rewards/margins": 8.01507568359375, "rewards/rejected": -10.940086364746094, "step": 634 }, { "epoch": 3.762962962962963, "grad_norm": 2.839912087777198, "learning_rate": 3.170547425761046e-07, "logits/chosen": -1.5725014209747314, "logits/rejected": -1.5229789018630981, "logps/chosen": -37.74675750732422, "logps/rejected": -68.0882568359375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -1.480069637298584, "rewards/margins": 9.368812561035156, "rewards/rejected": -10.848881721496582, "step": 635 }, { "epoch": 3.7688888888888887, "grad_norm": 3.5480712966399017, "learning_rate": 3.164286942470553e-07, "logits/chosen": -1.657641887664795, "logits/rejected": -1.3071664571762085, "logps/chosen": -42.06265640258789, "logps/rejected": -89.001220703125, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -1.8841521739959717, "rewards/margins": 11.081164360046387, "rewards/rejected": -12.965315818786621, "step": 636 }, { "epoch": 3.774814814814815, "grad_norm": 1.7011165480825463, "learning_rate": 3.1580219737652254e-07, "logits/chosen": -1.8198199272155762, "logits/rejected": -1.7223021984100342, "logps/chosen": -40.75661087036133, "logps/rejected": -70.20028686523438, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -2.252716541290283, "rewards/margins": 9.318144798278809, "rewards/rejected": -11.57086181640625, "step": 637 }, { "epoch": 3.7807407407407405, "grad_norm": 1.9852461114559077, "learning_rate": 3.1517525619475394e-07, "logits/chosen": -1.4178622961044312, "logits/rejected": -1.3874255418777466, "logps/chosen": -35.899024963378906, "logps/rejected": -53.1900634765625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -1.4967761039733887, "rewards/margins": 7.301504135131836, "rewards/rejected": -8.798280715942383, "step": 638 }, { "epoch": 3.7866666666666666, "grad_norm": 0.9802194626023532, "learning_rate": 3.145478749349974e-07, "logits/chosen": -1.647719144821167, "logits/rejected": -1.615530252456665, "logps/chosen": -52.62092208862305, "logps/rejected": -74.0809326171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.5072264671325684, "rewards/margins": 8.355988502502441, "rewards/rejected": -10.863214492797852, "step": 639 }, { "epoch": 3.7925925925925927, "grad_norm": 2.410614126181565, "learning_rate": 3.139200578334724e-07, "logits/chosen": -1.3317790031433105, "logits/rejected": -1.2641746997833252, "logps/chosen": -47.751319885253906, "logps/rejected": -70.82566833496094, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -2.740259885787964, "rewards/margins": 6.87734317779541, "rewards/rejected": -9.617603302001953, "step": 640 }, { "epoch": 3.7985185185185184, "grad_norm": 3.0815761741282435, "learning_rate": 3.132918091293411e-07, "logits/chosen": -1.2445253133773804, "logits/rejected": -1.2281543016433716, "logps/chosen": -44.184261322021484, "logps/rejected": -67.85798645019531, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -2.710573196411133, "rewards/margins": 7.016053676605225, "rewards/rejected": -9.726627349853516, "step": 641 }, { "epoch": 3.8044444444444445, "grad_norm": 2.3331148256321246, "learning_rate": 3.126631330646801e-07, "logits/chosen": -1.5085358619689941, "logits/rejected": -1.5857086181640625, "logps/chosen": -37.31532669067383, "logps/rejected": -60.988746643066406, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -0.8923465013504028, "rewards/margins": 8.49822998046875, "rewards/rejected": -9.390576362609863, "step": 642 }, { "epoch": 3.8103703703703706, "grad_norm": 2.3789903560495462, "learning_rate": 3.120340338844516e-07, "logits/chosen": -2.025918960571289, "logits/rejected": -2.0642755031585693, "logps/chosen": -41.12156677246094, "logps/rejected": -59.014190673828125, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -2.0245141983032227, "rewards/margins": 8.251988410949707, "rewards/rejected": -10.27650260925293, "step": 643 }, { "epoch": 3.8162962962962963, "grad_norm": 2.6624718851606644, "learning_rate": 3.1140451583647464e-07, "logits/chosen": -1.790588617324829, "logits/rejected": -1.8795886039733887, "logps/chosen": -40.51091766357422, "logps/rejected": -71.70164489746094, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -2.3066911697387695, "rewards/margins": 10.29784870147705, "rewards/rejected": -12.60453987121582, "step": 644 }, { "epoch": 3.822222222222222, "grad_norm": 2.1636336873715973, "learning_rate": 3.1077458317139677e-07, "logits/chosen": -1.3567291498184204, "logits/rejected": -1.442582368850708, "logps/chosen": -37.33628845214844, "logps/rejected": -50.815921783447266, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4248002767562866, "rewards/margins": 6.379508018493652, "rewards/rejected": -7.80430793762207, "step": 645 }, { "epoch": 3.828148148148148, "grad_norm": 3.142146106508061, "learning_rate": 3.1014424014266494e-07, "logits/chosen": -1.6922850608825684, "logits/rejected": -1.6728633642196655, "logps/chosen": -32.17321014404297, "logps/rejected": -57.95257568359375, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -1.2450605630874634, "rewards/margins": 9.735218048095703, "rewards/rejected": -10.980278015136719, "step": 646 }, { "epoch": 3.834074074074074, "grad_norm": 1.0327148739028738, "learning_rate": 3.095134910064971e-07, "logits/chosen": -1.3771042823791504, "logits/rejected": -1.573880672454834, "logps/chosen": -50.5789680480957, "logps/rejected": -55.373573303222656, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.5233265161514282, "rewards/margins": 7.985459327697754, "rewards/rejected": -9.508785247802734, "step": 647 }, { "epoch": 3.84, "grad_norm": 1.7197133806532463, "learning_rate": 3.0888234002185325e-07, "logits/chosen": -1.8431189060211182, "logits/rejected": -1.7777403593063354, "logps/chosen": -34.486351013183594, "logps/rejected": -60.21614074707031, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -1.4829984903335571, "rewards/margins": 8.156380653381348, "rewards/rejected": -9.639379501342773, "step": 648 }, { "epoch": 3.845925925925926, "grad_norm": 1.7262987298671724, "learning_rate": 3.082507914504068e-07, "logits/chosen": -1.2412363290786743, "logits/rejected": -1.2558114528656006, "logps/chosen": -46.88982009887695, "logps/rejected": -74.43185424804688, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -2.0637423992156982, "rewards/margins": 9.63477897644043, "rewards/rejected": -11.69852066040039, "step": 649 }, { "epoch": 3.851851851851852, "grad_norm": 3.642086296060839, "learning_rate": 3.0761884955651563e-07, "logits/chosen": -1.5971522331237793, "logits/rejected": -1.6424740552902222, "logps/chosen": -48.26805114746094, "logps/rejected": -54.9079704284668, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -0.8122729659080505, "rewards/margins": 7.315271377563477, "rewards/rejected": -8.127544403076172, "step": 650 }, { "epoch": 3.8577777777777778, "grad_norm": 2.5304696991003826, "learning_rate": 3.069865186071938e-07, "logits/chosen": -1.5003931522369385, "logits/rejected": -1.357499599456787, "logps/chosen": -38.52720642089844, "logps/rejected": -66.80923461914062, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.9663707613945007, "rewards/margins": 9.103131294250488, "rewards/rejected": -10.069501876831055, "step": 651 }, { "epoch": 3.863703703703704, "grad_norm": 1.275499734191683, "learning_rate": 3.0635380287208184e-07, "logits/chosen": -2.0289735794067383, "logits/rejected": -2.004368543624878, "logps/chosen": -41.99580383300781, "logps/rejected": -66.64373779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.089265823364258, "rewards/margins": 9.074212074279785, "rewards/rejected": -11.163477897644043, "step": 652 }, { "epoch": 3.8696296296296295, "grad_norm": 3.416542035435825, "learning_rate": 3.057207066234188e-07, "logits/chosen": -1.1870657205581665, "logits/rejected": -1.2482651472091675, "logps/chosen": -42.56634521484375, "logps/rejected": -58.963680267333984, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -0.7613425850868225, "rewards/margins": 7.487583637237549, "rewards/rejected": -8.248926162719727, "step": 653 }, { "epoch": 3.8755555555555556, "grad_norm": 2.8088220635856005, "learning_rate": 3.0508723413601296e-07, "logits/chosen": -0.9739608764648438, "logits/rejected": -0.9807726144790649, "logps/chosen": -47.68108367919922, "logps/rejected": -66.4764175415039, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -2.5027942657470703, "rewards/margins": 7.769590377807617, "rewards/rejected": -10.272384643554688, "step": 654 }, { "epoch": 3.8814814814814813, "grad_norm": 2.674068519159376, "learning_rate": 3.0445338968721283e-07, "logits/chosen": -1.8572252988815308, "logits/rejected": -1.7199347019195557, "logps/chosen": -51.708492279052734, "logps/rejected": -80.59733581542969, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -1.1489932537078857, "rewards/margins": 10.27888298034668, "rewards/rejected": -11.427876472473145, "step": 655 }, { "epoch": 3.8874074074074074, "grad_norm": 2.9778285430298514, "learning_rate": 3.0381917755687896e-07, "logits/chosen": -1.8231042623519897, "logits/rejected": -1.619692325592041, "logps/chosen": -44.044952392578125, "logps/rejected": -72.98948669433594, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -2.5052106380462646, "rewards/margins": 8.7330904006958, "rewards/rejected": -11.238300323486328, "step": 656 }, { "epoch": 3.8933333333333335, "grad_norm": 5.21161187358889, "learning_rate": 3.0318460202735415e-07, "logits/chosen": -1.2046602964401245, "logits/rejected": -1.2679613828659058, "logps/chosen": -36.49602508544922, "logps/rejected": -59.53178024291992, "loss": 0.0255, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9609317779541016, "rewards/margins": 7.4171247482299805, "rewards/rejected": -9.378056526184082, "step": 657 }, { "epoch": 3.899259259259259, "grad_norm": 0.901184093067749, "learning_rate": 3.025496673834351e-07, "logits/chosen": -1.6594105958938599, "logits/rejected": -1.6217676401138306, "logps/chosen": -46.861385345458984, "logps/rejected": -63.68336486816406, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.7903817892074585, "rewards/margins": 8.82080364227295, "rewards/rejected": -10.611185073852539, "step": 658 }, { "epoch": 3.9051851851851853, "grad_norm": 4.509371487983056, "learning_rate": 3.0191437791234335e-07, "logits/chosen": -1.4345513582229614, "logits/rejected": -1.4621853828430176, "logps/chosen": -40.582069396972656, "logps/rejected": -67.71416473388672, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.575059175491333, "rewards/margins": 8.932149887084961, "rewards/rejected": -10.507207870483398, "step": 659 }, { "epoch": 3.911111111111111, "grad_norm": 1.974158077967259, "learning_rate": 3.0127873790369625e-07, "logits/chosen": -1.837843656539917, "logits/rejected": -1.8465306758880615, "logps/chosen": -31.680309295654297, "logps/rejected": -47.00579833984375, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -0.8345265984535217, "rewards/margins": 6.112514019012451, "rewards/rejected": -6.94704008102417, "step": 660 }, { "epoch": 3.917037037037037, "grad_norm": 3.9937805277174365, "learning_rate": 3.006427516494781e-07, "logits/chosen": -1.159081220626831, "logits/rejected": -0.9563398957252502, "logps/chosen": -33.920318603515625, "logps/rejected": -63.24695587158203, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -0.36665505170822144, "rewards/margins": 8.001738548278809, "rewards/rejected": -8.368393898010254, "step": 661 }, { "epoch": 3.9229629629629628, "grad_norm": 3.017214815756595, "learning_rate": 3.000064234440111e-07, "logits/chosen": -1.8783760070800781, "logits/rejected": -1.8840227127075195, "logps/chosen": -46.433624267578125, "logps/rejected": -66.72588348388672, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.8653470873832703, "rewards/margins": 7.810515403747559, "rewards/rejected": -8.675862312316895, "step": 662 }, { "epoch": 3.928888888888889, "grad_norm": 1.1211584130537804, "learning_rate": 2.9936975758392644e-07, "logits/chosen": -1.8524677753448486, "logits/rejected": -1.9079548120498657, "logps/chosen": -57.159034729003906, "logps/rejected": -69.21312713623047, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -2.6859822273254395, "rewards/margins": 7.816958904266357, "rewards/rejected": -10.502941131591797, "step": 663 }, { "epoch": 3.934814814814815, "grad_norm": 2.5458746214899377, "learning_rate": 2.9873275836813526e-07, "logits/chosen": -1.7048072814941406, "logits/rejected": -1.8171309232711792, "logps/chosen": -48.66388702392578, "logps/rejected": -63.1533203125, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -1.7343381643295288, "rewards/margins": 8.266016006469727, "rewards/rejected": -10.000354766845703, "step": 664 }, { "epoch": 3.9407407407407407, "grad_norm": 1.9736274445994668, "learning_rate": 2.980954300977995e-07, "logits/chosen": -1.619166374206543, "logits/rejected": -1.5777629613876343, "logps/chosen": -49.931640625, "logps/rejected": -78.51671600341797, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -3.6705405712127686, "rewards/margins": 9.18619155883789, "rewards/rejected": -12.856733322143555, "step": 665 }, { "epoch": 3.9466666666666668, "grad_norm": 2.552365730581539, "learning_rate": 2.974577770763028e-07, "logits/chosen": -1.7241450548171997, "logits/rejected": -1.6734381914138794, "logps/chosen": -46.34965515136719, "logps/rejected": -90.48927307128906, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3870890140533447, "rewards/margins": 11.964576721191406, "rewards/rejected": -13.351667404174805, "step": 666 }, { "epoch": 3.9525925925925924, "grad_norm": 2.3305835074048704, "learning_rate": 2.96819803609222e-07, "logits/chosen": -1.9842724800109863, "logits/rejected": -2.00046443939209, "logps/chosen": -35.96994400024414, "logps/rejected": -57.09709548950195, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -1.034604787826538, "rewards/margins": 7.337852478027344, "rewards/rejected": -8.372458457946777, "step": 667 }, { "epoch": 3.9585185185185185, "grad_norm": 1.5958168958315395, "learning_rate": 2.9618151400429735e-07, "logits/chosen": -1.620787262916565, "logits/rejected": -1.6079652309417725, "logps/chosen": -42.559288024902344, "logps/rejected": -65.05506134033203, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -1.2509933710098267, "rewards/margins": 8.53365421295166, "rewards/rejected": -9.784646987915039, "step": 668 }, { "epoch": 3.964444444444444, "grad_norm": 1.3718053352193202, "learning_rate": 2.955429125714038e-07, "logits/chosen": -1.5058695077896118, "logits/rejected": -1.4394159317016602, "logps/chosen": -37.802268981933594, "logps/rejected": -67.28629302978516, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.9949045777320862, "rewards/margins": 8.948270797729492, "rewards/rejected": -9.943174362182617, "step": 669 }, { "epoch": 3.9703703703703703, "grad_norm": 1.3132523241213343, "learning_rate": 2.949040036225218e-07, "logits/chosen": -1.5042078495025635, "logits/rejected": -1.624751091003418, "logps/chosen": -54.53010940551758, "logps/rejected": -74.37867736816406, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.157304286956787, "rewards/margins": 9.978010177612305, "rewards/rejected": -13.13531494140625, "step": 670 }, { "epoch": 3.9762962962962964, "grad_norm": 1.3508609108880327, "learning_rate": 2.9426479147170836e-07, "logits/chosen": -1.6658778190612793, "logits/rejected": -1.5220093727111816, "logps/chosen": -35.21366500854492, "logps/rejected": -65.32410430908203, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.7612733840942383, "rewards/margins": 9.228185653686523, "rewards/rejected": -10.989458084106445, "step": 671 }, { "epoch": 3.982222222222222, "grad_norm": 1.9276538943992374, "learning_rate": 2.9362528043506767e-07, "logits/chosen": -1.1979081630706787, "logits/rejected": -1.2840179204940796, "logps/chosen": -56.21266555786133, "logps/rejected": -78.62033081054688, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.804077386856079, "rewards/margins": 9.969850540161133, "rewards/rejected": -12.773927688598633, "step": 672 }, { "epoch": 3.988148148148148, "grad_norm": 2.7896360676711693, "learning_rate": 2.929854748307221e-07, "logits/chosen": -1.3991694450378418, "logits/rejected": -1.437917709350586, "logps/chosen": -44.147377014160156, "logps/rejected": -65.40982818603516, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.3406189680099487, "rewards/margins": 9.758732795715332, "rewards/rejected": -11.099350929260254, "step": 673 }, { "epoch": 3.9940740740740743, "grad_norm": 2.80652860847146, "learning_rate": 2.923453789787828e-07, "logits/chosen": -1.7140244245529175, "logits/rejected": -1.575933575630188, "logps/chosen": -46.123146057128906, "logps/rejected": -67.82181549072266, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -1.8217127323150635, "rewards/margins": 8.149543762207031, "rewards/rejected": -9.971257209777832, "step": 674 }, { "epoch": 4.0, "grad_norm": 3.2592488566875524, "learning_rate": 2.9170499720132106e-07, "logits/chosen": -1.7309939861297607, "logits/rejected": -1.6547414064407349, "logps/chosen": -50.30008316040039, "logps/rejected": -86.93077850341797, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.0141303539276123, "rewards/margins": 9.713563919067383, "rewards/rejected": -11.727693557739258, "step": 675 }, { "epoch": 4.005925925925926, "grad_norm": 0.44616270427309485, "learning_rate": 2.9106433382233877e-07, "logits/chosen": -1.6794055700302124, "logits/rejected": -1.6650766134262085, "logps/chosen": -34.78199768066406, "logps/rejected": -61.020729064941406, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.43512761592865, "rewards/margins": 8.089323043823242, "rewards/rejected": -9.524450302124023, "step": 676 }, { "epoch": 4.011851851851852, "grad_norm": 0.341375032203737, "learning_rate": 2.90423393167739e-07, "logits/chosen": -1.6758373975753784, "logits/rejected": -1.7946535348892212, "logps/chosen": -53.309226989746094, "logps/rejected": -90.131591796875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.5563483238220215, "rewards/margins": 12.359565734863281, "rewards/rejected": -13.915914535522461, "step": 677 }, { "epoch": 4.017777777777778, "grad_norm": 0.34700385447182475, "learning_rate": 2.897821795652972e-07, "logits/chosen": -1.9766027927398682, "logits/rejected": -1.864401936531067, "logps/chosen": -38.792755126953125, "logps/rejected": -81.77716064453125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.4709629714488983, "rewards/margins": 11.713024139404297, "rewards/rejected": -12.183988571166992, "step": 678 }, { "epoch": 4.023703703703704, "grad_norm": 0.4037522038903148, "learning_rate": 2.891406973446319e-07, "logits/chosen": -1.5716941356658936, "logits/rejected": -1.5570297241210938, "logps/chosen": -60.495906829833984, "logps/rejected": -77.89375305175781, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.105973958969116, "rewards/margins": 8.402738571166992, "rewards/rejected": -10.508712768554688, "step": 679 }, { "epoch": 4.029629629629629, "grad_norm": 0.30335133752738797, "learning_rate": 2.8849895083717536e-07, "logits/chosen": -1.4498519897460938, "logits/rejected": -1.3987611532211304, "logps/chosen": -45.42414474487305, "logps/rejected": -68.12642669677734, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.8546907901763916, "rewards/margins": 9.858153343200684, "rewards/rejected": -12.712844848632812, "step": 680 }, { "epoch": 4.035555555555556, "grad_norm": 0.5402501262528404, "learning_rate": 2.8785694437614416e-07, "logits/chosen": -1.4688129425048828, "logits/rejected": -1.3592897653579712, "logps/chosen": -41.556304931640625, "logps/rejected": -67.09361267089844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.2745485305786133, "rewards/margins": 8.337224006652832, "rewards/rejected": -10.611772537231445, "step": 681 }, { "epoch": 4.0414814814814815, "grad_norm": 0.557643458455523, "learning_rate": 2.872146822965105e-07, "logits/chosen": -1.367553472518921, "logits/rejected": -1.0743577480316162, "logps/chosen": -34.53441619873047, "logps/rejected": -67.5462646484375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -1.9292609691619873, "rewards/margins": 9.131160736083984, "rewards/rejected": -11.06042194366455, "step": 682 }, { "epoch": 4.047407407407407, "grad_norm": 0.22131488540300684, "learning_rate": 2.865721689349722e-07, "logits/chosen": -1.3739020824432373, "logits/rejected": -1.0636659860610962, "logps/chosen": -43.13876724243164, "logps/rejected": -79.29910278320312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.9893310070037842, "rewards/margins": 10.191972732543945, "rewards/rejected": -12.181304931640625, "step": 683 }, { "epoch": 4.053333333333334, "grad_norm": 0.2563358598810667, "learning_rate": 2.8592940862992415e-07, "logits/chosen": -1.4154634475708008, "logits/rejected": -1.2823362350463867, "logps/chosen": -40.450199127197266, "logps/rejected": -69.16603088378906, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.343075752258301, "rewards/margins": 9.379046440124512, "rewards/rejected": -11.722122192382812, "step": 684 }, { "epoch": 4.059259259259259, "grad_norm": 0.4045380885099888, "learning_rate": 2.8528640572142835e-07, "logits/chosen": -1.6875383853912354, "logits/rejected": -1.620922565460205, "logps/chosen": -32.78309631347656, "logps/rejected": -56.436370849609375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.2603614330291748, "rewards/margins": 9.30524730682373, "rewards/rejected": -10.565608978271484, "step": 685 }, { "epoch": 4.065185185185185, "grad_norm": 0.38515890207527487, "learning_rate": 2.846431645511851e-07, "logits/chosen": -1.4085663557052612, "logits/rejected": -1.379919409751892, "logps/chosen": -36.84951400756836, "logps/rejected": -67.17756652832031, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.5410351753234863, "rewards/margins": 10.908300399780273, "rewards/rejected": -12.449335098266602, "step": 686 }, { "epoch": 4.071111111111111, "grad_norm": 1.1313449143022392, "learning_rate": 2.839996894625037e-07, "logits/chosen": -1.6871917247772217, "logits/rejected": -1.337934136390686, "logps/chosen": -42.18588638305664, "logps/rejected": -82.45413970947266, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -2.5758676528930664, "rewards/margins": 11.01657485961914, "rewards/rejected": -13.592442512512207, "step": 687 }, { "epoch": 4.077037037037037, "grad_norm": 0.7964723232122505, "learning_rate": 2.8335598480027224e-07, "logits/chosen": -1.4245645999908447, "logits/rejected": -1.473569393157959, "logps/chosen": -54.070159912109375, "logps/rejected": -68.75679016113281, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.470885753631592, "rewards/margins": 7.473635673522949, "rewards/rejected": -9.944520950317383, "step": 688 }, { "epoch": 4.082962962962963, "grad_norm": 0.3792283961142719, "learning_rate": 2.8271205491092963e-07, "logits/chosen": -1.634963870048523, "logits/rejected": -1.5209393501281738, "logps/chosen": -38.38444519042969, "logps/rejected": -76.35704803466797, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.5493276119232178, "rewards/margins": 12.205493927001953, "rewards/rejected": -13.75482177734375, "step": 689 }, { "epoch": 4.088888888888889, "grad_norm": 0.3997631224622663, "learning_rate": 2.820679041424352e-07, "logits/chosen": -1.3238720893859863, "logits/rejected": -1.2506842613220215, "logps/chosen": -30.587196350097656, "logps/rejected": -53.626861572265625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.9008562564849854, "rewards/margins": 7.825616836547852, "rewards/rejected": -9.726472854614258, "step": 690 }, { "epoch": 4.094814814814815, "grad_norm": 0.4811894590769858, "learning_rate": 2.814235368442398e-07, "logits/chosen": -1.9758150577545166, "logits/rejected": -1.9664149284362793, "logps/chosen": -52.41566467285156, "logps/rejected": -81.12825775146484, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.485563278198242, "rewards/margins": 9.805296897888184, "rewards/rejected": -12.290860176086426, "step": 691 }, { "epoch": 4.100740740740741, "grad_norm": 0.4049087658472698, "learning_rate": 2.8077895736725647e-07, "logits/chosen": -1.5128250122070312, "logits/rejected": -1.4495248794555664, "logps/chosen": -47.274620056152344, "logps/rejected": -81.28178405761719, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.5623488426208496, "rewards/margins": 10.90963077545166, "rewards/rejected": -13.471979141235352, "step": 692 }, { "epoch": 4.1066666666666665, "grad_norm": 0.7133811245985797, "learning_rate": 2.801341700638307e-07, "logits/chosen": -1.385081171989441, "logits/rejected": -1.4335522651672363, "logps/chosen": -53.6270751953125, "logps/rejected": -71.13097381591797, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.143235683441162, "rewards/margins": 9.599444389343262, "rewards/rejected": -12.742679595947266, "step": 693 }, { "epoch": 4.112592592592593, "grad_norm": 0.4479230736646601, "learning_rate": 2.7948917928771153e-07, "logits/chosen": -1.4062227010726929, "logits/rejected": -1.4977256059646606, "logps/chosen": -42.570579528808594, "logps/rejected": -71.04094696044922, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.6300326585769653, "rewards/margins": 10.64380931854248, "rewards/rejected": -12.273841857910156, "step": 694 }, { "epoch": 4.118518518518519, "grad_norm": 0.2811534329638401, "learning_rate": 2.7884398939402156e-07, "logits/chosen": -1.6848721504211426, "logits/rejected": -1.6964337825775146, "logps/chosen": -38.059608459472656, "logps/rejected": -56.316627502441406, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.5647661685943604, "rewards/margins": 9.532676696777344, "rewards/rejected": -11.097443580627441, "step": 695 }, { "epoch": 4.124444444444444, "grad_norm": 0.5476766976057272, "learning_rate": 2.78198604739228e-07, "logits/chosen": -1.293939471244812, "logits/rejected": -1.3563249111175537, "logps/chosen": -46.362632751464844, "logps/rejected": -52.52191162109375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -1.0411702394485474, "rewards/margins": 8.045003890991211, "rewards/rejected": -9.086174011230469, "step": 696 }, { "epoch": 4.13037037037037, "grad_norm": 0.28241806387271107, "learning_rate": 2.7755302968111346e-07, "logits/chosen": -2.09261155128479, "logits/rejected": -2.1400394439697266, "logps/chosen": -49.53968811035156, "logps/rejected": -97.72592163085938, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.2412805557250977, "rewards/margins": 13.639833450317383, "rewards/rejected": -14.88111400604248, "step": 697 }, { "epoch": 4.136296296296297, "grad_norm": 0.3964227629392675, "learning_rate": 2.7690726857874564e-07, "logits/chosen": -1.924755573272705, "logits/rejected": -1.8481194972991943, "logps/chosen": -40.84855270385742, "logps/rejected": -65.64152526855469, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.4446959495544434, "rewards/margins": 10.051863670349121, "rewards/rejected": -11.496559143066406, "step": 698 }, { "epoch": 4.142222222222222, "grad_norm": 0.18798709520475315, "learning_rate": 2.7626132579244893e-07, "logits/chosen": -1.5864109992980957, "logits/rejected": -1.5083001852035522, "logps/chosen": -42.136207580566406, "logps/rejected": -73.30389404296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.3849995136260986, "rewards/margins": 9.932413101196289, "rewards/rejected": -12.317412376403809, "step": 699 }, { "epoch": 4.148148148148148, "grad_norm": 0.31778139923542953, "learning_rate": 2.756152056837743e-07, "logits/chosen": -1.566931128501892, "logits/rejected": -1.6499443054199219, "logps/chosen": -47.97265625, "logps/rejected": -62.361839294433594, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.526698589324951, "rewards/margins": 9.130287170410156, "rewards/rejected": -11.656986236572266, "step": 700 }, { "epoch": 4.1540740740740745, "grad_norm": 0.27186160437951046, "learning_rate": 2.749689126154698e-07, "logits/chosen": -1.5107808113098145, "logits/rejected": -1.403849482536316, "logps/chosen": -34.08753967285156, "logps/rejected": -59.909873962402344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.3250665664672852, "rewards/margins": 8.911955833435059, "rewards/rejected": -10.237022399902344, "step": 701 }, { "epoch": 4.16, "grad_norm": 0.5485812218119803, "learning_rate": 2.743224509514519e-07, "logits/chosen": -1.731938362121582, "logits/rejected": -1.6260651350021362, "logps/chosen": -43.674652099609375, "logps/rejected": -76.96273803710938, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.195899724960327, "rewards/margins": 11.439129829406738, "rewards/rejected": -14.635029792785645, "step": 702 }, { "epoch": 4.165925925925926, "grad_norm": 0.4031706245145299, "learning_rate": 2.73675825056775e-07, "logits/chosen": -1.7939112186431885, "logits/rejected": -1.9431332349777222, "logps/chosen": -45.181190490722656, "logps/rejected": -61.67738342285156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.5738084316253662, "rewards/margins": 9.546890258789062, "rewards/rejected": -10.120698928833008, "step": 703 }, { "epoch": 4.1718518518518515, "grad_norm": 0.3689385733663178, "learning_rate": 2.730290392976025e-07, "logits/chosen": -1.2359377145767212, "logits/rejected": -1.576162338256836, "logps/chosen": -52.07234191894531, "logps/rejected": -66.13168334960938, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.4633660316467285, "rewards/margins": 9.161190032958984, "rewards/rejected": -11.624555587768555, "step": 704 }, { "epoch": 4.177777777777778, "grad_norm": 0.2884304692703736, "learning_rate": 2.723820980411774e-07, "logits/chosen": -1.401477575302124, "logits/rejected": -1.3764700889587402, "logps/chosen": -37.40475082397461, "logps/rejected": -61.718605041503906, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.7576137781143188, "rewards/margins": 8.915960311889648, "rewards/rejected": -10.67357349395752, "step": 705 }, { "epoch": 4.183703703703704, "grad_norm": 0.40859441414556874, "learning_rate": 2.7173500565579256e-07, "logits/chosen": -1.9278970956802368, "logits/rejected": -1.8463729619979858, "logps/chosen": -56.57386779785156, "logps/rejected": -95.51248168945312, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.846968173980713, "rewards/margins": 13.251940727233887, "rewards/rejected": -17.098907470703125, "step": 706 }, { "epoch": 4.189629629629629, "grad_norm": 0.1915169354428314, "learning_rate": 2.7108776651076116e-07, "logits/chosen": -1.8610478639602661, "logits/rejected": -1.8643684387207031, "logps/chosen": -34.90126419067383, "logps/rejected": -67.45941162109375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.8161227703094482, "rewards/margins": 10.358243942260742, "rewards/rejected": -12.174367904663086, "step": 707 }, { "epoch": 4.195555555555556, "grad_norm": 0.6811289236407966, "learning_rate": 2.704403849763878e-07, "logits/chosen": -1.4927406311035156, "logits/rejected": -1.4667131900787354, "logps/chosen": -46.89321517944336, "logps/rejected": -73.92708587646484, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -2.232743740081787, "rewards/margins": 9.66046142578125, "rewards/rejected": -11.893203735351562, "step": 708 }, { "epoch": 4.201481481481482, "grad_norm": 0.7640135263515507, "learning_rate": 2.697928654239378e-07, "logits/chosen": -1.563415288925171, "logits/rejected": -1.6332391500473022, "logps/chosen": -40.70332717895508, "logps/rejected": -60.507545471191406, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -1.8509180545806885, "rewards/margins": 8.442529678344727, "rewards/rejected": -10.293449401855469, "step": 709 }, { "epoch": 4.207407407407407, "grad_norm": 0.6091245161088805, "learning_rate": 2.6914521222560907e-07, "logits/chosen": -1.6223444938659668, "logits/rejected": -1.419754981994629, "logps/chosen": -50.94943618774414, "logps/rejected": -81.05281829833984, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.3365554809570312, "rewards/margins": 10.70995044708252, "rewards/rejected": -13.046506881713867, "step": 710 }, { "epoch": 4.213333333333333, "grad_norm": 0.49108019369046707, "learning_rate": 2.6849742975450163e-07, "logits/chosen": -1.5237915515899658, "logits/rejected": -1.3569711446762085, "logps/chosen": -46.981380462646484, "logps/rejected": -74.44270324707031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.2535314559936523, "rewards/margins": 9.97585391998291, "rewards/rejected": -11.229385375976562, "step": 711 }, { "epoch": 4.2192592592592595, "grad_norm": 0.5511110163173969, "learning_rate": 2.6784952238458824e-07, "logits/chosen": -1.4845302104949951, "logits/rejected": -1.4548472166061401, "logps/chosen": -42.92816162109375, "logps/rejected": -69.95036315917969, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.3284811973571777, "rewards/margins": 10.633824348449707, "rewards/rejected": -12.962305068969727, "step": 712 }, { "epoch": 4.225185185185185, "grad_norm": 0.6631086415739061, "learning_rate": 2.672014944906854e-07, "logits/chosen": -1.6981308460235596, "logits/rejected": -1.5001087188720703, "logps/chosen": -46.60968017578125, "logps/rejected": -88.11974334716797, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.792693853378296, "rewards/margins": 11.876981735229492, "rewards/rejected": -13.669676780700684, "step": 713 }, { "epoch": 4.231111111111111, "grad_norm": 0.23031970682456634, "learning_rate": 2.665533504484231e-07, "logits/chosen": -1.549435019493103, "logits/rejected": -1.5263104438781738, "logps/chosen": -45.03104782104492, "logps/rejected": -68.78125762939453, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.494513511657715, "rewards/margins": 10.230485916137695, "rewards/rejected": -13.72499942779541, "step": 714 }, { "epoch": 4.237037037037037, "grad_norm": 0.4011643778274213, "learning_rate": 2.6590509463421573e-07, "logits/chosen": -2.088960647583008, "logits/rejected": -1.963022232055664, "logps/chosen": -37.30885314941406, "logps/rejected": -71.3285903930664, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.178884506225586, "rewards/margins": 10.42874526977539, "rewards/rejected": -12.607630729675293, "step": 715 }, { "epoch": 4.242962962962963, "grad_norm": 0.4448671943062079, "learning_rate": 2.6525673142523217e-07, "logits/chosen": -1.758203148841858, "logits/rejected": -1.5110293626785278, "logps/chosen": -55.106895446777344, "logps/rejected": -96.56599426269531, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.7676544189453125, "rewards/margins": 13.347644805908203, "rewards/rejected": -17.115299224853516, "step": 716 }, { "epoch": 4.248888888888889, "grad_norm": 0.4172731802131882, "learning_rate": 2.646082651993668e-07, "logits/chosen": -2.040677070617676, "logits/rejected": -2.044556140899658, "logps/chosen": -45.693824768066406, "logps/rejected": -64.56766510009766, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.5376042127609253, "rewards/margins": 9.785834312438965, "rewards/rejected": -11.32343864440918, "step": 717 }, { "epoch": 4.254814814814814, "grad_norm": 0.4123890195750414, "learning_rate": 2.6395970033520944e-07, "logits/chosen": -1.487902283668518, "logits/rejected": -1.4854258298873901, "logps/chosen": -50.966705322265625, "logps/rejected": -65.53778076171875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.6284089088439941, "rewards/margins": 8.387168884277344, "rewards/rejected": -10.01557731628418, "step": 718 }, { "epoch": 4.260740740740741, "grad_norm": 1.6566065496557807, "learning_rate": 2.6331104121201575e-07, "logits/chosen": -1.9715131521224976, "logits/rejected": -1.799248456954956, "logps/chosen": -50.722068786621094, "logps/rejected": -91.66192626953125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -3.7970070838928223, "rewards/margins": 9.966672897338867, "rewards/rejected": -13.763680458068848, "step": 719 }, { "epoch": 4.266666666666667, "grad_norm": 0.39476587023081217, "learning_rate": 2.626622922096782e-07, "logits/chosen": -1.5378649234771729, "logits/rejected": -1.4630424976348877, "logps/chosen": -46.547203063964844, "logps/rejected": -80.9116439819336, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.3771755695343018, "rewards/margins": 11.267572402954102, "rewards/rejected": -13.644747734069824, "step": 720 }, { "epoch": 4.272592592592592, "grad_norm": 0.5669746583387688, "learning_rate": 2.6201345770869584e-07, "logits/chosen": -1.5094960927963257, "logits/rejected": -1.3966280221939087, "logps/chosen": -40.22157287597656, "logps/rejected": -72.3768539428711, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.9138059616088867, "rewards/margins": 11.016143798828125, "rewards/rejected": -12.929948806762695, "step": 721 }, { "epoch": 4.278518518518519, "grad_norm": 0.6436582390962847, "learning_rate": 2.6136454209014513e-07, "logits/chosen": -1.6182851791381836, "logits/rejected": -1.5757420063018799, "logps/chosen": -47.499351501464844, "logps/rejected": -72.69096374511719, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.6912693977355957, "rewards/margins": 9.962089538574219, "rewards/rejected": -12.653359413146973, "step": 722 }, { "epoch": 4.2844444444444445, "grad_norm": 0.3075696401766946, "learning_rate": 2.6071554973565036e-07, "logits/chosen": -1.0639721155166626, "logits/rejected": -1.0852502584457397, "logps/chosen": -40.48524475097656, "logps/rejected": -65.24382019042969, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.40928316116333, "rewards/margins": 10.195439338684082, "rewards/rejected": -12.604722023010254, "step": 723 }, { "epoch": 4.29037037037037, "grad_norm": 0.339129929102594, "learning_rate": 2.600664850273538e-07, "logits/chosen": -1.5270686149597168, "logits/rejected": -1.4607093334197998, "logps/chosen": -52.461341857910156, "logps/rejected": -70.19440460205078, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.4749996662139893, "rewards/margins": 9.578967094421387, "rewards/rejected": -12.053966522216797, "step": 724 }, { "epoch": 4.296296296296296, "grad_norm": 0.4827892185298997, "learning_rate": 2.594173523478864e-07, "logits/chosen": -1.8119120597839355, "logits/rejected": -1.6746340990066528, "logps/chosen": -37.49448013305664, "logps/rejected": -69.68302917480469, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -3.0536510944366455, "rewards/margins": 10.237602233886719, "rewards/rejected": -13.291254043579102, "step": 725 }, { "epoch": 4.302222222222222, "grad_norm": 0.4377884100324364, "learning_rate": 2.587681560803379e-07, "logits/chosen": -1.5027875900268555, "logits/rejected": -1.6284228563308716, "logps/chosen": -45.38482666015625, "logps/rejected": -71.05689239501953, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.0713844299316406, "rewards/margins": 10.261495590209961, "rewards/rejected": -13.332880020141602, "step": 726 }, { "epoch": 4.308148148148148, "grad_norm": 0.288174657644559, "learning_rate": 2.5811890060822754e-07, "logits/chosen": -1.5329185724258423, "logits/rejected": -1.6217219829559326, "logps/chosen": -54.94902038574219, "logps/rejected": -75.75360870361328, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.8438515663146973, "rewards/margins": 11.377456665039062, "rewards/rejected": -13.221307754516602, "step": 727 }, { "epoch": 4.314074074074074, "grad_norm": 0.3568800731107457, "learning_rate": 2.574695903154744e-07, "logits/chosen": -1.2863966226577759, "logits/rejected": -1.3943865299224854, "logps/chosen": -52.2459716796875, "logps/rejected": -68.17107391357422, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.828482151031494, "rewards/margins": 9.69110107421875, "rewards/rejected": -12.519582748413086, "step": 728 }, { "epoch": 4.32, "grad_norm": 0.49248567873103494, "learning_rate": 2.5682022958636753e-07, "logits/chosen": -1.5085232257843018, "logits/rejected": -1.2503995895385742, "logps/chosen": -38.498687744140625, "logps/rejected": -76.63363647460938, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.512385368347168, "rewards/margins": 11.470130920410156, "rewards/rejected": -13.98251724243164, "step": 729 }, { "epoch": 4.325925925925926, "grad_norm": 0.3106187542124455, "learning_rate": 2.5617082280553655e-07, "logits/chosen": -1.7640349864959717, "logits/rejected": -1.7682902812957764, "logps/chosen": -41.87824249267578, "logps/rejected": -69.6838150024414, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.774226427078247, "rewards/margins": 8.697710990905762, "rewards/rejected": -11.47193717956543, "step": 730 }, { "epoch": 4.331851851851852, "grad_norm": 0.2636700118454411, "learning_rate": 2.5552137435792215e-07, "logits/chosen": -1.506960391998291, "logits/rejected": -1.710749626159668, "logps/chosen": -52.51408004760742, "logps/rejected": -69.61265563964844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.9004751443862915, "rewards/margins": 10.869614601135254, "rewards/rejected": -12.770090103149414, "step": 731 }, { "epoch": 4.337777777777778, "grad_norm": 0.3296961452649938, "learning_rate": 2.5487188862874633e-07, "logits/chosen": -1.400599718093872, "logits/rejected": -1.525638461112976, "logps/chosen": -39.74789810180664, "logps/rejected": -74.80589294433594, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.5468039512634277, "rewards/margins": 10.360738754272461, "rewards/rejected": -12.907543182373047, "step": 732 }, { "epoch": 4.343703703703704, "grad_norm": 0.2820919892356091, "learning_rate": 2.542223700034827e-07, "logits/chosen": -1.4439010620117188, "logits/rejected": -1.4324991703033447, "logps/chosen": -35.00661087036133, "logps/rejected": -68.59481811523438, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.4738209247589111, "rewards/margins": 10.527488708496094, "rewards/rejected": -12.001309394836426, "step": 733 }, { "epoch": 4.3496296296296295, "grad_norm": 0.36821571256857155, "learning_rate": 2.535728228678273e-07, "logits/chosen": -1.9044547080993652, "logits/rejected": -1.7454307079315186, "logps/chosen": -41.30238723754883, "logps/rejected": -70.7161865234375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.1526682376861572, "rewards/margins": 9.870889663696289, "rewards/rejected": -12.023557662963867, "step": 734 }, { "epoch": 4.355555555555555, "grad_norm": 0.5309724031670829, "learning_rate": 2.529232516076684e-07, "logits/chosen": -1.8040151596069336, "logits/rejected": -1.8421554565429688, "logps/chosen": -34.45152282714844, "logps/rejected": -64.11228942871094, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.7052326202392578, "rewards/margins": 9.953506469726562, "rewards/rejected": -11.65873908996582, "step": 735 }, { "epoch": 4.361481481481482, "grad_norm": 0.5620423781992994, "learning_rate": 2.522736606090572e-07, "logits/chosen": -1.9946095943450928, "logits/rejected": -1.9713982343673706, "logps/chosen": -47.492515563964844, "logps/rejected": -74.77565002441406, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.4397852420806885, "rewards/margins": 10.341421127319336, "rewards/rejected": -12.781206130981445, "step": 736 }, { "epoch": 4.367407407407407, "grad_norm": 0.5850268258670716, "learning_rate": 2.5162405425817804e-07, "logits/chosen": -1.7595919370651245, "logits/rejected": -1.523133397102356, "logps/chosen": -38.44977569580078, "logps/rejected": -75.89402770996094, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.9519128799438477, "rewards/margins": 10.567989349365234, "rewards/rejected": -13.519901275634766, "step": 737 }, { "epoch": 4.373333333333333, "grad_norm": 0.36409290406422345, "learning_rate": 2.5097443694131944e-07, "logits/chosen": -1.910279393196106, "logits/rejected": -1.6399755477905273, "logps/chosen": -49.62364196777344, "logps/rejected": -94.20441436767578, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.5087497234344482, "rewards/margins": 13.17677116394043, "rewards/rejected": -15.68552017211914, "step": 738 }, { "epoch": 4.37925925925926, "grad_norm": 0.27280875284841666, "learning_rate": 2.503248130448434e-07, "logits/chosen": -1.222158432006836, "logits/rejected": -1.1858694553375244, "logps/chosen": -41.11689758300781, "logps/rejected": -69.91155242919922, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.33315372467041, "rewards/margins": 10.658595085144043, "rewards/rejected": -13.991748809814453, "step": 739 }, { "epoch": 4.385185185185185, "grad_norm": 0.48153543008194893, "learning_rate": 2.496751869551567e-07, "logits/chosen": -1.213226079940796, "logits/rejected": -1.2117483615875244, "logps/chosen": -53.64634704589844, "logps/rejected": -79.62733459472656, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.945598840713501, "rewards/margins": 9.169174194335938, "rewards/rejected": -12.11477279663086, "step": 740 }, { "epoch": 4.391111111111111, "grad_norm": 0.3260449818828276, "learning_rate": 2.4902556305868064e-07, "logits/chosen": -1.6482226848602295, "logits/rejected": -1.369916558265686, "logps/chosen": -49.26082992553711, "logps/rejected": -80.69975280761719, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.141822338104248, "rewards/margins": 10.516618728637695, "rewards/rejected": -14.658441543579102, "step": 741 }, { "epoch": 4.397037037037037, "grad_norm": 0.5688273128014634, "learning_rate": 2.4837594574182194e-07, "logits/chosen": -1.4720345735549927, "logits/rejected": -1.4317636489868164, "logps/chosen": -47.89663314819336, "logps/rejected": -64.76475524902344, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.222043037414551, "rewards/margins": 7.936434745788574, "rewards/rejected": -11.158477783203125, "step": 742 }, { "epoch": 4.402962962962963, "grad_norm": 0.7988128886724679, "learning_rate": 2.477263393909429e-07, "logits/chosen": -1.9475209712982178, "logits/rejected": -1.9408725500106812, "logps/chosen": -41.1652717590332, "logps/rejected": -73.0821533203125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.581312656402588, "rewards/margins": 10.089584350585938, "rewards/rejected": -12.670897483825684, "step": 743 }, { "epoch": 4.408888888888889, "grad_norm": 0.6956004735540405, "learning_rate": 2.4707674839233165e-07, "logits/chosen": -1.6389610767364502, "logits/rejected": -1.5735392570495605, "logps/chosen": -39.5618782043457, "logps/rejected": -75.9864501953125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.017786979675293, "rewards/margins": 12.282722473144531, "rewards/rejected": -15.30051040649414, "step": 744 }, { "epoch": 4.4148148148148145, "grad_norm": 0.2740055720964037, "learning_rate": 2.4642717713217266e-07, "logits/chosen": -1.5863927602767944, "logits/rejected": -1.5586270093917847, "logps/chosen": -51.11642837524414, "logps/rejected": -78.3765869140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.301530361175537, "rewards/margins": 11.2866792678833, "rewards/rejected": -15.588210105895996, "step": 745 }, { "epoch": 4.420740740740741, "grad_norm": 0.5552007613739203, "learning_rate": 2.4577762999651727e-07, "logits/chosen": -1.665607213973999, "logits/rejected": -1.6078526973724365, "logps/chosen": -40.21504211425781, "logps/rejected": -80.14183807373047, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.7667739391326904, "rewards/margins": 10.827664375305176, "rewards/rejected": -13.594437599182129, "step": 746 }, { "epoch": 4.426666666666667, "grad_norm": 0.32790522815064826, "learning_rate": 2.451281113712537e-07, "logits/chosen": -1.6151583194732666, "logits/rejected": -1.5450541973114014, "logps/chosen": -37.39631271362305, "logps/rejected": -68.37910461425781, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.6576449871063232, "rewards/margins": 9.498502731323242, "rewards/rejected": -12.156147003173828, "step": 747 }, { "epoch": 4.432592592592592, "grad_norm": 0.3278713822461682, "learning_rate": 2.4447862564207783e-07, "logits/chosen": -1.6984974145889282, "logits/rejected": -1.4929834604263306, "logps/chosen": -41.737998962402344, "logps/rejected": -82.5109634399414, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.1655399799346924, "rewards/margins": 11.867754936218262, "rewards/rejected": -14.033292770385742, "step": 748 }, { "epoch": 4.438518518518519, "grad_norm": 0.2728427147646305, "learning_rate": 2.438291771944635e-07, "logits/chosen": -1.3253810405731201, "logits/rejected": -1.304832935333252, "logps/chosen": -32.45439529418945, "logps/rejected": -64.35618591308594, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.709897518157959, "rewards/margins": 9.905057907104492, "rewards/rejected": -11.61495590209961, "step": 749 }, { "epoch": 4.444444444444445, "grad_norm": 0.2992721613416659, "learning_rate": 2.431797704136325e-07, "logits/chosen": -1.3873847723007202, "logits/rejected": -1.1295723915100098, "logps/chosen": -37.542747497558594, "logps/rejected": -80.18496704101562, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.6178274154663086, "rewards/margins": 10.947952270507812, "rewards/rejected": -13.565778732299805, "step": 750 }, { "epoch": 4.45037037037037, "grad_norm": 0.23217219343574805, "learning_rate": 2.425304096845256e-07, "logits/chosen": -2.1051247119903564, "logits/rejected": -2.2890005111694336, "logps/chosen": -71.4691162109375, "logps/rejected": -85.68158721923828, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.611448287963867, "rewards/margins": 10.625432968139648, "rewards/rejected": -15.236881256103516, "step": 751 }, { "epoch": 4.456296296296296, "grad_norm": 0.31223869571923357, "learning_rate": 2.4188109939177244e-07, "logits/chosen": -1.3819341659545898, "logits/rejected": -1.1680090427398682, "logps/chosen": -46.19057846069336, "logps/rejected": -78.53694152832031, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.2963953018188477, "rewards/margins": 10.76588249206543, "rewards/rejected": -13.062276840209961, "step": 752 }, { "epoch": 4.4622222222222225, "grad_norm": 0.5598617582374887, "learning_rate": 2.412318439196621e-07, "logits/chosen": -1.2198549509048462, "logits/rejected": -1.4387001991271973, "logps/chosen": -49.030216217041016, "logps/rejected": -57.43315124511719, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.343151092529297, "rewards/margins": 8.80035400390625, "rewards/rejected": -11.143505096435547, "step": 753 }, { "epoch": 4.468148148148148, "grad_norm": 0.388327433841169, "learning_rate": 2.405826476521137e-07, "logits/chosen": -1.7963589429855347, "logits/rejected": -1.6334545612335205, "logps/chosen": -42.468994140625, "logps/rejected": -81.61261749267578, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.233997106552124, "rewards/margins": 11.50871467590332, "rewards/rejected": -14.742712020874023, "step": 754 }, { "epoch": 4.474074074074074, "grad_norm": 0.36873722546852566, "learning_rate": 2.399335149726463e-07, "logits/chosen": -1.864285945892334, "logits/rejected": -1.8117425441741943, "logps/chosen": -43.57659149169922, "logps/rejected": -61.182159423828125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.8054585456848145, "rewards/margins": 8.73664665222168, "rewards/rejected": -11.542104721069336, "step": 755 }, { "epoch": 4.48, "grad_norm": 0.9656825907418681, "learning_rate": 2.392844502643497e-07, "logits/chosen": -1.5215977430343628, "logits/rejected": -1.389098882675171, "logps/chosen": -42.34136199951172, "logps/rejected": -69.96714782714844, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.9610204696655273, "rewards/margins": 10.299530982971191, "rewards/rejected": -12.260551452636719, "step": 756 }, { "epoch": 4.485925925925926, "grad_norm": 0.25342334021643204, "learning_rate": 2.3863545790985485e-07, "logits/chosen": -1.7879868745803833, "logits/rejected": -1.884661078453064, "logps/chosen": -49.11554718017578, "logps/rejected": -73.60369110107422, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.804800271987915, "rewards/margins": 10.777206420898438, "rewards/rejected": -13.582006454467773, "step": 757 }, { "epoch": 4.491851851851852, "grad_norm": 0.4088130548067966, "learning_rate": 2.379865422913042e-07, "logits/chosen": -1.6179802417755127, "logits/rejected": -1.5631730556488037, "logps/chosen": -38.03044509887695, "logps/rejected": -74.8770980834961, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.0774755477905273, "rewards/margins": 10.472331047058105, "rewards/rejected": -13.549808502197266, "step": 758 }, { "epoch": 4.497777777777777, "grad_norm": 0.24621046378728606, "learning_rate": 2.3733770779032184e-07, "logits/chosen": -1.0918471813201904, "logits/rejected": -1.4053854942321777, "logps/chosen": -44.596763610839844, "logps/rejected": -68.87012481689453, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.8497509956359863, "rewards/margins": 11.430082321166992, "rewards/rejected": -13.27983283996582, "step": 759 }, { "epoch": 4.503703703703704, "grad_norm": 0.4039328343347822, "learning_rate": 2.3668895878798423e-07, "logits/chosen": -1.666556477546692, "logits/rejected": -1.495963454246521, "logps/chosen": -35.45246505737305, "logps/rejected": -56.131202697753906, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.8134236335754395, "rewards/margins": 7.402918338775635, "rewards/rejected": -8.216341972351074, "step": 760 }, { "epoch": 4.50962962962963, "grad_norm": 0.6762734783611634, "learning_rate": 2.360402996647906e-07, "logits/chosen": -1.7931262254714966, "logits/rejected": -1.6433568000793457, "logps/chosen": -49.27035903930664, "logps/rejected": -93.5889892578125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.803044080734253, "rewards/margins": 13.115455627441406, "rewards/rejected": -16.918498992919922, "step": 761 }, { "epoch": 4.515555555555555, "grad_norm": 0.6202135342855698, "learning_rate": 2.3539173480063318e-07, "logits/chosen": -1.5122251510620117, "logits/rejected": -1.5641227960586548, "logps/chosen": -51.571006774902344, "logps/rejected": -71.65675354003906, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.491517066955566, "rewards/margins": 8.394001007080078, "rewards/rejected": -12.885517120361328, "step": 762 }, { "epoch": 4.521481481481482, "grad_norm": 0.5983718330378733, "learning_rate": 2.3474326857476783e-07, "logits/chosen": -1.9533593654632568, "logits/rejected": -1.8577436208724976, "logps/chosen": -40.418888092041016, "logps/rejected": -72.35916137695312, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.6917667388916016, "rewards/margins": 11.463409423828125, "rewards/rejected": -14.155176162719727, "step": 763 }, { "epoch": 4.5274074074074075, "grad_norm": 0.5523399287602111, "learning_rate": 2.340949053657843e-07, "logits/chosen": -1.3917421102523804, "logits/rejected": -1.4328992366790771, "logps/chosen": -53.25602722167969, "logps/rejected": -73.91915893554688, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.373779296875, "rewards/margins": 10.894166946411133, "rewards/rejected": -13.267946243286133, "step": 764 }, { "epoch": 4.533333333333333, "grad_norm": 0.2775376512794254, "learning_rate": 2.3344664955157685e-07, "logits/chosen": -1.2376761436462402, "logits/rejected": -1.156088948249817, "logps/chosen": -31.447860717773438, "logps/rejected": -66.50593566894531, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.0162906646728516, "rewards/margins": 10.467144012451172, "rewards/rejected": -12.483434677124023, "step": 765 }, { "epoch": 4.539259259259259, "grad_norm": 0.37613779762460614, "learning_rate": 2.3279850550931458e-07, "logits/chosen": -1.581992745399475, "logits/rejected": -1.3465206623077393, "logps/chosen": -45.77264404296875, "logps/rejected": -78.6106185913086, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.8553242683410645, "rewards/margins": 10.771078109741211, "rewards/rejected": -13.626401901245117, "step": 766 }, { "epoch": 4.545185185185185, "grad_norm": 0.26554629446762146, "learning_rate": 2.3215047761541172e-07, "logits/chosen": -1.3734705448150635, "logits/rejected": -1.1478571891784668, "logps/chosen": -31.20534896850586, "logps/rejected": -67.67927551269531, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.9986950159072876, "rewards/margins": 10.351818084716797, "rewards/rejected": -11.350513458251953, "step": 767 }, { "epoch": 4.551111111111111, "grad_norm": 0.4328268508688234, "learning_rate": 2.3150257024549845e-07, "logits/chosen": -1.4306426048278809, "logits/rejected": -1.2422418594360352, "logps/chosen": -35.660118103027344, "logps/rejected": -66.81575775146484, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.6081128120422363, "rewards/margins": 11.39274787902832, "rewards/rejected": -14.000861167907715, "step": 768 }, { "epoch": 4.557037037037037, "grad_norm": 0.5336695898129881, "learning_rate": 2.3085478777439096e-07, "logits/chosen": -1.548780083656311, "logits/rejected": -1.5772260427474976, "logps/chosen": -42.280174255371094, "logps/rejected": -63.68194580078125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.8020665645599365, "rewards/margins": 9.052971839904785, "rewards/rejected": -11.855037689208984, "step": 769 }, { "epoch": 4.562962962962963, "grad_norm": 0.22474609924604425, "learning_rate": 2.302071345760622e-07, "logits/chosen": -1.84547758102417, "logits/rejected": -1.9938280582427979, "logps/chosen": -60.60712432861328, "logps/rejected": -72.54964447021484, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.1693592071533203, "rewards/margins": 10.86662483215332, "rewards/rejected": -14.03598403930664, "step": 770 }, { "epoch": 4.568888888888889, "grad_norm": 1.0805753910268383, "learning_rate": 2.2955961502361232e-07, "logits/chosen": -1.837569236755371, "logits/rejected": -1.8610866069793701, "logps/chosen": -45.81024932861328, "logps/rejected": -63.923728942871094, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.9928364753723145, "rewards/margins": 9.243518829345703, "rewards/rejected": -11.23635482788086, "step": 771 }, { "epoch": 4.574814814814815, "grad_norm": 0.2815304479426961, "learning_rate": 2.2891223348923882e-07, "logits/chosen": -1.499778389930725, "logits/rejected": -1.4236795902252197, "logps/chosen": -54.17256164550781, "logps/rejected": -88.95265197753906, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.209611654281616, "rewards/margins": 12.507816314697266, "rewards/rejected": -15.717428207397461, "step": 772 }, { "epoch": 4.58074074074074, "grad_norm": 0.28313198718639915, "learning_rate": 2.2826499434420745e-07, "logits/chosen": -1.4156975746154785, "logits/rejected": -1.4123296737670898, "logps/chosen": -41.201629638671875, "logps/rejected": -69.11726379394531, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.8570693731307983, "rewards/margins": 10.749539375305176, "rewards/rejected": -12.606608390808105, "step": 773 }, { "epoch": 4.586666666666667, "grad_norm": 0.3125143240803341, "learning_rate": 2.2761790195882261e-07, "logits/chosen": -1.5952832698822021, "logits/rejected": -1.4846335649490356, "logps/chosen": -37.14253234863281, "logps/rejected": -74.46310424804688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.426466703414917, "rewards/margins": 11.401270866394043, "rewards/rejected": -12.827737808227539, "step": 774 }, { "epoch": 4.592592592592593, "grad_norm": 0.6926539611379354, "learning_rate": 2.2697096070239748e-07, "logits/chosen": -1.636855125427246, "logits/rejected": -1.7660517692565918, "logps/chosen": -64.5381851196289, "logps/rejected": -70.02143859863281, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.0808470249176025, "rewards/margins": 8.71413516998291, "rewards/rejected": -11.794981956481934, "step": 775 }, { "epoch": 4.598518518518518, "grad_norm": 0.41730602281644963, "learning_rate": 2.2632417494322503e-07, "logits/chosen": -1.5239399671554565, "logits/rejected": -1.5000847578048706, "logps/chosen": -50.480892181396484, "logps/rejected": -75.76677703857422, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.8016414642333984, "rewards/margins": 11.542520523071289, "rewards/rejected": -14.344161987304688, "step": 776 }, { "epoch": 4.604444444444445, "grad_norm": 0.6117954122512027, "learning_rate": 2.2567754904854809e-07, "logits/chosen": -1.8561725616455078, "logits/rejected": -1.8074907064437866, "logps/chosen": -47.85967254638672, "logps/rejected": -78.99420928955078, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.509251356124878, "rewards/margins": 11.366935729980469, "rewards/rejected": -13.876188278198242, "step": 777 }, { "epoch": 4.6103703703703705, "grad_norm": 0.1755774879417684, "learning_rate": 2.2503108738453014e-07, "logits/chosen": -1.6200981140136719, "logits/rejected": -1.6129133701324463, "logps/chosen": -41.06743240356445, "logps/rejected": -74.25436401367188, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.5940749645233154, "rewards/margins": 10.168746948242188, "rewards/rejected": -12.762822151184082, "step": 778 }, { "epoch": 4.616296296296296, "grad_norm": 0.349414448813446, "learning_rate": 2.243847943162257e-07, "logits/chosen": -1.9405025243759155, "logits/rejected": -1.9417346715927124, "logps/chosen": -52.690181732177734, "logps/rejected": -65.54427337646484, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.6885979175567627, "rewards/margins": 9.365793228149414, "rewards/rejected": -12.054390907287598, "step": 779 }, { "epoch": 4.622222222222222, "grad_norm": 0.20242192540609477, "learning_rate": 2.23738674207551e-07, "logits/chosen": -1.613708734512329, "logits/rejected": -1.436056137084961, "logps/chosen": -35.43689727783203, "logps/rejected": -83.06486511230469, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.6374375820159912, "rewards/margins": 13.647027015686035, "rewards/rejected": -14.284463882446289, "step": 780 }, { "epoch": 4.628148148148148, "grad_norm": 0.5765752927321536, "learning_rate": 2.230927314212543e-07, "logits/chosen": -1.494924783706665, "logits/rejected": -1.4253530502319336, "logps/chosen": -45.896095275878906, "logps/rejected": -68.88455200195312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.2405786514282227, "rewards/margins": 9.971403121948242, "rewards/rejected": -13.211981773376465, "step": 781 }, { "epoch": 4.634074074074074, "grad_norm": 0.281505079478091, "learning_rate": 2.2244697031888655e-07, "logits/chosen": -1.8622199296951294, "logits/rejected": -1.7759878635406494, "logps/chosen": -47.928443908691406, "logps/rejected": -77.3371353149414, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.063960075378418, "rewards/margins": 9.88760757446289, "rewards/rejected": -12.951568603515625, "step": 782 }, { "epoch": 4.64, "grad_norm": 0.8954387493569548, "learning_rate": 2.21801395260772e-07, "logits/chosen": -1.552648901939392, "logits/rejected": -1.3441781997680664, "logps/chosen": -41.77521514892578, "logps/rejected": -80.31526184082031, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.018218517303467, "rewards/margins": 13.303080558776855, "rewards/rejected": -16.321298599243164, "step": 783 }, { "epoch": 4.645925925925926, "grad_norm": 0.3599276511446899, "learning_rate": 2.2115601060597852e-07, "logits/chosen": -2.0601389408111572, "logits/rejected": -2.110264301300049, "logps/chosen": -49.506591796875, "logps/rejected": -65.16485595703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.511070489883423, "rewards/margins": 9.494369506835938, "rewards/rejected": -12.005439758300781, "step": 784 }, { "epoch": 4.651851851851852, "grad_norm": 0.2771432486865571, "learning_rate": 2.2051082071228852e-07, "logits/chosen": -2.0726568698883057, "logits/rejected": -1.9759494066238403, "logps/chosen": -42.74811553955078, "logps/rejected": -66.74032592773438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.455653190612793, "rewards/margins": 9.912121772766113, "rewards/rejected": -12.367774963378906, "step": 785 }, { "epoch": 4.657777777777778, "grad_norm": 0.33053314179972837, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -1.732776165008545, "logits/rejected": -1.5467349290847778, "logps/chosen": -39.45323181152344, "logps/rejected": -81.57275390625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.0747121572494507, "rewards/margins": 13.912720680236816, "rewards/rejected": -14.987432479858398, "step": 786 }, { "epoch": 4.663703703703703, "grad_norm": 0.6345719417992604, "learning_rate": 2.192210426327435e-07, "logits/chosen": -1.784833312034607, "logits/rejected": -1.6461296081542969, "logps/chosen": -43.437374114990234, "logps/rejected": -69.29059600830078, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.3814399242401123, "rewards/margins": 10.256202697753906, "rewards/rejected": -11.637643814086914, "step": 787 }, { "epoch": 4.66962962962963, "grad_norm": 0.4536643965652212, "learning_rate": 2.185764631557602e-07, "logits/chosen": -1.6088899374008179, "logits/rejected": -1.374220371246338, "logps/chosen": -42.108734130859375, "logps/rejected": -78.39088439941406, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.488034725189209, "rewards/margins": 9.032186508178711, "rewards/rejected": -11.520221710205078, "step": 788 }, { "epoch": 4.6755555555555555, "grad_norm": 0.24775139302990495, "learning_rate": 2.1793209585756482e-07, "logits/chosen": -1.583560585975647, "logits/rejected": -1.6787372827529907, "logps/chosen": -75.60416412353516, "logps/rejected": -101.32034301757812, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.8459086418151855, "rewards/margins": 14.32331371307373, "rewards/rejected": -19.16922378540039, "step": 789 }, { "epoch": 4.681481481481481, "grad_norm": 0.352054728762041, "learning_rate": 2.1728794508907038e-07, "logits/chosen": -2.015982151031494, "logits/rejected": -1.5659945011138916, "logps/chosen": -44.24329376220703, "logps/rejected": -109.96054077148438, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7237515449523926, "rewards/margins": 13.699109077453613, "rewards/rejected": -16.42285919189453, "step": 790 }, { "epoch": 4.687407407407408, "grad_norm": 0.29454959273525827, "learning_rate": 2.1664401519972774e-07, "logits/chosen": -1.7378650903701782, "logits/rejected": -1.8261386156082153, "logps/chosen": -55.897666931152344, "logps/rejected": -81.62759399414062, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.7774252891540527, "rewards/margins": 12.035506248474121, "rewards/rejected": -15.812932014465332, "step": 791 }, { "epoch": 4.693333333333333, "grad_norm": 0.2714000933964117, "learning_rate": 2.1600031053749634e-07, "logits/chosen": -1.463356614112854, "logits/rejected": -1.3946796655654907, "logps/chosen": -50.632022857666016, "logps/rejected": -71.64886474609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.7824299335479736, "rewards/margins": 8.319438934326172, "rewards/rejected": -11.101869583129883, "step": 792 }, { "epoch": 4.699259259259259, "grad_norm": 0.3956245737008001, "learning_rate": 2.1535683544881478e-07, "logits/chosen": -1.6728391647338867, "logits/rejected": -1.6265099048614502, "logps/chosen": -38.276920318603516, "logps/rejected": -65.04293823242188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.1510419845581055, "rewards/margins": 10.412332534790039, "rewards/rejected": -12.563374519348145, "step": 793 }, { "epoch": 4.705185185185185, "grad_norm": 0.34770270483367516, "learning_rate": 2.147135942785716e-07, "logits/chosen": -1.6922452449798584, "logits/rejected": -1.6048784255981445, "logps/chosen": -38.79063415527344, "logps/rejected": -68.69209289550781, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.9936587810516357, "rewards/margins": 9.991509437561035, "rewards/rejected": -11.98516845703125, "step": 794 }, { "epoch": 4.711111111111111, "grad_norm": 0.6662575958266204, "learning_rate": 2.1407059137007583e-07, "logits/chosen": -1.9861844778060913, "logits/rejected": -2.161007881164551, "logps/chosen": -49.49615478515625, "logps/rejected": -65.79344177246094, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.8835437297821045, "rewards/margins": 10.038654327392578, "rewards/rejected": -11.922198295593262, "step": 795 }, { "epoch": 4.717037037037037, "grad_norm": 0.34706493537306654, "learning_rate": 2.1342783106502777e-07, "logits/chosen": -1.9054166078567505, "logits/rejected": -1.8093022108078003, "logps/chosen": -42.534156799316406, "logps/rejected": -81.73444366455078, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.304574966430664, "rewards/margins": 11.91904354095459, "rewards/rejected": -13.22361946105957, "step": 796 }, { "epoch": 4.722962962962963, "grad_norm": 0.28784218622198576, "learning_rate": 2.1278531770348963e-07, "logits/chosen": -1.6888959407806396, "logits/rejected": -1.77483069896698, "logps/chosen": -51.807090759277344, "logps/rejected": -78.14078521728516, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.369813919067383, "rewards/margins": 11.10791301727295, "rewards/rejected": -13.477725982666016, "step": 797 }, { "epoch": 4.728888888888889, "grad_norm": 0.38336438976817927, "learning_rate": 2.121430556238559e-07, "logits/chosen": -1.9089480638504028, "logits/rejected": -1.8946913480758667, "logps/chosen": -38.47759246826172, "logps/rejected": -68.7850570678711, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.1309518814086914, "rewards/margins": 12.784750938415527, "rewards/rejected": -14.915702819824219, "step": 798 }, { "epoch": 4.734814814814815, "grad_norm": 0.2868183618195756, "learning_rate": 2.115010491628247e-07, "logits/chosen": -1.696946382522583, "logits/rejected": -1.6048494577407837, "logps/chosen": -34.717411041259766, "logps/rejected": -65.3773422241211, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.898589849472046, "rewards/margins": 9.88906478881836, "rewards/rejected": -11.787654876708984, "step": 799 }, { "epoch": 4.7407407407407405, "grad_norm": 0.22472378862968692, "learning_rate": 2.1085930265536808e-07, "logits/chosen": -2.0204107761383057, "logits/rejected": -1.864630937576294, "logps/chosen": -34.87665939331055, "logps/rejected": -66.435791015625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.800518274307251, "rewards/margins": 9.670722961425781, "rewards/rejected": -12.471240043640137, "step": 800 }, { "epoch": 4.746666666666667, "grad_norm": 0.6742747101060639, "learning_rate": 2.1021782043470278e-07, "logits/chosen": -1.3186049461364746, "logits/rejected": -1.3445327281951904, "logps/chosen": -54.38391876220703, "logps/rejected": -77.40103149414062, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.349400281906128, "rewards/margins": 11.073860168457031, "rewards/rejected": -14.423260688781738, "step": 801 }, { "epoch": 4.752592592592593, "grad_norm": 0.3771344334707704, "learning_rate": 2.0957660683226103e-07, "logits/chosen": -1.71254301071167, "logits/rejected": -1.5470032691955566, "logps/chosen": -37.44327163696289, "logps/rejected": -69.07460021972656, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.4861843585968018, "rewards/margins": 11.189270973205566, "rewards/rejected": -12.675455093383789, "step": 802 }, { "epoch": 4.758518518518518, "grad_norm": 0.6275388078504144, "learning_rate": 2.0893566617766126e-07, "logits/chosen": -1.789711833000183, "logits/rejected": -1.892749309539795, "logps/chosen": -49.782630920410156, "logps/rejected": -66.36517333984375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.7289974689483643, "rewards/margins": 9.127378463745117, "rewards/rejected": -10.856375694274902, "step": 803 }, { "epoch": 4.764444444444445, "grad_norm": 0.47449108482322694, "learning_rate": 2.0829500279867891e-07, "logits/chosen": -1.871337652206421, "logits/rejected": -1.7207472324371338, "logps/chosen": -28.964725494384766, "logps/rejected": -69.4187240600586, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.8920726776123047, "rewards/margins": 11.862289428710938, "rewards/rejected": -12.754362106323242, "step": 804 }, { "epoch": 4.770370370370371, "grad_norm": 0.3037246861891076, "learning_rate": 2.0765462102121719e-07, "logits/chosen": -1.5920605659484863, "logits/rejected": -1.5429118871688843, "logps/chosen": -36.87894058227539, "logps/rejected": -65.16922760009766, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.5912890434265137, "rewards/margins": 10.682775497436523, "rewards/rejected": -13.274065017700195, "step": 805 }, { "epoch": 4.776296296296296, "grad_norm": 0.3152244031437843, "learning_rate": 2.0701452516927797e-07, "logits/chosen": -1.61420738697052, "logits/rejected": -1.4144175052642822, "logps/chosen": -48.10711669921875, "logps/rejected": -81.6900634765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.1305041313171387, "rewards/margins": 11.635595321655273, "rewards/rejected": -14.766098022460938, "step": 806 }, { "epoch": 4.782222222222222, "grad_norm": 0.6160323095063934, "learning_rate": 2.0637471956493234e-07, "logits/chosen": -1.8907852172851562, "logits/rejected": -1.6985530853271484, "logps/chosen": -38.62782287597656, "logps/rejected": -78.48397064208984, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.042975425720215, "rewards/margins": 11.772692680358887, "rewards/rejected": -14.815667152404785, "step": 807 }, { "epoch": 4.7881481481481485, "grad_norm": 0.477923686133842, "learning_rate": 2.0573520852829164e-07, "logits/chosen": -2.0220890045166016, "logits/rejected": -2.0097062587738037, "logps/chosen": -33.547210693359375, "logps/rejected": -61.80625915527344, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.2768610715866089, "rewards/margins": 11.213603973388672, "rewards/rejected": -12.49046516418457, "step": 808 }, { "epoch": 4.794074074074074, "grad_norm": 0.27178947050649066, "learning_rate": 2.0509599637747818e-07, "logits/chosen": -1.4557392597198486, "logits/rejected": -1.5568785667419434, "logps/chosen": -43.41080856323242, "logps/rejected": -74.368408203125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.034477710723877, "rewards/margins": 12.510175704956055, "rewards/rejected": -16.544654846191406, "step": 809 }, { "epoch": 4.8, "grad_norm": 0.3937548138391649, "learning_rate": 2.0445708742859625e-07, "logits/chosen": -1.3229466676712036, "logits/rejected": -1.1964821815490723, "logps/chosen": -51.84530258178711, "logps/rejected": -77.97311401367188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.1834235191345215, "rewards/margins": 10.095224380493164, "rewards/rejected": -13.278647422790527, "step": 810 }, { "epoch": 4.805925925925926, "grad_norm": 0.3519512729455183, "learning_rate": 2.0381848599570273e-07, "logits/chosen": -1.6972732543945312, "logits/rejected": -1.6273045539855957, "logps/chosen": -32.62199020385742, "logps/rejected": -56.509918212890625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.6276986002922058, "rewards/margins": 9.845847129821777, "rewards/rejected": -10.473546028137207, "step": 811 }, { "epoch": 4.811851851851852, "grad_norm": 0.21318796805172743, "learning_rate": 2.0318019639077803e-07, "logits/chosen": -1.5077314376831055, "logits/rejected": -1.302263855934143, "logps/chosen": -40.9090461730957, "logps/rejected": -80.91336059570312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.8164610862731934, "rewards/margins": 11.253703117370605, "rewards/rejected": -14.07016372680664, "step": 812 }, { "epoch": 4.817777777777778, "grad_norm": 0.3855192253106906, "learning_rate": 2.0254222292369724e-07, "logits/chosen": -1.6157485246658325, "logits/rejected": -1.407730221748352, "logps/chosen": -38.98833465576172, "logps/rejected": -75.60125732421875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.4220004081726074, "rewards/margins": 9.778350830078125, "rewards/rejected": -13.20035171508789, "step": 813 }, { "epoch": 4.823703703703703, "grad_norm": 0.7157114320343476, "learning_rate": 2.0190456990220055e-07, "logits/chosen": -1.4571491479873657, "logits/rejected": -1.4702848196029663, "logps/chosen": -42.851844787597656, "logps/rejected": -77.12294006347656, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.4410855770111084, "rewards/margins": 10.907438278198242, "rewards/rejected": -13.34852409362793, "step": 814 }, { "epoch": 4.82962962962963, "grad_norm": 0.5514766898574819, "learning_rate": 2.0126724163186474e-07, "logits/chosen": -1.4762797355651855, "logits/rejected": -1.388708233833313, "logps/chosen": -47.19092559814453, "logps/rejected": -65.01657104492188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.214632034301758, "rewards/margins": 9.465707778930664, "rewards/rejected": -12.680340766906738, "step": 815 }, { "epoch": 4.835555555555556, "grad_norm": 0.3002635941047584, "learning_rate": 2.006302424160735e-07, "logits/chosen": -1.601129174232483, "logits/rejected": -1.5539610385894775, "logps/chosen": -38.64893341064453, "logps/rejected": -69.77659606933594, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.940110445022583, "rewards/margins": 12.507593154907227, "rewards/rejected": -14.44770336151123, "step": 816 }, { "epoch": 4.841481481481481, "grad_norm": 0.4205605025312493, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -1.8134194612503052, "logits/rejected": -1.7884035110473633, "logps/chosen": -53.481422424316406, "logps/rejected": -75.1780776977539, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.23048996925354, "rewards/margins": 9.090320587158203, "rewards/rejected": -12.320810317993164, "step": 817 }, { "epoch": 4.847407407407408, "grad_norm": 0.2761329051467174, "learning_rate": 1.9935724835052196e-07, "logits/chosen": -1.1999740600585938, "logits/rejected": -1.2345824241638184, "logps/chosen": -57.651649475097656, "logps/rejected": -86.1004867553711, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.300785541534424, "rewards/margins": 11.698287010192871, "rewards/rejected": -15.99907112121582, "step": 818 }, { "epoch": 4.8533333333333335, "grad_norm": 0.6343615960273807, "learning_rate": 1.987212620963038e-07, "logits/chosen": -1.923182725906372, "logits/rejected": -1.9407228231430054, "logps/chosen": -51.311744689941406, "logps/rejected": -82.07662200927734, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.584122657775879, "rewards/margins": 10.869361877441406, "rewards/rejected": -13.453484535217285, "step": 819 }, { "epoch": 4.859259259259259, "grad_norm": 0.2782877911411568, "learning_rate": 1.9808562208765663e-07, "logits/chosen": -1.5519965887069702, "logits/rejected": -1.4769684076309204, "logps/chosen": -36.23551559448242, "logps/rejected": -74.62303924560547, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.2322258949279785, "rewards/margins": 11.663935661315918, "rewards/rejected": -12.896162033081055, "step": 820 }, { "epoch": 4.865185185185185, "grad_norm": 0.21964861465933386, "learning_rate": 1.9745033261656486e-07, "logits/chosen": -1.6528503894805908, "logits/rejected": -1.5674102306365967, "logps/chosen": -47.28667068481445, "logps/rejected": -75.80752563476562, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.3995161056518555, "rewards/margins": 9.84759521484375, "rewards/rejected": -12.247111320495605, "step": 821 }, { "epoch": 4.871111111111111, "grad_norm": 0.15734208549409717, "learning_rate": 1.9681539797264578e-07, "logits/chosen": -1.5805985927581787, "logits/rejected": -1.45188307762146, "logps/chosen": -57.12106704711914, "logps/rejected": -85.31029510498047, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.592021942138672, "rewards/margins": 10.575047492980957, "rewards/rejected": -15.167069435119629, "step": 822 }, { "epoch": 4.877037037037037, "grad_norm": 0.6675396598727341, "learning_rate": 1.96180822443121e-07, "logits/chosen": -1.1367294788360596, "logits/rejected": -1.2640800476074219, "logps/chosen": -44.80545425415039, "logps/rejected": -70.67766571044922, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.2347452640533447, "rewards/margins": 11.651213645935059, "rewards/rejected": -12.885958671569824, "step": 823 }, { "epoch": 4.882962962962963, "grad_norm": 0.27873470720660054, "learning_rate": 1.955466103127871e-07, "logits/chosen": -1.770205020904541, "logits/rejected": -1.7015005350112915, "logps/chosen": -34.791748046875, "logps/rejected": -72.49945068359375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.5266883373260498, "rewards/margins": 12.88920783996582, "rewards/rejected": -14.415897369384766, "step": 824 }, { "epoch": 4.888888888888889, "grad_norm": 0.3097771173621495, "learning_rate": 1.9491276586398715e-07, "logits/chosen": -1.861426591873169, "logits/rejected": -1.5631628036499023, "logps/chosen": -33.725547790527344, "logps/rejected": -79.86085510253906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.760953903198242, "rewards/margins": 14.066446304321289, "rewards/rejected": -16.82740020751953, "step": 825 }, { "epoch": 4.894814814814815, "grad_norm": 1.2522381877844386, "learning_rate": 1.9427929337658126e-07, "logits/chosen": -1.6294392347335815, "logits/rejected": -1.5337265729904175, "logps/chosen": -37.48204040527344, "logps/rejected": -69.2812271118164, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.072634220123291, "rewards/margins": 12.420723915100098, "rewards/rejected": -14.493358612060547, "step": 826 }, { "epoch": 4.900740740740741, "grad_norm": 0.20546866017138665, "learning_rate": 1.9364619712791819e-07, "logits/chosen": -1.5217711925506592, "logits/rejected": -1.5121992826461792, "logps/chosen": -44.170570373535156, "logps/rejected": -73.65629577636719, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.5526375770568848, "rewards/margins": 11.5411376953125, "rewards/rejected": -14.093774795532227, "step": 827 }, { "epoch": 4.906666666666666, "grad_norm": 0.23822531337396938, "learning_rate": 1.9301348139280627e-07, "logits/chosen": -1.5563844442367554, "logits/rejected": -1.7817423343658447, "logps/chosen": -56.81804656982422, "logps/rejected": -68.70649719238281, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.0236644744873047, "rewards/margins": 8.670280456542969, "rewards/rejected": -11.69394588470459, "step": 828 }, { "epoch": 4.912592592592593, "grad_norm": 0.21821591547354985, "learning_rate": 1.9238115044348434e-07, "logits/chosen": -1.8185888528823853, "logits/rejected": -1.876836895942688, "logps/chosen": -58.44865417480469, "logps/rejected": -81.23777770996094, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.8119595050811768, "rewards/margins": 10.934724807739258, "rewards/rejected": -13.746683120727539, "step": 829 }, { "epoch": 4.9185185185185185, "grad_norm": 0.2266270583534369, "learning_rate": 1.9174920854959322e-07, "logits/chosen": -1.6655666828155518, "logits/rejected": -1.4781509637832642, "logps/chosen": -26.67877197265625, "logps/rejected": -62.411434173583984, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.696338176727295, "rewards/margins": 12.304670333862305, "rewards/rejected": -14.001008033752441, "step": 830 }, { "epoch": 4.924444444444444, "grad_norm": 0.2918176447105253, "learning_rate": 1.9111765997814678e-07, "logits/chosen": -1.8377158641815186, "logits/rejected": -1.787956953048706, "logps/chosen": -38.94948196411133, "logps/rejected": -67.58212280273438, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.692730188369751, "rewards/margins": 10.632195472717285, "rewards/rejected": -12.324925422668457, "step": 831 }, { "epoch": 4.930370370370371, "grad_norm": 0.26842566124743916, "learning_rate": 1.904865089935029e-07, "logits/chosen": -1.4635124206542969, "logits/rejected": -1.3722989559173584, "logps/chosen": -38.668697357177734, "logps/rejected": -71.26251983642578, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.14703106880188, "rewards/margins": 10.690751075744629, "rewards/rejected": -12.837782859802246, "step": 832 }, { "epoch": 4.936296296296296, "grad_norm": 0.36609333996244137, "learning_rate": 1.8985575985733507e-07, "logits/chosen": -1.3907794952392578, "logits/rejected": -1.4159953594207764, "logps/chosen": -46.419254302978516, "logps/rejected": -76.58683776855469, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.326676607131958, "rewards/margins": 9.860599517822266, "rewards/rejected": -13.187276840209961, "step": 833 }, { "epoch": 4.942222222222222, "grad_norm": 0.4311886487916821, "learning_rate": 1.8922541682860326e-07, "logits/chosen": -1.259995460510254, "logits/rejected": -1.2965853214263916, "logps/chosen": -30.577587127685547, "logps/rejected": -63.5474739074707, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.6297186613082886, "rewards/margins": 12.237893104553223, "rewards/rejected": -12.867610931396484, "step": 834 }, { "epoch": 4.948148148148148, "grad_norm": 0.7173217469645337, "learning_rate": 1.8859548416352536e-07, "logits/chosen": -1.2326655387878418, "logits/rejected": -1.1718957424163818, "logps/chosen": -38.014225006103516, "logps/rejected": -71.79017639160156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.3967584371566772, "rewards/margins": 11.66148567199707, "rewards/rejected": -13.058243751525879, "step": 835 }, { "epoch": 4.954074074074074, "grad_norm": 0.25516273124859323, "learning_rate": 1.8796596611554838e-07, "logits/chosen": -1.6921401023864746, "logits/rejected": -1.852120041847229, "logps/chosen": -43.73085021972656, "logps/rejected": -64.00594329833984, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.6823835372924805, "rewards/margins": 9.839491844177246, "rewards/rejected": -11.521875381469727, "step": 836 }, { "epoch": 4.96, "grad_norm": 0.538226909564372, "learning_rate": 1.8733686693531982e-07, "logits/chosen": -1.8975204229354858, "logits/rejected": -1.7692875862121582, "logps/chosen": -40.96481704711914, "logps/rejected": -88.20527648925781, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.7765021324157715, "rewards/margins": 12.758014678955078, "rewards/rejected": -15.534517288208008, "step": 837 }, { "epoch": 4.965925925925926, "grad_norm": 0.43354396745461216, "learning_rate": 1.8670819087065882e-07, "logits/chosen": -1.4917025566101074, "logits/rejected": -1.4172505140304565, "logps/chosen": -45.58994674682617, "logps/rejected": -68.04974365234375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.9073712825775146, "rewards/margins": 9.858301162719727, "rewards/rejected": -12.765671730041504, "step": 838 }, { "epoch": 4.971851851851852, "grad_norm": 0.3086137098014021, "learning_rate": 1.8607994216652756e-07, "logits/chosen": -1.8002091646194458, "logits/rejected": -1.7989627122879028, "logps/chosen": -37.95539855957031, "logps/rejected": -91.1846694946289, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.227663516998291, "rewards/margins": 15.667410850524902, "rewards/rejected": -18.89507484436035, "step": 839 }, { "epoch": 4.977777777777778, "grad_norm": 0.32236357616937117, "learning_rate": 1.8545212506500257e-07, "logits/chosen": -1.8015594482421875, "logits/rejected": -1.7732892036437988, "logps/chosen": -45.942161560058594, "logps/rejected": -69.69486999511719, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.7260780334472656, "rewards/margins": 11.671018600463867, "rewards/rejected": -14.39709758758545, "step": 840 }, { "epoch": 4.9837037037037035, "grad_norm": 2.1936326995084725, "learning_rate": 1.848247438052461e-07, "logits/chosen": -2.0741078853607178, "logits/rejected": -2.024766683578491, "logps/chosen": -61.82673263549805, "logps/rejected": -90.516845703125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.203417778015137, "rewards/margins": 11.07644271850586, "rewards/rejected": -16.279861450195312, "step": 841 }, { "epoch": 4.989629629629629, "grad_norm": 0.5696743631038291, "learning_rate": 1.8419780262347754e-07, "logits/chosen": -1.3205444812774658, "logits/rejected": -1.1502920389175415, "logps/chosen": -46.6002082824707, "logps/rejected": -77.9881362915039, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.359750509262085, "rewards/margins": 12.488546371459961, "rewards/rejected": -15.848296165466309, "step": 842 }, { "epoch": 4.995555555555556, "grad_norm": 0.1911816493782355, "learning_rate": 1.835713057529447e-07, "logits/chosen": -2.0486555099487305, "logits/rejected": -1.8096239566802979, "logps/chosen": -41.883853912353516, "logps/rejected": -102.98055267333984, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.3345673084259033, "rewards/margins": 14.44814682006836, "rewards/rejected": -16.78271484375, "step": 843 }, { "epoch": 5.001481481481481, "grad_norm": 0.3585733698368132, "learning_rate": 1.8294525742389545e-07, "logits/chosen": -1.695112705230713, "logits/rejected": -1.7007943391799927, "logps/chosen": -47.733699798583984, "logps/rejected": -62.256568908691406, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.1449780464172363, "rewards/margins": 8.733757019042969, "rewards/rejected": -10.878735542297363, "step": 844 }, { "epoch": 5.007407407407407, "grad_norm": 0.148574929218555, "learning_rate": 1.8231966186354881e-07, "logits/chosen": -1.6611672639846802, "logits/rejected": -1.492279052734375, "logps/chosen": -44.596466064453125, "logps/rejected": -72.3811264038086, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.640878915786743, "rewards/margins": 10.067465782165527, "rewards/rejected": -12.708344459533691, "step": 845 }, { "epoch": 5.013333333333334, "grad_norm": 0.24940558308584396, "learning_rate": 1.8169452329606666e-07, "logits/chosen": -1.5457825660705566, "logits/rejected": -1.3591402769088745, "logps/chosen": -44.07053756713867, "logps/rejected": -83.65834045410156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.497549057006836, "rewards/margins": 11.002856254577637, "rewards/rejected": -14.500405311584473, "step": 846 }, { "epoch": 5.019259259259259, "grad_norm": 0.13228197472605105, "learning_rate": 1.810698459425254e-07, "logits/chosen": -1.4919474124908447, "logits/rejected": -1.4146510362625122, "logps/chosen": -43.999305725097656, "logps/rejected": -67.03208923339844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.773082971572876, "rewards/margins": 10.872330665588379, "rewards/rejected": -13.645413398742676, "step": 847 }, { "epoch": 5.025185185185185, "grad_norm": 0.17189656018878605, "learning_rate": 1.8044563402088682e-07, "logits/chosen": -1.6326994895935059, "logits/rejected": -1.5319743156433105, "logps/chosen": -40.37211227416992, "logps/rejected": -79.12166595458984, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.4987974166870117, "rewards/margins": 12.085766792297363, "rewards/rejected": -14.584563255310059, "step": 848 }, { "epoch": 5.0311111111111115, "grad_norm": 0.16328383350438383, "learning_rate": 1.7982189174597033e-07, "logits/chosen": -1.8950517177581787, "logits/rejected": -1.8567767143249512, "logps/chosen": -56.13173294067383, "logps/rejected": -81.51428985595703, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.818589687347412, "rewards/margins": 13.249429702758789, "rewards/rejected": -17.06801986694336, "step": 849 }, { "epoch": 5.037037037037037, "grad_norm": 0.20542554218578457, "learning_rate": 1.7919862332942398e-07, "logits/chosen": -1.784167766571045, "logits/rejected": -1.7963954210281372, "logps/chosen": -44.83396530151367, "logps/rejected": -67.42826843261719, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.5964694023132324, "rewards/margins": 11.381461143493652, "rewards/rejected": -12.977930068969727, "step": 850 }, { "epoch": 5.042962962962963, "grad_norm": 0.28807920712060275, "learning_rate": 1.785758329796963e-07, "logits/chosen": -1.703472375869751, "logits/rejected": -1.6009771823883057, "logps/chosen": -35.299949645996094, "logps/rejected": -73.72513580322266, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.533238172531128, "rewards/margins": 11.330462455749512, "rewards/rejected": -13.863700866699219, "step": 851 }, { "epoch": 5.0488888888888885, "grad_norm": 0.2189616708519809, "learning_rate": 1.779535249020078e-07, "logits/chosen": -1.5616728067398071, "logits/rejected": -1.5174376964569092, "logps/chosen": -38.645652770996094, "logps/rejected": -67.2764663696289, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.4764845371246338, "rewards/margins": 11.848831176757812, "rewards/rejected": -13.325315475463867, "step": 852 }, { "epoch": 5.054814814814815, "grad_norm": 0.20874214506595973, "learning_rate": 1.7733170329832262e-07, "logits/chosen": -1.6223198175430298, "logits/rejected": -1.7681981325149536, "logps/chosen": -32.13164520263672, "logps/rejected": -61.60198974609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.745962381362915, "rewards/margins": 10.576457977294922, "rewards/rejected": -12.322421073913574, "step": 853 }, { "epoch": 5.060740740740741, "grad_norm": 0.18766328646780403, "learning_rate": 1.7671037236732012e-07, "logits/chosen": -1.8586299419403076, "logits/rejected": -1.9080753326416016, "logps/chosen": -51.48103713989258, "logps/rejected": -86.90821075439453, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.9493086338043213, "rewards/margins": 13.482855796813965, "rewards/rejected": -17.432165145874023, "step": 854 }, { "epoch": 5.066666666666666, "grad_norm": 0.18720620307407093, "learning_rate": 1.760895363043663e-07, "logits/chosen": -2.031804084777832, "logits/rejected": -1.9341238737106323, "logps/chosen": -49.05330276489258, "logps/rejected": -82.30054473876953, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.23289680480957, "rewards/margins": 11.15815544128418, "rewards/rejected": -15.39105224609375, "step": 855 }, { "epoch": 5.072592592592593, "grad_norm": 0.15775224502125676, "learning_rate": 1.7546919930148603e-07, "logits/chosen": -1.9293596744537354, "logits/rejected": -2.0265965461730957, "logps/chosen": -69.87615966796875, "logps/rejected": -80.0749740600586, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.907338857650757, "rewards/margins": 10.003570556640625, "rewards/rejected": -13.910908699035645, "step": 856 }, { "epoch": 5.078518518518519, "grad_norm": 0.19674405918676902, "learning_rate": 1.748493655473342e-07, "logits/chosen": -1.8837556838989258, "logits/rejected": -2.0229501724243164, "logps/chosen": -39.472660064697266, "logps/rejected": -61.644203186035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8183362483978271, "rewards/margins": 9.928937911987305, "rewards/rejected": -11.747274398803711, "step": 857 }, { "epoch": 5.084444444444444, "grad_norm": 0.1675085843386872, "learning_rate": 1.742300392271678e-07, "logits/chosen": -1.8151192665100098, "logits/rejected": -1.591550588607788, "logps/chosen": -37.466732025146484, "logps/rejected": -74.43209838867188, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.0111122131347656, "rewards/margins": 10.86896800994873, "rewards/rejected": -12.880081176757812, "step": 858 }, { "epoch": 5.09037037037037, "grad_norm": 0.28954876828630643, "learning_rate": 1.7361122452281737e-07, "logits/chosen": -1.2304236888885498, "logits/rejected": -1.1013797521591187, "logps/chosen": -41.00086212158203, "logps/rejected": -64.92744445800781, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.6675610542297363, "rewards/margins": 9.110584259033203, "rewards/rejected": -11.778144836425781, "step": 859 }, { "epoch": 5.0962962962962965, "grad_norm": 0.12147094194677399, "learning_rate": 1.72992925612659e-07, "logits/chosen": -1.4669568538665771, "logits/rejected": -1.2288365364074707, "logps/chosen": -42.41246032714844, "logps/rejected": -76.5176010131836, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.155447006225586, "rewards/margins": 11.347990989685059, "rewards/rejected": -13.503437995910645, "step": 860 }, { "epoch": 5.102222222222222, "grad_norm": 0.15467732711878926, "learning_rate": 1.7237514667158596e-07, "logits/chosen": -1.5924859046936035, "logits/rejected": -1.5197021961212158, "logps/chosen": -45.47554397583008, "logps/rejected": -75.77124786376953, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.6827468872070312, "rewards/margins": 12.081143379211426, "rewards/rejected": -14.763890266418457, "step": 861 }, { "epoch": 5.108148148148148, "grad_norm": 0.24260116050907574, "learning_rate": 1.7175789187098055e-07, "logits/chosen": -1.570918083190918, "logits/rejected": -1.6058459281921387, "logps/chosen": -33.30390167236328, "logps/rejected": -70.89424133300781, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.9541648626327515, "rewards/margins": 12.669885635375977, "rewards/rejected": -14.624051094055176, "step": 862 }, { "epoch": 5.114074074074074, "grad_norm": 0.13271833595661878, "learning_rate": 1.7114116537868612e-07, "logits/chosen": -1.4068008661270142, "logits/rejected": -1.4065369367599487, "logps/chosen": -46.0545654296875, "logps/rejected": -69.67312622070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.1072192192077637, "rewards/margins": 10.404853820800781, "rewards/rejected": -13.512073516845703, "step": 863 }, { "epoch": 5.12, "grad_norm": 0.13102584512640375, "learning_rate": 1.705249713589786e-07, "logits/chosen": -1.739338755607605, "logits/rejected": -1.6088874340057373, "logps/chosen": -52.92829132080078, "logps/rejected": -97.14276123046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.2332658767700195, "rewards/margins": 12.030122756958008, "rewards/rejected": -16.263389587402344, "step": 864 }, { "epoch": 5.125925925925926, "grad_norm": 0.22324793093063372, "learning_rate": 1.699093139725386e-07, "logits/chosen": -1.5772820711135864, "logits/rejected": -1.6475763320922852, "logps/chosen": -52.72998809814453, "logps/rejected": -72.01866149902344, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.5473053455352783, "rewards/margins": 11.291367530822754, "rewards/rejected": -13.838672637939453, "step": 865 }, { "epoch": 5.131851851851851, "grad_norm": 0.1551815052938634, "learning_rate": 1.6929419737642322e-07, "logits/chosen": -2.116971492767334, "logits/rejected": -2.0724849700927734, "logps/chosen": -45.568336486816406, "logps/rejected": -79.17977905273438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.3480639457702637, "rewards/margins": 11.395853042602539, "rewards/rejected": -14.743916511535645, "step": 866 }, { "epoch": 5.137777777777778, "grad_norm": 0.18261474128420957, "learning_rate": 1.686796257240381e-07, "logits/chosen": -1.4726767539978027, "logits/rejected": -1.2904458045959473, "logps/chosen": -41.11164093017578, "logps/rejected": -75.76146697998047, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.5571563243865967, "rewards/margins": 11.647672653198242, "rewards/rejected": -14.204829216003418, "step": 867 }, { "epoch": 5.143703703703704, "grad_norm": 0.16977709275009925, "learning_rate": 1.680656031651093e-07, "logits/chosen": -1.4244813919067383, "logits/rejected": -1.2991671562194824, "logps/chosen": -48.0638427734375, "logps/rejected": -82.90692138671875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.5564422607421875, "rewards/margins": 11.91999626159668, "rewards/rejected": -15.476438522338867, "step": 868 }, { "epoch": 5.149629629629629, "grad_norm": 0.09172224696780171, "learning_rate": 1.6745213384565516e-07, "logits/chosen": -2.0797746181488037, "logits/rejected": -2.0070877075195312, "logps/chosen": -53.26316833496094, "logps/rejected": -89.38504791259766, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.355812072753906, "rewards/margins": 11.446057319641113, "rewards/rejected": -15.80186939239502, "step": 869 }, { "epoch": 5.155555555555556, "grad_norm": 0.17909443988936508, "learning_rate": 1.6683922190795845e-07, "logits/chosen": -2.0503368377685547, "logits/rejected": -1.7288861274719238, "logps/chosen": -40.119171142578125, "logps/rejected": -82.29203796386719, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.1652438640594482, "rewards/margins": 14.119874000549316, "rewards/rejected": -16.285120010375977, "step": 870 }, { "epoch": 5.161481481481482, "grad_norm": 0.18436994864792003, "learning_rate": 1.6622687149053844e-07, "logits/chosen": -2.2823660373687744, "logits/rejected": -2.0534257888793945, "logps/chosen": -44.072784423828125, "logps/rejected": -83.30123901367188, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.622286319732666, "rewards/margins": 11.508301734924316, "rewards/rejected": -15.130587577819824, "step": 871 }, { "epoch": 5.167407407407407, "grad_norm": 0.1822933236225203, "learning_rate": 1.6561508672812295e-07, "logits/chosen": -1.9030749797821045, "logits/rejected": -1.8649938106536865, "logps/chosen": -50.30503845214844, "logps/rejected": -77.90472412109375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.4370203018188477, "rewards/margins": 10.136503219604492, "rewards/rejected": -12.57352352142334, "step": 872 }, { "epoch": 5.173333333333334, "grad_norm": 0.11100254764300277, "learning_rate": 1.650038717516203e-07, "logits/chosen": -1.6390033960342407, "logits/rejected": -1.7791786193847656, "logps/chosen": -50.51274108886719, "logps/rejected": -55.75922393798828, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.171482801437378, "rewards/margins": 8.132706642150879, "rewards/rejected": -10.304189682006836, "step": 873 }, { "epoch": 5.1792592592592595, "grad_norm": 0.1709052348065624, "learning_rate": 1.6439323068809137e-07, "logits/chosen": -1.5960361957550049, "logits/rejected": -1.5929274559020996, "logps/chosen": -44.663734436035156, "logps/rejected": -77.63716125488281, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.5435336828231812, "rewards/margins": 11.703277587890625, "rewards/rejected": -13.246810913085938, "step": 874 }, { "epoch": 5.185185185185185, "grad_norm": 0.18561882864161833, "learning_rate": 1.6378316766072196e-07, "logits/chosen": -1.4937806129455566, "logits/rejected": -1.445252776145935, "logps/chosen": -50.529808044433594, "logps/rejected": -78.3757553100586, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.8628158569335938, "rewards/margins": 11.858318328857422, "rewards/rejected": -15.721134185791016, "step": 875 }, { "epoch": 5.191111111111111, "grad_norm": 0.26835891565560865, "learning_rate": 1.6317368678879496e-07, "logits/chosen": -1.8304246664047241, "logits/rejected": -1.7924251556396484, "logps/chosen": -41.48052978515625, "logps/rejected": -60.102622985839844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.755875587463379, "rewards/margins": 8.978135108947754, "rewards/rejected": -11.734010696411133, "step": 876 }, { "epoch": 5.197037037037037, "grad_norm": 0.15704330760066193, "learning_rate": 1.6256479218766212e-07, "logits/chosen": -2.1413064002990723, "logits/rejected": -1.9285932779312134, "logps/chosen": -51.57318115234375, "logps/rejected": -98.74991607666016, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.0889475345611572, "rewards/margins": 13.694742202758789, "rewards/rejected": -16.783689498901367, "step": 877 }, { "epoch": 5.202962962962963, "grad_norm": 0.11321601848002678, "learning_rate": 1.6195648796871687e-07, "logits/chosen": -1.5476150512695312, "logits/rejected": -1.3866195678710938, "logps/chosen": -36.852561950683594, "logps/rejected": -72.84454345703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.758419990539551, "rewards/margins": 12.511301040649414, "rewards/rejected": -15.269720077514648, "step": 878 }, { "epoch": 5.208888888888889, "grad_norm": 0.10351774951876742, "learning_rate": 1.6134877823936607e-07, "logits/chosen": -2.093661308288574, "logits/rejected": -2.1594038009643555, "logps/chosen": -51.953643798828125, "logps/rejected": -86.07762145996094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.956282138824463, "rewards/margins": 11.716033935546875, "rewards/rejected": -14.672317504882812, "step": 879 }, { "epoch": 5.214814814814815, "grad_norm": 0.1434146009933508, "learning_rate": 1.6074166710300247e-07, "logits/chosen": -1.702378749847412, "logits/rejected": -1.7260849475860596, "logps/chosen": -51.182071685791016, "logps/rejected": -73.42428588867188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.504643440246582, "rewards/margins": 12.243188858032227, "rewards/rejected": -14.747832298278809, "step": 880 }, { "epoch": 5.220740740740741, "grad_norm": 0.20945502929349574, "learning_rate": 1.60135158658977e-07, "logits/chosen": -2.109502077102661, "logits/rejected": -1.807983160018921, "logps/chosen": -58.1779670715332, "logps/rejected": -100.36227416992188, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.7934346199035645, "rewards/margins": 12.290433883666992, "rewards/rejected": -17.08386993408203, "step": 881 }, { "epoch": 5.226666666666667, "grad_norm": 0.20448547318787463, "learning_rate": 1.5952925700257115e-07, "logits/chosen": -1.705009937286377, "logits/rejected": -1.6618585586547852, "logps/chosen": -45.62565612792969, "logps/rejected": -76.2286148071289, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.5053670406341553, "rewards/margins": 12.319999694824219, "rewards/rejected": -15.825366020202637, "step": 882 }, { "epoch": 5.232592592592592, "grad_norm": 0.18171957750482662, "learning_rate": 1.5892396622496905e-07, "logits/chosen": -1.3300485610961914, "logits/rejected": -1.0937800407409668, "logps/chosen": -59.10209655761719, "logps/rejected": -103.3541030883789, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.020575523376465, "rewards/margins": 15.489795684814453, "rewards/rejected": -19.5103702545166, "step": 883 }, { "epoch": 5.238518518518519, "grad_norm": 0.10331028555572916, "learning_rate": 1.5831929041323023e-07, "logits/chosen": -1.8355892896652222, "logits/rejected": -1.873203992843628, "logps/chosen": -57.147979736328125, "logps/rejected": -82.013916015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.6592514514923096, "rewards/margins": 11.781926155090332, "rewards/rejected": -15.441177368164062, "step": 884 }, { "epoch": 5.2444444444444445, "grad_norm": 0.15900220224599235, "learning_rate": 1.5771523365026175e-07, "logits/chosen": -1.9305607080459595, "logits/rejected": -1.7835602760314941, "logps/chosen": -37.924530029296875, "logps/rejected": -73.7749252319336, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.2176930904388428, "rewards/margins": 10.380413055419922, "rewards/rejected": -12.598106384277344, "step": 885 }, { "epoch": 5.25037037037037, "grad_norm": 0.1496584668105621, "learning_rate": 1.5711180001479068e-07, "logits/chosen": -1.603898525238037, "logits/rejected": -1.5381674766540527, "logps/chosen": -35.780635833740234, "logps/rejected": -65.23573303222656, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.7302260398864746, "rewards/margins": 9.829032897949219, "rewards/rejected": -12.559258460998535, "step": 886 }, { "epoch": 5.256296296296297, "grad_norm": 0.091370988937047, "learning_rate": 1.5650899358133667e-07, "logits/chosen": -1.788956642150879, "logits/rejected": -1.748964548110962, "logps/chosen": -53.1939582824707, "logps/rejected": -82.92172241210938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.7135212421417236, "rewards/margins": 11.983846664428711, "rewards/rejected": -15.697367668151855, "step": 887 }, { "epoch": 5.262222222222222, "grad_norm": 0.14689421171326186, "learning_rate": 1.5590681842018443e-07, "logits/chosen": -1.099969744682312, "logits/rejected": -1.2583948373794556, "logps/chosen": -59.88705062866211, "logps/rejected": -84.44697570800781, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.428144454956055, "rewards/margins": 11.820623397827148, "rewards/rejected": -16.248767852783203, "step": 888 }, { "epoch": 5.268148148148148, "grad_norm": 0.10821955488468121, "learning_rate": 1.5530527859735599e-07, "logits/chosen": -2.088761806488037, "logits/rejected": -1.9281114339828491, "logps/chosen": -46.75210189819336, "logps/rejected": -88.88577270507812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.3439178466796875, "rewards/margins": 14.868821144104004, "rewards/rejected": -18.212738037109375, "step": 889 }, { "epoch": 5.274074074074074, "grad_norm": 0.15466292314549412, "learning_rate": 1.5470437817458355e-07, "logits/chosen": -2.0101559162139893, "logits/rejected": -1.789503574371338, "logps/chosen": -49.836647033691406, "logps/rejected": -89.97150421142578, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.7755634784698486, "rewards/margins": 11.662132263183594, "rewards/rejected": -15.43769645690918, "step": 890 }, { "epoch": 5.28, "grad_norm": 0.20928188129386854, "learning_rate": 1.5410412120928186e-07, "logits/chosen": -1.8343095779418945, "logits/rejected": -1.8289053440093994, "logps/chosen": -59.51985168457031, "logps/rejected": -88.18516540527344, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.776739120483398, "rewards/margins": 12.286046028137207, "rewards/rejected": -17.062786102294922, "step": 891 }, { "epoch": 5.285925925925926, "grad_norm": 0.09170725540184624, "learning_rate": 1.53504511754521e-07, "logits/chosen": -1.5599915981292725, "logits/rejected": -1.3725429773330688, "logps/chosen": -49.52803421020508, "logps/rejected": -90.8835220336914, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.027318954467773, "rewards/margins": 12.441356658935547, "rewards/rejected": -16.46867561340332, "step": 892 }, { "epoch": 5.291851851851852, "grad_norm": 0.18130879391822655, "learning_rate": 1.5290555385899877e-07, "logits/chosen": -1.7436559200286865, "logits/rejected": -1.6874361038208008, "logps/chosen": -47.117225646972656, "logps/rejected": -79.31859588623047, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.854762077331543, "rewards/margins": 11.091411590576172, "rewards/rejected": -14.946174621582031, "step": 893 }, { "epoch": 5.297777777777778, "grad_norm": 0.11334482840840347, "learning_rate": 1.5230725156701373e-07, "logits/chosen": -1.4855456352233887, "logits/rejected": -1.4895267486572266, "logps/chosen": -48.04728317260742, "logps/rejected": -90.0076904296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4714088439941406, "rewards/margins": 14.348779678344727, "rewards/rejected": -17.820188522338867, "step": 894 }, { "epoch": 5.303703703703704, "grad_norm": 0.09948661594392598, "learning_rate": 1.517096089184375e-07, "logits/chosen": -1.6243767738342285, "logits/rejected": -1.8554785251617432, "logps/chosen": -61.03385925292969, "logps/rejected": -71.03145599365234, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.3068552017211914, "rewards/margins": 9.547409057617188, "rewards/rejected": -12.854263305664062, "step": 895 }, { "epoch": 5.3096296296296295, "grad_norm": 0.19297746323084378, "learning_rate": 1.5111262994868756e-07, "logits/chosen": -1.6317886114120483, "logits/rejected": -1.5646547079086304, "logps/chosen": -43.04448699951172, "logps/rejected": -70.97728729248047, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.640531539916992, "rewards/margins": 9.738162994384766, "rewards/rejected": -12.37869644165039, "step": 896 }, { "epoch": 5.315555555555555, "grad_norm": 0.14642107606223295, "learning_rate": 1.5051631868870019e-07, "logits/chosen": -1.7854773998260498, "logits/rejected": -1.6659780740737915, "logps/chosen": -43.154239654541016, "logps/rejected": -80.3381118774414, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.406045913696289, "rewards/margins": 12.070757865905762, "rewards/rejected": -15.476802825927734, "step": 897 }, { "epoch": 5.321481481481482, "grad_norm": 0.15732900222598203, "learning_rate": 1.499206791649032e-07, "logits/chosen": -1.9318976402282715, "logits/rejected": -1.8870396614074707, "logps/chosen": -49.68609619140625, "logps/rejected": -80.17252349853516, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.818880558013916, "rewards/margins": 12.839373588562012, "rewards/rejected": -16.658254623413086, "step": 898 }, { "epoch": 5.327407407407407, "grad_norm": 0.16901663436453515, "learning_rate": 1.4932571539918854e-07, "logits/chosen": -1.6694287061691284, "logits/rejected": -1.674289345741272, "logps/chosen": -58.69844055175781, "logps/rejected": -86.66792297363281, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.718257904052734, "rewards/margins": 11.202542304992676, "rewards/rejected": -15.920801162719727, "step": 899 }, { "epoch": 5.333333333333333, "grad_norm": 0.0866407359161085, "learning_rate": 1.4873143140888537e-07, "logits/chosen": -1.7056925296783447, "logits/rejected": -1.4066742658615112, "logps/chosen": -53.29381561279297, "logps/rejected": -96.85028076171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.774338722229004, "rewards/margins": 13.016897201538086, "rewards/rejected": -18.791236877441406, "step": 900 }, { "epoch": 5.33925925925926, "grad_norm": 0.1164374083908993, "learning_rate": 1.481378312067329e-07, "logits/chosen": -1.958636999130249, "logits/rejected": -1.8225376605987549, "logps/chosen": -49.395442962646484, "logps/rejected": -72.3202896118164, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.258683443069458, "rewards/margins": 12.200054168701172, "rewards/rejected": -14.458738327026367, "step": 901 }, { "epoch": 5.345185185185185, "grad_norm": 0.18450207404639837, "learning_rate": 1.4754491880085317e-07, "logits/chosen": -1.6482468843460083, "logits/rejected": -1.6017494201660156, "logps/chosen": -46.62718200683594, "logps/rejected": -94.2940673828125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.9291470050811768, "rewards/margins": 12.749122619628906, "rewards/rejected": -16.678268432617188, "step": 902 }, { "epoch": 5.351111111111111, "grad_norm": 0.11574118575383069, "learning_rate": 1.4695269819472403e-07, "logits/chosen": -1.1760472059249878, "logits/rejected": -1.199681282043457, "logps/chosen": -64.5206069946289, "logps/rejected": -74.74624633789062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.804712772369385, "rewards/margins": 9.701336860656738, "rewards/rejected": -14.506050109863281, "step": 903 }, { "epoch": 5.357037037037037, "grad_norm": 0.14066639890033453, "learning_rate": 1.463611733871523e-07, "logits/chosen": -1.8428750038146973, "logits/rejected": -1.6979756355285645, "logps/chosen": -45.86759567260742, "logps/rejected": -88.10768127441406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.3660061359405518, "rewards/margins": 11.283397674560547, "rewards/rejected": -13.649404525756836, "step": 904 }, { "epoch": 5.362962962962963, "grad_norm": 0.2518767989627491, "learning_rate": 1.457703483722466e-07, "logits/chosen": -1.6185064315795898, "logits/rejected": -1.5029176473617554, "logps/chosen": -41.58538818359375, "logps/rejected": -73.95729064941406, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.240269422531128, "rewards/margins": 11.645746231079102, "rewards/rejected": -13.886014938354492, "step": 905 }, { "epoch": 5.368888888888889, "grad_norm": 0.12174019876649274, "learning_rate": 1.4518022713938998e-07, "logits/chosen": -2.0943048000335693, "logits/rejected": -2.0399320125579834, "logps/chosen": -45.09468078613281, "logps/rejected": -80.62596130371094, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9401047229766846, "rewards/margins": 11.51397705078125, "rewards/rejected": -14.454082489013672, "step": 906 }, { "epoch": 5.3748148148148145, "grad_norm": 0.20129319769151877, "learning_rate": 1.4459081367321407e-07, "logits/chosen": -1.5126574039459229, "logits/rejected": -1.61480712890625, "logps/chosen": -43.759063720703125, "logps/rejected": -66.43977355957031, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.753220558166504, "rewards/margins": 9.984600067138672, "rewards/rejected": -14.73781967163086, "step": 907 }, { "epoch": 5.380740740740741, "grad_norm": 0.1415414744220965, "learning_rate": 1.4400211195357103e-07, "logits/chosen": -1.6789093017578125, "logits/rejected": -1.8044849634170532, "logps/chosen": -56.00989532470703, "logps/rejected": -77.61322784423828, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.8247482776641846, "rewards/margins": 12.38948917388916, "rewards/rejected": -16.2142391204834, "step": 908 }, { "epoch": 5.386666666666667, "grad_norm": 0.11781108239895481, "learning_rate": 1.4341412595550724e-07, "logits/chosen": -1.45395028591156, "logits/rejected": -1.4600470066070557, "logps/chosen": -38.969261169433594, "logps/rejected": -77.78085327148438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7798218727111816, "rewards/margins": 11.665205001831055, "rewards/rejected": -14.445026397705078, "step": 909 }, { "epoch": 5.392592592592592, "grad_norm": 0.16066753379617685, "learning_rate": 1.428268596492364e-07, "logits/chosen": -1.5352404117584229, "logits/rejected": -1.5462236404418945, "logps/chosen": -37.77696228027344, "logps/rejected": -72.236328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.9697984457015991, "rewards/margins": 12.46438980102539, "rewards/rejected": -14.434188842773438, "step": 910 }, { "epoch": 5.398518518518518, "grad_norm": 0.1829332241319529, "learning_rate": 1.4224031700011286e-07, "logits/chosen": -1.6134045124053955, "logits/rejected": -1.6110115051269531, "logps/chosen": -42.56840133666992, "logps/rejected": -79.94451904296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.0541820526123047, "rewards/margins": 12.98902702331543, "rewards/rejected": -16.043209075927734, "step": 911 }, { "epoch": 5.404444444444445, "grad_norm": 0.12167838129787681, "learning_rate": 1.416545019686042e-07, "logits/chosen": -2.096259355545044, "logits/rejected": -2.0034162998199463, "logps/chosen": -46.483367919921875, "logps/rejected": -88.5770034790039, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.900122880935669, "rewards/margins": 14.230045318603516, "rewards/rejected": -18.130168914794922, "step": 912 }, { "epoch": 5.41037037037037, "grad_norm": 0.15784798830656716, "learning_rate": 1.4106941851026544e-07, "logits/chosen": -1.9469846487045288, "logits/rejected": -1.8676340579986572, "logps/chosen": -49.274776458740234, "logps/rejected": -83.13460540771484, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.6215996742248535, "rewards/margins": 12.980171203613281, "rewards/rejected": -16.601770401000977, "step": 913 }, { "epoch": 5.416296296296296, "grad_norm": 0.1413161041155283, "learning_rate": 1.4048507057571164e-07, "logits/chosen": -1.4655190706253052, "logits/rejected": -1.5878344774246216, "logps/chosen": -49.412841796875, "logps/rejected": -74.7137680053711, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.394691467285156, "rewards/margins": 10.30486011505127, "rewards/rejected": -14.699551582336426, "step": 914 }, { "epoch": 5.4222222222222225, "grad_norm": 0.1583348852334666, "learning_rate": 1.3990146211059139e-07, "logits/chosen": -1.909976840019226, "logits/rejected": -1.8862512111663818, "logps/chosen": -45.07548522949219, "logps/rejected": -66.51271057128906, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.0548486709594727, "rewards/margins": 10.233278274536133, "rewards/rejected": -12.288125991821289, "step": 915 }, { "epoch": 5.428148148148148, "grad_norm": 0.3217765341519717, "learning_rate": 1.3931859705556052e-07, "logits/chosen": -1.720643162727356, "logits/rejected": -1.6782926321029663, "logps/chosen": -41.83833312988281, "logps/rejected": -73.55179595947266, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.188084840774536, "rewards/margins": 11.182394981384277, "rewards/rejected": -13.370479583740234, "step": 916 }, { "epoch": 5.434074074074074, "grad_norm": 0.11661976888036214, "learning_rate": 1.387364793462548e-07, "logits/chosen": -1.5590168237686157, "logits/rejected": -1.3452208042144775, "logps/chosen": -47.607704162597656, "logps/rejected": -85.09840393066406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.510599374771118, "rewards/margins": 10.469550132751465, "rewards/rejected": -12.98015022277832, "step": 917 }, { "epoch": 5.44, "grad_norm": 0.1397991323159031, "learning_rate": 1.38155112913264e-07, "logits/chosen": -1.1578762531280518, "logits/rejected": -1.1332764625549316, "logps/chosen": -43.81012725830078, "logps/rejected": -73.04928588867188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.3922626972198486, "rewards/margins": 11.932851791381836, "rewards/rejected": -14.325115203857422, "step": 918 }, { "epoch": 5.445925925925926, "grad_norm": 0.17044941604549793, "learning_rate": 1.37574501682105e-07, "logits/chosen": -1.7931560277938843, "logits/rejected": -1.6720855236053467, "logps/chosen": -47.04766845703125, "logps/rejected": -90.76607513427734, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.885108232498169, "rewards/margins": 14.144776344299316, "rewards/rejected": -18.029884338378906, "step": 919 }, { "epoch": 5.451851851851852, "grad_norm": 0.17803670632400478, "learning_rate": 1.369946495731954e-07, "logits/chosen": -1.4030263423919678, "logits/rejected": -1.4680297374725342, "logps/chosen": -58.498477935791016, "logps/rejected": -81.02774047851562, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.9646613597869873, "rewards/margins": 11.905306816101074, "rewards/rejected": -15.86996841430664, "step": 920 }, { "epoch": 5.457777777777777, "grad_norm": 0.1274841560573801, "learning_rate": 1.3641556050182707e-07, "logits/chosen": -1.7464091777801514, "logits/rejected": -1.6411049365997314, "logps/chosen": -38.34832000732422, "logps/rejected": -70.27515411376953, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.4438483715057373, "rewards/margins": 10.543989181518555, "rewards/rejected": -12.987837791442871, "step": 921 }, { "epoch": 5.463703703703704, "grad_norm": 0.12059465093251162, "learning_rate": 1.3583723837813964e-07, "logits/chosen": -1.6953315734863281, "logits/rejected": -1.5744541883468628, "logps/chosen": -50.8625373840332, "logps/rejected": -85.90507507324219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.873828887939453, "rewards/margins": 13.21931266784668, "rewards/rejected": -17.093143463134766, "step": 922 }, { "epoch": 5.46962962962963, "grad_norm": 0.1785860228395741, "learning_rate": 1.3525968710709415e-07, "logits/chosen": -2.02950382232666, "logits/rejected": -1.9261343479156494, "logps/chosen": -38.81111526489258, "logps/rejected": -77.9141845703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.2182492017745972, "rewards/margins": 14.254678726196289, "rewards/rejected": -15.472929000854492, "step": 923 }, { "epoch": 5.475555555555555, "grad_norm": 0.11719304740531708, "learning_rate": 1.346829105884467e-07, "logits/chosen": -1.734592318534851, "logits/rejected": -1.4210288524627686, "logps/chosen": -37.435325622558594, "logps/rejected": -80.92240905761719, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.19169282913208, "rewards/margins": 13.293989181518555, "rewards/rejected": -15.485682487487793, "step": 924 }, { "epoch": 5.481481481481482, "grad_norm": 0.17100357178124784, "learning_rate": 1.3410691271672206e-07, "logits/chosen": -1.3211579322814941, "logits/rejected": -1.3886321783065796, "logps/chosen": -43.99436950683594, "logps/rejected": -67.71986389160156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.496647596359253, "rewards/margins": 9.868618965148926, "rewards/rejected": -13.365266799926758, "step": 925 }, { "epoch": 5.4874074074074075, "grad_norm": 0.13742557760347932, "learning_rate": 1.335316973811874e-07, "logits/chosen": -1.7404249906539917, "logits/rejected": -1.7618457078933716, "logps/chosen": -42.16370391845703, "logps/rejected": -68.19352722167969, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.6484298706054688, "rewards/margins": 10.290946960449219, "rewards/rejected": -12.939376831054688, "step": 926 }, { "epoch": 5.493333333333333, "grad_norm": 0.12442842572287284, "learning_rate": 1.32957268465826e-07, "logits/chosen": -1.5197845697402954, "logits/rejected": -1.4984042644500732, "logps/chosen": -47.004085540771484, "logps/rejected": -76.21795654296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.030613899230957, "rewards/margins": 11.936113357543945, "rewards/rejected": -15.966728210449219, "step": 927 }, { "epoch": 5.499259259259259, "grad_norm": 0.15299953515000728, "learning_rate": 1.3238362984931113e-07, "logits/chosen": -1.8709659576416016, "logits/rejected": -1.7127676010131836, "logps/chosen": -41.45587921142578, "logps/rejected": -80.52196502685547, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.514099597930908, "rewards/margins": 12.647805213928223, "rewards/rejected": -16.161903381347656, "step": 928 }, { "epoch": 5.505185185185185, "grad_norm": 0.09435887433938309, "learning_rate": 1.318107854049797e-07, "logits/chosen": -2.078130006790161, "logits/rejected": -2.1746225357055664, "logps/chosen": -44.262779235839844, "logps/rejected": -68.68415832519531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1330409049987793, "rewards/margins": 11.835768699645996, "rewards/rejected": -14.968809127807617, "step": 929 }, { "epoch": 5.511111111111111, "grad_norm": 0.09000734814570381, "learning_rate": 1.3123873900080628e-07, "logits/chosen": -1.8118696212768555, "logits/rejected": -1.8935637474060059, "logps/chosen": -45.19469451904297, "logps/rejected": -65.81236267089844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.033607006072998, "rewards/margins": 10.029970169067383, "rewards/rejected": -14.063577651977539, "step": 930 }, { "epoch": 5.517037037037037, "grad_norm": 0.1047861210360843, "learning_rate": 1.306674944993768e-07, "logits/chosen": -1.4601209163665771, "logits/rejected": -1.4323076009750366, "logps/chosen": -45.547691345214844, "logps/rejected": -81.56329345703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.131673812866211, "rewards/margins": 13.62175178527832, "rewards/rejected": -16.75342559814453, "step": 931 }, { "epoch": 5.522962962962963, "grad_norm": 0.11003995547439586, "learning_rate": 1.3009705575786268e-07, "logits/chosen": -1.8155300617218018, "logits/rejected": -1.9376707077026367, "logps/chosen": -46.22417449951172, "logps/rejected": -73.66895294189453, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.8170642852783203, "rewards/margins": 10.779097557067871, "rewards/rejected": -14.596162796020508, "step": 932 }, { "epoch": 5.528888888888889, "grad_norm": 0.09141229143535569, "learning_rate": 1.295274266279945e-07, "logits/chosen": -1.5078794956207275, "logits/rejected": -1.5662742853164673, "logps/chosen": -50.77909851074219, "logps/rejected": -76.8865966796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.6053050756454468, "rewards/margins": 12.83316421508789, "rewards/rejected": -14.438468933105469, "step": 933 }, { "epoch": 5.534814814814815, "grad_norm": 0.21281575708183442, "learning_rate": 1.2895861095603632e-07, "logits/chosen": -1.824397087097168, "logits/rejected": -1.7250351905822754, "logps/chosen": -34.78361511230469, "logps/rejected": -60.876487731933594, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.5677258968353271, "rewards/margins": 9.703927993774414, "rewards/rejected": -11.271653175354004, "step": 934 }, { "epoch": 5.540740740740741, "grad_norm": 0.16706462984138418, "learning_rate": 1.2839061258275946e-07, "logits/chosen": -1.4025630950927734, "logits/rejected": -1.3066201210021973, "logps/chosen": -44.123268127441406, "logps/rejected": -76.74554443359375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.301388740539551, "rewards/margins": 12.366952896118164, "rewards/rejected": -15.668341636657715, "step": 935 }, { "epoch": 5.546666666666667, "grad_norm": 0.1170849878787737, "learning_rate": 1.2782343534341665e-07, "logits/chosen": -1.8335165977478027, "logits/rejected": -1.8167027235031128, "logps/chosen": -46.001007080078125, "logps/rejected": -69.01423645019531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.7690696716308594, "rewards/margins": 10.074502944946289, "rewards/rejected": -13.843572616577148, "step": 936 }, { "epoch": 5.5525925925925925, "grad_norm": 0.1706348543998457, "learning_rate": 1.2725708306771618e-07, "logits/chosen": -1.396052360534668, "logits/rejected": -1.3702348470687866, "logps/chosen": -46.654876708984375, "logps/rejected": -72.2405014038086, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.5897727012634277, "rewards/margins": 11.45168399810791, "rewards/rejected": -15.041457176208496, "step": 937 }, { "epoch": 5.558518518518518, "grad_norm": 0.12103662627467811, "learning_rate": 1.266915595797961e-07, "logits/chosen": -1.9351294040679932, "logits/rejected": -1.8262648582458496, "logps/chosen": -41.44812774658203, "logps/rejected": -79.66790008544922, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.912602424621582, "rewards/margins": 12.376932144165039, "rewards/rejected": -16.289533615112305, "step": 938 }, { "epoch": 5.564444444444445, "grad_norm": 0.11976343348272687, "learning_rate": 1.2612686869819817e-07, "logits/chosen": -1.5141785144805908, "logits/rejected": -1.415678858757019, "logps/chosen": -36.035640716552734, "logps/rejected": -74.10150146484375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.3020496368408203, "rewards/margins": 12.624713897705078, "rewards/rejected": -14.926763534545898, "step": 939 }, { "epoch": 5.57037037037037, "grad_norm": 0.11787248363508873, "learning_rate": 1.2556301423584208e-07, "logits/chosen": -1.495579481124878, "logits/rejected": -1.2809770107269287, "logps/chosen": -56.97976303100586, "logps/rejected": -86.33170318603516, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.188412666320801, "rewards/margins": 10.436057090759277, "rewards/rejected": -14.624469757080078, "step": 940 }, { "epoch": 5.576296296296296, "grad_norm": 0.0907026532078704, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -1.603977084159851, "logits/rejected": -1.4511686563491821, "logps/chosen": -45.15902328491211, "logps/rejected": -80.01466369628906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9305174350738525, "rewards/margins": 10.783876419067383, "rewards/rejected": -13.714394569396973, "step": 941 }, { "epoch": 5.582222222222223, "grad_norm": 0.1487765153354055, "learning_rate": 1.2443782979227082e-07, "logits/chosen": -1.8944647312164307, "logits/rejected": -1.905172348022461, "logps/chosen": -35.225990295410156, "logps/rejected": -67.60298156738281, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.6557745933532715, "rewards/margins": 10.987051010131836, "rewards/rejected": -13.64282512664795, "step": 942 }, { "epoch": 5.588148148148148, "grad_norm": 0.17628256299454922, "learning_rate": 1.2387650740855406e-07, "logits/chosen": -1.564589023590088, "logits/rejected": -1.7260979413986206, "logps/chosen": -46.70269775390625, "logps/rejected": -62.99655532836914, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.87866473197937, "rewards/margins": 10.071403503417969, "rewards/rejected": -12.950067520141602, "step": 943 }, { "epoch": 5.594074074074074, "grad_norm": 0.12012338864560455, "learning_rate": 1.2331603663902475e-07, "logits/chosen": -1.7884104251861572, "logits/rejected": -1.7316097021102905, "logps/chosen": -49.966835021972656, "logps/rejected": -80.75395202636719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.379611015319824, "rewards/margins": 14.045755386352539, "rewards/rejected": -17.425365447998047, "step": 944 }, { "epoch": 5.6, "grad_norm": 0.14786919502739027, "learning_rate": 1.2275642126810762e-07, "logits/chosen": -1.4788850545883179, "logits/rejected": -1.4324711561203003, "logps/chosen": -40.342864990234375, "logps/rejected": -71.33074188232422, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.499598264694214, "rewards/margins": 11.36124324798584, "rewards/rejected": -14.860841751098633, "step": 945 }, { "epoch": 5.605925925925926, "grad_norm": 0.18333393490548783, "learning_rate": 1.2219766507445144e-07, "logits/chosen": -1.8778958320617676, "logits/rejected": -1.5382031202316284, "logps/chosen": -44.424560546875, "logps/rejected": -91.03005981445312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.4865312576293945, "rewards/margins": 14.280929565429688, "rewards/rejected": -17.7674617767334, "step": 946 }, { "epoch": 5.611851851851852, "grad_norm": 0.12945141594778764, "learning_rate": 1.2163977183090368e-07, "logits/chosen": -1.5234986543655396, "logits/rejected": -1.3622126579284668, "logps/chosen": -42.00763702392578, "logps/rejected": -87.643798828125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.1348018646240234, "rewards/margins": 13.84468936920166, "rewards/rejected": -16.9794921875, "step": 947 }, { "epoch": 5.6177777777777775, "grad_norm": 0.21178020754931173, "learning_rate": 1.210827453044851e-07, "logits/chosen": -2.04561448097229, "logits/rejected": -1.9825788736343384, "logps/chosen": -45.74441909790039, "logps/rejected": -82.15046691894531, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.860677719116211, "rewards/margins": 10.71257495880127, "rewards/rejected": -13.57325267791748, "step": 948 }, { "epoch": 5.623703703703704, "grad_norm": 0.1455054608296158, "learning_rate": 1.2052658925636405e-07, "logits/chosen": -1.5713088512420654, "logits/rejected": -1.5190423727035522, "logps/chosen": -36.49808883666992, "logps/rejected": -64.18106079101562, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.315652847290039, "rewards/margins": 10.477497100830078, "rewards/rejected": -12.7931489944458, "step": 949 }, { "epoch": 5.62962962962963, "grad_norm": 0.2980500440824699, "learning_rate": 1.1997130744183124e-07, "logits/chosen": -1.6031593084335327, "logits/rejected": -1.5546696186065674, "logps/chosen": -63.49602508544922, "logps/rejected": -96.17118835449219, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.9462623596191406, "rewards/margins": 13.48281192779541, "rewards/rejected": -17.429073333740234, "step": 950 }, { "epoch": 5.635555555555555, "grad_norm": 0.1307241810970709, "learning_rate": 1.194169036102743e-07, "logits/chosen": -1.7493221759796143, "logits/rejected": -1.605499267578125, "logps/chosen": -46.23032760620117, "logps/rejected": -85.92477416992188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.037178993225098, "rewards/margins": 13.696718215942383, "rewards/rejected": -17.733896255493164, "step": 951 }, { "epoch": 5.641481481481481, "grad_norm": 0.14975321955144041, "learning_rate": 1.1886338150515268e-07, "logits/chosen": -1.851197361946106, "logits/rejected": -1.5876126289367676, "logps/chosen": -55.22184753417969, "logps/rejected": -99.03150177001953, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.070620536804199, "rewards/margins": 13.874107360839844, "rewards/rejected": -18.944726943969727, "step": 952 }, { "epoch": 5.647407407407408, "grad_norm": 0.13874169383297966, "learning_rate": 1.1831074486397217e-07, "logits/chosen": -1.6220589876174927, "logits/rejected": -1.5277056694030762, "logps/chosen": -45.646156311035156, "logps/rejected": -82.58694458007812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.766599655151367, "rewards/margins": 13.377978324890137, "rewards/rejected": -16.14457893371582, "step": 953 }, { "epoch": 5.653333333333333, "grad_norm": 0.13578476971641049, "learning_rate": 1.1775899741825945e-07, "logits/chosen": -1.6966445446014404, "logits/rejected": -1.4161500930786133, "logps/chosen": -51.15569305419922, "logps/rejected": -96.25025939941406, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.151477813720703, "rewards/margins": 13.05290412902832, "rewards/rejected": -17.204381942749023, "step": 954 }, { "epoch": 5.659259259259259, "grad_norm": 0.12481862557973492, "learning_rate": 1.172081428935375e-07, "logits/chosen": -2.3386316299438477, "logits/rejected": -2.2986228466033936, "logps/chosen": -47.3796501159668, "logps/rejected": -74.82670593261719, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.846057415008545, "rewards/margins": 12.71998405456543, "rewards/rejected": -15.5660400390625, "step": 955 }, { "epoch": 5.6651851851851855, "grad_norm": 0.1821793357669405, "learning_rate": 1.1665818500929986e-07, "logits/chosen": -1.7978324890136719, "logits/rejected": -1.7424900531768799, "logps/chosen": -57.95037841796875, "logps/rejected": -88.53657531738281, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.9283673763275146, "rewards/margins": 12.124265670776367, "rewards/rejected": -15.052633285522461, "step": 956 }, { "epoch": 5.671111111111111, "grad_norm": 0.14658210734864752, "learning_rate": 1.1610912747898605e-07, "logits/chosen": -2.2586140632629395, "logits/rejected": -2.166628122329712, "logps/chosen": -44.212364196777344, "logps/rejected": -71.91984558105469, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.876743793487549, "rewards/margins": 10.460264205932617, "rewards/rejected": -13.337007522583008, "step": 957 }, { "epoch": 5.677037037037037, "grad_norm": 0.21052053428277495, "learning_rate": 1.1556097400995585e-07, "logits/chosen": -1.2835049629211426, "logits/rejected": -1.3611830472946167, "logps/chosen": -66.21566009521484, "logps/rejected": -90.28056335449219, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.6806111335754395, "rewards/margins": 12.183880805969238, "rewards/rejected": -17.864492416381836, "step": 958 }, { "epoch": 5.6829629629629625, "grad_norm": 0.12157817956464971, "learning_rate": 1.1501372830346482e-07, "logits/chosen": -1.7824749946594238, "logits/rejected": -1.8065423965454102, "logps/chosen": -35.580074310302734, "logps/rejected": -61.021427154541016, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.39305993914604187, "rewards/margins": 9.523391723632812, "rewards/rejected": -9.916450500488281, "step": 959 }, { "epoch": 5.688888888888889, "grad_norm": 0.17255666494982388, "learning_rate": 1.1446739405463899e-07, "logits/chosen": -1.3453398942947388, "logits/rejected": -1.2265667915344238, "logps/chosen": -31.389596939086914, "logps/rejected": -63.46915817260742, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.0556116104125977, "rewards/margins": 11.092000961303711, "rewards/rejected": -13.147612571716309, "step": 960 }, { "epoch": 5.694814814814815, "grad_norm": 0.11405289684306127, "learning_rate": 1.1392197495245015e-07, "logits/chosen": -1.5662834644317627, "logits/rejected": -1.4976131916046143, "logps/chosen": -39.92075729370117, "logps/rejected": -71.36894226074219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.032357692718506, "rewards/margins": 11.438801765441895, "rewards/rejected": -13.471158981323242, "step": 961 }, { "epoch": 5.70074074074074, "grad_norm": 0.14616719178130974, "learning_rate": 1.1337747467969069e-07, "logits/chosen": -1.4800591468811035, "logits/rejected": -1.4703744649887085, "logps/chosen": -49.12568664550781, "logps/rejected": -86.93719482421875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.348463535308838, "rewards/margins": 12.572733879089355, "rewards/rejected": -16.92119789123535, "step": 962 }, { "epoch": 5.706666666666667, "grad_norm": 0.23009834739386595, "learning_rate": 1.1283389691294893e-07, "logits/chosen": -2.024674415588379, "logits/rejected": -2.035721778869629, "logps/chosen": -53.45521545410156, "logps/rejected": -89.40463256835938, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.8616273403167725, "rewards/margins": 12.572017669677734, "rewards/rejected": -15.433645248413086, "step": 963 }, { "epoch": 5.712592592592593, "grad_norm": 0.10650246877796829, "learning_rate": 1.1229124532258421e-07, "logits/chosen": -1.6839030981063843, "logits/rejected": -1.6192753314971924, "logps/chosen": -54.92471694946289, "logps/rejected": -82.94303894042969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.707098960876465, "rewards/margins": 11.277144432067871, "rewards/rejected": -15.984243392944336, "step": 964 }, { "epoch": 5.718518518518518, "grad_norm": 0.1015957402950518, "learning_rate": 1.1174952357270212e-07, "logits/chosen": -1.8444169759750366, "logits/rejected": -1.8085654973983765, "logps/chosen": -46.73295593261719, "logps/rejected": -75.91047668457031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.431227207183838, "rewards/margins": 11.139023780822754, "rewards/rejected": -14.57025146484375, "step": 965 }, { "epoch": 5.724444444444444, "grad_norm": 0.12271793759807745, "learning_rate": 1.112087353211297e-07, "logits/chosen": -2.244523286819458, "logits/rejected": -2.1519553661346436, "logps/chosen": -39.844268798828125, "logps/rejected": -74.11243438720703, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.3675413131713867, "rewards/margins": 12.443099021911621, "rewards/rejected": -14.810640335083008, "step": 966 }, { "epoch": 5.730370370370371, "grad_norm": 0.13586249354982624, "learning_rate": 1.1066888421939092e-07, "logits/chosen": -1.9669585227966309, "logits/rejected": -1.8986517190933228, "logps/chosen": -56.03706359863281, "logps/rejected": -96.92056274414062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.9295215606689453, "rewards/margins": 12.924680709838867, "rewards/rejected": -15.854202270507812, "step": 967 }, { "epoch": 5.736296296296296, "grad_norm": 0.12060708329390671, "learning_rate": 1.1012997391268177e-07, "logits/chosen": -1.5202209949493408, "logits/rejected": -1.5482919216156006, "logps/chosen": -43.142616271972656, "logps/rejected": -70.00810241699219, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.034687042236328, "rewards/margins": 11.336004257202148, "rewards/rejected": -14.370691299438477, "step": 968 }, { "epoch": 5.742222222222222, "grad_norm": 0.25218587071541326, "learning_rate": 1.095920080398459e-07, "logits/chosen": -1.751842975616455, "logits/rejected": -1.7139880657196045, "logps/chosen": -38.157073974609375, "logps/rejected": -70.97021484375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.3091049194335938, "rewards/margins": 11.698443412780762, "rewards/rejected": -13.007549285888672, "step": 969 }, { "epoch": 5.7481481481481485, "grad_norm": 0.14363264584749322, "learning_rate": 1.0905499023334979e-07, "logits/chosen": -1.8324824571609497, "logits/rejected": -1.9127658605575562, "logps/chosen": -51.50800704956055, "logps/rejected": -76.4848403930664, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.799527168273926, "rewards/margins": 11.289259910583496, "rewards/rejected": -15.088786125183105, "step": 970 }, { "epoch": 5.754074074074074, "grad_norm": 0.07308955653243333, "learning_rate": 1.0851892411925856e-07, "logits/chosen": -1.664994239807129, "logits/rejected": -1.5534952878952026, "logps/chosen": -45.636905670166016, "logps/rejected": -76.47474670410156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.9818623065948486, "rewards/margins": 10.973186492919922, "rewards/rejected": -12.955049514770508, "step": 971 }, { "epoch": 5.76, "grad_norm": 0.09610117014811514, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -1.9020330905914307, "logits/rejected": -1.8605923652648926, "logps/chosen": -41.162384033203125, "logps/rejected": -84.16923522949219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8716068267822266, "rewards/margins": 14.272635459899902, "rewards/rejected": -17.144243240356445, "step": 972 }, { "epoch": 5.7659259259259255, "grad_norm": 0.21835290195862744, "learning_rate": 1.0744966144039588e-07, "logits/chosen": -1.87041437625885, "logits/rejected": -1.7399128675460815, "logps/chosen": -47.70142364501953, "logps/rejected": -84.19062805175781, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.7133491039276123, "rewards/margins": 12.153305053710938, "rewards/rejected": -15.866655349731445, "step": 973 }, { "epoch": 5.771851851851852, "grad_norm": 0.18795920746502257, "learning_rate": 1.0691647209552654e-07, "logits/chosen": -1.8339283466339111, "logits/rejected": -1.7644309997558594, "logps/chosen": -38.69776916503906, "logps/rejected": -65.06787109375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.1597390174865723, "rewards/margins": 9.444230079650879, "rewards/rejected": -11.60396957397461, "step": 974 }, { "epoch": 5.777777777777778, "grad_norm": 0.17740674289925806, "learning_rate": 1.0638424888281744e-07, "logits/chosen": -1.702333927154541, "logits/rejected": -1.5982693433761597, "logps/chosen": -49.459510803222656, "logps/rejected": -93.81587219238281, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.351852893829346, "rewards/margins": 13.706995964050293, "rewards/rejected": -18.058849334716797, "step": 975 }, { "epoch": 5.783703703703703, "grad_norm": 0.1424353181362849, "learning_rate": 1.0585299539595943e-07, "logits/chosen": -1.8163714408874512, "logits/rejected": -1.9260118007659912, "logps/chosen": -61.91559600830078, "logps/rejected": -82.7620849609375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.653960704803467, "rewards/margins": 10.99573040008545, "rewards/rejected": -15.649691581726074, "step": 976 }, { "epoch": 5.78962962962963, "grad_norm": 0.07940058375309675, "learning_rate": 1.0532271522209551e-07, "logits/chosen": -1.5750871896743774, "logits/rejected": -1.374831199645996, "logps/chosen": -44.500892639160156, "logps/rejected": -88.57928466796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.226994514465332, "rewards/margins": 14.695962905883789, "rewards/rejected": -18.922958374023438, "step": 977 }, { "epoch": 5.795555555555556, "grad_norm": 0.1764725717312812, "learning_rate": 1.0479341194179658e-07, "logits/chosen": -1.331404209136963, "logits/rejected": -1.1948274374008179, "logps/chosen": -38.005226135253906, "logps/rejected": -88.23011016845703, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.5174663066864014, "rewards/margins": 14.888310432434082, "rewards/rejected": -16.405776977539062, "step": 978 }, { "epoch": 5.801481481481481, "grad_norm": 0.12298277994646507, "learning_rate": 1.0426508912903764e-07, "logits/chosen": -1.1262080669403076, "logits/rejected": -1.158602237701416, "logps/chosen": -50.767574310302734, "logps/rejected": -76.22100830078125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.1396355628967285, "rewards/margins": 11.057031631469727, "rewards/rejected": -16.196666717529297, "step": 979 }, { "epoch": 5.807407407407408, "grad_norm": 0.15841005567455482, "learning_rate": 1.0373775035117305e-07, "logits/chosen": -2.0430521965026855, "logits/rejected": -1.843505620956421, "logps/chosen": -34.70548629760742, "logps/rejected": -72.10562133789062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.6738539934158325, "rewards/margins": 12.656837463378906, "rewards/rejected": -14.33069133758545, "step": 980 }, { "epoch": 5.8133333333333335, "grad_norm": 0.09126868696163637, "learning_rate": 1.0321139916891281e-07, "logits/chosen": -1.9275261163711548, "logits/rejected": -1.5792516469955444, "logps/chosen": -51.00112533569336, "logps/rejected": -107.09458923339844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.374115228652954, "rewards/margins": 14.925018310546875, "rewards/rejected": -18.29913330078125, "step": 981 }, { "epoch": 5.819259259259259, "grad_norm": 0.11668536193593289, "learning_rate": 1.0268603913629858e-07, "logits/chosen": -1.510830044746399, "logits/rejected": -1.533048391342163, "logps/chosen": -37.096092224121094, "logps/rejected": -62.85737991333008, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.515608549118042, "rewards/margins": 11.164163589477539, "rewards/rejected": -12.67977237701416, "step": 982 }, { "epoch": 5.825185185185185, "grad_norm": 0.23361770696046172, "learning_rate": 1.0216167380067927e-07, "logits/chosen": -1.7323989868164062, "logits/rejected": -1.747807502746582, "logps/chosen": -34.62275695800781, "logps/rejected": -72.12071228027344, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.8253674507141113, "rewards/margins": 13.908540725708008, "rewards/rejected": -15.733907699584961, "step": 983 }, { "epoch": 5.831111111111111, "grad_norm": 0.277833354779492, "learning_rate": 1.0163830670268767e-07, "logits/chosen": -2.256314754486084, "logits/rejected": -2.0917136669158936, "logps/chosen": -52.86377716064453, "logps/rejected": -85.62020874023438, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.4890899658203125, "rewards/margins": 11.622297286987305, "rewards/rejected": -16.111387252807617, "step": 984 }, { "epoch": 5.837037037037037, "grad_norm": 0.10893117436507617, "learning_rate": 1.0111594137621613e-07, "logits/chosen": -1.61838698387146, "logits/rejected": -1.6808353662490845, "logps/chosen": -61.41318130493164, "logps/rejected": -90.2042465209961, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.989392280578613, "rewards/margins": 11.840309143066406, "rewards/rejected": -16.829700469970703, "step": 985 }, { "epoch": 5.842962962962963, "grad_norm": 0.14083883793143315, "learning_rate": 1.0059458134839277e-07, "logits/chosen": -1.809838056564331, "logits/rejected": -1.7997772693634033, "logps/chosen": -38.19802474975586, "logps/rejected": -81.2254638671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.70159912109375, "rewards/margins": 13.643616676330566, "rewards/rejected": -15.345215797424316, "step": 986 }, { "epoch": 5.848888888888889, "grad_norm": 0.2029646340596989, "learning_rate": 1.0007423013955782e-07, "logits/chosen": -1.7847646474838257, "logits/rejected": -1.6058578491210938, "logps/chosen": -45.05035400390625, "logps/rejected": -73.54215240478516, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.438556671142578, "rewards/margins": 9.645893096923828, "rewards/rejected": -13.084449768066406, "step": 987 }, { "epoch": 5.854814814814815, "grad_norm": 0.1166149245738814, "learning_rate": 9.955489126323954e-08, "logits/chosen": -1.264709234237671, "logits/rejected": -1.1719449758529663, "logps/chosen": -39.547542572021484, "logps/rejected": -72.57160949707031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.485950469970703, "rewards/margins": 12.804004669189453, "rewards/rejected": -16.289955139160156, "step": 988 }, { "epoch": 5.860740740740741, "grad_norm": 0.189855556650095, "learning_rate": 9.903656822613099e-08, "logits/chosen": -2.148167133331299, "logits/rejected": -2.095820426940918, "logps/chosen": -42.154842376708984, "logps/rejected": -80.5337142944336, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.442440986633301, "rewards/margins": 13.176408767700195, "rewards/rejected": -15.618850708007812, "step": 989 }, { "epoch": 5.866666666666667, "grad_norm": 0.1277843814429211, "learning_rate": 9.851926452806583e-08, "logits/chosen": -2.021697759628296, "logits/rejected": -1.9521626234054565, "logps/chosen": -53.14356994628906, "logps/rejected": -80.20213317871094, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.5833356380462646, "rewards/margins": 11.104471206665039, "rewards/rejected": -14.687807083129883, "step": 990 }, { "epoch": 5.872592592592593, "grad_norm": 0.10600699030812348, "learning_rate": 9.800298366199497e-08, "logits/chosen": -1.6231588125228882, "logits/rejected": -1.6354756355285645, "logps/chosen": -51.428890228271484, "logps/rejected": -94.6219711303711, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.853515148162842, "rewards/margins": 14.988431930541992, "rewards/rejected": -19.841947555541992, "step": 991 }, { "epoch": 5.8785185185185185, "grad_norm": 0.11596245744295286, "learning_rate": 9.748772911396291e-08, "logits/chosen": -1.3082904815673828, "logits/rejected": -1.233626365661621, "logps/chosen": -40.92805099487305, "logps/rejected": -65.74427032470703, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.9067976474761963, "rewards/margins": 10.380163192749023, "rewards/rejected": -14.28696060180664, "step": 992 }, { "epoch": 5.884444444444444, "grad_norm": 0.10483820314945619, "learning_rate": 9.697350436308427e-08, "logits/chosen": -1.7032501697540283, "logits/rejected": -1.6479954719543457, "logps/chosen": -39.51605987548828, "logps/rejected": -76.45269775390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.6380763053894043, "rewards/margins": 11.980184555053711, "rewards/rejected": -14.618260383605957, "step": 993 }, { "epoch": 5.890370370370371, "grad_norm": 0.11282242003298107, "learning_rate": 9.646031288152021e-08, "logits/chosen": -2.1721444129943848, "logits/rejected": -2.001124382019043, "logps/chosen": -42.80406188964844, "logps/rejected": -88.69567108154297, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4269580841064453, "rewards/margins": 12.612770080566406, "rewards/rejected": -16.03972816467285, "step": 994 }, { "epoch": 5.896296296296296, "grad_norm": 0.17922823799988089, "learning_rate": 9.5948158134455e-08, "logits/chosen": -1.8580384254455566, "logits/rejected": -1.887601375579834, "logps/chosen": -57.25939178466797, "logps/rejected": -74.04570770263672, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.449359655380249, "rewards/margins": 12.284698486328125, "rewards/rejected": -15.73405647277832, "step": 995 }, { "epoch": 5.902222222222222, "grad_norm": 0.14419271465393926, "learning_rate": 9.543704358007279e-08, "logits/chosen": -1.651800274848938, "logits/rejected": -1.4914857149124146, "logps/chosen": -36.71223449707031, "logps/rejected": -63.70142364501953, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.186145305633545, "rewards/margins": 10.299652099609375, "rewards/rejected": -11.485797882080078, "step": 996 }, { "epoch": 5.908148148148149, "grad_norm": 0.16193844665413645, "learning_rate": 9.492697266953373e-08, "logits/chosen": -1.8199375867843628, "logits/rejected": -1.6269965171813965, "logps/chosen": -47.34810256958008, "logps/rejected": -75.77220153808594, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.279019355773926, "rewards/margins": 10.331778526306152, "rewards/rejected": -14.610797882080078, "step": 997 }, { "epoch": 5.914074074074074, "grad_norm": 0.11881887411981226, "learning_rate": 9.44179488469516e-08, "logits/chosen": -1.6865270137786865, "logits/rejected": -1.2623865604400635, "logps/chosen": -44.04072952270508, "logps/rejected": -99.73695373535156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.052587032318115, "rewards/margins": 15.76125431060791, "rewards/rejected": -19.8138427734375, "step": 998 }, { "epoch": 5.92, "grad_norm": 0.10314018796390148, "learning_rate": 9.390997554936964e-08, "logits/chosen": -1.4690736532211304, "logits/rejected": -1.3281701803207397, "logps/chosen": -53.75056457519531, "logps/rejected": -96.96499633789062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.602512836456299, "rewards/margins": 14.359382629394531, "rewards/rejected": -19.961896896362305, "step": 999 }, { "epoch": 5.925925925925926, "grad_norm": 0.1513517755495629, "learning_rate": 9.340305620673778e-08, "logits/chosen": -1.65963876247406, "logits/rejected": -1.7874866724014282, "logps/chosen": -54.26740264892578, "logps/rejected": -76.15953063964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.9122443199157715, "rewards/margins": 11.574843406677246, "rewards/rejected": -15.487088203430176, "step": 1000 }, { "epoch": 5.931851851851852, "grad_norm": 0.14425098132355896, "learning_rate": 9.289719424188947e-08, "logits/chosen": -2.188652276992798, "logits/rejected": -2.323061943054199, "logps/chosen": -58.438350677490234, "logps/rejected": -85.29507446289062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.2114763259887695, "rewards/margins": 11.773921966552734, "rewards/rejected": -16.985397338867188, "step": 1001 }, { "epoch": 5.937777777777778, "grad_norm": 0.177912208496194, "learning_rate": 9.239239307051841e-08, "logits/chosen": -2.1309244632720947, "logits/rejected": -1.9711205959320068, "logps/chosen": -40.25239562988281, "logps/rejected": -60.29902267456055, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.9188445806503296, "rewards/margins": 9.246747970581055, "rewards/rejected": -11.165592193603516, "step": 1002 }, { "epoch": 5.9437037037037035, "grad_norm": 0.1561294024334232, "learning_rate": 9.18886561011557e-08, "logits/chosen": -1.3059730529785156, "logits/rejected": -1.2804628610610962, "logps/chosen": -46.57466125488281, "logps/rejected": -81.99845123291016, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.611976146697998, "rewards/margins": 11.705613136291504, "rewards/rejected": -16.317588806152344, "step": 1003 }, { "epoch": 5.94962962962963, "grad_norm": 0.2993503902397938, "learning_rate": 9.13859867351466e-08, "logits/chosen": -1.8890063762664795, "logits/rejected": -2.08756685256958, "logps/chosen": -59.0401496887207, "logps/rejected": -76.09051513671875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.641585826873779, "rewards/margins": 11.28979778289795, "rewards/rejected": -15.93138313293457, "step": 1004 }, { "epoch": 5.955555555555556, "grad_norm": 0.10981688272526485, "learning_rate": 9.088438836662777e-08, "logits/chosen": -1.4090893268585205, "logits/rejected": -1.3857533931732178, "logps/chosen": -54.12238311767578, "logps/rejected": -91.73143005371094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.333917140960693, "rewards/margins": 12.519246101379395, "rewards/rejected": -17.853164672851562, "step": 1005 }, { "epoch": 5.961481481481481, "grad_norm": 0.13439049466641606, "learning_rate": 9.038386438250414e-08, "logits/chosen": -1.2438244819641113, "logits/rejected": -1.1741102933883667, "logps/chosen": -39.77931213378906, "logps/rejected": -71.62962341308594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.493358612060547, "rewards/margins": 12.272336959838867, "rewards/rejected": -14.765695571899414, "step": 1006 }, { "epoch": 5.967407407407407, "grad_norm": 0.3528541714993207, "learning_rate": 8.988441816242629e-08, "logits/chosen": -1.874366044998169, "logits/rejected": -1.843646764755249, "logps/chosen": -46.88833999633789, "logps/rejected": -72.9251708984375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.3884449005126953, "rewards/margins": 10.11141586303711, "rewards/rejected": -13.499860763549805, "step": 1007 }, { "epoch": 5.973333333333334, "grad_norm": 0.11757249904059189, "learning_rate": 8.938605307876736e-08, "logits/chosen": -1.7174410820007324, "logits/rejected": -1.5187230110168457, "logps/chosen": -38.19911575317383, "logps/rejected": -68.39817810058594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.4435136318206787, "rewards/margins": 10.123954772949219, "rewards/rejected": -13.56746768951416, "step": 1008 }, { "epoch": 5.979259259259259, "grad_norm": 0.1236027401279087, "learning_rate": 8.888877249660052e-08, "logits/chosen": -1.4056321382522583, "logits/rejected": -1.3889738321304321, "logps/chosen": -46.628700256347656, "logps/rejected": -76.94029235839844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.220608711242676, "rewards/margins": 10.921125411987305, "rewards/rejected": -14.141735076904297, "step": 1009 }, { "epoch": 5.985185185185185, "grad_norm": 0.14761964298924235, "learning_rate": 8.839257977367609e-08, "logits/chosen": -1.4390403032302856, "logits/rejected": -1.1668685674667358, "logps/chosen": -38.51154708862305, "logps/rejected": -84.68218994140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.108654260635376, "rewards/margins": 14.42491340637207, "rewards/rejected": -16.5335693359375, "step": 1010 }, { "epoch": 5.9911111111111115, "grad_norm": 0.16578379927474013, "learning_rate": 8.789747826039893e-08, "logits/chosen": -1.4390288591384888, "logits/rejected": -1.4251270294189453, "logps/chosen": -45.86073684692383, "logps/rejected": -74.61750793457031, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.397583961486816, "rewards/margins": 10.961638450622559, "rewards/rejected": -15.359222412109375, "step": 1011 }, { "epoch": 5.997037037037037, "grad_norm": 0.23256467980724813, "learning_rate": 8.74034712998058e-08, "logits/chosen": -1.9970195293426514, "logits/rejected": -1.8830376863479614, "logps/chosen": -50.25756072998047, "logps/rejected": -91.27782440185547, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.501843452453613, "rewards/margins": 13.392400741577148, "rewards/rejected": -17.894245147705078, "step": 1012 }, { "epoch": 6.002962962962963, "grad_norm": 0.14358647909840425, "learning_rate": 8.69105622275428e-08, "logits/chosen": -1.5321400165557861, "logits/rejected": -1.4904074668884277, "logps/chosen": -34.789794921875, "logps/rejected": -68.898193359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.4096457958221436, "rewards/margins": 13.383646965026855, "rewards/rejected": -14.793292999267578, "step": 1013 }, { "epoch": 6.0088888888888885, "grad_norm": 0.12844382297364182, "learning_rate": 8.641875437184287e-08, "logits/chosen": -1.7536540031433105, "logits/rejected": -1.660245656967163, "logps/chosen": -35.77042770385742, "logps/rejected": -82.79251098632812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.1235854625701904, "rewards/margins": 14.947321891784668, "rewards/rejected": -18.070907592773438, "step": 1014 }, { "epoch": 6.014814814814815, "grad_norm": 0.05247327611483253, "learning_rate": 8.592805105350326e-08, "logits/chosen": -2.009796619415283, "logits/rejected": -1.900392770767212, "logps/chosen": -41.918434143066406, "logps/rejected": -77.7911376953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.0733580589294434, "rewards/margins": 13.40345573425293, "rewards/rejected": -15.476814270019531, "step": 1015 }, { "epoch": 6.020740740740741, "grad_norm": 0.10007031711708843, "learning_rate": 8.543845558586307e-08, "logits/chosen": -1.3085033893585205, "logits/rejected": -1.413244366645813, "logps/chosen": -42.41469955444336, "logps/rejected": -72.19366455078125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.757518768310547, "rewards/margins": 10.87429428100586, "rewards/rejected": -13.631814956665039, "step": 1016 }, { "epoch": 6.026666666666666, "grad_norm": 0.08269761942657197, "learning_rate": 8.494997127478109e-08, "logits/chosen": -1.6426849365234375, "logits/rejected": -1.4693927764892578, "logps/chosen": -46.25225830078125, "logps/rejected": -79.94685363769531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.9081649780273438, "rewards/margins": 12.332348823547363, "rewards/rejected": -16.24051284790039, "step": 1017 }, { "epoch": 6.032592592592593, "grad_norm": 0.0965133655534609, "learning_rate": 8.44626014186132e-08, "logits/chosen": -1.5757637023925781, "logits/rejected": -1.5381988286972046, "logps/chosen": -42.592041015625, "logps/rejected": -73.64163970947266, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.198291540145874, "rewards/margins": 12.328102111816406, "rewards/rejected": -15.526392936706543, "step": 1018 }, { "epoch": 6.038518518518519, "grad_norm": 0.09484154834530545, "learning_rate": 8.39763493081902e-08, "logits/chosen": -1.8162059783935547, "logits/rejected": -1.6350376605987549, "logps/chosen": -42.69355773925781, "logps/rejected": -72.34944915771484, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1462531089782715, "rewards/margins": 11.239422798156738, "rewards/rejected": -14.385677337646484, "step": 1019 }, { "epoch": 6.044444444444444, "grad_norm": 0.10897687127100829, "learning_rate": 8.349121822679589e-08, "logits/chosen": -1.4719338417053223, "logits/rejected": -1.5809326171875, "logps/chosen": -40.893314361572266, "logps/rejected": -71.21835327148438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.4168126583099365, "rewards/margins": 11.799530029296875, "rewards/rejected": -13.216341972351074, "step": 1020 }, { "epoch": 6.05037037037037, "grad_norm": 0.1015909348981711, "learning_rate": 8.300721145014434e-08, "logits/chosen": -1.1651694774627686, "logits/rejected": -1.0104308128356934, "logps/chosen": -44.911109924316406, "logps/rejected": -69.9439926147461, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.591246128082275, "rewards/margins": 12.101785659790039, "rewards/rejected": -16.693031311035156, "step": 1021 }, { "epoch": 6.0562962962962965, "grad_norm": 0.12301399347924664, "learning_rate": 8.252433224635816e-08, "logits/chosen": -1.5464973449707031, "logits/rejected": -1.5874733924865723, "logps/chosen": -44.601627349853516, "logps/rejected": -84.60299682617188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1868271827697754, "rewards/margins": 12.496557235717773, "rewards/rejected": -15.68338394165039, "step": 1022 }, { "epoch": 6.062222222222222, "grad_norm": 0.14071709766452006, "learning_rate": 8.204258387594634e-08, "logits/chosen": -1.8018912076950073, "logits/rejected": -2.0076708793640137, "logps/chosen": -55.555274963378906, "logps/rejected": -76.36517333984375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.126642227172852, "rewards/margins": 12.043182373046875, "rewards/rejected": -17.169824600219727, "step": 1023 }, { "epoch": 6.068148148148148, "grad_norm": 0.09827836009768653, "learning_rate": 8.15619695917823e-08, "logits/chosen": -1.4957119226455688, "logits/rejected": -1.577928066253662, "logps/chosen": -59.20314025878906, "logps/rejected": -71.88645935058594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.856942653656006, "rewards/margins": 11.350852966308594, "rewards/rejected": -16.207796096801758, "step": 1024 }, { "epoch": 6.074074074074074, "grad_norm": 0.11371787712657735, "learning_rate": 8.108249263908163e-08, "logits/chosen": -1.4241392612457275, "logits/rejected": -1.4908593893051147, "logps/chosen": -54.07698059082031, "logps/rejected": -98.04219055175781, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.959539890289307, "rewards/margins": 13.624971389770508, "rewards/rejected": -18.584510803222656, "step": 1025 }, { "epoch": 6.08, "grad_norm": 0.13302313121136608, "learning_rate": 8.060415625538059e-08, "logits/chosen": -1.4159773588180542, "logits/rejected": -1.369776725769043, "logps/chosen": -40.464969635009766, "logps/rejected": -70.99534606933594, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.73652982711792, "rewards/margins": 10.462564468383789, "rewards/rejected": -13.199094772338867, "step": 1026 }, { "epoch": 6.085925925925926, "grad_norm": 0.11523165898367618, "learning_rate": 8.012696367051409e-08, "logits/chosen": -1.6489384174346924, "logits/rejected": -1.7299697399139404, "logps/chosen": -45.44413375854492, "logps/rejected": -78.03041076660156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.226395606994629, "rewards/margins": 12.231207847595215, "rewards/rejected": -14.45760440826416, "step": 1027 }, { "epoch": 6.091851851851851, "grad_norm": 0.1678387929900549, "learning_rate": 7.965091810659369e-08, "logits/chosen": -1.6275016069412231, "logits/rejected": -1.4610252380371094, "logps/chosen": -38.6914176940918, "logps/rejected": -71.39549255371094, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.7591590881347656, "rewards/margins": 10.687475204467773, "rewards/rejected": -13.446634292602539, "step": 1028 }, { "epoch": 6.097777777777778, "grad_norm": 0.11180465890014177, "learning_rate": 7.917602277798612e-08, "logits/chosen": -1.6225529909133911, "logits/rejected": -1.6014906167984009, "logps/chosen": -46.327056884765625, "logps/rejected": -84.01023864746094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.628958225250244, "rewards/margins": 13.545883178710938, "rewards/rejected": -18.174840927124023, "step": 1029 }, { "epoch": 6.103703703703704, "grad_norm": 0.137214967452007, "learning_rate": 7.870228089129155e-08, "logits/chosen": -1.7564111948013306, "logits/rejected": -1.6935635805130005, "logps/chosen": -38.401329040527344, "logps/rejected": -61.10845184326172, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.4874920845031738, "rewards/margins": 11.215259552001953, "rewards/rejected": -12.702751159667969, "step": 1030 }, { "epoch": 6.109629629629629, "grad_norm": 0.1671060221459912, "learning_rate": 7.822969564532167e-08, "logits/chosen": -1.7741791009902954, "logits/rejected": -1.744217872619629, "logps/chosen": -38.02019500732422, "logps/rejected": -76.86927032470703, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.3427977561950684, "rewards/margins": 13.43283462524414, "rewards/rejected": -15.77563190460205, "step": 1031 }, { "epoch": 6.115555555555556, "grad_norm": 0.1536051341773224, "learning_rate": 7.775827023107834e-08, "logits/chosen": -1.8468081951141357, "logits/rejected": -1.5555062294006348, "logps/chosen": -42.24263000488281, "logps/rejected": -78.72967529296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.6049342155456543, "rewards/margins": 12.029871940612793, "rewards/rejected": -14.634806632995605, "step": 1032 }, { "epoch": 6.1214814814814815, "grad_norm": 0.10753768585043083, "learning_rate": 7.728800783173201e-08, "logits/chosen": -2.1814775466918945, "logits/rejected": -2.0472769737243652, "logps/chosen": -40.55844497680664, "logps/rejected": -85.31634521484375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.7944538593292236, "rewards/margins": 13.156540870666504, "rewards/rejected": -15.950995445251465, "step": 1033 }, { "epoch": 6.127407407407407, "grad_norm": 0.10291282391822812, "learning_rate": 7.681891162260015e-08, "logits/chosen": -1.929794430732727, "logits/rejected": -1.872850775718689, "logps/chosen": -39.1888427734375, "logps/rejected": -68.86357879638672, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.3173604011535645, "rewards/margins": 10.948533058166504, "rewards/rejected": -13.265893936157227, "step": 1034 }, { "epoch": 6.133333333333334, "grad_norm": 0.09519974691947208, "learning_rate": 7.635098477112587e-08, "logits/chosen": -1.5099825859069824, "logits/rejected": -1.426225185394287, "logps/chosen": -42.998443603515625, "logps/rejected": -72.8267593383789, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.638385534286499, "rewards/margins": 10.71774673461914, "rewards/rejected": -13.356132507324219, "step": 1035 }, { "epoch": 6.139259259259259, "grad_norm": 0.07922820550503175, "learning_rate": 7.588423043685646e-08, "logits/chosen": -1.6852991580963135, "logits/rejected": -1.421472430229187, "logps/chosen": -49.89509201049805, "logps/rejected": -78.23391723632812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7343664169311523, "rewards/margins": 12.89884090423584, "rewards/rejected": -15.633207321166992, "step": 1036 }, { "epoch": 6.145185185185185, "grad_norm": 0.14171862753933648, "learning_rate": 7.541865177142223e-08, "logits/chosen": -1.5457457304000854, "logits/rejected": -1.4155274629592896, "logps/chosen": -42.30841827392578, "logps/rejected": -88.99565887451172, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.4843955039978027, "rewards/margins": 15.793426513671875, "rewards/rejected": -19.277822494506836, "step": 1037 }, { "epoch": 6.151111111111111, "grad_norm": 0.14581198869002696, "learning_rate": 7.4954251918515e-08, "logits/chosen": -1.3942396640777588, "logits/rejected": -1.353884220123291, "logps/chosen": -44.77840042114258, "logps/rejected": -69.60354614257812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.442391872406006, "rewards/margins": 10.158035278320312, "rewards/rejected": -12.600425720214844, "step": 1038 }, { "epoch": 6.157037037037037, "grad_norm": 0.10854236092797581, "learning_rate": 7.449103401386702e-08, "logits/chosen": -2.0815091133117676, "logits/rejected": -1.9354617595672607, "logps/chosen": -37.36747360229492, "logps/rejected": -79.1553726196289, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.691523551940918, "rewards/margins": 12.638469696044922, "rewards/rejected": -15.329992294311523, "step": 1039 }, { "epoch": 6.162962962962963, "grad_norm": 0.12985455462798487, "learning_rate": 7.402900118522978e-08, "logits/chosen": -1.8217581510543823, "logits/rejected": -1.6328381299972534, "logps/chosen": -43.85091781616211, "logps/rejected": -96.46054077148438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.261449337005615, "rewards/margins": 17.501453399658203, "rewards/rejected": -21.76290512084961, "step": 1040 }, { "epoch": 6.168888888888889, "grad_norm": 0.12535486058516476, "learning_rate": 7.356815655235286e-08, "logits/chosen": -1.4930031299591064, "logits/rejected": -1.5464279651641846, "logps/chosen": -48.69932556152344, "logps/rejected": -79.17095184326172, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.835171222686768, "rewards/margins": 11.31641960144043, "rewards/rejected": -16.15159034729004, "step": 1041 }, { "epoch": 6.174814814814815, "grad_norm": 0.11849196685478744, "learning_rate": 7.310850322696283e-08, "logits/chosen": -1.7987161874771118, "logits/rejected": -1.7363475561141968, "logps/chosen": -35.88686752319336, "logps/rejected": -63.077423095703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.5645644664764404, "rewards/margins": 11.16557788848877, "rewards/rejected": -13.730142593383789, "step": 1042 }, { "epoch": 6.180740740740741, "grad_norm": 0.1458720969738312, "learning_rate": 7.265004431274236e-08, "logits/chosen": -1.8062262535095215, "logits/rejected": -1.7760688066482544, "logps/chosen": -41.64423370361328, "logps/rejected": -69.88961029052734, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.090489625930786, "rewards/margins": 13.364202499389648, "rewards/rejected": -15.454690933227539, "step": 1043 }, { "epoch": 6.1866666666666665, "grad_norm": 0.15732193479335854, "learning_rate": 7.219278290530909e-08, "logits/chosen": -1.9366602897644043, "logits/rejected": -1.9759104251861572, "logps/chosen": -51.57427978515625, "logps/rejected": -78.43927001953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.0921897888183594, "rewards/margins": 12.398733139038086, "rewards/rejected": -15.490922927856445, "step": 1044 }, { "epoch": 6.192592592592592, "grad_norm": 0.162687196124052, "learning_rate": 7.173672209219494e-08, "logits/chosen": -1.7788739204406738, "logits/rejected": -1.6164450645446777, "logps/chosen": -45.710723876953125, "logps/rejected": -83.01245880126953, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.6430821418762207, "rewards/margins": 12.03801441192627, "rewards/rejected": -14.681096076965332, "step": 1045 }, { "epoch": 6.198518518518519, "grad_norm": 0.12348007532130807, "learning_rate": 7.128186495282507e-08, "logits/chosen": -2.1181507110595703, "logits/rejected": -2.0345942974090576, "logps/chosen": -39.752044677734375, "logps/rejected": -77.72023010253906, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7648587226867676, "rewards/margins": 11.238019943237305, "rewards/rejected": -14.00287914276123, "step": 1046 }, { "epoch": 6.204444444444444, "grad_norm": 0.13581994327741484, "learning_rate": 7.082821455849717e-08, "logits/chosen": -1.4847235679626465, "logits/rejected": -1.4669365882873535, "logps/chosen": -53.27600860595703, "logps/rejected": -86.11213684082031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.276714324951172, "rewards/margins": 12.011651992797852, "rewards/rejected": -17.288368225097656, "step": 1047 }, { "epoch": 6.21037037037037, "grad_norm": 0.08876930704947257, "learning_rate": 7.037577397236074e-08, "logits/chosen": -1.8768885135650635, "logits/rejected": -1.6755867004394531, "logps/chosen": -52.622314453125, "logps/rejected": -87.13565063476562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.662373065948486, "rewards/margins": 13.155412673950195, "rewards/rejected": -17.817785263061523, "step": 1048 }, { "epoch": 6.216296296296297, "grad_norm": 0.15555493957770292, "learning_rate": 6.992454624939636e-08, "logits/chosen": -2.0798707008361816, "logits/rejected": -1.913529872894287, "logps/chosen": -46.629173278808594, "logps/rejected": -91.40234375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.7776923179626465, "rewards/margins": 12.142569541931152, "rewards/rejected": -15.920262336730957, "step": 1049 }, { "epoch": 6.222222222222222, "grad_norm": 0.12171810468034984, "learning_rate": 6.947453443639514e-08, "logits/chosen": -1.5369203090667725, "logits/rejected": -1.4309501647949219, "logps/chosen": -45.19132995605469, "logps/rejected": -82.57927703857422, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.4305639266967773, "rewards/margins": 12.945052146911621, "rewards/rejected": -16.3756160736084, "step": 1050 }, { "epoch": 6.228148148148148, "grad_norm": 0.18943800217365878, "learning_rate": 6.902574157193794e-08, "logits/chosen": -1.2610238790512085, "logits/rejected": -1.3482708930969238, "logps/chosen": -56.56427764892578, "logps/rejected": -74.21857452392578, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.313958168029785, "rewards/margins": 10.111957550048828, "rewards/rejected": -15.42591667175293, "step": 1051 }, { "epoch": 6.234074074074074, "grad_norm": 0.1220262778690326, "learning_rate": 6.857817068637526e-08, "logits/chosen": -1.7904787063598633, "logits/rejected": -1.857863426208496, "logps/chosen": -53.34080123901367, "logps/rejected": -70.02799224853516, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.458836317062378, "rewards/margins": 11.524408340454102, "rewards/rejected": -12.983244895935059, "step": 1052 }, { "epoch": 6.24, "grad_norm": 0.13902229219240764, "learning_rate": 6.81318248018064e-08, "logits/chosen": -1.5270187854766846, "logits/rejected": -1.3275320529937744, "logps/chosen": -49.507606506347656, "logps/rejected": -101.7363052368164, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.40425443649292, "rewards/margins": 15.39886474609375, "rewards/rejected": -18.803119659423828, "step": 1053 }, { "epoch": 6.245925925925926, "grad_norm": 0.1279638964163061, "learning_rate": 6.7686706932059e-08, "logits/chosen": -1.7549011707305908, "logits/rejected": -1.5517879724502563, "logps/chosen": -47.363651275634766, "logps/rejected": -83.65386962890625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.2582550048828125, "rewards/margins": 12.157743453979492, "rewards/rejected": -16.415998458862305, "step": 1054 }, { "epoch": 6.2518518518518515, "grad_norm": 0.09007238556981374, "learning_rate": 6.72428200826691e-08, "logits/chosen": -2.5198426246643066, "logits/rejected": -2.465240001678467, "logps/chosen": -55.144351959228516, "logps/rejected": -88.67027282714844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7151663303375244, "rewards/margins": 13.719581604003906, "rewards/rejected": -16.43474769592285, "step": 1055 }, { "epoch": 6.257777777777778, "grad_norm": 0.1288368518732618, "learning_rate": 6.680016725086052e-08, "logits/chosen": -1.9952564239501953, "logits/rejected": -1.8888041973114014, "logps/chosen": -43.91956329345703, "logps/rejected": -81.23291778564453, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.3017802238464355, "rewards/margins": 12.755228042602539, "rewards/rejected": -16.057010650634766, "step": 1056 }, { "epoch": 6.263703703703704, "grad_norm": 0.11925320073673232, "learning_rate": 6.635875142552475e-08, "logits/chosen": -1.750340223312378, "logits/rejected": -1.4810287952423096, "logps/chosen": -48.30830764770508, "logps/rejected": -90.56777954101562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.03714656829834, "rewards/margins": 14.28243637084961, "rewards/rejected": -18.319583892822266, "step": 1057 }, { "epoch": 6.269629629629629, "grad_norm": 0.14219106176572774, "learning_rate": 6.591857558720071e-08, "logits/chosen": -1.4469225406646729, "logits/rejected": -1.4411016702651978, "logps/chosen": -36.428924560546875, "logps/rejected": -62.26043701171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.5755970478057861, "rewards/margins": 11.159796714782715, "rewards/rejected": -12.735394477844238, "step": 1058 }, { "epoch": 6.275555555555556, "grad_norm": 0.14888832078955183, "learning_rate": 6.547964270805467e-08, "logits/chosen": -1.8655922412872314, "logits/rejected": -1.5784927606582642, "logps/chosen": -34.269927978515625, "logps/rejected": -79.37550354003906, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.8423409461975098, "rewards/margins": 14.538299560546875, "rewards/rejected": -16.380640029907227, "step": 1059 }, { "epoch": 6.281481481481482, "grad_norm": 0.16797342298925552, "learning_rate": 6.504195575186008e-08, "logits/chosen": -1.7597219944000244, "logits/rejected": -1.5347667932510376, "logps/chosen": -45.811729431152344, "logps/rejected": -86.42766571044922, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.779290199279785, "rewards/margins": 11.868831634521484, "rewards/rejected": -15.64812183380127, "step": 1060 }, { "epoch": 6.287407407407407, "grad_norm": 0.17570328072046215, "learning_rate": 6.460551767397784e-08, "logits/chosen": -1.8049561977386475, "logits/rejected": -1.6960246562957764, "logps/chosen": -43.38131332397461, "logps/rejected": -77.1976318359375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.0591611862182617, "rewards/margins": 13.242414474487305, "rewards/rejected": -16.301576614379883, "step": 1061 }, { "epoch": 6.293333333333333, "grad_norm": 0.12263604264249418, "learning_rate": 6.417033142133593e-08, "logits/chosen": -1.7794426679611206, "logits/rejected": -1.6573376655578613, "logps/chosen": -36.931121826171875, "logps/rejected": -74.627685546875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.626795530319214, "rewards/margins": 12.069509506225586, "rewards/rejected": -14.696305274963379, "step": 1062 }, { "epoch": 6.29925925925926, "grad_norm": 0.1182645145918132, "learning_rate": 6.37363999324098e-08, "logits/chosen": -1.6106207370758057, "logits/rejected": -1.441240906715393, "logps/chosen": -37.177764892578125, "logps/rejected": -76.4002914428711, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.1715197563171387, "rewards/margins": 14.079158782958984, "rewards/rejected": -16.25067901611328, "step": 1063 }, { "epoch": 6.305185185185185, "grad_norm": 0.1829491566195646, "learning_rate": 6.330372613720247e-08, "logits/chosen": -1.360914945602417, "logits/rejected": -1.3094801902770996, "logps/chosen": -45.058780670166016, "logps/rejected": -70.208984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.909440755844116, "rewards/margins": 9.265682220458984, "rewards/rejected": -13.175124168395996, "step": 1064 }, { "epoch": 6.311111111111111, "grad_norm": 0.10702577184483547, "learning_rate": 6.28723129572247e-08, "logits/chosen": -1.6841546297073364, "logits/rejected": -1.691527247428894, "logps/chosen": -57.962867736816406, "logps/rejected": -83.77704620361328, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.677487373352051, "rewards/margins": 12.261930465698242, "rewards/rejected": -15.939416885375977, "step": 1065 }, { "epoch": 6.3170370370370375, "grad_norm": 0.10785794121226135, "learning_rate": 6.244216330547533e-08, "logits/chosen": -1.9304916858673096, "logits/rejected": -1.8183112144470215, "logps/chosen": -39.911956787109375, "logps/rejected": -63.655853271484375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.665860891342163, "rewards/margins": 12.657873153686523, "rewards/rejected": -15.323734283447266, "step": 1066 }, { "epoch": 6.322962962962963, "grad_norm": 0.1432749380231685, "learning_rate": 6.201328008642159e-08, "logits/chosen": -1.8575468063354492, "logits/rejected": -1.846190333366394, "logps/chosen": -42.935150146484375, "logps/rejected": -63.82366943359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.917294502258301, "rewards/margins": 11.173959732055664, "rewards/rejected": -14.091255187988281, "step": 1067 }, { "epoch": 6.328888888888889, "grad_norm": 0.1765937601001302, "learning_rate": 6.158566619597932e-08, "logits/chosen": -1.4488952159881592, "logits/rejected": -1.4074013233184814, "logps/chosen": -36.881935119628906, "logps/rejected": -72.05622863769531, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.9991321563720703, "rewards/margins": 12.601561546325684, "rewards/rejected": -16.60069465637207, "step": 1068 }, { "epoch": 6.3348148148148145, "grad_norm": 0.15153245583303798, "learning_rate": 6.115932452149372e-08, "logits/chosen": -1.8595564365386963, "logits/rejected": -1.6728187799453735, "logps/chosen": -38.90936279296875, "logps/rejected": -69.89384460449219, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.4666457176208496, "rewards/margins": 11.830467224121094, "rewards/rejected": -13.297114372253418, "step": 1069 }, { "epoch": 6.340740740740741, "grad_norm": 0.11794101889037722, "learning_rate": 6.07342579417196e-08, "logits/chosen": -1.581072449684143, "logits/rejected": -1.4845892190933228, "logps/chosen": -43.72598648071289, "logps/rejected": -79.68162536621094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1735429763793945, "rewards/margins": 13.215681076049805, "rewards/rejected": -16.389225006103516, "step": 1070 }, { "epoch": 6.346666666666667, "grad_norm": 0.08378610393720609, "learning_rate": 6.031046932680229e-08, "logits/chosen": -2.0058701038360596, "logits/rejected": -1.950735092163086, "logps/chosen": -50.23748016357422, "logps/rejected": -87.69313049316406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.4842119216918945, "rewards/margins": 12.081066131591797, "rewards/rejected": -16.565277099609375, "step": 1071 }, { "epoch": 6.352592592592592, "grad_norm": 0.0721461908064281, "learning_rate": 5.988796153825768e-08, "logits/chosen": -1.5199872255325317, "logits/rejected": -1.5319875478744507, "logps/chosen": -61.35587692260742, "logps/rejected": -80.61036682128906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.392558574676514, "rewards/margins": 11.08071231842041, "rewards/rejected": -16.473270416259766, "step": 1072 }, { "epoch": 6.358518518518519, "grad_norm": 0.1428585794653907, "learning_rate": 5.9466737428953444e-08, "logits/chosen": -2.0055718421936035, "logits/rejected": -1.690445899963379, "logps/chosen": -50.62556457519531, "logps/rejected": -103.78714752197266, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.993103981018066, "rewards/margins": 14.962833404541016, "rewards/rejected": -19.955936431884766, "step": 1073 }, { "epoch": 6.364444444444445, "grad_norm": 0.09808909586979048, "learning_rate": 5.9046799843089464e-08, "logits/chosen": -1.5556331872940063, "logits/rejected": -1.5716986656188965, "logps/chosen": -37.94038772583008, "logps/rejected": -64.6926040649414, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.3914966583251953, "rewards/margins": 11.77543830871582, "rewards/rejected": -14.166935920715332, "step": 1074 }, { "epoch": 6.37037037037037, "grad_norm": 0.08266518492599137, "learning_rate": 5.862815161617879e-08, "logits/chosen": -1.463660717010498, "logits/rejected": -1.4374220371246338, "logps/chosen": -50.42168045043945, "logps/rejected": -88.21818542480469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8865365982055664, "rewards/margins": 14.44270133972168, "rewards/rejected": -17.329238891601562, "step": 1075 }, { "epoch": 6.376296296296296, "grad_norm": 0.09520254743607896, "learning_rate": 5.8210795575028395e-08, "logits/chosen": -1.9621179103851318, "logits/rejected": -1.7960467338562012, "logps/chosen": -48.25514221191406, "logps/rejected": -96.09195709228516, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.190402507781982, "rewards/margins": 13.114240646362305, "rewards/rejected": -17.304643630981445, "step": 1076 }, { "epoch": 6.3822222222222225, "grad_norm": 0.10568535832114431, "learning_rate": 5.7794734537720156e-08, "logits/chosen": -1.6859076023101807, "logits/rejected": -1.7142068147659302, "logps/chosen": -60.96441650390625, "logps/rejected": -74.63279724121094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.792672157287598, "rewards/margins": 9.826908111572266, "rewards/rejected": -14.61958122253418, "step": 1077 }, { "epoch": 6.388148148148148, "grad_norm": 0.12543603244981047, "learning_rate": 5.7379971313591736e-08, "logits/chosen": -1.7731202840805054, "logits/rejected": -1.796823263168335, "logps/chosen": -63.01026916503906, "logps/rejected": -96.41645812988281, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.990747928619385, "rewards/margins": 12.424217224121094, "rewards/rejected": -18.41496467590332, "step": 1078 }, { "epoch": 6.394074074074074, "grad_norm": 0.1373856939972385, "learning_rate": 5.69665087032177e-08, "logits/chosen": -1.725167989730835, "logits/rejected": -1.743959903717041, "logps/chosen": -42.21112060546875, "logps/rejected": -65.99535369873047, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9228241443634033, "rewards/margins": 9.872057914733887, "rewards/rejected": -12.794881820678711, "step": 1079 }, { "epoch": 6.4, "grad_norm": 0.07565936247852624, "learning_rate": 5.6554349498390606e-08, "logits/chosen": -1.7092314958572388, "logits/rejected": -1.7479329109191895, "logps/chosen": -42.521568298339844, "logps/rejected": -76.67975616455078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.7571520805358887, "rewards/margins": 11.863056182861328, "rewards/rejected": -15.620209693908691, "step": 1080 }, { "epoch": 6.405925925925926, "grad_norm": 0.11025606213310722, "learning_rate": 5.614349648210212e-08, "logits/chosen": -1.960831642150879, "logits/rejected": -1.9402929544448853, "logps/chosen": -46.01992416381836, "logps/rejected": -71.65948486328125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.4495725631713867, "rewards/margins": 11.952836990356445, "rewards/rejected": -14.402410507202148, "step": 1081 }, { "epoch": 6.411851851851852, "grad_norm": 0.11391905861785986, "learning_rate": 5.573395242852416e-08, "logits/chosen": -1.9040777683258057, "logits/rejected": -1.6277384757995605, "logps/chosen": -45.58112335205078, "logps/rejected": -93.3453598022461, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.9349770545959473, "rewards/margins": 14.115599632263184, "rewards/rejected": -18.05057716369629, "step": 1082 }, { "epoch": 6.417777777777777, "grad_norm": 0.15868471217100444, "learning_rate": 5.532572010299033e-08, "logits/chosen": -1.5244793891906738, "logits/rejected": -1.5053770542144775, "logps/chosen": -38.31887435913086, "logps/rejected": -68.69171142578125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.4086740016937256, "rewards/margins": 10.506559371948242, "rewards/rejected": -11.91523265838623, "step": 1083 }, { "epoch": 6.423703703703704, "grad_norm": 0.11512703826874703, "learning_rate": 5.4918802261977067e-08, "logits/chosen": -1.4734156131744385, "logits/rejected": -1.5725305080413818, "logps/chosen": -47.85783386230469, "logps/rejected": -72.31632995605469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.557512044906616, "rewards/margins": 10.896509170532227, "rewards/rejected": -14.454021453857422, "step": 1084 }, { "epoch": 6.42962962962963, "grad_norm": 0.13099659731085148, "learning_rate": 5.451320165308518e-08, "logits/chosen": -1.9137905836105347, "logits/rejected": -1.7296216487884521, "logps/chosen": -47.043785095214844, "logps/rejected": -89.14530944824219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.782802581787109, "rewards/margins": 12.57824420928955, "rewards/rejected": -17.361045837402344, "step": 1085 }, { "epoch": 6.435555555555555, "grad_norm": 0.15877214990532035, "learning_rate": 5.410892101502118e-08, "logits/chosen": -1.1672827005386353, "logits/rejected": -1.379494309425354, "logps/chosen": -50.326271057128906, "logps/rejected": -83.04608154296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.681979179382324, "rewards/margins": 14.329689979553223, "rewards/rejected": -19.011669158935547, "step": 1086 }, { "epoch": 6.441481481481482, "grad_norm": 0.08272935062465174, "learning_rate": 5.370596307757885e-08, "logits/chosen": -1.2756975889205933, "logits/rejected": -1.0763273239135742, "logps/chosen": -39.87290954589844, "logps/rejected": -84.71408081054688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9266140460968018, "rewards/margins": 14.486490249633789, "rewards/rejected": -17.413105010986328, "step": 1087 }, { "epoch": 6.4474074074074075, "grad_norm": 0.14231261967741088, "learning_rate": 5.330433056162084e-08, "logits/chosen": -1.8586760759353638, "logits/rejected": -1.7465415000915527, "logps/chosen": -44.88026428222656, "logps/rejected": -62.242008209228516, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.546351432800293, "rewards/margins": 10.57691478729248, "rewards/rejected": -13.123266220092773, "step": 1088 }, { "epoch": 6.453333333333333, "grad_norm": 0.1266739271924691, "learning_rate": 5.29040261790602e-08, "logits/chosen": -1.321254849433899, "logits/rejected": -1.3049745559692383, "logps/chosen": -63.391944885253906, "logps/rejected": -92.37260437011719, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.090176105499268, "rewards/margins": 13.020833015441895, "rewards/rejected": -17.11100959777832, "step": 1089 }, { "epoch": 6.459259259259259, "grad_norm": 0.1378265215323224, "learning_rate": 5.2505052632842187e-08, "logits/chosen": -2.24951171875, "logits/rejected": -2.298269033432007, "logps/chosen": -40.54319381713867, "logps/rejected": -66.77867889404297, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.7393558025360107, "rewards/margins": 11.914773941040039, "rewards/rejected": -13.654130935668945, "step": 1090 }, { "epoch": 6.465185185185185, "grad_norm": 0.06869704455843761, "learning_rate": 5.210741261692586e-08, "logits/chosen": -2.1489977836608887, "logits/rejected": -2.0202672481536865, "logps/chosen": -35.78162384033203, "logps/rejected": -75.97494506835938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.267564535140991, "rewards/margins": 13.613672256469727, "rewards/rejected": -15.881235122680664, "step": 1091 }, { "epoch": 6.471111111111111, "grad_norm": 0.11844219734103974, "learning_rate": 5.171110881626603e-08, "logits/chosen": -1.2343411445617676, "logits/rejected": -1.0903527736663818, "logps/chosen": -46.51551055908203, "logps/rejected": -68.22870635986328, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.7705905437469482, "rewards/margins": 10.409194946289062, "rewards/rejected": -14.179784774780273, "step": 1092 }, { "epoch": 6.477037037037037, "grad_norm": 0.0798285794405465, "learning_rate": 5.1316143906795175e-08, "logits/chosen": -1.5782904624938965, "logits/rejected": -1.5185699462890625, "logps/chosen": -50.782135009765625, "logps/rejected": -86.63662719726562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.781415939331055, "rewards/margins": 13.581457138061523, "rewards/rejected": -18.362873077392578, "step": 1093 }, { "epoch": 6.482962962962963, "grad_norm": 0.08253145895304921, "learning_rate": 5.092252055540513e-08, "logits/chosen": -2.0359816551208496, "logits/rejected": -1.9892759323120117, "logps/chosen": -48.8431396484375, "logps/rejected": -75.51835632324219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.010666608810425, "rewards/margins": 10.410385131835938, "rewards/rejected": -13.421051979064941, "step": 1094 }, { "epoch": 6.488888888888889, "grad_norm": 0.16430761634687271, "learning_rate": 5.053024141992934e-08, "logits/chosen": -1.6572614908218384, "logits/rejected": -1.7257788181304932, "logps/chosen": -35.166404724121094, "logps/rejected": -56.80643844604492, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.8248682022094727, "rewards/margins": 10.291166305541992, "rewards/rejected": -12.116033554077148, "step": 1095 }, { "epoch": 6.494814814814815, "grad_norm": 0.11363606507794657, "learning_rate": 5.013930914912476e-08, "logits/chosen": -1.8092420101165771, "logits/rejected": -1.643286943435669, "logps/chosen": -36.295989990234375, "logps/rejected": -69.26307678222656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8994760513305664, "rewards/margins": 11.408160209655762, "rewards/rejected": -14.307637214660645, "step": 1096 }, { "epoch": 6.50074074074074, "grad_norm": 0.1640644182662116, "learning_rate": 4.97497263826539e-08, "logits/chosen": -1.8906135559082031, "logits/rejected": -1.6718571186065674, "logps/chosen": -32.947410583496094, "logps/rejected": -79.05558776855469, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.182114601135254, "rewards/margins": 13.58568000793457, "rewards/rejected": -14.767793655395508, "step": 1097 }, { "epoch": 6.506666666666667, "grad_norm": 0.21762442747119778, "learning_rate": 4.936149575106727e-08, "logits/chosen": -2.068748950958252, "logits/rejected": -1.930734395980835, "logps/chosen": -56.75259780883789, "logps/rejected": -74.71015930175781, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.925926923751831, "rewards/margins": 10.583144187927246, "rewards/rejected": -14.509071350097656, "step": 1098 }, { "epoch": 6.5125925925925925, "grad_norm": 0.1661638343862068, "learning_rate": 4.897461987578541e-08, "logits/chosen": -1.9975709915161133, "logits/rejected": -2.0516114234924316, "logps/chosen": -34.44652557373047, "logps/rejected": -62.443790435791016, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.581756353378296, "rewards/margins": 11.710376739501953, "rewards/rejected": -13.292133331298828, "step": 1099 }, { "epoch": 6.518518518518518, "grad_norm": 0.11832971936961147, "learning_rate": 4.8589101369081235e-08, "logits/chosen": -2.3408203125, "logits/rejected": -2.202461004257202, "logps/chosen": -41.1694450378418, "logps/rejected": -70.74188232421875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.2778067588806152, "rewards/margins": 10.766231536865234, "rewards/rejected": -13.044036865234375, "step": 1100 }, { "epoch": 6.524444444444445, "grad_norm": 0.11115353794278574, "learning_rate": 4.8204942834062373e-08, "logits/chosen": -1.5817288160324097, "logits/rejected": -1.3780860900878906, "logps/chosen": -32.763893127441406, "logps/rejected": -64.86897277832031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.493917942047119, "rewards/margins": 10.884139060974121, "rewards/rejected": -13.378057479858398, "step": 1101 }, { "epoch": 6.53037037037037, "grad_norm": 0.0986118294670484, "learning_rate": 4.7822146864653744e-08, "logits/chosen": -1.5795146226882935, "logits/rejected": -1.5201057195663452, "logps/chosen": -52.62881088256836, "logps/rejected": -87.83228302001953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.7620527744293213, "rewards/margins": 13.284611701965332, "rewards/rejected": -17.04666519165039, "step": 1102 }, { "epoch": 6.536296296296296, "grad_norm": 0.1151527660718973, "learning_rate": 4.744071604557978e-08, "logits/chosen": -1.3033947944641113, "logits/rejected": -1.3502681255340576, "logps/chosen": -41.56209945678711, "logps/rejected": -65.2261734008789, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.61061954498291, "rewards/margins": 9.524845123291016, "rewards/rejected": -13.135465621948242, "step": 1103 }, { "epoch": 6.542222222222223, "grad_norm": 0.11668221228478101, "learning_rate": 4.706065295234718e-08, "logits/chosen": -1.6934748888015747, "logits/rejected": -1.5031492710113525, "logps/chosen": -42.92441177368164, "logps/rejected": -85.13064575195312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.7842092514038086, "rewards/margins": 12.543503761291504, "rewards/rejected": -15.327713012695312, "step": 1104 }, { "epoch": 6.548148148148148, "grad_norm": 0.08872367783103548, "learning_rate": 4.668196015122736e-08, "logits/chosen": -1.4549543857574463, "logits/rejected": -1.4374938011169434, "logps/chosen": -45.51918411254883, "logps/rejected": -70.88013458251953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.88723087310791, "rewards/margins": 9.359696388244629, "rewards/rejected": -13.246927261352539, "step": 1105 }, { "epoch": 6.554074074074074, "grad_norm": 0.09173901658647882, "learning_rate": 4.630464019923932e-08, "logits/chosen": -1.9627599716186523, "logits/rejected": -1.9463365077972412, "logps/chosen": -40.94268798828125, "logps/rejected": -73.16978454589844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.460191488265991, "rewards/margins": 12.181102752685547, "rewards/rejected": -14.641294479370117, "step": 1106 }, { "epoch": 6.5600000000000005, "grad_norm": 0.08523040261229967, "learning_rate": 4.5928695644132266e-08, "logits/chosen": -1.5671050548553467, "logits/rejected": -1.4865435361862183, "logps/chosen": -39.919410705566406, "logps/rejected": -78.70149230957031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.508589506149292, "rewards/margins": 13.942964553833008, "rewards/rejected": -17.451553344726562, "step": 1107 }, { "epoch": 6.565925925925926, "grad_norm": 0.11606576737889197, "learning_rate": 4.5554129024368334e-08, "logits/chosen": -1.3970017433166504, "logits/rejected": -1.3437788486480713, "logps/chosen": -45.93317413330078, "logps/rejected": -86.33053588867188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.457387924194336, "rewards/margins": 13.490986824035645, "rewards/rejected": -17.948373794555664, "step": 1108 }, { "epoch": 6.571851851851852, "grad_norm": 0.14023652410181175, "learning_rate": 4.5180942869105594e-08, "logits/chosen": -1.8171439170837402, "logits/rejected": -1.9706523418426514, "logps/chosen": -50.49400329589844, "logps/rejected": -75.61033630371094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.006141662597656, "rewards/margins": 11.371904373168945, "rewards/rejected": -16.3780460357666, "step": 1109 }, { "epoch": 6.5777777777777775, "grad_norm": 0.15347892294400567, "learning_rate": 4.480913969818098e-08, "logits/chosen": -1.5099637508392334, "logits/rejected": -1.4124467372894287, "logps/chosen": -46.44249725341797, "logps/rejected": -85.35438537597656, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.218708038330078, "rewards/margins": 13.186939239501953, "rewards/rejected": -17.40564727783203, "step": 1110 }, { "epoch": 6.583703703703704, "grad_norm": 0.13839792487158512, "learning_rate": 4.4438722022092925e-08, "logits/chosen": -1.5241343975067139, "logits/rejected": -1.473794937133789, "logps/chosen": -47.51785659790039, "logps/rejected": -78.66014099121094, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.042856216430664, "rewards/margins": 13.097108840942383, "rewards/rejected": -17.139965057373047, "step": 1111 }, { "epoch": 6.58962962962963, "grad_norm": 0.10521400647939427, "learning_rate": 4.406969234198507e-08, "logits/chosen": -1.8187470436096191, "logits/rejected": -1.762420654296875, "logps/chosen": -46.16997528076172, "logps/rejected": -89.59684753417969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4138479232788086, "rewards/margins": 14.086780548095703, "rewards/rejected": -17.500629425048828, "step": 1112 }, { "epoch": 6.595555555555555, "grad_norm": 0.19039851900446791, "learning_rate": 4.370205314962872e-08, "logits/chosen": -1.5640621185302734, "logits/rejected": -1.5517284870147705, "logps/chosen": -49.66551971435547, "logps/rejected": -68.79605865478516, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.7969613075256348, "rewards/margins": 9.614969253540039, "rewards/rejected": -13.411931037902832, "step": 1113 }, { "epoch": 6.601481481481482, "grad_norm": 0.11915859572952245, "learning_rate": 4.333580692740643e-08, "logits/chosen": -1.5478522777557373, "logits/rejected": -1.3028783798217773, "logps/chosen": -29.00416374206543, "logps/rejected": -68.0338134765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.0459089279174805, "rewards/margins": 11.805730819702148, "rewards/rejected": -12.851640701293945, "step": 1114 }, { "epoch": 6.607407407407408, "grad_norm": 0.11920592216417088, "learning_rate": 4.2970956148295075e-08, "logits/chosen": -1.2512449026107788, "logits/rejected": -1.1777236461639404, "logps/chosen": -33.25419235229492, "logps/rejected": -63.37179946899414, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.9790241718292236, "rewards/margins": 10.882867813110352, "rewards/rejected": -12.861892700195312, "step": 1115 }, { "epoch": 6.613333333333333, "grad_norm": 0.13864941881061246, "learning_rate": 4.260750327584911e-08, "logits/chosen": -1.652343988418579, "logits/rejected": -1.5353381633758545, "logps/chosen": -45.20256042480469, "logps/rejected": -73.66735076904297, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.5702972412109375, "rewards/margins": 11.877695083618164, "rewards/rejected": -14.447990417480469, "step": 1116 }, { "epoch": 6.619259259259259, "grad_norm": 0.11481299769727026, "learning_rate": 4.2245450764184095e-08, "logits/chosen": -1.8509702682495117, "logits/rejected": -1.8344902992248535, "logps/chosen": -48.48194122314453, "logps/rejected": -85.86804962158203, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.4326438903808594, "rewards/margins": 13.968793869018555, "rewards/rejected": -16.401439666748047, "step": 1117 }, { "epoch": 6.6251851851851855, "grad_norm": 0.101719771379674, "learning_rate": 4.188480105796005e-08, "logits/chosen": -1.7616229057312012, "logits/rejected": -1.6706197261810303, "logps/chosen": -38.84326934814453, "logps/rejected": -66.91342163085938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.4328653812408447, "rewards/margins": 11.19166374206543, "rewards/rejected": -13.624527931213379, "step": 1118 }, { "epoch": 6.631111111111111, "grad_norm": 0.11980446027636844, "learning_rate": 4.1525556592364843e-08, "logits/chosen": -2.0169639587402344, "logits/rejected": -2.028402805328369, "logps/chosen": -58.9628791809082, "logps/rejected": -84.61741638183594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.251021385192871, "rewards/margins": 11.501065254211426, "rewards/rejected": -16.752086639404297, "step": 1119 }, { "epoch": 6.637037037037037, "grad_norm": 0.18917276649888526, "learning_rate": 4.116771979309797e-08, "logits/chosen": -1.4528882503509521, "logits/rejected": -1.2737305164337158, "logps/chosen": -33.34897994995117, "logps/rejected": -87.13203430175781, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.971200466156006, "rewards/margins": 14.857851028442383, "rewards/rejected": -17.829051971435547, "step": 1120 }, { "epoch": 6.642962962962963, "grad_norm": 0.10275853238339516, "learning_rate": 4.081129307635389e-08, "logits/chosen": -1.6249778270721436, "logits/rejected": -1.575737476348877, "logps/chosen": -39.56550598144531, "logps/rejected": -67.03308868408203, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.602834701538086, "rewards/margins": 10.713581085205078, "rewards/rejected": -13.316415786743164, "step": 1121 }, { "epoch": 6.648888888888889, "grad_norm": 0.1430084669140178, "learning_rate": 4.045627884880606e-08, "logits/chosen": -1.4841854572296143, "logits/rejected": -1.1887649297714233, "logps/chosen": -48.28791427612305, "logps/rejected": -97.2742919921875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.086124897003174, "rewards/margins": 15.608391761779785, "rewards/rejected": -18.694517135620117, "step": 1122 }, { "epoch": 6.654814814814815, "grad_norm": 0.09521747618211378, "learning_rate": 4.010267950759025e-08, "logits/chosen": -1.9114545583724976, "logits/rejected": -1.8105525970458984, "logps/chosen": -42.81127166748047, "logps/rejected": -84.86613464355469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.7864162921905518, "rewards/margins": 13.881184577941895, "rewards/rejected": -15.667600631713867, "step": 1123 }, { "epoch": 6.66074074074074, "grad_norm": 0.07782578972378952, "learning_rate": 3.9750497440288935e-08, "logits/chosen": -1.5309398174285889, "logits/rejected": -1.6354602575302124, "logps/chosen": -57.512237548828125, "logps/rejected": -79.76363372802734, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.54788875579834, "rewards/margins": 10.909296035766602, "rewards/rejected": -16.457183837890625, "step": 1124 }, { "epoch": 6.666666666666667, "grad_norm": 0.13416727252612903, "learning_rate": 3.9399735024914475e-08, "logits/chosen": -1.5638493299484253, "logits/rejected": -1.5976860523223877, "logps/chosen": -40.705963134765625, "logps/rejected": -60.32302474975586, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.148594379425049, "rewards/margins": 9.42080307006836, "rewards/rejected": -12.56939697265625, "step": 1125 }, { "epoch": 6.672592592592593, "grad_norm": 0.10613156696033546, "learning_rate": 3.905039462989365e-08, "logits/chosen": -1.9716284275054932, "logits/rejected": -1.9705533981323242, "logps/chosen": -50.73835754394531, "logps/rejected": -82.14369201660156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.9389829635620117, "rewards/margins": 13.061878204345703, "rewards/rejected": -17.00086212158203, "step": 1126 }, { "epoch": 6.678518518518518, "grad_norm": 0.22781953538865385, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -1.71630859375, "logits/rejected": -1.6615371704101562, "logps/chosen": -33.904579162597656, "logps/rejected": -61.124977111816406, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.4092910587787628, "rewards/margins": 9.47354793548584, "rewards/rejected": -9.88283920288086, "step": 1127 }, { "epoch": 6.684444444444445, "grad_norm": 0.1070585196057158, "learning_rate": 3.835598932659476e-08, "logits/chosen": -1.8027238845825195, "logits/rejected": -1.7290318012237549, "logps/chosen": -49.52033996582031, "logps/rejected": -89.80229949951172, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.248178482055664, "rewards/margins": 14.71170425415039, "rewards/rejected": -17.959882736206055, "step": 1128 }, { "epoch": 6.6903703703703705, "grad_norm": 0.09133588650231188, "learning_rate": 3.801092910709749e-08, "logits/chosen": -1.7302844524383545, "logits/rejected": -1.5114187002182007, "logps/chosen": -43.30107498168945, "logps/rejected": -77.09686279296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.723008155822754, "rewards/margins": 10.668754577636719, "rewards/rejected": -13.391761779785156, "step": 1129 }, { "epoch": 6.696296296296296, "grad_norm": 0.09223332224300108, "learning_rate": 3.766730028548376e-08, "logits/chosen": -1.7790238857269287, "logits/rejected": -1.750356912612915, "logps/chosen": -45.946449279785156, "logps/rejected": -80.86420440673828, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.758995532989502, "rewards/margins": 12.887953758239746, "rewards/rejected": -16.646949768066406, "step": 1130 }, { "epoch": 6.702222222222222, "grad_norm": 0.19653263190443007, "learning_rate": 3.732510518201265e-08, "logits/chosen": -1.8037636280059814, "logits/rejected": -1.7017388343811035, "logps/chosen": -56.00579833984375, "logps/rejected": -79.54396057128906, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.373067855834961, "rewards/margins": 11.808576583862305, "rewards/rejected": -17.181644439697266, "step": 1131 }, { "epoch": 6.708148148148148, "grad_norm": 0.1298054144861316, "learning_rate": 3.698434610726245e-08, "logits/chosen": -1.7327450513839722, "logits/rejected": -1.633286952972412, "logps/chosen": -41.85569381713867, "logps/rejected": -84.49916076660156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.524967670440674, "rewards/margins": 14.326324462890625, "rewards/rejected": -17.85129165649414, "step": 1132 }, { "epoch": 6.714074074074074, "grad_norm": 0.09808405051715205, "learning_rate": 3.6645025362115e-08, "logits/chosen": -1.9530103206634521, "logits/rejected": -1.9716355800628662, "logps/chosen": -51.91473388671875, "logps/rejected": -81.49636840820312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.951500415802002, "rewards/margins": 12.983062744140625, "rewards/rejected": -15.934563636779785, "step": 1133 }, { "epoch": 6.72, "grad_norm": 0.08128562051403439, "learning_rate": 3.630714523774042e-08, "logits/chosen": -1.4830005168914795, "logits/rejected": -1.3773740530014038, "logps/chosen": -46.95196533203125, "logps/rejected": -93.6692886352539, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.364588737487793, "rewards/margins": 13.5150785446167, "rewards/rejected": -17.879667282104492, "step": 1134 }, { "epoch": 6.725925925925926, "grad_norm": 0.10695265097038231, "learning_rate": 3.597070801558122e-08, "logits/chosen": -1.9932217597961426, "logits/rejected": -1.7008832693099976, "logps/chosen": -39.68993377685547, "logps/rejected": -88.78868103027344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.271031379699707, "rewards/margins": 15.981220245361328, "rewards/rejected": -19.25225257873535, "step": 1135 }, { "epoch": 6.731851851851852, "grad_norm": 0.1242820848290576, "learning_rate": 3.563571596733722e-08, "logits/chosen": -1.7718029022216797, "logits/rejected": -1.7178691625595093, "logps/chosen": -47.097042083740234, "logps/rejected": -83.14295196533203, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.281190395355225, "rewards/margins": 11.175911903381348, "rewards/rejected": -15.45710277557373, "step": 1136 }, { "epoch": 6.737777777777778, "grad_norm": 0.09366512262058838, "learning_rate": 3.530217135495006e-08, "logits/chosen": -2.007751703262329, "logits/rejected": -1.9636316299438477, "logps/chosen": -38.424827575683594, "logps/rejected": -86.53182983398438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.3459277153015137, "rewards/margins": 14.052453994750977, "rewards/rejected": -17.398380279541016, "step": 1137 }, { "epoch": 6.743703703703703, "grad_norm": 0.16347283237886825, "learning_rate": 3.4970076430588027e-08, "logits/chosen": -2.208000659942627, "logits/rejected": -2.010059118270874, "logps/chosen": -32.67522430419922, "logps/rejected": -93.86961364746094, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.409874200820923, "rewards/margins": 14.899282455444336, "rewards/rejected": -17.30915641784668, "step": 1138 }, { "epoch": 6.74962962962963, "grad_norm": 0.06627283701796888, "learning_rate": 3.463943343663065e-08, "logits/chosen": -1.7873049974441528, "logits/rejected": -1.7198983430862427, "logps/chosen": -48.43455505371094, "logps/rejected": -89.15277862548828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.063439846038818, "rewards/margins": 12.770740509033203, "rewards/rejected": -16.834178924560547, "step": 1139 }, { "epoch": 6.7555555555555555, "grad_norm": 0.12790584904297592, "learning_rate": 3.4310244605653795e-08, "logits/chosen": -1.747499942779541, "logits/rejected": -1.7736481428146362, "logps/chosen": -59.638973236083984, "logps/rejected": -84.97341918945312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.622666835784912, "rewards/margins": 12.510490417480469, "rewards/rejected": -17.13315773010254, "step": 1140 }, { "epoch": 6.761481481481481, "grad_norm": 0.054293900686124816, "learning_rate": 3.3982512160414505e-08, "logits/chosen": -1.5526971817016602, "logits/rejected": -1.3642184734344482, "logps/chosen": -51.45740509033203, "logps/rejected": -93.18586730957031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.02336311340332, "rewards/margins": 13.009660720825195, "rewards/rejected": -17.033023834228516, "step": 1141 }, { "epoch": 6.767407407407408, "grad_norm": 0.11040010049183906, "learning_rate": 3.365623831383599e-08, "logits/chosen": -1.9688798189163208, "logits/rejected": -1.9011162519454956, "logps/chosen": -44.33605194091797, "logps/rejected": -81.40846252441406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.4609012603759766, "rewards/margins": 13.169167518615723, "rewards/rejected": -16.630069732666016, "step": 1142 }, { "epoch": 6.773333333333333, "grad_norm": 0.17505663368553703, "learning_rate": 3.3331425268992547e-08, "logits/chosen": -1.7000912427902222, "logits/rejected": -1.7651917934417725, "logps/chosen": -37.774322509765625, "logps/rejected": -68.81632995605469, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.9970366954803467, "rewards/margins": 11.976476669311523, "rewards/rejected": -14.973514556884766, "step": 1143 }, { "epoch": 6.779259259259259, "grad_norm": 0.16025890402331744, "learning_rate": 3.3008075219095045e-08, "logits/chosen": -2.3175556659698486, "logits/rejected": -2.2146213054656982, "logps/chosen": -54.18362808227539, "logps/rejected": -82.90629577636719, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.281198263168335, "rewards/margins": 11.708627700805664, "rewards/rejected": -13.989827156066895, "step": 1144 }, { "epoch": 6.785185185185185, "grad_norm": 0.09533316032443495, "learning_rate": 3.268619034747566e-08, "logits/chosen": -1.9728548526763916, "logits/rejected": -1.731310248374939, "logps/chosen": -43.94758224487305, "logps/rejected": -86.31636047363281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.777540683746338, "rewards/margins": 13.461470603942871, "rewards/rejected": -18.239011764526367, "step": 1145 }, { "epoch": 6.791111111111111, "grad_norm": 0.20000749435302137, "learning_rate": 3.236577282757347e-08, "logits/chosen": -1.510556936264038, "logits/rejected": -1.6342051029205322, "logps/chosen": -50.926918029785156, "logps/rejected": -73.72340393066406, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.107477188110352, "rewards/margins": 12.7825927734375, "rewards/rejected": -16.89006996154785, "step": 1146 }, { "epoch": 6.797037037037037, "grad_norm": 0.13788508775382483, "learning_rate": 3.204682482291959e-08, "logits/chosen": -1.755540370941162, "logits/rejected": -1.726999044418335, "logps/chosen": -41.485496520996094, "logps/rejected": -68.89039611816406, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.579153060913086, "rewards/margins": 10.574809074401855, "rewards/rejected": -13.153963088989258, "step": 1147 }, { "epoch": 6.802962962962963, "grad_norm": 0.08315953883685909, "learning_rate": 3.172934848712272e-08, "logits/chosen": -2.2260100841522217, "logits/rejected": -1.9643924236297607, "logps/chosen": -36.83528137207031, "logps/rejected": -80.32170104980469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.6309359073638916, "rewards/margins": 12.441807746887207, "rewards/rejected": -15.072744369506836, "step": 1148 }, { "epoch": 6.808888888888889, "grad_norm": 0.1377713974555495, "learning_rate": 3.141334596385447e-08, "logits/chosen": -2.1011102199554443, "logits/rejected": -1.9738458395004272, "logps/chosen": -42.32820129394531, "logps/rejected": -70.49113464355469, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.5573666095733643, "rewards/margins": 11.54711627960205, "rewards/rejected": -14.104482650756836, "step": 1149 }, { "epoch": 6.814814814814815, "grad_norm": 0.09606738124748687, "learning_rate": 3.109881938683492e-08, "logits/chosen": -1.3309812545776367, "logits/rejected": -1.147838830947876, "logps/chosen": -32.87968444824219, "logps/rejected": -77.53007507324219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.162910223007202, "rewards/margins": 12.752610206604004, "rewards/rejected": -14.915519714355469, "step": 1150 }, { "epoch": 6.8207407407407405, "grad_norm": 0.10629644897987281, "learning_rate": 3.078577087981832e-08, "logits/chosen": -1.8460427522659302, "logits/rejected": -1.6246216297149658, "logps/chosen": -46.028106689453125, "logps/rejected": -93.64574432373047, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.4755055904388428, "rewards/margins": 14.272528648376465, "rewards/rejected": -17.748035430908203, "step": 1151 }, { "epoch": 6.826666666666666, "grad_norm": 0.138240279213544, "learning_rate": 3.047420255657851e-08, "logits/chosen": -1.4086066484451294, "logits/rejected": -1.3227324485778809, "logps/chosen": -47.98810577392578, "logps/rejected": -78.81871032714844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.2726149559021, "rewards/margins": 12.465730667114258, "rewards/rejected": -16.738346099853516, "step": 1152 }, { "epoch": 6.832592592592593, "grad_norm": 0.06213028998609326, "learning_rate": 3.016411652089493e-08, "logits/chosen": -1.773051381111145, "logits/rejected": -1.5124402046203613, "logps/chosen": -38.5433235168457, "logps/rejected": -79.8031234741211, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.6178457736968994, "rewards/margins": 13.169754028320312, "rewards/rejected": -15.787599563598633, "step": 1153 }, { "epoch": 6.838518518518518, "grad_norm": 0.09071262739179312, "learning_rate": 2.985551486653823e-08, "logits/chosen": -1.7397191524505615, "logits/rejected": -1.4591525793075562, "logps/chosen": -43.423797607421875, "logps/rejected": -88.40834045410156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.5840625762939453, "rewards/margins": 14.329594612121582, "rewards/rejected": -16.913658142089844, "step": 1154 }, { "epoch": 6.844444444444444, "grad_norm": 0.1022541987842882, "learning_rate": 2.954839967725617e-08, "logits/chosen": -1.8566625118255615, "logits/rejected": -1.992887258529663, "logps/chosen": -53.398765563964844, "logps/rejected": -72.7064208984375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.7381906509399414, "rewards/margins": 12.036535263061523, "rewards/rejected": -15.774726867675781, "step": 1155 }, { "epoch": 6.850370370370371, "grad_norm": 0.12377028777884179, "learning_rate": 2.924277302675962e-08, "logits/chosen": -1.9297981262207031, "logits/rejected": -1.9528807401657104, "logps/chosen": -46.666847229003906, "logps/rejected": -66.36315155029297, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1120994091033936, "rewards/margins": 10.05501937866211, "rewards/rejected": -13.167118072509766, "step": 1156 }, { "epoch": 6.856296296296296, "grad_norm": 0.14259671866915122, "learning_rate": 2.893863697870841e-08, "logits/chosen": -1.6172630786895752, "logits/rejected": -1.6391055583953857, "logps/chosen": -49.92350769042969, "logps/rejected": -71.48528289794922, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.502070426940918, "rewards/margins": 11.089262008666992, "rewards/rejected": -15.591333389282227, "step": 1157 }, { "epoch": 6.862222222222222, "grad_norm": 0.10511089653127045, "learning_rate": 2.863599358669755e-08, "logits/chosen": -1.3771047592163086, "logits/rejected": -1.4124231338500977, "logps/chosen": -46.10894012451172, "logps/rejected": -82.56547546386719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.284719467163086, "rewards/margins": 13.74131965637207, "rewards/rejected": -16.026039123535156, "step": 1158 }, { "epoch": 6.868148148148148, "grad_norm": 0.1426796176134692, "learning_rate": 2.8334844894243287e-08, "logits/chosen": -2.127091646194458, "logits/rejected": -2.0703983306884766, "logps/chosen": -73.11487579345703, "logps/rejected": -92.82722473144531, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.963024139404297, "rewards/margins": 11.033499717712402, "rewards/rejected": -16.996522903442383, "step": 1159 }, { "epoch": 6.874074074074074, "grad_norm": 0.10377975946996132, "learning_rate": 2.803519293476936e-08, "logits/chosen": -1.9476478099822998, "logits/rejected": -2.002642869949341, "logps/chosen": -51.1083984375, "logps/rejected": -82.77229309082031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.5188798904418945, "rewards/margins": 14.644804000854492, "rewards/rejected": -19.16368293762207, "step": 1160 }, { "epoch": 6.88, "grad_norm": 0.14750986834387664, "learning_rate": 2.7737039731593138e-08, "logits/chosen": -1.6242280006408691, "logits/rejected": -1.6870909929275513, "logps/chosen": -59.50197982788086, "logps/rejected": -80.40160369873047, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.18382453918457, "rewards/margins": 10.354097366333008, "rewards/rejected": -14.537919998168945, "step": 1161 }, { "epoch": 6.885925925925926, "grad_norm": 0.15189056943975968, "learning_rate": 2.7440387297912122e-08, "logits/chosen": -1.9823570251464844, "logits/rejected": -2.019186019897461, "logps/chosen": -40.22568130493164, "logps/rejected": -73.96324920654297, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.187943935394287, "rewards/margins": 12.574700355529785, "rewards/rejected": -14.762643814086914, "step": 1162 }, { "epoch": 6.891851851851852, "grad_norm": 0.06879956370628074, "learning_rate": 2.7145237636790276e-08, "logits/chosen": -1.6803816556930542, "logits/rejected": -1.5852973461151123, "logps/chosen": -54.0800895690918, "logps/rejected": -79.88774871826172, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.2677531242370605, "rewards/margins": 10.459793090820312, "rewards/rejected": -15.727546691894531, "step": 1163 }, { "epoch": 6.897777777777778, "grad_norm": 0.0806033063593912, "learning_rate": 2.685159274114443e-08, "logits/chosen": -1.5133965015411377, "logits/rejected": -1.4351083040237427, "logps/chosen": -46.148075103759766, "logps/rejected": -72.73712921142578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.488842010498047, "rewards/margins": 10.604263305664062, "rewards/rejected": -15.09310531616211, "step": 1164 }, { "epoch": 6.9037037037037035, "grad_norm": 0.08411857989859493, "learning_rate": 2.6559454593731072e-08, "logits/chosen": -1.4596953392028809, "logits/rejected": -1.104498028755188, "logps/chosen": -47.473480224609375, "logps/rejected": -97.63139343261719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.91493558883667, "rewards/margins": 16.23548126220703, "rewards/rejected": -21.150415420532227, "step": 1165 }, { "epoch": 6.90962962962963, "grad_norm": 0.07035183659263981, "learning_rate": 2.6268825167132636e-08, "logits/chosen": -1.459460735321045, "logits/rejected": -1.4765464067459106, "logps/chosen": -45.411216735839844, "logps/rejected": -77.99122619628906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.320858955383301, "rewards/margins": 11.635019302368164, "rewards/rejected": -14.955877304077148, "step": 1166 }, { "epoch": 6.915555555555556, "grad_norm": 0.15470041672532103, "learning_rate": 2.5979706423744392e-08, "logits/chosen": -1.671322226524353, "logits/rejected": -1.7293105125427246, "logps/chosen": -45.03539276123047, "logps/rejected": -71.42250061035156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.933180809020996, "rewards/margins": 11.5748929977417, "rewards/rejected": -15.508073806762695, "step": 1167 }, { "epoch": 6.921481481481481, "grad_norm": 0.1191864531412148, "learning_rate": 2.5692100315761023e-08, "logits/chosen": -2.120941162109375, "logits/rejected": -2.0219168663024902, "logps/chosen": -71.7652587890625, "logps/rejected": -108.49716186523438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.246131896972656, "rewards/margins": 12.393702507019043, "rewards/rejected": -18.639835357666016, "step": 1168 }, { "epoch": 6.927407407407408, "grad_norm": 0.1348876028904682, "learning_rate": 2.5406008785163717e-08, "logits/chosen": -1.3151806592941284, "logits/rejected": -1.417667269706726, "logps/chosen": -53.786460876464844, "logps/rejected": -87.53273010253906, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.237397193908691, "rewards/margins": 12.491253852844238, "rewards/rejected": -16.728652954101562, "step": 1169 }, { "epoch": 6.933333333333334, "grad_norm": 0.07570071684837183, "learning_rate": 2.512143376370682e-08, "logits/chosen": -1.1454825401306152, "logits/rejected": -1.2216933965682983, "logps/chosen": -37.72205352783203, "logps/rejected": -65.66807556152344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.8132212162017822, "rewards/margins": 11.190628051757812, "rewards/rejected": -13.003849029541016, "step": 1170 }, { "epoch": 6.939259259259259, "grad_norm": 0.2254278351780427, "learning_rate": 2.4838377172904907e-08, "logits/chosen": -1.7721288204193115, "logits/rejected": -1.6282906532287598, "logps/chosen": -56.43745803833008, "logps/rejected": -84.1759033203125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.852671146392822, "rewards/margins": 11.335627555847168, "rewards/rejected": -16.18829917907715, "step": 1171 }, { "epoch": 6.945185185185185, "grad_norm": 0.048586698362349696, "learning_rate": 2.455684092401969e-08, "logits/chosen": -1.8530497550964355, "logits/rejected": -1.7356008291244507, "logps/chosen": -31.423507690429688, "logps/rejected": -75.07907104492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.9417579174041748, "rewards/margins": 14.43879222869873, "rewards/rejected": -16.38054847717285, "step": 1172 }, { "epoch": 6.9511111111111115, "grad_norm": 0.08382029726208863, "learning_rate": 2.4276826918047277e-08, "logits/chosen": -1.500767469406128, "logits/rejected": -1.3265349864959717, "logps/chosen": -54.78949737548828, "logps/rejected": -90.59709167480469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.294247150421143, "rewards/margins": 13.05902099609375, "rewards/rejected": -17.353267669677734, "step": 1173 }, { "epoch": 6.957037037037037, "grad_norm": 0.1279084717839541, "learning_rate": 2.399833704570517e-08, "logits/chosen": -1.5946778059005737, "logits/rejected": -1.6028144359588623, "logps/chosen": -35.326480865478516, "logps/rejected": -65.81742095947266, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.336696982383728, "rewards/margins": 11.668586730957031, "rewards/rejected": -13.00528335571289, "step": 1174 }, { "epoch": 6.962962962962963, "grad_norm": 0.16211559794676547, "learning_rate": 2.372137318741968e-08, "logits/chosen": -2.113690137863159, "logits/rejected": -1.9204535484313965, "logps/chosen": -61.13996505737305, "logps/rejected": -97.43546295166016, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.8684582710266113, "rewards/margins": 13.243110656738281, "rewards/rejected": -17.111570358276367, "step": 1175 }, { "epoch": 6.968888888888889, "grad_norm": 0.19733756655662368, "learning_rate": 2.3445937213313062e-08, "logits/chosen": -1.8909013271331787, "logits/rejected": -1.9597067832946777, "logps/chosen": -68.76529693603516, "logps/rejected": -95.87532043457031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.259731292724609, "rewards/margins": 13.89039421081543, "rewards/rejected": -20.150123596191406, "step": 1176 }, { "epoch": 6.974814814814815, "grad_norm": 0.12202352629812559, "learning_rate": 2.3172030983190926e-08, "logits/chosen": -1.5236730575561523, "logits/rejected": -1.5534064769744873, "logps/chosen": -35.21092224121094, "logps/rejected": -61.23134994506836, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.4225057363510132, "rewards/margins": 10.738033294677734, "rewards/rejected": -12.160539627075195, "step": 1177 }, { "epoch": 6.980740740740741, "grad_norm": 0.16135077452071456, "learning_rate": 2.2899656346529768e-08, "logits/chosen": -1.932398796081543, "logits/rejected": -2.1537177562713623, "logps/chosen": -48.79818344116211, "logps/rejected": -61.05226516723633, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.045696258544922, "rewards/margins": 9.089506149291992, "rewards/rejected": -14.135202407836914, "step": 1178 }, { "epoch": 6.986666666666666, "grad_norm": 0.07643197243759331, "learning_rate": 2.2628815142464342e-08, "logits/chosen": -1.6719024181365967, "logits/rejected": -1.4917726516723633, "logps/chosen": -45.62458419799805, "logps/rejected": -90.16924285888672, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.244566917419434, "rewards/margins": 13.924651145935059, "rewards/rejected": -18.169218063354492, "step": 1179 }, { "epoch": 6.992592592592593, "grad_norm": 0.126597010412571, "learning_rate": 2.2359509199775446e-08, "logits/chosen": -1.7478291988372803, "logits/rejected": -1.7482573986053467, "logps/chosen": -52.008060455322266, "logps/rejected": -87.12303161621094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8778417110443115, "rewards/margins": 13.71127700805664, "rewards/rejected": -16.58911895751953, "step": 1180 }, { "epoch": 6.998518518518519, "grad_norm": 0.12244130358438209, "learning_rate": 2.2091740336877358e-08, "logits/chosen": -1.7030671834945679, "logits/rejected": -1.458512783050537, "logps/chosen": -53.036376953125, "logps/rejected": -104.52532958984375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.178653717041016, "rewards/margins": 13.309745788574219, "rewards/rejected": -18.488399505615234, "step": 1181 }, { "epoch": 7.004444444444444, "grad_norm": 0.10438348279258286, "learning_rate": 2.1825510361805576e-08, "logits/chosen": -1.7498127222061157, "logits/rejected": -1.8160085678100586, "logps/chosen": -35.32305145263672, "logps/rejected": -67.66831970214844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.5238306522369385, "rewards/margins": 12.23428726196289, "rewards/rejected": -13.75811767578125, "step": 1182 }, { "epoch": 7.010370370370371, "grad_norm": 0.10306756779932433, "learning_rate": 2.156082107220486e-08, "logits/chosen": -1.6162192821502686, "logits/rejected": -1.6883609294891357, "logps/chosen": -39.13275909423828, "logps/rejected": -70.13416290283203, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.180786371231079, "rewards/margins": 11.219168663024902, "rewards/rejected": -14.399954795837402, "step": 1183 }, { "epoch": 7.0162962962962965, "grad_norm": 0.15017006309609432, "learning_rate": 2.129767425531673e-08, "logits/chosen": -2.2658438682556152, "logits/rejected": -2.1510543823242188, "logps/chosen": -53.666160583496094, "logps/rejected": -82.80941772460938, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.077384948730469, "rewards/margins": 12.220909118652344, "rewards/rejected": -16.298294067382812, "step": 1184 }, { "epoch": 7.022222222222222, "grad_norm": 0.11969844631684366, "learning_rate": 2.1036071687967783e-08, "logits/chosen": -1.3385977745056152, "logits/rejected": -1.4503644704818726, "logps/chosen": -63.32877731323242, "logps/rejected": -79.03787994384766, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.450685501098633, "rewards/margins": 10.056939125061035, "rewards/rejected": -17.50762367248535, "step": 1185 }, { "epoch": 7.028148148148148, "grad_norm": 0.09289746206047685, "learning_rate": 2.077601513655733e-08, "logits/chosen": -1.334474802017212, "logits/rejected": -1.4268428087234497, "logps/chosen": -42.259456634521484, "logps/rejected": -63.025047302246094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.244335174560547, "rewards/margins": 10.316512107849121, "rewards/rejected": -13.560847282409668, "step": 1186 }, { "epoch": 7.034074074074074, "grad_norm": 0.0993388595523258, "learning_rate": 2.0517506357045715e-08, "logits/chosen": -1.8795228004455566, "logits/rejected": -1.7030866146087646, "logps/chosen": -50.661319732666016, "logps/rejected": -95.91410827636719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.647890090942383, "rewards/margins": 13.566815376281738, "rewards/rejected": -18.214706420898438, "step": 1187 }, { "epoch": 7.04, "grad_norm": 0.13225732135263005, "learning_rate": 2.0260547094942348e-08, "logits/chosen": -1.479757308959961, "logits/rejected": -1.536267638206482, "logps/chosen": -43.72503662109375, "logps/rejected": -73.44718933105469, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.7695412635803223, "rewards/margins": 12.269949913024902, "rewards/rejected": -16.03948974609375, "step": 1188 }, { "epoch": 7.045925925925926, "grad_norm": 0.15398598189046675, "learning_rate": 2.0005139085293942e-08, "logits/chosen": -1.760892391204834, "logits/rejected": -1.7211508750915527, "logps/chosen": -59.702056884765625, "logps/rejected": -85.88159942626953, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.470168590545654, "rewards/margins": 11.876346588134766, "rewards/rejected": -17.346511840820312, "step": 1189 }, { "epoch": 7.051851851851852, "grad_norm": 0.1244116390199716, "learning_rate": 1.9751284052672873e-08, "logits/chosen": -1.8358004093170166, "logits/rejected": -1.6322147846221924, "logps/chosen": -49.1774787902832, "logps/rejected": -78.3975830078125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.361578941345215, "rewards/margins": 11.528911590576172, "rewards/rejected": -15.890491485595703, "step": 1190 }, { "epoch": 7.057777777777778, "grad_norm": 0.11892932478519877, "learning_rate": 1.9498983711165345e-08, "logits/chosen": -1.858366847038269, "logits/rejected": -1.6905018091201782, "logps/chosen": -42.442138671875, "logps/rejected": -95.295166015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.855883836746216, "rewards/margins": 16.18788719177246, "rewards/rejected": -20.043773651123047, "step": 1191 }, { "epoch": 7.063703703703704, "grad_norm": 0.1370620549733962, "learning_rate": 1.9248239764360048e-08, "logits/chosen": -1.8696699142456055, "logits/rejected": -2.089273691177368, "logps/chosen": -54.89391326904297, "logps/rejected": -64.32718658447266, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.9205586910247803, "rewards/margins": 9.19931411743164, "rewards/rejected": -11.119873046875, "step": 1192 }, { "epoch": 7.069629629629629, "grad_norm": 0.1198645761373536, "learning_rate": 1.899905390533649e-08, "logits/chosen": -2.04950213432312, "logits/rejected": -1.9079272747039795, "logps/chosen": -45.07698059082031, "logps/rejected": -74.14395904541016, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.3388748168945312, "rewards/margins": 11.454214096069336, "rewards/rejected": -13.793089866638184, "step": 1193 }, { "epoch": 7.075555555555556, "grad_norm": 0.10462021921279856, "learning_rate": 1.8751427816653618e-08, "logits/chosen": -1.4157465696334839, "logits/rejected": -1.3236104249954224, "logps/chosen": -38.29302215576172, "logps/rejected": -66.2728271484375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.617685317993164, "rewards/margins": 11.805870056152344, "rewards/rejected": -15.423555374145508, "step": 1194 }, { "epoch": 7.0814814814814815, "grad_norm": 0.10647414220192886, "learning_rate": 1.8505363170338517e-08, "logits/chosen": -1.7032594680786133, "logits/rejected": -1.5659446716308594, "logps/chosen": -52.43858337402344, "logps/rejected": -85.01385498046875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4055228233337402, "rewards/margins": 12.859024047851562, "rewards/rejected": -16.264545440673828, "step": 1195 }, { "epoch": 7.087407407407407, "grad_norm": 0.1217673476066229, "learning_rate": 1.826086162787499e-08, "logits/chosen": -1.2698402404785156, "logits/rejected": -1.432144045829773, "logps/chosen": -45.8774528503418, "logps/rejected": -66.63996887207031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.07426118850708, "rewards/margins": 9.285086631774902, "rewards/rejected": -12.35934829711914, "step": 1196 }, { "epoch": 7.093333333333334, "grad_norm": 0.130593236991155, "learning_rate": 1.8017924840192433e-08, "logits/chosen": -1.5936301946640015, "logits/rejected": -1.5533777475357056, "logps/chosen": -41.87461853027344, "logps/rejected": -65.9970932006836, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.350405216217041, "rewards/margins": 11.087535858154297, "rewards/rejected": -13.437941551208496, "step": 1197 }, { "epoch": 7.099259259259259, "grad_norm": 0.08742307775397884, "learning_rate": 1.7776554447654717e-08, "logits/chosen": -1.670688271522522, "logits/rejected": -1.7398487329483032, "logps/chosen": -49.48249816894531, "logps/rejected": -74.95970153808594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.721832275390625, "rewards/margins": 11.39289665222168, "rewards/rejected": -16.114727020263672, "step": 1198 }, { "epoch": 7.105185185185185, "grad_norm": 0.22187158357419212, "learning_rate": 1.7536752080048955e-08, "logits/chosen": -1.444218635559082, "logits/rejected": -1.2818149328231812, "logps/chosen": -54.006370544433594, "logps/rejected": -89.37923431396484, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.9230194091796875, "rewards/margins": 12.409586906433105, "rewards/rejected": -17.332605361938477, "step": 1199 }, { "epoch": 7.111111111111111, "grad_norm": 0.11655345767008983, "learning_rate": 1.7298519356574726e-08, "logits/chosen": -1.6009925603866577, "logits/rejected": -1.6573829650878906, "logps/chosen": -44.316978454589844, "logps/rejected": -74.49774169921875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.8197593688964844, "rewards/margins": 10.937904357910156, "rewards/rejected": -14.75766372680664, "step": 1200 }, { "epoch": 7.117037037037037, "grad_norm": 0.1087475095181626, "learning_rate": 1.706185788583289e-08, "logits/chosen": -1.8507537841796875, "logits/rejected": -1.7265064716339111, "logps/chosen": -45.44957733154297, "logps/rejected": -79.27318572998047, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.685542583465576, "rewards/margins": 13.47083854675293, "rewards/rejected": -16.156381607055664, "step": 1201 }, { "epoch": 7.122962962962963, "grad_norm": 0.1065517059466155, "learning_rate": 1.6826769265815e-08, "logits/chosen": -1.3087055683135986, "logits/rejected": -1.2989039421081543, "logps/chosen": -41.19493103027344, "logps/rejected": -81.62377166748047, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.281460762023926, "rewards/margins": 13.077204704284668, "rewards/rejected": -16.358665466308594, "step": 1202 }, { "epoch": 7.128888888888889, "grad_norm": 0.11318510157348385, "learning_rate": 1.6593255083892228e-08, "logits/chosen": -1.4445735216140747, "logits/rejected": -1.2522766590118408, "logps/chosen": -47.712623596191406, "logps/rejected": -87.87376403808594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.7921104431152344, "rewards/margins": 11.766212463378906, "rewards/rejected": -15.55832290649414, "step": 1203 }, { "epoch": 7.134814814814815, "grad_norm": 0.14086531041944936, "learning_rate": 1.6361316916804896e-08, "logits/chosen": -2.008540153503418, "logits/rejected": -2.004145622253418, "logps/chosen": -43.6939582824707, "logps/rejected": -76.02693176269531, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.715010166168213, "rewards/margins": 11.305937767028809, "rewards/rejected": -14.020947456359863, "step": 1204 }, { "epoch": 7.140740740740741, "grad_norm": 0.07673022638161378, "learning_rate": 1.6130956330651646e-08, "logits/chosen": -1.7630785703659058, "logits/rejected": -1.8952760696411133, "logps/chosen": -35.75289535522461, "logps/rejected": -61.844886779785156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7321906089782715, "rewards/margins": 11.385841369628906, "rewards/rejected": -14.118032455444336, "step": 1205 }, { "epoch": 7.1466666666666665, "grad_norm": 0.12215685713176862, "learning_rate": 1.5902174880878916e-08, "logits/chosen": -1.8901073932647705, "logits/rejected": -1.8296229839324951, "logps/chosen": -38.18608856201172, "logps/rejected": -79.75143432617188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.1476891040802, "rewards/margins": 14.448156356811523, "rewards/rejected": -16.595844268798828, "step": 1206 }, { "epoch": 7.152592592592592, "grad_norm": 0.12354753151439266, "learning_rate": 1.567497411227059e-08, "logits/chosen": -2.1340646743774414, "logits/rejected": -2.106337785720825, "logps/chosen": -56.164154052734375, "logps/rejected": -88.27549743652344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.686497211456299, "rewards/margins": 13.833208084106445, "rewards/rejected": -19.51970672607422, "step": 1207 }, { "epoch": 7.158518518518519, "grad_norm": 0.15837268371755486, "learning_rate": 1.5449355558937337e-08, "logits/chosen": -2.158918857574463, "logits/rejected": -2.007082939147949, "logps/chosen": -50.713157653808594, "logps/rejected": -83.93782043457031, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.5467233657836914, "rewards/margins": 13.906265258789062, "rewards/rejected": -16.452987670898438, "step": 1208 }, { "epoch": 7.164444444444444, "grad_norm": 0.09623401985337293, "learning_rate": 1.5225320744306408e-08, "logits/chosen": -1.6897797584533691, "logits/rejected": -1.607255220413208, "logps/chosen": -40.003414154052734, "logps/rejected": -81.5127182006836, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.148927688598633, "rewards/margins": 12.95841121673584, "rewards/rejected": -16.107337951660156, "step": 1209 }, { "epoch": 7.17037037037037, "grad_norm": 0.1268425724222277, "learning_rate": 1.5002871181111153e-08, "logits/chosen": -1.3181800842285156, "logits/rejected": -1.3308837413787842, "logps/chosen": -44.879539489746094, "logps/rejected": -76.36341857910156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.692798614501953, "rewards/margins": 12.881628036499023, "rewards/rejected": -16.574426651000977, "step": 1210 }, { "epoch": 7.176296296296297, "grad_norm": 0.14576067610000562, "learning_rate": 1.4782008371381105e-08, "logits/chosen": -1.5605270862579346, "logits/rejected": -1.4438588619232178, "logps/chosen": -46.77557373046875, "logps/rejected": -87.78450012207031, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.457500457763672, "rewards/margins": 12.943517684936523, "rewards/rejected": -17.401016235351562, "step": 1211 }, { "epoch": 7.182222222222222, "grad_norm": 0.0901720784546393, "learning_rate": 1.4562733806431666e-08, "logits/chosen": -2.1737942695617676, "logits/rejected": -2.242323875427246, "logps/chosen": -37.04216766357422, "logps/rejected": -68.79451751708984, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.2785842418670654, "rewards/margins": 11.042994499206543, "rewards/rejected": -12.321578025817871, "step": 1212 }, { "epoch": 7.188148148148148, "grad_norm": 0.15482314701909852, "learning_rate": 1.434504896685393e-08, "logits/chosen": -1.648055076599121, "logits/rejected": -1.6363328695297241, "logps/chosen": -45.66598892211914, "logps/rejected": -70.53289794921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.6662845611572266, "rewards/margins": 12.108028411865234, "rewards/rejected": -14.774312973022461, "step": 1213 }, { "epoch": 7.194074074074074, "grad_norm": 0.11655454571606337, "learning_rate": 1.4128955322504965e-08, "logits/chosen": -1.491405725479126, "logits/rejected": -1.4429785013198853, "logps/chosen": -51.250797271728516, "logps/rejected": -85.06803894042969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.8182194232940674, "rewards/margins": 11.68270492553711, "rewards/rejected": -15.500925064086914, "step": 1214 }, { "epoch": 7.2, "grad_norm": 0.1574759664564603, "learning_rate": 1.3914454332497604e-08, "logits/chosen": -1.9412479400634766, "logits/rejected": -1.8919508457183838, "logps/chosen": -39.027462005615234, "logps/rejected": -71.54948425292969, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.6357011795043945, "rewards/margins": 13.384864807128906, "rewards/rejected": -16.020565032958984, "step": 1215 }, { "epoch": 7.205925925925926, "grad_norm": 0.10315524246712969, "learning_rate": 1.3701547445190836e-08, "logits/chosen": -1.7386341094970703, "logits/rejected": -1.432328701019287, "logps/chosen": -57.30453109741211, "logps/rejected": -104.63716125488281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.707429885864258, "rewards/margins": 13.92609977722168, "rewards/rejected": -18.633529663085938, "step": 1216 }, { "epoch": 7.2118518518518515, "grad_norm": 0.11981906949643559, "learning_rate": 1.3490236098179813e-08, "logits/chosen": -1.6856032609939575, "logits/rejected": -1.7121961116790771, "logps/chosen": -53.183937072753906, "logps/rejected": -97.51420593261719, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.14708137512207, "rewards/margins": 13.861615180969238, "rewards/rejected": -18.008697509765625, "step": 1217 }, { "epoch": 7.217777777777778, "grad_norm": 0.1193832583649323, "learning_rate": 1.3280521718286253e-08, "logits/chosen": -1.4530967473983765, "logits/rejected": -1.2759112119674683, "logps/chosen": -45.5231819152832, "logps/rejected": -75.6134262084961, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.124416828155518, "rewards/margins": 11.91776180267334, "rewards/rejected": -16.042179107666016, "step": 1218 }, { "epoch": 7.223703703703704, "grad_norm": 0.09709503888659948, "learning_rate": 1.3072405721548857e-08, "logits/chosen": -1.8256022930145264, "logits/rejected": -2.0024163722991943, "logps/chosen": -61.09766387939453, "logps/rejected": -71.4029312133789, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.887969493865967, "rewards/margins": 9.966340065002441, "rewards/rejected": -13.854310035705566, "step": 1219 }, { "epoch": 7.229629629629629, "grad_norm": 0.10475361104381965, "learning_rate": 1.2865889513213628e-08, "logits/chosen": -1.619799256324768, "logits/rejected": -1.7144474983215332, "logps/chosen": -47.38188171386719, "logps/rejected": -72.77843475341797, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.503984451293945, "rewards/margins": 10.436614990234375, "rewards/rejected": -14.94059944152832, "step": 1220 }, { "epoch": 7.235555555555556, "grad_norm": 0.07688143241284169, "learning_rate": 1.2660974487724407e-08, "logits/chosen": -1.3199595212936401, "logits/rejected": -1.292825698852539, "logps/chosen": -42.49819564819336, "logps/rejected": -77.67166137695312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.343793869018555, "rewards/margins": 13.139480590820312, "rewards/rejected": -17.4832763671875, "step": 1221 }, { "epoch": 7.241481481481482, "grad_norm": 0.060681308189254796, "learning_rate": 1.2457662028713594e-08, "logits/chosen": -1.419334888458252, "logits/rejected": -1.3187925815582275, "logps/chosen": -36.37110137939453, "logps/rejected": -81.13549041748047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.139266014099121, "rewards/margins": 13.64179801940918, "rewards/rejected": -16.781063079833984, "step": 1222 }, { "epoch": 7.247407407407407, "grad_norm": 0.10166151212202382, "learning_rate": 1.2255953508992612e-08, "logits/chosen": -1.9275920391082764, "logits/rejected": -1.8258066177368164, "logps/chosen": -51.44398880004883, "logps/rejected": -87.87835693359375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.75351095199585, "rewards/margins": 12.301464080810547, "rewards/rejected": -18.054975509643555, "step": 1223 }, { "epoch": 7.253333333333333, "grad_norm": 0.07296420361158287, "learning_rate": 1.205585029054279e-08, "logits/chosen": -1.6533150672912598, "logits/rejected": -1.7473328113555908, "logps/chosen": -56.52097702026367, "logps/rejected": -86.59286499023438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.681492805480957, "rewards/margins": 11.921660423278809, "rewards/rejected": -17.603153228759766, "step": 1224 }, { "epoch": 7.2592592592592595, "grad_norm": 0.0960882965233299, "learning_rate": 1.1857353724505942e-08, "logits/chosen": -1.789186716079712, "logits/rejected": -1.7957122325897217, "logps/chosen": -55.02898406982422, "logps/rejected": -95.71049499511719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.103061199188232, "rewards/margins": 14.242263793945312, "rewards/rejected": -19.34532356262207, "step": 1225 }, { "epoch": 7.265185185185185, "grad_norm": 0.11358951568845953, "learning_rate": 1.1660465151175664e-08, "logits/chosen": -2.1114211082458496, "logits/rejected": -2.090874671936035, "logps/chosen": -44.74166488647461, "logps/rejected": -83.52067565917969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4121861457824707, "rewards/margins": 13.98027515411377, "rewards/rejected": -17.3924617767334, "step": 1226 }, { "epoch": 7.271111111111111, "grad_norm": 0.0780915485477315, "learning_rate": 1.1465185899987794e-08, "logits/chosen": -1.786757230758667, "logits/rejected": -1.7711902856826782, "logps/chosen": -44.79767608642578, "logps/rejected": -80.92269134521484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.684630870819092, "rewards/margins": 12.207763671875, "rewards/rejected": -16.89239501953125, "step": 1227 }, { "epoch": 7.277037037037037, "grad_norm": 0.08736095702879869, "learning_rate": 1.1271517289511783e-08, "logits/chosen": -1.7297133207321167, "logits/rejected": -1.6254792213439941, "logps/chosen": -46.41914367675781, "logps/rejected": -73.42701721191406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.2106242179870605, "rewards/margins": 10.198822021484375, "rewards/rejected": -14.409445762634277, "step": 1228 }, { "epoch": 7.282962962962963, "grad_norm": 0.12460175164369738, "learning_rate": 1.1079460627441666e-08, "logits/chosen": -2.1245529651641846, "logits/rejected": -1.9352598190307617, "logps/chosen": -30.707019805908203, "logps/rejected": -71.42001342773438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.4688324928283691, "rewards/margins": 12.905763626098633, "rewards/rejected": -14.374595642089844, "step": 1229 }, { "epoch": 7.288888888888889, "grad_norm": 0.04378917663791941, "learning_rate": 1.0889017210587215e-08, "logits/chosen": -1.6909232139587402, "logits/rejected": -1.1679630279541016, "logps/chosen": -45.363487243652344, "logps/rejected": -105.81968688964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.5888261795043945, "rewards/margins": 15.115147590637207, "rewards/rejected": -18.7039737701416, "step": 1230 }, { "epoch": 7.294814814814814, "grad_norm": 0.10066231851710405, "learning_rate": 1.0700188324865189e-08, "logits/chosen": -1.2599670886993408, "logits/rejected": -1.2062746286392212, "logps/chosen": -55.062339782714844, "logps/rejected": -85.76885223388672, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.546828031539917, "rewards/margins": 12.588035583496094, "rewards/rejected": -16.134864807128906, "step": 1231 }, { "epoch": 7.300740740740741, "grad_norm": 0.10055897394398317, "learning_rate": 1.0512975245290685e-08, "logits/chosen": -1.695151448249817, "logits/rejected": -1.590423345565796, "logps/chosen": -32.681095123291016, "logps/rejected": -64.72657012939453, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.348266839981079, "rewards/margins": 10.46805191040039, "rewards/rejected": -12.81631851196289, "step": 1232 }, { "epoch": 7.306666666666667, "grad_norm": 0.1545562358316079, "learning_rate": 1.0327379235968548e-08, "logits/chosen": -1.6161727905273438, "logits/rejected": -1.5110998153686523, "logps/chosen": -36.74101257324219, "logps/rejected": -69.12883758544922, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.4509549140930176, "rewards/margins": 11.02175521850586, "rewards/rejected": -13.472709655761719, "step": 1233 }, { "epoch": 7.312592592592592, "grad_norm": 0.097607206610237, "learning_rate": 1.0143401550084751e-08, "logits/chosen": -2.2964179515838623, "logits/rejected": -2.0990958213806152, "logps/chosen": -41.50244140625, "logps/rejected": -97.56422424316406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.9923524856567383, "rewards/margins": 13.67741584777832, "rewards/rejected": -16.669769287109375, "step": 1234 }, { "epoch": 7.318518518518519, "grad_norm": 0.18438840616271115, "learning_rate": 9.961043429898036e-09, "logits/chosen": -1.7009012699127197, "logits/rejected": -1.5941734313964844, "logps/chosen": -66.56523132324219, "logps/rejected": -89.49272155761719, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.492288112640381, "rewards/margins": 13.816864013671875, "rewards/rejected": -18.309152603149414, "step": 1235 }, { "epoch": 7.3244444444444445, "grad_norm": 0.10830083734526227, "learning_rate": 9.780306106731418e-09, "logits/chosen": -2.0649874210357666, "logits/rejected": -1.8523389101028442, "logps/chosen": -40.13294982910156, "logps/rejected": -87.09971618652344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.167768716812134, "rewards/margins": 13.352956771850586, "rewards/rejected": -16.52072525024414, "step": 1236 }, { "epoch": 7.33037037037037, "grad_norm": 0.13555625022568338, "learning_rate": 9.601190800963942e-09, "logits/chosen": -1.7808589935302734, "logits/rejected": -1.6981477737426758, "logps/chosen": -38.0859489440918, "logps/rejected": -69.84672546386719, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.0800421237945557, "rewards/margins": 11.428643226623535, "rewards/rejected": -14.508685111999512, "step": 1237 }, { "epoch": 7.336296296296297, "grad_norm": 0.07701345679436172, "learning_rate": 9.423698722022505e-09, "logits/chosen": -1.7928366661071777, "logits/rejected": -1.6887304782867432, "logps/chosen": -53.604183197021484, "logps/rejected": -96.15071105957031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.998568534851074, "rewards/margins": 13.373642921447754, "rewards/rejected": -18.372211456298828, "step": 1238 }, { "epoch": 7.342222222222222, "grad_norm": 0.09833084593667012, "learning_rate": 9.247831068373458e-09, "logits/chosen": -1.3438589572906494, "logits/rejected": -1.445772409439087, "logps/chosen": -50.54953384399414, "logps/rejected": -79.28211975097656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8045411109924316, "rewards/margins": 12.622430801391602, "rewards/rejected": -15.426971435546875, "step": 1239 }, { "epoch": 7.348148148148148, "grad_norm": 0.07525388467886729, "learning_rate": 9.073589027514789e-09, "logits/chosen": -1.3987599611282349, "logits/rejected": -1.130275011062622, "logps/chosen": -42.32148361206055, "logps/rejected": -93.08763122558594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.6124842166900635, "rewards/margins": 16.635757446289062, "rewards/rejected": -19.248241424560547, "step": 1240 }, { "epoch": 7.354074074074074, "grad_norm": 0.06922858097247736, "learning_rate": 8.900973775967963e-09, "logits/chosen": -1.4953887462615967, "logits/rejected": -1.475359320640564, "logps/chosen": -38.44170379638672, "logps/rejected": -63.54737091064453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.280339241027832, "rewards/margins": 10.804010391235352, "rewards/rejected": -13.0843505859375, "step": 1241 }, { "epoch": 7.36, "grad_norm": 0.09047335182590145, "learning_rate": 8.729986479269924e-09, "logits/chosen": -1.6588947772979736, "logits/rejected": -1.534369945526123, "logps/chosen": -51.98193359375, "logps/rejected": -84.31365966796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.556525945663452, "rewards/margins": 14.048201560974121, "rewards/rejected": -17.604726791381836, "step": 1242 }, { "epoch": 7.365925925925926, "grad_norm": 0.12417063115745969, "learning_rate": 8.56062829196541e-09, "logits/chosen": -1.9344502687454224, "logits/rejected": -1.9332109689712524, "logps/chosen": -49.665287017822266, "logps/rejected": -76.17633819580078, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.408128261566162, "rewards/margins": 10.428698539733887, "rewards/rejected": -12.83682632446289, "step": 1243 }, { "epoch": 7.371851851851852, "grad_norm": 0.18621901326488574, "learning_rate": 8.392900357598959e-09, "logits/chosen": -1.5785934925079346, "logits/rejected": -1.6058554649353027, "logps/chosen": -56.491905212402344, "logps/rejected": -87.51421356201172, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.567201614379883, "rewards/margins": 13.711860656738281, "rewards/rejected": -18.27906036376953, "step": 1244 }, { "epoch": 7.377777777777778, "grad_norm": 0.2040690241681394, "learning_rate": 8.2268038087073e-09, "logits/chosen": -1.2561684846878052, "logits/rejected": -1.3627725839614868, "logps/chosen": -57.90671920776367, "logps/rejected": -68.91231536865234, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.6979713439941406, "rewards/margins": 11.910882949829102, "rewards/rejected": -15.608854293823242, "step": 1245 }, { "epoch": 7.383703703703704, "grad_norm": 0.09974348633584786, "learning_rate": 8.062339766811726e-09, "logits/chosen": -1.4112184047698975, "logits/rejected": -1.5091230869293213, "logps/chosen": -54.87425231933594, "logps/rejected": -84.4752197265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.7919604778289795, "rewards/margins": 13.433491706848145, "rewards/rejected": -17.225452423095703, "step": 1246 }, { "epoch": 7.3896296296296295, "grad_norm": 0.09597782439298651, "learning_rate": 7.899509342410376e-09, "logits/chosen": -1.48615562915802, "logits/rejected": -1.1188569068908691, "logps/chosen": -43.99871063232422, "logps/rejected": -87.73038482666016, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.9534430503845215, "rewards/margins": 12.593347549438477, "rewards/rejected": -16.546789169311523, "step": 1247 }, { "epoch": 7.395555555555555, "grad_norm": 0.0992930007513707, "learning_rate": 7.738313634970962e-09, "logits/chosen": -2.0040321350097656, "logits/rejected": -1.9670236110687256, "logps/chosen": -46.382930755615234, "logps/rejected": -87.54319763183594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.6569628715515137, "rewards/margins": 15.009819030761719, "rewards/rejected": -18.66678237915039, "step": 1248 }, { "epoch": 7.401481481481482, "grad_norm": 0.10285352268854367, "learning_rate": 7.578753732923132e-09, "logits/chosen": -1.5322126150131226, "logits/rejected": -1.5202000141143799, "logps/chosen": -48.83302307128906, "logps/rejected": -90.44819641113281, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.2107291221618652, "rewards/margins": 14.075639724731445, "rewards/rejected": -17.28636932373047, "step": 1249 }, { "epoch": 7.407407407407407, "grad_norm": 0.07384109176112451, "learning_rate": 7.4208307136512385e-09, "logits/chosen": -1.393030047416687, "logits/rejected": -1.254082202911377, "logps/chosen": -40.82709884643555, "logps/rejected": -70.26875305175781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.902104377746582, "rewards/margins": 11.157198905944824, "rewards/rejected": -15.059304237365723, "step": 1250 }, { "epoch": 7.413333333333333, "grad_norm": 0.10033634481057024, "learning_rate": 7.2645456434869965e-09, "logits/chosen": -1.778923511505127, "logits/rejected": -1.6090879440307617, "logps/chosen": -52.271240234375, "logps/rejected": -99.33088684082031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.88856315612793, "rewards/margins": 14.440168380737305, "rewards/rejected": -19.328731536865234, "step": 1251 }, { "epoch": 7.41925925925926, "grad_norm": 0.07932056686265246, "learning_rate": 7.109899577702389e-09, "logits/chosen": -1.5965080261230469, "logits/rejected": -1.5556455850601196, "logps/chosen": -38.09616470336914, "logps/rejected": -73.47975158691406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.3132272958755493, "rewards/margins": 12.046424865722656, "rewards/rejected": -13.35965347290039, "step": 1252 }, { "epoch": 7.425185185185185, "grad_norm": 0.08761752858525221, "learning_rate": 6.956893560502358e-09, "logits/chosen": -2.030911445617676, "logits/rejected": -1.9399561882019043, "logps/chosen": -42.745521545410156, "logps/rejected": -75.47513580322266, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.092884063720703, "rewards/margins": 10.458334922790527, "rewards/rejected": -13.551218032836914, "step": 1253 }, { "epoch": 7.431111111111111, "grad_norm": 0.10380101419190634, "learning_rate": 6.805528625018014e-09, "logits/chosen": -1.7056939601898193, "logits/rejected": -1.5778566598892212, "logps/chosen": -54.559730529785156, "logps/rejected": -87.96572875976562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.20688533782959, "rewards/margins": 12.745380401611328, "rewards/rejected": -16.9522647857666, "step": 1254 }, { "epoch": 7.437037037037037, "grad_norm": 0.10501305505929422, "learning_rate": 6.655805793299413e-09, "logits/chosen": -1.62470281124115, "logits/rejected": -1.469839334487915, "logps/chosen": -42.00901794433594, "logps/rejected": -80.80410766601562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.080652952194214, "rewards/margins": 12.35036849975586, "rewards/rejected": -15.43101978302002, "step": 1255 }, { "epoch": 7.442962962962963, "grad_norm": 0.10261502163857839, "learning_rate": 6.5077260763087836e-09, "logits/chosen": -1.2496845722198486, "logits/rejected": -1.1165101528167725, "logps/chosen": -43.556697845458984, "logps/rejected": -72.50883483886719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.3539299964904785, "rewards/margins": 10.830613136291504, "rewards/rejected": -14.18454360961914, "step": 1256 }, { "epoch": 7.448888888888889, "grad_norm": 0.13754126034863676, "learning_rate": 6.361290473913705e-09, "logits/chosen": -1.7846851348876953, "logits/rejected": -1.6844943761825562, "logps/chosen": -54.899391174316406, "logps/rejected": -99.20624542236328, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.213879108428955, "rewards/margins": 15.631208419799805, "rewards/rejected": -19.8450870513916, "step": 1257 }, { "epoch": 7.454814814814815, "grad_norm": 0.13332224892140737, "learning_rate": 6.216499974880274e-09, "logits/chosen": -1.4883358478546143, "logits/rejected": -1.5634472370147705, "logps/chosen": -45.47296142578125, "logps/rejected": -67.79109191894531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.1466634273529053, "rewards/margins": 12.485091209411621, "rewards/rejected": -14.631753921508789, "step": 1258 }, { "epoch": 7.460740740740741, "grad_norm": 0.10757164589821686, "learning_rate": 6.073355556866527e-09, "logits/chosen": -1.5689035654067993, "logits/rejected": -1.6330454349517822, "logps/chosen": -62.354061126708984, "logps/rejected": -74.84004211425781, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.7635915279388428, "rewards/margins": 12.152324676513672, "rewards/rejected": -15.915916442871094, "step": 1259 }, { "epoch": 7.466666666666667, "grad_norm": 0.15137621474800703, "learning_rate": 5.9318581864157555e-09, "logits/chosen": -1.6206462383270264, "logits/rejected": -1.5390565395355225, "logps/chosen": -45.372283935546875, "logps/rejected": -71.48526000976562, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.424093723297119, "rewards/margins": 11.92187213897705, "rewards/rejected": -15.345966339111328, "step": 1260 }, { "epoch": 7.4725925925925925, "grad_norm": 0.06945252873508109, "learning_rate": 5.792008818950034e-09, "logits/chosen": -1.677442193031311, "logits/rejected": -1.543940544128418, "logps/chosen": -41.09063720703125, "logps/rejected": -78.29434204101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.2955427169799805, "rewards/margins": 12.228326797485352, "rewards/rejected": -15.523869514465332, "step": 1261 }, { "epoch": 7.478518518518518, "grad_norm": 0.09200967443221757, "learning_rate": 5.653808398763726e-09, "logits/chosen": -1.9046722650527954, "logits/rejected": -1.8483989238739014, "logps/chosen": -35.713706970214844, "logps/rejected": -54.89629364013672, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.9885141253471375, "rewards/margins": 9.532726287841797, "rewards/rejected": -10.521241188049316, "step": 1262 }, { "epoch": 7.484444444444445, "grad_norm": 0.08851787133475601, "learning_rate": 5.5172578590171606e-09, "logits/chosen": -1.712750792503357, "logits/rejected": -1.7361048460006714, "logps/chosen": -33.406742095947266, "logps/rejected": -64.78385925292969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.1024856567382812, "rewards/margins": 11.14158821105957, "rewards/rejected": -13.244073867797852, "step": 1263 }, { "epoch": 7.49037037037037, "grad_norm": 0.13185333469961655, "learning_rate": 5.382358121730296e-09, "logits/chosen": -2.138230800628662, "logits/rejected": -2.0349910259246826, "logps/chosen": -38.78353500366211, "logps/rejected": -73.98619842529297, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.396270751953125, "rewards/margins": 11.905975341796875, "rewards/rejected": -14.30224609375, "step": 1264 }, { "epoch": 7.496296296296296, "grad_norm": 0.11830301808182496, "learning_rate": 5.249110097776482e-09, "logits/chosen": -1.7554690837860107, "logits/rejected": -1.6527049541473389, "logps/chosen": -55.28874969482422, "logps/rejected": -85.10946655273438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.7993483543396, "rewards/margins": 12.124895095825195, "rewards/rejected": -17.924243927001953, "step": 1265 }, { "epoch": 7.502222222222223, "grad_norm": 0.10086200648031751, "learning_rate": 5.117514686876378e-09, "logits/chosen": -1.4983659982681274, "logits/rejected": -1.3238434791564941, "logps/chosen": -41.69336700439453, "logps/rejected": -90.9068603515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.6861684322357178, "rewards/margins": 14.651307106018066, "rewards/rejected": -18.337474822998047, "step": 1266 }, { "epoch": 7.508148148148148, "grad_norm": 0.11418057405054963, "learning_rate": 4.987572777591764e-09, "logits/chosen": -2.029273271560669, "logits/rejected": -1.9411393404006958, "logps/chosen": -50.297698974609375, "logps/rejected": -85.19852447509766, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.682843208312988, "rewards/margins": 13.862180709838867, "rewards/rejected": -18.545024871826172, "step": 1267 }, { "epoch": 7.514074074074074, "grad_norm": 0.110024960488623, "learning_rate": 4.859285247319656e-09, "logits/chosen": -1.9883079528808594, "logits/rejected": -1.835797667503357, "logps/chosen": -38.44123077392578, "logps/rejected": -76.23625183105469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.574667453765869, "rewards/margins": 12.788910865783691, "rewards/rejected": -16.36357879638672, "step": 1268 }, { "epoch": 7.52, "grad_norm": 0.07976217403911946, "learning_rate": 4.732652962286282e-09, "logits/chosen": -1.9681137800216675, "logits/rejected": -1.8588993549346924, "logps/chosen": -48.73500442504883, "logps/rejected": -96.4697265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.277359962463379, "rewards/margins": 12.159460067749023, "rewards/rejected": -17.436819076538086, "step": 1269 }, { "epoch": 7.525925925925926, "grad_norm": 0.17881558205057357, "learning_rate": 4.607676777541342e-09, "logits/chosen": -1.4052648544311523, "logits/rejected": -1.2790547609329224, "logps/chosen": -48.86443328857422, "logps/rejected": -83.3725357055664, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.212798118591309, "rewards/margins": 12.557598114013672, "rewards/rejected": -16.770395278930664, "step": 1270 }, { "epoch": 7.531851851851852, "grad_norm": 0.13713918527473565, "learning_rate": 4.4843575369521155e-09, "logits/chosen": -1.8819085359573364, "logits/rejected": -2.066300868988037, "logps/chosen": -74.62556457519531, "logps/rejected": -95.09159851074219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.8993940353393555, "rewards/margins": 12.837356567382812, "rewards/rejected": -19.736751556396484, "step": 1271 }, { "epoch": 7.5377777777777775, "grad_norm": 0.11548976688435701, "learning_rate": 4.362696073197863e-09, "logits/chosen": -1.3876936435699463, "logits/rejected": -1.496161937713623, "logps/chosen": -46.12397003173828, "logps/rejected": -64.80559539794922, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.6518354415893555, "rewards/margins": 10.367033004760742, "rewards/rejected": -13.018869400024414, "step": 1272 }, { "epoch": 7.543703703703704, "grad_norm": 0.14402006709619897, "learning_rate": 4.242693207764159e-09, "logits/chosen": -1.6995124816894531, "logits/rejected": -1.7169837951660156, "logps/chosen": -46.91732406616211, "logps/rejected": -79.42771911621094, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.7356224060058594, "rewards/margins": 13.237366676330566, "rewards/rejected": -15.972990036010742, "step": 1273 }, { "epoch": 7.54962962962963, "grad_norm": 0.06793914185958956, "learning_rate": 4.12434975093734e-09, "logits/chosen": -1.6203457117080688, "logits/rejected": -1.6125106811523438, "logps/chosen": -48.54278564453125, "logps/rejected": -76.17129516601562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.657160758972168, "rewards/margins": 12.374560356140137, "rewards/rejected": -17.031721115112305, "step": 1274 }, { "epoch": 7.555555555555555, "grad_norm": 0.08491480689935392, "learning_rate": 4.007666501799012e-09, "logits/chosen": -2.2714755535125732, "logits/rejected": -2.1652722358703613, "logps/chosen": -45.407203674316406, "logps/rejected": -86.5791015625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.742861270904541, "rewards/margins": 11.433135032653809, "rewards/rejected": -15.175996780395508, "step": 1275 }, { "epoch": 7.561481481481481, "grad_norm": 0.11529216841661874, "learning_rate": 3.89264424822075e-09, "logits/chosen": -1.641927719116211, "logits/rejected": -1.4134973287582397, "logps/chosen": -49.21048355102539, "logps/rejected": -88.19728088378906, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.468847274780273, "rewards/margins": 11.007088661193848, "rewards/rejected": -15.475936889648438, "step": 1276 }, { "epoch": 7.567407407407408, "grad_norm": 0.11468580187199377, "learning_rate": 3.779283766858682e-09, "logits/chosen": -1.4971985816955566, "logits/rejected": -1.2270244359970093, "logps/chosen": -28.876060485839844, "logps/rejected": -70.2495346069336, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.6484756469726562, "rewards/margins": 12.464676856994629, "rewards/rejected": -14.113153457641602, "step": 1277 }, { "epoch": 7.573333333333333, "grad_norm": 0.10707890797785785, "learning_rate": 3.667585823148217e-09, "logits/chosen": -1.5404367446899414, "logits/rejected": -1.5498936176300049, "logps/chosen": -54.821353912353516, "logps/rejected": -76.89654541015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.003150939941406, "rewards/margins": 10.68851089477539, "rewards/rejected": -14.691661834716797, "step": 1278 }, { "epoch": 7.579259259259259, "grad_norm": 0.13044904772149862, "learning_rate": 3.5575511712990504e-09, "logits/chosen": -1.5401300191879272, "logits/rejected": -1.5082144737243652, "logps/chosen": -49.69709396362305, "logps/rejected": -83.05561065673828, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.6358282566070557, "rewards/margins": 11.022361755371094, "rewards/rejected": -14.65818977355957, "step": 1279 }, { "epoch": 7.5851851851851855, "grad_norm": 0.1139761611228426, "learning_rate": 3.4491805542899155e-09, "logits/chosen": -1.276806354522705, "logits/rejected": -1.2018065452575684, "logps/chosen": -38.74787139892578, "logps/rejected": -74.66535186767578, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.6435717344284058, "rewards/margins": 12.292912483215332, "rewards/rejected": -13.936485290527344, "step": 1280 }, { "epoch": 7.591111111111111, "grad_norm": 0.11980572970276052, "learning_rate": 3.342474703863507e-09, "logits/chosen": -1.935957908630371, "logits/rejected": -1.670924425125122, "logps/chosen": -50.889198303222656, "logps/rejected": -93.57077026367188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.593352794647217, "rewards/margins": 14.393285751342773, "rewards/rejected": -17.986637115478516, "step": 1281 }, { "epoch": 7.597037037037037, "grad_norm": 0.14396962997949114, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -1.7795342206954956, "logits/rejected": -1.6266406774520874, "logps/chosen": -37.90673065185547, "logps/rejected": -76.17749786376953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9176108837127686, "rewards/margins": 13.06861686706543, "rewards/rejected": -15.986227035522461, "step": 1282 }, { "epoch": 7.6029629629629625, "grad_norm": 0.050320004658963226, "learning_rate": 3.1340601735209137e-09, "logits/chosen": -1.5465812683105469, "logits/rejected": -1.3979713916778564, "logps/chosen": -44.11369323730469, "logps/rejected": -81.26278686523438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.2288765907287598, "rewards/margins": 13.506202697753906, "rewards/rejected": -16.735078811645508, "step": 1283 }, { "epoch": 7.608888888888889, "grad_norm": 0.12064099276652479, "learning_rate": 3.0323529008664807e-09, "logits/chosen": -1.6967335939407349, "logits/rejected": -1.5087066888809204, "logps/chosen": -42.999183654785156, "logps/rejected": -77.17369079589844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.500814437866211, "rewards/margins": 11.972524642944336, "rewards/rejected": -14.473339080810547, "step": 1284 }, { "epoch": 7.614814814814815, "grad_norm": 0.12569846714622554, "learning_rate": 2.9323132093088954e-09, "logits/chosen": -1.8397209644317627, "logits/rejected": -1.7531307935714722, "logps/chosen": -36.641639709472656, "logps/rejected": -76.00559997558594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.290510654449463, "rewards/margins": 14.336465835571289, "rewards/rejected": -16.626977920532227, "step": 1285 }, { "epoch": 7.62074074074074, "grad_norm": 0.16279004525534851, "learning_rate": 2.833941774338655e-09, "logits/chosen": -2.2504396438598633, "logits/rejected": -2.192629337310791, "logps/chosen": -41.64829635620117, "logps/rejected": -79.78437805175781, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.9262151718139648, "rewards/margins": 14.325329780578613, "rewards/rejected": -16.251544952392578, "step": 1286 }, { "epoch": 7.626666666666667, "grad_norm": 0.12188734734718085, "learning_rate": 2.7372392601817675e-09, "logits/chosen": -1.862606406211853, "logits/rejected": -1.7344017028808594, "logps/chosen": -46.619632720947266, "logps/rejected": -80.42242431640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7526557445526123, "rewards/margins": 12.96417236328125, "rewards/rejected": -15.716828346252441, "step": 1287 }, { "epoch": 7.632592592592593, "grad_norm": 0.10275281860218967, "learning_rate": 2.6422063197953926e-09, "logits/chosen": -2.094736099243164, "logits/rejected": -2.141115427017212, "logps/chosen": -49.34331130981445, "logps/rejected": -82.15042114257812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.431687593460083, "rewards/margins": 11.898085594177246, "rewards/rejected": -15.32977294921875, "step": 1288 }, { "epoch": 7.638518518518518, "grad_norm": 0.10108153760024327, "learning_rate": 2.548843594863348e-09, "logits/chosen": -1.8961472511291504, "logits/rejected": -1.8754181861877441, "logps/chosen": -51.4503173828125, "logps/rejected": -77.76640319824219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0304574966430664, "rewards/margins": 11.970703125, "rewards/rejected": -15.00115966796875, "step": 1289 }, { "epoch": 7.644444444444445, "grad_norm": 0.11013184830599393, "learning_rate": 2.4571517157916944e-09, "logits/chosen": -1.6822834014892578, "logits/rejected": -1.51578688621521, "logps/chosen": -33.87409210205078, "logps/rejected": -77.11097717285156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.183983325958252, "rewards/margins": 12.748117446899414, "rewards/rejected": -14.932100296020508, "step": 1290 }, { "epoch": 7.6503703703703705, "grad_norm": 0.2332749309815809, "learning_rate": 2.3671313017046557e-09, "logits/chosen": -1.6843562126159668, "logits/rejected": -1.731232762336731, "logps/chosen": -53.30952453613281, "logps/rejected": -75.22252655029297, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.883240699768066, "rewards/margins": 11.473252296447754, "rewards/rejected": -16.35649299621582, "step": 1291 }, { "epoch": 7.656296296296296, "grad_norm": 0.06386399382541402, "learning_rate": 2.27878296044029e-09, "logits/chosen": -1.9962579011917114, "logits/rejected": -1.9769573211669922, "logps/chosen": -50.019596099853516, "logps/rejected": -79.72898864746094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.3184549808502197, "rewards/margins": 11.753643989562988, "rewards/rejected": -15.072098731994629, "step": 1292 }, { "epoch": 7.662222222222223, "grad_norm": 0.1382007429150244, "learning_rate": 2.1921072885464633e-09, "logits/chosen": -1.746502161026001, "logits/rejected": -1.880075216293335, "logps/chosen": -46.33978271484375, "logps/rejected": -69.8848648071289, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.3827314376831055, "rewards/margins": 10.580240249633789, "rewards/rejected": -13.962972640991211, "step": 1293 }, { "epoch": 7.668148148148148, "grad_norm": 0.08545353980119474, "learning_rate": 2.1071048712768545e-09, "logits/chosen": -2.118556499481201, "logits/rejected": -2.007209062576294, "logps/chosen": -39.88838195800781, "logps/rejected": -72.66770935058594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.8593382835388184, "rewards/margins": 11.902694702148438, "rewards/rejected": -13.762033462524414, "step": 1294 }, { "epoch": 7.674074074074074, "grad_norm": 0.10569469042372463, "learning_rate": 2.0237762825868752e-09, "logits/chosen": -2.210296154022217, "logits/rejected": -2.1025829315185547, "logps/chosen": -53.71271514892578, "logps/rejected": -79.43670654296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.210070610046387, "rewards/margins": 12.35400390625, "rewards/rejected": -16.56407356262207, "step": 1295 }, { "epoch": 7.68, "grad_norm": 0.12680804276707394, "learning_rate": 1.9421220851298657e-09, "logits/chosen": -1.7897592782974243, "logits/rejected": -1.6870161294937134, "logps/chosen": -45.054420471191406, "logps/rejected": -87.12640380859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4378550052642822, "rewards/margins": 13.260967254638672, "rewards/rejected": -16.698823928833008, "step": 1296 }, { "epoch": 7.685925925925926, "grad_norm": 0.1026870860090837, "learning_rate": 1.8621428302533492e-09, "logits/chosen": -1.8493924140930176, "logits/rejected": -1.8512948751449585, "logps/chosen": -46.09538650512695, "logps/rejected": -74.38146209716797, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.512159824371338, "rewards/margins": 11.365545272827148, "rewards/rejected": -14.877706527709961, "step": 1297 }, { "epoch": 7.691851851851852, "grad_norm": 0.12428573647645252, "learning_rate": 1.7838390579952567e-09, "logits/chosen": -1.5749092102050781, "logits/rejected": -1.710465908050537, "logps/chosen": -43.68793487548828, "logps/rejected": -69.81632995605469, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.554255723953247, "rewards/margins": 11.78613567352295, "rewards/rejected": -15.340391159057617, "step": 1298 }, { "epoch": 7.697777777777778, "grad_norm": 0.07377154770168977, "learning_rate": 1.7072112970802633e-09, "logits/chosen": -1.6239237785339355, "logits/rejected": -1.5087684392929077, "logps/chosen": -39.3017578125, "logps/rejected": -78.0733642578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.597720146179199, "rewards/margins": 12.936697006225586, "rewards/rejected": -15.534416198730469, "step": 1299 }, { "epoch": 7.703703703703704, "grad_norm": 0.11840426071981136, "learning_rate": 1.6322600649162354e-09, "logits/chosen": -1.8614153861999512, "logits/rejected": -1.8858567476272583, "logps/chosen": -49.18476867675781, "logps/rejected": -71.32328796386719, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.097779273986816, "rewards/margins": 9.952096939086914, "rewards/rejected": -14.04987621307373, "step": 1300 }, { "epoch": 7.70962962962963, "grad_norm": 0.11361634619507371, "learning_rate": 1.5589858675907618e-09, "logits/chosen": -1.7827186584472656, "logits/rejected": -1.7072844505310059, "logps/chosen": -50.35566711425781, "logps/rejected": -82.6529769897461, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.8080339431762695, "rewards/margins": 11.548542022705078, "rewards/rejected": -15.356575965881348, "step": 1301 }, { "epoch": 7.7155555555555555, "grad_norm": 0.10114625449764604, "learning_rate": 1.4873891998677112e-09, "logits/chosen": -1.7124638557434082, "logits/rejected": -1.6303023099899292, "logps/chosen": -39.04692077636719, "logps/rejected": -69.08177947998047, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.496570110321045, "rewards/margins": 10.149101257324219, "rewards/rejected": -11.645671844482422, "step": 1302 }, { "epoch": 7.721481481481481, "grad_norm": 0.10974877746783143, "learning_rate": 1.4174705451838743e-09, "logits/chosen": -1.6638457775115967, "logits/rejected": -1.7446383237838745, "logps/chosen": -44.485374450683594, "logps/rejected": -66.89187622070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.829464316368103, "rewards/margins": 11.198625564575195, "rewards/rejected": -13.02808952331543, "step": 1303 }, { "epoch": 7.727407407407408, "grad_norm": 0.11233063444015381, "learning_rate": 1.3492303756457158e-09, "logits/chosen": -1.914198398590088, "logits/rejected": -1.5145885944366455, "logps/chosen": -45.77354431152344, "logps/rejected": -98.0726089477539, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.005705833435059, "rewards/margins": 17.062856674194336, "rewards/rejected": -22.06856346130371, "step": 1304 }, { "epoch": 7.733333333333333, "grad_norm": 0.11411994741058401, "learning_rate": 1.2826691520262112e-09, "logits/chosen": -1.5672929286956787, "logits/rejected": -1.4095594882965088, "logps/chosen": -43.227386474609375, "logps/rejected": -78.2294692993164, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.725564956665039, "rewards/margins": 10.642614364624023, "rewards/rejected": -14.368179321289062, "step": 1305 }, { "epoch": 7.739259259259259, "grad_norm": 0.11628123883143882, "learning_rate": 1.2177873237617375e-09, "logits/chosen": -1.3895998001098633, "logits/rejected": -1.5497479438781738, "logps/chosen": -59.86248016357422, "logps/rejected": -70.28961181640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.182040691375732, "rewards/margins": 10.678327560424805, "rewards/rejected": -15.860368728637695, "step": 1306 }, { "epoch": 7.745185185185186, "grad_norm": 0.10568108304021323, "learning_rate": 1.1545853289489927e-09, "logits/chosen": -1.4103091955184937, "logits/rejected": -1.400766372680664, "logps/chosen": -34.94217300415039, "logps/rejected": -60.6082763671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.777432441711426, "rewards/margins": 9.586267471313477, "rewards/rejected": -12.363698959350586, "step": 1307 }, { "epoch": 7.751111111111111, "grad_norm": 0.12159343302298652, "learning_rate": 1.0930635943420253e-09, "logits/chosen": -2.34028697013855, "logits/rejected": -2.267382860183716, "logps/chosen": -37.190250396728516, "logps/rejected": -84.93000030517578, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.365945339202881, "rewards/margins": 14.706781387329102, "rewards/rejected": -17.07272720336914, "step": 1308 }, { "epoch": 7.757037037037037, "grad_norm": 0.11929320977244282, "learning_rate": 1.0332225353494318e-09, "logits/chosen": -1.680659294128418, "logits/rejected": -1.76425039768219, "logps/chosen": -55.115943908691406, "logps/rejected": -80.5603256225586, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.972663164138794, "rewards/margins": 12.271014213562012, "rewards/rejected": -16.243677139282227, "step": 1309 }, { "epoch": 7.762962962962963, "grad_norm": 0.09092136853386971, "learning_rate": 9.750625560315528e-10, "logits/chosen": -1.676912546157837, "logits/rejected": -1.6585233211517334, "logps/chosen": -52.220970153808594, "logps/rejected": -79.3109130859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.994405746459961, "rewards/margins": 11.580153465270996, "rewards/rejected": -14.57455825805664, "step": 1310 }, { "epoch": 7.768888888888889, "grad_norm": 0.1932632252968447, "learning_rate": 9.185840490975594e-10, "logits/chosen": -1.5779668092727661, "logits/rejected": -1.5990784168243408, "logps/chosen": -47.03139114379883, "logps/rejected": -77.99703216552734, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.3039872646331787, "rewards/margins": 13.618040084838867, "rewards/rejected": -16.922027587890625, "step": 1311 }, { "epoch": 7.774814814814815, "grad_norm": 0.11174616662096246, "learning_rate": 8.637873959031206e-10, "logits/chosen": -2.237205982208252, "logits/rejected": -2.167008638381958, "logps/chosen": -43.71097183227539, "logps/rejected": -75.61814880371094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.187469005584717, "rewards/margins": 12.020849227905273, "rewards/rejected": -15.208316802978516, "step": 1312 }, { "epoch": 7.7807407407407405, "grad_norm": 0.10082024626851338, "learning_rate": 8.106729664475176e-10, "logits/chosen": -2.1557466983795166, "logits/rejected": -2.0947141647338867, "logps/chosen": -40.8330078125, "logps/rejected": -75.10816955566406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.995957851409912, "rewards/margins": 11.368391990661621, "rewards/rejected": -15.364350318908691, "step": 1313 }, { "epoch": 7.786666666666667, "grad_norm": 0.11218104181699615, "learning_rate": 7.592411193713122e-10, "logits/chosen": -1.7130632400512695, "logits/rejected": -1.5563557147979736, "logps/chosen": -56.98699188232422, "logps/rejected": -95.31857299804688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.602969646453857, "rewards/margins": 11.59247875213623, "rewards/rejected": -17.195449829101562, "step": 1314 }, { "epoch": 7.792592592592593, "grad_norm": 0.09595333522956707, "learning_rate": 7.094922019539318e-10, "logits/chosen": -1.564188838005066, "logits/rejected": -1.606329083442688, "logps/chosen": -39.70273971557617, "logps/rejected": -75.67445373535156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.5167977809906006, "rewards/margins": 14.296806335449219, "rewards/rejected": -17.813602447509766, "step": 1315 }, { "epoch": 7.798518518518518, "grad_norm": 0.1047928513357108, "learning_rate": 6.61426550111227e-10, "logits/chosen": -1.9991806745529175, "logits/rejected": -1.858351230621338, "logps/chosen": -38.9638671875, "logps/rejected": -89.70887756347656, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.846712112426758, "rewards/margins": 14.311556816101074, "rewards/rejected": -17.158267974853516, "step": 1316 }, { "epoch": 7.804444444444444, "grad_norm": 0.12049712411179087, "learning_rate": 6.150444883933348e-10, "logits/chosen": -1.5024068355560303, "logits/rejected": -1.3233418464660645, "logps/chosen": -47.504905700683594, "logps/rejected": -93.08120727539062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.704084634780884, "rewards/margins": 13.947275161743164, "rewards/rejected": -17.65135955810547, "step": 1317 }, { "epoch": 7.810370370370371, "grad_norm": 0.12781150582163409, "learning_rate": 5.703463299823186e-10, "logits/chosen": -1.7797167301177979, "logits/rejected": -1.7678923606872559, "logps/chosen": -33.082733154296875, "logps/rejected": -97.64846801757812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.2870450019836426, "rewards/margins": 17.191646575927734, "rewards/rejected": -18.47869110107422, "step": 1318 }, { "epoch": 7.816296296296296, "grad_norm": 0.10997039292160629, "learning_rate": 5.27332376690226e-10, "logits/chosen": -2.0640406608581543, "logits/rejected": -1.9499751329421997, "logps/chosen": -46.399940490722656, "logps/rejected": -87.12672424316406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.000057220458984, "rewards/margins": 11.56611442565918, "rewards/rejected": -15.566171646118164, "step": 1319 }, { "epoch": 7.822222222222222, "grad_norm": 0.09734750549803473, "learning_rate": 4.860029189569237e-10, "logits/chosen": -2.239694595336914, "logits/rejected": -2.2137787342071533, "logps/chosen": -58.1463737487793, "logps/rejected": -77.39590454101562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.256124973297119, "rewards/margins": 11.735958099365234, "rewards/rejected": -16.992082595825195, "step": 1320 }, { "epoch": 7.8281481481481485, "grad_norm": 0.06290331703403879, "learning_rate": 4.463582358482376e-10, "logits/chosen": -2.029208183288574, "logits/rejected": -1.9457252025604248, "logps/chosen": -48.41543197631836, "logps/rejected": -94.46034240722656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.4602203369140625, "rewards/margins": 14.713550567626953, "rewards/rejected": -18.173768997192383, "step": 1321 }, { "epoch": 7.834074074074074, "grad_norm": 0.11405387953819877, "learning_rate": 4.083985950539548e-10, "logits/chosen": -1.8435865640640259, "logits/rejected": -1.8563013076782227, "logps/chosen": -52.38283920288086, "logps/rejected": -86.04027557373047, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.0260491371154785, "rewards/margins": 13.521184921264648, "rewards/rejected": -17.54723358154297, "step": 1322 }, { "epoch": 7.84, "grad_norm": 0.1428495263399808, "learning_rate": 3.721242528861024e-10, "logits/chosen": -1.6836150884628296, "logits/rejected": -1.6538059711456299, "logps/chosen": -46.02968978881836, "logps/rejected": -71.76293182373047, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.687353134155273, "rewards/margins": 11.388587951660156, "rewards/rejected": -16.075942993164062, "step": 1323 }, { "epoch": 7.8459259259259255, "grad_norm": 0.1021413462070928, "learning_rate": 3.3753545427722687e-10, "logits/chosen": -2.0874242782592773, "logits/rejected": -2.1049461364746094, "logps/chosen": -44.546688079833984, "logps/rejected": -89.52922058105469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.7562994956970215, "rewards/margins": 12.259674072265625, "rewards/rejected": -16.015974044799805, "step": 1324 }, { "epoch": 7.851851851851852, "grad_norm": 0.11440705165713286, "learning_rate": 3.0463243277864534e-10, "logits/chosen": -1.6675009727478027, "logits/rejected": -1.6661535501480103, "logps/chosen": -41.84225082397461, "logps/rejected": -66.14202117919922, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.4963809251785278, "rewards/margins": 11.167799949645996, "rewards/rejected": -12.664179801940918, "step": 1325 }, { "epoch": 7.857777777777778, "grad_norm": 0.08839074484858349, "learning_rate": 2.734154105589748e-10, "logits/chosen": -1.431457757949829, "logits/rejected": -1.4145808219909668, "logps/chosen": -34.363014221191406, "logps/rejected": -62.02093505859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.3365049362182617, "rewards/margins": 10.666013717651367, "rewards/rejected": -13.002519607543945, "step": 1326 }, { "epoch": 7.863703703703703, "grad_norm": 0.10791454724929499, "learning_rate": 2.4388459840257724e-10, "logits/chosen": -1.8096765279769897, "logits/rejected": -1.8453236818313599, "logps/chosen": -40.75598907470703, "logps/rejected": -75.48899841308594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.80953311920166, "rewards/margins": 13.202881813049316, "rewards/rejected": -17.012414932250977, "step": 1327 }, { "epoch": 7.86962962962963, "grad_norm": 0.11257458760022891, "learning_rate": 2.1604019570811704e-10, "logits/chosen": -1.8961286544799805, "logits/rejected": -1.9870917797088623, "logps/chosen": -49.68988037109375, "logps/rejected": -73.82746887207031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7470507621765137, "rewards/margins": 12.514230728149414, "rewards/rejected": -15.261281967163086, "step": 1328 }, { "epoch": 7.875555555555556, "grad_norm": 0.08900179041357273, "learning_rate": 1.8988239048725595e-10, "logits/chosen": -1.9694173336029053, "logits/rejected": -1.8216187953948975, "logps/chosen": -44.448753356933594, "logps/rejected": -81.052490234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.6496639251708984, "rewards/margins": 12.538106918334961, "rewards/rejected": -16.18777084350586, "step": 1329 }, { "epoch": 7.881481481481481, "grad_norm": 0.09927287718802277, "learning_rate": 1.6541135936343208e-10, "logits/chosen": -2.2331199645996094, "logits/rejected": -1.9718306064605713, "logps/chosen": -51.08818054199219, "logps/rejected": -108.66032409667969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.2654128074645996, "rewards/margins": 15.549224853515625, "rewards/rejected": -18.81463623046875, "step": 1330 }, { "epoch": 7.887407407407407, "grad_norm": 0.13440981318068518, "learning_rate": 1.426272675704998e-10, "logits/chosen": -1.9113953113555908, "logits/rejected": -1.9644521474838257, "logps/chosen": -47.6470947265625, "logps/rejected": -83.68902587890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.3995003700256348, "rewards/margins": 13.84328556060791, "rewards/rejected": -17.24278450012207, "step": 1331 }, { "epoch": 7.8933333333333335, "grad_norm": 0.11681222659218105, "learning_rate": 1.2153026895178608e-10, "logits/chosen": -1.6768038272857666, "logits/rejected": -1.722691297531128, "logps/chosen": -52.901611328125, "logps/rejected": -76.805908203125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.354005813598633, "rewards/margins": 12.054499626159668, "rewards/rejected": -15.4085054397583, "step": 1332 }, { "epoch": 7.899259259259259, "grad_norm": 0.08841688627616534, "learning_rate": 1.0212050595895249e-10, "logits/chosen": -1.667407751083374, "logits/rejected": -1.7305935621261597, "logps/chosen": -52.599090576171875, "logps/rejected": -74.85166931152344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8365044593811035, "rewards/margins": 11.444159507751465, "rewards/rejected": -14.280664443969727, "step": 1333 }, { "epoch": 7.905185185185185, "grad_norm": 0.096242284043103, "learning_rate": 8.439810965113481e-11, "logits/chosen": -1.8078327178955078, "logits/rejected": -1.6372575759887695, "logps/chosen": -40.54448318481445, "logps/rejected": -73.397705078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.039003372192383, "rewards/margins": 11.167409896850586, "rewards/rejected": -15.206413269042969, "step": 1334 }, { "epoch": 7.911111111111111, "grad_norm": 0.10789774171873755, "learning_rate": 6.836319969388827e-11, "logits/chosen": -1.5104821920394897, "logits/rejected": -1.3770867586135864, "logps/chosen": -46.491546630859375, "logps/rejected": -78.11680603027344, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.605240821838379, "rewards/margins": 10.936412811279297, "rewards/rejected": -14.541654586791992, "step": 1335 }, { "epoch": 7.917037037037037, "grad_norm": 0.08868861034370847, "learning_rate": 5.4015884358549204e-11, "logits/chosen": -1.4172427654266357, "logits/rejected": -1.3312875032424927, "logps/chosen": -46.777320861816406, "logps/rejected": -77.1638412475586, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.3355093002319336, "rewards/margins": 13.356868743896484, "rewards/rejected": -16.692378997802734, "step": 1336 }, { "epoch": 7.922962962962963, "grad_norm": 0.09206868646615148, "learning_rate": 4.135626052143015e-11, "logits/chosen": -1.7921714782714844, "logits/rejected": -1.6932836771011353, "logps/chosen": -43.38299560546875, "logps/rejected": -83.28189086914062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.84717059135437, "rewards/margins": 13.680866241455078, "rewards/rejected": -17.52803611755371, "step": 1337 }, { "epoch": 7.928888888888888, "grad_norm": 0.20766800780823566, "learning_rate": 3.0384413663125944e-11, "logits/chosen": -1.8923168182373047, "logits/rejected": -2.0263829231262207, "logps/chosen": -42.76374435424805, "logps/rejected": -70.45816040039062, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.6587953567504883, "rewards/margins": 10.849041938781738, "rewards/rejected": -13.507837295532227, "step": 1338 }, { "epoch": 7.934814814814815, "grad_norm": 0.10849471922864087, "learning_rate": 2.110041786804184e-11, "logits/chosen": -1.8717421293258667, "logits/rejected": -1.8520114421844482, "logps/chosen": -54.066280364990234, "logps/rejected": -92.87806701660156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.783078193664551, "rewards/margins": 13.9727783203125, "rewards/rejected": -18.755857467651367, "step": 1339 }, { "epoch": 7.940740740740741, "grad_norm": 0.11149416835250378, "learning_rate": 1.350433582381072e-11, "logits/chosen": -1.572990894317627, "logits/rejected": -1.0800707340240479, "logps/chosen": -39.92055892944336, "logps/rejected": -71.92617797851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.5409345626831055, "rewards/margins": 10.899103164672852, "rewards/rejected": -13.44003677368164, "step": 1340 }, { "epoch": 7.946666666666666, "grad_norm": 0.11266971237546647, "learning_rate": 7.596218820876688e-12, "logits/chosen": -1.4236558675765991, "logits/rejected": -1.7003998756408691, "logps/chosen": -57.53947448730469, "logps/rejected": -71.41385650634766, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.154088020324707, "rewards/margins": 10.86117935180664, "rewards/rejected": -14.015266418457031, "step": 1341 }, { "epoch": 7.952592592592593, "grad_norm": 0.10920237224370884, "learning_rate": 3.376106752134289e-12, "logits/chosen": -2.0125412940979004, "logits/rejected": -1.873422622680664, "logps/chosen": -31.185543060302734, "logps/rejected": -70.14773559570312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.9257241487503052, "rewards/margins": 13.334260940551758, "rewards/rejected": -15.259984970092773, "step": 1342 }, { "epoch": 7.9585185185185185, "grad_norm": 0.09535432194912208, "learning_rate": 8.440281127897186e-13, "logits/chosen": -2.0392391681671143, "logits/rejected": -1.8588032722473145, "logps/chosen": -50.48688507080078, "logps/rejected": -101.44157409667969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.4265379905700684, "rewards/margins": 17.10596466064453, "rewards/rejected": -20.532503128051758, "step": 1343 }, { "epoch": 7.964444444444444, "grad_norm": 0.1169578512158143, "learning_rate": 0.0, "logits/chosen": -1.6152704954147339, "logits/rejected": -1.6888506412506104, "logps/chosen": -48.29261779785156, "logps/rejected": -78.50033569335938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.6884405612945557, "rewards/margins": 12.385543823242188, "rewards/rejected": -16.073984146118164, "step": 1344 }, { "epoch": 7.964444444444444, "step": 1344, "total_flos": 0.0, "train_loss": 0.11349717956385402, "train_runtime": 13006.5105, "train_samples_per_second": 6.64, "train_steps_per_second": 0.103 } ], "logging_steps": 1, "max_steps": 1344, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }