diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2881 +1,1441 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9997382884061764, + "epoch": 0.9994767137624281, "eval_steps": 500, - "global_step": 1910, + "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.005234231876472127, - "grad_norm": 312.0, - "learning_rate": 1.0416666666666667e-06, - "logits/chosen": 0.665995180606842, - "logits/rejected": 0.7168087959289551, - "logps/chosen": -331.14556884765625, - "logps/rejected": -289.13482666015625, - "loss": 0.6929, - "rewards/accuracies": 0.40312498807907104, - "rewards/chosen": -0.0007204435532912612, - "rewards/margins": 0.0008210704545490444, - "rewards/rejected": -0.0015415140660479665, + "epoch": 0.010465724751439037, + "grad_norm": 26.42055892944336, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": 0.7578125, + "logits/rejected": 0.97265625, + "logps/chosen": -284.0, + "logps/rejected": -294.0, + "loss": 0.6958, + "rewards/accuracies": 0.23125000298023224, + "rewards/chosen": 0.00531005859375, + "rewards/margins": 0.0027618408203125, + "rewards/rejected": 0.0025177001953125, "step": 10 }, { - "epoch": 0.010468463752944255, - "grad_norm": 284.0, - "learning_rate": 2.0833333333333334e-06, - "logits/chosen": 0.7776141166687012, - "logits/rejected": 0.7684425115585327, - "logps/chosen": -357.8346862792969, - "logps/rejected": -317.8344421386719, - "loss": 0.6936, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.0012695642653852701, - "rewards/margins": -0.0004827965167351067, - "rewards/rejected": -0.0007867676904425025, + "epoch": 0.020931449502878074, + "grad_norm": 25.812650680541992, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": 0.63671875, + "logits/rejected": 0.65234375, + "logps/chosen": -382.0, + "logps/rejected": -308.0, + "loss": 0.6917, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0172119140625, + "rewards/margins": 0.01953125, + "rewards/rejected": -0.002410888671875, "step": 20 }, { - "epoch": 0.015702695629416383, - "grad_norm": 322.0, - "learning_rate": 3.125e-06, - "logits/chosen": 0.795623779296875, - "logits/rejected": 0.8778733015060425, - "logps/chosen": -350.8582458496094, - "logps/rejected": -318.2168884277344, - "loss": 0.6904, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.0022780809085816145, - "rewards/margins": 0.0060666268691420555, - "rewards/rejected": -0.008344708010554314, + "epoch": 0.03139717425431711, + "grad_norm": 28.49739646911621, + "learning_rate": 6.249999999999999e-07, + "logits/chosen": 0.8203125, + "logits/rejected": 0.7421875, + "logps/chosen": -304.0, + "logps/rejected": -260.0, + "loss": 0.6961, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": 0.00148773193359375, + "rewards/margins": -0.005279541015625, + "rewards/rejected": 0.00677490234375, "step": 30 }, { - "epoch": 0.02093692750588851, - "grad_norm": 298.0, - "learning_rate": 4.166666666666667e-06, - "logits/chosen": 0.7740770578384399, - "logits/rejected": 0.8167956471443176, - "logps/chosen": -319.42022705078125, - "logps/rejected": -278.17071533203125, - "loss": 0.681, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.006301078014075756, - "rewards/margins": 0.026611831039190292, - "rewards/rejected": -0.02031075581908226, + "epoch": 0.04186289900575615, + "grad_norm": 25.176631927490234, + 
"learning_rate": 8.333333333333333e-07, + "logits/chosen": 0.80078125, + "logits/rejected": 0.98046875, + "logps/chosen": -340.0, + "logps/rejected": -320.0, + "loss": 0.6748, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0247802734375, + "rewards/margins": 0.0751953125, + "rewards/rejected": -0.05029296875, "step": 40 }, { - "epoch": 0.02617115938236064, - "grad_norm": 266.0, - "learning_rate": 5.208333333333334e-06, - "logits/chosen": 0.6866067051887512, - "logits/rejected": 0.7555549144744873, - "logps/chosen": -320.47479248046875, - "logps/rejected": -284.809814453125, - "loss": 0.6727, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.005391906015574932, - "rewards/margins": 0.04711990803480148, - "rewards/rejected": -0.05251181870698929, + "epoch": 0.052328623757195186, + "grad_norm": 23.213665008544922, + "learning_rate": 9.999880027023293e-07, + "logits/chosen": 0.8046875, + "logits/rejected": 0.87109375, + "logps/chosen": -322.0, + "logps/rejected": -280.0, + "loss": 0.6482, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.08544921875, + "rewards/margins": 0.1982421875, + "rewards/rejected": -0.11328125, "step": 50 }, { - "epoch": 0.031405391258832765, - "grad_norm": 288.0, - "learning_rate": 6.25e-06, - "logits/chosen": 0.6783124208450317, - "logits/rejected": 0.7343226671218872, - "logps/chosen": -337.063232421875, - "logps/rejected": -299.95220947265625, - "loss": 0.6656, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.011515669524669647, - "rewards/margins": 0.07207117229700089, - "rewards/rejected": -0.08358683437108994, + "epoch": 0.06279434850863422, + "grad_norm": 22.0967960357666, + "learning_rate": 9.995681577335256e-07, + "logits/chosen": 0.79296875, + "logits/rejected": 0.984375, + "logps/chosen": -320.0, + "logps/rejected": -300.0, + "loss": 0.6378, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.029541015625, + "rewards/margins": 0.1767578125, + "rewards/rejected": -0.1474609375, "step": 60 }, { - "epoch": 0.036639623135304895, - "grad_norm": 262.0, - "learning_rate": 7.291666666666667e-06, - "logits/chosen": 0.8541976809501648, - "logits/rejected": 0.9135104417800903, - "logps/chosen": -324.9010925292969, - "logps/rejected": -313.7442321777344, - "loss": 0.6576, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.033813267946243286, - "rewards/margins": 0.1023920327425003, - "rewards/rejected": -0.1362052857875824, + "epoch": 0.07326007326007326, + "grad_norm": 22.011857986450195, + "learning_rate": 9.985490234976131e-07, + "logits/chosen": 0.75390625, + "logits/rejected": 0.671875, + "logps/chosen": -342.0, + "logps/rejected": -264.0, + "loss": 0.6117, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.09375, + "rewards/margins": 0.443359375, + "rewards/rejected": -0.349609375, "step": 70 }, { - "epoch": 0.04187385501177702, - "grad_norm": 282.0, - "learning_rate": 8.333333333333334e-06, - "logits/chosen": 0.8204466700553894, - "logits/rejected": 0.9971317052841187, - "logps/chosen": -309.83648681640625, - "logps/rejected": -280.83837890625, - "loss": 0.649, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.051890332251787186, - "rewards/margins": 0.13043811917304993, - "rewards/rejected": -0.18232843279838562, + "epoch": 0.0837257980115123, + "grad_norm": 27.431547164916992, + "learning_rate": 9.969318225629239e-07, + "logits/chosen": 0.56640625, + "logits/rejected": 0.6875, + "logps/chosen": -388.0, + "logps/rejected": -342.0, + "loss": 0.6463, + 
"rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.0264892578125, + "rewards/margins": 0.451171875, + "rewards/rejected": -0.423828125, "step": 80 }, { - "epoch": 0.04710808688824915, - "grad_norm": 290.0, - "learning_rate": 9.375000000000001e-06, - "logits/chosen": 0.8248814344406128, - "logits/rejected": 0.798936128616333, - "logps/chosen": -330.08660888671875, - "logps/rejected": -278.1947937011719, - "loss": 0.6422, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.07527975738048553, - "rewards/margins": 0.17323294281959534, - "rewards/rejected": -0.24851271510124207, + "epoch": 0.09419152276295134, + "grad_norm": 24.862520217895508, + "learning_rate": 9.947184949473476e-07, + "logits/chosen": 0.796875, + "logits/rejected": 0.8359375, + "logps/chosen": -356.0, + "logps/rejected": -298.0, + "loss": 0.6146, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.055419921875, + "rewards/margins": 0.44921875, + "rewards/rejected": -0.392578125, "step": 90 }, { - "epoch": 0.05234231876472128, - "grad_norm": 234.0, - "learning_rate": 9.999880027023295e-06, - "logits/chosen": 0.5035718083381653, - "logits/rejected": 0.6347559094429016, - "logps/chosen": -328.1427917480469, - "logps/rejected": -298.84197998046875, - "loss": 0.6071, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.14533154666423798, - "rewards/margins": 0.28229671716690063, - "rewards/rejected": -0.4276282787322998, + "epoch": 0.10465724751439037, + "grad_norm": 23.47401237487793, + "learning_rate": 9.919116957910565e-07, + "logits/chosen": 0.4921875, + "logits/rejected": 0.51953125, + "logps/chosen": -358.0, + "logps/rejected": -298.0, + "loss": 0.6125, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.0224609375, + "rewards/margins": 0.49609375, + "rewards/rejected": -0.474609375, "step": 100 }, { - "epoch": 0.05757655064119341, - "grad_norm": 246.0, - "learning_rate": 9.998530397154684e-06, - "logits/chosen": 0.5344328284263611, - "logits/rejected": 0.6689058542251587, - "logps/chosen": -325.33978271484375, - "logps/rejected": -314.258544921875, - "loss": 0.6253, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.20693711936473846, - "rewards/margins": 0.24847209453582764, - "rewards/rejected": -0.4554091989994049, + "epoch": 0.1151229722658294, + "grad_norm": 24.995296478271484, + "learning_rate": 9.88514792171362e-07, + "logits/chosen": 0.671875, + "logits/rejected": 0.6796875, + "logps/chosen": -330.0, + "logps/rejected": -308.0, + "loss": 0.5918, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.06494140625, + "rewards/margins": 0.341796875, + "rewards/rejected": -0.408203125, "step": 110 }, { - "epoch": 0.06281078251766553, - "grad_norm": 284.0, - "learning_rate": 9.995681577335256e-06, - "logits/chosen": 0.4409152865409851, - "logits/rejected": 0.5330216288566589, - "logps/chosen": -340.72930908203125, - "logps/rejected": -319.43524169921875, - "loss": 0.6163, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.3839780390262604, - "rewards/margins": 0.3311913013458252, - "rewards/rejected": -0.7151693105697632, + "epoch": 0.12558869701726844, + "grad_norm": 22.512697219848633, + "learning_rate": 9.845318590635185e-07, + "logits/chosen": 0.5703125, + "logits/rejected": 0.62109375, + "logps/chosen": -340.0, + "logps/rejected": -270.0, + "loss": 0.5845, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.0174560546875, + "rewards/margins": 0.5390625, + "rewards/rejected": -0.55859375, "step": 120 }, { - "epoch": 
0.06804501439413765, - "grad_norm": 446.0, - "learning_rate": 9.99133442200056e-06, - "logits/chosen": 0.30335044860839844, - "logits/rejected": 0.46630460023880005, - "logps/chosen": -353.91961669921875, - "logps/rejected": -310.37689208984375, - "loss": 0.6173, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.37794750928878784, - "rewards/margins": 0.34848180413246155, - "rewards/rejected": -0.7264293432235718, + "epoch": 0.1360544217687075, + "grad_norm": 26.06734848022461, + "learning_rate": 9.799676744523238e-07, + "logits/chosen": 0.54296875, + "logits/rejected": 0.62890625, + "logps/chosen": -348.0, + "logps/rejected": -290.0, + "loss": 0.5964, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0166015625, + "rewards/margins": 0.55078125, + "rewards/rejected": -0.56640625, "step": 130 }, { - "epoch": 0.07327924627060979, - "grad_norm": 294.0, - "learning_rate": 9.985490234976132e-06, - "logits/chosen": 0.45180901885032654, - "logits/rejected": 0.5098147392272949, - "logps/chosen": -345.41558837890625, - "logps/rejected": -291.6056823730469, - "loss": 0.5936, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.4161813259124756, - "rewards/margins": 0.40652480721473694, - "rewards/rejected": -0.8227061033248901, + "epoch": 0.14652014652014653, + "grad_norm": 21.467668533325195, + "learning_rate": 9.748277136003789e-07, + "logits/chosen": 0.62890625, + "logits/rejected": 0.8828125, + "logps/chosen": -324.0, + "logps/rejected": -308.0, + "loss": 0.6083, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.041015625, + "rewards/margins": 0.369140625, + "rewards/rejected": -0.41015625, "step": 140 }, { - "epoch": 0.07851347814708191, - "grad_norm": 270.0, - "learning_rate": 9.978150769086457e-06, - "logits/chosen": 0.40853095054626465, - "logits/rejected": 0.5322204828262329, - "logps/chosen": -341.7745666503906, - "logps/rejected": -305.63043212890625, - "loss": 0.6364, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5094475746154785, - "rewards/margins": 0.3103070855140686, - "rewards/rejected": -0.8197546005249023, + "epoch": 0.15698587127158556, + "grad_norm": 24.102495193481445, + "learning_rate": 9.691181424798824e-07, + "logits/chosen": 0.62109375, + "logits/rejected": 0.66796875, + "logps/chosen": -298.0, + "logps/rejected": -276.0, + "loss": 0.6033, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.05615234375, + "rewards/margins": 0.423828125, + "rewards/rejected": -0.3671875, "step": 150 }, { - "epoch": 0.08374771002355404, - "grad_norm": 294.0, - "learning_rate": 9.96931822562924e-06, - "logits/chosen": 0.33679673075675964, - "logits/rejected": 0.4187542498111725, - "logps/chosen": -353.4910888671875, - "logps/rejected": -340.1171875, - "loss": 0.6111, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.3925863802433014, - "rewards/margins": 0.3052482306957245, - "rewards/rejected": -0.6978346109390259, + "epoch": 0.1674515960230246, + "grad_norm": 22.33710479736328, + "learning_rate": 9.628458103758402e-07, + "logits/chosen": 0.3828125, + "logits/rejected": 0.5234375, + "logps/chosen": -348.0, + "logps/rejected": -320.0, + "loss": 0.5875, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.00665283203125, + "rewards/margins": 0.44921875, + "rewards/rejected": -0.455078125, "step": 160 }, { - "epoch": 0.08898194190002617, - "grad_norm": 288.0, - "learning_rate": 9.958995253715193e-06, - "logits/chosen": 0.36519330739974976, - "logits/rejected": 0.3689248263835907, - 
"logps/chosen": -358.2540283203125, - "logps/rejected": -317.26116943359375, - "loss": 0.6105, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.3615415096282959, - "rewards/margins": 0.3146303594112396, - "rewards/rejected": -0.6761718988418579, + "epoch": 0.17791732077446362, + "grad_norm": 21.965810775756836, + "learning_rate": 9.560182416695637e-07, + "logits/chosen": 0.490234375, + "logits/rejected": 0.5, + "logps/chosen": -332.0, + "logps/rejected": -306.0, + "loss": 0.6099, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0615234375, + "rewards/margins": 0.28515625, + "rewards/rejected": -0.34765625, "step": 170 }, { - "epoch": 0.0942161737764983, - "grad_norm": 316.0, - "learning_rate": 9.947184949473478e-06, - "logits/chosen": 0.30113479495048523, - "logits/rejected": 0.36322420835494995, - "logps/chosen": -344.6726379394531, - "logps/rejected": -300.2312316894531, - "loss": 0.5862, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5419459342956543, - "rewards/margins": 0.40768465399742126, - "rewards/rejected": -0.949630618095398, + "epoch": 0.18838304552590268, + "grad_norm": 23.698848724365234, + "learning_rate": 9.486436268123111e-07, + "logits/chosen": 0.5390625, + "logits/rejected": 0.6015625, + "logps/chosen": -344.0, + "logps/rejected": -314.0, + "loss": 0.6223, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0267333984375, + "rewards/margins": 0.390625, + "rewards/rejected": -0.41796875, "step": 180 }, { - "epoch": 0.09945040565297043, - "grad_norm": 322.0, - "learning_rate": 9.933890855123114e-06, - "logits/chosen": 0.16772204637527466, - "logits/rejected": 0.20316286385059357, - "logps/chosen": -375.6507263183594, - "logps/rejected": -358.7842102050781, - "loss": 0.6266, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.9694031476974487, - "rewards/margins": 0.3656802773475647, - "rewards/rejected": -1.3350833654403687, + "epoch": 0.1988487702773417, + "grad_norm": 20.521411895751953, + "learning_rate": 9.40730812499903e-07, + "logits/chosen": 0.6953125, + "logits/rejected": 0.60546875, + "logps/chosen": -322.0, + "logps/rejected": -282.0, + "loss": 0.6127, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.005859375, + "rewards/margins": 0.482421875, + "rewards/rejected": -0.48828125, "step": 190 }, { - "epoch": 0.10468463752944256, - "grad_norm": 324.0, - "learning_rate": 9.919116957910566e-06, - "logits/chosen": 0.14172935485839844, - "logits/rejected": 0.11142061650753021, - "logps/chosen": -349.0318298339844, - "logps/rejected": -289.46453857421875, - "loss": 0.5972, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6366950273513794, - "rewards/margins": 0.4041665494441986, - "rewards/rejected": -1.0408614873886108, + "epoch": 0.20931449502878074, + "grad_norm": 25.220121383666992, + "learning_rate": 9.322892910600958e-07, + "logits/chosen": 0.73046875, + "logits/rejected": 0.9765625, + "logps/chosen": -316.0, + "logps/rejected": -292.0, + "loss": 0.5889, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0478515625, + "rewards/margins": 0.52734375, + "rewards/rejected": -0.478515625, "step": 200 }, { - "epoch": 0.10991886940591468, - "grad_norm": 276.0, - "learning_rate": 9.902867688913869e-06, - "logits/chosen": 0.3469844162464142, - "logits/rejected": 0.3956855535507202, - "logps/chosen": -365.18756103515625, - "logps/rejected": -319.1282958984375, - "loss": 0.5693, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 
-0.8478742837905884, - "rewards/margins": 0.47847065329551697, - "rewards/rejected": -1.3263448476791382, + "epoch": 0.21978021978021978, + "grad_norm": 26.859111785888672, + "learning_rate": 9.233291890654476e-07, + "logits/chosen": 0.458984375, + "logits/rejected": 0.498046875, + "logps/chosen": -338.0, + "logps/rejected": -296.0, + "loss": 0.5686, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.04443359375, + "rewards/margins": 0.62890625, + "rewards/rejected": -0.671875, "step": 210 }, { - "epoch": 0.11515310128238682, - "grad_norm": 304.0, - "learning_rate": 9.885147921713621e-06, - "logits/chosen": 0.19320572912693024, - "logits/rejected": 0.27649644017219543, - "logps/chosen": -338.0028076171875, - "logps/rejected": -322.33349609375, - "loss": 0.5867, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8439861536026001, - "rewards/margins": 0.49753037095069885, - "rewards/rejected": -1.3415164947509766, + "epoch": 0.2302459445316588, + "grad_norm": 19.048847198486328, + "learning_rate": 9.138612551853332e-07, + "logits/chosen": 0.427734375, + "logits/rejected": 0.51953125, + "logps/chosen": -306.0, + "logps/rejected": -314.0, + "loss": 0.6049, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1962890625, + "rewards/margins": 0.458984375, + "rewards/rejected": -0.65625, "step": 220 }, { - "epoch": 0.12038733315885894, - "grad_norm": 256.0, - "learning_rate": 9.865962970931287e-06, - "logits/chosen": 0.397473007440567, - "logits/rejected": 0.41920894384384155, - "logps/chosen": -357.3067321777344, - "logps/rejected": -311.8251037597656, - "loss": 0.588, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.2804175317287445, - "rewards/margins": 0.4174894690513611, - "rewards/rejected": -0.6979071497917175, + "epoch": 0.24071166928309787, + "grad_norm": 29.931427001953125, + "learning_rate": 9.03896847291683e-07, + "logits/chosen": 0.498046875, + "logits/rejected": 0.490234375, + "logps/chosen": -368.0, + "logps/rejected": -320.0, + "loss": 0.5824, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06884765625, + "rewards/margins": 0.54296875, + "rewards/rejected": -0.61328125, "step": 230 }, { - "epoch": 0.12562156503533106, - "grad_norm": 280.0, - "learning_rate": 9.845318590635186e-06, - "logits/chosen": 0.4800747036933899, - "logits/rejected": 0.6408789753913879, - "logps/chosen": -353.8960876464844, - "logps/rejected": -311.86163330078125, - "loss": 0.5746, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.3724919259548187, - "rewards/margins": 0.5004433989524841, - "rewards/rejected": -0.8729352951049805, + "epoch": 0.25117739403453687, + "grad_norm": 24.653114318847656, + "learning_rate": 8.934479188339137e-07, + "logits/chosen": 0.515625, + "logits/rejected": 0.384765625, + "logps/chosen": -358.0, + "logps/rejected": -300.0, + "loss": 0.5824, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11669921875, + "rewards/margins": 0.53125, + "rewards/rejected": -0.6484375, "step": 240 }, { - "epoch": 0.13085579691180318, - "grad_norm": 294.0, - "learning_rate": 9.823220972614712e-06, - "logits/chosen": 0.3700530230998993, - "logits/rejected": 0.4417282044887543, - "logps/chosen": -369.8304443359375, - "logps/rejected": -302.2646789550781, - "loss": 0.573, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5568550825119019, - "rewards/margins": 0.5210874080657959, - "rewards/rejected": -1.0779423713684082, + "epoch": 0.2616431187859759, + "grad_norm": 27.315216064453125, + 
"learning_rate": 8.825270044993962e-07, + "logits/chosen": 0.447265625, + "logits/rejected": 0.71875, + "logps/chosen": -394.0, + "logps/rejected": -350.0, + "loss": 0.5854, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.056640625, + "rewards/margins": 0.76953125, + "rewards/rejected": -0.828125, "step": 250 }, { - "epoch": 0.1360900287882753, - "grad_norm": 312.0, - "learning_rate": 9.79967674452324e-06, - "logits/chosen": 0.3898628354072571, - "logits/rejected": 0.4610047936439514, - "logps/chosen": -347.03118896484375, - "logps/rejected": -336.3883056640625, - "loss": 0.593, - "rewards/accuracies": 0.6656249761581421, - "rewards/chosen": -0.400566428899765, - "rewards/margins": 0.50266432762146, - "rewards/rejected": -0.9032306671142578, + "epoch": 0.272108843537415, + "grad_norm": 26.475204467773438, + "learning_rate": 8.711472051766605e-07, + "logits/chosen": 0.390625, + "logits/rejected": 0.466796875, + "logps/chosen": -322.0, + "logps/rejected": -324.0, + "loss": 0.5773, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.162109375, + "rewards/margins": 0.59375, + "rewards/rejected": -0.7578125, "step": 260 }, { - "epoch": 0.14132426066474746, - "grad_norm": 294.0, - "learning_rate": 9.774692967890332e-06, - "logits/chosen": 0.17694059014320374, - "logits/rejected": 0.22063255310058594, - "logps/chosen": -356.22650146484375, - "logps/rejected": -323.39617919921875, - "loss": 0.5893, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.3973694443702698, - "rewards/margins": 0.4625066816806793, - "rewards/rejected": -0.8598760366439819, + "epoch": 0.282574568288854, + "grad_norm": 24.804954528808594, + "learning_rate": 8.593221722393789e-07, + "logits/chosen": 0.419921875, + "logits/rejected": 0.443359375, + "logps/chosen": -324.0, + "logps/rejected": -290.0, + "loss": 0.5661, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.16796875, + "rewards/margins": 0.63671875, + "rewards/rejected": -0.8046875, "step": 270 }, { - "epoch": 0.14655849254121958, - "grad_norm": 310.0, - "learning_rate": 9.74827713600379e-06, - "logits/chosen": 0.2775232195854187, - "logits/rejected": 0.38801032304763794, - "logps/chosen": -316.23944091796875, - "logps/rejected": -285.94964599609375, - "loss": 0.6144, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6175938844680786, - "rewards/margins": 0.40199989080429077, - "rewards/rejected": -1.0195937156677246, + "epoch": 0.29304029304029305, + "grad_norm": 26.523075103759766, + "learning_rate": 8.470660911699782e-07, + "logits/chosen": 0.498046875, + "logits/rejected": 0.640625, + "logps/chosen": -328.0, + "logps/rejected": -286.0, + "loss": 0.5697, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.34375, + "rewards/margins": 0.6171875, + "rewards/rejected": -0.96484375, "step": 280 }, { - "epoch": 0.1517927244176917, - "grad_norm": 272.0, - "learning_rate": 9.720437171662232e-06, - "logits/chosen": 0.39185574650764465, - "logits/rejected": 0.48235875368118286, - "logps/chosen": -336.41375732421875, - "logps/rejected": -312.65216064453125, - "loss": 0.5847, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.23143287003040314, - "rewards/margins": 0.4122442305088043, - "rewards/rejected": -0.6436771154403687, + "epoch": 0.3035060177917321, + "grad_norm": 18.05160140991211, + "learning_rate": 8.343936645425276e-07, + "logits/chosen": 0.60546875, + "logits/rejected": 0.75390625, + "logps/chosen": -344.0, + "logps/rejected": -312.0, + "loss": 0.5687, + 
"rewards/accuracies": 0.6875, + "rewards/chosen": -0.63671875, + "rewards/margins": 0.59375, + "rewards/rejected": -1.2265625, "step": 290 }, { - "epoch": 0.15702695629416383, - "grad_norm": 286.0, - "learning_rate": 9.691181424798825e-06, - "logits/chosen": 0.30432984232902527, - "logits/rejected": 0.2606360912322998, - "logps/chosen": -320.1929626464844, - "logps/rejected": -296.85626220703125, - "loss": 0.5877, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.16091197729110718, - "rewards/margins": 0.4196176528930664, - "rewards/rejected": -0.5805296897888184, + "epoch": 0.3139717425431711, + "grad_norm": 30.55604362487793, + "learning_rate": 8.213200943853158e-07, + "logits/chosen": 0.275390625, + "logits/rejected": 0.6015625, + "logps/chosen": -312.0, + "logps/rejected": -310.0, + "loss": 0.6047, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.453125, + "rewards/margins": 0.341796875, + "rewards/rejected": -0.796875, "step": 300 }, { - "epoch": 0.16226118817063595, - "grad_norm": 320.0, - "learning_rate": 9.660518669976936e-06, - "logits/chosen": 0.3142179250717163, - "logits/rejected": 0.42381685972213745, - "logps/chosen": -351.7626647949219, - "logps/rejected": -305.1934509277344, - "loss": 0.6083, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.3087378144264221, - "rewards/margins": 0.41920194029808044, - "rewards/rejected": -0.7279397249221802, + "epoch": 0.32443746729461015, + "grad_norm": 25.680500030517578, + "learning_rate": 8.07861063944276e-07, + "logits/chosen": 0.421875, + "logits/rejected": 0.423828125, + "logps/chosen": -394.0, + "logps/rejected": -288.0, + "loss": 0.5542, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.013427734375, + "rewards/margins": 0.859375, + "rewards/rejected": -0.84375, "step": 310 }, { - "epoch": 0.16749542004710807, - "grad_norm": 302.0, - "learning_rate": 9.628458103758403e-06, - "logits/chosen": 0.33300966024398804, - "logits/rejected": 0.37351295351982117, - "logps/chosen": -366.12384033203125, - "logps/rejected": -330.9198913574219, - "loss": 0.5437, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.4144722819328308, - "rewards/margins": 0.5553442239761353, - "rewards/rejected": -0.9698165655136108, + "epoch": 0.3349031920460492, + "grad_norm": 27.477874755859375, + "learning_rate": 7.940327188691341e-07, + "logits/chosen": 0.283203125, + "logits/rejected": 0.326171875, + "logps/chosen": -332.0, + "logps/rejected": -294.0, + "loss": 0.5681, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2177734375, + "rewards/margins": 0.59765625, + "rewards/rejected": -0.81640625, "step": 320 }, { - "epoch": 0.17272965192358022, - "grad_norm": 314.0, - "learning_rate": 9.595009341945246e-06, - "logits/chosen": 0.22988705337047577, - "logits/rejected": 0.2729721665382385, - "logps/chosen": -334.3009338378906, - "logps/rejected": -321.2250671386719, - "loss": 0.6288, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.8995565176010132, - "rewards/margins": 0.4780782163143158, - "rewards/rejected": -1.3776347637176514, + "epoch": 0.3453689167974882, + "grad_norm": 23.42730712890625, + "learning_rate": 7.798516478448514e-07, + "logits/chosen": 0.2490234375, + "logits/rejected": 0.25390625, + "logps/chosen": -356.0, + "logps/rejected": -302.0, + "loss": 0.5724, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.05810546875, + "rewards/margins": 0.44921875, + "rewards/rejected": -0.5078125, "step": 330 }, { - "epoch": 
0.17796388380005235, - "grad_norm": 288.0, - "learning_rate": 9.560182416695639e-06, - "logits/chosen": 0.2716033458709717, - "logits/rejected": 0.2932526171207428, - "logps/chosen": -331.0648193359375, - "logps/rejected": -329.04937744140625, - "loss": 0.5775, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6781526803970337, - "rewards/margins": 0.5396233201026917, - "rewards/rejected": -1.2177760601043701, + "epoch": 0.35583464154892724, + "grad_norm": 24.55620574951172, + "learning_rate": 7.653348626915957e-07, + "logits/chosen": 0.302734375, + "logits/rejected": 0.3125, + "logps/chosen": -320.0, + "logps/rejected": -298.0, + "loss": 0.5743, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1396484375, + "rewards/margins": 0.51953125, + "rewards/rejected": -0.66015625, "step": 340 }, { - "epoch": 0.18319811567652447, - "grad_norm": 330.0, - "learning_rate": 9.523987773514999e-06, - "logits/chosen": 0.22474929690361023, - "logits/rejected": 0.27910318970680237, - "logps/chosen": -335.4002990722656, - "logps/rejected": -297.45635986328125, - "loss": 0.6094, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.4196121096611023, - "rewards/margins": 0.37615495920181274, - "rewards/rejected": -0.7957671284675598, + "epoch": 0.3663003663003663, + "grad_norm": 23.369258880615234, + "learning_rate": 7.504997779571132e-07, + "logits/chosen": 0.53515625, + "logits/rejected": 0.5546875, + "logps/chosen": -350.0, + "logps/rejected": -302.0, + "loss": 0.5759, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.291015625, + "rewards/margins": 0.443359375, + "rewards/rejected": -0.734375, "step": 350 }, { - "epoch": 0.1884323475529966, - "grad_norm": 314.0, - "learning_rate": 9.486436268123112e-06, - "logits/chosen": 0.1711244434118271, - "logits/rejected": 0.24781985580921173, - "logps/chosen": -365.4104919433594, - "logps/rejected": -339.17510986328125, - "loss": 0.5956, + "epoch": 0.37676609105180536, + "grad_norm": 26.217023849487305, + "learning_rate": 7.353641900259823e-07, + "logits/chosen": 0.3203125, + "logits/rejected": 0.2138671875, + "logps/chosen": -368.0, + "logps/rejected": -302.0, + "loss": 0.5576, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.35211288928985596, - "rewards/margins": 0.45698657631874084, - "rewards/rejected": -0.8090993762016296, + "rewards/chosen": -0.345703125, + "rewards/margins": 0.63671875, + "rewards/rejected": -0.984375, "step": 360 }, { - "epoch": 0.19366657942946872, - "grad_norm": 322.0, - "learning_rate": 9.447539163198218e-06, - "logits/chosen": 0.3507956266403198, - "logits/rejected": 0.3582335114479065, - "logps/chosen": -343.34564208984375, - "logps/rejected": -306.3224182128906, - "loss": 0.5941, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.42065954208374023, - "rewards/margins": 0.4902091920375824, - "rewards/rejected": -0.9108688235282898, + "epoch": 0.3872318158032444, + "grad_norm": 26.125743865966797, + "learning_rate": 7.199462557708097e-07, + "logits/chosen": 0.265625, + "logits/rejected": 0.455078125, + "logps/chosen": -332.0, + "logps/rejected": -336.0, + "loss": 0.5876, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.23828125, + "rewards/margins": 0.6015625, + "rewards/rejected": -0.83984375, "step": 370 }, { - "epoch": 0.19890081130594087, - "grad_norm": 201.0, - "learning_rate": 9.407308124999031e-06, - "logits/chosen": 0.535057544708252, - "logits/rejected": 0.5568063855171204, - "logps/chosen": -361.60809326171875, - 
"logps/rejected": -340.9134826660156, - "loss": 0.6044, - "rewards/accuracies": 0.6656249761581421, - "rewards/chosen": -0.8800666928291321, - "rewards/margins": 0.5229350924491882, - "rewards/rejected": -1.4030016660690308, + "epoch": 0.3976975405546834, + "grad_norm": 25.499277114868164, + "learning_rate": 7.042644707709815e-07, + "logits/chosen": 0.287109375, + "logits/rejected": 0.423828125, + "logps/chosen": -312.0, + "logps/rejected": -251.0, + "loss": 0.5691, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.275390625, + "rewards/margins": 0.56640625, + "rewards/rejected": -0.83984375, "step": 380 }, { - "epoch": 0.204135043182413, - "grad_norm": 253.0, - "learning_rate": 9.365755219865733e-06, - "logits/chosen": 0.5194161534309387, - "logits/rejected": 0.6202957630157471, - "logps/chosen": -356.85552978515625, - "logps/rejected": -337.2948913574219, - "loss": 0.5604, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6672223806381226, - "rewards/margins": 0.6344886422157288, - "rewards/rejected": -1.3017112016677856, + "epoch": 0.40816326530612246, + "grad_norm": 24.560373306274414, + "learning_rate": 6.883376471250955e-07, + "logits/chosen": 0.388671875, + "logits/rejected": 0.263671875, + "logps/chosen": -326.0, + "logps/rejected": -296.0, + "loss": 0.59, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.244140625, + "rewards/margins": 0.50390625, + "rewards/rejected": -0.75, "step": 390 }, { - "epoch": 0.2093692750588851, - "grad_norm": 358.0, - "learning_rate": 9.322892910600959e-06, - "logits/chosen": 0.539501965045929, - "logits/rejected": 0.7477941513061523, - "logps/chosen": -328.63128662109375, - "logps/rejected": -295.7342529296875, - "loss": 0.6114, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.6417405009269714, - "rewards/margins": 0.4066389203071594, - "rewards/rejected": -1.0483794212341309, + "epoch": 0.4186289900575615, + "grad_norm": 27.33049774169922, + "learning_rate": 6.72184890883692e-07, + "logits/chosen": 0.349609375, + "logits/rejected": 0.4375, + "logps/chosen": -306.0, + "logps/rejected": -262.0, + "loss": 0.5551, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0361328125, + "rewards/margins": 0.546875, + "rewards/rejected": -0.58203125, "step": 400 }, { - "epoch": 0.21460350693535724, - "grad_norm": 454.0, - "learning_rate": 9.278734052731876e-06, - "logits/chosen": 0.4946824014186859, - "logits/rejected": 0.5248245596885681, - "logps/chosen": -346.6952819824219, - "logps/rejected": -322.2110595703125, - "loss": 0.5717, - "rewards/accuracies": 0.7406250238418579, - "rewards/chosen": -0.4744420647621155, - "rewards/margins": 0.5339844226837158, - "rewards/rejected": -1.0084264278411865, + "epoch": 0.4290947148090005, + "grad_norm": 24.623563766479492, + "learning_rate": 6.558255791293571e-07, + "logits/chosen": 0.453125, + "logits/rejected": 0.44921875, + "logps/chosen": -320.0, + "logps/rejected": -282.0, + "loss": 0.5582, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.07958984375, + "rewards/margins": 0.72265625, + "rewards/rejected": -0.8046875, "step": 410 }, { - "epoch": 0.21983773881182936, - "grad_norm": 344.0, - "learning_rate": 9.233291890654477e-06, - "logits/chosen": 0.1536872535943985, - "logits/rejected": 0.21127930283546448, - "logps/chosen": -349.3604736328125, - "logps/rejected": -306.022216796875, - "loss": 0.5426, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.4529080390930176, - "rewards/margins": 0.5982980728149414, - "rewards/rejected": 
-1.051206111907959, + "epoch": 0.43956043956043955, + "grad_norm": 31.462345123291016, + "learning_rate": 6.392793367316904e-07, + "logits/chosen": 0.3984375, + "logits/rejected": 0.3203125, + "logps/chosen": -360.0, + "logps/rejected": -316.0, + "loss": 0.549, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.0059814453125, + "rewards/margins": 0.70703125, + "rewards/rejected": -0.7109375, "step": 420 }, { - "epoch": 0.22507197068830148, - "grad_norm": 312.0, - "learning_rate": 9.186580053661238e-06, - "logits/chosen": 0.2585577666759491, - "logits/rejected": 0.21868768334388733, - "logps/chosen": -345.9136962890625, - "logps/rejected": -358.81427001953125, - "loss": 0.6164, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6662707328796387, - "rewards/margins": 0.5226248502731323, - "rewards/rejected": -1.1888954639434814, + "epoch": 0.4500261643118786, + "grad_norm": 29.43328285217285, + "learning_rate": 6.225660128050247e-07, + "logits/chosen": 0.373046875, + "logits/rejected": 0.515625, + "logps/chosen": -308.0, + "logps/rejected": -282.0, + "loss": 0.5875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.376953125, + "rewards/margins": 0.48828125, + "rewards/rejected": -0.8671875, "step": 430 }, { - "epoch": 0.23030620256477363, - "grad_norm": 260.0, - "learning_rate": 9.138612551853334e-06, - "logits/chosen": 0.14254237711429596, - "logits/rejected": 0.29151830077171326, - "logps/chosen": -357.33001708984375, - "logps/rejected": -309.49505615234375, - "loss": 0.5649, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.3096122741699219, - "rewards/margins": 0.5217168927192688, - "rewards/rejected": -0.8313292264938354, + "epoch": 0.4604918890633176, + "grad_norm": 25.328622817993164, + "learning_rate": 6.057056568971383e-07, + "logits/chosen": 0.4140625, + "logits/rejected": 0.46484375, + "logps/chosen": -408.0, + "logps/rejected": -318.0, + "loss": 0.5225, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.474609375, + "rewards/margins": 0.89453125, + "rewards/rejected": -1.3671875, "step": 440 }, { - "epoch": 0.23554043444124576, - "grad_norm": 268.0, - "learning_rate": 9.089403771938651e-06, - "logits/chosen": 0.2172239124774933, - "logits/rejected": 0.3413824439048767, - "logps/chosen": -343.6112365722656, - "logps/rejected": -313.96453857421875, - "loss": 0.5926, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.2865334153175354, - "rewards/margins": 0.4800845980644226, - "rewards/rejected": -0.766618013381958, + "epoch": 0.47095761381475665, + "grad_norm": 20.754179000854492, + "learning_rate": 5.887184949375242e-07, + "logits/chosen": 0.48828125, + "logits/rejected": 0.388671875, + "logps/chosen": -350.0, + "logps/rejected": -310.0, + "loss": 0.5658, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.50390625, + "rewards/margins": 0.7578125, + "rewards/rejected": -1.265625, "step": 450 }, { - "epoch": 0.24077466631771788, - "grad_norm": 270.0, - "learning_rate": 9.038968472916831e-06, - "logits/chosen": 0.20296330749988556, - "logits/rejected": 0.29848283529281616, - "logps/chosen": -370.986328125, - "logps/rejected": -362.0606384277344, - "loss": 0.5738, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.44281673431396484, - "rewards/margins": 0.5560091733932495, - "rewards/rejected": -0.9988259077072144, + "epoch": 0.48142333856619574, + "grad_norm": 25.111303329467773, + "learning_rate": 5.716249049740689e-07, + "logits/chosen": 0.41796875, + "logits/rejected": 0.578125, 
+ "logps/chosen": -368.0, + "logps/rejected": -336.0, + "loss": 0.5623, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.1806640625, + "rewards/margins": 0.79296875, + "rewards/rejected": -0.97265625, "step": 460 }, { - "epoch": 0.24600889819419, - "grad_norm": 284.0, - "learning_rate": 8.987321781652663e-06, - "logits/chosen": 0.3275991380214691, - "logits/rejected": 0.31361979246139526, - "logps/chosen": -329.7540588378906, - "logps/rejected": -297.33807373046875, - "loss": 0.56, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.5578715205192566, - "rewards/margins": 0.5787358283996582, - "rewards/rejected": -1.1366074085235596, + "epoch": 0.49188906331763477, + "grad_norm": 20.85157585144043, + "learning_rate": 5.544453927272492e-07, + "logits/chosen": 0.44140625, + "logits/rejected": 0.466796875, + "logps/chosen": -348.0, + "logps/rejected": -330.0, + "loss": 0.545, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.103515625, + "rewards/margins": 0.7578125, + "rewards/rejected": -0.86328125, "step": 470 }, { - "epoch": 0.2512431300706621, - "grad_norm": 348.0, - "learning_rate": 8.93447918833914e-06, - "logits/chosen": 0.24553251266479492, - "logits/rejected": 0.24857684969902039, - "logps/chosen": -366.5342102050781, - "logps/rejected": -313.3810119628906, - "loss": 0.5912, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.5592954754829407, - "rewards/margins": 0.5262395143508911, - "rewards/rejected": -1.0855350494384766, + "epoch": 0.5023547880690737, + "grad_norm": 27.79718017578125, + "learning_rate": 5.372005669911693e-07, + "logits/chosen": 0.50390625, + "logits/rejected": 0.5546875, + "logps/chosen": -346.0, + "logps/rejected": -280.0, + "loss": 0.6289, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.07666015625, + "rewards/margins": 0.61328125, + "rewards/rejected": -0.69140625, "step": 480 }, { - "epoch": 0.2564773619471343, - "grad_norm": 300.0, - "learning_rate": 8.880456541851544e-06, - "logits/chosen": 0.21284916996955872, - "logits/rejected": 0.29197466373443604, - "logps/chosen": -394.79217529296875, - "logps/rejected": -337.88397216796875, - "loss": 0.5492, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.5321744680404663, - "rewards/margins": 0.6479983925819397, - "rewards/rejected": -1.1801728010177612, + "epoch": 0.5128205128205128, + "grad_norm": 25.19495964050293, + "learning_rate": 5.199111149109497e-07, + "logits/chosen": 0.302734375, + "logits/rejected": 0.49609375, + "logps/chosen": -330.0, + "logps/rejected": -292.0, + "loss": 0.5888, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.03759765625, + "rewards/margins": 0.89453125, + "rewards/rejected": -0.93359375, "step": 490 }, { - "epoch": 0.26171159382360637, - "grad_norm": 370.0, - "learning_rate": 8.825270044993963e-06, - "logits/chosen": 0.30395790934562683, - "logits/rejected": 0.41689762473106384, - "logps/chosen": -316.583251953125, - "logps/rejected": -323.14434814453125, - "loss": 0.577, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.4370909631252289, - "rewards/margins": 0.5204497575759888, - "rewards/rejected": -0.9575408101081848, + "epoch": 0.5232862375719518, + "grad_norm": 23.607749938964844, + "learning_rate": 5.025977771661266e-07, + "logits/chosen": 0.494140625, + "logits/rejected": 0.50390625, + "logps/chosen": -306.0, + "logps/rejected": -312.0, + "loss": 0.592, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.330078125, + "rewards/margins": 
0.4296875, + "rewards/rejected": -0.7578125, "step": 500 }, { - "epoch": 0.2669458257000785, - "grad_norm": 350.0, - "learning_rate": 8.768936249639632e-06, - "logits/chosen": 0.1348932683467865, - "logits/rejected": 0.26566845178604126, - "logps/chosen": -331.4639587402344, - "logps/rejected": -320.91461181640625, - "loss": 0.597, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.4939492344856262, - "rewards/margins": 0.47716912627220154, - "rewards/rejected": -0.9711184501647949, + "epoch": 0.533751962323391, + "grad_norm": 28.496578216552734, + "learning_rate": 4.852813230898279e-07, + "logits/chosen": 0.462890625, + "logits/rejected": 0.498046875, + "logps/chosen": -374.0, + "logps/rejected": -310.0, + "loss": 0.599, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.050048828125, + "rewards/margins": 0.6953125, + "rewards/rejected": -0.64453125, "step": 510 }, { - "epoch": 0.2721800575765506, - "grad_norm": 312.0, - "learning_rate": 8.711472051766606e-06, - "logits/chosen": 0.19042043387889862, - "logits/rejected": 0.2794221341609955, - "logps/chosen": -354.65740966796875, - "logps/rejected": -331.80267333984375, - "loss": 0.552, - "rewards/accuracies": 0.6781250238418579, - "rewards/chosen": -0.4176334738731384, - "rewards/margins": 0.569202184677124, - "rewards/rejected": -0.986835777759552, + "epoch": 0.54421768707483, + "grad_norm": 23.15301513671875, + "learning_rate": 4.679825257535794e-07, + "logits/chosen": 0.439453125, + "logits/rejected": 0.43359375, + "logps/chosen": -366.0, + "logps/rejected": -334.0, + "loss": 0.5337, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0294189453125, + "rewards/margins": 0.71484375, + "rewards/rejected": -0.68359375, "step": 520 }, { - "epoch": 0.27741428945302277, - "grad_norm": 292.0, - "learning_rate": 8.652894686390205e-06, - "logits/chosen": 0.2197699099779129, - "logits/rejected": 0.2926613390445709, - "logps/chosen": -357.4103698730469, - "logps/rejected": -326.92315673828125, - "loss": 0.5695, + "epoch": 0.554683411826269, + "grad_norm": 22.724777221679688, + "learning_rate": 4.507221370476223e-07, + "logits/chosen": 0.4609375, + "logits/rejected": 0.59765625, + "logps/chosen": -336.0, + "logps/rejected": -328.0, + "loss": 0.5767, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5999321341514587, - "rewards/margins": 0.5926289558410645, - "rewards/rejected": -1.1925609111785889, + "rewards/chosen": 0.02880859375, + "rewards/margins": 0.671875, + "rewards/rejected": -0.640625, "step": 530 }, { - "epoch": 0.2826485213294949, - "grad_norm": 326.0, - "learning_rate": 8.593221722393789e-06, - "logits/chosen": 0.17706915736198425, - "logits/rejected": 0.2584270238876343, - "logps/chosen": -358.02740478515625, - "logps/rejected": -326.58367919921875, - "loss": 0.5489, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.8800150156021118, - "rewards/margins": 0.6341498494148254, - "rewards/rejected": -1.514164686203003, + "epoch": 0.565149136577708, + "grad_norm": 20.862764358520508, + "learning_rate": 4.3352086278664377e-07, + "logits/chosen": 0.4921875, + "logits/rejected": 0.515625, + "logps/chosen": -318.0, + "logps/rejected": -270.0, + "loss": 0.5311, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.051025390625, + "rewards/margins": 0.7109375, + "rewards/rejected": -0.66015625, "step": 540 }, { - "epoch": 0.287882753205967, - "grad_norm": 332.0, - "learning_rate": 8.53247105725939e-06, - "logits/chosen": 0.22675807774066925, - 
"logits/rejected": 0.23919430375099182, - "logps/chosen": -325.2323913574219, - "logps/rejected": -299.59039306640625, - "loss": 0.5546, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.658308207988739, - "rewards/margins": 0.6446129083633423, - "rewards/rejected": -1.302921175956726, + "epoch": 0.5756148613291471, + "grad_norm": 23.62275505065918, + "learning_rate": 4.1639933787077854e-07, + "logits/chosen": 0.46484375, + "logits/rejected": 0.609375, + "logps/chosen": -332.0, + "logps/rejected": -304.0, + "loss": 0.5508, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01165771484375, + "rewards/margins": 0.6640625, + "rewards/rejected": -0.6796875, "step": 550 }, { - "epoch": 0.29311698508243916, - "grad_norm": 330.0, - "learning_rate": 8.470660911699783e-06, - "logits/chosen": 0.09726688265800476, - "logits/rejected": 0.15922322869300842, - "logps/chosen": -337.0237121582031, - "logps/rejected": -292.3273010253906, - "loss": 0.5796, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.5876582860946655, - "rewards/margins": 0.5548220276832581, - "rewards/rejected": -1.1424801349639893, + "epoch": 0.5860805860805861, + "grad_norm": 26.077327728271484, + "learning_rate": 3.9937810153168016e-07, + "logits/chosen": 0.5078125, + "logits/rejected": 0.427734375, + "logps/chosen": -374.0, + "logps/rejected": -334.0, + "loss": 0.5848, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2197265625, + "rewards/margins": 0.609375, + "rewards/rejected": -0.828125, "step": 560 }, { - "epoch": 0.29835121695891126, - "grad_norm": 294.0, - "learning_rate": 8.407809824193624e-06, - "logits/chosen": 0.08568461239337921, - "logits/rejected": 0.2500315308570862, - "logps/chosen": -373.3963928222656, - "logps/rejected": -339.5152282714844, - "loss": 0.5893, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.7039493918418884, - "rewards/margins": 0.5431109666824341, - "rewards/rejected": -1.2470605373382568, + "epoch": 0.5965463108320251, + "grad_norm": 25.223384857177734, + "learning_rate": 3.8247757269335957e-07, + "logits/chosen": 0.37890625, + "logits/rejected": 0.474609375, + "logps/chosen": -334.0, + "logps/rejected": -302.0, + "loss": 0.5283, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1943359375, + "rewards/margins": 0.72265625, + "rewards/rejected": -0.91796875, "step": 570 }, { - "epoch": 0.3035854488353834, - "grad_norm": 274.0, - "learning_rate": 8.343936645425277e-06, - "logits/chosen": 0.3479989767074585, - "logits/rejected": 0.3873814642429352, - "logps/chosen": -325.4124450683594, - "logps/rejected": -307.15423583984375, - "loss": 0.5188, - "rewards/accuracies": 0.7406250238418579, - "rewards/chosen": -0.594230055809021, - "rewards/margins": 0.7160730957984924, - "rewards/rejected": -1.3103030920028687, + "epoch": 0.6070120355834642, + "grad_norm": 23.150033950805664, + "learning_rate": 3.657180254773445e-07, + "logits/chosen": 0.421875, + "logits/rejected": 0.546875, + "logps/chosen": -334.0, + "logps/rejected": -274.0, + "loss": 0.5584, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.359375, + "rewards/margins": 0.828125, + "rewards/rejected": -1.1875, "step": 580 }, { - "epoch": 0.30881968071185556, - "grad_norm": 286.0, - "learning_rate": 8.279060532630991e-06, - "logits/chosen": 0.3080836236476898, - "logits/rejected": 0.4134043753147125, - "logps/chosen": -358.85076904296875, - "logps/rejected": -332.5126953125, - "loss": 0.5926, - "rewards/accuracies": 0.6656249761581421, - 
"rewards/chosen": -0.7294015884399414, - "rewards/margins": 0.5793807506561279, - "rewards/rejected": -1.3087823390960693, + "epoch": 0.6174777603349032, + "grad_norm": 26.779611587524414, + "learning_rate": 3.4911956488154694e-07, + "logits/chosen": 0.48828125, + "logits/rejected": 0.625, + "logps/chosen": -360.0, + "logps/rejected": -310.0, + "loss": 0.5908, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4296875, + "rewards/margins": 0.73046875, + "rewards/rejected": -1.15625, "step": 590 }, { - "epoch": 0.31405391258832765, - "grad_norm": 332.0, - "learning_rate": 8.21320094385316e-06, - "logits/chosen": 0.3551139533519745, - "logits/rejected": 0.38634929060935974, - "logps/chosen": -369.88446044921875, - "logps/rejected": -338.72308349609375, - "loss": 0.6129, - "rewards/accuracies": 0.6656249761581421, - "rewards/chosen": -0.5936108231544495, - "rewards/margins": 0.5011196732521057, - "rewards/rejected": -1.0947306156158447, + "epoch": 0.6279434850863422, + "grad_norm": 30.289024353027344, + "learning_rate": 3.327021026620137e-07, + "logits/chosen": 0.53125, + "logits/rejected": 0.5234375, + "logps/chosen": -356.0, + "logps/rejected": -328.0, + "loss": 0.5481, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4921875, + "rewards/margins": 0.734375, + "rewards/rejected": -1.2265625, "step": 600 }, { - "epoch": 0.3192881444647998, - "grad_norm": 290.0, - "learning_rate": 8.146377632104328e-06, - "logits/chosen": 0.22317269444465637, - "logits/rejected": 0.4244447648525238, - "logps/chosen": -381.1096496582031, - "logps/rejected": -322.06689453125, - "loss": 0.5404, - "rewards/accuracies": 0.715624988079071, - "rewards/chosen": -0.44336166977882385, - "rewards/margins": 0.691791832447052, - "rewards/rejected": -1.1351535320281982, + "epoch": 0.6384092098377813, + "grad_norm": 26.568483352661133, + "learning_rate": 3.16485333446493e-07, + "logits/chosen": 0.396484375, + "logits/rejected": 0.81640625, + "logps/chosen": -328.0, + "logps/rejected": -368.0, + "loss": 0.5517, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.58984375, + "rewards/margins": 0.76953125, + "rewards/rejected": -1.359375, "step": 610 }, { - "epoch": 0.3245223763412719, - "grad_norm": 338.0, - "learning_rate": 8.078610639442761e-06, - "logits/chosen": 0.23834876716136932, - "logits/rejected": 0.2626754641532898, - "logps/chosen": -367.4986267089844, - "logps/rejected": -314.33697509765625, - "loss": 0.5753, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.5537876486778259, - "rewards/margins": 0.5256361365318298, - "rewards/rejected": -1.0794237852096558, + "epoch": 0.6488749345892203, + "grad_norm": 23.833829879760742, + "learning_rate": 3.004887111084704e-07, + "logits/chosen": 0.314453125, + "logits/rejected": 0.37890625, + "logps/chosen": -358.0, + "logps/rejected": -312.0, + "loss": 0.5447, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.373046875, + "rewards/margins": 0.8125, + "rewards/rejected": -1.1875, "step": 620 }, { - "epoch": 0.32975660821774405, - "grad_norm": 346.0, - "learning_rate": 8.009920290961302e-06, - "logits/chosen": 0.18554073572158813, - "logits/rejected": 0.0812699943780899, - "logps/chosen": -345.83111572265625, - "logps/rejected": -332.9136047363281, - "loss": 0.549, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.5027406215667725, - "rewards/margins": 0.6489895582199097, - "rewards/rejected": -1.1517301797866821, + "epoch": 0.6593406593406593, + "grad_norm": 24.47187042236328, + 
"learning_rate": 2.8473142543001816e-07, + "logits/chosen": 0.45703125, + "logits/rejected": 0.455078125, + "logps/chosen": -324.0, + "logps/rejected": -310.0, + "loss": 0.552, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.55859375, + "rewards/margins": 0.70703125, + "rewards/rejected": -1.265625, "step": 630 }, { - "epoch": 0.33499084009421615, - "grad_norm": 326.0, - "learning_rate": 7.94032718869134e-06, - "logits/chosen": 0.1383436620235443, - "logits/rejected": 0.1128048524260521, - "logps/chosen": -360.80792236328125, - "logps/rejected": -321.8601379394531, - "loss": 0.5403, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.42300111055374146, - "rewards/margins": 0.6382675170898438, - "rewards/rejected": -1.0612685680389404, + "epoch": 0.6698063840920984, + "grad_norm": 29.288145065307617, + "learning_rate": 2.6923237908145226e-07, + "logits/chosen": 0.458984375, + "logits/rejected": 0.494140625, + "logps/chosen": -344.0, + "logps/rejected": -324.0, + "loss": 0.5347, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.396484375, + "rewards/margins": 0.76171875, + "rewards/rejected": -1.15625, "step": 640 }, { - "epoch": 0.3402250719706883, - "grad_norm": 270.0, - "learning_rate": 7.869852205423738e-06, - "logits/chosen": 0.062131352722644806, - "logits/rejected": 0.09803047776222229, - "logps/chosen": -344.85394287109375, - "logps/rejected": -308.65142822265625, - "loss": 0.5739, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.47032850980758667, - "rewards/margins": 0.5552009344100952, - "rewards/rejected": -1.0255295038223267, + "epoch": 0.6802721088435374, + "grad_norm": 25.19725799560547, + "learning_rate": 2.540101649454119e-07, + "logits/chosen": 0.6171875, + "logits/rejected": 0.5859375, + "logps/chosen": -346.0, + "logps/rejected": -316.0, + "loss": 0.5428, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.302734375, + "rewards/margins": 0.8984375, + "rewards/rejected": -1.1953125, "step": 650 }, { - "epoch": 0.34545930384716045, - "grad_norm": 328.0, - "learning_rate": 7.798516478448514e-06, - "logits/chosen": 0.0817512795329094, - "logits/rejected": 0.07857748121023178, - "logps/chosen": -365.5155029296875, - "logps/rejected": -317.58673095703125, - "loss": 0.5679, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.4949292540550232, - "rewards/margins": 0.571694016456604, - "rewards/rejected": -1.066623330116272, + "epoch": 0.6907378335949764, + "grad_norm": 23.4490909576416, + "learning_rate": 2.3908304381256603e-07, + "logits/chosen": 0.439453125, + "logits/rejected": 0.69921875, + "logps/chosen": -326.0, + "logps/rejected": -298.0, + "loss": 0.5694, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.515625, + "rewards/margins": 0.625, + "rewards/rejected": -1.140625, "step": 660 }, { - "epoch": 0.35069353572363254, - "grad_norm": 272.0, - "learning_rate": 7.726341403215237e-06, - "logits/chosen": 0.16348454356193542, - "logits/rejected": 0.16545890271663666, - "logps/chosen": -343.07965087890625, - "logps/rejected": -295.33526611328125, - "loss": 0.5704, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.5133311152458191, - "rewards/margins": 0.6606391668319702, - "rewards/rejected": -1.173970341682434, + "epoch": 0.7012035583464155, + "grad_norm": 25.44082260131836, + "learning_rate": 2.2446892247570255e-07, + "logits/chosen": 0.458984375, + "logits/rejected": 0.55859375, + "logps/chosen": -334.0, + "logps/rejected": -314.0, + "loss": 
0.5794, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.216796875, + "rewards/margins": 0.859375, + "rewards/rejected": -1.078125, "step": 670 }, { - "epoch": 0.3559277676001047, - "grad_norm": 288.0, - "learning_rate": 7.653348626915957e-06, - "logits/chosen": 0.21217510104179382, - "logits/rejected": 0.2788509726524353, - "logps/chosen": -338.8388977050781, - "logps/rejected": -317.30279541015625, - "loss": 0.5509, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.48806896805763245, - "rewards/margins": 0.6682482957839966, - "rewards/rejected": -1.1563172340393066, + "epoch": 0.7116692830978545, + "grad_norm": 27.55499839782715, + "learning_rate": 2.1018533224847633e-07, + "logits/chosen": 0.51171875, + "logits/rejected": 0.5, + "logps/chosen": -328.0, + "logps/rejected": -286.0, + "loss": 0.5717, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.171875, + "rewards/margins": 0.7890625, + "rewards/rejected": -0.9609375, "step": 680 }, { - "epoch": 0.3611619994765768, - "grad_norm": 282.0, - "learning_rate": 7.5795600419926595e-06, - "logits/chosen": 0.3617590069770813, - "logits/rejected": 0.3512483537197113, - "logps/chosen": -350.26263427734375, - "logps/rejected": -308.1812744140625, - "loss": 0.5607, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.3443593382835388, - "rewards/margins": 0.5753520727157593, - "rewards/rejected": -0.9197114109992981, + "epoch": 0.7221350078492935, + "grad_norm": 24.051036834716797, + "learning_rate": 1.9624940793459055e-07, + "logits/chosen": 0.3125, + "logits/rejected": 0.486328125, + "logps/chosen": -358.0, + "logps/rejected": -318.0, + "loss": 0.5578, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.03173828125, + "rewards/margins": 0.69921875, + "rewards/rejected": -0.734375, "step": 690 }, { - "epoch": 0.36639623135304894, - "grad_norm": 258.0, - "learning_rate": 7.504997779571134e-06, - "logits/chosen": 0.35462698340415955, - "logits/rejected": 0.4073667526245117, - "logps/chosen": -342.01727294921875, - "logps/rejected": -312.5554504394531, - "loss": 0.5957, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.38807958364486694, - "rewards/margins": 0.4581042230129242, - "rewards/rejected": -0.8461838960647583, + "epoch": 0.7326007326007326, + "grad_norm": 28.70475196838379, + "learning_rate": 1.8267786727263424e-07, + "logits/chosen": 0.58203125, + "logits/rejected": 0.8046875, + "logps/chosen": -292.0, + "logps/rejected": -288.0, + "loss": 0.5475, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1865234375, + "rewards/margins": 0.484375, + "rewards/rejected": -0.671875, "step": 700 }, { - "epoch": 0.3716304632295211, - "grad_norm": 360.0, - "learning_rate": 7.429684202823284e-06, - "logits/chosen": 0.3464614748954773, - "logits/rejected": 0.26026487350463867, - "logps/chosen": -367.31829833984375, - "logps/rejected": -318.8564453125, - "loss": 0.529, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.44659894704818726, - "rewards/margins": 0.6956688165664673, - "rewards/rejected": -1.1422678232192993, + "epoch": 0.7430664573521716, + "grad_norm": 21.762775421142578, + "learning_rate": 1.694869908812399e-07, + "logits/chosen": 0.458984375, + "logits/rejected": 0.419921875, + "logps/chosen": -320.0, + "logps/rejected": -284.0, + "loss": 0.5696, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2578125, + "rewards/margins": 0.53515625, + "rewards/rejected": -0.7890625, "step": 710 }, { - "epoch": 
0.3768646951059932, - "grad_norm": 320.0, - "learning_rate": 7.353641900259823e-06, - "logits/chosen": 0.33798351883888245, - "logits/rejected": 0.30416375398635864, - "logps/chosen": -348.59454345703125, - "logps/rejected": -317.1212158203125, - "loss": 0.5682, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.6439448595046997, - "rewards/margins": 0.6287893056869507, - "rewards/rejected": -1.2727340459823608, + "epoch": 0.7535321821036107, + "grad_norm": 24.395294189453125, + "learning_rate": 1.5669260272861422e-07, + "logits/chosen": 0.4140625, + "logits/rejected": 0.33984375, + "logps/chosen": -302.0, + "logps/rejected": -290.0, + "loss": 0.5357, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.018310546875, + "rewards/margins": 0.75, + "rewards/rejected": -0.73046875, "step": 720 }, { - "epoch": 0.38209892698246534, - "grad_norm": 344.0, - "learning_rate": 7.276893678955387e-06, - "logits/chosen": 0.15998776257038116, - "logits/rejected": 0.35106360912323, - "logps/chosen": -367.888671875, - "logps/rejected": -336.6737060546875, - "loss": 0.5989, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.7499735951423645, - "rewards/margins": 0.6262648701667786, - "rewards/rejected": -1.3762385845184326, + "epoch": 0.7639979068550498, + "grad_norm": 25.604717254638672, + "learning_rate": 1.4431005114987483e-07, + "logits/chosen": 0.56640625, + "logits/rejected": 0.466796875, + "logps/chosen": -360.0, + "logps/rejected": -312.0, + "loss": 0.5581, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.09716796875, + "rewards/margins": 0.6796875, + "rewards/rejected": -0.77734375, "step": 730 }, { - "epoch": 0.38733315885893743, - "grad_norm": 264.0, - "learning_rate": 7.199462557708098e-06, - "logits/chosen": 0.1745099276304245, - "logits/rejected": 0.2742732763290405, - "logps/chosen": -310.071533203125, - "logps/rejected": -299.4850769042969, - "loss": 0.5774, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6706897020339966, - "rewards/margins": 0.5428994297981262, - "rewards/rejected": -1.2135891914367676, + "epoch": 0.7744636316064888, + "grad_norm": 27.11602783203125, + "learning_rate": 1.323541904349636e-07, + "logits/chosen": 0.5234375, + "logits/rejected": 0.58984375, + "logps/chosen": -362.0, + "logps/rejected": -288.0, + "loss": 0.5973, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1513671875, + "rewards/margins": 0.74609375, + "rewards/rejected": -0.8984375, "step": 740 }, { - "epoch": 0.3925673907354096, - "grad_norm": 314.0, - "learning_rate": 7.1213717601356245e-06, - "logits/chosen": 0.1733260601758957, - "logits/rejected": 0.1483219712972641, - "logps/chosen": -360.5184020996094, - "logps/rejected": -322.8306579589844, - "loss": 0.5344, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.47234034538269043, - "rewards/margins": 0.6140186190605164, - "rewards/rejected": -1.086358904838562, + "epoch": 0.7849293563579278, + "grad_norm": 25.82685661315918, + "learning_rate": 1.2083936300922237e-07, + "logits/chosen": 0.60546875, + "logits/rejected": 0.6015625, + "logps/chosen": -356.0, + "logps/rejected": -344.0, + "loss": 0.5742, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.11083984375, + "rewards/margins": 0.640625, + "rewards/rejected": -0.75, "step": 750 }, { - "epoch": 0.39780162261188173, - "grad_norm": 340.0, - "learning_rate": 7.042644707709816e-06, - "logits/chosen": 0.17112216353416443, - "logits/rejected": 0.19962458312511444, - "logps/chosen": 
-351.1370544433594, - "logps/rejected": -333.3719177246094, - "loss": 0.5823, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.4944635331630707, - "rewards/margins": 0.5449696779251099, - "rewards/rejected": -1.039433240890503, + "epoch": 0.7953950811093669, + "grad_norm": 24.40570068359375, + "learning_rate": 1.0977938222801004e-07, + "logits/chosen": 0.578125, + "logits/rejected": 0.40234375, + "logps/chosen": -334.0, + "logps/rejected": -300.0, + "loss": 0.5828, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19140625, + "rewards/margins": 0.734375, + "rewards/rejected": -0.92578125, "step": 760 }, { - "epoch": 0.40303585448835383, - "grad_norm": 370.0, - "learning_rate": 6.963305012731984e-06, - "logits/chosen": 0.198240727186203, - "logits/rejected": 0.220525860786438, - "logps/chosen": -305.16143798828125, - "logps/rejected": -299.1192932128906, - "loss": 0.6028, - "rewards/accuracies": 0.6781250238418579, - "rewards/chosen": -0.5226008296012878, - "rewards/margins": 0.5330361127853394, - "rewards/rejected": -1.055637001991272, + "epoch": 0.8058608058608059, + "grad_norm": 22.45295524597168, + "learning_rate": 9.918751580599999e-08, + "logits/chosen": 0.54296875, + "logits/rejected": 0.53515625, + "logps/chosen": -364.0, + "logps/rejected": -308.0, + "loss": 0.5766, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.09814453125, + "rewards/margins": 0.69921875, + "rewards/rejected": -0.796875, "step": 770 }, { - "epoch": 0.408270086364826, - "grad_norm": 328.0, - "learning_rate": 6.8833764712509554e-06, - "logits/chosen": 0.19480012357234955, - "logits/rejected": 0.2182588279247284, - "logps/chosen": -317.003662109375, - "logps/rejected": -302.61334228515625, - "loss": 0.5657, - "rewards/accuracies": 0.715624988079071, - "rewards/chosen": -0.3436005711555481, - "rewards/margins": 0.5159841775894165, - "rewards/rejected": -0.8595848083496094, + "epoch": 0.8163265306122449, + "grad_norm": 23.214521408081055, + "learning_rate": 8.907646990103495e-08, + "logits/chosen": 0.59375, + "logits/rejected": 0.5625, + "logps/chosen": -306.0, + "logps/rejected": -278.0, + "loss": 0.5229, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.212890625, + "rewards/margins": 0.69140625, + "rewards/rejected": -0.90234375, "step": 780 }, { - "epoch": 0.4135043182412981, - "grad_norm": 364.0, - "learning_rate": 6.802883055926026e-06, - "logits/chosen": 0.15303662419319153, - "logits/rejected": 0.21523818373680115, - "logps/chosen": -333.7895812988281, - "logps/rejected": -296.78851318359375, - "loss": 0.5489, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.3119350075721741, - "rewards/margins": 0.6673828959465027, - "rewards/rejected": -0.979317843914032, + "epoch": 0.826792255363684, + "grad_norm": 21.733230590820312, + "learning_rate": 7.945837387163424e-08, + "logits/chosen": 0.5234375, + "logits/rejected": 0.53125, + "logps/chosen": -356.0, + "logps/rejected": -350.0, + "loss": 0.5792, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.216796875, + "rewards/margins": 0.80078125, + "rewards/rejected": -1.015625, "step": 790 }, { - "epoch": 0.4187385501177702, - "grad_norm": 366.0, - "learning_rate": 6.721848908836921e-06, - "logits/chosen": 0.11557696759700775, - "logits/rejected": 0.14221954345703125, - "logps/chosen": -379.16595458984375, - "logps/rejected": -320.8966979980469, - "loss": 0.5204, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.3791603744029999, - "rewards/margins": 
0.6833642721176147, - "rewards/rejected": -1.0625245571136475, + "epoch": 0.837257980115123, + "grad_norm": 28.44561004638672, + "learning_rate": 7.034476572643854e-08, + "logits/chosen": 0.609375, + "logits/rejected": 0.625, + "logps/chosen": -324.0, + "logps/rejected": -318.0, + "loss": 0.5711, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.220703125, + "rewards/margins": 0.625, + "rewards/rejected": -0.84765625, "step": 800 }, { - "epoch": 0.4239727819942423, - "grad_norm": 264.0, - "learning_rate": 6.640298334242959e-06, - "logits/chosen": 0.08530505001544952, - "logits/rejected": 0.15681490302085876, - "logps/chosen": -323.5985107421875, - "logps/rejected": -319.4253234863281, - "loss": 0.526, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.5749447345733643, - "rewards/margins": 0.6821728944778442, - "rewards/rejected": -1.2571176290512085, + "epoch": 0.847723704866562, + "grad_norm": 24.274005889892578, + "learning_rate": 6.174657828304541e-08, + "logits/chosen": 0.46875, + "logits/rejected": 0.55078125, + "logps/chosen": -324.0, + "logps/rejected": -310.0, + "loss": 0.5782, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1640625, + "rewards/margins": 0.74609375, + "rewards/rejected": -0.91015625, "step": 810 }, { - "epoch": 0.42920701387071447, - "grad_norm": 350.0, - "learning_rate": 6.558255791293572e-06, - "logits/chosen": 0.0707249790430069, - "logits/rejected": 0.15043438971042633, - "logps/chosen": -357.5900573730469, - "logps/rejected": -328.65118408203125, - "loss": 0.5905, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.674008846282959, - "rewards/margins": 0.6449601054191589, - "rewards/rejected": -1.3189690113067627, + "epoch": 0.858189429618001, + "grad_norm": 18.96664810180664, + "learning_rate": 5.36741260528415e-08, + "logits/chosen": 0.423828125, + "logits/rejected": 0.310546875, + "logps/chosen": -364.0, + "logps/rejected": -294.0, + "loss": 0.529, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.058349609375, + "rewards/margins": 0.97265625, + "rewards/rejected": -1.03125, "step": 820 }, { - "epoch": 0.4344412457471866, - "grad_norm": 282.0, - "learning_rate": 6.475745886692361e-06, - "logits/chosen": 0.1705600768327713, - "logits/rejected": 0.156109020113945, - "logps/chosen": -352.35650634765625, - "logps/rejected": -340.0906677246094, - "loss": 0.5481, - "rewards/accuracies": 0.7093750238418579, - "rewards/chosen": -0.580664873123169, - "rewards/margins": 0.7169455289840698, - "rewards/rejected": -1.2976105213165283, + "epoch": 0.8686551543694401, + "grad_norm": 21.882705688476562, + "learning_rate": 4.613709286756412e-08, + "logits/chosen": 0.4921875, + "logits/rejected": 0.443359375, + "logps/chosen": -326.0, + "logps/rejected": -284.0, + "loss": 0.5432, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.07568359375, + "rewards/margins": 0.85546875, + "rewards/rejected": -0.9296875, "step": 830 }, { - "epoch": 0.4396754776236587, - "grad_norm": 368.0, - "learning_rate": 6.392793367316905e-06, - "logits/chosen": 0.047196634113788605, - "logits/rejected": 0.10491514205932617, - "logps/chosen": -344.0426940917969, - "logps/rejected": -328.0356140136719, - "loss": 0.5316, - "rewards/accuracies": 0.7406250238418579, - "rewards/chosen": -0.6165002584457397, - "rewards/margins": 0.6772734522819519, - "rewards/rejected": -1.2937736511230469, + "epoch": 0.8791208791208791, + "grad_norm": 16.712879180908203, + "learning_rate": 3.914452026243509e-08, + 
"logits/chosen": 0.482421875, + "logits/rejected": 0.4765625, + "logps/chosen": -356.0, + "logps/rejected": -320.0, + "loss": 0.5462, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1796875, + "rewards/margins": 0.58984375, + "rewards/rejected": -0.76953125, "step": 840 }, { - "epoch": 0.44490970950013087, - "grad_norm": 390.0, - "learning_rate": 6.309423112796529e-06, - "logits/chosen": 0.08787860721349716, - "logits/rejected": 0.29786446690559387, - "logps/chosen": -330.8527526855469, - "logps/rejected": -336.91888427734375, - "loss": 0.5739, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8546838760375977, - "rewards/margins": 0.6589832305908203, - "rewards/rejected": -1.513667106628418, + "epoch": 0.8895866038723181, + "grad_norm": 23.453189849853516, + "learning_rate": 3.270479662980247e-08, + "logits/chosen": 0.57421875, + "logits/rejected": 0.361328125, + "logps/chosen": -328.0, + "logps/rejected": -286.0, + "loss": 0.5781, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0693359375, + "rewards/margins": 0.703125, + "rewards/rejected": -0.7734375, "step": 850 }, { - "epoch": 0.45014394137660296, - "grad_norm": 400.0, - "learning_rate": 6.225660128050248e-06, - "logits/chosen": 0.1513369381427765, - "logits/rejected": 0.18070130050182343, - "logps/chosen": -346.57659912109375, - "logps/rejected": -328.84130859375, - "loss": 0.5638, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.0357505083084106, - "rewards/margins": 0.6643354892730713, - "rewards/rejected": -1.700085997581482, + "epoch": 0.9000523286237572, + "grad_norm": 25.8822021484375, + "learning_rate": 2.6825647156302865e-08, + "logits/chosen": 0.60546875, + "logits/rejected": 0.66015625, + "logps/chosen": -356.0, + "logps/rejected": -360.0, + "loss": 0.5378, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2099609375, + "rewards/margins": 0.70703125, + "rewards/rejected": -0.9140625, "step": 860 }, { - "epoch": 0.4553781732530751, - "grad_norm": 354.0, - "learning_rate": 6.141529535787139e-06, - "logits/chosen": 0.23875145614147186, - "logits/rejected": 0.28748852014541626, - "logps/chosen": -382.6456604003906, - "logps/rejected": -346.1590270996094, - "loss": 0.5211, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7971159219741821, - "rewards/margins": 0.7644892930984497, - "rewards/rejected": -1.5616052150726318, + "epoch": 0.9105180533751962, + "grad_norm": 29.531282424926758, + "learning_rate": 2.151412455561441e-08, + "logits/chosen": 0.5234375, + "logits/rejected": 0.61328125, + "logps/chosen": -342.0, + "logps/rejected": -298.0, + "loss": 0.5749, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.224609375, + "rewards/margins": 0.75390625, + "rewards/rejected": -0.9765625, "step": 870 }, { - "epoch": 0.46061240512954726, - "grad_norm": 318.0, - "learning_rate": 6.057056568971383e-06, - "logits/chosen": 0.1365332305431366, - "logits/rejected": 0.18043069541454315, - "logps/chosen": -365.5246887207031, - "logps/rejected": -337.0705261230469, - "loss": 0.5237, - "rewards/accuracies": 0.715624988079071, - "rewards/chosen": -0.8823005557060242, - "rewards/margins": 0.8230516314506531, - "rewards/rejected": -1.7053521871566772, + "epoch": 0.9209837781266352, + "grad_norm": 27.597761154174805, + "learning_rate": 1.6776600607918356e-08, + "logits/chosen": 0.5078125, + "logits/rejected": 0.41015625, + "logps/chosen": -328.0, + "logps/rejected": -278.0, + "loss": 0.5578, + "rewards/accuracies": 0.675000011920929, + 
"rewards/chosen": -0.2333984375, + "rewards/margins": 0.71875, + "rewards/rejected": -0.953125, "step": 880 }, { - "epoch": 0.46584663700601936, - "grad_norm": 294.0, - "learning_rate": 5.972266563254246e-06, - "logits/chosen": 0.33021894097328186, - "logits/rejected": 0.26815542578697205, - "logps/chosen": -393.06182861328125, - "logps/rejected": -348.9358825683594, - "loss": 0.5696, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9404589533805847, - "rewards/margins": 0.702355682849884, - "rewards/rejected": -1.6428148746490479, + "epoch": 0.9314495028780743, + "grad_norm": 22.400959014892578, + "learning_rate": 1.2618758516218186e-08, + "logits/chosen": 0.384765625, + "logits/rejected": 0.443359375, + "logps/chosen": -348.0, + "logps/rejected": -312.0, + "loss": 0.5566, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09814453125, + "rewards/margins": 0.7734375, + "rewards/rejected": -0.87109375, "step": 890 }, { - "epoch": 0.4710808688824915, - "grad_norm": 292.0, - "learning_rate": 5.887184949375242e-06, - "logits/chosen": 0.24816791713237762, - "logits/rejected": 0.3371456563472748, - "logps/chosen": -343.94952392578125, - "logps/rejected": -303.48870849609375, - "loss": 0.5451, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5168992877006531, - "rewards/margins": 0.7089965343475342, - "rewards/rejected": -1.225895881652832, + "epoch": 0.9419152276295133, + "grad_norm": 23.831453323364258, + "learning_rate": 9.045586088686496e-09, + "logits/chosen": 0.443359375, + "logits/rejected": 0.427734375, + "logps/chosen": -372.0, + "logps/rejected": -308.0, + "loss": 0.5636, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.0031890869140625, + "rewards/margins": 0.65625, + "rewards/rejected": -0.65234375, "step": 900 }, { - "epoch": 0.4763151007589636, - "grad_norm": 256.0, - "learning_rate": 5.8018372455348e-06, - "logits/chosen": 0.32140612602233887, - "logits/rejected": 0.3305002748966217, - "logps/chosen": -359.08538818359375, - "logps/rejected": -316.7978515625, - "loss": 0.5553, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.44215431809425354, - "rewards/margins": 0.6605509519577026, - "rewards/rejected": -1.1027053594589233, + "epoch": 0.9523809523809523, + "grad_norm": 24.923038482666016, + "learning_rate": 6.06136975521715e-09, + "logits/chosen": 0.3671875, + "logits/rejected": 0.40625, + "logps/chosen": -304.0, + "logps/rejected": -274.0, + "loss": 0.5489, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1318359375, + "rewards/margins": 0.81640625, + "rewards/rejected": -0.94921875, "step": 910 }, { - "epoch": 0.48154933263543576, - "grad_norm": 268.0, - "learning_rate": 5.71624904974069e-06, - "logits/chosen": 0.24679967761039734, - "logits/rejected": 0.3497712016105652, - "logps/chosen": -361.283935546875, - "logps/rejected": -338.38153076171875, - "loss": 0.5496, - "rewards/accuracies": 0.7093750238418579, - "rewards/chosen": -0.4841860234737396, - "rewards/margins": 0.6799232959747314, - "rewards/rejected": -1.1641093492507935, + "epoch": 0.9628466771323915, + "grad_norm": 24.579931259155273, + "learning_rate": 3.6696894253614442e-09, + "logits/chosen": 0.375, + "logits/rejected": 0.3671875, + "logps/chosen": -344.0, + "logps/rejected": -312.0, + "loss": 0.572, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.0146484375, + "rewards/margins": 0.82421875, + "rewards/rejected": -0.83984375, "step": 920 }, { - "epoch": 0.48678356451190785, - "grad_norm": 
250.0, - "learning_rate": 5.630446032130498e-06, - "logits/chosen": 0.2412446290254593, - "logits/rejected": 0.34884771704673767, - "logps/chosen": -347.1498107910156, - "logps/rejected": -331.9222412109375, - "loss": 0.5455, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.5564799904823303, - "rewards/margins": 0.6565181016921997, - "rewards/rejected": -1.2129981517791748, + "epoch": 0.9733124018838305, + "grad_norm": 23.33355712890625, + "learning_rate": 1.8734141938160918e-09, + "logits/chosen": 0.640625, + "logits/rejected": 0.64453125, + "logps/chosen": -350.0, + "logps/rejected": -324.0, + "loss": 0.5611, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0311279296875, + "rewards/margins": 0.796875, + "rewards/rejected": -0.828125, "step": 930 }, { - "epoch": 0.49201779638838, - "grad_norm": 352.0, - "learning_rate": 5.5444539272724925e-06, - "logits/chosen": 0.22467438876628876, - "logits/rejected": 0.42467164993286133, - "logps/chosen": -348.5015869140625, - "logps/rejected": -336.44342041015625, - "loss": 0.534, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.635161280632019, - "rewards/margins": 0.7458114624023438, - "rewards/rejected": -1.3809726238250732, + "epoch": 0.9837781266352695, + "grad_norm": 24.21430778503418, + "learning_rate": 6.746988986155999e-10, + "logits/chosen": 0.50390625, + "logits/rejected": 0.443359375, + "logps/chosen": -358.0, + "logps/rejected": -320.0, + "loss": 0.5494, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.07275390625, + "rewards/margins": 0.77734375, + "rewards/rejected": -0.8515625, "step": 940 }, { - "epoch": 0.49725202826485215, - "grad_norm": 370.0, - "learning_rate": 5.458298526447155e-06, - "logits/chosen": 0.277851402759552, - "logits/rejected": 0.343191534280777, - "logps/chosen": -348.0686950683594, - "logps/rejected": -311.85992431640625, - "loss": 0.6059, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.6322701573371887, - "rewards/margins": 0.5943492650985718, - "rewards/rejected": -1.2266194820404053, - "step": 950 - }, - { - "epoch": 0.5024862601413242, - "grad_norm": 376.0, - "learning_rate": 5.372005669911694e-06, - "logits/chosen": 0.18535420298576355, - "logits/rejected": 0.29728174209594727, - "logps/chosen": -323.85418701171875, - "logps/rejected": -302.52703857421875, - "loss": 0.6372, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.5875571966171265, - "rewards/margins": 0.4703772962093353, - "rewards/rejected": -1.0579345226287842, - "step": 960 - }, - { - "epoch": 0.5077204920177963, - "grad_norm": 318.0, - "learning_rate": 5.285601239149875e-06, - "logits/chosen": 0.2485879361629486, - "logits/rejected": 0.1934640109539032, - "logps/chosen": -363.02557373046875, - "logps/rejected": -332.2878112792969, - "loss": 0.5864, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.4688809812068939, - "rewards/margins": 0.6029322743415833, - "rewards/rejected": -1.0718133449554443, - "step": 970 - }, - { - "epoch": 0.5129547238942685, - "grad_norm": 352.0, - "learning_rate": 5.199111149109498e-06, - "logits/chosen": 0.14167314767837524, - "logits/rejected": 0.3146267533302307, - "logps/chosen": -308.8305969238281, - "logps/rejected": -299.807373046875, - "loss": 0.5698, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.5121305584907532, - "rewards/margins": 0.6366390585899353, - "rewards/rejected": -1.1487696170806885, - "step": 980 - }, - { - "epoch": 0.5181889557707406, - "grad_norm": 324.0, - "learning_rate": 5.112561340429817e-06, - 
"logits/chosen": 0.30979007482528687, - "logits/rejected": 0.24150173366069794, - "logps/chosen": -337.63214111328125, - "logps/rejected": -298.39111328125, - "loss": 0.5672, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5298670530319214, - "rewards/margins": 0.603586733341217, - "rewards/rejected": -1.1334538459777832, - "step": 990 - }, - { - "epoch": 0.5234231876472127, - "grad_norm": 320.0, - "learning_rate": 5.0259777716612665e-06, - "logits/chosen": 0.23533792793750763, - "logits/rejected": 0.3287450671195984, - "logps/chosen": -365.8298645019531, - "logps/rejected": -339.43133544921875, - "loss": 0.596, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.4760667681694031, - "rewards/margins": 0.5520576238632202, - "rewards/rejected": -1.028124451637268, - "step": 1000 - }, - { - "epoch": 0.528657419523685, - "grad_norm": 372.0, - "learning_rate": 4.939386411479814e-06, - "logits/chosen": 0.29721060395240784, - "logits/rejected": 0.3717629909515381, - "logps/chosen": -360.3513488769531, - "logps/rejected": -350.74090576171875, - "loss": 0.5676, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.3760520815849304, - "rewards/margins": 0.6218014359474182, - "rewards/rejected": -0.9978535771369934, - "step": 1010 - }, - { - "epoch": 0.533891651400157, - "grad_norm": 344.0, - "learning_rate": 4.85281323089828e-06, - "logits/chosen": 0.34880155324935913, - "logits/rejected": 0.32166919112205505, - "logps/chosen": -375.96368408203125, - "logps/rejected": -341.3938293457031, - "loss": 0.6075, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.4991912841796875, - "rewards/margins": 0.5173603296279907, - "rewards/rejected": -1.0165516138076782, - "step": 1020 - }, - { - "epoch": 0.5391258832766291, - "grad_norm": 296.0, - "learning_rate": 4.766284195476943e-06, - "logits/chosen": 0.3899001479148865, - "logits/rejected": 0.400698721408844, - "logps/chosen": -355.5746154785156, - "logps/rejected": -325.11474609375, - "loss": 0.5121, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.29699790477752686, - "rewards/margins": 0.7675551772117615, - "rewards/rejected": -1.0645530223846436, - "step": 1030 - }, - { - "epoch": 0.5443601151531012, - "grad_norm": 304.0, - "learning_rate": 4.679825257535795e-06, - "logits/chosen": 0.34135550260543823, - "logits/rejected": 0.3182600736618042, - "logps/chosen": -349.2203063964844, - "logps/rejected": -308.77166748046875, - "loss": 0.5413, - "rewards/accuracies": 0.715624988079071, - "rewards/chosen": -0.34611597657203674, - "rewards/margins": 0.6459987759590149, - "rewards/rejected": -0.9921148419380188, - "step": 1040 - }, - { - "epoch": 0.5495943470295734, - "grad_norm": 370.0, - "learning_rate": 4.593462348370759e-06, - "logits/chosen": 0.2810625433921814, - "logits/rejected": 0.3787192404270172, - "logps/chosen": -342.5027770996094, - "logps/rejected": -316.65985107421875, - "loss": 0.552, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.3849152624607086, - "rewards/margins": 0.6414963006973267, - "rewards/rejected": -1.0264116525650024, - "step": 1050 - }, - { - "epoch": 0.5548285789060455, - "grad_norm": 276.0, - "learning_rate": 4.507221370476223e-06, - "logits/chosen": 0.34859079122543335, - "logits/rejected": 0.34620124101638794, - "logps/chosen": -351.231689453125, - "logps/rejected": -333.7611389160156, - "loss": 0.5648, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.3917597532272339, - "rewards/margins": 0.6424742937088013, - "rewards/rejected": 
-1.0342340469360352, - "step": 1060 - }, - { - "epoch": 0.5600628107825176, - "grad_norm": 217.0, - "learning_rate": 4.421128189776195e-06, - "logits/chosen": 0.29422345757484436, - "logits/rejected": 0.3956177234649658, - "logps/chosen": -312.7371826171875, - "logps/rejected": -271.85382080078125, - "loss": 0.5427, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.39171385765075684, - "rewards/margins": 0.6362006664276123, - "rewards/rejected": -1.0279145240783691, - "step": 1070 - }, - { - "epoch": 0.5652970426589898, - "grad_norm": 268.0, - "learning_rate": 4.335208627866438e-06, - "logits/chosen": 0.43888354301452637, - "logits/rejected": 0.47600990533828735, - "logps/chosen": -342.0023193359375, - "logps/rejected": -299.25982666015625, - "loss": 0.5153, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.41604313254356384, - "rewards/margins": 0.7133996486663818, - "rewards/rejected": -1.1294429302215576, - "step": 1080 - }, - { - "epoch": 0.5705312745354619, - "grad_norm": 330.0, - "learning_rate": 4.249488454269908e-06, - "logits/chosen": 0.37367188930511475, - "logits/rejected": 0.4846370816230774, - "logps/chosen": -356.6741638183594, - "logps/rejected": -335.75311279296875, - "loss": 0.5415, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.5590308904647827, - "rewards/margins": 0.6862959861755371, - "rewards/rejected": -1.2453268766403198, - "step": 1090 - }, - { - "epoch": 0.575765506411934, - "grad_norm": 310.0, - "learning_rate": 4.163993378707786e-06, - "logits/chosen": 0.3371972143650055, - "logits/rejected": 0.39444199204444885, - "logps/chosen": -321.92254638671875, - "logps/rejected": -300.8343505859375, - "loss": 0.5644, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5255651473999023, - "rewards/margins": 0.709193766117096, - "rewards/rejected": -1.2347590923309326, - "step": 1100 - }, - { - "epoch": 0.5809997382884062, - "grad_norm": 380.0, - "learning_rate": 4.0787490433884685e-06, - "logits/chosen": 0.3301977813243866, - "logits/rejected": 0.4222096800804138, - "logps/chosen": -329.1367492675781, - "logps/rejected": -303.93341064453125, - "loss": 0.5756, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5914889574050903, - "rewards/margins": 0.5726789236068726, - "rewards/rejected": -1.1641680002212524, - "step": 1110 - }, - { - "epoch": 0.5862339701648783, - "grad_norm": 340.0, - "learning_rate": 3.993781015316802e-06, - "logits/chosen": 0.32290878891944885, - "logits/rejected": 0.3790258765220642, - "logps/chosen": -377.72662353515625, - "logps/rejected": -331.83441162109375, - "loss": 0.5863, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.6014608144760132, - "rewards/margins": 0.6531153917312622, - "rewards/rejected": -1.2545760869979858, - "step": 1120 - }, - { - "epoch": 0.5914682020413504, - "grad_norm": 356.0, - "learning_rate": 3.909114778625861e-06, - "logits/chosen": 0.33370259404182434, - "logits/rejected": 0.3154350221157074, - "logps/chosen": -382.1673278808594, - "logps/rejected": -314.76025390625, - "loss": 0.4867, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.493272602558136, - "rewards/margins": 0.8312891125679016, - "rewards/rejected": -1.3245617151260376, - "step": 1130 - }, - { - "epoch": 0.5967024339178225, - "grad_norm": 342.0, - "learning_rate": 3.824775726933596e-06, - "logits/chosen": 0.3804655969142914, - "logits/rejected": 0.4335269033908844, - "logps/chosen": -347.4833984375, - "logps/rejected": -302.07989501953125, - "loss": 
0.5511, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5972079038619995, - "rewards/margins": 0.6958715319633484, - "rewards/rejected": -1.2930794954299927, - "step": 1140 - }, - { - "epoch": 0.6019366657942947, - "grad_norm": 452.0, - "learning_rate": 3.7407891557266242e-06, - "logits/chosen": 0.28930753469467163, - "logits/rejected": 0.3638380169868469, - "logps/chosen": -337.60076904296875, - "logps/rejected": -327.3893127441406, - "loss": 0.5637, + "epoch": 0.9942438513867086, + "grad_norm": 23.800779342651367, + "learning_rate": 7.498153615653758e-11, + "logits/chosen": 0.515625, + "logits/rejected": 0.447265625, + "logps/chosen": -338.0, + "logps/rejected": -324.0, + "loss": 0.5375, "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7492964863777161, - "rewards/margins": 0.7008451819419861, - "rewards/rejected": -1.4501416683197021, - "step": 1150 - }, - { - "epoch": 0.6071708976707668, - "grad_norm": 340.0, - "learning_rate": 3.6571802547734457e-06, - "logits/chosen": 0.29493704438209534, - "logits/rejected": 0.42343488335609436, - "logps/chosen": -345.88421630859375, - "logps/rejected": -324.6173095703125, - "loss": 0.5322, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.7139222025871277, - "rewards/margins": 0.7538946866989136, - "rewards/rejected": -1.467816948890686, - "step": 1160 - }, - { - "epoch": 0.6124051295472389, - "grad_norm": 340.0, - "learning_rate": 3.5739741005693807e-06, - "logits/chosen": 0.36524444818496704, - "logits/rejected": 0.4817792475223541, - "logps/chosen": -373.5266418457031, - "logps/rejected": -342.4767150878906, - "loss": 0.5557, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6850903630256653, - "rewards/margins": 0.7452309727668762, - "rewards/rejected": -1.430321455001831, - "step": 1170 - }, - { - "epoch": 0.6176393614237111, - "grad_norm": 288.0, - "learning_rate": 3.4911956488154696e-06, - "logits/chosen": 0.2990756034851074, - "logits/rejected": 0.3051765561103821, - "logps/chosen": -341.70721435546875, - "logps/rejected": -312.9698791503906, - "loss": 0.6205, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.803567111492157, - "rewards/margins": 0.5913842916488647, - "rewards/rejected": -1.3949514627456665, - "step": 1180 - }, - { - "epoch": 0.6228735933001832, - "grad_norm": 274.0, - "learning_rate": 3.4088697269336045e-06, - "logits/chosen": 0.3608161211013794, - "logits/rejected": 0.40439486503601074, - "logps/chosen": -363.10028076171875, - "logps/rejected": -309.46038818359375, - "loss": 0.4981, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.5898820757865906, - "rewards/margins": 0.8492467999458313, - "rewards/rejected": -1.4391288757324219, - "step": 1190 - }, - { - "epoch": 0.6281078251766553, - "grad_norm": 396.0, - "learning_rate": 3.3270210266201373e-06, - "logits/chosen": 0.39221999049186707, - "logits/rejected": 0.44608980417251587, - "logps/chosen": -349.74444580078125, - "logps/rejected": -325.3342590332031, - "loss": 0.5947, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.7365375757217407, - "rewards/margins": 0.6322949528694153, - "rewards/rejected": -1.3688325881958008, - "step": 1200 - }, - { - "epoch": 0.6333420570531274, - "grad_norm": 270.0, - "learning_rate": 3.2456740964401977e-06, - "logits/chosen": 0.39257878065109253, - "logits/rejected": 0.5652514696121216, - "logps/chosen": -351.83795166015625, - "logps/rejected": -336.03948974609375, - "loss": 0.5676, - "rewards/accuracies": 0.690625011920929, - 
"rewards/chosen": -0.7251110672950745, - "rewards/margins": 0.6608349084854126, - "rewards/rejected": -1.3859459161758423, - "step": 1210 - }, - { - "epoch": 0.6385762889295996, - "grad_norm": 352.0, - "learning_rate": 3.1648533344649303e-06, - "logits/chosen": 0.3172612190246582, - "logits/rejected": 0.5099163055419922, - "logps/chosen": -338.2564392089844, - "logps/rejected": -346.206298828125, - "loss": 0.5242, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.7247421145439148, - "rewards/margins": 0.7186325192451477, - "rewards/rejected": -1.4433746337890625, - "step": 1220 - }, - { - "epoch": 0.6438105208060717, - "grad_norm": 396.0, - "learning_rate": 3.084582980953881e-06, - "logits/chosen": 0.3695985674858093, - "logits/rejected": 0.3975854814052582, - "logps/chosen": -386.8086242675781, - "logps/rejected": -308.7548828125, - "loss": 0.5514, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7079992890357971, - "rewards/margins": 0.7007894515991211, - "rewards/rejected": -1.4087889194488525, - "step": 1230 - }, - { - "epoch": 0.6490447526825438, - "grad_norm": 306.0, - "learning_rate": 3.0048871110847043e-06, - "logits/chosen": 0.36669978499412537, - "logits/rejected": 0.3211643695831299, - "logps/chosen": -364.41278076171875, - "logps/rejected": -323.64862060546875, - "loss": 0.507, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.6616480350494385, - "rewards/margins": 0.8358721733093262, - "rewards/rejected": -1.4975202083587646, - "step": 1240 - }, - { - "epoch": 0.654278984559016, - "grad_norm": 314.0, - "learning_rate": 2.925789627732395e-06, - "logits/chosen": 0.30698102712631226, - "logits/rejected": 0.35474127531051636, - "logps/chosen": -357.7930603027344, - "logps/rejected": -327.1461181640625, - "loss": 0.5319, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.7104305624961853, - "rewards/margins": 0.7990323901176453, - "rewards/rejected": -1.5094630718231201, - "step": 1250 - }, - { - "epoch": 0.6595132164354881, - "grad_norm": 278.0, - "learning_rate": 2.8473142543001818e-06, - "logits/chosen": 0.3213528096675873, - "logits/rejected": 0.38778603076934814, - "logps/chosen": -319.82452392578125, - "logps/rejected": -299.6809387207031, - "loss": 0.5615, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.7967512011528015, - "rewards/margins": 0.6921517848968506, - "rewards/rejected": -1.4889030456542969, - "step": 1260 - }, - { - "epoch": 0.6647474483119602, - "grad_norm": 312.0, - "learning_rate": 2.7694845276042714e-06, - "logits/chosen": 0.3033554255962372, - "logits/rejected": 0.31667545437812805, - "logps/chosen": -360.24053955078125, - "logps/rejected": -331.3898010253906, - "loss": 0.5285, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.6901473999023438, - "rewards/margins": 0.8282734751701355, - "rewards/rejected": -1.5184208154678345, - "step": 1270 - }, - { - "epoch": 0.6699816801884323, - "grad_norm": 356.0, - "learning_rate": 2.6923237908145227e-06, - "logits/chosen": 0.3856261074542999, - "logits/rejected": 0.39977845549583435, - "logps/chosen": -324.6197204589844, - "logps/rejected": -332.137939453125, - "loss": 0.5253, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.6914088726043701, - "rewards/margins": 0.7797143459320068, - "rewards/rejected": -1.471123218536377, - "step": 1280 - }, - { - "epoch": 0.6752159120649045, - "grad_norm": 290.0, - "learning_rate": 2.615855186453241e-06, - "logits/chosen": 0.3395998179912567, - "logits/rejected": 
0.4570327401161194, - "logps/chosen": -357.72625732421875, - "logps/rejected": -345.084716796875, - "loss": 0.5233, - "rewards/accuracies": 0.7406250238418579, - "rewards/chosen": -0.630346417427063, - "rewards/margins": 0.8085399866104126, - "rewards/rejected": -1.438886284828186, - "step": 1290 - }, - { - "epoch": 0.6804501439413766, - "grad_norm": 366.0, - "learning_rate": 2.5401016494541193e-06, - "logits/chosen": 0.29590579867362976, - "logits/rejected": 0.41916173696517944, - "logps/chosen": -340.41949462890625, - "logps/rejected": -329.39324951171875, - "loss": 0.5494, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7232412099838257, - "rewards/margins": 0.7066032886505127, - "rewards/rejected": -1.429844617843628, - "step": 1300 - }, - { - "epoch": 0.6856843758178487, - "grad_norm": 420.0, - "learning_rate": 2.4650859002834465e-06, - "logits/chosen": 0.33335959911346436, - "logits/rejected": 0.5272048115730286, - "logps/chosen": -347.9224548339844, - "logps/rejected": -328.1867980957031, - "loss": 0.5436, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.6804947853088379, - "rewards/margins": 0.6814740896224976, - "rewards/rejected": -1.3619688749313354, - "step": 1310 - }, - { - "epoch": 0.6909186076943209, - "grad_norm": 330.0, - "learning_rate": 2.390830438125661e-06, - "logits/chosen": 0.2588549256324768, - "logits/rejected": 0.298136442899704, - "logps/chosen": -352.1745910644531, - "logps/rejected": -324.03460693359375, - "loss": 0.595, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6992086172103882, - "rewards/margins": 0.6043084263801575, - "rewards/rejected": -1.3035171031951904, - "step": 1320 - }, - { - "epoch": 0.696152839570793, - "grad_norm": 266.0, - "learning_rate": 2.3173575341352457e-06, - "logits/chosen": 0.31691044569015503, - "logits/rejected": 0.4209415316581726, - "logps/chosen": -344.1717529296875, - "logps/rejected": -322.6068115234375, - "loss": 0.5849, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.5810431241989136, - "rewards/margins": 0.6289807558059692, - "rewards/rejected": -1.2100238800048828, - "step": 1330 - }, - { - "epoch": 0.7013870714472651, - "grad_norm": 316.0, - "learning_rate": 2.2446892247570257e-06, - "logits/chosen": 0.34166431427001953, - "logits/rejected": 0.4205591678619385, - "logps/chosen": -349.1050109863281, - "logps/rejected": -328.892822265625, - "loss": 0.5814, - "rewards/accuracies": 0.7093750238418579, - "rewards/chosen": -0.6503351926803589, - "rewards/margins": 0.6150357723236084, - "rewards/rejected": -1.2653712034225464, - "step": 1340 - }, - { - "epoch": 0.7066213033237373, - "grad_norm": 302.0, - "learning_rate": 2.172847305116872e-06, - "logits/chosen": 0.3496546149253845, - "logits/rejected": 0.339820921421051, - "logps/chosen": -345.417236328125, - "logps/rejected": -320.3566589355469, - "loss": 0.5675, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.5050859451293945, - "rewards/margins": 0.6172486543655396, - "rewards/rejected": -1.122334599494934, - "step": 1350 - }, - { - "epoch": 0.7118555352002094, - "grad_norm": 294.0, - "learning_rate": 2.1018533224847638e-06, - "logits/chosen": 0.36049994826316833, - "logits/rejected": 0.3396483063697815, - "logps/chosen": -375.68121337890625, - "logps/rejected": -331.56939697265625, - "loss": 0.5571, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5250564217567444, - "rewards/margins": 0.7472478151321411, - "rewards/rejected": -1.2723041772842407, - "step": 1360 - }, 
- { - "epoch": 0.7170897670766815, - "grad_norm": 264.0, - "learning_rate": 2.0317285698122035e-06, - "logits/chosen": 0.23286870121955872, - "logits/rejected": 0.4170301556587219, - "logps/chosen": -337.4573059082031, - "logps/rejected": -320.9345397949219, - "loss": 0.5347, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.49074649810791016, - "rewards/margins": 0.73365718126297, - "rewards/rejected": -1.2244036197662354, - "step": 1370 - }, - { - "epoch": 0.7223239989531536, - "grad_norm": 280.0, - "learning_rate": 1.962494079345906e-06, - "logits/chosen": 0.20548689365386963, - "logits/rejected": 0.2943686544895172, - "logps/chosen": -381.2185363769531, - "logps/rejected": -330.40753173828125, - "loss": 0.5545, - "rewards/accuracies": 0.6656249761581421, - "rewards/chosen": -0.536620557308197, - "rewards/margins": 0.7038464546203613, - "rewards/rejected": -1.2404670715332031, - "step": 1380 - }, - { - "epoch": 0.7275582308296258, - "grad_norm": 254.0, - "learning_rate": 1.8941706163196676e-06, - "logits/chosen": 0.38386040925979614, - "logits/rejected": 0.5160520672798157, - "logps/chosen": -312.7783203125, - "logps/rejected": -307.5871887207031, - "loss": 0.5393, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.5432096719741821, - "rewards/margins": 0.667094349861145, - "rewards/rejected": -1.2103040218353271, - "step": 1390 - }, - { - "epoch": 0.7327924627060979, - "grad_norm": 255.0, - "learning_rate": 1.8267786727263426e-06, - "logits/chosen": 0.3777836263179779, - "logits/rejected": 0.4588887691497803, - "logps/chosen": -339.7103576660156, - "logps/rejected": -315.67547607421875, - "loss": 0.5454, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.5095704793930054, - "rewards/margins": 0.6885578632354736, - "rewards/rejected": -1.198128342628479, - "step": 1400 - }, - { - "epoch": 0.73802669458257, - "grad_norm": 249.0, - "learning_rate": 1.760338461171755e-06, - "logits/chosen": 0.33284902572631836, - "logits/rejected": 0.41128548979759216, - "logps/chosen": -326.13421630859375, - "logps/rejected": -321.0505065917969, - "loss": 0.5956, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.6211567521095276, - "rewards/margins": 0.6012374758720398, - "rewards/rejected": -1.2223942279815674, - "step": 1410 - }, - { - "epoch": 0.7432609264590422, - "grad_norm": 278.0, - "learning_rate": 1.6948699088123992e-06, - "logits/chosen": 0.3391318917274475, - "logits/rejected": 0.35012945532798767, - "logps/chosen": -332.51690673828125, - "logps/rejected": -305.8824768066406, - "loss": 0.5685, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5398066639900208, - "rewards/margins": 0.6431677341461182, - "rewards/rejected": -1.1829744577407837, - "step": 1420 - }, - { - "epoch": 0.7484951583355143, - "grad_norm": 342.0, - "learning_rate": 1.6303926513787821e-06, - "logits/chosen": 0.19832518696784973, - "logits/rejected": 0.17641706764698029, - "logps/chosen": -337.8302001953125, - "logps/rejected": -308.45574951171875, - "loss": 0.5347, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5528967976570129, - "rewards/margins": 0.7190114855766296, - "rewards/rejected": -1.2719082832336426, - "step": 1430 - }, - { - "epoch": 0.7537293902119864, - "grad_norm": 292.0, - "learning_rate": 1.5669260272861426e-06, - "logits/chosen": 0.3353267014026642, - "logits/rejected": 0.34371891617774963, - "logps/chosen": -334.8343200683594, - "logps/rejected": -333.1044006347656, - "loss": 0.5223, - "rewards/accuracies": 0.7437499761581421, 
- "rewards/chosen": -0.514175534248352, - "rewards/margins": 0.7716984748840332, - "rewards/rejected": -1.2858738899230957, - "step": 1440 - }, - { - "epoch": 0.7589636220884585, - "grad_norm": 296.0, - "learning_rate": 1.5044890718343535e-06, - "logits/chosen": 0.3490106463432312, - "logits/rejected": 0.2644171118736267, - "logps/chosen": -323.67138671875, - "logps/rejected": -314.19647216796875, - "loss": 0.5748, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.6448107957839966, - "rewards/margins": 0.6091581583023071, - "rewards/rejected": -1.2539689540863037, - "step": 1450 - }, - { - "epoch": 0.7641978539649307, - "grad_norm": 300.0, - "learning_rate": 1.4431005114987485e-06, - "logits/chosen": 0.37269195914268494, - "logits/rejected": 0.3853556513786316, - "logps/chosen": -394.0393371582031, - "logps/rejected": -351.55548095703125, - "loss": 0.5406, - "rewards/accuracies": 0.7093750238418579, - "rewards/chosen": -0.4938036799430847, - "rewards/margins": 0.7198097109794617, - "rewards/rejected": -1.2136132717132568, - "step": 1460 - }, - { - "epoch": 0.7694320858414028, - "grad_norm": 302.0, - "learning_rate": 1.3827787583135533e-06, - "logits/chosen": 0.2608596086502075, - "logits/rejected": 0.3895898461341858, - "logps/chosen": -346.5641784667969, - "logps/rejected": -333.06097412109375, - "loss": 0.5955, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5579820871353149, - "rewards/margins": 0.6600214838981628, - "rewards/rejected": -1.218003511428833, - "step": 1470 - }, - { - "epoch": 0.7746663177178749, - "grad_norm": 384.0, - "learning_rate": 1.3235419043496362e-06, - "logits/chosen": 0.4145224094390869, - "logits/rejected": 0.5047595500946045, - "logps/chosen": -339.38018798828125, - "logps/rejected": -315.99114990234375, - "loss": 0.5873, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5417883992195129, - "rewards/margins": 0.6276899576187134, - "rewards/rejected": -1.169478178024292, - "step": 1480 - }, - { - "epoch": 0.7799005495943471, - "grad_norm": 308.0, - "learning_rate": 1.2654077162882271e-06, - "logits/chosen": 0.3089558482170105, - "logits/rejected": 0.33582228422164917, - "logps/chosen": -344.6625671386719, - "logps/rejected": -318.9618835449219, - "loss": 0.5562, - "rewards/accuracies": 0.715624988079071, - "rewards/chosen": -0.5232676863670349, - "rewards/margins": 0.7178744077682495, - "rewards/rejected": -1.2411420345306396, - "step": 1490 - }, - { - "epoch": 0.7851347814708192, - "grad_norm": 290.0, - "learning_rate": 1.2083936300922238e-06, - "logits/chosen": 0.45748963952064514, - "logits/rejected": 0.5170606374740601, - "logps/chosen": -360.7559814453125, - "logps/rejected": -332.96112060546875, - "loss": 0.59, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5006940364837646, - "rewards/margins": 0.6657803058624268, - "rewards/rejected": -1.1664743423461914, - "step": 1500 - }, - { - "epoch": 0.7903690133472913, - "grad_norm": 242.0, - "learning_rate": 1.1525167457766856e-06, - "logits/chosen": 0.33311501145362854, - "logits/rejected": 0.3369537889957428, - "logps/chosen": -335.03680419921875, - "logps/rejected": -313.3392333984375, - "loss": 0.5675, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.5090693831443787, - "rewards/margins": 0.669906497001648, - "rewards/rejected": -1.1789758205413818, - "step": 1510 - }, - { - "epoch": 0.7956032452237635, - "grad_norm": 334.0, - "learning_rate": 1.0977938222801004e-06, - "logits/chosen": 0.36881986260414124, - "logits/rejected": 0.45787104964256287, - 
"logps/chosen": -338.68902587890625, - "logps/rejected": -313.40631103515625, - "loss": 0.5784, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.5590661764144897, - "rewards/margins": 0.6329992413520813, - "rewards/rejected": -1.1920652389526367, - "step": 1520 - }, - { - "epoch": 0.8008374771002356, - "grad_norm": 322.0, - "learning_rate": 1.0442412724379365e-06, - "logits/chosen": 0.26200932264328003, - "logits/rejected": 0.27576130628585815, - "logps/chosen": -344.83148193359375, - "logps/rejected": -293.1750793457031, - "loss": 0.5813, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.629385769367218, - "rewards/margins": 0.5875921845436096, - "rewards/rejected": -1.216977834701538, - "step": 1530 - }, - { - "epoch": 0.8060717089767077, - "grad_norm": 286.0, - "learning_rate": 9.9187515806e-07, - "logits/chosen": 0.44481024146080017, - "logits/rejected": 0.47209352254867554, - "logps/chosen": -366.44891357421875, - "logps/rejected": -317.8438415527344, - "loss": 0.565, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.4901696741580963, - "rewards/margins": 0.6533368825912476, - "rewards/rejected": -1.1435067653656006, - "step": 1540 - }, - { - "epoch": 0.8113059408531798, - "grad_norm": 282.0, - "learning_rate": 9.407111851130879e-07, - "logits/chosen": 0.43470582365989685, - "logits/rejected": 0.3668103814125061, - "logps/chosen": -337.2115783691406, - "logps/rejected": -318.40155029296875, - "loss": 0.5212, - "rewards/accuracies": 0.746874988079071, - "rewards/chosen": -0.49172383546829224, - "rewards/margins": 0.7541629672050476, - "rewards/rejected": -1.2458868026733398, - "step": 1550 - }, - { - "epoch": 0.816540172729652, - "grad_norm": 298.0, - "learning_rate": 8.907646990103496e-07, - "logits/chosen": 0.37122786045074463, - "logits/rejected": 0.4722965657711029, - "logps/chosen": -329.2878723144531, - "logps/rejected": -302.6796569824219, - "loss": 0.533, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5332542657852173, - "rewards/margins": 0.6741858720779419, - "rewards/rejected": -1.2074401378631592, - "step": 1560 - }, - { - "epoch": 0.821774404606124, - "grad_norm": 234.0, - "learning_rate": 8.42050680008798e-07, - "logits/chosen": 0.2228083610534668, - "logits/rejected": 0.2700883150100708, - "logps/chosen": -343.5450744628906, - "logps/rejected": -327.5457458496094, - "loss": 0.561, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5294641256332397, - "rewards/margins": 0.6504173278808594, - "rewards/rejected": -1.1798814535140991, - "step": 1570 - }, - { - "epoch": 0.8270086364825961, - "grad_norm": 322.0, - "learning_rate": 7.945837387163424e-07, - "logits/chosen": 0.42028242349624634, - "logits/rejected": 0.4180065095424652, - "logps/chosen": -353.06536865234375, - "logps/rejected": -321.9817810058594, - "loss": 0.5844, - "rewards/accuracies": 0.7093750238418579, - "rewards/chosen": -0.5255261659622192, - "rewards/margins": 0.6547245979309082, - "rewards/rejected": -1.1802507638931274, - "step": 1580 - }, - { - "epoch": 0.8322428683590684, - "grad_norm": 380.0, - "learning_rate": 7.483781117096828e-07, - "logits/chosen": 0.38973018527030945, - "logits/rejected": 0.43399643898010254, - "logps/chosen": -370.450439453125, - "logps/rejected": -342.24395751953125, - "loss": 0.547, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.527172327041626, - "rewards/margins": 0.7201731204986572, - "rewards/rejected": -1.2473453283309937, - "step": 1590 - }, - { - "epoch": 0.8374771002355405, - 
"grad_norm": 372.0, - "learning_rate": 7.034476572643855e-07, - "logits/chosen": 0.4103736877441406, - "logits/rejected": 0.42073503136634827, - "logps/chosen": -348.49932861328125, - "logps/rejected": -322.942626953125, - "loss": 0.5787, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.5726643800735474, - "rewards/margins": 0.6367040872573853, - "rewards/rejected": -1.2093684673309326, - "step": 1600 - }, - { - "epoch": 0.8427113321120125, - "grad_norm": 324.0, - "learning_rate": 6.598058511984307e-07, - "logits/chosen": 0.4105502665042877, - "logits/rejected": 0.4338196814060211, - "logps/chosen": -334.387451171875, - "logps/rejected": -300.1074523925781, - "loss": 0.556, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.640805721282959, - "rewards/margins": 0.6867104768753052, - "rewards/rejected": -1.3275163173675537, - "step": 1610 - }, - { - "epoch": 0.8479455639884846, - "grad_norm": 246.0, - "learning_rate": 6.174657828304543e-07, - "logits/chosen": 0.3273460268974304, - "logits/rejected": 0.38331374526023865, - "logps/chosen": -332.36773681640625, - "logps/rejected": -316.30072021484375, - "loss": 0.6104, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.5995886325836182, - "rewards/margins": 0.5190034508705139, - "rewards/rejected": -1.1185920238494873, - "step": 1620 - }, - { - "epoch": 0.8531797958649568, - "grad_norm": 256.0, - "learning_rate": 5.764401510539253e-07, - "logits/chosen": 0.39274919033050537, - "logits/rejected": 0.30704575777053833, - "logps/chosen": -358.00335693359375, - "logps/rejected": -301.65203857421875, - "loss": 0.5616, - "rewards/accuracies": 0.7406250238418579, - "rewards/chosen": -0.5209869742393494, - "rewards/margins": 0.6791882514953613, - "rewards/rejected": -1.2001752853393555, - "step": 1630 - }, - { - "epoch": 0.8584140277414289, - "grad_norm": 230.0, - "learning_rate": 5.36741260528415e-07, - "logits/chosen": 0.2753371000289917, - "logits/rejected": 0.3843556344509125, - "logps/chosen": -372.30059814453125, - "logps/rejected": -354.3190002441406, - "loss": 0.4937, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.4627212584018707, - "rewards/margins": 0.8342711329460144, - "rewards/rejected": -1.296992540359497, - "step": 1640 - }, - { - "epoch": 0.863648259617901, - "grad_norm": 253.0, - "learning_rate": 4.98381017989103e-07, - "logits/chosen": 0.25747784972190857, - "logits/rejected": 0.3317343294620514, - "logps/chosen": -345.8427734375, - "logps/rejected": -307.8304748535156, - "loss": 0.5145, - "rewards/accuracies": 0.746874988079071, - "rewards/chosen": -0.4770580232143402, - "rewards/margins": 0.7587519884109497, - "rewards/rejected": -1.2358100414276123, - "step": 1650 - }, - { - "epoch": 0.8688824914943732, - "grad_norm": 274.0, - "learning_rate": 4.6137092867564127e-07, - "logits/chosen": 0.3641647398471832, - "logits/rejected": 0.42129549384117126, - "logps/chosen": -317.44012451171875, - "logps/rejected": -299.4899597167969, - "loss": 0.5478, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5020959377288818, - "rewards/margins": 0.6763127446174622, - "rewards/rejected": -1.1784086227416992, - "step": 1660 - }, - { - "epoch": 0.8741167233708453, - "grad_norm": 253.0, - "learning_rate": 4.2572209288143095e-07, - "logits/chosen": 0.35885730385780334, - "logits/rejected": 0.3501403331756592, - "logps/chosen": -347.42071533203125, - "logps/rejected": -318.55279541015625, - "loss": 0.5766, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": 
-0.5235342383384705, - "rewards/margins": 0.6213586926460266, - "rewards/rejected": -1.144892930984497, - "step": 1670 - }, - { - "epoch": 0.8793509552473174, - "grad_norm": 232.0, - "learning_rate": 3.9144520262435094e-07, - "logits/chosen": 0.35745617747306824, - "logits/rejected": 0.4186561703681946, - "logps/chosen": -373.4577941894531, - "logps/rejected": -316.75909423828125, - "loss": 0.5036, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.39865627884864807, - "rewards/margins": 0.827733039855957, - "rewards/rejected": -1.2263892889022827, - "step": 1680 - }, - { - "epoch": 0.8845851871237895, - "grad_norm": 348.0, - "learning_rate": 3.5855053843994625e-07, - "logits/chosen": 0.3469906747341156, - "logits/rejected": 0.3383873999118805, - "logps/chosen": -330.7817687988281, - "logps/rejected": -343.49249267578125, - "loss": 0.5905, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.5502803921699524, - "rewards/margins": 0.5968191623687744, - "rewards/rejected": -1.1470996141433716, - "step": 1690 - }, - { - "epoch": 0.8898194190002617, - "grad_norm": 290.0, - "learning_rate": 3.270479662980247e-07, - "logits/chosen": 0.4720439314842224, - "logits/rejected": 0.507573127746582, - "logps/chosen": -340.6956787109375, - "logps/rejected": -328.48541259765625, - "loss": 0.5653, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5593769550323486, - "rewards/margins": 0.6956008672714233, - "rewards/rejected": -1.254977822303772, - "step": 1700 - }, - { - "epoch": 0.8950536508767338, - "grad_norm": 320.0, - "learning_rate": 2.9694693464359434e-07, - "logits/chosen": 0.36417311429977417, - "logits/rejected": 0.32077834010124207, - "logps/chosen": -358.11871337890625, - "logps/rejected": -348.39031982421875, - "loss": 0.55, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.5245743989944458, - "rewards/margins": 0.7115057706832886, - "rewards/rejected": -1.2360801696777344, - "step": 1710 - }, - { - "epoch": 0.9002878827532059, - "grad_norm": 302.0, - "learning_rate": 2.682564715630287e-07, - "logits/chosen": 0.33791905641555786, - "logits/rejected": 0.3675960600376129, - "logps/chosen": -351.0929870605469, - "logps/rejected": -325.4019775390625, - "loss": 0.5036, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.5013250112533569, - "rewards/margins": 0.8183242678642273, - "rewards/rejected": -1.319649338722229, - "step": 1720 - }, - { - "epoch": 0.9055221146296781, - "grad_norm": 384.0, - "learning_rate": 2.4098518207630706e-07, - "logits/chosen": 0.38433754444122314, - "logits/rejected": 0.397840678691864, - "logps/chosen": -345.4227294921875, - "logps/rejected": -300.4522399902344, - "loss": 0.5736, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.558509886264801, - "rewards/margins": 0.6275274157524109, - "rewards/rejected": -1.186037302017212, - "step": 1730 - }, - { - "epoch": 0.9107563465061502, - "grad_norm": 374.0, - "learning_rate": 2.1514124555614412e-07, - "logits/chosen": 0.2413160502910614, - "logits/rejected": 0.3199203610420227, - "logps/chosen": -373.3514099121094, - "logps/rejected": -336.98895263671875, - "loss": 0.5558, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.5669843554496765, - "rewards/margins": 0.670451283454895, - "rewards/rejected": -1.2374355792999268, - "step": 1740 - }, - { - "epoch": 0.9159905783826223, - "grad_norm": 274.0, - "learning_rate": 1.9073241327478287e-07, - "logits/chosen": 0.2592464089393616, - "logits/rejected": 0.2286282330751419, - 
"logps/chosen": -335.2867736816406, - "logps/rejected": -296.50677490234375, - "loss": 0.5734, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.5148959159851074, - "rewards/margins": 0.5882579684257507, - "rewards/rejected": -1.103153944015503, - "step": 1750 - }, - { - "epoch": 0.9212248102590945, - "grad_norm": 348.0, - "learning_rate": 1.677660060791836e-07, - "logits/chosen": 0.35103827714920044, - "logits/rejected": 0.36917150020599365, - "logps/chosen": -350.96551513671875, - "logps/rejected": -313.03997802734375, - "loss": 0.5246, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.4882447123527527, - "rewards/margins": 0.7595881819725037, - "rewards/rejected": -1.247833013534546, - "step": 1760 - }, - { - "epoch": 0.9264590421355666, - "grad_norm": 286.0, - "learning_rate": 1.4624891219531256e-07, - "logits/chosen": 0.3121943771839142, - "logits/rejected": 0.3472541272640228, - "logps/chosen": -350.4696044921875, - "logps/rejected": -318.84222412109375, - "loss": 0.5622, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.5181047320365906, - "rewards/margins": 0.6555663347244263, - "rewards/rejected": -1.173671007156372, - "step": 1770 - }, - { - "epoch": 0.9316932740120387, - "grad_norm": 251.0, - "learning_rate": 1.2618758516218187e-07, - "logits/chosen": 0.38791024684906006, - "logits/rejected": 0.38494163751602173, - "logps/chosen": -309.5002746582031, - "logps/rejected": -289.1304016113281, - "loss": 0.5638, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.49522829055786133, - "rewards/margins": 0.622164249420166, - "rewards/rejected": -1.1173925399780273, - "step": 1780 - }, - { - "epoch": 0.9369275058885108, - "grad_norm": 336.0, - "learning_rate": 1.0758804189626492e-07, - "logits/chosen": 0.34429654479026794, - "logits/rejected": 0.43823686242103577, - "logps/chosen": -337.650146484375, - "logps/rejected": -311.02020263671875, - "loss": 0.5783, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5348609685897827, - "rewards/margins": 0.6113255620002747, - "rewards/rejected": -1.1461864709854126, - "step": 1790 - }, - { - "epoch": 0.942161737764983, - "grad_norm": 360.0, - "learning_rate": 9.045586088686497e-08, - "logits/chosen": 0.35523998737335205, - "logits/rejected": 0.29251137375831604, - "logps/chosen": -362.7002868652344, - "logps/rejected": -318.9178161621094, - "loss": 0.5547, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.5227453112602234, - "rewards/margins": 0.6950558423995972, - "rewards/rejected": -1.2178010940551758, - "step": 1800 - }, - { - "epoch": 0.9473959696414551, - "grad_norm": 370.0, - "learning_rate": 7.479618052298132e-08, - "logits/chosen": 0.46711522340774536, - "logits/rejected": 0.34828323125839233, - "logps/chosen": -365.4064636230469, - "logps/rejected": -346.60321044921875, - "loss": 0.5483, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.5453445315361023, - "rewards/margins": 0.6945411562919617, - "rewards/rejected": -1.2398855686187744, - "step": 1810 - }, - { - "epoch": 0.9526302015179272, - "grad_norm": 272.0, - "learning_rate": 6.06136975521715e-08, - "logits/chosen": 0.21810802817344666, - "logits/rejected": 0.33456215262413025, - "logps/chosen": -362.4671630859375, - "logps/rejected": -329.0609436035156, - "loss": 0.5389, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.5769203305244446, - "rewards/margins": 0.7337585687637329, - "rewards/rejected": -1.3106788396835327, - "step": 1820 - }, - { - "epoch": 0.9578644333943994, - "grad_norm": 
280.0, - "learning_rate": 4.7912665671874246e-08, - "logits/chosen": 0.2832115590572357, - "logits/rejected": 0.31732824444770813, - "logps/chosen": -343.995361328125, - "logps/rejected": -318.9495544433594, - "loss": 0.5587, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5360749959945679, - "rewards/margins": 0.6776861548423767, - "rewards/rejected": -1.2137610912322998, - "step": 1830 - }, - { - "epoch": 0.9630986652708715, - "grad_norm": 334.0, - "learning_rate": 3.669689425361444e-08, - "logits/chosen": 0.32280421257019043, - "logits/rejected": 0.36163073778152466, - "logps/chosen": -317.0058288574219, - "logps/rejected": -308.7008972167969, - "loss": 0.5632, - "rewards/accuracies": 0.6781250238418579, - "rewards/chosen": -0.4824526906013489, - "rewards/margins": 0.6204373240470886, - "rewards/rejected": -1.102890133857727, - "step": 1840 - }, - { - "epoch": 0.9683328971473436, - "grad_norm": 368.0, - "learning_rate": 2.6969747200472073e-08, - "logits/chosen": 0.378648579120636, - "logits/rejected": 0.5615746378898621, - "logps/chosen": -327.52935791015625, - "logps/rejected": -317.4973449707031, - "loss": 0.588, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.559417188167572, - "rewards/margins": 0.6587087512016296, - "rewards/rejected": -1.2181260585784912, - "step": 1850 - }, - { - "epoch": 0.9735671290238157, - "grad_norm": 340.0, - "learning_rate": 1.873414193816092e-08, - "logits/chosen": 0.3898963928222656, - "logits/rejected": 0.37868732213974, - "logps/chosen": -372.6197814941406, - "logps/rejected": -348.3377380371094, - "loss": 0.5311, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.5276554226875305, - "rewards/margins": 0.7353495359420776, - "rewards/rejected": -1.2630048990249634, - "step": 1860 - }, - { - "epoch": 0.9788013609002879, - "grad_norm": 422.0, - "learning_rate": 1.1992548540016858e-08, - "logits/chosen": 0.31078967452049255, - "logits/rejected": 0.2968718707561493, - "logps/chosen": -372.1559143066406, - "logps/rejected": -341.91424560546875, - "loss": 0.5663, - "rewards/accuracies": 0.690625011920929, - "rewards/chosen": -0.5914555788040161, - "rewards/margins": 0.6402640342712402, - "rewards/rejected": -1.2317196130752563, - "step": 1870 - }, - { - "epoch": 0.98403559277676, - "grad_norm": 306.0, - "learning_rate": 6.746988986156e-09, - "logits/chosen": 0.37066927552223206, - "logits/rejected": 0.4628186821937561, - "logps/chosen": -331.226318359375, - "logps/rejected": -304.57989501953125, - "loss": 0.5334, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.5163390636444092, - "rewards/margins": 0.7293740510940552, - "rewards/rejected": -1.245713233947754, - "step": 1880 - }, - { - "epoch": 0.9892698246532321, - "grad_norm": 326.0, - "learning_rate": 2.9990365570314874e-09, - "logits/chosen": 0.37458479404449463, - "logits/rejected": 0.33924776315689087, - "logps/chosen": -363.9075012207031, - "logps/rejected": -337.91070556640625, - "loss": 0.5197, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.5259437561035156, - "rewards/margins": 0.7279617190361023, - "rewards/rejected": -1.2539054155349731, - "step": 1890 - }, - { - "epoch": 0.9945040565297043, - "grad_norm": 338.0, - "learning_rate": 7.498153615653758e-10, - "logits/chosen": 0.327511191368103, - "logits/rejected": 0.3362283408641815, - "logps/chosen": -310.9585266113281, - "logps/rejected": -305.3989562988281, - "loss": 0.5745, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5321878790855408, - 
"rewards/margins": 0.6392725706100464, - "rewards/rejected": -1.171460509300232, - "step": 1900 - }, - { - "epoch": 0.9997382884061764, - "grad_norm": 304.0, - "learning_rate": 0.0, - "logits/chosen": 0.31302136182785034, - "logits/rejected": 0.31303030252456665, - "logps/chosen": -355.60076904296875, - "logps/rejected": -323.382568359375, - "loss": 0.5713, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.5852067470550537, - "rewards/margins": 0.5970735549926758, - "rewards/rejected": -1.1822803020477295, - "step": 1910 + "rewards/chosen": -0.15625, + "rewards/margins": 0.72265625, + "rewards/rejected": -0.87890625, + "step": 950 } ], "logging_steps": 10, - "max_steps": 1910, + "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -2892,7 +1452,7 @@ } }, "total_flos": 0.0, - "train_batch_size": 4, + "train_batch_size": 8, "trial_name": null, "trial_params": null }