{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9974726200505476, "eval_steps": 16, "global_step": 148, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006739679865206402, "grad_norm": 0.639816164970398, "kl": 0.0, "learning_rate": 2.702702702702703e-07, "logps/chosen": -12.887619018554688, "logps/rejected": -14.09291500515408, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.013479359730412805, "grad_norm": 0.9776865839958191, "kl": 0.0, "learning_rate": 5.405405405405406e-07, "logps/chosen": -12.487646663890166, "logps/rejected": -13.836714680989584, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.020219039595619208, "grad_norm": 1.2699075937271118, "kl": 0.0047824084758758545, "learning_rate": 8.108108108108109e-07, "logps/chosen": -10.518890380859375, "logps/rejected": -14.338980538504464, "loss": 0.5056, "rewards/chosen": -0.0008002470151103776, "rewards/margins": -0.0007839773286115653, "rewards/rejected": -1.6269686498812268e-05, "step": 3 }, { "epoch": 0.02695871946082561, "grad_norm": 0.8817349076271057, "kl": 0.0018611401319503784, "learning_rate": 1.0810810810810812e-06, "logps/chosen": -12.266104305491728, "logps/rejected": -13.822291056315104, "loss": 0.503, "rewards/chosen": 0.00035604768816162557, "rewards/margins": 0.00037912309087593767, "rewards/rejected": -2.3075402714312078e-05, "step": 4 }, { "epoch": 0.03369839932603201, "grad_norm": 1.0762742757797241, "kl": 0.002072639763355255, "learning_rate": 1.3513513513513515e-06, "logps/chosen": -12.169254503752056, "logps/rejected": -14.365091177133413, "loss": 0.5045, "rewards/chosen": -6.65488861207115e-05, "rewards/margins": -0.00010910112201518979, "rewards/rejected": 4.255223589447828e-05, "step": 5 }, { "epoch": 0.040438079191238416, "grad_norm": 0.9145295023918152, "kl": 0.007123664021492004, "learning_rate": 1.6216216216216219e-06, "logps/chosen": -11.956428773941532, "logps/rejected": -15.07843202533144, "loss": 0.5043, "rewards/chosen": -9.064829998439358e-05, "rewards/margins": -0.0003112042532143122, "rewards/rejected": 0.00022055595322991863, "step": 6 }, { "epoch": 0.04717775905644482, "grad_norm": 1.0839142799377441, "kl": 0.0074616968631744385, "learning_rate": 1.8918918918918922e-06, "logps/chosen": -11.537665473090279, "logps/rejected": -15.229617527553014, "loss": 0.5044, "rewards/chosen": -5.648369228260385e-05, "rewards/margins": 0.0015003369430020925, "rewards/rejected": -0.0015568206352846964, "step": 7 }, { "epoch": 0.05391743892165122, "grad_norm": 1.325909972190857, "kl": 0.00619843602180481, "learning_rate": 2.1621621621621623e-06, "logps/chosen": -12.332542017886514, "logps/rejected": -15.343857985276442, "loss": 0.5046, "rewards/chosen": 0.00017708414969475647, "rewards/margins": 7.641464490669702e-05, "rewards/rejected": 0.00010066950478805945, "step": 8 }, { "epoch": 0.060657118786857624, "grad_norm": 1.1749920845031738, "kl": 0.008414536714553833, "learning_rate": 2.432432432432433e-06, "logps/chosen": -10.904085356613685, "logps/rejected": -15.790567452566965, "loss": 0.5041, "rewards/chosen": -2.645625500008464e-05, "rewards/margins": 0.00024653847296057004, "rewards/rejected": -0.0002729947279606547, "step": 9 }, { "epoch": 0.06739679865206402, "grad_norm": 1.0879679918289185, "kl": 0.00559939444065094, "learning_rate": 2.702702702702703e-06, "logps/chosen": -12.677191734313965, "logps/rejected": -16.435983657836914, "loss": 0.5039, "rewards/chosen": 0.00023014770704321563, "rewards/margins": 0.0006161145865917206, "rewards/rejected": -0.00038596687954850495, "step": 10 }, { "epoch": 0.07413647851727043, "grad_norm": 0.8410467505455017, "kl": 0.008653655648231506, "learning_rate": 2.9729729729729736e-06, "logps/chosen": -12.12862807053786, "logps/rejected": -14.626085783305921, "loss": 0.5025, "rewards/chosen": 0.00037280140587916743, "rewards/margins": 0.0007017212071520115, "rewards/rejected": -0.00032891980127284403, "step": 11 }, { "epoch": 0.08087615838247683, "grad_norm": 1.1322226524353027, "kl": 0.004855245351791382, "learning_rate": 3.2432432432432437e-06, "logps/chosen": -12.374617682562935, "logps/rejected": -13.656366620744977, "loss": 0.5031, "rewards/chosen": 0.0005280521905256642, "rewards/margins": 0.0008345128310519079, "rewards/rejected": -0.00030646064052624363, "step": 12 }, { "epoch": 0.08761583824768324, "grad_norm": 0.9295361042022705, "kl": 0.005203694105148315, "learning_rate": 3.513513513513514e-06, "logps/chosen": -11.762645945829505, "logps/rejected": -15.171628824869792, "loss": 0.5036, "rewards/chosen": 0.000179896637906923, "rewards/margins": -0.00038859495509634996, "rewards/rejected": 0.000568491593003273, "step": 13 }, { "epoch": 0.09435551811288964, "grad_norm": 0.8818777203559875, "kl": 0.005161814391613007, "learning_rate": 3.7837837837837844e-06, "logps/chosen": -12.79319673426011, "logps/rejected": -14.996563720703126, "loss": 0.5027, "rewards/chosen": 0.0002456202226526597, "rewards/margins": -3.756979553430688e-05, "rewards/rejected": 0.0002831900181869666, "step": 14 }, { "epoch": 0.10109519797809605, "grad_norm": 0.9348131418228149, "kl": 0.007587306201457977, "learning_rate": 4.0540540540540545e-06, "logps/chosen": -12.363379276160037, "logps/rejected": -14.615665558845766, "loss": 0.5047, "rewards/chosen": 0.0002821806366696502, "rewards/margins": -3.7112612848990366e-05, "rewards/rejected": 0.00031929324951864057, "step": 15 }, { "epoch": 0.10783487784330244, "grad_norm": 0.9292178153991699, "kl": 0.006865538656711578, "learning_rate": 4.324324324324325e-06, "logps/chosen": -12.438332112630208, "logps/rejected": -13.490447100471048, "loss": 0.5025, "rewards/chosen": 0.0007689857234557469, "rewards/margins": 0.0013605295552634724, "rewards/rejected": -0.0005915438318077256, "step": 16 }, { "epoch": 0.10783487784330244, "eval_kl": 0.009904867969453335, "eval_logps/chosen": -12.143770363298767, "eval_logps/rejected": -16.15631522168321, "eval_loss": 0.5038444399833679, "eval_rewards/chosen": 0.0003763478032142058, "eval_rewards/margins": -0.00032021752057413065, "eval_rewards/rejected": 0.0006965653237883364, "eval_runtime": 116.6391, "eval_samples_per_second": 4.287, "eval_steps_per_second": 1.072, "step": 16 }, { "epoch": 0.11457455770850884, "grad_norm": 0.9445685148239136, "kl": 0.008397102355957031, "learning_rate": 4.594594594594596e-06, "logps/chosen": -12.696038448449338, "logps/rejected": -13.775493990990423, "loss": 0.502, "rewards/chosen": 0.0011519584240335407, "rewards/margins": 0.00034577670240390577, "rewards/rejected": 0.000806181721629635, "step": 17 }, { "epoch": 0.12131423757371525, "grad_norm": 1.1721104383468628, "kl": 0.009495265781879425, "learning_rate": 4.864864864864866e-06, "logps/chosen": -12.26165771484375, "logps/rejected": -15.170940977154356, "loss": 0.5034, "rewards/chosen": 0.0009342439833187288, "rewards/margins": 0.00023290609990047218, "rewards/rejected": 0.0007013378834182566, "step": 18 }, { "epoch": 0.12805391743892164, "grad_norm": 0.5523513555526733, "kl": 0.01127266138792038, "learning_rate": 5.135135135135135e-06, "logps/chosen": -12.93335693359375, "logps/rejected": -14.879100310496796, "loss": 0.5016, "rewards/chosen": 0.0013011722266674042, "rewards/margins": 0.000309275094515238, "rewards/rejected": 0.0009918971321521662, "step": 19 }, { "epoch": 0.13479359730412804, "grad_norm": 0.8755555748939514, "kl": 0.012708976864814758, "learning_rate": 5.405405405405406e-06, "logps/chosen": -12.150137624432963, "logps/rejected": -32.386067708333336, "loss": 0.5043, "rewards/chosen": 0.0006054952259986631, "rewards/margins": -0.007545214171866401, "rewards/rejected": 0.008150709397865065, "step": 20 }, { "epoch": 0.14153327716933445, "grad_norm": 1.4405056238174438, "kl": 0.010751791298389435, "learning_rate": 5.675675675675676e-06, "logps/chosen": -11.948696899414063, "logps/rejected": -13.955949671128216, "loss": 0.5057, "rewards/chosen": 0.00015734033659100532, "rewards/margins": -0.0011103917296756717, "rewards/rejected": 0.001267732066266677, "step": 21 }, { "epoch": 0.14827295703454085, "grad_norm": 1.1817187070846558, "kl": 0.013703078031539917, "learning_rate": 5.945945945945947e-06, "logps/chosen": -12.940583172966452, "logps/rejected": -14.576173909505208, "loss": 0.5025, "rewards/chosen": 0.001265945132164394, "rewards/margins": -0.000688387855303054, "rewards/rejected": 0.001954332987467448, "step": 22 }, { "epoch": 0.15501263689974726, "grad_norm": 6.005044937133789, "kl": 0.00823821872472763, "learning_rate": 6.2162162162162164e-06, "logps/chosen": -12.0301513671875, "logps/rejected": -21.867945053998163, "loss": 0.5057, "rewards/chosen": 0.00015112324617803097, "rewards/margins": -0.000886952123769066, "rewards/rejected": 0.001038075369947097, "step": 23 }, { "epoch": 0.16175231676495366, "grad_norm": 0.8405526876449585, "kl": 0.015235595405101776, "learning_rate": 6.486486486486487e-06, "logps/chosen": -11.505650983537947, "logps/rejected": -15.960313468143857, "loss": 0.5035, "rewards/chosen": 0.0016303594623293196, "rewards/margins": 0.0006677421369576103, "rewards/rejected": 0.0009626173253717093, "step": 24 }, { "epoch": 0.16849199663016007, "grad_norm": 0.8300200700759888, "kl": 0.0065659284591674805, "learning_rate": 6.7567567567567575e-06, "logps/chosen": -13.24988525390625, "logps/rejected": -15.025803786057692, "loss": 0.5027, "rewards/chosen": 0.0017699988186359406, "rewards/margins": 0.00020917305961633335, "rewards/rejected": 0.0015608257590196072, "step": 25 }, { "epoch": 0.17523167649536647, "grad_norm": 1.233988642692566, "kl": 0.01195797324180603, "learning_rate": 7.027027027027028e-06, "logps/chosen": -13.01835239955357, "logps/rejected": -14.520358381600216, "loss": 0.5048, "rewards/chosen": 0.0011187103177819934, "rewards/margins": 0.00018990398407569664, "rewards/rejected": 0.0009288063337062968, "step": 26 }, { "epoch": 0.18197135636057288, "grad_norm": 1.1816742420196533, "kl": 0.01915910840034485, "learning_rate": 7.297297297297298e-06, "logps/chosen": -11.739371405707466, "logps/rejected": -14.137767246791295, "loss": 0.5033, "rewards/chosen": 0.0018767921460999383, "rewards/margins": 0.00035807005469761194, "rewards/rejected": 0.0015187220914023264, "step": 27 }, { "epoch": 0.18871103622577928, "grad_norm": 0.6885544061660767, "kl": 0.01914915442466736, "learning_rate": 7.567567567567569e-06, "logps/chosen": -11.961525656960227, "logps/rejected": -14.571149272303428, "loss": 0.5019, "rewards/chosen": 0.00266437367959456, "rewards/margins": 0.0002963455541392574, "rewards/rejected": 0.0023680281254553027, "step": 28 }, { "epoch": 0.1954507160909857, "grad_norm": 0.5657196640968323, "kl": 0.014864258468151093, "learning_rate": 7.837837837837838e-06, "logps/chosen": -14.268377685546875, "logps/rejected": -19.700423803084934, "loss": 0.5008, "rewards/chosen": 0.0028473290801048277, "rewards/margins": -0.0012096259150749596, "rewards/rejected": 0.004056954995179787, "step": 29 }, { "epoch": 0.2021903959561921, "grad_norm": 0.7043168544769287, "kl": 0.011253654956817627, "learning_rate": 8.108108108108109e-06, "logps/chosen": -12.092169761657715, "logps/rejected": -21.373014450073242, "loss": 0.501, "rewards/chosen": 0.002662060549482703, "rewards/margins": 0.0013133042957633734, "rewards/rejected": 0.0013487562537193298, "step": 30 }, { "epoch": 0.20893007582139847, "grad_norm": 2.2267584800720215, "kl": 0.013516634702682495, "learning_rate": 8.378378378378378e-06, "logps/chosen": -12.272081928868447, "logps/rejected": -14.440037582859848, "loss": 0.5081, "rewards/chosen": 0.0015497208843308111, "rewards/margins": 0.0003958268897682575, "rewards/rejected": 0.0011538939945625536, "step": 31 }, { "epoch": 0.21566975568660487, "grad_norm": 0.6539870500564575, "kl": 0.0161922425031662, "learning_rate": 8.64864864864865e-06, "logps/chosen": -12.081860710592832, "logps/rejected": -14.507745361328125, "loss": 0.502, "rewards/chosen": 0.0034291958984206707, "rewards/margins": 0.0014160188462804348, "rewards/rejected": 0.002013177052140236, "step": 32 }, { "epoch": 0.21566975568660487, "eval_kl": 0.01995524764060974, "eval_logps/chosen": -12.114965053951794, "eval_logps/rejected": -16.145021293998195, "eval_loss": 0.5019354224205017, "eval_rewards/chosen": 0.0032570009274333045, "eval_rewards/margins": 0.0014309825096479906, "eval_rewards/rejected": 0.001826018417785314, "eval_runtime": 116.8406, "eval_samples_per_second": 4.279, "eval_steps_per_second": 1.07, "step": 32 }, { "epoch": 0.22240943555181128, "grad_norm": 0.6553356051445007, "kl": 0.01506737619638443, "learning_rate": 8.91891891891892e-06, "logps/chosen": -11.576536290785846, "logps/rejected": -14.449583943684896, "loss": 0.502, "rewards/chosen": 0.00276046509251875, "rewards/margins": -0.0007693189908476437, "rewards/rejected": 0.003529784083366394, "step": 33 }, { "epoch": 0.22914911541701768, "grad_norm": 0.7586061358451843, "kl": 0.017700180411338806, "learning_rate": 9.189189189189191e-06, "logps/chosen": -11.496786117553711, "logps/rejected": -15.68021011352539, "loss": 0.5019, "rewards/chosen": 0.004237835295498371, "rewards/margins": 0.0013041330967098475, "rewards/rejected": 0.0029337021987885237, "step": 34 }, { "epoch": 0.2358887952822241, "grad_norm": 1.1288914680480957, "kl": 0.029283612966537476, "learning_rate": 9.45945945945946e-06, "logps/chosen": -12.882062358240928, "logps/rejected": -15.065287272135416, "loss": 0.5022, "rewards/chosen": 0.0016913585845501193, "rewards/margins": -0.00014708854644063154, "rewards/rejected": 0.0018384471309907508, "step": 35 }, { "epoch": 0.2426284751474305, "grad_norm": 0.3856205940246582, "kl": 0.02245330810546875, "learning_rate": 9.729729729729732e-06, "logps/chosen": -11.367819213867188, "logps/rejected": -13.798005047966452, "loss": 0.5009, "rewards/chosen": 0.0032590890924135843, "rewards/margins": 4.188570321775071e-06, "rewards/rejected": 0.0032549005220918093, "step": 36 }, { "epoch": 0.2493681550126369, "grad_norm": 1.6323754787445068, "kl": 0.01760883629322052, "learning_rate": 1e-05, "logps/chosen": -11.14263124819155, "logps/rejected": -14.40852809596706, "loss": 0.5031, "rewards/chosen": 0.00255692419078615, "rewards/margins": 0.00025125865911220305, "rewards/rejected": 0.0023056655316739468, "step": 37 }, { "epoch": 0.2561078348778433, "grad_norm": 0.5301430821418762, "kl": 0.022485479712486267, "learning_rate": 9.99799753559161e-06, "logps/chosen": -12.966288248697916, "logps/rejected": -14.867413689108457, "loss": 0.5015, "rewards/chosen": 0.0032486026485761006, "rewards/margins": -0.00023628765461491605, "rewards/rejected": 0.0034848903031910166, "step": 38 }, { "epoch": 0.2628475147430497, "grad_norm": 1.3813471794128418, "kl": 0.015269860625267029, "learning_rate": 9.991991746311916e-06, "logps/chosen": -12.546598237136315, "logps/rejected": -14.897487095424108, "loss": 0.5059, "rewards/chosen": 0.003317505121231079, "rewards/margins": 0.0015055560639926366, "rewards/rejected": 0.0018119490572384425, "step": 39 }, { "epoch": 0.2695871946082561, "grad_norm": 0.9622092843055725, "kl": 0.024058394134044647, "learning_rate": 9.981987442712634e-06, "logps/chosen": -12.606202915736608, "logps/rejected": -14.283474626212284, "loss": 0.5025, "rewards/chosen": 0.0032653006059782845, "rewards/margins": 0.0015468575904522036, "rewards/rejected": 0.001718443015526081, "step": 40 }, { "epoch": 0.2763268744734625, "grad_norm": 0.4365544617176056, "kl": 0.022776372730731964, "learning_rate": 9.967992638098517e-06, "logps/chosen": -12.364286295572917, "logps/rejected": -12.308985093060661, "loss": 0.5005, "rewards/chosen": 0.0036993456383546193, "rewards/margins": 0.0010577291396318697, "rewards/rejected": 0.0026416164987227496, "step": 41 }, { "epoch": 0.2830665543386689, "grad_norm": 0.5576804876327515, "kl": 0.020083852112293243, "learning_rate": 9.950018542108818e-06, "logps/chosen": -12.115999009874132, "logps/rejected": -12.540671212332589, "loss": 0.5015, "rewards/chosen": 0.003624026974042257, "rewards/margins": 0.0011748063954569047, "rewards/rejected": 0.002449220578585352, "step": 42 }, { "epoch": 0.2898062342038753, "grad_norm": 0.6428059935569763, "kl": 0.02687143161892891, "learning_rate": 9.928079551738542e-06, "logps/chosen": -11.715484619140625, "logps/rejected": -14.010598182678223, "loss": 0.5018, "rewards/chosen": 0.004161514341831207, "rewards/margins": 0.0009207972325384617, "rewards/rejected": 0.0032407171092927456, "step": 43 }, { "epoch": 0.2965459140690817, "grad_norm": 0.5016953349113464, "kl": 0.0221768319606781, "learning_rate": 9.902193239806634e-06, "logps/chosen": -11.717826843261719, "logps/rejected": -15.60338020324707, "loss": 0.5004, "rewards/chosen": 0.004171658307313919, "rewards/margins": 0.0014381594955921173, "rewards/rejected": 0.0027334988117218018, "step": 44 }, { "epoch": 0.3032855939342881, "grad_norm": 0.6543410420417786, "kl": 0.025154344737529755, "learning_rate": 9.872380340880416e-06, "logps/chosen": -12.942235946655273, "logps/rejected": -14.092964172363281, "loss": 0.5008, "rewards/chosen": 0.004342000465840101, "rewards/margins": 0.0014070440083742142, "rewards/rejected": 0.002934956457465887, "step": 45 }, { "epoch": 0.3100252737994945, "grad_norm": 0.8556452393531799, "kl": 0.03452587127685547, "learning_rate": 9.838664734667496e-06, "logps/chosen": -10.915324244005927, "logps/rejected": -14.422269984654017, "loss": 0.5012, "rewards/chosen": 0.003742930190316562, "rewards/margins": 0.0028417849151665354, "rewards/rejected": 0.0009011452751500266, "step": 46 }, { "epoch": 0.3167649536647009, "grad_norm": 0.4844423234462738, "kl": 0.028471767902374268, "learning_rate": 9.801073426888447e-06, "logps/chosen": -11.690447407384072, "logps/rejected": -14.260365804036459, "loss": 0.5007, "rewards/chosen": 0.0039828251446447065, "rewards/margins": -0.00019254163271753902, "rewards/rejected": 0.0041753667773622456, "step": 47 }, { "epoch": 0.3235046335299073, "grad_norm": 0.6494544148445129, "kl": 0.027864396572113037, "learning_rate": 9.759636527645633e-06, "logps/chosen": -12.518224080403646, "logps/rejected": -14.488565717424665, "loss": 0.5026, "rewards/chosen": 0.003701325919893053, "rewards/margins": 0.0002690214545480788, "rewards/rejected": 0.003432304465344974, "step": 48 }, { "epoch": 0.3235046335299073, "eval_kl": 0.03345843777060509, "eval_logps/chosen": -12.096400513242713, "eval_logps/rejected": -16.13582151116877, "eval_loss": 0.5012729167938232, "eval_rewards/chosen": 0.005113235503568778, "eval_rewards/margins": 0.002367174851576145, "eval_rewards/rejected": 0.0027460606519926324, "eval_runtime": 117.0064, "eval_samples_per_second": 4.273, "eval_steps_per_second": 1.068, "step": 48 }, { "epoch": 0.33024431339511373, "grad_norm": 0.47848811745643616, "kl": 0.02880626916885376, "learning_rate": 9.714387227305422e-06, "logps/chosen": -11.936473301478795, "logps/rejected": -14.11018541124132, "loss": 0.5007, "rewards/chosen": 0.003300180658698082, "rewards/margins": -0.00017501186165544717, "rewards/rejected": 0.003475192520353529, "step": 49 }, { "epoch": 0.33698399326032014, "grad_norm": 1.9203778505325317, "kl": 0.03367514908313751, "learning_rate": 9.665361769913187e-06, "logps/chosen": -11.758908658414274, "logps/rejected": -14.825486924913195, "loss": 0.5021, "rewards/chosen": 0.003634077471655768, "rewards/margins": -7.272407308116452e-05, "rewards/rejected": 0.0037068015447369327, "step": 50 }, { "epoch": 0.34372367312552654, "grad_norm": 0.5142046809196472, "kl": 0.054336875677108765, "learning_rate": 9.612599424162344e-06, "logps/chosen": -12.71717924230239, "logps/rejected": -13.75819091796875, "loss": 0.5002, "rewards/chosen": 0.004144738702213063, "rewards/margins": 0.0004953872047218622, "rewards/rejected": 0.003649351497491201, "step": 51 }, { "epoch": 0.35046335299073295, "grad_norm": 5.670271873474121, "kl": 0.03541412204504013, "learning_rate": 9.55614245194068e-06, "logps/chosen": -11.491986674647178, "logps/rejected": -13.510441635594223, "loss": 0.5083, "rewards/chosen": 0.0023518490695184275, "rewards/margins": -0.003151654592648746, "rewards/rejected": 0.005503503662167173, "step": 52 }, { "epoch": 0.35720303285593935, "grad_norm": 0.394388347864151, "kl": 0.030839860439300537, "learning_rate": 9.496036074479184e-06, "logps/chosen": -12.102163380589978, "logps/rejected": -14.92567138671875, "loss": 0.5006, "rewards/chosen": 0.0048105048722234265, "rewards/margins": 0.0003774762594053894, "rewards/rejected": 0.004433028612818037, "step": 53 }, { "epoch": 0.36394271272114576, "grad_norm": 0.6340323090553284, "kl": 0.024696357548236847, "learning_rate": 9.432328436130493e-06, "logps/chosen": -12.709688186645508, "logps/rejected": -13.567533493041992, "loss": 0.5013, "rewards/chosen": 0.005041294265538454, "rewards/margins": 0.0015526015777140856, "rewards/rejected": 0.0034886926878243685, "step": 54 }, { "epoch": 0.37068239258635216, "grad_norm": 1.0442602634429932, "kl": 0.030310869216918945, "learning_rate": 9.365070565805941e-06, "logps/chosen": -11.8351806640625, "logps/rejected": -12.171940244477371, "loss": 0.5021, "rewards/chosen": 0.0029652584876332963, "rewards/margins": -0.0006688115264981841, "rewards/rejected": 0.0036340700141314804, "step": 55 }, { "epoch": 0.37742207245155857, "grad_norm": 0.41058778762817383, "kl": 0.03595775365829468, "learning_rate": 9.294316336102132e-06, "logps/chosen": -12.58010056439568, "logps/rejected": -16.123766072591145, "loss": 0.5008, "rewards/chosen": 0.00409250268164803, "rewards/margins": 0.00040192935688822913, "rewards/rejected": 0.003690573324759801, "step": 56 }, { "epoch": 0.38416175231676497, "grad_norm": 0.08168064057826996, "kl": 0.031137794256210327, "learning_rate": 9.220122420149753e-06, "logps/chosen": -12.088414510091146, "logps/rejected": -15.25335693359375, "loss": 0.5, "rewards/chosen": 0.005210169156392415, "rewards/margins": -0.0003060570826717455, "rewards/rejected": 0.005516226239064161, "step": 57 }, { "epoch": 0.3909014321819714, "grad_norm": 0.6683095693588257, "kl": 0.03323546051979065, "learning_rate": 9.142548246219212e-06, "logps/chosen": -12.333134831608954, "logps/rejected": -12.924123128255209, "loss": 0.5016, "rewards/chosen": 0.003185961697552655, "rewards/margins": -0.0001546228388408285, "rewards/rejected": 0.0033405845363934836, "step": 58 }, { "epoch": 0.3976411120471778, "grad_norm": 0.44280490279197693, "kl": 0.040286846458911896, "learning_rate": 9.06165595011943e-06, "logps/chosen": -11.647547040666852, "logps/rejected": -13.499364217122396, "loss": 0.5015, "rewards/chosen": 0.004377785005739757, "rewards/margins": -0.0010754091753846125, "rewards/rejected": 0.00545319418112437, "step": 59 }, { "epoch": 0.4043807919123842, "grad_norm": 0.253917396068573, "kl": 0.02998022735118866, "learning_rate": 8.97751032542795e-06, "logps/chosen": -10.375978168688322, "logps/rejected": -14.37469012920673, "loss": 0.4998, "rewards/chosen": 0.005374729241195478, "rewards/margins": 0.0037813447792882375, "rewards/rejected": 0.00159338446190724, "step": 60 }, { "epoch": 0.4111204717775906, "grad_norm": 0.22803018987178802, "kl": 0.0411323606967926, "learning_rate": 8.890178771592198e-06, "logps/chosen": -12.261529541015625, "logps/rejected": -14.262212416704964, "loss": 0.5007, "rewards/chosen": 0.0042099177837371824, "rewards/margins": -0.00034529444049386394, "rewards/rejected": 0.004555212224231046, "step": 61 }, { "epoch": 0.41786015164279694, "grad_norm": 0.6412160992622375, "kl": 0.03793022781610489, "learning_rate": 8.799731239943488e-06, "logps/chosen": -12.091997034409466, "logps/rejected": -15.869469197591146, "loss": 0.5016, "rewards/chosen": 0.003918992245898527, "rewards/margins": -4.28084064932506e-06, "rewards/rejected": 0.003923273086547852, "step": 62 }, { "epoch": 0.42459983150800334, "grad_norm": 0.8344623446464539, "kl": 0.04702252149581909, "learning_rate": 8.706240177667003e-06, "logps/chosen": -11.470931159125435, "logps/rejected": -14.442997523716517, "loss": 0.5058, "rewards/chosen": 0.0035239855448404946, "rewards/margins": -0.0012164868059612454, "rewards/rejected": 0.00474047235080174, "step": 63 }, { "epoch": 0.43133951137320975, "grad_norm": 0.8020732402801514, "kl": 0.039898380637168884, "learning_rate": 8.609780469772623e-06, "logps/chosen": -14.423152378627233, "logps/rejected": -14.408963290127842, "loss": 0.5021, "rewards/chosen": 0.004889163587774549, "rewards/margins": 0.0007666330542657283, "rewards/rejected": 0.004122530533508821, "step": 64 }, { "epoch": 0.43133951137320975, "eval_kl": 0.04057914763689041, "eval_logps/chosen": -12.08929306509249, "eval_logps/rejected": -16.12696370148917, "eval_loss": 0.5015159249305725, "eval_rewards/chosen": 0.005824030782075207, "eval_rewards/margins": 0.0021923597992461368, "eval_rewards/rejected": 0.00363167098282907, "eval_runtime": 117.006, "eval_samples_per_second": 4.273, "eval_steps_per_second": 1.068, "step": 64 }, { "epoch": 0.43807919123841615, "grad_norm": 0.4794953465461731, "kl": 0.06649504601955414, "learning_rate": 8.510429379113114e-06, "logps/chosen": -12.707015991210938, "logps/rejected": -14.950631534352022, "loss": 0.5008, "rewards/chosen": 0.004046256840229035, "rewards/margins": 0.0002097017624798944, "rewards/rejected": 0.00383655507774914, "step": 65 }, { "epoch": 0.44481887110362256, "grad_norm": 0.3255169689655304, "kl": 0.03476836532354355, "learning_rate": 8.408266484497664e-06, "logps/chosen": -12.403576190655048, "logps/rejected": -14.6284035130551, "loss": 0.4999, "rewards/chosen": 0.008390213434512798, "rewards/margins": 0.004481334163833725, "rewards/rejected": 0.0039088792706790726, "step": 66 }, { "epoch": 0.45155855096882896, "grad_norm": 0.5565071105957031, "kl": 0.036111876368522644, "learning_rate": 8.303373616950408e-06, "logps/chosen": -12.935804578993055, "logps/rejected": -15.007106236049108, "loss": 0.5002, "rewards/chosen": 0.004996686346001095, "rewards/margins": 0.0004498769366551962, "rewards/rejected": 0.004546809409345899, "step": 67 }, { "epoch": 0.45829823083403537, "grad_norm": 0.49582043290138245, "kl": 0.03154793381690979, "learning_rate": 8.195834794164925e-06, "logps/chosen": -12.002599225725447, "logps/rejected": -14.013414054081357, "loss": 0.501, "rewards/chosen": 0.00556163489818573, "rewards/margins": 0.000681957808034173, "rewards/rejected": 0.004879677090151557, "step": 68 }, { "epoch": 0.4650379106992418, "grad_norm": 0.5658519268035889, "kl": 0.029201030731201172, "learning_rate": 8.085736153207277e-06, "logps/chosen": -12.031069587258731, "logps/rejected": -14.41396484375, "loss": 0.5007, "rewards/chosen": 0.004736459868795732, "rewards/margins": -9.984157833398469e-06, "rewards/rejected": 0.00474644402662913, "step": 69 }, { "epoch": 0.4717775905644482, "grad_norm": 0.3203012943267822, "kl": 0.027451664209365845, "learning_rate": 7.973165881521435e-06, "logps/chosen": -11.853369140625, "logps/rejected": -13.254534491177264, "loss": 0.5003, "rewards/chosen": 0.005844617741448539, "rewards/margins": -0.0013138286732687736, "rewards/rejected": 0.007158446414717312, "step": 70 }, { "epoch": 0.4785172704296546, "grad_norm": 0.360921710729599, "kl": 0.03561514616012573, "learning_rate": 7.858214146292394e-06, "logps/chosen": -12.516302926199776, "logps/rejected": -14.650658501519096, "loss": 0.5001, "rewards/chosen": 0.005021435873849052, "rewards/margins": 0.001271195236652617, "rewards/rejected": 0.0037502406371964347, "step": 71 }, { "epoch": 0.485256950294861, "grad_norm": 0.6086096167564392, "kl": 0.04020478576421738, "learning_rate": 7.74097302222355e-06, "logps/chosen": -11.879875921433971, "logps/rejected": -15.616689970999053, "loss": 0.5002, "rewards/chosen": 0.004931771466808934, "rewards/margins": 0.0010645643153381719, "rewards/rejected": 0.0038672071514707623, "step": 72 }, { "epoch": 0.4919966301600674, "grad_norm": 0.19885578751564026, "kl": 0.026517115533351898, "learning_rate": 7.621536417786159e-06, "logps/chosen": -11.950851440429688, "logps/rejected": -13.586245368508731, "loss": 0.5004, "rewards/chosen": 0.004206822315851847, "rewards/margins": 0.0002687697317086011, "rewards/rejected": 0.003938052584143246, "step": 73 }, { "epoch": 0.4987363100252738, "grad_norm": 0.6865639090538025, "kl": 0.04328171908855438, "learning_rate": 7.500000000000001e-06, "logps/chosen": -10.888592311314174, "logps/rejected": -13.478512234157986, "loss": 0.5006, "rewards/chosen": 0.00372953899204731, "rewards/margins": -0.0011417826430665124, "rewards/rejected": 0.004871321635113822, "step": 74 }, { "epoch": 0.5054759898904801, "grad_norm": 0.9542965888977051, "kl": 0.03713301569223404, "learning_rate": 7.37646111780545e-06, "logps/chosen": -12.198631286621094, "logps/rejected": -14.544124603271484, "loss": 0.501, "rewards/chosen": 0.0044771101325750354, "rewards/margins": 0.0009494200969735783, "rewards/rejected": 0.003527690035601457, "step": 75 }, { "epoch": 0.5122156697556866, "grad_norm": 0.2528373599052429, "kl": 0.027506500482559204, "learning_rate": 7.251018724088367e-06, "logps/chosen": -13.080149332682291, "logps/rejected": -13.036513869826859, "loss": 0.5005, "rewards/chosen": 0.006512647425686872, "rewards/margins": 0.0014634541980735778, "rewards/rejected": 0.005049193227613294, "step": 76 }, { "epoch": 0.518955349620893, "grad_norm": 0.22596140205860138, "kl": 0.040385909378528595, "learning_rate": 7.12377329642024e-06, "logps/chosen": -11.61421907865084, "logps/rejected": -15.499625758120889, "loss": 0.4998, "rewards/chosen": 0.00586964304630573, "rewards/margins": 0.002287208219530129, "rewards/rejected": 0.003582434826775601, "step": 77 }, { "epoch": 0.5256950294860994, "grad_norm": 0.2265264242887497, "kl": 0.02709801495075226, "learning_rate": 6.994826756577082e-06, "logps/chosen": -10.6973180356233, "logps/rejected": -12.833906964557928, "loss": 0.5005, "rewards/chosen": 0.004355713725090027, "rewards/margins": 0.00035547337880948663, "rewards/rejected": 0.00400024034628054, "step": 78 }, { "epoch": 0.5324347093513058, "grad_norm": 0.49220699071884155, "kl": 0.026310235261917114, "learning_rate": 6.864282388901544e-06, "logps/chosen": -14.165910391971982, "logps/rejected": -19.099337332589286, "loss": 0.5, "rewards/chosen": 0.004868762246493636, "rewards/margins": 0.002894112944896586, "rewards/rejected": 0.00197464930159705, "step": 79 }, { "epoch": 0.5391743892165122, "grad_norm": 0.6800411343574524, "kl": 0.046130433678627014, "learning_rate": 6.732244757573619e-06, "logps/chosen": -12.125639073988971, "logps/rejected": -13.411381022135417, "loss": 0.5017, "rewards/chosen": 0.005388632416725159, "rewards/margins": 0.000751591225465139, "rewards/rejected": 0.00463704119126002, "step": 80 }, { "epoch": 0.5391743892165122, "eval_kl": 0.04340193793177605, "eval_logps/chosen": -12.083256332329036, "eval_logps/rejected": -16.12648952222473, "eval_loss": 0.501190721988678, "eval_rewards/chosen": 0.006427717315776466, "eval_rewards/margins": 0.0027484710674554836, "eval_rewards/rejected": 0.0036792462483209824, "eval_runtime": 117.1394, "eval_samples_per_second": 4.268, "eval_steps_per_second": 1.067, "step": 80 }, { "epoch": 0.5459140690817186, "grad_norm": 0.6403890252113342, "kl": 0.0381980761885643, "learning_rate": 6.598819622856227e-06, "logps/chosen": -12.278984656700722, "logps/rejected": -14.581547787314967, "loss": 0.5024, "rewards/chosen": 0.005194329871581151, "rewards/margins": 0.000755395906174231, "rewards/rejected": 0.00443893396540692, "step": 81 }, { "epoch": 0.552653748946925, "grad_norm": 0.3272389769554138, "kl": 0.029798954725265503, "learning_rate": 6.464113856382752e-06, "logps/chosen": -11.796954530658144, "logps/rejected": -14.982394310735888, "loss": 0.5005, "rewards/chosen": 0.005100182511589744, "rewards/margins": 0.000986845306287413, "rewards/rejected": 0.004113337205302331, "step": 82 }, { "epoch": 0.5593934288121314, "grad_norm": 0.549321711063385, "kl": 0.05309329181909561, "learning_rate": 6.328235355554382e-06, "logps/chosen": -12.00343715122768, "logps/rejected": -15.50714742726293, "loss": 0.5003, "rewards/chosen": 0.005881215419088091, "rewards/margins": 0.003232161487851824, "rewards/rejected": 0.002649053931236267, "step": 83 }, { "epoch": 0.5661331086773378, "grad_norm": 0.6684555411338806, "kl": 0.035583049058914185, "learning_rate": 6.191292957115825e-06, "logps/chosen": -12.240760294596354, "logps/rejected": -15.023827945484834, "loss": 0.5017, "rewards/chosen": 0.003915249804655711, "rewards/margins": -0.0005642145579936456, "rewards/rejected": 0.004479464362649357, "step": 84 }, { "epoch": 0.5728727885425442, "grad_norm": 1.0044106245040894, "kl": 0.04308219999074936, "learning_rate": 6.053396349978632e-06, "logps/chosen": -11.776771791519657, "logps/rejected": -12.308844364050662, "loss": 0.5021, "rewards/chosen": 0.004112924779615094, "rewards/margins": 0.000594291009394887, "rewards/rejected": 0.0035186337702202072, "step": 85 }, { "epoch": 0.5796124684077506, "grad_norm": 0.4596560299396515, "kl": 0.033609502017498016, "learning_rate": 5.914655987361934e-06, "logps/chosen": -11.814070268110795, "logps/rejected": -13.036851452242944, "loss": 0.5013, "rewards/chosen": 0.0055642552448041515, "rewards/margins": 0.002487225062919386, "rewards/rejected": 0.0030770301818847656, "step": 86 }, { "epoch": 0.586352148272957, "grad_norm": 0.3680490255355835, "kl": 0.037708625197410583, "learning_rate": 5.77518299832099e-06, "logps/chosen": -11.723768967848558, "logps/rejected": -13.26518410130551, "loss": 0.5004, "rewards/chosen": 0.004937492884122408, "rewards/margins": -0.00045244181566392837, "rewards/rejected": 0.005389934699786336, "step": 87 }, { "epoch": 0.5930918281381634, "grad_norm": 0.5304110050201416, "kl": 0.05044492334127426, "learning_rate": 5.635089098734394e-06, "logps/chosen": -13.223031997680664, "logps/rejected": -14.862555503845215, "loss": 0.5013, "rewards/chosen": 0.00417838990688324, "rewards/margins": 0.00042380671948194504, "rewards/rejected": 0.0037545831874012947, "step": 88 }, { "epoch": 0.5998315080033698, "grad_norm": 0.41980600357055664, "kl": 0.03524252772331238, "learning_rate": 5.49448650182125e-06, "logps/chosen": -12.427776439769849, "logps/rejected": -13.90366843894676, "loss": 0.4999, "rewards/chosen": 0.006115401918823655, "rewards/margins": 0.0020641214004508013, "rewards/rejected": 0.004051280518372853, "step": 89 }, { "epoch": 0.6065711878685762, "grad_norm": 0.5898346304893494, "kl": 0.031301796436309814, "learning_rate": 5.353487828259973e-06, "logps/chosen": -11.024669647216797, "logps/rejected": -21.888626098632812, "loss": 0.5011, "rewards/chosen": 0.00403784541413188, "rewards/margins": 0.006319438107311726, "rewards/rejected": -0.002281592693179846, "step": 90 }, { "epoch": 0.6133108677337826, "grad_norm": 0.35757502913475037, "kl": 0.03623806685209274, "learning_rate": 5.212206015980742e-06, "logps/chosen": -12.488767736098346, "logps/rejected": -15.077747599283855, "loss": 0.5011, "rewards/chosen": 0.004482402959290673, "rewards/margins": 0.0013234442063406403, "rewards/rejected": 0.0031589587529500325, "step": 91 }, { "epoch": 0.620050547598989, "grad_norm": 0.42492297291755676, "kl": 0.03657727688550949, "learning_rate": 5.070754229703811e-06, "logps/chosen": -12.937924194335938, "logps/rejected": -15.475282556870404, "loss": 0.5009, "rewards/chosen": 0.006935717165470123, "rewards/margins": 0.002569316239917979, "rewards/rejected": 0.004366400925552144, "step": 92 }, { "epoch": 0.6267902274641954, "grad_norm": 0.5660274028778076, "kl": 0.04550507664680481, "learning_rate": 4.929245770296191e-06, "logps/chosen": -11.731661478678385, "logps/rejected": -15.350058419363839, "loss": 0.5016, "rewards/chosen": 0.004625084913439221, "rewards/margins": 0.001465273725371512, "rewards/rejected": 0.0031598111880677088, "step": 93 }, { "epoch": 0.6335299073294018, "grad_norm": 0.6102348566055298, "kl": 0.02795557677745819, "learning_rate": 4.78779398401926e-06, "logps/chosen": -12.868092256433824, "logps/rejected": -15.27595723470052, "loss": 0.5015, "rewards/chosen": 0.00481748317970949, "rewards/margins": 0.0005204015502742691, "rewards/rejected": 0.004297081629435221, "step": 94 }, { "epoch": 0.6402695871946082, "grad_norm": 0.34281784296035767, "kl": 0.030268676578998566, "learning_rate": 4.646512171740028e-06, "logps/chosen": -12.013111943783967, "logps/rejected": -14.691796279535062, "loss": 0.5016, "rewards/chosen": 0.0030449654745019, "rewards/margins": -0.0003336467646844563, "rewards/rejected": 0.0033786122391863565, "step": 95 }, { "epoch": 0.6470092670598147, "grad_norm": 0.4762212932109833, "kl": 0.041235461831092834, "learning_rate": 4.505513498178752e-06, "logps/chosen": -12.398789760044643, "logps/rejected": -16.021841114964978, "loss": 0.5003, "rewards/chosen": 0.006167218514851161, "rewards/margins": 0.0024321594775603907, "rewards/rejected": 0.0037350590372907705, "step": 96 }, { "epoch": 0.6470092670598147, "eval_kl": 0.04306062310934067, "eval_logps/chosen": -12.08118387401906, "eval_logps/rejected": -16.131139651962997, "eval_loss": 0.5007099509239197, "eval_rewards/chosen": 0.006635084815089478, "eval_rewards/margins": 0.003420951039467933, "eval_rewards/rejected": 0.003214133775621545, "eval_runtime": 117.1734, "eval_samples_per_second": 4.267, "eval_steps_per_second": 1.067, "step": 96 }, { "epoch": 0.6537489469250211, "grad_norm": 0.6140352487564087, "kl": 0.024514369666576385, "learning_rate": 4.364910901265607e-06, "logps/chosen": -12.712422688802084, "logps/rejected": -15.036084856305804, "loss": 0.5012, "rewards/chosen": 0.005062463382879893, "rewards/margins": 0.0017749588226988206, "rewards/rejected": 0.0032875045601810727, "step": 97 }, { "epoch": 0.6604886267902275, "grad_norm": 1.0478788614273071, "kl": 0.0483347624540329, "learning_rate": 4.224817001679011e-06, "logps/chosen": -12.04401277240954, "logps/rejected": -15.022868229792667, "loss": 0.504, "rewards/chosen": 0.0031436879776026074, "rewards/margins": -0.0013222903284708012, "rewards/rejected": 0.004465978306073409, "step": 98 }, { "epoch": 0.6672283066554339, "grad_norm": 0.26045116782188416, "kl": 0.04508065804839134, "learning_rate": 4.085344012638067e-06, "logps/chosen": -12.453051017992424, "logps/rejected": -13.936600223664314, "loss": 0.4996, "rewards/chosen": 0.006089657545089722, "rewards/margins": 0.002818903375056482, "rewards/rejected": 0.0032707541700332395, "step": 99 }, { "epoch": 0.6739679865206403, "grad_norm": 0.9558663964271545, "kl": 0.04088562726974487, "learning_rate": 3.94660365002137e-06, "logps/chosen": -11.501955817727481, "logps/rejected": -14.433762613932291, "loss": 0.5039, "rewards/chosen": 0.004746326628853293, "rewards/margins": 0.0011829081411455188, "rewards/rejected": 0.003563418487707774, "step": 100 }, { "epoch": 0.6807076663858467, "grad_norm": 0.5541660189628601, "kl": 0.0344005823135376, "learning_rate": 3.808707042884176e-06, "logps/chosen": -13.376131119266633, "logps/rejected": -26.542641379616477, "loss": 0.5008, "rewards/chosen": 0.0056792925442418745, "rewards/margins": 0.003930307243780423, "rewards/rejected": 0.0017489853004614513, "step": 101 }, { "epoch": 0.6874473462510531, "grad_norm": 0.3918946087360382, "kl": 0.038804955780506134, "learning_rate": 3.6717646444456196e-06, "logps/chosen": -12.274094154094827, "logps/rejected": -14.250191824776786, "loss": 0.5003, "rewards/chosen": 0.005658124541414195, "rewards/margins": 0.0010038192195845359, "rewards/rejected": 0.004654305321829659, "step": 102 }, { "epoch": 0.6941870261162595, "grad_norm": 0.45218226313591003, "kl": 0.048004306852817535, "learning_rate": 3.5358861436172487e-06, "logps/chosen": -12.376973152160645, "logps/rejected": -15.0717134475708, "loss": 0.5002, "rewards/chosen": 0.007525511085987091, "rewards/margins": 0.00302144605666399, "rewards/rejected": 0.004504065029323101, "step": 103 }, { "epoch": 0.7009267059814659, "grad_norm": 0.4819565713405609, "kl": 0.024100862443447113, "learning_rate": 3.401180377143774e-06, "logps/chosen": -11.111696079799108, "logps/rejected": -14.007704109981143, "loss": 0.5003, "rewards/chosen": 0.004905121667044503, "rewards/margins": 2.896227860098386e-05, "rewards/rejected": 0.004876159388443519, "step": 104 }, { "epoch": 0.7076663858466723, "grad_norm": 0.4155801236629486, "kl": 0.03822548687458038, "learning_rate": 3.2677552424263836e-06, "logps/chosen": -11.696998333108835, "logps/rejected": -13.534483119419642, "loss": 0.5012, "rewards/chosen": 0.004182511876369345, "rewards/margins": -0.00044645423372390806, "rewards/rejected": 0.004628966110093253, "step": 105 }, { "epoch": 0.7144060657118787, "grad_norm": 1.0962884426116943, "kl": 0.038600608706474304, "learning_rate": 3.1357176110984578e-06, "logps/chosen": -11.945585250854492, "logps/rejected": -15.512099266052246, "loss": 0.5007, "rewards/chosen": 0.005235604010522366, "rewards/margins": 0.0009247651323676109, "rewards/rejected": 0.004310838878154755, "step": 106 }, { "epoch": 0.7211457455770851, "grad_norm": 0.39281150698661804, "kl": 0.03581796586513519, "learning_rate": 3.0051732434229185e-06, "logps/chosen": -12.050497519003379, "logps/rejected": -14.399078369140625, "loss": 0.5003, "rewards/chosen": 0.006019020000019589, "rewards/margins": 0.0028386163237216597, "rewards/rejected": 0.0031804036762979296, "step": 107 }, { "epoch": 0.7278854254422915, "grad_norm": 0.44166335463523865, "kl": 0.030040442943572998, "learning_rate": 2.8762267035797607e-06, "logps/chosen": -12.115191650390624, "logps/rejected": -27.07490270278033, "loss": 0.4988, "rewards/chosen": 0.005298165480295817, "rewards/margins": 0.013872249804291071, "rewards/rejected": -0.008574084323995253, "step": 108 }, { "epoch": 0.7346251053074979, "grad_norm": 0.05213148146867752, "kl": 0.033815205097198486, "learning_rate": 2.748981275911633e-06, "logps/chosen": -11.917203630719866, "logps/rejected": -14.394385443793404, "loss": 0.4998, "rewards/chosen": 0.006377638982875007, "rewards/margins": 0.0014188214545212095, "rewards/rejected": 0.004958817528353797, "step": 109 }, { "epoch": 0.7413647851727043, "grad_norm": 0.5125302672386169, "kl": 0.028670266270637512, "learning_rate": 2.6235388821945497e-06, "logps/chosen": -13.117928466796876, "logps/rejected": -13.176620092147436, "loss": 0.5013, "rewards/chosen": 0.0036279964447021485, "rewards/margins": -0.0016446411303984811, "rewards/rejected": 0.00527263757510063, "step": 110 }, { "epoch": 0.7481044650379107, "grad_norm": 0.46608638763427734, "kl": 0.027143821120262146, "learning_rate": 2.5000000000000015e-06, "logps/chosen": -12.09948812948691, "logps/rejected": -14.054656982421875, "loss": 0.5006, "rewards/chosen": 0.004925508354161237, "rewards/margins": 0.0006050687130387724, "rewards/rejected": 0.004320439641122465, "step": 111 }, { "epoch": 0.7548441449031171, "grad_norm": 0.05120411515235901, "kl": 0.04401693493127823, "learning_rate": 2.3784635822138424e-06, "logps/chosen": -11.725931475239415, "logps/rejected": -14.287259188565342, "loss": 0.4996, "rewards/chosen": 0.006879215278933125, "rewards/margins": 0.0028178333536970296, "rewards/rejected": 0.004061381925236095, "step": 112 }, { "epoch": 0.7548441449031171, "eval_kl": 0.04366951808333397, "eval_logps/chosen": -12.08456680913677, "eval_logps/rejected": -16.13528211022112, "eval_loss": 0.5012484788894653, "eval_rewards/chosen": 0.006296707375701768, "eval_rewards/margins": 0.0034967447587021107, "eval_rewards/rejected": 0.002799962616999657, "eval_runtime": 116.9002, "eval_samples_per_second": 4.277, "eval_steps_per_second": 1.069, "step": 112 }, { "epoch": 0.7615838247683235, "grad_norm": 0.8294048309326172, "kl": 0.033523499965667725, "learning_rate": 2.2590269777764516e-06, "logps/chosen": -12.994817461286273, "logps/rejected": -20.967634412977432, "loss": 0.5014, "rewards/chosen": 0.004394657643777984, "rewards/margins": 0.007389767864157283, "rewards/rejected": -0.0029951102203792995, "step": 113 }, { "epoch": 0.7683235046335299, "grad_norm": 0.6650245785713196, "kl": 0.044409215450286865, "learning_rate": 2.141785853707607e-06, "logps/chosen": -13.633831317608173, "logps/rejected": -14.064775390625, "loss": 0.5009, "rewards/chosen": 0.00470560521651537, "rewards/margins": 0.0020437052922371106, "rewards/rejected": 0.002661899924278259, "step": 114 }, { "epoch": 0.7750631844987363, "grad_norm": 1.0355861186981201, "kl": 0.03707036375999451, "learning_rate": 2.0268341184785674e-06, "logps/chosen": -12.006687837488512, "logps/rejected": -12.987567138671874, "loss": 0.5002, "rewards/chosen": 0.005552893614067751, "rewards/margins": 0.001937851295167325, "rewards/rejected": 0.003615042318900426, "step": 115 }, { "epoch": 0.7818028643639428, "grad_norm": 0.3267318606376648, "kl": 0.03743256628513336, "learning_rate": 1.9142638467927254e-06, "logps/chosen": -12.60701437557445, "logps/rejected": -14.109915161132813, "loss": 0.4998, "rewards/chosen": 0.005473397233906914, "rewards/margins": 0.001956190723998874, "rewards/rejected": 0.0035172065099080404, "step": 116 }, { "epoch": 0.7885425442291492, "grad_norm": 1.15928316116333, "kl": 0.035497263073921204, "learning_rate": 1.8041652058350768e-06, "logps/chosen": -12.61219596862793, "logps/rejected": -14.834460258483887, "loss": 0.5042, "rewards/chosen": 0.004248159006237984, "rewards/margins": -0.0009399871341884136, "rewards/rejected": 0.005188146140426397, "step": 117 }, { "epoch": 0.7952822240943556, "grad_norm": 0.9165633916854858, "kl": 0.0436873733997345, "learning_rate": 1.6966263830495939e-06, "logps/chosen": -11.850751989028034, "logps/rejected": -21.343756103515624, "loss": 0.5008, "rewards/chosen": 0.00493351457750096, "rewards/margins": 0.00906530259286656, "rewards/rejected": -0.0041317880153656, "step": 118 }, { "epoch": 0.802021903959562, "grad_norm": 0.3139224648475647, "kl": 0.041897207498550415, "learning_rate": 1.5917335155023368e-06, "logps/chosen": -11.65659688313802, "logps/rejected": -15.448626349954043, "loss": 0.5, "rewards/chosen": 0.005463708937168121, "rewards/margins": 0.002067950367927551, "rewards/rejected": 0.00339575856924057, "step": 119 }, { "epoch": 0.8087615838247684, "grad_norm": 0.1552770733833313, "kl": 0.03437415510416031, "learning_rate": 1.4895706208868876e-06, "logps/chosen": -11.553650684845753, "logps/rejected": -13.126873779296876, "loss": 0.4998, "rewards/chosen": 0.0057815661032994585, "rewards/margins": 0.0021546780069669085, "rewards/rejected": 0.00362688809633255, "step": 120 }, { "epoch": 0.8155012636899748, "grad_norm": 0.5283660292625427, "kl": 0.038322046399116516, "learning_rate": 1.390219530227378e-06, "logps/chosen": -12.621611595153809, "logps/rejected": -14.466289520263672, "loss": 0.5009, "rewards/chosen": 0.005145063623785973, "rewards/margins": 0.0015712629538029432, "rewards/rejected": 0.0035738006699830294, "step": 121 }, { "epoch": 0.8222409435551812, "grad_norm": 0.976498007774353, "kl": 0.03444386273622513, "learning_rate": 1.2937598223330006e-06, "logps/chosen": -12.92395662006579, "logps/rejected": -22.93274864783654, "loss": 0.5004, "rewards/chosen": 0.005029119943317614, "rewards/margins": 0.01761611934132904, "rewards/rejected": -0.012586999398011427, "step": 122 }, { "epoch": 0.8289806234203876, "grad_norm": 0.7180256843566895, "kl": 0.03909187763929367, "learning_rate": 1.2002687600565138e-06, "logps/chosen": -13.27630615234375, "logps/rejected": -14.22623291015625, "loss": 0.5006, "rewards/chosen": 0.005644300842986387, "rewards/margins": 0.002153261268840116, "rewards/rejected": 0.003491039574146271, "step": 123 }, { "epoch": 0.8357203032855939, "grad_norm": 0.6930535435676575, "kl": 0.04802556335926056, "learning_rate": 1.1098212284078037e-06, "logps/chosen": -12.030011407260236, "logps/rejected": -19.90482352120536, "loss": 0.5013, "rewards/chosen": 0.004783567683450107, "rewards/margins": -0.001280770234286491, "rewards/rejected": 0.006064337917736598, "step": 124 }, { "epoch": 0.8424599831508003, "grad_norm": 0.26308876276016235, "kl": 0.034075237810611725, "learning_rate": 1.0224896745720513e-06, "logps/chosen": -12.563464749243952, "logps/rejected": -13.066956491181344, "loss": 0.4999, "rewards/chosen": 0.006204346975972576, "rewards/margins": 0.0027266234407093987, "rewards/rejected": 0.003477723535263177, "step": 125 }, { "epoch": 0.8491996630160067, "grad_norm": 0.05575637146830559, "kl": 0.03818386048078537, "learning_rate": 9.383440498805712e-07, "logps/chosen": -12.770438561072716, "logps/rejected": -13.97016023334704, "loss": 0.4997, "rewards/chosen": 0.005511786502141219, "rewards/margins": 0.0017519345288334585, "rewards/rejected": 0.00375985197330776, "step": 126 }, { "epoch": 0.8559393428812131, "grad_norm": 0.051352065056562424, "kl": 0.03995239734649658, "learning_rate": 8.574517537807897e-07, "logps/chosen": -10.37672831217448, "logps/rejected": -15.862617043887868, "loss": 0.4999, "rewards/chosen": 0.005997484922409058, "rewards/margins": 0.0014087323756778944, "rewards/rejected": 0.004588752546731164, "step": 127 }, { "epoch": 0.8626790227464195, "grad_norm": 3.6663429737091064, "kl": 0.02988211065530777, "learning_rate": 7.798775798502484e-07, "logps/chosen": -12.772437201605904, "logps/rejected": -19.50521438186233, "loss": 0.5077, "rewards/chosen": 0.003772258758544922, "rewards/margins": 0.0015482741433220939, "rewards/rejected": 0.002223984615222828, "step": 128 }, { "epoch": 0.8626790227464195, "eval_kl": 0.043279923498630524, "eval_logps/chosen": -12.084393830577355, "eval_logps/rejected": -16.137369204083935, "eval_loss": 0.5004793405532837, "eval_rewards/chosen": 0.006313999672107098, "eval_rewards/margins": 0.003722760686253593, "eval_rewards/rejected": 0.002591238985853505, "eval_runtime": 117.1667, "eval_samples_per_second": 4.267, "eval_steps_per_second": 1.067, "step": 128 }, { "epoch": 0.8694187026116259, "grad_norm": 0.5401572585105896, "kl": 0.032129742205142975, "learning_rate": 7.056836638978698e-07, "logps/chosen": -13.479878425598145, "logps/rejected": -13.713187217712402, "loss": 0.5014, "rewards/chosen": 0.004872842226177454, "rewards/margins": 0.0015969944652169943, "rewards/rejected": 0.0032758477609604597, "step": 129 }, { "epoch": 0.8761583824768323, "grad_norm": 0.4993322193622589, "kl": 0.02726106345653534, "learning_rate": 6.349294341940593e-07, "logps/chosen": -11.89748062626008, "logps/rejected": -14.18370194868608, "loss": 0.5008, "rewards/chosen": 0.005981699112922915, "rewards/margins": 0.0013335568726587158, "rewards/rejected": 0.004648142240264199, "step": 130 }, { "epoch": 0.8828980623420387, "grad_norm": 0.9090181589126587, "kl": 0.04542076587677002, "learning_rate": 5.676715638695063e-07, "logps/chosen": -11.69959716796875, "logps/rejected": -14.096754402949893, "loss": 0.5006, "rewards/chosen": 0.005059713976723807, "rewards/margins": 0.0012234999481680357, "rewards/rejected": 0.0038362140285557716, "step": 131 }, { "epoch": 0.8896377422072451, "grad_norm": 0.5459097027778625, "kl": 0.04598844051361084, "learning_rate": 5.039639255208156e-07, "logps/chosen": -10.5826806640625, "logps/rejected": -14.339314778645834, "loss": 0.5004, "rewards/chosen": 0.005033199787139893, "rewards/margins": 0.0015520608119475535, "rewards/rejected": 0.003481138975192339, "step": 132 }, { "epoch": 0.8963774220724515, "grad_norm": 0.36847659945487976, "kl": 0.04135167598724365, "learning_rate": 4.43857548059321e-07, "logps/chosen": -12.215231759207589, "logps/rejected": -14.881442365975216, "loss": 0.5009, "rewards/chosen": 0.005727479713303702, "rewards/margins": -0.008206131613900509, "rewards/rejected": 0.01393361132720421, "step": 133 }, { "epoch": 0.9031171019376579, "grad_norm": 0.46339669823646545, "kl": 0.040303200483322144, "learning_rate": 3.87400575837657e-07, "logps/chosen": -12.203319549560547, "logps/rejected": -15.03320026397705, "loss": 0.5005, "rewards/chosen": 0.004549141973257065, "rewards/margins": -2.8897076845169067e-05, "rewards/rejected": 0.004578039050102234, "step": 134 }, { "epoch": 0.9098567818028643, "grad_norm": 0.5133540034294128, "kl": 0.04136868566274643, "learning_rate": 3.346382300868134e-07, "logps/chosen": -10.265506320529514, "logps/rejected": -15.82147216796875, "loss": 0.501, "rewards/chosen": 0.005173130167855157, "rewards/margins": 0.001688669617469604, "rewards/rejected": 0.0034844605503855527, "step": 135 }, { "epoch": 0.9165964616680707, "grad_norm": 0.220728799700737, "kl": 0.04973362386226654, "learning_rate": 2.85612772694579e-07, "logps/chosen": -12.548672380118534, "logps/rejected": -15.221932547433036, "loss": 0.5, "rewards/chosen": 0.005709037184715271, "rewards/margins": 0.001126085434641157, "rewards/rejected": 0.004582951750074114, "step": 136 }, { "epoch": 0.9233361415332771, "grad_norm": 0.601239800453186, "kl": 0.042025819420814514, "learning_rate": 2.403634723543674e-07, "logps/chosen": -12.426138136121962, "logps/rejected": -14.724080766950335, "loss": 0.5028, "rewards/chosen": 0.005159664071268505, "rewards/margins": -0.003907858556698239, "rewards/rejected": 0.009067522627966744, "step": 137 }, { "epoch": 0.9300758213984835, "grad_norm": 0.35333240032196045, "kl": 0.042685166001319885, "learning_rate": 1.989265731115525e-07, "logps/chosen": -12.459028089368665, "logps/rejected": -15.966356065538195, "loss": 0.5002, "rewards/chosen": 0.005651528368125091, "rewards/margins": 0.0006864719443373732, "rewards/rejected": 0.004965056423787717, "step": 138 }, { "epoch": 0.93681550126369, "grad_norm": 0.5635568499565125, "kl": 0.03784912824630737, "learning_rate": 1.6133526533250566e-07, "logps/chosen": -13.104137073863637, "logps/rejected": -27.47379426033266, "loss": 0.4988, "rewards/chosen": 0.004985654895955866, "rewards/margins": 0.015799503522184937, "rewards/rejected": -0.010813848626229071, "step": 139 }, { "epoch": 0.9435551811288964, "grad_norm": 0.7115698456764221, "kl": 0.03980047255754471, "learning_rate": 1.2761965911958385e-07, "logps/chosen": -11.41650390625, "logps/rejected": -14.686680385044642, "loss": 0.5017, "rewards/chosen": 0.005472733184348705, "rewards/margins": 0.0009131208392208733, "rewards/rejected": 0.004559612345127832, "step": 140 }, { "epoch": 0.9502948609941028, "grad_norm": 0.1111406609416008, "kl": 0.048449933528900146, "learning_rate": 9.780676019336632e-08, "logps/chosen": -12.19880845811632, "logps/rejected": -14.06743724926098, "loss": 0.4998, "rewards/chosen": 0.006176349189546373, "rewards/margins": 0.0013241183023910974, "rewards/rejected": 0.004852230887155275, "step": 141 }, { "epoch": 0.9570345408593092, "grad_norm": 0.7242898941040039, "kl": 0.0280449241399765, "learning_rate": 7.192044826145772e-08, "logps/chosen": -12.05575180053711, "logps/rejected": -20.671449661254883, "loss": 0.4999, "rewards/chosen": 0.0058912248350679874, "rewards/margins": 0.011503569316118956, "rewards/rejected": -0.005612344481050968, "step": 142 }, { "epoch": 0.9637742207245156, "grad_norm": 0.7172530889511108, "kl": 0.0630464106798172, "learning_rate": 4.998145789118114e-08, "logps/chosen": -11.753687241498161, "logps/rejected": -16.088516235351562, "loss": 0.5005, "rewards/chosen": 0.004976525026209214, "rewards/margins": -0.0003784421904414305, "rewards/rejected": 0.005354967216650645, "step": 143 }, { "epoch": 0.970513900589722, "grad_norm": 0.5185176134109497, "kl": 0.031040333211421967, "learning_rate": 3.2007361901485455e-08, "logps/chosen": -12.094314575195312, "logps/rejected": -13.739120483398438, "loss": 0.5012, "rewards/chosen": 0.006150585495763355, "rewards/margins": 0.001710880577327713, "rewards/rejected": 0.004439704918435642, "step": 144 }, { "epoch": 0.970513900589722, "eval_kl": 0.04314365237951279, "eval_logps/chosen": -12.083704105941704, "eval_logps/rejected": -16.140120854016246, "eval_loss": 0.5004004240036011, "eval_rewards/chosen": 0.006382944337990252, "eval_rewards/margins": 0.004066789610123318, "eval_rewards/rejected": 0.0023161547278669338, "eval_runtime": 116.8675, "eval_samples_per_second": 4.278, "eval_steps_per_second": 1.07, "step": 144 }, { "epoch": 0.9772535804549284, "grad_norm": 0.25673621892929077, "kl": 0.0404113307595253, "learning_rate": 1.8012557287367394e-08, "logps/chosen": -12.68321533203125, "logps/rejected": -21.220464369829962, "loss": 0.4988, "rewards/chosen": 0.005415428181489309, "rewards/margins": 0.009387091211244172, "rewards/rejected": -0.003971663029754863, "step": 145 }, { "epoch": 0.9839932603201348, "grad_norm": 0.4361318349838257, "kl": 0.031056255102157593, "learning_rate": 8.008253688084888e-09, "logps/chosen": -12.425528861380911, "logps/rejected": -15.46146873191551, "loss": 0.5013, "rewards/chosen": 0.0055796177806081, "rewards/margins": 0.00028554923660881713, "rewards/rejected": 0.005294068543999283, "step": 146 }, { "epoch": 0.9907329401853412, "grad_norm": 0.9861050844192505, "kl": 0.03052590787410736, "learning_rate": 2.002464408392135e-09, "logps/chosen": -12.423692272555444, "logps/rejected": -12.56340350526752, "loss": 0.5021, "rewards/chosen": 0.0039527707042232635, "rewards/margins": -0.0018114563176009667, "rewards/rejected": 0.00576422702182423, "step": 147 }, { "epoch": 0.9974726200505476, "grad_norm": 0.8566290736198425, "kl": 0.03132675588130951, "learning_rate": 0.0, "logps/chosen": -12.614923292590726, "logps/rejected": -14.566915801077178, "loss": 0.5016, "rewards/chosen": 0.00556570339587427, "rewards/margins": 0.0018311565412337718, "rewards/rejected": 0.003734546854640498, "step": 148 }, { "epoch": 0.9974726200505476, "step": 148, "total_flos": 2.1537474524636774e+17, "train_loss": 0.5015917984214989, "train_runtime": 4835.0561, "train_samples_per_second": 1.964, "train_steps_per_second": 0.031 } ], "logging_steps": 1, "max_steps": 148, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1537474524636774e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }