{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7493643784290112, "eval_steps": 500, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 189.7083396911621, "epoch": 0.0005352602703064365, "grad_norm": 2.15625, "kl": 0.0, "learning_rate": 2.6737967914438503e-08, "loss": 0.0, "reward": 1.494916707277298, "reward_std": 0.7261392325162888, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.20833334140479565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2032500095665455, "step": 1 }, { "completion_length": 170.7916717529297, "epoch": 0.001070520540612873, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5.3475935828877005e-08, "loss": 0.0, "reward": 1.3800000250339508, "reward_std": 1.1691229492425919, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.14583333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15083333975053392, "step": 2 }, { "completion_length": 134.20833587646484, "epoch": 0.0016057808109193096, "grad_norm": 4.65625, "kl": 0.0004565252238535322, "learning_rate": 8.021390374331552e-08, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.9313502460718155, "rewards/correctness_reward_func": 1.5833334028720856, "rewards/int_reward_func": 0.02083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06250000046566129, "step": 3 }, { "completion_length": 165.6666717529297, "epoch": 0.002141041081225746, "grad_norm": 5.5625, "kl": 0.0006007923657307401, "learning_rate": 1.0695187165775401e-07, "loss": 0.0, "reward": 0.8437083810567856, "reward_std": 0.7530571222305298, "rewards/correctness_reward_func": 0.583333358168602, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1770416758954525, "step": 4 }, { "completion_length": 177.95833587646484, "epoch": 0.0026763013515321826, "grad_norm": 8.3125, "kl": 0.0006772031701984815, "learning_rate": 1.3368983957219251e-07, "loss": 0.0, "reward": 1.058833360671997, "reward_std": 0.9749192595481873, "rewards/correctness_reward_func": 0.666666679084301, "rewards/int_reward_func": 0.25000000931322575, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14216666715219617, "step": 5 }, { "completion_length": 136.00000381469727, "epoch": 0.003211561621838619, "grad_norm": 9.0625, "kl": 0.0006230503495316952, "learning_rate": 1.6042780748663104e-07, "loss": 0.0, "reward": 1.7279167473316193, "reward_std": 0.7088067084550858, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.2291666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.14458333980292082, "step": 6 }, { "completion_length": 136.00000381469727, "epoch": 0.0037468218921450553, "grad_norm": 9.5, "kl": 0.00030847315065329894, "learning_rate": 1.8716577540106952e-07, "loss": 0.0, "reward": 1.7916666865348816, "reward_std": 1.0116209387779236, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.1041666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.25000000558793545, "step": 7 }, { "completion_length": 206.1666717529297, "epoch": 0.004282082162451492, "grad_norm": 13.125, "kl": 0.00030440252157859504, "learning_rate": 2.1390374331550802e-07, "loss": 0.0, "reward": 1.0468750447034836, "reward_std": 0.6654687821865082, "rewards/correctness_reward_func": 0.8333333730697632, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1302083395421505, "step": 8 }, { "completion_length": 155.33333587646484, "epoch": 0.004817342432757929, "grad_norm": 6.75, "kl": 0.00041832496208371595, "learning_rate": 2.4064171122994655e-07, "loss": 0.0, "reward": 1.9687500596046448, "reward_std": 1.100151926279068, "rewards/correctness_reward_func": 1.5833334028720856, "rewards/int_reward_func": 0.18750000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.15625000279396772, "step": 9 }, { "completion_length": 102.37500286102295, "epoch": 0.005352602703064365, "grad_norm": 8.3125, "kl": 0.0007533838943345472, "learning_rate": 2.6737967914438503e-07, "loss": 0.0, "reward": 1.5795000493526459, "reward_std": 1.121803194284439, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.14583333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10033333860337734, "step": 10 }, { "completion_length": 204.83333587646484, "epoch": 0.005887862973370801, "grad_norm": 11.25, "kl": 0.00043151845966349356, "learning_rate": 2.9411764705882356e-07, "loss": 0.0, "reward": 0.8431666977703571, "reward_std": 0.6774237751960754, "rewards/correctness_reward_func": 0.416666679084301, "rewards/int_reward_func": 0.1250000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.25983333960175514, "step": 11 }, { "completion_length": 187.79166793823242, "epoch": 0.006423123243677238, "grad_norm": 9.375, "kl": 0.0004325892587075941, "learning_rate": 3.208556149732621e-07, "loss": 0.0, "reward": 1.3333333358168602, "reward_std": 0.9049101024866104, "rewards/correctness_reward_func": 1.166666679084301, "rewards/int_reward_func": 0.06250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 12 }, { "completion_length": 132.83333778381348, "epoch": 0.0069583835139836745, "grad_norm": 3.46875, "kl": 0.00037954464642098173, "learning_rate": 3.4759358288770056e-07, "loss": 0.0, "reward": 1.2552083632908762, "reward_std": 0.6861025653779507, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.08333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08854166744276881, "step": 13 }, { "completion_length": 157.37500381469727, "epoch": 0.007493643784290111, "grad_norm": 11.1875, "kl": 0.0005396545675466768, "learning_rate": 3.7433155080213904e-07, "loss": 0.0, "reward": 1.3489583544433117, "reward_std": 0.839453861117363, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.1041666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.1406250037252903, "step": 14 }, { "completion_length": 130.66667079925537, "epoch": 0.008028904054596548, "grad_norm": 4.71875, "kl": 0.000455528381280601, "learning_rate": 4.0106951871657757e-07, "loss": 0.0, "reward": 0.9166666772216558, "reward_std": 0.5038535855710506, "rewards/correctness_reward_func": 0.7500000298023224, "rewards/int_reward_func": 0.0416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1250000037252903, "step": 15 }, { "completion_length": 154.45833587646484, "epoch": 0.008564164324902984, "grad_norm": 27.125, "kl": 0.0005314001718943473, "learning_rate": 4.2780748663101604e-07, "loss": 0.0, "reward": 1.1155416816473007, "reward_std": 0.8059945106506348, "rewards/correctness_reward_func": 0.9166667014360428, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13637500535696745, "step": 16 }, { "completion_length": 215.0416774749756, "epoch": 0.00909942459520942, "grad_norm": 6.28125, "kl": 0.0004003984504379332, "learning_rate": 4.5454545454545457e-07, "loss": 0.0, "reward": 1.2968750596046448, "reward_std": 1.074883759021759, "rewards/correctness_reward_func": 0.833333358168602, "rewards/int_reward_func": 0.1666666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2968750037252903, "step": 17 }, { "completion_length": 166.3333396911621, "epoch": 0.009634684865515858, "grad_norm": 6.1875, "kl": 0.0005555601237574592, "learning_rate": 4.812834224598931e-07, "loss": 0.0, "reward": 1.2804166674613953, "reward_std": 0.611274242401123, "rewards/correctness_reward_func": 0.9166666716337204, "rewards/int_reward_func": 0.06250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.28041666746139526, "step": 18 }, { "completion_length": 174.0833396911621, "epoch": 0.010169945135822294, "grad_norm": 10.4375, "kl": 0.000547043266124092, "learning_rate": 5.080213903743316e-07, "loss": 0.0, "reward": 1.40583336353302, "reward_std": 1.315255880355835, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.1666666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1558333386783488, "step": 19 }, { "completion_length": 122.12500286102295, "epoch": 0.01070520540612873, "grad_norm": 4.65625, "kl": 0.0004533547835308127, "learning_rate": 5.347593582887701e-07, "loss": 0.0, "reward": 1.7031250298023224, "reward_std": 0.8821883350610733, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.0416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.22395833861082792, "step": 20 }, { "completion_length": 169.2500057220459, "epoch": 0.011240465676435166, "grad_norm": 3.34375, "kl": 0.0004296539118513465, "learning_rate": 5.614973262032086e-07, "loss": 0.0, "reward": 1.5156250298023224, "reward_std": 1.2371932864189148, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.1875000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2447916716337204, "step": 21 }, { "completion_length": 138.66666793823242, "epoch": 0.011775725946741603, "grad_norm": 7.4375, "kl": 0.000820733854197897, "learning_rate": 5.882352941176471e-07, "loss": 0.0, "reward": 1.129666696768254, "reward_std": 0.876496072858572, "rewards/correctness_reward_func": 1.0000000298023224, "rewards/int_reward_func": 0.0416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0880000009201467, "step": 22 }, { "completion_length": 175.9583396911621, "epoch": 0.01231098621704804, "grad_norm": 2.484375, "kl": 0.0003364777185197454, "learning_rate": 6.149732620320856e-07, "loss": 0.0, "reward": 1.3780416967347264, "reward_std": 0.7526774629950523, "rewards/correctness_reward_func": 0.9166666716337204, "rewards/int_reward_func": 0.2083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25304167438298464, "step": 23 }, { "completion_length": 144.41667079925537, "epoch": 0.012846246487354477, "grad_norm": 22.125, "kl": 0.0017973248832277022, "learning_rate": 6.417112299465242e-07, "loss": 0.0001, "reward": 0.8507083356380463, "reward_std": 0.7706352546811104, "rewards/correctness_reward_func": 0.583333358168602, "rewards/int_reward_func": 0.12500000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1423750054091215, "step": 24 }, { "completion_length": 148.9166717529297, "epoch": 0.013381506757660913, "grad_norm": 11.3125, "kl": 0.000723773060599342, "learning_rate": 6.684491978609627e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 1.0291605293750763, "rewards/correctness_reward_func": 1.2500000447034836, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1250000037252903, "step": 25 }, { "completion_length": 136.00000476837158, "epoch": 0.013916767027967349, "grad_norm": 4.59375, "kl": 0.000691680412273854, "learning_rate": 6.951871657754011e-07, "loss": 0.0, "reward": 0.6868333332240582, "reward_std": 0.8856478333473206, "rewards/correctness_reward_func": 0.5000000074505806, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12433334020897746, "step": 26 }, { "completion_length": 249.1666717529297, "epoch": 0.014452027298273785, "grad_norm": 3.53125, "kl": 0.0003578776722861221, "learning_rate": 7.219251336898397e-07, "loss": 0.0, "reward": 1.854166716337204, "reward_std": 0.9099880866706371, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.2708333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3333333386108279, "step": 27 }, { "completion_length": 184.70834350585938, "epoch": 0.014987287568580221, "grad_norm": 6.625, "kl": 0.00038619608676526695, "learning_rate": 7.486631016042781e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.82208052277565, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.1250000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2916666753590107, "step": 28 }, { "completion_length": 206.87500381469727, "epoch": 0.01552254783888666, "grad_norm": 2.53125, "kl": 0.00043380017814342864, "learning_rate": 7.754010695187167e-07, "loss": 0.0, "reward": 2.182291716337204, "reward_std": 0.7352110594511032, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.24479166977107525, "step": 29 }, { "completion_length": 228.37500381469727, "epoch": 0.016057808109193095, "grad_norm": 5.625, "kl": 0.0007612094195792452, "learning_rate": 8.021390374331551e-07, "loss": 0.0, "reward": 1.8489584177732468, "reward_std": 0.6690051779150963, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.3333333469927311, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3489583469927311, "step": 30 }, { "completion_length": 204.12500381469727, "epoch": 0.01659306837949953, "grad_norm": 10.875, "kl": 0.0005931528867222369, "learning_rate": 8.288770053475937e-07, "loss": 0.0, "reward": 1.2760416865348816, "reward_std": 0.7161198072135448, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.06250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13020833488553762, "step": 31 }, { "completion_length": 148.5833396911621, "epoch": 0.017128328649805968, "grad_norm": 12.8125, "kl": 0.0014113030629232526, "learning_rate": 8.556149732620321e-07, "loss": 0.0001, "reward": 1.6899999976158142, "reward_std": 0.8904432207345963, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.1875000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25250000692903996, "step": 32 }, { "completion_length": 147.58333778381348, "epoch": 0.017663588920112404, "grad_norm": 13.75, "kl": 0.0011093771463492885, "learning_rate": 8.823529411764707e-07, "loss": 0.0, "reward": 1.3604583442211151, "reward_std": 0.998242624104023, "rewards/correctness_reward_func": 1.0000000298023224, "rewards/int_reward_func": 0.12500000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2354583442211151, "step": 33 }, { "completion_length": 173.66667556762695, "epoch": 0.01819884919041884, "grad_norm": 7.875, "kl": 0.0017010539158945903, "learning_rate": 9.090909090909091e-07, "loss": 0.0001, "reward": 1.4270833730697632, "reward_std": 0.8556555807590485, "rewards/correctness_reward_func": 0.916666679084301, "rewards/int_reward_func": 0.22916666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2812500037252903, "step": 34 }, { "completion_length": 175.37500190734863, "epoch": 0.018734109460725276, "grad_norm": 4.6875, "kl": 0.0009809281909838319, "learning_rate": 9.358288770053477e-07, "loss": 0.0, "reward": 1.3022500425577164, "reward_std": 0.909055307507515, "rewards/correctness_reward_func": 1.0833333507180214, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13558333739638329, "step": 35 }, { "completion_length": 146.16666984558105, "epoch": 0.019269369731031716, "grad_norm": 4.65625, "kl": 0.0009264845575671643, "learning_rate": 9.625668449197862e-07, "loss": 0.0, "reward": 1.401041716337204, "reward_std": 0.5991803035140038, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.02083333395421505, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0677083358168602, "step": 36 }, { "completion_length": 145.58333778381348, "epoch": 0.019804630001338152, "grad_norm": 4.96875, "kl": 0.0008276553271571174, "learning_rate": 9.893048128342248e-07, "loss": 0.0, "reward": 1.9479166716337204, "reward_std": 0.41099051013588905, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.2916666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3229166753590107, "step": 37 }, { "completion_length": 188.16667366027832, "epoch": 0.020339890271644588, "grad_norm": 4.6875, "kl": 0.0018217733450001106, "learning_rate": 1.0160427807486633e-06, "loss": 0.0001, "reward": 1.1983333826065063, "reward_std": 0.7537417262792587, "rewards/correctness_reward_func": 0.9166666716337204, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19833334162831306, "step": 38 }, { "completion_length": 146.79166984558105, "epoch": 0.020875150541951024, "grad_norm": 3.1875, "kl": 0.0017429170693503693, "learning_rate": 1.0427807486631017e-06, "loss": 0.0001, "reward": 1.6458333730697632, "reward_std": 0.448615238070488, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.02083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2916666828095913, "step": 39 }, { "completion_length": 131.62500762939453, "epoch": 0.02141041081225746, "grad_norm": 14.9375, "kl": 0.0036563000176101923, "learning_rate": 1.0695187165775401e-06, "loss": 0.0001, "reward": 1.3437500447034836, "reward_std": 0.9089661091566086, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.2395833358168602, "step": 40 }, { "completion_length": 167.45833778381348, "epoch": 0.021945671082563897, "grad_norm": 15.0, "kl": 0.0035842061261064373, "learning_rate": 1.0962566844919787e-06, "loss": 0.0001, "reward": 1.2604167088866234, "reward_std": 0.7910265475511551, "rewards/correctness_reward_func": 1.0833333507180214, "rewards/int_reward_func": 0.0416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13541666977107525, "step": 41 }, { "completion_length": 152.29167366027832, "epoch": 0.022480931352870333, "grad_norm": 7.8125, "kl": 0.0029216272378107533, "learning_rate": 1.1229946524064172e-06, "loss": 0.0001, "reward": 1.052083358168602, "reward_std": 1.137702077627182, "rewards/correctness_reward_func": 0.8333333358168602, "rewards/int_reward_func": 0.12500000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09375000279396772, "step": 42 }, { "completion_length": 212.2083396911621, "epoch": 0.02301619162317677, "grad_norm": 3.5625, "kl": 0.0010128439316758886, "learning_rate": 1.1497326203208558e-06, "loss": 0.0, "reward": 1.6354167461395264, "reward_std": 0.8953899294137955, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.3125000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3229166679084301, "step": 43 }, { "completion_length": 185.75000381469727, "epoch": 0.023551451893483205, "grad_norm": 3.921875, "kl": 0.0024199254185077734, "learning_rate": 1.1764705882352942e-06, "loss": 0.0001, "reward": 2.192708373069763, "reward_std": 1.02582186460495, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.3333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4218750074505806, "step": 44 }, { "completion_length": 194.12500381469727, "epoch": 0.02408671216378964, "grad_norm": 5.28125, "kl": 0.002183001925004646, "learning_rate": 1.2032085561497326e-06, "loss": 0.0001, "reward": 1.9635416865348816, "reward_std": 0.8007803931832314, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.2708333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3593750149011612, "step": 45 }, { "completion_length": 169.20833587646484, "epoch": 0.02462197243409608, "grad_norm": 3.34375, "kl": 0.002503075505956076, "learning_rate": 1.2299465240641713e-06, "loss": 0.0001, "reward": 1.9375000596046448, "reward_std": 0.9027109891176224, "rewards/correctness_reward_func": 1.4166667312383652, "rewards/int_reward_func": 0.20833334140479565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3125000074505806, "step": 46 }, { "completion_length": 164.91667366027832, "epoch": 0.025157232704402517, "grad_norm": 4.65625, "kl": 0.003346432466059923, "learning_rate": 1.2566844919786097e-06, "loss": 0.0001, "reward": 1.6684584021568298, "reward_std": 1.296816736459732, "rewards/correctness_reward_func": 1.3333333879709244, "rewards/int_reward_func": 0.14583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18929166719317436, "step": 47 }, { "completion_length": 136.95833587646484, "epoch": 0.025692492974708953, "grad_norm": 10.875, "kl": 0.006269982142839581, "learning_rate": 1.2834224598930483e-06, "loss": 0.0003, "reward": 2.161458432674408, "reward_std": 0.7979137673974037, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.1875000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.307291679084301, "step": 48 }, { "completion_length": 153.45833587646484, "epoch": 0.02622775324501539, "grad_norm": 4.65625, "kl": 0.006891902536153793, "learning_rate": 1.3101604278074868e-06, "loss": 0.0003, "reward": 1.7187500596046448, "reward_std": 1.0350174307823181, "rewards/correctness_reward_func": 1.1666667088866234, "rewards/int_reward_func": 0.20833333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.3020833395421505, "step": 49 }, { "completion_length": 197.20834350585938, "epoch": 0.026763013515321826, "grad_norm": 2.375, "kl": 0.002022897358983755, "learning_rate": 1.3368983957219254e-06, "loss": 0.0001, "reward": 2.1718750447034836, "reward_std": 0.3917969614267349, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.33333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4218750074505806, "step": 50 }, { "completion_length": 212.04166793823242, "epoch": 0.02729827378562826, "grad_norm": 1.8671875, "kl": 0.0011677205184241757, "learning_rate": 1.3636363636363636e-06, "loss": 0.0, "reward": 1.7864583656191826, "reward_std": 0.6358048617839813, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.1875000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4322916716337204, "step": 51 }, { "completion_length": 174.5416717529297, "epoch": 0.027833534055934698, "grad_norm": 4.03125, "kl": 0.004340659594163299, "learning_rate": 1.3903743315508022e-06, "loss": 0.0002, "reward": 1.5617916658520699, "reward_std": 0.4447403661906719, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.1041666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37429168075323105, "step": 52 }, { "completion_length": 193.70833778381348, "epoch": 0.028368794326241134, "grad_norm": 7.3125, "kl": 0.002266606839839369, "learning_rate": 1.4171122994652409e-06, "loss": 0.0001, "reward": 2.4166667461395264, "reward_std": 0.43660441040992737, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.27083333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31250000558793545, "step": 53 }, { "completion_length": 150.16667366027832, "epoch": 0.02890405459654757, "grad_norm": 2.890625, "kl": 0.010927497263764963, "learning_rate": 1.4438502673796793e-06, "loss": 0.0004, "reward": 2.302083373069763, "reward_std": 0.7531506419181824, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.2291666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3229166716337204, "step": 54 }, { "completion_length": 195.45834350585938, "epoch": 0.029439314866854006, "grad_norm": 2.265625, "kl": 0.0035543091071303934, "learning_rate": 1.4705882352941177e-06, "loss": 0.0001, "reward": 2.2359583973884583, "reward_std": 0.7737347185611725, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.2500000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.3609583452343941, "step": 55 }, { "completion_length": 164.0416717529297, "epoch": 0.029974575137160443, "grad_norm": 2.0625, "kl": 0.003154328849632293, "learning_rate": 1.4973262032085562e-06, "loss": 0.0001, "reward": 2.208333432674408, "reward_std": 0.8674917370080948, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.2500000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4375000074505806, "step": 56 }, { "completion_length": 158.16666793823242, "epoch": 0.030509835407466882, "grad_norm": 3.140625, "kl": 0.003223880339646712, "learning_rate": 1.5240641711229948e-06, "loss": 0.0001, "reward": 1.8229167610406876, "reward_std": 0.4800337702035904, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.14583333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2604166753590107, "step": 57 }, { "completion_length": 200.20833587646484, "epoch": 0.03104509567777332, "grad_norm": 1.8125, "kl": 0.0027400395192671567, "learning_rate": 1.5508021390374334e-06, "loss": 0.0001, "reward": 1.9427084177732468, "reward_std": 0.566053070127964, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.31250000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4427083432674408, "step": 58 }, { "completion_length": 160.0416717529297, "epoch": 0.03158035594807975, "grad_norm": 6.28125, "kl": 0.006273803039221093, "learning_rate": 1.5775401069518716e-06, "loss": 0.0003, "reward": 1.8489583730697632, "reward_std": 0.9726917743682861, "rewards/correctness_reward_func": 1.1666666939854622, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4114583432674408, "step": 59 }, { "completion_length": 224.41667556762695, "epoch": 0.03211561621838619, "grad_norm": 11.5625, "kl": 0.023900436048279516, "learning_rate": 1.6042780748663103e-06, "loss": 0.001, "reward": 1.4687500447034836, "reward_std": 0.720452331006527, "rewards/correctness_reward_func": 0.8333333358168602, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.385416679084301, "step": 60 }, { "completion_length": 205.50000381469727, "epoch": 0.03265087648869262, "grad_norm": 3.28125, "kl": 0.0030198894964996725, "learning_rate": 1.631016042780749e-06, "loss": 0.0001, "reward": 1.9166667759418488, "reward_std": 0.9933225437998772, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.20833334140479565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3750000149011612, "step": 61 }, { "completion_length": 153.8333396911621, "epoch": 0.03318613675899906, "grad_norm": 2.15625, "kl": 0.004072973737493157, "learning_rate": 1.6577540106951873e-06, "loss": 0.0002, "reward": 2.0937500596046448, "reward_std": 0.8774870336055756, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.31250001303851604, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4479166716337204, "step": 62 }, { "completion_length": 174.0833396911621, "epoch": 0.0337213970293055, "grad_norm": 2.0625, "kl": 0.0055269336444325745, "learning_rate": 1.684491978609626e-06, "loss": 0.0002, "reward": 2.3186666667461395, "reward_std": 0.7296848772093654, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.33333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4020000025629997, "step": 63 }, { "completion_length": 156.87500762939453, "epoch": 0.034256657299611935, "grad_norm": 3.140625, "kl": 0.0023692850954830647, "learning_rate": 1.7112299465240642e-06, "loss": 0.0001, "reward": 1.7187500894069672, "reward_std": 0.7189934402704239, "rewards/correctness_reward_func": 0.9166666716337204, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4479166716337204, "step": 64 }, { "completion_length": 176.7916717529297, "epoch": 0.034791917569918375, "grad_norm": 10.375, "kl": 0.04830963246058673, "learning_rate": 1.7379679144385028e-06, "loss": 0.0019, "reward": 2.2812501192092896, "reward_std": 0.8949191719293594, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4687500074505806, "step": 65 }, { "completion_length": 180.9583396911621, "epoch": 0.03532717784022481, "grad_norm": 2.40625, "kl": 0.003214933123672381, "learning_rate": 1.7647058823529414e-06, "loss": 0.0001, "reward": 2.5000000596046448, "reward_std": 0.37490667030215263, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.5, "step": 66 }, { "completion_length": 194.2083396911621, "epoch": 0.03586243811053125, "grad_norm": 2.34375, "kl": 0.0023288802476599813, "learning_rate": 1.7914438502673799e-06, "loss": 0.0001, "reward": 1.5208334028720856, "reward_std": 0.7188220322132111, "rewards/correctness_reward_func": 0.7500000298023224, "rewards/int_reward_func": 0.2916666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 67 }, { "completion_length": 159.66666984558105, "epoch": 0.03639769838083768, "grad_norm": 2.203125, "kl": 0.0026101931143784896, "learning_rate": 1.8181818181818183e-06, "loss": 0.0001, "reward": 2.645833373069763, "reward_std": 0.5910372547805309, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 68 }, { "completion_length": 186.91666984558105, "epoch": 0.03693295865114412, "grad_norm": 2.34375, "kl": 0.0025595282058930025, "learning_rate": 1.8449197860962567e-06, "loss": 0.0001, "reward": 2.817708432674408, "reward_std": 0.3789900913834572, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4843750074505806, "step": 69 }, { "completion_length": 159.37500381469727, "epoch": 0.03746821892145055, "grad_norm": 4.03125, "kl": 0.014193891576724127, "learning_rate": 1.8716577540106954e-06, "loss": 0.0006, "reward": 2.444666802883148, "reward_std": 0.8164463341236115, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.42383334040641785, "step": 70 }, { "completion_length": 164.5416717529297, "epoch": 0.03800347919175699, "grad_norm": 2.0625, "kl": 0.006387478410033509, "learning_rate": 1.898395721925134e-06, "loss": 0.0003, "reward": 2.380208432674408, "reward_std": 0.7041773945093155, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.484375, "step": 71 }, { "completion_length": 176.41666984558105, "epoch": 0.03853873946206343, "grad_norm": 2.6875, "kl": 0.004877177358139306, "learning_rate": 1.9251336898395724e-06, "loss": 0.0002, "reward": 1.932291716337204, "reward_std": 0.6992241404950619, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.10416666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4739583432674408, "step": 72 }, { "completion_length": 179.87500381469727, "epoch": 0.039073999732369864, "grad_norm": 2.046875, "kl": 0.0029720670718234032, "learning_rate": 1.951871657754011e-06, "loss": 0.0001, "reward": 2.2083334624767303, "reward_std": 0.7113956846296787, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.2083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 73 }, { "completion_length": 162.66666793823242, "epoch": 0.039609260002676304, "grad_norm": 1.421875, "kl": 0.0038638823752989992, "learning_rate": 1.9786096256684497e-06, "loss": 0.0002, "reward": 2.348958373069763, "reward_std": 0.6586650460958481, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4739583358168602, "step": 74 }, { "completion_length": 151.29166793823242, "epoch": 0.04014452027298274, "grad_norm": 2.421875, "kl": 0.005501059582456946, "learning_rate": 2.0053475935828877e-06, "loss": 0.0002, "reward": 2.645833373069763, "reward_std": 0.5747457854449749, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.35416667722165585, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 75 }, { "completion_length": 220.9583396911621, "epoch": 0.040679780543289176, "grad_norm": 1.421875, "kl": 0.004471051681321114, "learning_rate": 2.0320855614973265e-06, "loss": 0.0002, "reward": 1.9540833532810211, "reward_std": 0.8409619331359863, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4749166667461395, "step": 76 }, { "completion_length": 202.75000381469727, "epoch": 0.04121504081359561, "grad_norm": 1.875, "kl": 0.0032015527540352196, "learning_rate": 2.058823529411765e-06, "loss": 0.0001, "reward": 2.229166716337204, "reward_std": 0.8260998427867889, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.2291666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 77 }, { "completion_length": 182.58333587646484, "epoch": 0.04175030108390205, "grad_norm": 2.3125, "kl": 0.0037266534636728466, "learning_rate": 2.0855614973262034e-06, "loss": 0.0001, "reward": 1.9532501101493835, "reward_std": 0.4688983578234911, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.2916666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.474083349108696, "step": 78 }, { "completion_length": 196.0833396911621, "epoch": 0.04228556135420848, "grad_norm": 1.8125, "kl": 0.0038231085636653006, "learning_rate": 2.112299465240642e-06, "loss": 0.0002, "reward": 2.2470000088214874, "reward_std": 0.6223583808168769, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.43450000137090683, "step": 79 }, { "completion_length": 172.58333587646484, "epoch": 0.04282082162451492, "grad_norm": 8.0, "kl": 0.17783336297725327, "learning_rate": 2.1390374331550802e-06, "loss": 0.0071, "reward": 2.3935834169387817, "reward_std": 0.7652425169944763, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.31250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.43525000661611557, "step": 80 }, { "completion_length": 182.7083396911621, "epoch": 0.043356081894821354, "grad_norm": 2.96875, "kl": 0.010094487282913178, "learning_rate": 2.165775401069519e-06, "loss": 0.0004, "reward": 2.4062501192092896, "reward_std": 0.6189638450741768, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.447916679084301, "step": 81 }, { "completion_length": 200.50000381469727, "epoch": 0.04389134216512779, "grad_norm": 1.09375, "kl": 0.004236105130985379, "learning_rate": 2.1925133689839575e-06, "loss": 0.0002, "reward": 1.9768334031105042, "reward_std": 0.46516112983226776, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.25000000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4768333435058594, "step": 82 }, { "completion_length": 247.54167556762695, "epoch": 0.04442660243543423, "grad_norm": 1.65625, "kl": 0.011776420462410897, "learning_rate": 2.219251336898396e-06, "loss": 0.0005, "reward": 1.7291666865348816, "reward_std": 0.9394900351762772, "rewards/correctness_reward_func": 1.0000000447034836, "rewards/int_reward_func": 0.25000000931322575, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 83 }, { "completion_length": 251.37500762939453, "epoch": 0.044961862705740666, "grad_norm": 1.5703125, "kl": 0.005871386732906103, "learning_rate": 2.2459893048128343e-06, "loss": 0.0002, "reward": 1.520833358168602, "reward_std": 0.937641553580761, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.22916667349636555, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 84 }, { "completion_length": 164.4583396911621, "epoch": 0.045497122976047105, "grad_norm": 1.828125, "kl": 0.006025184877216816, "learning_rate": 2.2727272727272728e-06, "loss": 0.0002, "reward": 2.8541667461395264, "reward_std": 0.1530931033194065, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.33333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 85 }, { "completion_length": 175.12500381469727, "epoch": 0.04603238324635354, "grad_norm": 2.234375, "kl": 0.012646633782424033, "learning_rate": 2.2994652406417116e-06, "loss": 0.0005, "reward": 2.2291666865348816, "reward_std": 0.8635273203253746, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.2083333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 86 }, { "completion_length": 203.4166717529297, "epoch": 0.04656764351665998, "grad_norm": 2.09375, "kl": 0.0035205732856411487, "learning_rate": 2.32620320855615e-06, "loss": 0.0001, "reward": 2.2656250596046448, "reward_std": 0.33494970947504044, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.291666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 87 }, { "completion_length": 219.25000381469727, "epoch": 0.04710290378696641, "grad_norm": 2.09375, "kl": 0.003432907979004085, "learning_rate": 2.3529411764705885e-06, "loss": 0.0001, "reward": 1.8906250596046448, "reward_std": 0.9207641184329987, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.2708333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 88 }, { "completion_length": 223.16667556762695, "epoch": 0.04763816405727285, "grad_norm": 1.828125, "kl": 0.010489805426914245, "learning_rate": 2.379679144385027e-06, "loss": 0.0004, "reward": 1.9583333730697632, "reward_std": 0.7567075043916702, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4583333358168602, "step": 89 }, { "completion_length": 234.12500381469727, "epoch": 0.04817342432757928, "grad_norm": 1.828125, "kl": 0.0042737985495477915, "learning_rate": 2.4064171122994653e-06, "loss": 0.0002, "reward": 1.843416690826416, "reward_std": 1.0440644323825836, "rewards/correctness_reward_func": 1.0833333656191826, "rewards/int_reward_func": 0.2708333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4684166759252548, "step": 90 }, { "completion_length": 215.37500762939453, "epoch": 0.04870868459788572, "grad_norm": 1.8671875, "kl": 0.0036017470411024988, "learning_rate": 2.433155080213904e-06, "loss": 0.0001, "reward": 2.145833432674408, "reward_std": 0.8112322501838207, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.31250001303851604, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 91 }, { "completion_length": 165.4166717529297, "epoch": 0.04924394486819216, "grad_norm": 2.515625, "kl": 0.005282851168885827, "learning_rate": 2.4598930481283426e-06, "loss": 0.0002, "reward": 2.489583432674408, "reward_std": 0.8116736710071564, "rewards/correctness_reward_func": 1.6666667461395264, "rewards/int_reward_func": 0.33333334885537624, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 92 }, { "completion_length": 229.0000114440918, "epoch": 0.049779205138498595, "grad_norm": 1.828125, "kl": 0.0037543401995208114, "learning_rate": 2.486631016042781e-06, "loss": 0.0002, "reward": 2.086958348751068, "reward_std": 1.1980505138635635, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.3994583375751972, "step": 93 }, { "completion_length": 240.16667556762695, "epoch": 0.050314465408805034, "grad_norm": 1.7578125, "kl": 0.003991760429926217, "learning_rate": 2.5133689839572194e-06, "loss": 0.0002, "reward": 1.7708334177732468, "reward_std": 0.6722075343132019, "rewards/correctness_reward_func": 0.916666679084301, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 94 }, { "completion_length": 186.3333396911621, "epoch": 0.05084972567911147, "grad_norm": 1.625, "kl": 0.005014055874198675, "learning_rate": 2.5401069518716583e-06, "loss": 0.0002, "reward": 2.7291666865348816, "reward_std": 0.5318794921040535, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 95 }, { "completion_length": 182.0416717529297, "epoch": 0.051384985949417906, "grad_norm": 1.9296875, "kl": 0.01205193460918963, "learning_rate": 2.5668449197860967e-06, "loss": 0.0005, "reward": 2.479166805744171, "reward_std": 0.8848456591367722, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.3541666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 96 }, { "completion_length": 192.87500762939453, "epoch": 0.05192024621972434, "grad_norm": 1.8984375, "kl": 0.0356319691054523, "learning_rate": 2.5935828877005347e-06, "loss": 0.0014, "reward": 2.3541667461395264, "reward_std": 0.4258173182606697, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.2291666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 97 }, { "completion_length": 182.5833396911621, "epoch": 0.05245550649003078, "grad_norm": 2.9375, "kl": 0.057575218263082206, "learning_rate": 2.6203208556149735e-06, "loss": 0.0023, "reward": 2.504125028848648, "reward_std": 0.500206220895052, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4832916706800461, "step": 98 }, { "completion_length": 216.29167556762695, "epoch": 0.05299076676033721, "grad_norm": 1.5703125, "kl": 0.019477371592074633, "learning_rate": 2.647058823529412e-06, "loss": 0.0008, "reward": 2.6666666865348816, "reward_std": 0.39335764572024345, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 99 }, { "completion_length": 229.62500762939453, "epoch": 0.05352602703064365, "grad_norm": 1.4375, "kl": 0.010605788585962728, "learning_rate": 2.673796791443851e-06, "loss": 0.0004, "reward": 1.7427083849906921, "reward_std": 0.7021718323230743, "rewards/correctness_reward_func": 0.9166667088866234, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.47187500447034836, "step": 100 }, { "completion_length": 229.83334350585938, "epoch": 0.054061287300950084, "grad_norm": 2.09375, "kl": 0.005640399642288685, "learning_rate": 2.7005347593582892e-06, "loss": 0.0002, "reward": 1.8177084177732468, "reward_std": 0.6098503544926643, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.4635416716337204, "step": 101 }, { "completion_length": 184.2916717529297, "epoch": 0.05459654757125652, "grad_norm": 1.7734375, "kl": 0.01011388812912628, "learning_rate": 2.7272727272727272e-06, "loss": 0.0004, "reward": 2.645833432674408, "reward_std": 0.4501614086329937, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.2083333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 102 }, { "completion_length": 224.04166793823242, "epoch": 0.05513180784156296, "grad_norm": 1.7421875, "kl": 0.0050743266474455595, "learning_rate": 2.754010695187166e-06, "loss": 0.0002, "reward": 2.270833432674408, "reward_std": 0.7563454322516918, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 103 }, { "completion_length": 146.45833778381348, "epoch": 0.055667068111869396, "grad_norm": 2.5, "kl": 0.010326952033210546, "learning_rate": 2.7807486631016045e-06, "loss": 0.0004, "reward": 2.7812500596046448, "reward_std": 0.38120611757040024, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.31250001303851604, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.4687500074505806, "step": 104 }, { "completion_length": 169.58333587646484, "epoch": 0.056202328382175835, "grad_norm": 1.2265625, "kl": 0.009152874117717147, "learning_rate": 2.807486631016043e-06, "loss": 0.0004, "reward": 2.9375000596046448, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.5, "step": 105 }, { "completion_length": 233.33334350585938, "epoch": 0.05673758865248227, "grad_norm": 1.34375, "kl": 0.01396864268463105, "learning_rate": 2.8342245989304818e-06, "loss": 0.0006, "reward": 2.3437500298023224, "reward_std": 0.6216080188751221, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4375000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.46875, "step": 106 }, { "completion_length": 188.00000381469727, "epoch": 0.05727284892278871, "grad_norm": 1.6328125, "kl": 0.0061717041535303, "learning_rate": 2.8609625668449198e-06, "loss": 0.0002, "reward": 2.5625000596046448, "reward_std": 0.5238290876150131, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 107 }, { "completion_length": 231.29167556762695, "epoch": 0.05780810919309514, "grad_norm": 1.7890625, "kl": 0.0035084771225228906, "learning_rate": 2.8877005347593586e-06, "loss": 0.0001, "reward": 1.9696250259876251, "reward_std": 0.20630286261439323, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.49045833945274353, "step": 108 }, { "completion_length": 274.83333587646484, "epoch": 0.05834336946340158, "grad_norm": 1.765625, "kl": 0.004193893808405846, "learning_rate": 2.914438502673797e-06, "loss": 0.0002, "reward": 2.569666802883148, "reward_std": 1.0057230442762375, "rewards/correctness_reward_func": 1.5833334028720856, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.46549999713897705, "step": 109 }, { "completion_length": 167.7083339691162, "epoch": 0.05887862973370801, "grad_norm": 2.5625, "kl": 0.011588132474571466, "learning_rate": 2.9411764705882355e-06, "loss": 0.0005, "reward": 1.5813333690166473, "reward_std": 0.5862554460763931, "rewards/correctness_reward_func": 0.9166666716337204, "rewards/int_reward_func": 0.1666666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49800001084804535, "step": 110 }, { "completion_length": 201.16667938232422, "epoch": 0.05941389000401445, "grad_norm": 2.09375, "kl": 0.008921175263822079, "learning_rate": 2.9679144385026743e-06, "loss": 0.0004, "reward": 2.0729166865348816, "reward_std": 0.8873137533664703, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1041666679084301, "rewards/xmlcount_reward_func": 0.4687500074505806, "step": 111 }, { "completion_length": 154.45833587646484, "epoch": 0.059949150274320885, "grad_norm": 15.75, "kl": 0.042277290020138025, "learning_rate": 2.9946524064171123e-06, "loss": 0.0017, "reward": 2.7500001192092896, "reward_std": 0.6749640665948391, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.33333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.12500000186264515, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 112 }, { "completion_length": 252.9166717529297, "epoch": 0.060484410544627325, "grad_norm": 1.2890625, "kl": 0.008890356635674834, "learning_rate": 3.0213903743315507e-06, "loss": 0.0004, "reward": 2.3437500596046448, "reward_std": 0.6434758454561234, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.3750000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4479166716337204, "step": 113 }, { "completion_length": 218.0416717529297, "epoch": 0.061019670814933764, "grad_norm": 1.6328125, "kl": 0.006781109143048525, "learning_rate": 3.0481283422459896e-06, "loss": 0.0003, "reward": 2.557291716337204, "reward_std": 0.5454855412244797, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 114 }, { "completion_length": 183.7916717529297, "epoch": 0.0615549310852402, "grad_norm": 1.84375, "kl": 0.008323265705257654, "learning_rate": 3.074866310160428e-06, "loss": 0.0003, "reward": 2.255208432674408, "reward_std": 0.9933101981878281, "rewards/correctness_reward_func": 1.2500000521540642, "rewards/int_reward_func": 0.3333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875000037252903, "rewards/xmlcount_reward_func": 0.484375, "step": 115 }, { "completion_length": 224.0833396911621, "epoch": 0.06209019135554664, "grad_norm": 1.3828125, "kl": 0.022750876378268003, "learning_rate": 3.101604278074867e-06, "loss": 0.0009, "reward": 2.3177084028720856, "reward_std": 0.6239343695342541, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.484375, "step": 116 }, { "completion_length": 181.8333396911621, "epoch": 0.06262545162585308, "grad_norm": 1.8359375, "kl": 0.00948640692513436, "learning_rate": 3.128342245989305e-06, "loss": 0.0004, "reward": 2.4747501015663147, "reward_std": 0.8183496445417404, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.10416666977107525, "rewards/xmlcount_reward_func": 0.49558334052562714, "step": 117 }, { "completion_length": 203.25000762939453, "epoch": 0.0631607118961595, "grad_norm": 1.328125, "kl": 0.011673168744891882, "learning_rate": 3.1550802139037433e-06, "loss": 0.0005, "reward": 2.5625000596046448, "reward_std": 0.32106195017695427, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 118 }, { "completion_length": 259.3333396911621, "epoch": 0.06369597216646594, "grad_norm": 1.65625, "kl": 0.009123387979343534, "learning_rate": 3.181818181818182e-06, "loss": 0.0004, "reward": 2.166666716337204, "reward_std": 0.8825219944119453, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 119 }, { "completion_length": 189.75000381469727, "epoch": 0.06423123243677238, "grad_norm": 1.921875, "kl": 0.009333281544968486, "learning_rate": 3.2085561497326205e-06, "loss": 0.0004, "reward": 2.1666667461395264, "reward_std": 0.7824205458164215, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 120 }, { "completion_length": 217.87500762939453, "epoch": 0.06476649270707882, "grad_norm": 1.7265625, "kl": 0.008323910529725254, "learning_rate": 3.2352941176470594e-06, "loss": 0.0003, "reward": 2.3333334028720856, "reward_std": 0.627492543309927, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 121 }, { "completion_length": 212.75000762939453, "epoch": 0.06530175297738525, "grad_norm": 2.1875, "kl": 0.00647055555600673, "learning_rate": 3.262032085561498e-06, "loss": 0.0003, "reward": 2.708333373069763, "reward_std": 0.8154087364673615, "rewards/correctness_reward_func": 1.6666667461395264, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 122 }, { "completion_length": 217.25000953674316, "epoch": 0.06583701324769169, "grad_norm": 2.125, "kl": 0.010265512275509536, "learning_rate": 3.288770053475936e-06, "loss": 0.0004, "reward": 2.333333373069763, "reward_std": 0.6681104451417923, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.31250000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.10416666977107525, "rewards/xmlcount_reward_func": 0.5, "step": 123 }, { "completion_length": 207.9583396911621, "epoch": 0.06637227351799813, "grad_norm": 1.671875, "kl": 0.010341339744627476, "learning_rate": 3.3155080213903747e-06, "loss": 0.0004, "reward": 2.5937500596046448, "reward_std": 0.3593357726931572, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 124 }, { "completion_length": 206.87500381469727, "epoch": 0.06690753378830457, "grad_norm": 1.796875, "kl": 0.008234906010329723, "learning_rate": 3.342245989304813e-06, "loss": 0.0003, "reward": 2.4166667461395264, "reward_std": 0.7896890789270401, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 125 }, { "completion_length": 202.2083396911621, "epoch": 0.067442794058611, "grad_norm": 2.234375, "kl": 0.011055209208279848, "learning_rate": 3.368983957219252e-06, "loss": 0.0004, "reward": 2.4166667461395264, "reward_std": 0.5647460781037807, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 126 }, { "completion_length": 163.1666717529297, "epoch": 0.06797805432891743, "grad_norm": 1.9765625, "kl": 0.009094940614886582, "learning_rate": 3.3957219251336904e-06, "loss": 0.0004, "reward": 2.3541667461395264, "reward_std": 0.611662745475769, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1458333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 127 }, { "completion_length": 168.9583396911621, "epoch": 0.06851331459922387, "grad_norm": 2.015625, "kl": 0.011237279628403485, "learning_rate": 3.4224598930481284e-06, "loss": 0.0004, "reward": 2.4375000447034836, "reward_std": 0.6058737970888615, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.14583333767950535, "rewards/xmlcount_reward_func": 0.5, "step": 128 }, { "completion_length": 227.3333396911621, "epoch": 0.06904857486953031, "grad_norm": 1.71875, "kl": 0.006947090849280357, "learning_rate": 3.449197860962567e-06, "loss": 0.0003, "reward": 2.250000089406967, "reward_std": 0.9643253535032272, "rewards/correctness_reward_func": 1.3333333879709244, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 129 }, { "completion_length": 212.9583396911621, "epoch": 0.06958383513983675, "grad_norm": 2.25, "kl": 0.012483905302360654, "learning_rate": 3.4759358288770056e-06, "loss": 0.0005, "reward": 2.4218750298023224, "reward_std": 0.7965251952409744, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.3541666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.484375, "step": 130 }, { "completion_length": 178.8333396911621, "epoch": 0.07011909541014318, "grad_norm": 2.1875, "kl": 0.013714013854041696, "learning_rate": 3.5026737967914445e-06, "loss": 0.0005, "reward": 2.291666716337204, "reward_std": 0.9178697988390923, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.2708333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.10416666977107525, "rewards/xmlcount_reward_func": 0.5, "step": 131 }, { "completion_length": 182.875, "epoch": 0.07065435568044962, "grad_norm": 2.203125, "kl": 0.007862797006964684, "learning_rate": 3.529411764705883e-06, "loss": 0.0003, "reward": 2.6458334922790527, "reward_std": 0.8817127346992493, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 132 }, { "completion_length": 166.62500381469727, "epoch": 0.07118961595075605, "grad_norm": 1.46875, "kl": 0.011107051279395819, "learning_rate": 3.556149732620321e-06, "loss": 0.0004, "reward": 3.031000018119812, "reward_std": 0.33994986675679684, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1666666679084301, "rewards/xmlcount_reward_func": 0.4893333315849304, "step": 133 }, { "completion_length": 205.41667556762695, "epoch": 0.0717248762210625, "grad_norm": 1.78125, "kl": 0.00937680620700121, "learning_rate": 3.5828877005347597e-06, "loss": 0.0004, "reward": 2.395833373069763, "reward_std": 1.0875979363918304, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.5, "step": 134 }, { "completion_length": 139.54166984558105, "epoch": 0.07226013649136893, "grad_norm": 4.6875, "kl": 0.08071585092693567, "learning_rate": 3.609625668449198e-06, "loss": 0.0032, "reward": 2.8697917461395264, "reward_std": 0.5514856986701488, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1250000037252903, "rewards/xmlcount_reward_func": 0.4739583432674408, "step": 135 }, { "completion_length": 187.62500381469727, "epoch": 0.07279539676167536, "grad_norm": 1.34375, "kl": 0.011190556921064854, "learning_rate": 3.6363636363636366e-06, "loss": 0.0004, "reward": 2.708333432674408, "reward_std": 0.31584101915359497, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1250000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 136 }, { "completion_length": 172.83333778381348, "epoch": 0.0733306570319818, "grad_norm": 1.765625, "kl": 0.009673898573964834, "learning_rate": 3.6631016042780754e-06, "loss": 0.0004, "reward": 2.7916666865348816, "reward_std": 0.4644980877637863, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 137 }, { "completion_length": 178.2083396911621, "epoch": 0.07386591730228824, "grad_norm": 2.34375, "kl": 0.015270714182406664, "learning_rate": 3.6898395721925134e-06, "loss": 0.0006, "reward": 2.7500001192092896, "reward_std": 0.6503244712948799, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.3750000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1250000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 138 }, { "completion_length": 171.83333587646484, "epoch": 0.07440117757259468, "grad_norm": 1.765625, "kl": 0.013465502765029669, "learning_rate": 3.716577540106952e-06, "loss": 0.0005, "reward": 2.7187500596046448, "reward_std": 0.6593321561813354, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.08333333395421505, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 139 }, { "completion_length": 187.0416717529297, "epoch": 0.0749364378429011, "grad_norm": 1.328125, "kl": 0.011786214541643858, "learning_rate": 3.7433155080213907e-06, "loss": 0.0005, "reward": 2.5729166865348816, "reward_std": 0.141096293926239, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 140 }, { "completion_length": 197.58333587646484, "epoch": 0.07547169811320754, "grad_norm": 2.09375, "kl": 0.007584544597193599, "learning_rate": 3.770053475935829e-06, "loss": 0.0003, "reward": 2.2031250596046448, "reward_std": 0.8265210092067719, "rewards/correctness_reward_func": 1.1666666939854622, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1041666679084301, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 141 }, { "completion_length": 139.75, "epoch": 0.07600695838351398, "grad_norm": 2.546875, "kl": 0.03859049454331398, "learning_rate": 3.796791443850268e-06, "loss": 0.0015, "reward": 2.9791667461395264, "reward_std": 0.48826754838228226, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.12500000186264515, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 142 }, { "completion_length": 214.5, "epoch": 0.07654221865382042, "grad_norm": 1.4375, "kl": 0.00880357634741813, "learning_rate": 3.8235294117647055e-06, "loss": 0.0004, "reward": 2.375000089406967, "reward_std": 0.8150961697101593, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 143 }, { "completion_length": 151.37500381469727, "epoch": 0.07707747892412686, "grad_norm": 2.15625, "kl": 0.012635418446734548, "learning_rate": 3.850267379679145e-06, "loss": 0.0005, "reward": 2.7708334028720856, "reward_std": 0.5846511572599411, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2500000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 144 }, { "completion_length": 228.16667366027832, "epoch": 0.07761273919443329, "grad_norm": 1.8125, "kl": 0.00903172290418297, "learning_rate": 3.877005347593583e-06, "loss": 0.0004, "reward": 2.578125089406967, "reward_std": 0.500914141535759, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.18750000186264515, "rewards/xmlcount_reward_func": 0.453125, "step": 145 }, { "completion_length": 129.25000190734863, "epoch": 0.07814799946473973, "grad_norm": 2.21875, "kl": 0.020013232016935945, "learning_rate": 3.903743315508022e-06, "loss": 0.0008, "reward": 2.770833373069763, "reward_std": 0.6944468766450882, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1250000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 146 }, { "completion_length": 173.00000190734863, "epoch": 0.07868325973504617, "grad_norm": 1.9140625, "kl": 0.025609272299334407, "learning_rate": 3.93048128342246e-06, "loss": 0.001, "reward": 2.733708381652832, "reward_std": 0.5789778083562851, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.10416666977107525, "rewards/xmlcount_reward_func": 0.46287500113248825, "step": 147 }, { "completion_length": 146.1250057220459, "epoch": 0.07921852000535261, "grad_norm": 1.859375, "kl": 0.017028656788170338, "learning_rate": 3.957219251336899e-06, "loss": 0.0007, "reward": 2.125000074505806, "reward_std": 0.491043072193861, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.25000000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.20833334140479565, "rewards/xmlcount_reward_func": 0.5, "step": 148 }, { "completion_length": 137.12500381469727, "epoch": 0.07975378027565903, "grad_norm": 2.390625, "kl": 0.023823135998100042, "learning_rate": 3.983957219251337e-06, "loss": 0.001, "reward": 3.083333432674408, "reward_std": 0.2887342944741249, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.18750000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 149 }, { "completion_length": 138.20833587646484, "epoch": 0.08028904054596547, "grad_norm": 1.90625, "kl": 0.016617624554783106, "learning_rate": 4.010695187165775e-06, "loss": 0.0007, "reward": 3.1250000596046448, "reward_std": 0.4123322442173958, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2500000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 150 }, { "completion_length": 196.87500381469727, "epoch": 0.08082430081627191, "grad_norm": 1.9609375, "kl": 0.017903268802911043, "learning_rate": 4.037433155080215e-06, "loss": 0.0007, "reward": 2.4166667461395264, "reward_std": 0.8529610484838486, "rewards/correctness_reward_func": 1.2500000521540642, "rewards/int_reward_func": 0.416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25000000931322575, "rewards/xmlcount_reward_func": 0.5, "step": 151 }, { "completion_length": 203.6666717529297, "epoch": 0.08135956108657835, "grad_norm": 1.8515625, "kl": 0.0095352737698704, "learning_rate": 4.064171122994653e-06, "loss": 0.0004, "reward": 2.8125000596046448, "reward_std": 0.7167538553476334, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1041666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 152 }, { "completion_length": 156.12500381469727, "epoch": 0.08189482135688479, "grad_norm": 1.484375, "kl": 0.012113512842915952, "learning_rate": 4.0909090909090915e-06, "loss": 0.0005, "reward": 3.1875000596046448, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 153 }, { "completion_length": 140.0416717529297, "epoch": 0.08243008162719122, "grad_norm": 2.15625, "kl": 0.02159164287149906, "learning_rate": 4.11764705882353e-06, "loss": 0.0009, "reward": 3.0000001192092896, "reward_std": 0.5643851608037949, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 154 }, { "completion_length": 202.50000762939453, "epoch": 0.08296534189749766, "grad_norm": 1.59375, "kl": 0.01972877373918891, "learning_rate": 4.144385026737968e-06, "loss": 0.0008, "reward": 2.3750001192092896, "reward_std": 0.6778506711125374, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 155 }, { "completion_length": 165.33333778381348, "epoch": 0.0835006021678041, "grad_norm": 2.28125, "kl": 0.028622428653761744, "learning_rate": 4.171122994652407e-06, "loss": 0.0011, "reward": 2.282666653394699, "reward_std": 1.0192717239260674, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1458333358168602, "rewards/xmlcount_reward_func": 0.4701666682958603, "step": 156 }, { "completion_length": 123.58333778381348, "epoch": 0.08403586243811054, "grad_norm": 2.53125, "kl": 0.02671874687075615, "learning_rate": 4.197860962566845e-06, "loss": 0.0011, "reward": 2.9166667461395264, "reward_std": 0.8969832062721252, "rewards/correctness_reward_func": 1.6666667461395264, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 157 }, { "completion_length": 153.6666717529297, "epoch": 0.08457112270841696, "grad_norm": 2.359375, "kl": 0.020442907931283116, "learning_rate": 4.224598930481284e-06, "loss": 0.0008, "reward": 2.531250089406967, "reward_std": 0.547564685344696, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666753590107, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 158 }, { "completion_length": 143.62500381469727, "epoch": 0.0851063829787234, "grad_norm": 1.921875, "kl": 0.030001087579876184, "learning_rate": 4.251336898395722e-06, "loss": 0.0012, "reward": 2.8095834255218506, "reward_std": 0.577202744781971, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333358168602, "rewards/xmlcount_reward_func": 0.4970833361148834, "step": 159 }, { "completion_length": 143.125, "epoch": 0.08564164324902984, "grad_norm": 1.8046875, "kl": 0.02288861945271492, "learning_rate": 4.2780748663101604e-06, "loss": 0.0009, "reward": 2.8125, "reward_std": 0.6792502254247665, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25000000186264515, "rewards/xmlcount_reward_func": 0.5, "step": 160 }, { "completion_length": 175.7083396911621, "epoch": 0.08617690351933628, "grad_norm": 2.078125, "kl": 0.018278248608112335, "learning_rate": 4.304812834224599e-06, "loss": 0.0007, "reward": 2.0726667046546936, "reward_std": 0.8482229933142662, "rewards/correctness_reward_func": 0.916666679084301, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2083333358168602, "rewards/xmlcount_reward_func": 0.4893333315849304, "step": 161 }, { "completion_length": 148.04166984558105, "epoch": 0.08671216378964271, "grad_norm": 2.390625, "kl": 0.025503937155008316, "learning_rate": 4.331550802139038e-06, "loss": 0.001, "reward": 2.6666667461395264, "reward_std": 0.6222646199166775, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333334140479565, "rewards/xmlcount_reward_func": 0.5, "step": 162 }, { "completion_length": 148.5416717529297, "epoch": 0.08724742405994915, "grad_norm": 2.65625, "kl": 0.03295175568200648, "learning_rate": 4.3582887700534766e-06, "loss": 0.0013, "reward": 1.6875000298023224, "reward_std": 0.28862859681248665, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 163 }, { "completion_length": 161.5416717529297, "epoch": 0.08778268433025559, "grad_norm": 1.140625, "kl": 0.021444957936182618, "learning_rate": 4.385026737967915e-06, "loss": 0.0009, "reward": 2.6875000596046448, "reward_std": 0.496665894985199, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 164 }, { "completion_length": 132.0833396911621, "epoch": 0.08831794460056203, "grad_norm": 4.125, "kl": 0.08498809393495321, "learning_rate": 4.411764705882353e-06, "loss": 0.0034, "reward": 3.161458373069763, "reward_std": 0.6175251640379429, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4739583358168602, "step": 165 }, { "completion_length": 119.50000190734863, "epoch": 0.08885320487086847, "grad_norm": 2.46875, "kl": 0.04442449565976858, "learning_rate": 4.438502673796792e-06, "loss": 0.0018, "reward": 2.7916667461395264, "reward_std": 0.8464668020606041, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.5, "step": 166 }, { "completion_length": 150.41666793823242, "epoch": 0.08938846514117489, "grad_norm": 1.96875, "kl": 0.020863166078925133, "learning_rate": 4.46524064171123e-06, "loss": 0.0008, "reward": 2.958333432674408, "reward_std": 0.7232691049575806, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 167 }, { "completion_length": 132.20833778381348, "epoch": 0.08992372541148133, "grad_norm": 1.4296875, "kl": 0.024739216547459364, "learning_rate": 4.491978609625669e-06, "loss": 0.001, "reward": 3.333333373069763, "reward_std": 0.12909945845603943, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 168 }, { "completion_length": 172.33333778381348, "epoch": 0.09045898568178777, "grad_norm": 2.375, "kl": 0.030473611317574978, "learning_rate": 4.518716577540107e-06, "loss": 0.0012, "reward": 2.775750070810318, "reward_std": 0.27488668262958527, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333334140479565, "rewards/xmlcount_reward_func": 0.4424166679382324, "step": 169 }, { "completion_length": 114.875, "epoch": 0.09099424595209421, "grad_norm": 2.765625, "kl": 0.032696583308279514, "learning_rate": 4.5454545454545455e-06, "loss": 0.0013, "reward": 3.036458432674408, "reward_std": 0.6265082620084286, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 170 }, { "completion_length": 186.7916717529297, "epoch": 0.09152950622240064, "grad_norm": 2.734375, "kl": 0.05477259890176356, "learning_rate": 4.572192513368984e-06, "loss": 0.0022, "reward": 2.830708384513855, "reward_std": 0.8129361271858215, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333395421505, "rewards/xmlcount_reward_func": 0.4348750039935112, "step": 171 }, { "completion_length": 132.04167366027832, "epoch": 0.09206476649270708, "grad_norm": 2.71875, "kl": 0.06303630210459232, "learning_rate": 4.598930481283423e-06, "loss": 0.0025, "reward": 3.1875000596046448, "reward_std": 0.4721617363393307, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 172 }, { "completion_length": 144.54167556762695, "epoch": 0.09260002676301352, "grad_norm": 2.28125, "kl": 0.03534765588119626, "learning_rate": 4.625668449197862e-06, "loss": 0.0014, "reward": 3.020833432674408, "reward_std": 0.4301304928958416, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.5, "step": 173 }, { "completion_length": 110.5000057220459, "epoch": 0.09313528703331996, "grad_norm": 1.4765625, "kl": 0.030329300556331873, "learning_rate": 4.6524064171123e-06, "loss": 0.0012, "reward": 3.270833373069763, "reward_std": 0.38332105800509453, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 174 }, { "completion_length": 159.04166793823242, "epoch": 0.0936705473036264, "grad_norm": 1.34375, "kl": 0.029189520981162786, "learning_rate": 4.6791443850267385e-06, "loss": 0.0012, "reward": 3.0416666865348816, "reward_std": 0.4541241377592087, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 175 }, { "completion_length": 153.7083396911621, "epoch": 0.09420580757393282, "grad_norm": 2.1875, "kl": 0.023249680642038584, "learning_rate": 4.705882352941177e-06, "loss": 0.0009, "reward": 2.8125001192092896, "reward_std": 0.8104839585721493, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250000186264515, "rewards/xmlcount_reward_func": 0.5, "step": 176 }, { "completion_length": 196.79167556762695, "epoch": 0.09474106784423926, "grad_norm": 2.1875, "kl": 0.023467288352549076, "learning_rate": 4.732620320855615e-06, "loss": 0.0009, "reward": 2.2916666865348816, "reward_std": 0.6971899420022964, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333395421505, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 177 }, { "completion_length": 133.58334159851074, "epoch": 0.0952763281145457, "grad_norm": 2.3125, "kl": 0.0329542844556272, "learning_rate": 4.759358288770054e-06, "loss": 0.0013, "reward": 2.9010416865348816, "reward_std": 0.6468523591756821, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.484375, "step": 178 }, { "completion_length": 131.9166717529297, "epoch": 0.09581158838485214, "grad_norm": 1.71875, "kl": 0.028344920370727777, "learning_rate": 4.786096256684493e-06, "loss": 0.0011, "reward": 2.7760417461395264, "reward_std": 0.6638420633971691, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 179 }, { "completion_length": 168.2083396911621, "epoch": 0.09634684865515857, "grad_norm": 1.5546875, "kl": 0.020674246130511165, "learning_rate": 4.812834224598931e-06, "loss": 0.0008, "reward": 2.7291666865348816, "reward_std": 0.8422547429800034, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 180 }, { "completion_length": 99.12500190734863, "epoch": 0.096882108925465, "grad_norm": 2.015625, "kl": 0.029507741332054138, "learning_rate": 4.839572192513369e-06, "loss": 0.0012, "reward": 3.2812500596046448, "reward_std": 0.3093881160020828, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4895833432674408, "step": 181 }, { "completion_length": 160.16667366027832, "epoch": 0.09741736919577144, "grad_norm": 1.171875, "kl": 0.026030527194961905, "learning_rate": 4.866310160427808e-06, "loss": 0.001, "reward": 2.958333373069763, "reward_std": 0.35120461508631706, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2291666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 182 }, { "completion_length": 178.00000381469727, "epoch": 0.09795262946607788, "grad_norm": 1.4140625, "kl": 0.03037263359874487, "learning_rate": 4.893048128342247e-06, "loss": 0.0012, "reward": 2.708333373069763, "reward_std": 0.5456972420215607, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 183 }, { "completion_length": 138.83333778381348, "epoch": 0.09848788973638432, "grad_norm": 2.375, "kl": 0.024358084425330162, "learning_rate": 4.919786096256685e-06, "loss": 0.001, "reward": 2.9479167461395264, "reward_std": 0.669994905591011, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 184 }, { "completion_length": 134.33333778381348, "epoch": 0.09902315000669075, "grad_norm": 0.8828125, "kl": 0.027697773184627295, "learning_rate": 4.9465240641711236e-06, "loss": 0.0011, "reward": 3.4791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 185 }, { "completion_length": 141.70833587646484, "epoch": 0.09955841027699719, "grad_norm": 2.546875, "kl": 0.04492489667609334, "learning_rate": 4.973262032085562e-06, "loss": 0.0018, "reward": 2.4795000553131104, "reward_std": 0.7360228151082993, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333334885537624, "rewards/xmlcount_reward_func": 0.4794999957084656, "step": 186 }, { "completion_length": 104.79166793823242, "epoch": 0.10009367054730363, "grad_norm": 1.2109375, "kl": 0.03745970083400607, "learning_rate": 5e-06, "loss": 0.0015, "reward": 2.5, "reward_std": 0.2622022032737732, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 187 }, { "completion_length": 137.0833396911621, "epoch": 0.10062893081761007, "grad_norm": 2.5, "kl": 0.030969139654189348, "learning_rate": 4.999995634095768e-06, "loss": 0.0012, "reward": 3.2447916865348816, "reward_std": 0.3546534702181816, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 188 }, { "completion_length": 148.04166984558105, "epoch": 0.1011641910879165, "grad_norm": 2.359375, "kl": 0.028358498588204384, "learning_rate": 4.999982536398319e-06, "loss": 0.0011, "reward": 2.7916666865348816, "reward_std": 0.6756742745637894, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666753590107, "rewards/xmlcount_reward_func": 0.5, "step": 189 }, { "completion_length": 195.62500762939453, "epoch": 0.10169945135822293, "grad_norm": 1.875, "kl": 0.0333657874725759, "learning_rate": 4.9999607069534e-06, "loss": 0.0013, "reward": 1.7708334028720856, "reward_std": 0.8301980048418045, "rewards/correctness_reward_func": 0.666666679084301, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 190 }, { "completion_length": 203.08333587646484, "epoch": 0.10223471162852937, "grad_norm": 2.109375, "kl": 0.021336913108825684, "learning_rate": 4.999930145837254e-06, "loss": 0.0009, "reward": 2.447916716337204, "reward_std": 0.7104772366583347, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333334885537624, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 191 }, { "completion_length": 155.75000381469727, "epoch": 0.10276997189883581, "grad_norm": 2.3125, "kl": 0.031167209148406982, "learning_rate": 4.999890853156626e-06, "loss": 0.0012, "reward": 2.520833432674408, "reward_std": 1.0309316962957382, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 192 }, { "completion_length": 144.79166984558105, "epoch": 0.10330523216914224, "grad_norm": 2.765625, "kl": 0.035610498394817114, "learning_rate": 4.999842829048751e-06, "loss": 0.0014, "reward": 2.7500000596046448, "reward_std": 0.9765221327543259, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 193 }, { "completion_length": 208.25000381469727, "epoch": 0.10384049243944868, "grad_norm": 1.625, "kl": 0.021585837937891483, "learning_rate": 4.999786073681365e-06, "loss": 0.0009, "reward": 2.333333373069763, "reward_std": 0.8039402067661285, "rewards/correctness_reward_func": 1.0000000447034836, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 194 }, { "completion_length": 125.16667175292969, "epoch": 0.10437575270975512, "grad_norm": 1.8984375, "kl": 0.028131581842899323, "learning_rate": 4.9997205872526996e-06, "loss": 0.0011, "reward": 3.020833373069763, "reward_std": 0.6476409286260605, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 195 }, { "completion_length": 137.54166793823242, "epoch": 0.10491101298006156, "grad_norm": 1.9921875, "kl": 0.03637383785098791, "learning_rate": 4.9996463699914795e-06, "loss": 0.0015, "reward": 3.270833373069763, "reward_std": 0.30922994017601013, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666753590107, "rewards/xmlcount_reward_func": 0.5, "step": 196 }, { "completion_length": 167.3333396911621, "epoch": 0.105446273250368, "grad_norm": 2.078125, "kl": 0.028204525355249643, "learning_rate": 4.9995634221569264e-06, "loss": 0.0011, "reward": 2.9166667461395264, "reward_std": 0.9102587252855301, "rewards/correctness_reward_func": 1.5833334028720856, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 197 }, { "completion_length": 151.83333778381348, "epoch": 0.10598153352067442, "grad_norm": 1.203125, "kl": 0.025021064560860395, "learning_rate": 4.9994717440387545e-06, "loss": 0.001, "reward": 2.9791666865348816, "reward_std": 0.3012026697397232, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 198 }, { "completion_length": 122.87500381469727, "epoch": 0.10651679379098086, "grad_norm": 3.109375, "kl": 0.04365252796560526, "learning_rate": 4.999371335957167e-06, "loss": 0.0017, "reward": 3.151041805744171, "reward_std": 0.6672081500291824, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 199 }, { "completion_length": 104.16667175292969, "epoch": 0.1070520540612873, "grad_norm": 3.3125, "kl": 0.057026736438274384, "learning_rate": 4.999262198262866e-06, "loss": 0.0023, "reward": 3.0000000596046448, "reward_std": 0.5163978338241577, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 200 }, { "completion_length": 141.33333778381348, "epoch": 0.10758731433159374, "grad_norm": 1.7109375, "kl": 0.03623780608177185, "learning_rate": 4.999144331337035e-06, "loss": 0.0014, "reward": 3.125, "reward_std": 0.3869306445121765, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 201 }, { "completion_length": 150.00000381469727, "epoch": 0.10812257460190017, "grad_norm": 1.828125, "kl": 0.02583132265135646, "learning_rate": 4.999017735591354e-06, "loss": 0.001, "reward": 2.895833373069763, "reward_std": 0.6385845988988876, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 202 }, { "completion_length": 125.04167175292969, "epoch": 0.10865783487220661, "grad_norm": 9.125, "kl": 0.09583986504003406, "learning_rate": 4.998882411467984e-06, "loss": 0.0038, "reward": 3.1718750596046448, "reward_std": 0.6912017911672592, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 203 }, { "completion_length": 160.25000762939453, "epoch": 0.10919309514251305, "grad_norm": 2.125, "kl": 0.03802371025085449, "learning_rate": 4.998738359439576e-06, "loss": 0.0015, "reward": 2.375000089406967, "reward_std": 0.9705919325351715, "rewards/correctness_reward_func": 1.0833333805203438, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 204 }, { "completion_length": 111.91666793823242, "epoch": 0.10972835541281949, "grad_norm": 2.671875, "kl": 0.04831968434154987, "learning_rate": 4.998585580009266e-06, "loss": 0.0019, "reward": 2.8958334028720856, "reward_std": 0.5064464919269085, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 205 }, { "completion_length": 129.2916717529297, "epoch": 0.11026361568312593, "grad_norm": 2.078125, "kl": 0.04842359526082873, "learning_rate": 4.998424073710667e-06, "loss": 0.0019, "reward": 3.208333373069763, "reward_std": 0.5172697007656097, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 206 }, { "completion_length": 101.54166984558105, "epoch": 0.11079887595343235, "grad_norm": 4.8125, "kl": 0.08093613479286432, "learning_rate": 4.998253841107877e-06, "loss": 0.0032, "reward": 2.4947917461395264, "reward_std": 0.8822575807571411, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 207 }, { "completion_length": 138.5416717529297, "epoch": 0.11133413622373879, "grad_norm": 1.765625, "kl": 0.03517375607043505, "learning_rate": 4.998074882795473e-06, "loss": 0.0014, "reward": 3.083333373069763, "reward_std": 0.5884110182523727, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 208 }, { "completion_length": 124.20833969116211, "epoch": 0.11186939649404523, "grad_norm": 2.734375, "kl": 0.04947302211076021, "learning_rate": 4.997887199398504e-06, "loss": 0.002, "reward": 3.1250000596046448, "reward_std": 0.7061345875263214, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 209 }, { "completion_length": 103.00000381469727, "epoch": 0.11240465676435167, "grad_norm": 1.1796875, "kl": 0.031111895572394133, "learning_rate": 4.997690791572498e-06, "loss": 0.0012, "reward": 3.145833373069763, "reward_std": 0.27857524156570435, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 210 }, { "completion_length": 141.79166984558105, "epoch": 0.1129399170346581, "grad_norm": 3.609375, "kl": 0.10411204094998538, "learning_rate": 4.997485660003453e-06, "loss": 0.0042, "reward": 2.726250022649765, "reward_std": 0.46303591132164, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333358168602, "rewards/xmlcount_reward_func": 0.4970833361148834, "step": 211 }, { "completion_length": 107.3750057220459, "epoch": 0.11347517730496454, "grad_norm": 2.890625, "kl": 0.048455359414219856, "learning_rate": 4.997271805407836e-06, "loss": 0.0019, "reward": 3.2187500596046448, "reward_std": 0.5164157301187515, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 212 }, { "completion_length": 147.1250057220459, "epoch": 0.11401043757527098, "grad_norm": 2.359375, "kl": 0.03473258297890425, "learning_rate": 4.997049228532583e-06, "loss": 0.0014, "reward": 2.4375001192092896, "reward_std": 0.9079003632068634, "rewards/correctness_reward_func": 1.000000037252903, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 213 }, { "completion_length": 119.20833396911621, "epoch": 0.11454569784557742, "grad_norm": 2.109375, "kl": 0.060375045984983444, "learning_rate": 4.996817930155094e-06, "loss": 0.0024, "reward": 2.8645834028720856, "reward_std": 0.5899006575345993, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 214 }, { "completion_length": 110.95833778381348, "epoch": 0.11508095811588386, "grad_norm": 2.09375, "kl": 0.07388751022517681, "learning_rate": 4.996577911083228e-06, "loss": 0.003, "reward": 3.0416667461395264, "reward_std": 0.425990492105484, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 215 }, { "completion_length": 151.2083339691162, "epoch": 0.11561621838619028, "grad_norm": 25.0, "kl": 0.05382871255278587, "learning_rate": 4.996329172155307e-06, "loss": 0.0022, "reward": 2.6562500596046448, "reward_std": 0.5608328096568584, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4479166716337204, "step": 216 }, { "completion_length": 115.50000381469727, "epoch": 0.11615147865649672, "grad_norm": 3.375, "kl": 0.07328876806423068, "learning_rate": 4.996071714240108e-06, "loss": 0.0029, "reward": 3.020833343267441, "reward_std": 0.30094872415065765, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 217 }, { "completion_length": 164.125, "epoch": 0.11668673892680316, "grad_norm": 1.8203125, "kl": 0.04349048109725118, "learning_rate": 4.995805538236858e-06, "loss": 0.0017, "reward": 2.6510416865348816, "reward_std": 0.46429676935076714, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 218 }, { "completion_length": 148.95833587646484, "epoch": 0.1172219991971096, "grad_norm": 1.984375, "kl": 0.028181300032883883, "learning_rate": 4.995530645075237e-06, "loss": 0.0011, "reward": 2.3541667461395264, "reward_std": 0.8104839585721493, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 219 }, { "completion_length": 147.70833778381348, "epoch": 0.11775725946741603, "grad_norm": 1.8515625, "kl": 0.027232277672737837, "learning_rate": 4.9952470357153715e-06, "loss": 0.0011, "reward": 2.520833373069763, "reward_std": 0.4765267074108124, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3541666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 220 }, { "completion_length": 135.66666793823242, "epoch": 0.11829251973772247, "grad_norm": 2.109375, "kl": 0.03699612431228161, "learning_rate": 4.9949547111478295e-06, "loss": 0.0015, "reward": 3.208333432674408, "reward_std": 0.538923554122448, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 221 }, { "completion_length": 153.8333396911621, "epoch": 0.1188277800080289, "grad_norm": 2.84375, "kl": 0.09540390037000179, "learning_rate": 4.994653672393622e-06, "loss": 0.0038, "reward": 2.8750000596046448, "reward_std": 0.8296718895435333, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 222 }, { "completion_length": 141.62500190734863, "epoch": 0.11936304027833534, "grad_norm": 2.765625, "kl": 0.043064896017313004, "learning_rate": 4.99434392050419e-06, "loss": 0.0017, "reward": 2.708333373069763, "reward_std": 0.7524303495883942, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 223 }, { "completion_length": 135.00000381469727, "epoch": 0.11989830054864177, "grad_norm": 2.484375, "kl": 0.04069590661674738, "learning_rate": 4.994025456561415e-06, "loss": 0.0016, "reward": 3.083333432674408, "reward_std": 0.7696890532970428, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 224 }, { "completion_length": 177.62500190734863, "epoch": 0.12043356081894821, "grad_norm": 2.15625, "kl": 0.03938570665195584, "learning_rate": 4.993698281677603e-06, "loss": 0.0016, "reward": 2.4791667461395264, "reward_std": 0.584863156080246, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 225 }, { "completion_length": 154.7916717529297, "epoch": 0.12096882108925465, "grad_norm": 1.4375, "kl": 0.023624973371624947, "learning_rate": 4.993362396995484e-06, "loss": 0.0009, "reward": 2.968000113964081, "reward_std": 0.7112085819244385, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.48883333057165146, "step": 226 }, { "completion_length": 204.00000381469727, "epoch": 0.12150408135956109, "grad_norm": 2.171875, "kl": 0.0315577844157815, "learning_rate": 4.993017803688211e-06, "loss": 0.0013, "reward": 1.9062500298023224, "reward_std": 0.6422140449285507, "rewards/correctness_reward_func": 0.6666666939854622, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250001303851604, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 227 }, { "completion_length": 153.83333778381348, "epoch": 0.12203934162986753, "grad_norm": 1.890625, "kl": 0.04619473172351718, "learning_rate": 4.992664502959351e-06, "loss": 0.0018, "reward": 2.770833432674408, "reward_std": 0.4689341187477112, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 228 }, { "completion_length": 142.33333587646484, "epoch": 0.12257460190017395, "grad_norm": 2.296875, "kl": 0.04707263316959143, "learning_rate": 4.99230249604289e-06, "loss": 0.0019, "reward": 2.833333373069763, "reward_std": 0.8086781054735184, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 229 }, { "completion_length": 131.6666717529297, "epoch": 0.1231098621704804, "grad_norm": 1.703125, "kl": 0.050373317673802376, "learning_rate": 4.991931784203215e-06, "loss": 0.002, "reward": 3.0795000195503235, "reward_std": 0.5383563190698624, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333395421505, "rewards/xmlcount_reward_func": 0.4961666688323021, "step": 230 }, { "completion_length": 133.08333587646484, "epoch": 0.12364512244078683, "grad_norm": 1.4921875, "kl": 0.03360940143465996, "learning_rate": 4.991552368735119e-06, "loss": 0.0013, "reward": 2.7500000596046448, "reward_std": 0.3624359928071499, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 231 }, { "completion_length": 114.95833587646484, "epoch": 0.12418038271109327, "grad_norm": 3.0, "kl": 0.09261773619800806, "learning_rate": 4.991164250963799e-06, "loss": 0.0037, "reward": 3.083333432674408, "reward_std": 0.7157893627882004, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 232 }, { "completion_length": 146.91666984558105, "epoch": 0.1247156429813997, "grad_norm": 2.1875, "kl": 0.03772323578596115, "learning_rate": 4.990767432244839e-06, "loss": 0.0015, "reward": 2.9791667461395264, "reward_std": 0.6879045069217682, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 233 }, { "completion_length": 116.29166984558105, "epoch": 0.12525090325170615, "grad_norm": 2.09375, "kl": 0.024761986453086138, "learning_rate": 4.990361913964221e-06, "loss": 0.001, "reward": 2.4791667461395264, "reward_std": 0.9406071752309799, "rewards/correctness_reward_func": 1.000000037252903, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 234 }, { "completion_length": 120.33333587646484, "epoch": 0.12578616352201258, "grad_norm": 1.5390625, "kl": 0.02583275781944394, "learning_rate": 4.989947697538305e-06, "loss": 0.001, "reward": 2.9791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 235 }, { "completion_length": 147.62500381469727, "epoch": 0.126321423792319, "grad_norm": 2.46875, "kl": 0.033338344655930996, "learning_rate": 4.989524784413835e-06, "loss": 0.0013, "reward": 2.645833432674408, "reward_std": 0.7206298857927322, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 236 }, { "completion_length": 152.75000762939453, "epoch": 0.12685668406262546, "grad_norm": 1.9609375, "kl": 0.028460218803957105, "learning_rate": 4.98909317606793e-06, "loss": 0.0011, "reward": 2.7500000596046448, "reward_std": 0.3881702572107315, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 237 }, { "completion_length": 157.2916717529297, "epoch": 0.12739194433293188, "grad_norm": 2.03125, "kl": 0.029248481849208474, "learning_rate": 4.98865287400808e-06, "loss": 0.0012, "reward": 3.0104167461395264, "reward_std": 0.6388503015041351, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 238 }, { "completion_length": 136.70833587646484, "epoch": 0.12792720460323834, "grad_norm": 2.109375, "kl": 0.038170427549630404, "learning_rate": 4.988203879772136e-06, "loss": 0.0015, "reward": 2.854166716337204, "reward_std": 0.6508470773696899, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 239 }, { "completion_length": 131.41667366027832, "epoch": 0.12846246487354476, "grad_norm": 2.546875, "kl": 0.03704935172572732, "learning_rate": 4.987746194928311e-06, "loss": 0.0015, "reward": 2.8541667461395264, "reward_std": 0.8091580979526043, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 240 }, { "completion_length": 163.2083396911621, "epoch": 0.1289977251438512, "grad_norm": 1.796875, "kl": 0.03925035800784826, "learning_rate": 4.9872798210751725e-06, "loss": 0.0016, "reward": 2.4375000596046448, "reward_std": 0.815828587859869, "rewards/correctness_reward_func": 1.0000000074505806, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 241 }, { "completion_length": 168.9583396911621, "epoch": 0.12953298541415764, "grad_norm": 1.703125, "kl": 0.02837916323915124, "learning_rate": 4.986804759841635e-06, "loss": 0.0011, "reward": 2.833333373069763, "reward_std": 0.3236204944550991, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 242 }, { "completion_length": 149.8750057220459, "epoch": 0.13006824568446407, "grad_norm": 1.8203125, "kl": 0.06834348477423191, "learning_rate": 4.986321012886956e-06, "loss": 0.0027, "reward": 3.070833384990692, "reward_std": 0.3826806955039501, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.48750000447034836, "step": 243 }, { "completion_length": 139.2500057220459, "epoch": 0.1306035059547705, "grad_norm": 1.59375, "kl": 0.040616510435938835, "learning_rate": 4.98582858190073e-06, "loss": 0.0016, "reward": 2.895833373069763, "reward_std": 0.5750639587640762, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 244 }, { "completion_length": 193.5416717529297, "epoch": 0.13113876622507695, "grad_norm": 1.625, "kl": 0.030194721184670925, "learning_rate": 4.985327468602881e-06, "loss": 0.0012, "reward": 2.802083432674408, "reward_std": 0.816511832177639, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250000558793545, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 245 }, { "completion_length": 137.37500381469727, "epoch": 0.13167402649538337, "grad_norm": 1.7265625, "kl": 0.0411061723716557, "learning_rate": 4.984817674743661e-06, "loss": 0.0016, "reward": 2.8125000596046448, "reward_std": 0.6043404638767242, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 246 }, { "completion_length": 145.75000381469727, "epoch": 0.13220928676568983, "grad_norm": 1.90625, "kl": 0.039873102214187384, "learning_rate": 4.984299202103638e-06, "loss": 0.0016, "reward": 2.9166666865348816, "reward_std": 0.6238463968038559, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 247 }, { "completion_length": 103.16666793823242, "epoch": 0.13274454703599625, "grad_norm": 2.109375, "kl": 0.03916989779099822, "learning_rate": 4.9837720524936935e-06, "loss": 0.0016, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 248 }, { "completion_length": 152.0833396911621, "epoch": 0.13327980730630268, "grad_norm": 1.4296875, "kl": 0.022294364403933287, "learning_rate": 4.983236227755015e-06, "loss": 0.0009, "reward": 2.833333373069763, "reward_std": 0.7361843436956406, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 249 }, { "completion_length": 140.79166984558105, "epoch": 0.13381506757660913, "grad_norm": 2.625, "kl": 0.023324530571699142, "learning_rate": 4.98269172975909e-06, "loss": 0.0009, "reward": 2.6666667461395264, "reward_std": 0.7144345194101334, "rewards/correctness_reward_func": 1.2500000596046448, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 250 }, { "completion_length": 145.9583396911621, "epoch": 0.13435032784691556, "grad_norm": 1.8984375, "kl": 0.0366662573069334, "learning_rate": 4.982138560407701e-06, "loss": 0.0015, "reward": 3.208333373069763, "reward_std": 0.4392816424369812, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 251 }, { "completion_length": 163.04167556762695, "epoch": 0.134885588117222, "grad_norm": 1.828125, "kl": 0.015792567282915115, "learning_rate": 4.9815767216329145e-06, "loss": 0.0006, "reward": 2.5000000596046448, "reward_std": 0.7361843585968018, "rewards/correctness_reward_func": 1.0000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 252 }, { "completion_length": 120.37500190734863, "epoch": 0.13542084838752844, "grad_norm": 1.671875, "kl": 0.02686859155073762, "learning_rate": 4.981006215397077e-06, "loss": 0.0011, "reward": 3.4479166865348816, "reward_std": 0.12757760286331177, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 253 }, { "completion_length": 132.12500381469727, "epoch": 0.13595610865783486, "grad_norm": 1.453125, "kl": 0.029003456234931946, "learning_rate": 4.980427043692809e-06, "loss": 0.0012, "reward": 3.0625, "reward_std": 0.5670122802257538, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 254 }, { "completion_length": 116.25000381469727, "epoch": 0.13649136892814132, "grad_norm": 2.375, "kl": 0.03643447207286954, "learning_rate": 4.979839208542999e-06, "loss": 0.0015, "reward": 3.2291667461395264, "reward_std": 0.5133541040122509, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 255 }, { "completion_length": 141.1666717529297, "epoch": 0.13702662919844774, "grad_norm": 2.078125, "kl": 0.04122666455805302, "learning_rate": 4.979242712000792e-06, "loss": 0.0016, "reward": 2.958333432674408, "reward_std": 0.7841716669499874, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 256 }, { "completion_length": 116.50000381469727, "epoch": 0.1375618894687542, "grad_norm": 0.91015625, "kl": 0.036849388387054205, "learning_rate": 4.978637556149582e-06, "loss": 0.0015, "reward": 3.4791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 257 }, { "completion_length": 118.50000381469727, "epoch": 0.13809714973906062, "grad_norm": 1.890625, "kl": 0.032016648445278406, "learning_rate": 4.978023743103017e-06, "loss": 0.0013, "reward": 3.145833373069763, "reward_std": 0.5290164947509766, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 258 }, { "completion_length": 184.2916717529297, "epoch": 0.13863241000936705, "grad_norm": 1.8359375, "kl": 0.02645092085003853, "learning_rate": 4.977401275004971e-06, "loss": 0.0011, "reward": 2.250000089406967, "reward_std": 0.4426039345562458, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 259 }, { "completion_length": 131.45833778381348, "epoch": 0.1391676702796735, "grad_norm": 2.015625, "kl": 0.03592094453051686, "learning_rate": 4.976770154029556e-06, "loss": 0.0014, "reward": 3.020833432674408, "reward_std": 0.7555890046060085, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 260 }, { "completion_length": 135.91666984558105, "epoch": 0.13970293054997993, "grad_norm": 2.765625, "kl": 0.04309300798922777, "learning_rate": 4.9761303823811004e-06, "loss": 0.0017, "reward": 2.333333432674408, "reward_std": 0.89227694272995, "rewards/correctness_reward_func": 1.0000000447034836, "rewards/int_reward_func": 0.35416666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 261 }, { "completion_length": 119.50000190734863, "epoch": 0.14023819082028635, "grad_norm": 0.06640625, "kl": 0.04456313792616129, "learning_rate": 4.975481962294152e-06, "loss": 0.0018, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 262 }, { "completion_length": 131.2083396911621, "epoch": 0.1407734510905928, "grad_norm": 1.6953125, "kl": 0.035875370260328054, "learning_rate": 4.974824896033462e-06, "loss": 0.0014, "reward": 2.447916716337204, "reward_std": 0.4791714996099472, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 263 }, { "completion_length": 141.12500381469727, "epoch": 0.14130871136089923, "grad_norm": 1.7890625, "kl": 0.04122765874490142, "learning_rate": 4.97415918589398e-06, "loss": 0.0016, "reward": 2.1041666865348816, "reward_std": 0.7500797137618065, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 264 }, { "completion_length": 137.37500381469727, "epoch": 0.14184397163120568, "grad_norm": 2.171875, "kl": 0.036459858529269695, "learning_rate": 4.973484834200849e-06, "loss": 0.0015, "reward": 3.0416667461395264, "reward_std": 0.743688777089119, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 265 }, { "completion_length": 146.6250057220459, "epoch": 0.1423792319015121, "grad_norm": 1.84375, "kl": 0.04969721753150225, "learning_rate": 4.972801843309392e-06, "loss": 0.002, "reward": 2.7916667461395264, "reward_std": 0.6460228823125362, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 266 }, { "completion_length": 105.58333587646484, "epoch": 0.14291449217181854, "grad_norm": 1.0234375, "kl": 0.0363903921097517, "learning_rate": 4.972110215605108e-06, "loss": 0.0015, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 267 }, { "completion_length": 153.37500381469727, "epoch": 0.143449752442125, "grad_norm": 1.9609375, "kl": 0.041106805205345154, "learning_rate": 4.9714099535036606e-06, "loss": 0.0016, "reward": 3.3541667461395264, "reward_std": 0.31970491632819176, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 268 }, { "completion_length": 131.62500190734863, "epoch": 0.14398501271243141, "grad_norm": 1.4296875, "kl": 0.044663478154689074, "learning_rate": 4.970701059450872e-06, "loss": 0.0018, "reward": 3.145833373069763, "reward_std": 0.2837683856487274, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 269 }, { "completion_length": 105.79166984558105, "epoch": 0.14452027298273787, "grad_norm": 2.421875, "kl": 0.03859696490690112, "learning_rate": 4.969983535922712e-06, "loss": 0.0015, "reward": 3.145833373069763, "reward_std": 0.7013992667198181, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 270 }, { "completion_length": 165.45833587646484, "epoch": 0.1450555332530443, "grad_norm": 1.796875, "kl": 0.03418656159192324, "learning_rate": 4.9692573854252934e-06, "loss": 0.0014, "reward": 2.8541667461395264, "reward_std": 0.6371217370033264, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 271 }, { "completion_length": 124.70833778381348, "epoch": 0.14559079352335072, "grad_norm": 1.078125, "kl": 0.03386909421533346, "learning_rate": 4.968522610494858e-06, "loss": 0.0014, "reward": 3.333333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 272 }, { "completion_length": 138.0833339691162, "epoch": 0.14612605379365717, "grad_norm": 2.171875, "kl": 0.041910297237336636, "learning_rate": 4.967779213697771e-06, "loss": 0.0017, "reward": 2.779083400964737, "reward_std": 0.713197335600853, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.48741666972637177, "step": 273 }, { "completion_length": 120.08333969116211, "epoch": 0.1466613140639636, "grad_norm": 2.34375, "kl": 0.04539045970886946, "learning_rate": 4.967027197630513e-06, "loss": 0.0018, "reward": 2.7500001192092896, "reward_std": 1.0729063749313354, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 274 }, { "completion_length": 119.16666984558105, "epoch": 0.14719657433427003, "grad_norm": 2.1875, "kl": 0.041658067144453526, "learning_rate": 4.966266564919667e-06, "loss": 0.0017, "reward": 2.895833373069763, "reward_std": 0.7781590074300766, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 275 }, { "completion_length": 126.6250057220459, "epoch": 0.14773183460457648, "grad_norm": 1.578125, "kl": 0.022411894984543324, "learning_rate": 4.965497318221915e-06, "loss": 0.0009, "reward": 3.3125000596046448, "reward_std": 0.459279328584671, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 276 }, { "completion_length": 134.58333587646484, "epoch": 0.1482670948748829, "grad_norm": 1.1328125, "kl": 0.03880942426621914, "learning_rate": 4.964719460224019e-06, "loss": 0.0016, "reward": 2.9166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 277 }, { "completion_length": 126.3750057220459, "epoch": 0.14880235514518936, "grad_norm": 1.3125, "kl": 0.022782811895012856, "learning_rate": 4.963932993642825e-06, "loss": 0.0009, "reward": 3.020833373069763, "reward_std": 0.6005255281925201, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 278 }, { "completion_length": 138.50000381469727, "epoch": 0.14933761541549578, "grad_norm": 2.34375, "kl": 0.03208020143210888, "learning_rate": 4.963137921225241e-06, "loss": 0.0013, "reward": 2.5416667461395264, "reward_std": 0.5553287528455257, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 279 }, { "completion_length": 122.50000190734863, "epoch": 0.1498728756858022, "grad_norm": 0.83203125, "kl": 0.029957736376672983, "learning_rate": 4.962334245748237e-06, "loss": 0.0012, "reward": 3.2916666865348816, "reward_std": 0.3227486312389374, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 280 }, { "completion_length": 147.12500381469727, "epoch": 0.15040813595610866, "grad_norm": 2.21875, "kl": 0.022831235080957413, "learning_rate": 4.961521970018828e-06, "loss": 0.0009, "reward": 2.8125000596046448, "reward_std": 0.8186086416244507, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 281 }, { "completion_length": 140.50000381469727, "epoch": 0.1509433962264151, "grad_norm": 1.7265625, "kl": 0.0313710174523294, "learning_rate": 4.960701096874069e-06, "loss": 0.0013, "reward": 3.208333373069763, "reward_std": 0.4854898601770401, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 282 }, { "completion_length": 195.08333778381348, "epoch": 0.15147865649672154, "grad_norm": 1.7734375, "kl": 0.025825404096394777, "learning_rate": 4.959871629181043e-06, "loss": 0.001, "reward": 2.109375014901161, "reward_std": 0.4769704192876816, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.484375, "step": 283 }, { "completion_length": 136.3333339691162, "epoch": 0.15201391676702797, "grad_norm": 0.99609375, "kl": 0.0337559818290174, "learning_rate": 4.95903356983685e-06, "loss": 0.0014, "reward": 3.2291666865348816, "reward_std": 0.3001735806465149, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 284 }, { "completion_length": 149.25000381469727, "epoch": 0.1525491770373344, "grad_norm": 1.5078125, "kl": 0.04021261353045702, "learning_rate": 4.958186921768601e-06, "loss": 0.0016, "reward": 2.645833373069763, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 285 }, { "completion_length": 114.50000190734863, "epoch": 0.15308443730764085, "grad_norm": 1.9296875, "kl": 0.049264345318078995, "learning_rate": 4.957331687933402e-06, "loss": 0.002, "reward": 3.395833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 286 }, { "completion_length": 147.0833396911621, "epoch": 0.15361969757794727, "grad_norm": 2.140625, "kl": 0.04388295952230692, "learning_rate": 4.956467871318349e-06, "loss": 0.0018, "reward": 2.208333373069763, "reward_std": 1.1271048188209534, "rewards/correctness_reward_func": 0.8333333507180214, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 287 }, { "completion_length": 153.33333587646484, "epoch": 0.15415495784825373, "grad_norm": 1.6875, "kl": 0.034068225882947445, "learning_rate": 4.955595474940515e-06, "loss": 0.0014, "reward": 2.9375000596046448, "reward_std": 0.5253209173679352, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 288 }, { "completion_length": 149.25, "epoch": 0.15469021811856015, "grad_norm": 2.171875, "kl": 0.04076318442821503, "learning_rate": 4.954714501846938e-06, "loss": 0.0016, "reward": 3.083333432674408, "reward_std": 0.6206349097192287, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 289 }, { "completion_length": 154.7916717529297, "epoch": 0.15522547838886658, "grad_norm": 1.71875, "kl": 0.03785711620002985, "learning_rate": 4.9538249551146145e-06, "loss": 0.0015, "reward": 2.6250000596046448, "reward_std": 0.8643502295017242, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 290 }, { "completion_length": 149.54166793823242, "epoch": 0.15576073865917303, "grad_norm": 1.2578125, "kl": 0.038248957600444555, "learning_rate": 4.952926837850485e-06, "loss": 0.0015, "reward": 3.3541667461395264, "reward_std": 0.31970493495464325, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 291 }, { "completion_length": 165.25000381469727, "epoch": 0.15629599892947946, "grad_norm": 1.6875, "kl": 0.027612535748630762, "learning_rate": 4.9520201531914234e-06, "loss": 0.0011, "reward": 2.9166666865348816, "reward_std": 0.69357730448246, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 292 }, { "completion_length": 186.5416717529297, "epoch": 0.15683125919978588, "grad_norm": 1.7578125, "kl": 0.03856555838137865, "learning_rate": 4.9511049043042304e-06, "loss": 0.0015, "reward": 2.3125000596046448, "reward_std": 0.9206569194793701, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 293 }, { "completion_length": 145.58333587646484, "epoch": 0.15736651947009234, "grad_norm": 2.84375, "kl": 0.0297771948389709, "learning_rate": 4.950181094385616e-06, "loss": 0.0012, "reward": 2.6875000596046448, "reward_std": 0.9492315649986267, "rewards/correctness_reward_func": 1.2500000447034836, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 294 }, { "completion_length": 124.66666984558105, "epoch": 0.15790177974039876, "grad_norm": 1.9921875, "kl": 0.05197975039482117, "learning_rate": 4.9492487266621925e-06, "loss": 0.0021, "reward": 3.3541667461395264, "reward_std": 0.3023223206400871, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 295 }, { "completion_length": 121.95833587646484, "epoch": 0.15843704001070522, "grad_norm": 1.7265625, "kl": 0.04530602786689997, "learning_rate": 4.948307804390462e-06, "loss": 0.0018, "reward": 3.1250000596046448, "reward_std": 0.4909362643957138, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 296 }, { "completion_length": 163.5833396911621, "epoch": 0.15897230028101164, "grad_norm": 1.671875, "kl": 0.03576376102864742, "learning_rate": 4.947358330856808e-06, "loss": 0.0014, "reward": 2.291666716337204, "reward_std": 0.6950604170560837, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 297 }, { "completion_length": 113.6250057220459, "epoch": 0.15950756055131807, "grad_norm": 1.875, "kl": 0.03266171971336007, "learning_rate": 4.946400309377477e-06, "loss": 0.0014, "reward": 3.4165834188461304, "reward_std": 0.06475385317753535, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.4999166652560234, "step": 298 }, { "completion_length": 176.79166793823242, "epoch": 0.16004282082162452, "grad_norm": 1.65625, "kl": 0.03471789788454771, "learning_rate": 4.945433743298573e-06, "loss": 0.0014, "reward": 2.515500009059906, "reward_std": 0.9147455990314484, "rewards/correctness_reward_func": 1.2500000149011612, "rewards/int_reward_func": 0.33333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4946666657924652, "step": 299 }, { "completion_length": 159.7083339691162, "epoch": 0.16057808109193095, "grad_norm": 2.4375, "kl": 0.061009292490780354, "learning_rate": 4.944458635996045e-06, "loss": 0.0024, "reward": 2.6875001192092896, "reward_std": 0.9677619636058807, "rewards/correctness_reward_func": 1.333333395421505, "rewards/int_reward_func": 0.4375000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 300 }, { "completion_length": 129.8333396911621, "epoch": 0.1611133413622374, "grad_norm": 1.5078125, "kl": 0.028242026921361685, "learning_rate": 4.943474990875673e-06, "loss": 0.0011, "reward": 3.3125000596046448, "reward_std": 0.4592793434858322, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 301 }, { "completion_length": 148.87500381469727, "epoch": 0.16164860163254383, "grad_norm": 1.890625, "kl": 0.030265600653365254, "learning_rate": 4.942482811373056e-06, "loss": 0.0012, "reward": 3.0625001192092896, "reward_std": 0.7166580855846405, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 302 }, { "completion_length": 151.2500057220459, "epoch": 0.16218386190285025, "grad_norm": 2.296875, "kl": 0.03763581905514002, "learning_rate": 4.941482100953604e-06, "loss": 0.0015, "reward": 2.2916667461395264, "reward_std": 0.6810928508639336, "rewards/correctness_reward_func": 0.9166667088866234, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 303 }, { "completion_length": 123.37500190734863, "epoch": 0.1627191221731567, "grad_norm": 1.9921875, "kl": 0.03812553267925978, "learning_rate": 4.940472863112521e-06, "loss": 0.0015, "reward": 2.2875834107398987, "reward_std": 0.296846117824316, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4959166646003723, "step": 304 }, { "completion_length": 127.33333778381348, "epoch": 0.16325438244346313, "grad_norm": 1.203125, "kl": 0.03971145674586296, "learning_rate": 4.939455101374795e-06, "loss": 0.0016, "reward": 2.7291666865348816, "reward_std": 0.3001735806465149, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 305 }, { "completion_length": 131.66666984558105, "epoch": 0.16378964271376958, "grad_norm": 1.8671875, "kl": 0.036954211071133614, "learning_rate": 4.938428819295187e-06, "loss": 0.0015, "reward": 2.520833373069763, "reward_std": 0.7971479445695877, "rewards/correctness_reward_func": 1.0833333507180214, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 306 }, { "completion_length": 168.5833396911621, "epoch": 0.164324902984076, "grad_norm": 1.6953125, "kl": 0.02943408628925681, "learning_rate": 4.937394020458216e-06, "loss": 0.0012, "reward": 3.1250001192092896, "reward_std": 0.5554859936237335, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 307 }, { "completion_length": 207.83333587646484, "epoch": 0.16486016325438244, "grad_norm": 1.859375, "kl": 0.04282863391563296, "learning_rate": 4.9363507084781495e-06, "loss": 0.0017, "reward": 2.286916732788086, "reward_std": 0.894590687006712, "rewards/correctness_reward_func": 1.0000000074505806, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.49525000900030136, "step": 308 }, { "completion_length": 135.37500381469727, "epoch": 0.1653954235246889, "grad_norm": 1.359375, "kl": 0.028727824799716473, "learning_rate": 4.935298886998986e-06, "loss": 0.0011, "reward": 3.375, "reward_std": 0.25, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 309 }, { "completion_length": 177.5833396911621, "epoch": 0.16593068379499532, "grad_norm": 1.390625, "kl": 0.025000998750329018, "learning_rate": 4.934238559694448e-06, "loss": 0.001, "reward": 2.7291667461395264, "reward_std": 0.8388482332229614, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 310 }, { "completion_length": 153.2916717529297, "epoch": 0.16646594406530174, "grad_norm": 1.3046875, "kl": 0.03711768053472042, "learning_rate": 4.9331697302679645e-06, "loss": 0.0015, "reward": 3.0000000298023224, "reward_std": 0.2350260429084301, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 311 }, { "completion_length": 102.04166793823242, "epoch": 0.1670012043356082, "grad_norm": 2.3125, "kl": 0.04422319959849119, "learning_rate": 4.932092402452662e-06, "loss": 0.0018, "reward": 3.3854166865348816, "reward_std": 0.280670702457428, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 312 }, { "completion_length": 133.75000190734863, "epoch": 0.16753646460591462, "grad_norm": 1.4375, "kl": 0.03163261990994215, "learning_rate": 4.931006580011348e-06, "loss": 0.0013, "reward": 3.333333373069763, "reward_std": 0.40824829041957855, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 313 }, { "completion_length": 100.00000381469727, "epoch": 0.16807172487622107, "grad_norm": 1.203125, "kl": 0.04145035380497575, "learning_rate": 4.9299122667365e-06, "loss": 0.0017, "reward": 3.3125, "reward_std": 0.29315099120140076, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 314 }, { "completion_length": 118.04166984558105, "epoch": 0.1686069851465275, "grad_norm": 0.0859375, "kl": 0.028052533976733685, "learning_rate": 4.928809466450252e-06, "loss": 0.0011, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 315 }, { "completion_length": 132.41666984558105, "epoch": 0.16914224541683393, "grad_norm": 2.125, "kl": 0.06144689908251166, "learning_rate": 4.927698183004379e-06, "loss": 0.0025, "reward": 2.6250000596046448, "reward_std": 0.3296062760055065, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 316 }, { "completion_length": 130.50000286102295, "epoch": 0.16967750568714038, "grad_norm": 1.625, "kl": 0.0342918885871768, "learning_rate": 4.926578420280288e-06, "loss": 0.0014, "reward": 3.020833373069763, "reward_std": 0.841457188129425, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4375000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 317 }, { "completion_length": 131.16666984558105, "epoch": 0.1702127659574468, "grad_norm": 1.5625, "kl": 0.037405913695693016, "learning_rate": 4.925450182189e-06, "loss": 0.0015, "reward": 3.208333373069763, "reward_std": 0.6003471612930298, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 318 }, { "completion_length": 192.0833396911621, "epoch": 0.17074802622775326, "grad_norm": 1.453125, "kl": 0.020101528149098158, "learning_rate": 4.924313472671139e-06, "loss": 0.0008, "reward": 2.2291666865348816, "reward_std": 0.3248923234641552, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 319 }, { "completion_length": 152.12500381469727, "epoch": 0.17128328649805968, "grad_norm": 1.0703125, "kl": 0.04198923846706748, "learning_rate": 4.923168295696917e-06, "loss": 0.0017, "reward": 2.854166716337204, "reward_std": 0.4421939253807068, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 320 }, { "completion_length": 149.20833778381348, "epoch": 0.1718185467683661, "grad_norm": 0.7421875, "kl": 0.025474284309893847, "learning_rate": 4.92201465526612e-06, "loss": 0.001, "reward": 3.1666666865348816, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 321 }, { "completion_length": 157.66666793823242, "epoch": 0.17235380703867256, "grad_norm": 1.4921875, "kl": 0.02526680286973715, "learning_rate": 4.920852555408093e-06, "loss": 0.001, "reward": 2.895833373069763, "reward_std": 0.680737629532814, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 322 }, { "completion_length": 139.75000381469727, "epoch": 0.172889067308979, "grad_norm": 2.5, "kl": 0.05358001682907343, "learning_rate": 4.919682000181734e-06, "loss": 0.0021, "reward": 2.6041667759418488, "reward_std": 0.7565664201974869, "rewards/correctness_reward_func": 1.2500000596046448, "rewards/int_reward_func": 0.4375000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 323 }, { "completion_length": 145.37500190734863, "epoch": 0.17342432757928541, "grad_norm": 1.2734375, "kl": 0.03602644964121282, "learning_rate": 4.918502993675464e-06, "loss": 0.0014, "reward": 2.833333373069763, "reward_std": 0.40824830532073975, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 324 }, { "completion_length": 132.2916717529297, "epoch": 0.17395958784959187, "grad_norm": 1.296875, "kl": 0.026516761165112257, "learning_rate": 4.917315540007229e-06, "loss": 0.0011, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 325 }, { "completion_length": 135.6666717529297, "epoch": 0.1744948481198983, "grad_norm": 1.4453125, "kl": 0.03170072380453348, "learning_rate": 4.916119643324475e-06, "loss": 0.0013, "reward": 2.7916666865348816, "reward_std": 0.3441820256412029, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 326 }, { "completion_length": 112.7500057220459, "epoch": 0.17503010839020475, "grad_norm": 1.875, "kl": 0.03724998049438, "learning_rate": 4.91491530780414e-06, "loss": 0.0015, "reward": 3.145833432674408, "reward_std": 0.6625833064317703, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 327 }, { "completion_length": 136.5416717529297, "epoch": 0.17556536866051117, "grad_norm": 1.171875, "kl": 0.029606230091303587, "learning_rate": 4.913702537652634e-06, "loss": 0.0012, "reward": 3.25, "reward_std": 0.273861289024353, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 328 }, { "completion_length": 132.79166793823242, "epoch": 0.1761006289308176, "grad_norm": 1.3671875, "kl": 0.03448170889168978, "learning_rate": 4.912481337105827e-06, "loss": 0.0014, "reward": 3.1041666865348816, "reward_std": 0.5164600908756256, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 329 }, { "completion_length": 139.29166984558105, "epoch": 0.17663588920112405, "grad_norm": 2.0, "kl": 0.03615036187693477, "learning_rate": 4.911251710429034e-06, "loss": 0.0014, "reward": 2.3125000596046448, "reward_std": 0.7174782603979111, "rewards/correctness_reward_func": 0.833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 330 }, { "completion_length": 109.29166793823242, "epoch": 0.17717114947143048, "grad_norm": 1.921875, "kl": 0.03804372949525714, "learning_rate": 4.910013661917004e-06, "loss": 0.0015, "reward": 2.9791667461395264, "reward_std": 0.5674288682639599, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 331 }, { "completion_length": 134.50000381469727, "epoch": 0.17770640974173693, "grad_norm": 0.95703125, "kl": 0.051706235855817795, "learning_rate": 4.908767195893894e-06, "loss": 0.0021, "reward": 3.274958372116089, "reward_std": 0.36580392718315125, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4624583348631859, "step": 332 }, { "completion_length": 142.33333587646484, "epoch": 0.17824167001204336, "grad_norm": 2.078125, "kl": 0.05041641462594271, "learning_rate": 4.907512316713269e-06, "loss": 0.002, "reward": 2.020833373069763, "reward_std": 0.6808377355337143, "rewards/correctness_reward_func": 0.583333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 333 }, { "completion_length": 145.45833587646484, "epoch": 0.17877693028234978, "grad_norm": 1.5390625, "kl": 0.01903323526494205, "learning_rate": 4.906249028758072e-06, "loss": 0.0008, "reward": 2.708333373069763, "reward_std": 0.7714351117610931, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 334 }, { "completion_length": 139.83333778381348, "epoch": 0.17931219055265624, "grad_norm": 2.203125, "kl": 0.038641279097646475, "learning_rate": 4.9049773364406185e-06, "loss": 0.0015, "reward": 2.958333432674408, "reward_std": 0.8382464461028576, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 335 }, { "completion_length": 149.8333396911621, "epoch": 0.17984745082296266, "grad_norm": 1.4296875, "kl": 0.03192885918542743, "learning_rate": 4.90369724420258e-06, "loss": 0.0013, "reward": 3.208333373069763, "reward_std": 0.5020104348659515, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 336 }, { "completion_length": 209.75000762939453, "epoch": 0.18038271109326912, "grad_norm": 0.69140625, "kl": 0.02079350664280355, "learning_rate": 4.902408756514964e-06, "loss": 0.0008, "reward": 2.8125, "reward_std": 0.10458251088857651, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 337 }, { "completion_length": 109.70833587646484, "epoch": 0.18091797136357554, "grad_norm": 1.359375, "kl": 0.044482083059847355, "learning_rate": 4.901111877878099e-06, "loss": 0.0018, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 338 }, { "completion_length": 119.83333778381348, "epoch": 0.18145323163388197, "grad_norm": 1.0, "kl": 0.037186274304986, "learning_rate": 4.899806612821626e-06, "loss": 0.0015, "reward": 3.2291666865348816, "reward_std": 0.3001735806465149, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 339 }, { "completion_length": 135.04166793823242, "epoch": 0.18198849190418842, "grad_norm": 2.140625, "kl": 0.03643118590116501, "learning_rate": 4.898492965904475e-06, "loss": 0.0015, "reward": 3.145833432674408, "reward_std": 0.5953381061553955, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 340 }, { "completion_length": 163.2916717529297, "epoch": 0.18252375217449485, "grad_norm": 1.0625, "kl": 0.03810536675155163, "learning_rate": 4.89717094171485e-06, "loss": 0.0015, "reward": 2.4375000596046448, "reward_std": 0.11558076739311218, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 341 }, { "completion_length": 110.87500381469727, "epoch": 0.18305901244480127, "grad_norm": 2.609375, "kl": 0.08168017817661166, "learning_rate": 4.8958405448702166e-06, "loss": 0.0033, "reward": 3.208333373069763, "reward_std": 0.6582482904195786, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 342 }, { "completion_length": 116.29166984558105, "epoch": 0.18359427271510773, "grad_norm": 0.8828125, "kl": 0.030964480247348547, "learning_rate": 4.894501780017281e-06, "loss": 0.0012, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 343 }, { "completion_length": 168.2916717529297, "epoch": 0.18412953298541415, "grad_norm": 1.6328125, "kl": 0.03289065742865205, "learning_rate": 4.893154651831982e-06, "loss": 0.0013, "reward": 2.7500000298023224, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 344 }, { "completion_length": 129.58333778381348, "epoch": 0.1846647932557206, "grad_norm": 1.921875, "kl": 0.03485443163663149, "learning_rate": 4.891799165019462e-06, "loss": 0.0014, "reward": 2.958333432674408, "reward_std": 0.8382464498281479, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 345 }, { "completion_length": 120.33333396911621, "epoch": 0.18520005352602703, "grad_norm": 1.65625, "kl": 0.04113559052348137, "learning_rate": 4.890435324314064e-06, "loss": 0.0016, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 346 }, { "completion_length": 139.12500190734863, "epoch": 0.18573531379633346, "grad_norm": 2.71875, "kl": 0.06854599853977561, "learning_rate": 4.889063134479307e-06, "loss": 0.0027, "reward": 2.9375, "reward_std": 0.770716518163681, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 347 }, { "completion_length": 141.54166984558105, "epoch": 0.1862705740666399, "grad_norm": 0.94140625, "kl": 0.02241353038698435, "learning_rate": 4.887682600307868e-06, "loss": 0.0009, "reward": 3.208333373069763, "reward_std": 0.33841101825237274, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 348 }, { "completion_length": 114.79166984558105, "epoch": 0.18680583433694634, "grad_norm": 1.46875, "kl": 0.03936735028401017, "learning_rate": 4.886293726621572e-06, "loss": 0.0016, "reward": 2.895833373069763, "reward_std": 0.25515517592430115, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 349 }, { "completion_length": 118.91667175292969, "epoch": 0.1873410946072528, "grad_norm": 1.78125, "kl": 0.04522312618792057, "learning_rate": 4.884896518271371e-06, "loss": 0.0018, "reward": 2.833333373069763, "reward_std": 0.40824827551841736, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 350 }, { "completion_length": 135.08333778381348, "epoch": 0.18787635487755922, "grad_norm": 1.2734375, "kl": 0.03455492667853832, "learning_rate": 4.883490980137327e-06, "loss": 0.0014, "reward": 3.020833373069763, "reward_std": 0.41942431032657623, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 351 }, { "completion_length": 130.08333587646484, "epoch": 0.18841161514786564, "grad_norm": 2.734375, "kl": 0.033602004405111074, "learning_rate": 4.882077117128596e-06, "loss": 0.0013, "reward": 3.0000001192092896, "reward_std": 0.9246461093425751, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 352 }, { "completion_length": 150.91666793823242, "epoch": 0.1889468754181721, "grad_norm": 1.984375, "kl": 0.031116609927266836, "learning_rate": 4.88065493418341e-06, "loss": 0.0012, "reward": 2.6041666865348816, "reward_std": 0.8001735806465149, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 353 }, { "completion_length": 128.8333339691162, "epoch": 0.18948213568847852, "grad_norm": 1.734375, "kl": 0.03071408625692129, "learning_rate": 4.879224436269061e-06, "loss": 0.0012, "reward": 2.8333334028720856, "reward_std": 0.6664472073316574, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 354 }, { "completion_length": 179.3333396911621, "epoch": 0.19001739595878495, "grad_norm": 1.7265625, "kl": 0.024944405537098646, "learning_rate": 4.877785628381882e-06, "loss": 0.001, "reward": 2.995333433151245, "reward_std": 0.6755668371915817, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666679084301, "rewards/xmlcount_reward_func": 0.47450000047683716, "step": 355 }, { "completion_length": 173.66666793823242, "epoch": 0.1905526562290914, "grad_norm": 1.78125, "kl": 0.03421852085739374, "learning_rate": 4.8763385155472335e-06, "loss": 0.0014, "reward": 2.432291716337204, "reward_std": 0.8869683742523193, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 356 }, { "completion_length": 157.12500381469727, "epoch": 0.19108791649939783, "grad_norm": 2.125, "kl": 0.04010986629873514, "learning_rate": 4.874883102819477e-06, "loss": 0.0016, "reward": 3.020833432674408, "reward_std": 0.6519949287176132, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 357 }, { "completion_length": 165.7916717529297, "epoch": 0.19162317676970428, "grad_norm": 1.8203125, "kl": 0.039810370188206434, "learning_rate": 4.873419395281968e-06, "loss": 0.0016, "reward": 2.6875000596046448, "reward_std": 0.623045951128006, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 358 }, { "completion_length": 109.95833587646484, "epoch": 0.1921584370400107, "grad_norm": 1.4765625, "kl": 0.04814850306138396, "learning_rate": 4.871947398047031e-06, "loss": 0.0019, "reward": 3.3125000596046448, "reward_std": 0.40438438951969147, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 359 }, { "completion_length": 121.00000190734863, "epoch": 0.19269369731031713, "grad_norm": 2.6875, "kl": 0.0602885982953012, "learning_rate": 4.870467116255947e-06, "loss": 0.0024, "reward": 2.979166716337204, "reward_std": 0.4592793136835098, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 360 }, { "completion_length": 122.95833587646484, "epoch": 0.19322895758062358, "grad_norm": 1.9921875, "kl": 0.044847925659269094, "learning_rate": 4.86897855507893e-06, "loss": 0.0018, "reward": 2.895833432674408, "reward_std": 0.7715530209243298, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 361 }, { "completion_length": 154.25000190734863, "epoch": 0.19376421785093, "grad_norm": 1.8046875, "kl": 0.025079205399379134, "learning_rate": 4.867481719715112e-06, "loss": 0.001, "reward": 2.8695000410079956, "reward_std": 0.31965839862823486, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4945000037550926, "step": 362 }, { "completion_length": 166.75000762939453, "epoch": 0.19429947812123646, "grad_norm": 1.421875, "kl": 0.03631326276808977, "learning_rate": 4.8659766153925244e-06, "loss": 0.0015, "reward": 3.020833373069763, "reward_std": 0.5223132222890854, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 363 }, { "completion_length": 160.00000762939453, "epoch": 0.1948347383915429, "grad_norm": 1.1484375, "kl": 0.03517352696508169, "learning_rate": 4.864463247368082e-06, "loss": 0.0014, "reward": 2.916666716337204, "reward_std": 0.43686148524284363, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 364 }, { "completion_length": 145.70833587646484, "epoch": 0.19536999866184931, "grad_norm": 2.203125, "kl": 0.025485435500741005, "learning_rate": 4.862941620927559e-06, "loss": 0.001, "reward": 2.7291667461395264, "reward_std": 0.9913395643234253, "rewards/correctness_reward_func": 1.2500000447034836, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 365 }, { "completion_length": 103.95833396911621, "epoch": 0.19590525893215577, "grad_norm": 2.5625, "kl": 0.05637668911367655, "learning_rate": 4.861411741385578e-06, "loss": 0.0023, "reward": 3.3125000596046448, "reward_std": 0.40438440442085266, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 366 }, { "completion_length": 138.25000381469727, "epoch": 0.1964405192024622, "grad_norm": 2.28125, "kl": 0.036815219558775425, "learning_rate": 4.859873614085582e-06, "loss": 0.0015, "reward": 2.7916667461395264, "reward_std": 0.7334393262863159, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 367 }, { "completion_length": 123.00000190734863, "epoch": 0.19697577947276865, "grad_norm": 2.734375, "kl": 0.06413549091666937, "learning_rate": 4.8583272443998265e-06, "loss": 0.0026, "reward": 3.020833373069763, "reward_std": 0.7092793434858322, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 368 }, { "completion_length": 112.29166793823242, "epoch": 0.19751103974307507, "grad_norm": 0.107421875, "kl": 0.04899565642699599, "learning_rate": 4.856772637729352e-06, "loss": 0.002, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 369 }, { "completion_length": 134.91666793823242, "epoch": 0.1980463000133815, "grad_norm": 2.640625, "kl": 0.08906554849818349, "learning_rate": 4.8552097995039696e-06, "loss": 0.0036, "reward": 2.625000089406967, "reward_std": 0.963577076792717, "rewards/correctness_reward_func": 1.1666667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 370 }, { "completion_length": 126.83333778381348, "epoch": 0.19858156028368795, "grad_norm": 0.88671875, "kl": 0.02941274642944336, "learning_rate": 4.853638735182241e-06, "loss": 0.0012, "reward": 3.0, "reward_std": 0.19364917278289795, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 371 }, { "completion_length": 126.54167175292969, "epoch": 0.19911682055399438, "grad_norm": 1.8515625, "kl": 0.0400623818859458, "learning_rate": 4.852059450251459e-06, "loss": 0.0016, "reward": 3.2291667461395264, "reward_std": 0.5133540891110897, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 372 }, { "completion_length": 148.29166793823242, "epoch": 0.1996520808243008, "grad_norm": 2.125, "kl": 0.029276425717398524, "learning_rate": 4.850471950227631e-06, "loss": 0.0012, "reward": 2.833333432674408, "reward_std": 1.0537454932928085, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 373 }, { "completion_length": 138.7500057220459, "epoch": 0.20018734109460726, "grad_norm": 0.71875, "kl": 0.026868863962590694, "learning_rate": 4.848876240655452e-06, "loss": 0.0011, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 374 }, { "completion_length": 152.9583396911621, "epoch": 0.20072260136491368, "grad_norm": 1.53125, "kl": 0.027000244241207838, "learning_rate": 4.847272327108298e-06, "loss": 0.0011, "reward": 2.9375000596046448, "reward_std": 0.6154161691665649, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 375 }, { "completion_length": 150.87500381469727, "epoch": 0.20125786163522014, "grad_norm": 1.6953125, "kl": 0.03350049676373601, "learning_rate": 4.845660215188192e-06, "loss": 0.0013, "reward": 2.5000000298023224, "reward_std": 0.5425351560115814, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 376 }, { "completion_length": 138.00000190734863, "epoch": 0.20179312190552656, "grad_norm": 1.6328125, "kl": 0.03322783159092069, "learning_rate": 4.844039910525797e-06, "loss": 0.0013, "reward": 2.583333432674408, "reward_std": 0.6289348416030407, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 377 }, { "completion_length": 146.33333587646484, "epoch": 0.202328382175833, "grad_norm": 1.359375, "kl": 0.03654646477662027, "learning_rate": 4.8424114187803885e-06, "loss": 0.0015, "reward": 2.8125000596046448, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 378 }, { "completion_length": 161.54166984558105, "epoch": 0.20286364244613944, "grad_norm": 1.90625, "kl": 0.032768722623586655, "learning_rate": 4.8407747456398365e-06, "loss": 0.0013, "reward": 3.0000000596046448, "reward_std": 0.5796062797307968, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 379 }, { "completion_length": 135.62500190734863, "epoch": 0.20339890271644587, "grad_norm": 0.9453125, "kl": 0.0361147103831172, "learning_rate": 4.83912989682059e-06, "loss": 0.0014, "reward": 3.2291666865348816, "reward_std": 0.3001735806465149, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 380 }, { "completion_length": 123.00000190734863, "epoch": 0.20393416298675232, "grad_norm": 2.484375, "kl": 0.030705954413861036, "learning_rate": 4.837476878067649e-06, "loss": 0.0012, "reward": 3.2009167671203613, "reward_std": 0.6777066141366959, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.49258333444595337, "step": 381 }, { "completion_length": 139.33333587646484, "epoch": 0.20446942325705875, "grad_norm": 1.140625, "kl": 0.03536796988919377, "learning_rate": 4.8358156951545515e-06, "loss": 0.0014, "reward": 3.333333373069763, "reward_std": 0.2686738818883896, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 382 }, { "completion_length": 151.5833396911621, "epoch": 0.20500468352736517, "grad_norm": 1.453125, "kl": 0.02349434932693839, "learning_rate": 4.834146353883349e-06, "loss": 0.0009, "reward": 2.6875000596046448, "reward_std": 0.5792608670890331, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 383 }, { "completion_length": 140.75000381469727, "epoch": 0.20553994379767163, "grad_norm": 1.8203125, "kl": 0.02586292941123247, "learning_rate": 4.832468860084591e-06, "loss": 0.001, "reward": 2.708333373069763, "reward_std": 0.6942067444324493, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 384 }, { "completion_length": 169.2083396911621, "epoch": 0.20607520406797805, "grad_norm": 1.9765625, "kl": 0.040242417715489864, "learning_rate": 4.830783219617296e-06, "loss": 0.0016, "reward": 2.7291667759418488, "reward_std": 0.9853319078683853, "rewards/correctness_reward_func": 1.3333333879709244, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 385 }, { "completion_length": 163.58333587646484, "epoch": 0.20661046433828448, "grad_norm": 1.28125, "kl": 0.039259279146790504, "learning_rate": 4.829089438368944e-06, "loss": 0.0016, "reward": 2.9791666865348816, "reward_std": 0.6364961266517639, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 386 }, { "completion_length": 142.4166717529297, "epoch": 0.20714572460859093, "grad_norm": 1.59375, "kl": 0.027523174416273832, "learning_rate": 4.82738752225544e-06, "loss": 0.0011, "reward": 2.4375000596046448, "reward_std": 0.7460914701223373, "rewards/correctness_reward_func": 1.0833333507180214, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 387 }, { "completion_length": 154.37500381469727, "epoch": 0.20768098487889736, "grad_norm": 1.6875, "kl": 0.05612138286232948, "learning_rate": 4.825677477221109e-06, "loss": 0.0022, "reward": 2.4166667461395264, "reward_std": 0.16661180183291435, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 388 }, { "completion_length": 160.5000057220459, "epoch": 0.2082162451492038, "grad_norm": 1.8984375, "kl": 0.04402688471600413, "learning_rate": 4.823959309238665e-06, "loss": 0.0018, "reward": 2.395833373069763, "reward_std": 0.6816265136003494, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5, "step": 389 }, { "completion_length": 140.50000190734863, "epoch": 0.20875150541951024, "grad_norm": 1.7109375, "kl": 0.034618140663951635, "learning_rate": 4.822233024309193e-06, "loss": 0.0014, "reward": 2.708333373069763, "reward_std": 0.37592335790395737, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 390 }, { "completion_length": 113.45833587646484, "epoch": 0.20928676568981666, "grad_norm": 0.83203125, "kl": 0.02321735117584467, "learning_rate": 4.820498628462129e-06, "loss": 0.0009, "reward": 2.9166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 391 }, { "completion_length": 123.66666984558105, "epoch": 0.20982202596012312, "grad_norm": 1.9296875, "kl": 0.026443745708093047, "learning_rate": 4.8187561277552376e-06, "loss": 0.0011, "reward": 3.2847084403038025, "reward_std": 0.37730535864830017, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.49304167181253433, "step": 392 }, { "completion_length": 146.54166793823242, "epoch": 0.21035728623042954, "grad_norm": 2.046875, "kl": 0.050067766569554806, "learning_rate": 4.8170055282745915e-06, "loss": 0.002, "reward": 2.5000000596046448, "reward_std": 0.6325703375041485, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 393 }, { "completion_length": 143.58333587646484, "epoch": 0.210892546500736, "grad_norm": 1.9140625, "kl": 0.04486254137009382, "learning_rate": 4.815246836134551e-06, "loss": 0.0018, "reward": 2.8750000596046448, "reward_std": 0.8515234887599945, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 394 }, { "completion_length": 161.75000381469727, "epoch": 0.21142780677104242, "grad_norm": 2.453125, "kl": 0.027609082404524088, "learning_rate": 4.8134800574777415e-06, "loss": 0.0011, "reward": 2.8281250596046448, "reward_std": 0.8165152966976166, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 395 }, { "completion_length": 164.70833587646484, "epoch": 0.21196306704134885, "grad_norm": 1.625, "kl": 0.053220887668430805, "learning_rate": 4.811705198475032e-06, "loss": 0.0021, "reward": 3.145833373069763, "reward_std": 0.5137150399386883, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333334140479565, "rewards/xmlcount_reward_func": 0.5, "step": 396 }, { "completion_length": 140.75000381469727, "epoch": 0.2124983273116553, "grad_norm": 1.09375, "kl": 0.03010843973606825, "learning_rate": 4.809922265325513e-06, "loss": 0.0012, "reward": 2.7291666865348816, "reward_std": 0.35721729323267937, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 397 }, { "completion_length": 136.33333778381348, "epoch": 0.21303358758196173, "grad_norm": 1.3828125, "kl": 0.027536609675735235, "learning_rate": 4.808131264256479e-06, "loss": 0.0011, "reward": 2.645833373069763, "reward_std": 0.5042977333068848, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 398 }, { "completion_length": 153.8333396911621, "epoch": 0.21356884785226818, "grad_norm": 1.390625, "kl": 0.03445200156420469, "learning_rate": 4.806332201523399e-06, "loss": 0.0014, "reward": 2.708333373069763, "reward_std": 0.564385175704956, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 399 }, { "completion_length": 130.9166717529297, "epoch": 0.2141041081225746, "grad_norm": 1.8046875, "kl": 0.055761674884706736, "learning_rate": 4.804525083409902e-06, "loss": 0.0022, "reward": 3.3750000596046448, "reward_std": 0.3061862215399742, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 400 }, { "completion_length": 156.5833396911621, "epoch": 0.21463936839288103, "grad_norm": 1.796875, "kl": 0.028924104291945696, "learning_rate": 4.802709916227753e-06, "loss": 0.0012, "reward": 3.2916667461395264, "reward_std": 0.3347994163632393, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 401 }, { "completion_length": 98.45833587646484, "epoch": 0.21517462866318748, "grad_norm": 2.046875, "kl": 0.07245343318209052, "learning_rate": 4.800886706316828e-06, "loss": 0.0029, "reward": 3.208333373069763, "reward_std": 0.5483061634004116, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 402 }, { "completion_length": 154.0000057220459, "epoch": 0.2157098889334939, "grad_norm": 1.453125, "kl": 0.02168190269730985, "learning_rate": 4.7990554600450945e-06, "loss": 0.0009, "reward": 2.8750000298023224, "reward_std": 0.4854898750782013, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 403 }, { "completion_length": 132.45833587646484, "epoch": 0.21624514920380034, "grad_norm": 1.90625, "kl": 0.033673313446342945, "learning_rate": 4.79721618380859e-06, "loss": 0.0013, "reward": 2.65625, "reward_std": 0.4188731759786606, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 404 }, { "completion_length": 130.37500381469727, "epoch": 0.2167804094741068, "grad_norm": 1.1328125, "kl": 0.026179906912148, "learning_rate": 4.795368884031397e-06, "loss": 0.001, "reward": 3.2291666865348816, "reward_std": 0.3248923234641552, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 405 }, { "completion_length": 135.83334159851074, "epoch": 0.21731566974441321, "grad_norm": 1.90625, "kl": 0.043639494106173515, "learning_rate": 4.793513567165623e-06, "loss": 0.0017, "reward": 2.645833373069763, "reward_std": 0.5155290961265564, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 406 }, { "completion_length": 150.3333396911621, "epoch": 0.21785093001471967, "grad_norm": 1.28125, "kl": 0.028193223755806684, "learning_rate": 4.791650239691377e-06, "loss": 0.0011, "reward": 3.145833373069763, "reward_std": 0.5290164947509766, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 407 }, { "completion_length": 139.62500762939453, "epoch": 0.2183861902850261, "grad_norm": 1.1796875, "kl": 0.04055926762521267, "learning_rate": 4.7897789081167444e-06, "loss": 0.0016, "reward": 3.020833373069763, "reward_std": 0.22181354090571404, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 408 }, { "completion_length": 137.7083396911621, "epoch": 0.21892145055533252, "grad_norm": 1.5234375, "kl": 0.03536925697699189, "learning_rate": 4.787899578977772e-06, "loss": 0.0014, "reward": 2.6666666865348816, "reward_std": 0.4518480896949768, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 409 }, { "completion_length": 152.33333587646484, "epoch": 0.21945671082563897, "grad_norm": 1.96875, "kl": 0.044180845376104116, "learning_rate": 4.786012258838433e-06, "loss": 0.0018, "reward": 3.0000000596046448, "reward_std": 0.48101906478405, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 410 }, { "completion_length": 155.33333587646484, "epoch": 0.2199919710959454, "grad_norm": 1.921875, "kl": 0.022907113656401634, "learning_rate": 4.784116954290618e-06, "loss": 0.0009, "reward": 2.8750000596046448, "reward_std": 0.7123230546712875, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 411 }, { "completion_length": 136.7916717529297, "epoch": 0.22052723136625185, "grad_norm": 2.578125, "kl": 0.05602648947387934, "learning_rate": 4.782213671954099e-06, "loss": 0.0022, "reward": 2.708333373069763, "reward_std": 0.5020104050636292, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 412 }, { "completion_length": 172.08333587646484, "epoch": 0.22106249163655828, "grad_norm": 1.2265625, "kl": 0.018192848656326532, "learning_rate": 4.780302418476516e-06, "loss": 0.0007, "reward": 3.208333373069763, "reward_std": 0.5020104013383389, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 413 }, { "completion_length": 151.29167366027832, "epoch": 0.2215977519068647, "grad_norm": 1.0703125, "kl": 0.04675913043320179, "learning_rate": 4.778383200533349e-06, "loss": 0.0019, "reward": 2.9791666865348816, "reward_std": 0.2601960562169552, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 414 }, { "completion_length": 179.75000381469727, "epoch": 0.22213301217717116, "grad_norm": 1.640625, "kl": 0.03261849144473672, "learning_rate": 4.776456024827895e-06, "loss": 0.0013, "reward": 2.4375000596046448, "reward_std": 0.8668341487646103, "rewards/correctness_reward_func": 1.166666679084301, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333334140479565, "rewards/xmlcount_reward_func": 0.5, "step": 415 }, { "completion_length": 165.37500381469727, "epoch": 0.22266827244747758, "grad_norm": 1.4453125, "kl": 0.034663321916013956, "learning_rate": 4.774520898091244e-06, "loss": 0.0014, "reward": 2.8125000596046448, "reward_std": 0.5357958674430847, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 416 }, { "completion_length": 148.0416717529297, "epoch": 0.223203532717784, "grad_norm": 2.09375, "kl": 0.038772601168602705, "learning_rate": 4.772577827082261e-06, "loss": 0.0016, "reward": 2.687500089406967, "reward_std": 0.7555890195071697, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 417 }, { "completion_length": 135.08333778381348, "epoch": 0.22373879298809046, "grad_norm": 1.7109375, "kl": 0.017822970170527697, "learning_rate": 4.770626818587554e-06, "loss": 0.0007, "reward": 2.5416666865348816, "reward_std": 0.2686738669872284, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 418 }, { "completion_length": 207.33333778381348, "epoch": 0.2242740532583969, "grad_norm": 3.328125, "kl": 0.03107347432523966, "learning_rate": 4.768667879421457e-06, "loss": 0.0012, "reward": 2.3177084177732468, "reward_std": 0.7329771295189857, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.484375, "step": 419 }, { "completion_length": 174.8333396911621, "epoch": 0.22480931352870334, "grad_norm": 1.9921875, "kl": 0.05148722976446152, "learning_rate": 4.7667010164260016e-06, "loss": 0.0021, "reward": 2.452416777610779, "reward_std": 0.6645737141370773, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.49408333748579025, "step": 420 }, { "completion_length": 153.12500381469727, "epoch": 0.22534457379900977, "grad_norm": 1.8359375, "kl": 0.0318888071924448, "learning_rate": 4.764726236470897e-06, "loss": 0.0013, "reward": 2.604166716337204, "reward_std": 0.6094112247228622, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 421 }, { "completion_length": 151.2083396911621, "epoch": 0.2258798340693162, "grad_norm": 1.578125, "kl": 0.03988643130287528, "learning_rate": 4.762743546453503e-06, "loss": 0.0016, "reward": 2.458333373069763, "reward_std": 0.556085180491209, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 422 }, { "completion_length": 104.75000381469727, "epoch": 0.22641509433962265, "grad_norm": 1.6953125, "kl": 0.02869172114878893, "learning_rate": 4.760752953298807e-06, "loss": 0.0011, "reward": 3.0625000298023224, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 423 }, { "completion_length": 148.50000762939453, "epoch": 0.22695035460992907, "grad_norm": 1.7109375, "kl": 0.028270404785871506, "learning_rate": 4.758754463959401e-06, "loss": 0.0011, "reward": 2.2850833535194397, "reward_std": 0.49467696249485016, "rewards/correctness_reward_func": 0.916666679084301, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4934166669845581, "step": 424 }, { "completion_length": 131.50000190734863, "epoch": 0.22748561488023553, "grad_norm": 1.359375, "kl": 0.024721851106733084, "learning_rate": 4.756748085415455e-06, "loss": 0.001, "reward": 3.083333373069763, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 425 }, { "completion_length": 148.7916717529297, "epoch": 0.22802087515054195, "grad_norm": 1.3046875, "kl": 0.026703921612352133, "learning_rate": 4.754733824674694e-06, "loss": 0.0011, "reward": 2.6666666865348816, "reward_std": 0.48720720410346985, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 426 }, { "completion_length": 179.00000762939453, "epoch": 0.22855613542084838, "grad_norm": 1.7109375, "kl": 0.034272789023816586, "learning_rate": 4.752711688772375e-06, "loss": 0.0014, "reward": 2.577833414077759, "reward_std": 0.4778827279806137, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.4945000037550926, "step": 427 }, { "completion_length": 124.58333969116211, "epoch": 0.22909139569115483, "grad_norm": 1.765625, "kl": 0.039995139464735985, "learning_rate": 4.750681684771257e-06, "loss": 0.0016, "reward": 3.020833373069763, "reward_std": 0.6677707135677338, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 428 }, { "completion_length": 157.37500381469727, "epoch": 0.22962665596146126, "grad_norm": 1.390625, "kl": 0.023208866827189922, "learning_rate": 4.748643819761585e-06, "loss": 0.0009, "reward": 3.208333373069763, "reward_std": 0.48936041817069054, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 429 }, { "completion_length": 123.29167175292969, "epoch": 0.2301619162317677, "grad_norm": 1.9140625, "kl": 0.08427455788478255, "learning_rate": 4.7465981008610555e-06, "loss": 0.0034, "reward": 3.208333373069763, "reward_std": 0.35120461508631706, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 430 }, { "completion_length": 142.08333587646484, "epoch": 0.23069717650207414, "grad_norm": 1.65625, "kl": 0.03192599257454276, "learning_rate": 4.7445445352148e-06, "loss": 0.0013, "reward": 3.1875000596046448, "reward_std": 0.37377963587641716, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 431 }, { "completion_length": 152.2916717529297, "epoch": 0.23123243677238056, "grad_norm": 1.296875, "kl": 0.023425794672220945, "learning_rate": 4.742483129995355e-06, "loss": 0.0009, "reward": 2.8750000596046448, "reward_std": 0.25129128992557526, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 432 }, { "completion_length": 202.2916717529297, "epoch": 0.23176769704268702, "grad_norm": 1.6328125, "kl": 0.027119331993162632, "learning_rate": 4.740413892402639e-06, "loss": 0.0011, "reward": 2.7500000596046448, "reward_std": 0.6782792210578918, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.35416667722165585, "rewards/xmlcount_reward_func": 0.5, "step": 433 }, { "completion_length": 162.50000190734863, "epoch": 0.23230295731299344, "grad_norm": 1.8671875, "kl": 0.022649120073765516, "learning_rate": 4.738336829663926e-06, "loss": 0.0009, "reward": 2.520833432674408, "reward_std": 0.5779038220643997, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 434 }, { "completion_length": 113.7500057220459, "epoch": 0.23283821758329987, "grad_norm": 1.7265625, "kl": 0.0458745863288641, "learning_rate": 4.736251949033823e-06, "loss": 0.0018, "reward": 3.1875, "reward_std": 0.5431509912014008, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 435 }, { "completion_length": 164.83333778381348, "epoch": 0.23337347785360632, "grad_norm": 1.7265625, "kl": 0.03575094696134329, "learning_rate": 4.734159257794239e-06, "loss": 0.0014, "reward": 2.7760417461395264, "reward_std": 0.8451508581638336, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 436 }, { "completion_length": 128.9583339691162, "epoch": 0.23390873812391275, "grad_norm": 1.859375, "kl": 0.03346340823918581, "learning_rate": 4.732058763254368e-06, "loss": 0.0013, "reward": 3.208333432674408, "reward_std": 0.5094902031123638, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 437 }, { "completion_length": 162.37500381469727, "epoch": 0.2344439983942192, "grad_norm": 2.328125, "kl": 0.027165093226358294, "learning_rate": 4.729950472750654e-06, "loss": 0.0011, "reward": 3.0000001192092896, "reward_std": 0.8066200762987137, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666865348816, "rewards/xmlcount_reward_func": 0.5, "step": 438 }, { "completion_length": 131.62500190734863, "epoch": 0.23497925866452563, "grad_norm": 1.8671875, "kl": 0.07023448962718248, "learning_rate": 4.7278343936467745e-06, "loss": 0.0028, "reward": 2.833333432674408, "reward_std": 0.6454972475767136, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 439 }, { "completion_length": 160.9166717529297, "epoch": 0.23551451893483205, "grad_norm": 1.5390625, "kl": 0.04344167560338974, "learning_rate": 4.725710533333608e-06, "loss": 0.0017, "reward": 3.0416666865348816, "reward_std": 0.7441303730010986, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 440 }, { "completion_length": 120.33333587646484, "epoch": 0.2360497792051385, "grad_norm": 2.046875, "kl": 0.02986164903268218, "learning_rate": 4.72357889922921e-06, "loss": 0.0012, "reward": 3.145833373069763, "reward_std": 0.5042977184057236, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 441 }, { "completion_length": 141.16666793823242, "epoch": 0.23658503947544493, "grad_norm": 1.4453125, "kl": 0.025641448330134153, "learning_rate": 4.72143949877879e-06, "loss": 0.001, "reward": 3.3125000596046448, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 442 }, { "completion_length": 128.12500190734863, "epoch": 0.23712029974575138, "grad_norm": 1.0234375, "kl": 0.03461711807176471, "learning_rate": 4.719292339454682e-06, "loss": 0.0014, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 443 }, { "completion_length": 125.91666984558105, "epoch": 0.2376555600160578, "grad_norm": 1.8984375, "kl": 0.03600315935909748, "learning_rate": 4.71713742875632e-06, "loss": 0.0014, "reward": 2.833333373069763, "reward_std": 0.40824827551841736, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 444 }, { "completion_length": 123.54166984558105, "epoch": 0.23819082028636424, "grad_norm": 1.890625, "kl": 0.05057946778833866, "learning_rate": 4.714974774210209e-06, "loss": 0.002, "reward": 2.895833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 445 }, { "completion_length": 139.8333396911621, "epoch": 0.2387260805566707, "grad_norm": 1.8125, "kl": 0.04080198332667351, "learning_rate": 4.712804383369905e-06, "loss": 0.0016, "reward": 2.770833432674408, "reward_std": 0.45155154168605804, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 446 }, { "completion_length": 134.375, "epoch": 0.23926134082697711, "grad_norm": 2.703125, "kl": 0.030578581616282463, "learning_rate": 4.710626263815982e-06, "loss": 0.0012, "reward": 2.958333373069763, "reward_std": 0.9482545256614685, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 447 }, { "completion_length": 144.37500381469727, "epoch": 0.23979660109728354, "grad_norm": 1.0625, "kl": 0.03926865756511688, "learning_rate": 4.7084404231560085e-06, "loss": 0.0016, "reward": 3.083333343267441, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 448 }, { "completion_length": 129.58333587646484, "epoch": 0.24033186136759, "grad_norm": 2.34375, "kl": 0.02997904270887375, "learning_rate": 4.706246869024523e-06, "loss": 0.0012, "reward": 3.1197916865348816, "reward_std": 0.49308109283447266, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 449 }, { "completion_length": 121.41667175292969, "epoch": 0.24086712163789642, "grad_norm": 1.3203125, "kl": 0.041862784419208765, "learning_rate": 4.7040456090830015e-06, "loss": 0.0017, "reward": 2.979166716337204, "reward_std": 0.25515517592430115, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 450 }, { "completion_length": 130.6250057220459, "epoch": 0.24140238190820287, "grad_norm": 2.21875, "kl": 0.04182751849293709, "learning_rate": 4.701836651019838e-06, "loss": 0.0017, "reward": 2.6875000596046448, "reward_std": 0.7292841225862503, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 451 }, { "completion_length": 126.20833778381348, "epoch": 0.2419376421785093, "grad_norm": 1.5625, "kl": 0.06659243628382683, "learning_rate": 4.69962000255031e-06, "loss": 0.0027, "reward": 3.1406250596046448, "reward_std": 0.41231444105505943, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 452 }, { "completion_length": 128.16666984558105, "epoch": 0.24247290244881572, "grad_norm": 1.7265625, "kl": 0.025568378157913685, "learning_rate": 4.697395671416559e-06, "loss": 0.001, "reward": 2.833333373069763, "reward_std": 0.7361843436956406, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 453 }, { "completion_length": 168.0416717529297, "epoch": 0.24300816271912218, "grad_norm": 1.890625, "kl": 0.03563910163938999, "learning_rate": 4.6951636653875576e-06, "loss": 0.0014, "reward": 2.5625000596046448, "reward_std": 0.6485128216445446, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 454 }, { "completion_length": 160.70833587646484, "epoch": 0.2435434229894286, "grad_norm": 1.9375, "kl": 0.03328033583238721, "learning_rate": 4.6929239922590856e-06, "loss": 0.0013, "reward": 2.708333373069763, "reward_std": 0.6793645471334457, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 455 }, { "completion_length": 158.08333778381348, "epoch": 0.24407868325973506, "grad_norm": 2.125, "kl": 0.045512993820011616, "learning_rate": 4.690676659853702e-06, "loss": 0.0018, "reward": 2.520833432674408, "reward_std": 1.0850803554058075, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 456 }, { "completion_length": 132.00000381469727, "epoch": 0.24461394353004148, "grad_norm": 1.421875, "kl": 0.047892688773572445, "learning_rate": 4.688421676020717e-06, "loss": 0.0019, "reward": 3.395833373069763, "reward_std": 0.25515519082546234, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 457 }, { "completion_length": 129.75000381469727, "epoch": 0.2451492038003479, "grad_norm": 1.9140625, "kl": 0.02556005073711276, "learning_rate": 4.686159048636165e-06, "loss": 0.001, "reward": 2.895833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 458 }, { "completion_length": 121.41666984558105, "epoch": 0.24568446407065436, "grad_norm": 2.546875, "kl": 0.047415590612217784, "learning_rate": 4.683888785602778e-06, "loss": 0.0019, "reward": 2.7916667461395264, "reward_std": 1.0333142131567001, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 459 }, { "completion_length": 135.6250057220459, "epoch": 0.2462197243409608, "grad_norm": 1.875, "kl": 0.05515039339661598, "learning_rate": 4.681610894849957e-06, "loss": 0.0022, "reward": 2.9791667461395264, "reward_std": 0.7915693670511246, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 460 }, { "completion_length": 136.37500762939453, "epoch": 0.24675498461126724, "grad_norm": 2.015625, "kl": 0.02769710123538971, "learning_rate": 4.679325384333744e-06, "loss": 0.0011, "reward": 3.2291667461395264, "reward_std": 0.5133541077375412, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 461 }, { "completion_length": 138.70833587646484, "epoch": 0.24729024488157367, "grad_norm": 1.03125, "kl": 0.029878970235586166, "learning_rate": 4.677032262036794e-06, "loss": 0.0012, "reward": 3.145833373069763, "reward_std": 0.5513499081134796, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 462 }, { "completion_length": 175.41667556762695, "epoch": 0.2478255051518801, "grad_norm": 2.078125, "kl": 0.042299624998122454, "learning_rate": 4.674731535968351e-06, "loss": 0.0017, "reward": 2.708333373069763, "reward_std": 0.7748701274394989, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 463 }, { "completion_length": 220.66667556762695, "epoch": 0.24836076542218655, "grad_norm": 1.671875, "kl": 0.038636271841824055, "learning_rate": 4.6724232141642135e-06, "loss": 0.0015, "reward": 2.6927084028720856, "reward_std": 0.5764480978250504, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250000558793545, "rewards/xmlcount_reward_func": 0.484375, "step": 464 }, { "completion_length": 129.7916717529297, "epoch": 0.24889602569249297, "grad_norm": 1.515625, "kl": 0.029603436589241028, "learning_rate": 4.6701073046867106e-06, "loss": 0.0012, "reward": 2.9791667461395264, "reward_std": 0.6551453582942486, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 465 }, { "completion_length": 159.25000381469727, "epoch": 0.2494312859627994, "grad_norm": 2.265625, "kl": 0.02957107638940215, "learning_rate": 4.667783815624675e-06, "loss": 0.0012, "reward": 2.437500089406967, "reward_std": 1.0319268852472305, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 466 }, { "completion_length": 157.62500190734863, "epoch": 0.24996654623310585, "grad_norm": 1.5234375, "kl": 0.03178174514323473, "learning_rate": 4.66545275509341e-06, "loss": 0.0013, "reward": 3.0416667461395264, "reward_std": 0.5337304323911667, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 467 }, { "completion_length": 192.2083396911621, "epoch": 0.2505018065034123, "grad_norm": 1.3046875, "kl": 0.033401607535779476, "learning_rate": 4.663114131234666e-06, "loss": 0.0013, "reward": 2.479166716337204, "reward_std": 0.7958995848894119, "rewards/correctness_reward_func": 1.0833333507180214, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 468 }, { "completion_length": 115.79166984558105, "epoch": 0.25103706677371873, "grad_norm": 1.703125, "kl": 0.03035385813564062, "learning_rate": 4.6607679522166085e-06, "loss": 0.0012, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 469 }, { "completion_length": 135.91666984558105, "epoch": 0.25157232704402516, "grad_norm": 2.09375, "kl": 0.03181264130398631, "learning_rate": 4.658414226233792e-06, "loss": 0.0013, "reward": 3.208333432674408, "reward_std": 0.6595396101474762, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 470 }, { "completion_length": 118.70833587646484, "epoch": 0.2521075873143316, "grad_norm": 1.5703125, "kl": 0.038648287765681744, "learning_rate": 4.656052961507131e-06, "loss": 0.0015, "reward": 3.145833373069763, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 471 }, { "completion_length": 220.6250057220459, "epoch": 0.252642847584638, "grad_norm": 1.7265625, "kl": 0.025634529069066048, "learning_rate": 4.653684166283869e-06, "loss": 0.001, "reward": 2.2812500298023224, "reward_std": 0.7136656455695629, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.46875, "step": 472 }, { "completion_length": 99.83333587646484, "epoch": 0.2531781078549445, "grad_norm": 0.65625, "kl": 0.056579564698040485, "learning_rate": 4.651307848837553e-06, "loss": 0.0023, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 473 }, { "completion_length": 138.66666793823242, "epoch": 0.2537133681252509, "grad_norm": 1.2265625, "kl": 0.04535471182316542, "learning_rate": 4.648924017468003e-06, "loss": 0.0018, "reward": 3.1666667461395264, "reward_std": 0.34591545164585114, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 474 }, { "completion_length": 190.79166984558105, "epoch": 0.25424862839555734, "grad_norm": 1.15625, "kl": 0.01769639761187136, "learning_rate": 4.646532680501282e-06, "loss": 0.0007, "reward": 2.875, "reward_std": 0.5589051842689514, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 475 }, { "completion_length": 134.8750057220459, "epoch": 0.25478388866586377, "grad_norm": 1.109375, "kl": 0.042043234687298536, "learning_rate": 4.644133846289669e-06, "loss": 0.0017, "reward": 3.3125000596046448, "reward_std": 0.3092299550771713, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 476 }, { "completion_length": 171.7916717529297, "epoch": 0.2553191489361702, "grad_norm": 1.296875, "kl": 0.03471332183107734, "learning_rate": 4.641727523211627e-06, "loss": 0.0014, "reward": 3.208333373069763, "reward_std": 0.306186243891716, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 477 }, { "completion_length": 140.16666793823242, "epoch": 0.2558544092064767, "grad_norm": 1.6640625, "kl": 0.032892528688535094, "learning_rate": 4.6393137196717785e-06, "loss": 0.0013, "reward": 3.0625000596046448, "reward_std": 0.6782456934452057, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 478 }, { "completion_length": 146.08333778381348, "epoch": 0.2563896694767831, "grad_norm": 1.328125, "kl": 0.03279207367449999, "learning_rate": 4.63689244410087e-06, "loss": 0.0013, "reward": 3.1666666865348816, "reward_std": 0.5933245718479156, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 479 }, { "completion_length": 156.62500381469727, "epoch": 0.2569249297470895, "grad_norm": 1.484375, "kl": 0.02493216237053275, "learning_rate": 4.6344637049557495e-06, "loss": 0.001, "reward": 3.0416667461395264, "reward_std": 0.5643851570785046, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 480 }, { "completion_length": 140.91666984558105, "epoch": 0.25746019001739595, "grad_norm": 2.109375, "kl": 0.05418694904074073, "learning_rate": 4.632027510719329e-06, "loss": 0.0022, "reward": 3.2500000596046448, "reward_std": 0.556186206638813, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 481 }, { "completion_length": 151.50000381469727, "epoch": 0.2579954502877024, "grad_norm": 1.671875, "kl": 0.043965504970401525, "learning_rate": 4.629583869900562e-06, "loss": 0.0018, "reward": 3.0416667461395264, "reward_std": 0.618552640080452, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 482 }, { "completion_length": 149.75000381469727, "epoch": 0.25853071055800886, "grad_norm": 1.0546875, "kl": 0.025529312435537577, "learning_rate": 4.627132791034411e-06, "loss": 0.001, "reward": 3.270833373069763, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 483 }, { "completion_length": 166.50000381469727, "epoch": 0.2590659708283153, "grad_norm": 1.3984375, "kl": 0.028888692380860448, "learning_rate": 4.624674282681814e-06, "loss": 0.0012, "reward": 2.8541666865348816, "reward_std": 0.5258883386850357, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 484 }, { "completion_length": 171.62500762939453, "epoch": 0.2596012310986217, "grad_norm": 1.53125, "kl": 0.01867962582036853, "learning_rate": 4.622208353429661e-06, "loss": 0.0007, "reward": 2.7500000596046448, "reward_std": 0.7895646393299103, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 485 }, { "completion_length": 124.7500057220459, "epoch": 0.26013649136892814, "grad_norm": 1.6015625, "kl": 0.04171385709196329, "learning_rate": 4.619735011890763e-06, "loss": 0.0017, "reward": 3.395833373069763, "reward_std": 0.25515517592430115, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 486 }, { "completion_length": 196.70833587646484, "epoch": 0.26067175163923456, "grad_norm": 1.59375, "kl": 0.020838663913309574, "learning_rate": 4.617254266703816e-06, "loss": 0.0008, "reward": 2.4791667461395264, "reward_std": 0.6868235319852829, "rewards/correctness_reward_func": 1.0000000447034836, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 487 }, { "completion_length": 149.79166793823242, "epoch": 0.261207011909541, "grad_norm": 1.65625, "kl": 0.05422941967844963, "learning_rate": 4.614766126533378e-06, "loss": 0.0022, "reward": 2.9375000596046448, "reward_std": 0.4713764898478985, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 488 }, { "completion_length": 169.87500381469727, "epoch": 0.26174227217984747, "grad_norm": 1.9453125, "kl": 0.057755385991185904, "learning_rate": 4.612270600069833e-06, "loss": 0.0023, "reward": 2.3541667461395264, "reward_std": 1.0590701699256897, "rewards/correctness_reward_func": 1.000000037252903, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 489 }, { "completion_length": 120.66667175292969, "epoch": 0.2622775324501539, "grad_norm": 1.25, "kl": 0.031516329385340214, "learning_rate": 4.609767696029365e-06, "loss": 0.0013, "reward": 3.0625000298023224, "reward_std": 0.25515519082546234, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 490 }, { "completion_length": 185.2083396911621, "epoch": 0.2628127927204603, "grad_norm": 2.203125, "kl": 0.08892032504081726, "learning_rate": 4.6072574231539255e-06, "loss": 0.0036, "reward": 2.6875000596046448, "reward_std": 0.5697049051523209, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 491 }, { "completion_length": 178.33333778381348, "epoch": 0.26334805299076675, "grad_norm": 1.5, "kl": 0.039903036784380674, "learning_rate": 4.604739790211203e-06, "loss": 0.0016, "reward": 2.500000089406967, "reward_std": 0.9305254966020584, "rewards/correctness_reward_func": 1.0833333805203438, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 492 }, { "completion_length": 165.79167366027832, "epoch": 0.26388331326107317, "grad_norm": 1.578125, "kl": 0.03498702170327306, "learning_rate": 4.6022148059945945e-06, "loss": 0.0014, "reward": 2.895833432674408, "reward_std": 0.49754732847213745, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 493 }, { "completion_length": 137.45833778381348, "epoch": 0.26441857353137965, "grad_norm": 1.7421875, "kl": 0.03628614521585405, "learning_rate": 4.599682479323171e-06, "loss": 0.0015, "reward": 2.9166666865348816, "reward_std": 0.572748601436615, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 494 }, { "completion_length": 159.8333396911621, "epoch": 0.2649538338016861, "grad_norm": 1.9375, "kl": 0.024191310163587332, "learning_rate": 4.597142819041647e-06, "loss": 0.001, "reward": 2.6666667461395264, "reward_std": 0.581819411367178, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 495 }, { "completion_length": 141.66667366027832, "epoch": 0.2654890940719925, "grad_norm": 2.203125, "kl": 0.06287940312176943, "learning_rate": 4.594595834020355e-06, "loss": 0.0025, "reward": 3.145833432674408, "reward_std": 0.6394104324281216, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 496 }, { "completion_length": 170.41666793823242, "epoch": 0.26602435434229893, "grad_norm": 1.78125, "kl": 0.03716146480292082, "learning_rate": 4.5920415331552095e-06, "loss": 0.0015, "reward": 3.1250001192092896, "reward_std": 0.7136143408715725, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 497 }, { "completion_length": 183.25000762939453, "epoch": 0.26655961461260536, "grad_norm": 1.28125, "kl": 0.0346057191491127, "learning_rate": 4.589479925367676e-06, "loss": 0.0014, "reward": 3.1250000596046448, "reward_std": 0.7477540075778961, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 498 }, { "completion_length": 135.3333339691162, "epoch": 0.26709487488291184, "grad_norm": 1.6484375, "kl": 0.037114487029612064, "learning_rate": 4.586911019604742e-06, "loss": 0.0015, "reward": 2.8750000298023224, "reward_std": 0.6255477480590343, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 499 }, { "completion_length": 162.375, "epoch": 0.26763013515321826, "grad_norm": 1.90625, "kl": 0.03033427568152547, "learning_rate": 4.584334824838885e-06, "loss": 0.0012, "reward": 2.9322917461395264, "reward_std": 0.5931039564311504, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.35416667722165585, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 500 }, { "completion_length": 143.83333587646484, "epoch": 0.2681653954235247, "grad_norm": 1.8046875, "kl": 0.025395017582923174, "learning_rate": 4.581751350068041e-06, "loss": 0.001, "reward": 3.145833432674408, "reward_std": 0.7174782603979111, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 501 }, { "completion_length": 139.79166984558105, "epoch": 0.2687006556938311, "grad_norm": 1.8828125, "kl": 0.04936455516144633, "learning_rate": 4.579160604315572e-06, "loss": 0.002, "reward": 3.083333432674408, "reward_std": 0.6196396760642529, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 502 }, { "completion_length": 145.5416717529297, "epoch": 0.26923591596413754, "grad_norm": 1.484375, "kl": 0.019221351481974125, "learning_rate": 4.576562596630237e-06, "loss": 0.0008, "reward": 3.020833373069763, "reward_std": 0.6095356941223145, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 503 }, { "completion_length": 157.08333587646484, "epoch": 0.269771176234444, "grad_norm": 1.640625, "kl": 0.031065318267792463, "learning_rate": 4.573957336086158e-06, "loss": 0.0012, "reward": 3.0104167461395264, "reward_std": 0.476203590631485, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 504 }, { "completion_length": 190.79167366027832, "epoch": 0.27030643650475045, "grad_norm": 1.8671875, "kl": 0.037793589755892754, "learning_rate": 4.571344831782789e-06, "loss": 0.0015, "reward": 2.7500001192092896, "reward_std": 0.9386454932391644, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.5, "step": 505 }, { "completion_length": 148.9583339691162, "epoch": 0.2708416967750569, "grad_norm": 1.5703125, "kl": 0.021527512930333614, "learning_rate": 4.568725092844886e-06, "loss": 0.0009, "reward": 2.708333373069763, "reward_std": 0.5483061634004116, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 506 }, { "completion_length": 173.25000762939453, "epoch": 0.2713769570453633, "grad_norm": 1.8671875, "kl": 0.043221392668783665, "learning_rate": 4.566098128422471e-06, "loss": 0.0017, "reward": 2.880208373069763, "reward_std": 0.8724417686462402, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 507 }, { "completion_length": 134.5416717529297, "epoch": 0.2719122173156697, "grad_norm": 2.078125, "kl": 0.03808927442878485, "learning_rate": 4.563463947690804e-06, "loss": 0.0015, "reward": 2.1875000298023224, "reward_std": 0.61852215975523, "rewards/correctness_reward_func": 0.8333333358168602, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 508 }, { "completion_length": 138.91666793823242, "epoch": 0.2724474775859762, "grad_norm": 2.421875, "kl": 0.04213061882182956, "learning_rate": 4.5608225598503506e-06, "loss": 0.0017, "reward": 3.208333432674408, "reward_std": 0.5036999098956585, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 509 }, { "completion_length": 135.04166793823242, "epoch": 0.27298273785628263, "grad_norm": 0.96484375, "kl": 0.03594761900603771, "learning_rate": 4.558173974126749e-06, "loss": 0.0014, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 510 }, { "completion_length": 171.37500381469727, "epoch": 0.27351799812658906, "grad_norm": 2.28125, "kl": 0.059704599902033806, "learning_rate": 4.555518199770774e-06, "loss": 0.0024, "reward": 3.1250001192092896, "reward_std": 0.7136143408715725, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 511 }, { "completion_length": 124.08333587646484, "epoch": 0.2740532583968955, "grad_norm": 1.71875, "kl": 0.03336996538564563, "learning_rate": 4.552855246058313e-06, "loss": 0.0013, "reward": 3.0625000596046448, "reward_std": 0.5830912441015244, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 512 }, { "completion_length": 152.45833587646484, "epoch": 0.2745885186672019, "grad_norm": 1.5546875, "kl": 0.03267518850043416, "learning_rate": 4.550185122290324e-06, "loss": 0.0013, "reward": 3.1250000596046448, "reward_std": 0.5268727391958237, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 513 }, { "completion_length": 186.75000381469727, "epoch": 0.2751237789375084, "grad_norm": 1.1796875, "kl": 0.025649972492828965, "learning_rate": 4.547507837792814e-06, "loss": 0.001, "reward": 2.8437500596046448, "reward_std": 0.33822767809033394, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 514 }, { "completion_length": 208.9166717529297, "epoch": 0.2756590392078148, "grad_norm": 1.1171875, "kl": 0.030423552729189396, "learning_rate": 4.544823401916794e-06, "loss": 0.0012, "reward": 2.833333432674408, "reward_std": 0.718227207660675, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 515 }, { "completion_length": 184.45833587646484, "epoch": 0.27619429947812124, "grad_norm": 2.234375, "kl": 0.100022466853261, "learning_rate": 4.542131824038259e-06, "loss": 0.004, "reward": 2.5625000298023224, "reward_std": 0.6425507217645645, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4375000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666753590107, "rewards/xmlcount_reward_func": 0.5, "step": 516 }, { "completion_length": 155.20833778381348, "epoch": 0.27672955974842767, "grad_norm": 1.7109375, "kl": 0.04028411163017154, "learning_rate": 4.539433113558144e-06, "loss": 0.0016, "reward": 2.645833373069763, "reward_std": 0.7378925532102585, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 517 }, { "completion_length": 147.00000762939453, "epoch": 0.2772648200187341, "grad_norm": 1.0234375, "kl": 0.027115366887301207, "learning_rate": 4.536727279902299e-06, "loss": 0.0011, "reward": 2.7916666865348816, "reward_std": 0.2813657224178314, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 518 }, { "completion_length": 133.16666984558105, "epoch": 0.2778000802890405, "grad_norm": 2.03125, "kl": 0.04532007407397032, "learning_rate": 4.534014332521451e-06, "loss": 0.0018, "reward": 2.604166716337204, "reward_std": 0.9020710438489914, "rewards/correctness_reward_func": 1.1666667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 519 }, { "completion_length": 125.70833778381348, "epoch": 0.278335340559347, "grad_norm": 0.2275390625, "kl": 0.032215571496635675, "learning_rate": 4.5312942808911775e-06, "loss": 0.0013, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 520 }, { "completion_length": 170.33333778381348, "epoch": 0.2788706008296534, "grad_norm": 1.484375, "kl": 0.035013212356716394, "learning_rate": 4.528567134511864e-06, "loss": 0.0014, "reward": 3.1875000596046448, "reward_std": 0.502879124134779, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 521 }, { "completion_length": 174.12500381469727, "epoch": 0.27940586109995985, "grad_norm": 2.328125, "kl": 0.061464476864784956, "learning_rate": 4.52583290290868e-06, "loss": 0.0025, "reward": 2.708333432674408, "reward_std": 0.7608912438154221, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 522 }, { "completion_length": 128.87500381469727, "epoch": 0.2799411213702663, "grad_norm": 1.71875, "kl": 0.03376815561205149, "learning_rate": 4.523091595631539e-06, "loss": 0.0014, "reward": 3.083333373069763, "reward_std": 0.5320602059364319, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 523 }, { "completion_length": 149.95833778381348, "epoch": 0.2804763816405727, "grad_norm": 2.046875, "kl": 0.052414572797715664, "learning_rate": 4.5203432222550705e-06, "loss": 0.0021, "reward": 2.6666667759418488, "reward_std": 0.9600770622491837, "rewards/correctness_reward_func": 1.2500000521540642, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 524 }, { "completion_length": 121.5000057220459, "epoch": 0.2810116419108792, "grad_norm": 1.421875, "kl": 0.04683410096913576, "learning_rate": 4.517587792378581e-06, "loss": 0.0019, "reward": 3.2291667461395264, "reward_std": 0.45845916867256165, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 525 }, { "completion_length": 160.95833587646484, "epoch": 0.2815469021811856, "grad_norm": 1.78125, "kl": 0.026103714015334845, "learning_rate": 4.514825315626024e-06, "loss": 0.001, "reward": 2.083333432674408, "reward_std": 0.789296954870224, "rewards/correctness_reward_func": 0.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 526 }, { "completion_length": 146.20833778381348, "epoch": 0.28208216245149204, "grad_norm": 1.6484375, "kl": 0.03722587740048766, "learning_rate": 4.51205580164597e-06, "loss": 0.0015, "reward": 3.1822917461395264, "reward_std": 0.5032989047467709, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 527 }, { "completion_length": 135.20833778381348, "epoch": 0.28261742272179846, "grad_norm": 2.25, "kl": 0.03908930625766516, "learning_rate": 4.509279260111563e-06, "loss": 0.0016, "reward": 3.2291667461395264, "reward_std": 0.39121396839618683, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.3958333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 528 }, { "completion_length": 154.16666793823242, "epoch": 0.2831526829921049, "grad_norm": 1.7265625, "kl": 0.032162437215447426, "learning_rate": 4.506495700720494e-06, "loss": 0.0013, "reward": 2.833333432674408, "reward_std": 0.7331711798906326, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 529 }, { "completion_length": 158.08334159851074, "epoch": 0.28368794326241137, "grad_norm": 1.109375, "kl": 0.025485132355242968, "learning_rate": 4.503705133194967e-06, "loss": 0.001, "reward": 2.895833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 530 }, { "completion_length": 123.54166984558105, "epoch": 0.2842232035327178, "grad_norm": 0.07958984375, "kl": 0.025541551876813173, "learning_rate": 4.500907567281663e-06, "loss": 0.001, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 531 }, { "completion_length": 138.62500381469727, "epoch": 0.2847584638030242, "grad_norm": 1.7421875, "kl": 0.037108127027750015, "learning_rate": 4.498103012751704e-06, "loss": 0.0015, "reward": 2.7708334028720856, "reward_std": 0.7820279747247696, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 532 }, { "completion_length": 180.25000381469727, "epoch": 0.28529372407333065, "grad_norm": 2.15625, "kl": 0.05364688206464052, "learning_rate": 4.4952914794006255e-06, "loss": 0.0021, "reward": 3.1250001192092896, "reward_std": 0.7430477403104305, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 533 }, { "completion_length": 149.2500057220459, "epoch": 0.28582898434363707, "grad_norm": 1.40625, "kl": 0.033399499487131834, "learning_rate": 4.4924729770483346e-06, "loss": 0.0013, "reward": 2.958333373069763, "reward_std": 0.5717262327671051, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 534 }, { "completion_length": 136.2500057220459, "epoch": 0.28636424461394355, "grad_norm": 0.99609375, "kl": 0.033650084398686886, "learning_rate": 4.4896475155390796e-06, "loss": 0.0013, "reward": 2.9791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 535 }, { "completion_length": 130.00000381469727, "epoch": 0.28689950488425, "grad_norm": 0.87890625, "kl": 0.03125099744647741, "learning_rate": 4.486815104741418e-06, "loss": 0.0013, "reward": 3.4375, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 536 }, { "completion_length": 206.00000381469727, "epoch": 0.2874347651545564, "grad_norm": 1.671875, "kl": 0.030830624978989363, "learning_rate": 4.483975754548175e-06, "loss": 0.0012, "reward": 2.6250000596046448, "reward_std": 1.0065668523311615, "rewards/correctness_reward_func": 1.3333333879709244, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 537 }, { "completion_length": 203.4583396911621, "epoch": 0.28797002542486283, "grad_norm": 2.140625, "kl": 0.025279700057581067, "learning_rate": 4.4811294748764175e-06, "loss": 0.001, "reward": 1.8645833432674408, "reward_std": 0.8061150163412094, "rewards/correctness_reward_func": 0.5833333432674408, "rewards/int_reward_func": 0.3958333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.46875, "step": 538 }, { "completion_length": 162.6666717529297, "epoch": 0.28850528569516926, "grad_norm": 1.765625, "kl": 0.06328402180224657, "learning_rate": 4.478276275667411e-06, "loss": 0.0025, "reward": 2.5625000596046448, "reward_std": 0.6288169585168362, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250001303851604, "rewards/xmlcount_reward_func": 0.5, "step": 539 }, { "completion_length": 213.0833396911621, "epoch": 0.28904054596547574, "grad_norm": 1.6484375, "kl": 0.02844400331377983, "learning_rate": 4.475416166886593e-06, "loss": 0.0011, "reward": 2.5416667461395264, "reward_std": 0.8251103907823563, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 540 }, { "completion_length": 164.62500381469727, "epoch": 0.28957580623578216, "grad_norm": 1.4296875, "kl": 0.0281327604316175, "learning_rate": 4.4725491585235305e-06, "loss": 0.0011, "reward": 3.0625000298023224, "reward_std": 0.25515517592430115, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 541 }, { "completion_length": 147.04166984558105, "epoch": 0.2901110665060886, "grad_norm": 1.609375, "kl": 0.04105441318824887, "learning_rate": 4.4696752605918924e-06, "loss": 0.0016, "reward": 2.833333343267441, "reward_std": 0.4771459996700287, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 542 }, { "completion_length": 178.7916717529297, "epoch": 0.290646326776395, "grad_norm": 1.328125, "kl": 0.02182354126125574, "learning_rate": 4.466794483129409e-06, "loss": 0.0009, "reward": 2.875000089406967, "reward_std": 0.5268727838993073, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 543 }, { "completion_length": 140.83333778381348, "epoch": 0.29118158704670144, "grad_norm": 2.28125, "kl": 0.02299963030964136, "learning_rate": 4.463906836197838e-06, "loss": 0.0009, "reward": 3.028375029563904, "reward_std": 0.8897148221731186, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4033749997615814, "step": 544 }, { "completion_length": 181.0416717529297, "epoch": 0.2917168473170079, "grad_norm": 1.40625, "kl": 0.023073547054082155, "learning_rate": 4.461012329882931e-06, "loss": 0.0009, "reward": 2.8125000298023224, "reward_std": 0.3071485310792923, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 545 }, { "completion_length": 180.70833587646484, "epoch": 0.29225210758731435, "grad_norm": 1.578125, "kl": 0.03637926373630762, "learning_rate": 4.4581109742944e-06, "loss": 0.0015, "reward": 2.9375000596046448, "reward_std": 0.833091240376234, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 546 }, { "completion_length": 149.62500381469727, "epoch": 0.2927873678576208, "grad_norm": 1.296875, "kl": 0.02528524398803711, "learning_rate": 4.455202779565876e-06, "loss": 0.001, "reward": 2.7291667461395264, "reward_std": 0.5133541226387024, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 547 }, { "completion_length": 162.5416717529297, "epoch": 0.2933226281279272, "grad_norm": 1.078125, "kl": 0.046681386418640614, "learning_rate": 4.452287755854879e-06, "loss": 0.0019, "reward": 3.458333373069763, "reward_std": 0.10206207260489464, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 548 }, { "completion_length": 131.45833778381348, "epoch": 0.2938578883982336, "grad_norm": 0.96875, "kl": 0.0450198519974947, "learning_rate": 4.449365913342781e-06, "loss": 0.0018, "reward": 3.4791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 549 }, { "completion_length": 132.25000762939453, "epoch": 0.29439314866854005, "grad_norm": 1.609375, "kl": 0.041971832513809204, "learning_rate": 4.446437262234769e-06, "loss": 0.0017, "reward": 3.020833373069763, "reward_std": 0.5409832894802094, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 550 }, { "completion_length": 126.33333778381348, "epoch": 0.29492840893884653, "grad_norm": 0.546875, "kl": 0.023539585061371326, "learning_rate": 4.4435018127598115e-06, "loss": 0.0009, "reward": 2.9791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 551 }, { "completion_length": 149.75000381469727, "epoch": 0.29546366920915296, "grad_norm": 2.546875, "kl": 0.03675627941265702, "learning_rate": 4.440559575170621e-06, "loss": 0.0015, "reward": 3.083333432674408, "reward_std": 0.7781640253961086, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 552 }, { "completion_length": 130.50000381469727, "epoch": 0.2959989294794594, "grad_norm": 1.4140625, "kl": 0.037120907101780176, "learning_rate": 4.437610559743621e-06, "loss": 0.0015, "reward": 2.916666716337204, "reward_std": 0.503996953368187, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 553 }, { "completion_length": 144.9583339691162, "epoch": 0.2965341897497658, "grad_norm": 1.671875, "kl": 0.03651731228455901, "learning_rate": 4.434654776778905e-06, "loss": 0.0015, "reward": 3.1666666865348816, "reward_std": 0.5657404661178589, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 554 }, { "completion_length": 146.04166984558105, "epoch": 0.29706945002007223, "grad_norm": 2.21875, "kl": 0.023792235646396875, "learning_rate": 4.431692236600206e-06, "loss": 0.001, "reward": 2.520833432674408, "reward_std": 1.014427661895752, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 555 }, { "completion_length": 152.62500381469727, "epoch": 0.2976047102903787, "grad_norm": 1.6640625, "kl": 0.05981873255223036, "learning_rate": 4.428722949554858e-06, "loss": 0.0024, "reward": 2.8750000596046448, "reward_std": 0.7758503705263138, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 556 }, { "completion_length": 156.4583339691162, "epoch": 0.29813997056068514, "grad_norm": 1.5078125, "kl": 0.029872288461774588, "learning_rate": 4.4257469260137575e-06, "loss": 0.0012, "reward": 3.083333373069763, "reward_std": 0.4854898601770401, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 557 }, { "completion_length": 208.87500190734863, "epoch": 0.29867523083099157, "grad_norm": 3.40625, "kl": 0.02115720184519887, "learning_rate": 4.422764176371333e-06, "loss": 0.0008, "reward": 2.0928750336170197, "reward_std": 0.3820215165615082, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.44704167544841766, "step": 558 }, { "completion_length": 151.4166717529297, "epoch": 0.299210491101298, "grad_norm": 1.75, "kl": 0.030247972812503576, "learning_rate": 4.419774711045505e-06, "loss": 0.0012, "reward": 3.208333432674408, "reward_std": 0.6046446561813354, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 559 }, { "completion_length": 144.12500381469727, "epoch": 0.2997457513716044, "grad_norm": 1.140625, "kl": 0.03926022304221988, "learning_rate": 4.416778540477646e-06, "loss": 0.0016, "reward": 3.3750000596046448, "reward_std": 0.25129128620028496, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 560 }, { "completion_length": 225.04167556762695, "epoch": 0.3002810116419109, "grad_norm": 1.75, "kl": 0.03247348219156265, "learning_rate": 4.413775675132553e-06, "loss": 0.0013, "reward": 2.260416716337204, "reward_std": 0.8023487627506256, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333432674408, "rewards/xmlcount_reward_func": 0.46875, "step": 561 }, { "completion_length": 163.58333778381348, "epoch": 0.3008162719122173, "grad_norm": 1.421875, "kl": 0.02386433444917202, "learning_rate": 4.4107661254984035e-06, "loss": 0.001, "reward": 2.958333373069763, "reward_std": 0.6122723072767258, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 562 }, { "completion_length": 164.20833778381348, "epoch": 0.30135153218252375, "grad_norm": 1.5546875, "kl": 0.02886300766840577, "learning_rate": 4.407749902086722e-06, "loss": 0.0012, "reward": 2.833333373069763, "reward_std": 0.5685558170080185, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 563 }, { "completion_length": 164.5416717529297, "epoch": 0.3018867924528302, "grad_norm": 1.9296875, "kl": 0.03690410777926445, "learning_rate": 4.404727015432343e-06, "loss": 0.0015, "reward": 2.3541666865348816, "reward_std": 0.6257302761077881, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 564 }, { "completion_length": 132.45833587646484, "epoch": 0.3024220527231366, "grad_norm": 1.7421875, "kl": 0.04732774198055267, "learning_rate": 4.401697476093372e-06, "loss": 0.0019, "reward": 3.0625000596046448, "reward_std": 0.51335409283638, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 565 }, { "completion_length": 136.58333587646484, "epoch": 0.3029573129934431, "grad_norm": 1.515625, "kl": 0.052906479453668, "learning_rate": 4.3986612946511535e-06, "loss": 0.0021, "reward": 3.395833373069763, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 566 }, { "completion_length": 160.62500381469727, "epoch": 0.3034925732637495, "grad_norm": 1.5703125, "kl": 0.056891399435698986, "learning_rate": 4.395618481710229e-06, "loss": 0.0023, "reward": 3.3541667461395264, "reward_std": 0.3023223280906677, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 567 }, { "completion_length": 137.83333778381348, "epoch": 0.30402783353405594, "grad_norm": 1.359375, "kl": 0.038504095282405615, "learning_rate": 4.392569047898301e-06, "loss": 0.0015, "reward": 3.1250000596046448, "reward_std": 0.5395646393299103, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 568 }, { "completion_length": 150.66666793823242, "epoch": 0.30456309380436236, "grad_norm": 1.078125, "kl": 0.035421257838606834, "learning_rate": 4.3895130038662e-06, "loss": 0.0014, "reward": 2.895833373069763, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 569 }, { "completion_length": 155.5833396911621, "epoch": 0.3050983540746688, "grad_norm": 1.8828125, "kl": 0.031221465673297644, "learning_rate": 4.386450360287842e-06, "loss": 0.0012, "reward": 2.8958334028720856, "reward_std": 0.7271329909563065, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 570 }, { "completion_length": 187.7916717529297, "epoch": 0.30563361434497527, "grad_norm": 1.4921875, "kl": 0.02132181730121374, "learning_rate": 4.383381127860194e-06, "loss": 0.0009, "reward": 2.666666731238365, "reward_std": 0.5103103630244732, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 571 }, { "completion_length": 140.2083396911621, "epoch": 0.3061688746152817, "grad_norm": 1.7734375, "kl": 0.04399279458448291, "learning_rate": 4.380305317303236e-06, "loss": 0.0018, "reward": 2.958333373069763, "reward_std": 0.6023809425532818, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 572 }, { "completion_length": 127.37500381469727, "epoch": 0.3067041348855881, "grad_norm": 1.921875, "kl": 0.042581514455378056, "learning_rate": 4.377222939359922e-06, "loss": 0.0017, "reward": 2.9791667461395264, "reward_std": 0.9372647851705551, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 573 }, { "completion_length": 153.6666717529297, "epoch": 0.30723939515589455, "grad_norm": 1.796875, "kl": 0.028469674289226532, "learning_rate": 4.374134004796147e-06, "loss": 0.0011, "reward": 2.750000089406967, "reward_std": 0.7205219715833664, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 574 }, { "completion_length": 124.58333587646484, "epoch": 0.30777465542620097, "grad_norm": 2.359375, "kl": 0.03075863840058446, "learning_rate": 4.371038524400706e-06, "loss": 0.0012, "reward": 3.208333432674408, "reward_std": 0.5643851384520531, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 575 }, { "completion_length": 111.6250057220459, "epoch": 0.30830991569650745, "grad_norm": 1.9765625, "kl": 0.024300793651491404, "learning_rate": 4.367936508985252e-06, "loss": 0.001, "reward": 3.0625000596046448, "reward_std": 0.4334801435470581, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 576 }, { "completion_length": 165.87500762939453, "epoch": 0.3088451759668139, "grad_norm": 1.6875, "kl": 0.02696134801954031, "learning_rate": 4.364827969384271e-06, "loss": 0.0011, "reward": 2.270833373069763, "reward_std": 0.7186580300331116, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 577 }, { "completion_length": 143.41666793823242, "epoch": 0.3093804362371203, "grad_norm": 2.03125, "kl": 0.03807497629895806, "learning_rate": 4.3617129164550294e-06, "loss": 0.0015, "reward": 3.0416667461395264, "reward_std": 0.659539595246315, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 578 }, { "completion_length": 141.33333587646484, "epoch": 0.30991569650742673, "grad_norm": 1.5859375, "kl": 0.02221487811766565, "learning_rate": 4.358591361077546e-06, "loss": 0.0009, "reward": 3.3125000596046448, "reward_std": 0.3092299550771713, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 579 }, { "completion_length": 154.37500381469727, "epoch": 0.31045095677773316, "grad_norm": 2.0625, "kl": 0.031037595123052597, "learning_rate": 4.355463314154551e-06, "loss": 0.0012, "reward": 3.1666667461395264, "reward_std": 0.6664472073316574, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 580 }, { "completion_length": 140.5833396911621, "epoch": 0.3109862170480396, "grad_norm": 0.98046875, "kl": 0.028645613696426153, "learning_rate": 4.352328786611446e-06, "loss": 0.0011, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 581 }, { "completion_length": 153.75000381469727, "epoch": 0.31152147731834606, "grad_norm": 1.5625, "kl": 0.02715424238704145, "learning_rate": 4.349187789396269e-06, "loss": 0.0011, "reward": 3.0416666865348816, "reward_std": 0.2711162380874157, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 582 }, { "completion_length": 135.45833778381348, "epoch": 0.3120567375886525, "grad_norm": 1.203125, "kl": 0.02633284032344818, "learning_rate": 4.346040333479655e-06, "loss": 0.0011, "reward": 2.5625000298023224, "reward_std": 0.41247179359197617, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 583 }, { "completion_length": 105.00000381469727, "epoch": 0.3125919978589589, "grad_norm": 6.46875, "kl": 0.21342097874730825, "learning_rate": 4.342886429854797e-06, "loss": 0.0085, "reward": 3.3750000596046448, "reward_std": 0.306186206638813, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 584 }, { "completion_length": 123.29167175292969, "epoch": 0.31312725812926534, "grad_norm": 1.8671875, "kl": 0.023702097591012716, "learning_rate": 4.339726089537406e-06, "loss": 0.0009, "reward": 3.270833432674408, "reward_std": 0.5238290727138519, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 585 }, { "completion_length": 143.58333587646484, "epoch": 0.31366251839957177, "grad_norm": 0.74609375, "kl": 0.0426813792437315, "learning_rate": 4.336559323565679e-06, "loss": 0.0017, "reward": 2.9791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 586 }, { "completion_length": 149.5000057220459, "epoch": 0.31419777866987825, "grad_norm": 1.7890625, "kl": 0.03509718319401145, "learning_rate": 4.3333861430002525e-06, "loss": 0.0014, "reward": 2.812500089406967, "reward_std": 0.7153968065977097, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 587 }, { "completion_length": 109.33333587646484, "epoch": 0.3147330389401847, "grad_norm": 1.28125, "kl": 0.03983949590474367, "learning_rate": 4.330206558924168e-06, "loss": 0.0016, "reward": 3.395833373069763, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 588 }, { "completion_length": 123.66666984558105, "epoch": 0.3152682992104911, "grad_norm": 0.96875, "kl": 0.05020787101238966, "learning_rate": 4.327020582442834e-06, "loss": 0.002, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 589 }, { "completion_length": 152.875, "epoch": 0.3158035594807975, "grad_norm": 1.1796875, "kl": 0.031143656466156244, "learning_rate": 4.323828224683983e-06, "loss": 0.0012, "reward": 2.958333343267441, "reward_std": 0.39777331054210663, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 590 }, { "completion_length": 138.16666793823242, "epoch": 0.31633881975110395, "grad_norm": 1.4296875, "kl": 0.030376011971384287, "learning_rate": 4.320629496797642e-06, "loss": 0.0012, "reward": 2.8541666865348816, "reward_std": 0.27258946001529694, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 591 }, { "completion_length": 143.45833587646484, "epoch": 0.31687408002141043, "grad_norm": 2.25, "kl": 0.0474525555036962, "learning_rate": 4.317424409956078e-06, "loss": 0.0019, "reward": 3.208333432674408, "reward_std": 0.5643851235508919, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 592 }, { "completion_length": 135.00000190734863, "epoch": 0.31740934029171686, "grad_norm": 1.5703125, "kl": 0.02484218031167984, "learning_rate": 4.3142129753537755e-06, "loss": 0.001, "reward": 2.5000000298023224, "reward_std": 0.40824829041957855, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 593 }, { "completion_length": 186.83333587646484, "epoch": 0.3179446005620233, "grad_norm": 2.0, "kl": 0.02980051003396511, "learning_rate": 4.310995204207386e-06, "loss": 0.0012, "reward": 2.5625000596046448, "reward_std": 0.8611014932394028, "rewards/correctness_reward_func": 1.1666667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 594 }, { "completion_length": 193.70833587646484, "epoch": 0.3184798608323297, "grad_norm": 1.6015625, "kl": 0.03126836335286498, "learning_rate": 4.307771107755695e-06, "loss": 0.0013, "reward": 2.6041666865348816, "reward_std": 0.5614049583673477, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 595 }, { "completion_length": 127.91667175292969, "epoch": 0.31901512110263613, "grad_norm": 2.5, "kl": 0.041764695663005114, "learning_rate": 4.304540697259578e-06, "loss": 0.0017, "reward": 2.802083432674408, "reward_std": 0.999303549528122, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 596 }, { "completion_length": 143.5833396911621, "epoch": 0.3195503813729426, "grad_norm": 2.078125, "kl": 0.05798004241660237, "learning_rate": 4.3013039840019675e-06, "loss": 0.0023, "reward": 2.5416667461395264, "reward_std": 0.5643851384520531, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 597 }, { "completion_length": 169.50000381469727, "epoch": 0.32008564164324904, "grad_norm": 1.6484375, "kl": 0.019811555510386825, "learning_rate": 4.298060979287807e-06, "loss": 0.0008, "reward": 2.7916667461395264, "reward_std": 0.8075917363166809, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 598 }, { "completion_length": 155.29166984558105, "epoch": 0.32062090191355547, "grad_norm": 2.015625, "kl": 0.03277602745220065, "learning_rate": 4.294811694444013e-06, "loss": 0.0013, "reward": 2.7500000596046448, "reward_std": 0.8081966787576675, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666865348816, "rewards/xmlcount_reward_func": 0.5, "step": 599 }, { "completion_length": 126.91666793823242, "epoch": 0.3211561621838619, "grad_norm": 1.9453125, "kl": 0.03925598133355379, "learning_rate": 4.29155614081944e-06, "loss": 0.0016, "reward": 2.8541667461395264, "reward_std": 0.3023223280906677, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 600 }, { "completion_length": 166.75000190734863, "epoch": 0.3216914224541683, "grad_norm": 1.828125, "kl": 0.030431517399847507, "learning_rate": 4.288294329784838e-06, "loss": 0.0012, "reward": 2.895833432674408, "reward_std": 0.921602413058281, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 601 }, { "completion_length": 142.83333587646484, "epoch": 0.3222266827244748, "grad_norm": 1.2109375, "kl": 0.022962462157011032, "learning_rate": 4.285026272732808e-06, "loss": 0.0009, "reward": 3.3125000596046448, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 602 }, { "completion_length": 156.37500762939453, "epoch": 0.3227619429947812, "grad_norm": 0.5546875, "kl": 0.020614446373656392, "learning_rate": 4.28175198107777e-06, "loss": 0.0008, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 603 }, { "completion_length": 150.62500762939453, "epoch": 0.32329720326508765, "grad_norm": 1.015625, "kl": 0.03634029906243086, "learning_rate": 4.27847146625592e-06, "loss": 0.0015, "reward": 2.5416666865348816, "reward_std": 0.2457980364561081, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 604 }, { "completion_length": 159.4583396911621, "epoch": 0.3238324635353941, "grad_norm": 1.6328125, "kl": 0.04880431201308966, "learning_rate": 4.275184739725188e-06, "loss": 0.002, "reward": 3.0, "reward_std": 0.773861289024353, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 605 }, { "completion_length": 133.6250057220459, "epoch": 0.3243677238057005, "grad_norm": 2.03125, "kl": 0.033091878052800894, "learning_rate": 4.2718918129652e-06, "loss": 0.0013, "reward": 3.083333373069763, "reward_std": 0.6821095645427704, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 606 }, { "completion_length": 169.1666717529297, "epoch": 0.324902984076007, "grad_norm": 2.34375, "kl": 0.04404675355181098, "learning_rate": 4.26859269747724e-06, "loss": 0.0018, "reward": 2.9166667461395264, "reward_std": 0.7273796200752258, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 607 }, { "completion_length": 162.7916717529297, "epoch": 0.3254382443463134, "grad_norm": 1.375, "kl": 0.05273099755868316, "learning_rate": 4.265287404784204e-06, "loss": 0.0021, "reward": 2.7291666865348816, "reward_std": 0.4782841205596924, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 608 }, { "completion_length": 147.70833778381348, "epoch": 0.32597350461661984, "grad_norm": 1.671875, "kl": 0.029417749494314194, "learning_rate": 4.261975946430567e-06, "loss": 0.0012, "reward": 2.645833432674408, "reward_std": 0.7949730753898621, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 609 }, { "completion_length": 105.20833778381348, "epoch": 0.32650876488692626, "grad_norm": 1.125, "kl": 0.039872271940112114, "learning_rate": 4.258658333982335e-06, "loss": 0.0016, "reward": 3.375, "reward_std": 0.25, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 610 }, { "completion_length": 156.5833396911621, "epoch": 0.3270440251572327, "grad_norm": 1.625, "kl": 0.05882585886865854, "learning_rate": 4.255334579027013e-06, "loss": 0.0024, "reward": 2.7916667461395264, "reward_std": 0.5643851608037949, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 611 }, { "completion_length": 147.9166717529297, "epoch": 0.32757928542753917, "grad_norm": 1.9609375, "kl": 0.03394132852554321, "learning_rate": 4.252004693173555e-06, "loss": 0.0014, "reward": 2.7500000596046448, "reward_std": 0.6123724430799484, "rewards/correctness_reward_func": 1.2500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 612 }, { "completion_length": 117.95833587646484, "epoch": 0.3281145456978456, "grad_norm": 1.0625, "kl": 0.04348599258810282, "learning_rate": 4.2486686880523335e-06, "loss": 0.0017, "reward": 3.25, "reward_std": 0.273861289024353, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 613 }, { "completion_length": 147.58333778381348, "epoch": 0.328649805968152, "grad_norm": 1.765625, "kl": 0.04881718289107084, "learning_rate": 4.24532657531509e-06, "loss": 0.002, "reward": 2.958333432674408, "reward_std": 0.5809475630521774, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 614 }, { "completion_length": 118.87500381469727, "epoch": 0.32918506623845845, "grad_norm": 1.59375, "kl": 0.03824450308457017, "learning_rate": 4.2419783666349e-06, "loss": 0.0015, "reward": 3.2916667461395264, "reward_std": 0.32274864614009857, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 615 }, { "completion_length": 146.4583396911621, "epoch": 0.32972032650876487, "grad_norm": 1.609375, "kl": 0.03492473717778921, "learning_rate": 4.2386240737061315e-06, "loss": 0.0014, "reward": 3.1666667461395264, "reward_std": 0.49322495982050896, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 616 }, { "completion_length": 170.58333778381348, "epoch": 0.3302555867790713, "grad_norm": 1.4140625, "kl": 0.04447829117998481, "learning_rate": 4.2352637082443995e-06, "loss": 0.0018, "reward": 3.270833373069763, "reward_std": 0.30349136516451836, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 617 }, { "completion_length": 121.87500190734863, "epoch": 0.3307908470493778, "grad_norm": 1.421875, "kl": 0.04443943314254284, "learning_rate": 4.231897281986534e-06, "loss": 0.0018, "reward": 3.458333373069763, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 618 }, { "completion_length": 144.87500381469727, "epoch": 0.3313261073196842, "grad_norm": 1.5390625, "kl": 0.045205289497971535, "learning_rate": 4.228524806690529e-06, "loss": 0.0018, "reward": 3.1875000596046448, "reward_std": 0.49814651533961296, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 619 }, { "completion_length": 182.50000381469727, "epoch": 0.33186136758999063, "grad_norm": 1.6953125, "kl": 0.036972432397305965, "learning_rate": 4.2251462941355075e-06, "loss": 0.0015, "reward": 3.1666666865348816, "reward_std": 0.5194446891546249, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 620 }, { "completion_length": 140.5416717529297, "epoch": 0.33239662786029706, "grad_norm": 1.7421875, "kl": 0.059178040362894535, "learning_rate": 4.22176175612168e-06, "loss": 0.0024, "reward": 2.7500000596046448, "reward_std": 0.3624359965324402, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 621 }, { "completion_length": 117.50000381469727, "epoch": 0.3329318881306035, "grad_norm": 1.765625, "kl": 0.03811078518629074, "learning_rate": 4.218371204470303e-06, "loss": 0.0015, "reward": 3.2291667461395264, "reward_std": 0.5133541226387024, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 622 }, { "completion_length": 106.12500381469727, "epoch": 0.33346714840090996, "grad_norm": 2.03125, "kl": 0.03282386902719736, "learning_rate": 4.214974651023632e-06, "loss": 0.0013, "reward": 3.302083373069763, "reward_std": 0.48479484021663666, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 623 }, { "completion_length": 151.79166793823242, "epoch": 0.3340024086712164, "grad_norm": 2.453125, "kl": 0.03183559700846672, "learning_rate": 4.211572107644891e-06, "loss": 0.0013, "reward": 2.8750000298023224, "reward_std": 0.730063334107399, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 624 }, { "completion_length": 188.20833587646484, "epoch": 0.3345376689415228, "grad_norm": 1.34375, "kl": 0.030532730743288994, "learning_rate": 4.208163586218223e-06, "loss": 0.0012, "reward": 2.520833373069763, "reward_std": 0.5441423058509827, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666753590107, "rewards/xmlcount_reward_func": 0.5, "step": 625 }, { "completion_length": 127.16666984558105, "epoch": 0.33507292921182924, "grad_norm": 1.5078125, "kl": 0.05405988823622465, "learning_rate": 4.204749098648651e-06, "loss": 0.0022, "reward": 3.0416666865348816, "reward_std": 0.4541241377592087, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 626 }, { "completion_length": 175.20833587646484, "epoch": 0.33560818948213567, "grad_norm": 1.5546875, "kl": 0.044757843017578125, "learning_rate": 4.201328656862033e-06, "loss": 0.0018, "reward": 2.520833343267441, "reward_std": 0.3266642242670059, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 627 }, { "completion_length": 160.41666793823242, "epoch": 0.33614344975244215, "grad_norm": 1.8046875, "kl": 0.045004216488450766, "learning_rate": 4.197902272805028e-06, "loss": 0.0018, "reward": 2.6875000596046448, "reward_std": 0.880591869354248, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 628 }, { "completion_length": 177.4583396911621, "epoch": 0.3366787100227486, "grad_norm": 1.6015625, "kl": 0.04143283236771822, "learning_rate": 4.194469958445048e-06, "loss": 0.0017, "reward": 2.6041667461395264, "reward_std": 0.5357958674430847, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 629 }, { "completion_length": 136.41666793823242, "epoch": 0.337213970293055, "grad_norm": 0.87109375, "kl": 0.029942914377897978, "learning_rate": 4.191031725770216e-06, "loss": 0.0012, "reward": 3.0625, "reward_std": 0.1530931293964386, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 630 }, { "completion_length": 119.37500190734863, "epoch": 0.3377492305633614, "grad_norm": 1.1484375, "kl": 0.03256394062191248, "learning_rate": 4.187587586789329e-06, "loss": 0.0013, "reward": 3.395833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 631 }, { "completion_length": 137.6250057220459, "epoch": 0.33828449083366785, "grad_norm": 1.5, "kl": 0.03323498251847923, "learning_rate": 4.184137553531812e-06, "loss": 0.0013, "reward": 2.8750000596046448, "reward_std": 0.3061862401664257, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 632 }, { "completion_length": 151.33333587646484, "epoch": 0.33881975110397433, "grad_norm": 1.5234375, "kl": 0.029517395421862602, "learning_rate": 4.180681638047675e-06, "loss": 0.0012, "reward": 2.5625000596046448, "reward_std": 0.5583724975585938, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 633 }, { "completion_length": 144.87500381469727, "epoch": 0.33935501137428076, "grad_norm": 1.296875, "kl": 0.03951950464397669, "learning_rate": 4.177219852407477e-06, "loss": 0.0016, "reward": 3.3541666865348816, "reward_std": 0.24468021094799042, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 634 }, { "completion_length": 176.41666793823242, "epoch": 0.3398902716445872, "grad_norm": 1.703125, "kl": 0.033194053918123245, "learning_rate": 4.173752208702277e-06, "loss": 0.0013, "reward": 3.1250000596046448, "reward_std": 0.510659247636795, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5, "step": 635 }, { "completion_length": 135.20833587646484, "epoch": 0.3404255319148936, "grad_norm": 0.76953125, "kl": 0.020830919034779072, "learning_rate": 4.170278719043594e-06, "loss": 0.0008, "reward": 2.9791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 636 }, { "completion_length": 181.95833587646484, "epoch": 0.34096079218520003, "grad_norm": 1.859375, "kl": 0.03406862914562225, "learning_rate": 4.1667993955633685e-06, "loss": 0.0014, "reward": 2.895833432674408, "reward_std": 1.027670457959175, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 637 }, { "completion_length": 167.0416717529297, "epoch": 0.3414960524555065, "grad_norm": 1.3046875, "kl": 0.01679137465544045, "learning_rate": 4.163314250413913e-06, "loss": 0.0007, "reward": 2.6666666865348816, "reward_std": 0.5487253814935684, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 638 }, { "completion_length": 158.00000381469727, "epoch": 0.34203131272581294, "grad_norm": 1.4453125, "kl": 0.03285357216373086, "learning_rate": 4.1598232957678784e-06, "loss": 0.0013, "reward": 2.9791667461395264, "reward_std": 0.5357958823442459, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 639 }, { "completion_length": 130.41666793823242, "epoch": 0.34256657299611937, "grad_norm": 2.375, "kl": 0.06797064701095223, "learning_rate": 4.1563265438182e-06, "loss": 0.0027, "reward": 2.8750001192092896, "reward_std": 0.791929330676794, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 640 }, { "completion_length": 139.9166717529297, "epoch": 0.3431018332664258, "grad_norm": 1.9140625, "kl": 0.027372614247724414, "learning_rate": 4.152824006778068e-06, "loss": 0.0011, "reward": 3.2916666865348816, "reward_std": 0.39777331054210663, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 641 }, { "completion_length": 139.33333778381348, "epoch": 0.3436370935367322, "grad_norm": 2.015625, "kl": 0.03689718246459961, "learning_rate": 4.149315696880873e-06, "loss": 0.0015, "reward": 3.1875000596046448, "reward_std": 0.5993371978402138, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 642 }, { "completion_length": 152.50000381469727, "epoch": 0.3441723538070387, "grad_norm": 1.375, "kl": 0.03251838870346546, "learning_rate": 4.145801626380174e-06, "loss": 0.0013, "reward": 2.833333373069763, "reward_std": 0.29362983629107475, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 643 }, { "completion_length": 148.33333587646484, "epoch": 0.3447076140773451, "grad_norm": 1.203125, "kl": 0.03755293879657984, "learning_rate": 4.142281807549644e-06, "loss": 0.0015, "reward": 3.145833373069763, "reward_std": 0.3092299550771713, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 644 }, { "completion_length": 176.54166793823242, "epoch": 0.34524287434765155, "grad_norm": 1.6328125, "kl": 0.05265080649405718, "learning_rate": 4.138756252683039e-06, "loss": 0.0021, "reward": 2.8125000596046448, "reward_std": 0.6279504112899303, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 645 }, { "completion_length": 162.41667366027832, "epoch": 0.345778134617958, "grad_norm": 1.4453125, "kl": 0.05229492858052254, "learning_rate": 4.135224974094145e-06, "loss": 0.0021, "reward": 2.9791667461395264, "reward_std": 0.6616143435239792, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 646 }, { "completion_length": 197.79167556762695, "epoch": 0.3463133948882644, "grad_norm": 1.9375, "kl": 0.031138702295720577, "learning_rate": 4.131687984116743e-06, "loss": 0.0012, "reward": 2.416666716337204, "reward_std": 1.1080896109342575, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 647 }, { "completion_length": 188.83333778381348, "epoch": 0.34684865515857083, "grad_norm": 1.359375, "kl": 0.0418109823949635, "learning_rate": 4.128145295104561e-06, "loss": 0.0017, "reward": 2.895833432674408, "reward_std": 0.6499251537024975, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 648 }, { "completion_length": 173.2916717529297, "epoch": 0.3473839154288773, "grad_norm": 1.0390625, "kl": 0.042974590323865414, "learning_rate": 4.124596919431229e-06, "loss": 0.0017, "reward": 3.4375000596046448, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 649 }, { "completion_length": 137.7083396911621, "epoch": 0.34791917569918374, "grad_norm": 1.0, "kl": 0.03159995749592781, "learning_rate": 4.1210428694902444e-06, "loss": 0.0013, "reward": 3.0625, "reward_std": 0.22008520364761353, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 650 }, { "completion_length": 195.58333587646484, "epoch": 0.34845443596949016, "grad_norm": 0.7109375, "kl": 0.02461721864528954, "learning_rate": 4.117483157694919e-06, "loss": 0.001, "reward": 2.4010416865348816, "reward_std": 0.13465330004692078, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 651 }, { "completion_length": 134.12500381469727, "epoch": 0.3489896962397966, "grad_norm": 1.5546875, "kl": 0.036613046657294035, "learning_rate": 4.113917796478342e-06, "loss": 0.0015, "reward": 3.020833373069763, "reward_std": 0.5618228912353516, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 652 }, { "completion_length": 202.5416717529297, "epoch": 0.349524956510103, "grad_norm": 1.5, "kl": 0.03586659440770745, "learning_rate": 4.110346798293334e-06, "loss": 0.0014, "reward": 3.0416667461395264, "reward_std": 0.6170316934585571, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250001303851604, "rewards/xmlcount_reward_func": 0.5, "step": 653 }, { "completion_length": 151.9166717529297, "epoch": 0.3500602167804095, "grad_norm": 2.046875, "kl": 0.03458790061995387, "learning_rate": 4.106770175612404e-06, "loss": 0.0014, "reward": 2.6041666865348816, "reward_std": 0.8134097754955292, "rewards/correctness_reward_func": 1.166666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 654 }, { "completion_length": 176.4583396911621, "epoch": 0.3505954770507159, "grad_norm": 0.9609375, "kl": 0.0380466477945447, "learning_rate": 4.103187940927705e-06, "loss": 0.0015, "reward": 3.270833373069763, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 655 }, { "completion_length": 177.83333587646484, "epoch": 0.35113073732102235, "grad_norm": 1.4921875, "kl": 0.026743045775219798, "learning_rate": 4.099600106750993e-06, "loss": 0.0011, "reward": 1.9166667312383652, "reward_std": 0.5914224684238434, "rewards/correctness_reward_func": 0.6666666865348816, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 656 }, { "completion_length": 140.16666793823242, "epoch": 0.35166599759132877, "grad_norm": 1.890625, "kl": 0.032465869560837746, "learning_rate": 4.096006685613579e-06, "loss": 0.0013, "reward": 3.2500000596046448, "reward_std": 0.4727980047464371, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 657 }, { "completion_length": 150.33333587646484, "epoch": 0.3522012578616352, "grad_norm": 1.71875, "kl": 0.036532831378281116, "learning_rate": 4.09240769006629e-06, "loss": 0.0015, "reward": 3.0000000596046448, "reward_std": 0.6798528283834457, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 658 }, { "completion_length": 179.2916717529297, "epoch": 0.3527365181319417, "grad_norm": 1.1875, "kl": 0.02686024410650134, "learning_rate": 4.088803132679421e-06, "loss": 0.0011, "reward": 3.0000000298023224, "reward_std": 0.29362983629107475, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 659 }, { "completion_length": 168.7083396911621, "epoch": 0.3532717784022481, "grad_norm": 1.3046875, "kl": 0.023333940654993057, "learning_rate": 4.085193026042695e-06, "loss": 0.0009, "reward": 3.020833373069763, "reward_std": 0.4421939253807068, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 660 }, { "completion_length": 125.58333587646484, "epoch": 0.35380703867255453, "grad_norm": 2.375, "kl": 0.06006305478513241, "learning_rate": 4.081577382765215e-06, "loss": 0.0024, "reward": 2.645833373069763, "reward_std": 0.7491974383592606, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 661 }, { "completion_length": 138.58333587646484, "epoch": 0.35434229894286096, "grad_norm": 1.7109375, "kl": 0.03619189281016588, "learning_rate": 4.077956215475423e-06, "loss": 0.0014, "reward": 3.1250000596046448, "reward_std": 0.3602609895169735, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 662 }, { "completion_length": 185.08333587646484, "epoch": 0.3548775592131674, "grad_norm": 1.7265625, "kl": 0.05054088030010462, "learning_rate": 4.074329536821056e-06, "loss": 0.002, "reward": 2.4375000596046448, "reward_std": 0.4713764898478985, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 663 }, { "completion_length": 154.8333396911621, "epoch": 0.35541281948347386, "grad_norm": 1.8984375, "kl": 0.03618910349905491, "learning_rate": 4.070697359469097e-06, "loss": 0.0014, "reward": 3.1041667461395264, "reward_std": 0.631978552788496, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 664 }, { "completion_length": 130.0416717529297, "epoch": 0.3559480797537803, "grad_norm": 1.59375, "kl": 0.02808321500197053, "learning_rate": 4.067059696105738e-06, "loss": 0.0011, "reward": 2.7916666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 665 }, { "completion_length": 164.5416717529297, "epoch": 0.3564833400240867, "grad_norm": 1.328125, "kl": 0.026204224675893784, "learning_rate": 4.063416559436332e-06, "loss": 0.001, "reward": 2.6250000298023224, "reward_std": 0.43686148524284363, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 666 }, { "completion_length": 125.91667175292969, "epoch": 0.35701860029439314, "grad_norm": 2.078125, "kl": 0.04998402390629053, "learning_rate": 4.059767962185346e-06, "loss": 0.002, "reward": 2.5416667461395264, "reward_std": 0.6769221723079681, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 667 }, { "completion_length": 153.04166984558105, "epoch": 0.35755386056469957, "grad_norm": 0.470703125, "kl": 0.048956929706037045, "learning_rate": 4.056113917096321e-06, "loss": 0.002, "reward": 3.4791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 668 }, { "completion_length": 127.5000057220459, "epoch": 0.35808912083500605, "grad_norm": 1.0234375, "kl": 0.032527790404856205, "learning_rate": 4.052454436931826e-06, "loss": 0.0013, "reward": 3.333333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 669 }, { "completion_length": 164.16666984558105, "epoch": 0.3586243811053125, "grad_norm": 2.203125, "kl": 0.08344291755929589, "learning_rate": 4.048789534473414e-06, "loss": 0.0033, "reward": 2.729166716337204, "reward_std": 0.535199623554945, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 670 }, { "completion_length": 149.4583396911621, "epoch": 0.3591596413756189, "grad_norm": 1.5, "kl": 0.029162777587771416, "learning_rate": 4.045119222521574e-06, "loss": 0.0012, "reward": 3.208333373069763, "reward_std": 0.5483061634004116, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 671 }, { "completion_length": 145.70833587646484, "epoch": 0.3596949016459253, "grad_norm": 2.3125, "kl": 0.049506490118801594, "learning_rate": 4.041443513895692e-06, "loss": 0.002, "reward": 3.0000000596046448, "reward_std": 0.6572890840470791, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 672 }, { "completion_length": 115.62500381469727, "epoch": 0.36023016191623175, "grad_norm": 1.6015625, "kl": 0.04335561767220497, "learning_rate": 4.037762421434e-06, "loss": 0.0017, "reward": 3.2500000596046448, "reward_std": 0.46232306957244873, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 673 }, { "completion_length": 171.8750057220459, "epoch": 0.36076542218653823, "grad_norm": 1.125, "kl": 0.0296230330131948, "learning_rate": 4.034075957993537e-06, "loss": 0.0012, "reward": 2.8750000596046448, "reward_std": 0.5809475183486938, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 674 }, { "completion_length": 135.70833587646484, "epoch": 0.36130068245684466, "grad_norm": 1.7734375, "kl": 0.035232785856351256, "learning_rate": 4.030384136450098e-06, "loss": 0.0014, "reward": 3.0625000596046448, "reward_std": 0.5583724975585938, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 675 }, { "completion_length": 114.75000190734863, "epoch": 0.3618359427271511, "grad_norm": 0.99609375, "kl": 0.031613022554665804, "learning_rate": 4.026686969698196e-06, "loss": 0.0013, "reward": 3.395833373069763, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 676 }, { "completion_length": 153.5000057220459, "epoch": 0.3623712029974575, "grad_norm": 1.5, "kl": 0.033104993868619204, "learning_rate": 4.022984470651012e-06, "loss": 0.0013, "reward": 2.854166716337204, "reward_std": 0.5922432988882065, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 677 }, { "completion_length": 150.2500057220459, "epoch": 0.36290646326776393, "grad_norm": 1.609375, "kl": 0.0589370196685195, "learning_rate": 4.01927665224035e-06, "loss": 0.0024, "reward": 2.489583373069763, "reward_std": 0.5041960887610912, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 678 }, { "completion_length": 134.3750057220459, "epoch": 0.36344172353807036, "grad_norm": 0.9765625, "kl": 0.026309417095035315, "learning_rate": 4.015563527416596e-06, "loss": 0.0011, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 679 }, { "completion_length": 151.5833396911621, "epoch": 0.36397698380837684, "grad_norm": 1.359375, "kl": 0.023342951899394393, "learning_rate": 4.011845109148666e-06, "loss": 0.0009, "reward": 3.1666666865348816, "reward_std": 0.4779854416847229, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 680 }, { "completion_length": 138.29166984558105, "epoch": 0.36451224407868327, "grad_norm": 2.390625, "kl": 0.02908670110628009, "learning_rate": 4.0081214104239656e-06, "loss": 0.0012, "reward": 2.5625000298023224, "reward_std": 0.6283334940671921, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 681 }, { "completion_length": 164.4583396911621, "epoch": 0.3650475043489897, "grad_norm": 1.7421875, "kl": 0.04680294170975685, "learning_rate": 4.004392444248347e-06, "loss": 0.0019, "reward": 2.8906250596046448, "reward_std": 0.8390896618366241, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 682 }, { "completion_length": 113.66666793823242, "epoch": 0.3655827646192961, "grad_norm": 1.390625, "kl": 0.03361522685736418, "learning_rate": 4.000658223646057e-06, "loss": 0.0013, "reward": 3.2291666865348816, "reward_std": 0.4509793668985367, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 683 }, { "completion_length": 127.54166984558105, "epoch": 0.36611802488960254, "grad_norm": 0.87109375, "kl": 0.035005373414605856, "learning_rate": 3.996918761659694e-06, "loss": 0.0014, "reward": 2.895833373069763, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 684 }, { "completion_length": 172.6250057220459, "epoch": 0.366653285159909, "grad_norm": 1.828125, "kl": 0.027844190131872892, "learning_rate": 3.993174071350164e-06, "loss": 0.0011, "reward": 2.6302084624767303, "reward_std": 0.8579408079385757, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 685 }, { "completion_length": 129.62500381469727, "epoch": 0.36718854543021545, "grad_norm": 1.4765625, "kl": 0.02923845173791051, "learning_rate": 3.989424165796637e-06, "loss": 0.0012, "reward": 3.083333373069763, "reward_std": 0.46232306957244873, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 686 }, { "completion_length": 129.66666984558105, "epoch": 0.3677238057005219, "grad_norm": 1.765625, "kl": 0.0485138155054301, "learning_rate": 3.985669058096493e-06, "loss": 0.0019, "reward": 3.2500000596046448, "reward_std": 0.49983541294932365, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 687 }, { "completion_length": 162.2500057220459, "epoch": 0.3682590659708283, "grad_norm": 1.5078125, "kl": 0.022985886316746473, "learning_rate": 3.981908761365286e-06, "loss": 0.0009, "reward": 2.442708373069763, "reward_std": 0.4739741384983063, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 688 }, { "completion_length": 156.87500381469727, "epoch": 0.36879432624113473, "grad_norm": 2.046875, "kl": 0.02942904783412814, "learning_rate": 3.978143288736692e-06, "loss": 0.0012, "reward": 2.5208334028720856, "reward_std": 0.7050773799419403, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 689 }, { "completion_length": 134.25000190734863, "epoch": 0.3693295865114412, "grad_norm": 2.765625, "kl": 0.08899490907788277, "learning_rate": 3.974372653362466e-06, "loss": 0.0036, "reward": 2.770833373069763, "reward_std": 0.6551035642623901, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 690 }, { "completion_length": 158.16666984558105, "epoch": 0.36986484678174764, "grad_norm": 1.96875, "kl": 0.03482615575194359, "learning_rate": 3.970596868412393e-06, "loss": 0.0014, "reward": 2.3750001192092896, "reward_std": 0.7919293642044067, "rewards/correctness_reward_func": 0.9166667014360428, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 691 }, { "completion_length": 159.29166793823242, "epoch": 0.37040010705205406, "grad_norm": 1.984375, "kl": 0.02983904629945755, "learning_rate": 3.966815947074246e-06, "loss": 0.0012, "reward": 3.0833334922790527, "reward_std": 0.9657258689403534, "rewards/correctness_reward_func": 1.6666667461395264, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 692 }, { "completion_length": 129.87500190734863, "epoch": 0.3709353673223605, "grad_norm": 1.71875, "kl": 0.04448452312499285, "learning_rate": 3.963029902553738e-06, "loss": 0.0018, "reward": 3.3541667461395264, "reward_std": 0.31970490142703056, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 693 }, { "completion_length": 164.91666793823242, "epoch": 0.3714706275926669, "grad_norm": 1.7109375, "kl": 0.03725033439695835, "learning_rate": 3.959238748074474e-06, "loss": 0.0015, "reward": 2.5833334028720856, "reward_std": 0.4248107150197029, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 694 }, { "completion_length": 150.75000762939453, "epoch": 0.3720058878629734, "grad_norm": 1.765625, "kl": 0.04319385718554258, "learning_rate": 3.955442496877908e-06, "loss": 0.0017, "reward": 2.7916667461395264, "reward_std": 0.6950604021549225, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 695 }, { "completion_length": 175.54167556762695, "epoch": 0.3725411481332798, "grad_norm": 1.234375, "kl": 0.04713658872060478, "learning_rate": 3.951641162223298e-06, "loss": 0.0019, "reward": 2.7916666865348816, "reward_std": 0.29788626730442047, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 696 }, { "completion_length": 144.95833778381348, "epoch": 0.37307640840358625, "grad_norm": 1.9765625, "kl": 0.03795685060322285, "learning_rate": 3.947834757387651e-06, "loss": 0.0015, "reward": 3.0416667461395264, "reward_std": 0.7436887919902802, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 697 }, { "completion_length": 155.7083396911621, "epoch": 0.37361166867389267, "grad_norm": 1.1171875, "kl": 0.02909352071583271, "learning_rate": 3.944023295665688e-06, "loss": 0.0012, "reward": 2.8125000596046448, "reward_std": 0.4592793434858322, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 698 }, { "completion_length": 118.83333587646484, "epoch": 0.3741469289441991, "grad_norm": 1.0078125, "kl": 0.028855583164840937, "learning_rate": 3.9402067903697894e-06, "loss": 0.0012, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 699 }, { "completion_length": 106.25000190734863, "epoch": 0.3746821892145056, "grad_norm": 1.4765625, "kl": 0.04083001893013716, "learning_rate": 3.936385254829953e-06, "loss": 0.0016, "reward": 2.895833373069763, "reward_std": 0.11558076739311218, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 700 }, { "completion_length": 141.2083396911621, "epoch": 0.375217449484812, "grad_norm": 1.1484375, "kl": 0.04432675335556269, "learning_rate": 3.932558702393746e-06, "loss": 0.0018, "reward": 3.020833373069763, "reward_std": 0.5874452292919159, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 701 }, { "completion_length": 122.20833587646484, "epoch": 0.37575270975511843, "grad_norm": 1.078125, "kl": 0.02684123977087438, "learning_rate": 3.928727146426258e-06, "loss": 0.0011, "reward": 2.7291666865348816, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 702 }, { "completion_length": 110.75000381469727, "epoch": 0.37628797002542486, "grad_norm": 2.09375, "kl": 0.05112092103809118, "learning_rate": 3.9248906003100514e-06, "loss": 0.002, "reward": 3.3125000596046448, "reward_std": 0.40438438951969147, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 703 }, { "completion_length": 182.45833778381348, "epoch": 0.3768232302957313, "grad_norm": 2.03125, "kl": 0.027650201227515936, "learning_rate": 3.921049077445124e-06, "loss": 0.0011, "reward": 2.005208373069763, "reward_std": 0.39549052342772484, "rewards/correctness_reward_func": 0.5833333358168602, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 704 }, { "completion_length": 162.87500381469727, "epoch": 0.37735849056603776, "grad_norm": 0.65234375, "kl": 0.019340375438332558, "learning_rate": 3.91720259124885e-06, "loss": 0.0008, "reward": 3.125, "reward_std": 0.2958039939403534, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 705 }, { "completion_length": 141.70833587646484, "epoch": 0.3778937508363442, "grad_norm": 1.25, "kl": 0.030775428283959627, "learning_rate": 3.913351155155943e-06, "loss": 0.0012, "reward": 2.9375, "reward_std": 0.6161879003047943, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 706 }, { "completion_length": 200.62500381469727, "epoch": 0.3784290111066506, "grad_norm": 1.8203125, "kl": 0.04161107540130615, "learning_rate": 3.909494782618403e-06, "loss": 0.0017, "reward": 2.5208334028720856, "reward_std": 0.7303955964744091, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 707 }, { "completion_length": 110.66667175292969, "epoch": 0.37896427137695704, "grad_norm": 3.234375, "kl": 0.061036181170493364, "learning_rate": 3.905633487105474e-06, "loss": 0.0024, "reward": 3.3541667461395264, "reward_std": 0.3572172410786152, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 708 }, { "completion_length": 107.00000381469727, "epoch": 0.37949953164726347, "grad_norm": 1.6875, "kl": 0.036092507652938366, "learning_rate": 3.9017672821035915e-06, "loss": 0.0014, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 709 }, { "completion_length": 155.79167366027832, "epoch": 0.3800347919175699, "grad_norm": 1.5, "kl": 0.025613056030124426, "learning_rate": 3.897896181116341e-06, "loss": 0.001, "reward": 3.083333373069763, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 710 }, { "completion_length": 146.33333587646484, "epoch": 0.3805700521878764, "grad_norm": 1.4296875, "kl": 0.02829862991347909, "learning_rate": 3.8940201976644065e-06, "loss": 0.0011, "reward": 2.916666716337204, "reward_std": 0.16661179810762405, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 711 }, { "completion_length": 146.33333778381348, "epoch": 0.3811053124581828, "grad_norm": 1.484375, "kl": 0.02284587174654007, "learning_rate": 3.890139345285527e-06, "loss": 0.0009, "reward": 2.7916666865348816, "reward_std": 0.5202587842941284, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 712 }, { "completion_length": 125.29166984558105, "epoch": 0.3816405727284892, "grad_norm": 0.62890625, "kl": 0.02871770365163684, "learning_rate": 3.886253637534447e-06, "loss": 0.0011, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 713 }, { "completion_length": 154.54166793823242, "epoch": 0.38217583299879565, "grad_norm": 1.8046875, "kl": 0.0300689903087914, "learning_rate": 3.882363087982868e-06, "loss": 0.0012, "reward": 3.0000000596046448, "reward_std": 0.7361843585968018, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 714 }, { "completion_length": 178.29167556762695, "epoch": 0.3827110932691021, "grad_norm": 1.2109375, "kl": 0.02768306853249669, "learning_rate": 3.878467710219402e-06, "loss": 0.0011, "reward": 2.6041666865348816, "reward_std": 0.30103103816509247, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 715 }, { "completion_length": 142.58333587646484, "epoch": 0.38324635353940856, "grad_norm": 1.5234375, "kl": 0.029177965596318245, "learning_rate": 3.874567517849529e-06, "loss": 0.0012, "reward": 3.083333373069763, "reward_std": 0.5974817872047424, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 716 }, { "completion_length": 146.62500762939453, "epoch": 0.383781613809715, "grad_norm": 1.4375, "kl": 0.040057963225990534, "learning_rate": 3.87066252449554e-06, "loss": 0.0016, "reward": 3.270833373069763, "reward_std": 0.2837683819234371, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 717 }, { "completion_length": 161.1666717529297, "epoch": 0.3843168740800214, "grad_norm": 1.34375, "kl": 0.03427933529019356, "learning_rate": 3.8667527437964974e-06, "loss": 0.0014, "reward": 2.8125000298023224, "reward_std": 0.4592793434858322, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 718 }, { "completion_length": 172.37500762939453, "epoch": 0.38485213435032783, "grad_norm": 1.7578125, "kl": 0.03410201659426093, "learning_rate": 3.8628381894081835e-06, "loss": 0.0014, "reward": 2.9791667461395264, "reward_std": 0.8043745756149292, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 719 }, { "completion_length": 143.0416717529297, "epoch": 0.38538739462063426, "grad_norm": 1.4921875, "kl": 0.03300033137202263, "learning_rate": 3.858918875003053e-06, "loss": 0.0013, "reward": 3.1250000596046448, "reward_std": 0.5425351560115814, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 720 }, { "completion_length": 142.12500381469727, "epoch": 0.38592265489094074, "grad_norm": 1.6171875, "kl": 0.03706725034862757, "learning_rate": 3.854994814270189e-06, "loss": 0.0015, "reward": 3.1875000596046448, "reward_std": 0.7654656171798706, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 721 }, { "completion_length": 126.95833396911621, "epoch": 0.38645791516124717, "grad_norm": 3.34375, "kl": 0.09948566742241383, "learning_rate": 3.851066020915248e-06, "loss": 0.004, "reward": 3.208333373069763, "reward_std": 0.6018974781036377, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 722 }, { "completion_length": 161.5833396911621, "epoch": 0.3869931754315536, "grad_norm": 1.78125, "kl": 0.0415203096345067, "learning_rate": 3.84713250866042e-06, "loss": 0.0017, "reward": 3.2916667461395264, "reward_std": 0.45541542395949364, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 723 }, { "completion_length": 150.625, "epoch": 0.38752843570186, "grad_norm": 2.0625, "kl": 0.05728019238449633, "learning_rate": 3.843194291244375e-06, "loss": 0.0023, "reward": 2.6041667461395264, "reward_std": 0.6071162149310112, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 724 }, { "completion_length": 126.62500190734863, "epoch": 0.38806369597216644, "grad_norm": 2.453125, "kl": 0.049750881269574165, "learning_rate": 3.839251382422217e-06, "loss": 0.002, "reward": 3.1875000596046448, "reward_std": 0.6529284827411175, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 725 }, { "completion_length": 122.04166984558105, "epoch": 0.3885989562424729, "grad_norm": 1.6484375, "kl": 0.03970834193751216, "learning_rate": 3.8353037959654344e-06, "loss": 0.0016, "reward": 3.1666667461395264, "reward_std": 0.5163978338241577, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 726 }, { "completion_length": 112.58333587646484, "epoch": 0.38913421651277935, "grad_norm": 2.4375, "kl": 0.05409126076847315, "learning_rate": 3.8313515456618565e-06, "loss": 0.0022, "reward": 3.3125000596046448, "reward_std": 0.4592793248593807, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 727 }, { "completion_length": 159.87500381469727, "epoch": 0.3896694767830858, "grad_norm": 1.78125, "kl": 0.04191382694989443, "learning_rate": 3.827394645315601e-06, "loss": 0.0017, "reward": 2.5416667461395264, "reward_std": 0.6094035357236862, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 728 }, { "completion_length": 200.4166717529297, "epoch": 0.3902047370533922, "grad_norm": 3.890625, "kl": 0.15797852352261543, "learning_rate": 3.823433108747024e-06, "loss": 0.0063, "reward": 2.2500000298023224, "reward_std": 0.6845854222774506, "rewards/correctness_reward_func": 1.0000000447034836, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 729 }, { "completion_length": 120.58333587646484, "epoch": 0.39073999732369863, "grad_norm": 1.28125, "kl": 0.03376244753599167, "learning_rate": 3.819466949792677e-06, "loss": 0.0014, "reward": 3.1041666865348816, "reward_std": 0.54645074903965, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 730 }, { "completion_length": 121.66666793823242, "epoch": 0.3912752575940051, "grad_norm": 1.4375, "kl": 0.02545109740458429, "learning_rate": 3.81549618230526e-06, "loss": 0.001, "reward": 3.083333373069763, "reward_std": 0.5320602059364319, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 731 }, { "completion_length": 143.37500381469727, "epoch": 0.39181051786431154, "grad_norm": 1.6875, "kl": 0.03742834506556392, "learning_rate": 3.8115208201535603e-06, "loss": 0.0015, "reward": 2.583333373069763, "reward_std": 0.5425351560115814, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 732 }, { "completion_length": 136.87500381469727, "epoch": 0.39234577813461796, "grad_norm": 1.90625, "kl": 0.02556539513170719, "learning_rate": 3.8075408772224214e-06, "loss": 0.001, "reward": 2.9791667461395264, "reward_std": 0.5674288719892502, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 733 }, { "completion_length": 147.9583396911621, "epoch": 0.3928810384049244, "grad_norm": 1.59375, "kl": 0.02737267129123211, "learning_rate": 3.8035563674126818e-06, "loss": 0.0011, "reward": 3.333333373069763, "reward_std": 0.40824827551841736, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 734 }, { "completion_length": 182.4583396911621, "epoch": 0.3934162986752308, "grad_norm": 1.5, "kl": 0.042339869774878025, "learning_rate": 3.7995673046411336e-06, "loss": 0.0017, "reward": 2.8125, "reward_std": 0.2621144950389862, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 735 }, { "completion_length": 134.54166984558105, "epoch": 0.3939515589455373, "grad_norm": 2.453125, "kl": 0.04328146809712052, "learning_rate": 3.795573702840468e-06, "loss": 0.0017, "reward": 2.6875000596046448, "reward_std": 0.729445070028305, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 736 }, { "completion_length": 131.83333587646484, "epoch": 0.3944868192158437, "grad_norm": 1.8046875, "kl": 0.046282849740237, "learning_rate": 3.791575575959232e-06, "loss": 0.0019, "reward": 2.6250000596046448, "reward_std": 0.7266311347484589, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 737 }, { "completion_length": 142.70833587646484, "epoch": 0.39502207948615015, "grad_norm": 1.5703125, "kl": 0.041007681749761105, "learning_rate": 3.7875729379617766e-06, "loss": 0.0016, "reward": 2.9166666865348816, "reward_std": 0.6185525953769684, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 738 }, { "completion_length": 122.33333778381348, "epoch": 0.39555733975645657, "grad_norm": 1.8359375, "kl": 0.037428571842610836, "learning_rate": 3.7835658028282092e-06, "loss": 0.0015, "reward": 3.1250000596046448, "reward_std": 0.5553287714719772, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 739 }, { "completion_length": 162.91666793823242, "epoch": 0.396092600026763, "grad_norm": 1.65625, "kl": 0.04300609743222594, "learning_rate": 3.779554184554345e-06, "loss": 0.0017, "reward": 3.0000001192092896, "reward_std": 0.60411436855793, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 740 }, { "completion_length": 162.66666984558105, "epoch": 0.3966278602970694, "grad_norm": 1.1328125, "kl": 0.02084392588585615, "learning_rate": 3.7755380971516563e-06, "loss": 0.0008, "reward": 3.020833373069763, "reward_std": 0.5409832894802094, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 741 }, { "completion_length": 154.6250057220459, "epoch": 0.3971631205673759, "grad_norm": 2.359375, "kl": 0.056923945900052786, "learning_rate": 3.771517554647226e-06, "loss": 0.0023, "reward": 3.208333432674408, "reward_std": 0.5449211224913597, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 742 }, { "completion_length": 128.25000381469727, "epoch": 0.39769838083768233, "grad_norm": 0.83984375, "kl": 0.035280851647257805, "learning_rate": 3.7674925710836964e-06, "loss": 0.0014, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 743 }, { "completion_length": 134.4583396911621, "epoch": 0.39823364110798876, "grad_norm": 2.140625, "kl": 0.03490069089457393, "learning_rate": 3.7634631605192225e-06, "loss": 0.0014, "reward": 3.395833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 744 }, { "completion_length": 141.95833778381348, "epoch": 0.3987689013782952, "grad_norm": 1.6796875, "kl": 0.040149425622075796, "learning_rate": 3.7594293370274193e-06, "loss": 0.0016, "reward": 3.4375000596046448, "reward_std": 0.1530931070446968, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 745 }, { "completion_length": 119.70833587646484, "epoch": 0.3993041616486016, "grad_norm": 0.97265625, "kl": 0.0623900992795825, "learning_rate": 3.7553911146973176e-06, "loss": 0.0025, "reward": 3.4791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 746 }, { "completion_length": 154.41666984558105, "epoch": 0.3998394219189081, "grad_norm": 2.0625, "kl": 0.030291962437331676, "learning_rate": 3.7513485076333116e-06, "loss": 0.0012, "reward": 2.8385417461395264, "reward_std": 0.8882782272994518, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 747 }, { "completion_length": 129.2083396911621, "epoch": 0.4003746821892145, "grad_norm": 0.9375, "kl": 0.04303775355219841, "learning_rate": 3.747301529955108e-06, "loss": 0.0017, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 748 }, { "completion_length": 148.37500381469727, "epoch": 0.40090994245952094, "grad_norm": 1.3359375, "kl": 0.04529419634491205, "learning_rate": 3.743250195797682e-06, "loss": 0.0018, "reward": 2.5416666865348816, "reward_std": 0.2686738818883896, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 749 }, { "completion_length": 186.37500762939453, "epoch": 0.40144520272982737, "grad_norm": 1.6796875, "kl": 0.03184348437935114, "learning_rate": 3.739194519311221e-06, "loss": 0.0013, "reward": 2.8125000596046448, "reward_std": 0.7753209173679352, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 750 }, { "completion_length": 136.5000057220459, "epoch": 0.4019804630001338, "grad_norm": 1.6328125, "kl": 0.02823848556727171, "learning_rate": 3.735134514661083e-06, "loss": 0.0011, "reward": 2.9791667461395264, "reward_std": 0.5674288682639599, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 751 }, { "completion_length": 149.00000381469727, "epoch": 0.4025157232704403, "grad_norm": 1.8203125, "kl": 0.03137681819498539, "learning_rate": 3.7310701960277412e-06, "loss": 0.0013, "reward": 2.8541667461395264, "reward_std": 0.7610780447721481, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 752 }, { "completion_length": 122.08333396911621, "epoch": 0.4030509835407467, "grad_norm": 0.08154296875, "kl": 0.02845725789666176, "learning_rate": 3.7270015776067354e-06, "loss": 0.0011, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 753 }, { "completion_length": 193.6666717529297, "epoch": 0.4035862438110531, "grad_norm": 1.1484375, "kl": 0.031050281133502722, "learning_rate": 3.7229286736086238e-06, "loss": 0.0012, "reward": 2.520833343267441, "reward_std": 0.4007553458213806, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 754 }, { "completion_length": 158.5416717529297, "epoch": 0.40412150408135955, "grad_norm": 1.8359375, "kl": 0.05140957282856107, "learning_rate": 3.718851498258935e-06, "loss": 0.0021, "reward": 3.2291667461395264, "reward_std": 0.3866970017552376, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250001303851604, "rewards/xmlcount_reward_func": 0.5, "step": 755 }, { "completion_length": 152.6666717529297, "epoch": 0.404656764351666, "grad_norm": 1.375, "kl": 0.027617693413048983, "learning_rate": 3.714770065798114e-06, "loss": 0.0011, "reward": 3.1875000596046448, "reward_std": 0.536520928144455, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 756 }, { "completion_length": 126.50000381469727, "epoch": 0.40519202462197246, "grad_norm": 1.6171875, "kl": 0.03795776842162013, "learning_rate": 3.7106843904814754e-06, "loss": 0.0015, "reward": 3.333333373069763, "reward_std": 0.40824829041957855, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 757 }, { "completion_length": 149.50000190734863, "epoch": 0.4057272848922789, "grad_norm": 1.4765625, "kl": 0.04007122712209821, "learning_rate": 3.7065944865791528e-06, "loss": 0.0016, "reward": 3.3541667461395264, "reward_std": 0.19615865871310234, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 758 }, { "completion_length": 168.58333778381348, "epoch": 0.4062625451625853, "grad_norm": 1.234375, "kl": 0.027591979131102562, "learning_rate": 3.7025003683760485e-06, "loss": 0.0011, "reward": 2.958333373069763, "reward_std": 0.6673498451709747, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 759 }, { "completion_length": 209.04167556762695, "epoch": 0.40679780543289173, "grad_norm": 1.6953125, "kl": 0.04418479232117534, "learning_rate": 3.6984020501717864e-06, "loss": 0.0018, "reward": 2.1250000596046448, "reward_std": 0.7144279181957245, "rewards/correctness_reward_func": 0.916666679084301, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333432674408, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 760 }, { "completion_length": 150.41666793823242, "epoch": 0.40733306570319816, "grad_norm": 2.359375, "kl": 0.044540490955114365, "learning_rate": 3.6942995462806574e-06, "loss": 0.0018, "reward": 3.083333432674408, "reward_std": 0.7685092948377132, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 761 }, { "completion_length": 171.1250057220459, "epoch": 0.40786832597350464, "grad_norm": 0.65234375, "kl": 0.04095438402146101, "learning_rate": 3.690192871031574e-06, "loss": 0.0016, "reward": 3.2916666865348816, "reward_std": 0.2813657224178314, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 762 }, { "completion_length": 134.0416717529297, "epoch": 0.40840358624381107, "grad_norm": 2.0, "kl": 0.035194840747863054, "learning_rate": 3.6860820387680145e-06, "loss": 0.0014, "reward": 2.8750000596046448, "reward_std": 0.306186206638813, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 763 }, { "completion_length": 173.16667366027832, "epoch": 0.4089388465141175, "grad_norm": 0.9609375, "kl": 0.029444378335028887, "learning_rate": 3.681967063847981e-06, "loss": 0.0012, "reward": 2.9479166865348816, "reward_std": 0.3116655945777893, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 764 }, { "completion_length": 127.33333778381348, "epoch": 0.4094741067844239, "grad_norm": 2.71875, "kl": 0.06386626185849309, "learning_rate": 3.6778479606439412e-06, "loss": 0.0026, "reward": 2.9791667461395264, "reward_std": 0.7575252056121826, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 765 }, { "completion_length": 178.8333396911621, "epoch": 0.41000936705473034, "grad_norm": 1.7890625, "kl": 0.032990507781505585, "learning_rate": 3.673724743542785e-06, "loss": 0.0013, "reward": 2.432291716337204, "reward_std": 0.6304677873849869, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 766 }, { "completion_length": 165.2500057220459, "epoch": 0.4105446273250368, "grad_norm": 1.4296875, "kl": 0.028883651364594698, "learning_rate": 3.669597426945768e-06, "loss": 0.0012, "reward": 2.8750000596046448, "reward_std": 0.7516101598739624, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 767 }, { "completion_length": 134.41666793823242, "epoch": 0.41107988759534325, "grad_norm": 1.203125, "kl": 0.020841211080551147, "learning_rate": 3.6654660252684643e-06, "loss": 0.0008, "reward": 3.395833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 768 }, { "completion_length": 136.41666984558105, "epoch": 0.4116151478656497, "grad_norm": 1.1875, "kl": 0.03158806264400482, "learning_rate": 3.661330552940719e-06, "loss": 0.0013, "reward": 3.1875000596046448, "reward_std": 0.4421939253807068, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 769 }, { "completion_length": 143.41667366027832, "epoch": 0.4121504081359561, "grad_norm": 1.3984375, "kl": 0.022135701961815357, "learning_rate": 3.6571910244065927e-06, "loss": 0.0009, "reward": 2.9791666865348816, "reward_std": 0.35721728205680847, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 770 }, { "completion_length": 160.29167366027832, "epoch": 0.41268566840626253, "grad_norm": 1.3671875, "kl": 0.033591088373214006, "learning_rate": 3.6530474541243127e-06, "loss": 0.0013, "reward": 2.708333343267441, "reward_std": 0.4999281316995621, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 771 }, { "completion_length": 147.66666793823242, "epoch": 0.41322092867656895, "grad_norm": 0.07470703125, "kl": 0.02724658139050007, "learning_rate": 3.648899856566225e-06, "loss": 0.0011, "reward": 3.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 772 }, { "completion_length": 126.41666984558105, "epoch": 0.41375618894687544, "grad_norm": 1.0234375, "kl": 0.04276125319302082, "learning_rate": 3.644748246218739e-06, "loss": 0.0017, "reward": 2.9791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 773 }, { "completion_length": 128.54166984558105, "epoch": 0.41429144921718186, "grad_norm": 3.296875, "kl": 0.10612674592994153, "learning_rate": 3.6405926375822824e-06, "loss": 0.0042, "reward": 3.1875, "reward_std": 0.36869701743125916, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 774 }, { "completion_length": 150.33333778381348, "epoch": 0.4148267094874883, "grad_norm": 2.078125, "kl": 0.04650158202275634, "learning_rate": 3.636433045171247e-06, "loss": 0.0019, "reward": 2.8281250298023224, "reward_std": 0.6533168256282806, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 775 }, { "completion_length": 181.29167556762695, "epoch": 0.4153619697577947, "grad_norm": 1.6640625, "kl": 0.019466244149953127, "learning_rate": 3.6322694835139384e-06, "loss": 0.0008, "reward": 2.7083334028720856, "reward_std": 0.659539595246315, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 776 }, { "completion_length": 150.6666717529297, "epoch": 0.41589723002810114, "grad_norm": 1.7421875, "kl": 0.03336867177858949, "learning_rate": 3.6281019671525236e-06, "loss": 0.0013, "reward": 2.708333432674408, "reward_std": 0.7875140719115734, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 777 }, { "completion_length": 185.62500762939453, "epoch": 0.4164324902984076, "grad_norm": 1.6640625, "kl": 0.030382550787180662, "learning_rate": 3.6239305106429866e-06, "loss": 0.0012, "reward": 2.6250000298023224, "reward_std": 0.32274864614009857, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 778 }, { "completion_length": 130.12500190734863, "epoch": 0.41696775056871405, "grad_norm": 1.9921875, "kl": 0.03384571289643645, "learning_rate": 3.619755128555071e-06, "loss": 0.0014, "reward": 2.958333373069763, "reward_std": 0.7672725319862366, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 779 }, { "completion_length": 123.58333587646484, "epoch": 0.41750301083902047, "grad_norm": 1.5546875, "kl": 0.0329477502964437, "learning_rate": 3.6155758354722313e-06, "loss": 0.0013, "reward": 3.3125000596046448, "reward_std": 0.40438438951969147, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 780 }, { "completion_length": 155.29166793823242, "epoch": 0.4180382711093269, "grad_norm": 1.40625, "kl": 0.023525531636551023, "learning_rate": 3.6113926459915822e-06, "loss": 0.0009, "reward": 3.2916666865348816, "reward_std": 0.4541241526603699, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 781 }, { "completion_length": 167.0833396911621, "epoch": 0.4185735313796333, "grad_norm": 1.5234375, "kl": 0.037068808916956186, "learning_rate": 3.6072055747238465e-06, "loss": 0.0015, "reward": 2.875000089406967, "reward_std": 0.5268727838993073, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 782 }, { "completion_length": 151.9583396911621, "epoch": 0.4191087916499398, "grad_norm": 1.765625, "kl": 0.06076545687392354, "learning_rate": 3.603014636293307e-06, "loss": 0.0024, "reward": 2.9791666865348816, "reward_std": 0.5683934539556503, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 783 }, { "completion_length": 174.3750057220459, "epoch": 0.41964405192024623, "grad_norm": 1.5859375, "kl": 0.027573922649025917, "learning_rate": 3.598819845337752e-06, "loss": 0.0011, "reward": 2.8125000596046448, "reward_std": 0.3092299550771713, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 784 }, { "completion_length": 138.2916717529297, "epoch": 0.42017931219055266, "grad_norm": 1.8515625, "kl": 0.028004382736980915, "learning_rate": 3.594621216508426e-06, "loss": 0.0011, "reward": 2.8958334028720856, "reward_std": 0.6634034961462021, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 785 }, { "completion_length": 131.83333587646484, "epoch": 0.4207145724608591, "grad_norm": 0.71875, "kl": 0.024483149405568838, "learning_rate": 3.590418764469978e-06, "loss": 0.001, "reward": 3.4791666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 786 }, { "completion_length": 118.62500381469727, "epoch": 0.4212498327311655, "grad_norm": 1.71875, "kl": 0.029699893668293953, "learning_rate": 3.586212503900411e-06, "loss": 0.0012, "reward": 3.145833373069763, "reward_std": 0.5042977333068848, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 787 }, { "completion_length": 117.00000381469727, "epoch": 0.421785093001472, "grad_norm": 1.3203125, "kl": 0.03554026409983635, "learning_rate": 3.582002449491029e-06, "loss": 0.0014, "reward": 3.3125000596046448, "reward_std": 0.40438440442085266, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 788 }, { "completion_length": 166.95833587646484, "epoch": 0.4223203532717784, "grad_norm": 1.6640625, "kl": 0.044573254650458694, "learning_rate": 3.5777886159463875e-06, "loss": 0.0018, "reward": 2.5000000596046448, "reward_std": 0.7781640589237213, "rewards/correctness_reward_func": 1.166666679084301, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 789 }, { "completion_length": 142.29166984558105, "epoch": 0.42285561354208484, "grad_norm": 1.2265625, "kl": 0.04031791351735592, "learning_rate": 3.573571017984242e-06, "loss": 0.0016, "reward": 2.958333373069763, "reward_std": 0.5254304707050323, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 790 }, { "completion_length": 172.25000762939453, "epoch": 0.42339087381239127, "grad_norm": 1.3671875, "kl": 0.033897851360961795, "learning_rate": 3.5693496703354956e-06, "loss": 0.0014, "reward": 2.895833373069763, "reward_std": 0.47181354090571404, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 791 }, { "completion_length": 145.50000381469727, "epoch": 0.4239261340826977, "grad_norm": 1.5234375, "kl": 0.04099600203335285, "learning_rate": 3.5651245877441476e-06, "loss": 0.0016, "reward": 3.395833373069763, "reward_std": 0.1705273911356926, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 792 }, { "completion_length": 123.83333969116211, "epoch": 0.4244613943530042, "grad_norm": 0.703125, "kl": 0.03077300125733018, "learning_rate": 3.560895784967242e-06, "loss": 0.0012, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 793 }, { "completion_length": 111.25000381469727, "epoch": 0.4249966546233106, "grad_norm": 2.125, "kl": 0.0695495493710041, "learning_rate": 3.5566632767748183e-06, "loss": 0.0028, "reward": 3.3125000596046448, "reward_std": 0.3746515288949013, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 794 }, { "completion_length": 136.08333778381348, "epoch": 0.425531914893617, "grad_norm": 2.0, "kl": 0.036902827210724354, "learning_rate": 3.552427077949856e-06, "loss": 0.0015, "reward": 2.7291666865348816, "reward_std": 0.8262978196144104, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 795 }, { "completion_length": 133.45833587646484, "epoch": 0.42606717516392345, "grad_norm": 1.3671875, "kl": 0.04546619579195976, "learning_rate": 3.5481872032882276e-06, "loss": 0.0018, "reward": 2.583333373069763, "reward_std": 0.33393850177526474, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 796 }, { "completion_length": 107.29166984558105, "epoch": 0.4266024354342299, "grad_norm": 1.1328125, "kl": 0.03557090531103313, "learning_rate": 3.5439436675986403e-06, "loss": 0.0014, "reward": 3.1666666865348816, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 797 }, { "completion_length": 141.91667366027832, "epoch": 0.42713769570453636, "grad_norm": 1.90625, "kl": 0.03451378410682082, "learning_rate": 3.539696485702592e-06, "loss": 0.0014, "reward": 3.2291667461395264, "reward_std": 0.6085085272789001, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 798 }, { "completion_length": 187.5416717529297, "epoch": 0.4276729559748428, "grad_norm": 1.5, "kl": 0.029891248792409897, "learning_rate": 3.535445672434313e-06, "loss": 0.0012, "reward": 2.479166716337204, "reward_std": 0.9521069824695587, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 799 }, { "completion_length": 153.5833396911621, "epoch": 0.4282082162451492, "grad_norm": 1.9296875, "kl": 0.0294723529368639, "learning_rate": 3.5311912426407185e-06, "loss": 0.0012, "reward": 3.1875000596046448, "reward_std": 0.6529284864664078, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 800 }, { "completion_length": 140.91667556762695, "epoch": 0.42874347651545563, "grad_norm": 1.8203125, "kl": 0.030609098728746176, "learning_rate": 3.526933211181356e-06, "loss": 0.0012, "reward": 2.958333373069763, "reward_std": 0.8007340431213379, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 801 }, { "completion_length": 180.5416717529297, "epoch": 0.42927873678576206, "grad_norm": 1.484375, "kl": 0.025776101276278496, "learning_rate": 3.5226715929283507e-06, "loss": 0.001, "reward": 2.8750000596046448, "reward_std": 0.8291900753974915, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 802 }, { "completion_length": 158.5833396911621, "epoch": 0.4298139970560685, "grad_norm": 0.9765625, "kl": 0.03874217625707388, "learning_rate": 3.5184064027663554e-06, "loss": 0.0015, "reward": 3.3750000596046448, "reward_std": 0.25129128992557526, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 803 }, { "completion_length": 126.08333778381348, "epoch": 0.43034925732637497, "grad_norm": 0.8984375, "kl": 0.033212998416274786, "learning_rate": 3.514137655592501e-06, "loss": 0.0013, "reward": 3.1875, "reward_std": 0.25920552015304565, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 804 }, { "completion_length": 136.16666793823242, "epoch": 0.4308845175966814, "grad_norm": 1.8203125, "kl": 0.026766558177769184, "learning_rate": 3.5098653663163405e-06, "loss": 0.0011, "reward": 3.1041667461395264, "reward_std": 0.7366744130849838, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 805 }, { "completion_length": 112.29166984558105, "epoch": 0.4314197778669878, "grad_norm": 1.25, "kl": 0.03665527980774641, "learning_rate": 3.505589549859798e-06, "loss": 0.0015, "reward": 3.2291667461395264, "reward_std": 0.5133541226387024, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 806 }, { "completion_length": 145.5416717529297, "epoch": 0.43195503813729424, "grad_norm": 2.03125, "kl": 0.030686243437230587, "learning_rate": 3.5013102211571182e-06, "loss": 0.0012, "reward": 2.770833373069763, "reward_std": 0.6543844044208527, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 807 }, { "completion_length": 132.4166717529297, "epoch": 0.43249029840760067, "grad_norm": 0.9296875, "kl": 0.05077707674354315, "learning_rate": 3.497027395154811e-06, "loss": 0.002, "reward": 3.3541666865348816, "reward_std": 0.24259880185127258, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 808 }, { "completion_length": 177.4166717529297, "epoch": 0.43302555867790715, "grad_norm": 1.4140625, "kl": 0.02278682473115623, "learning_rate": 3.4927410868116047e-06, "loss": 0.0009, "reward": 3.0468750596046448, "reward_std": 0.7866534292697906, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 809 }, { "completion_length": 155.7916717529297, "epoch": 0.4335608189482136, "grad_norm": 2.03125, "kl": 0.06975524500012398, "learning_rate": 3.4884513110983886e-06, "loss": 0.0028, "reward": 3.020833373069763, "reward_std": 0.6062580458819866, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 810 }, { "completion_length": 143.50000762939453, "epoch": 0.43409607921852, "grad_norm": 1.5390625, "kl": 0.03790471563115716, "learning_rate": 3.484158082998162e-06, "loss": 0.0015, "reward": 3.2291667461395264, "reward_std": 0.4352862983942032, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 811 }, { "completion_length": 184.7083396911621, "epoch": 0.43463133948882643, "grad_norm": 1.6796875, "kl": 0.042383064050227404, "learning_rate": 3.4798614175059832e-06, "loss": 0.0017, "reward": 2.7500000596046448, "reward_std": 0.9657306373119354, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 812 }, { "completion_length": 193.20833778381348, "epoch": 0.43516659975913285, "grad_norm": 1.7109375, "kl": 0.02804533112794161, "learning_rate": 3.4755613296289152e-06, "loss": 0.0011, "reward": 1.8802083730697632, "reward_std": 0.5338976383209229, "rewards/correctness_reward_func": 0.5000000223517418, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 813 }, { "completion_length": 142.04167366027832, "epoch": 0.43570186002943934, "grad_norm": 2.0625, "kl": 0.03716372000053525, "learning_rate": 3.4712578343859775e-06, "loss": 0.0015, "reward": 3.1875000596046448, "reward_std": 0.42695439979434013, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 814 }, { "completion_length": 163.08333587646484, "epoch": 0.43623712029974576, "grad_norm": 1.2265625, "kl": 0.025884422473609447, "learning_rate": 3.4669509468080874e-06, "loss": 0.001, "reward": 3.0000000596046448, "reward_std": 0.5222772061824799, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 815 }, { "completion_length": 135.9583396911621, "epoch": 0.4367723805700522, "grad_norm": 2.34375, "kl": 0.02469735313206911, "learning_rate": 3.4626406819380125e-06, "loss": 0.001, "reward": 2.9791667461395264, "reward_std": 0.5674288682639599, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 816 }, { "completion_length": 122.87500381469727, "epoch": 0.4373076408403586, "grad_norm": 0.047119140625, "kl": 0.02518481481820345, "learning_rate": 3.458327054830315e-06, "loss": 0.001, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 817 }, { "completion_length": 156.2500057220459, "epoch": 0.43784290111066504, "grad_norm": 1.5234375, "kl": 0.037520273588597775, "learning_rate": 3.4540100805513016e-06, "loss": 0.0015, "reward": 3.083333373069763, "reward_std": 0.5320602059364319, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 818 }, { "completion_length": 176.7500057220459, "epoch": 0.4383781613809715, "grad_norm": 1.203125, "kl": 0.036197793669998646, "learning_rate": 3.4496897741789693e-06, "loss": 0.0014, "reward": 2.958333343267441, "reward_std": 0.4541241526603699, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 819 }, { "completion_length": 165.25000381469727, "epoch": 0.43891342165127795, "grad_norm": 1.6875, "kl": 0.03109767520800233, "learning_rate": 3.445366150802953e-06, "loss": 0.0012, "reward": 2.708333373069763, "reward_std": 0.7077522426843643, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 820 }, { "completion_length": 202.75000381469727, "epoch": 0.43944868192158437, "grad_norm": 1.2890625, "kl": 0.02838379517197609, "learning_rate": 3.4410392255244727e-06, "loss": 0.0011, "reward": 2.541666716337204, "reward_std": 0.3347994200885296, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 821 }, { "completion_length": 139.4583396911621, "epoch": 0.4399839421918908, "grad_norm": 1.0390625, "kl": 0.026547775603830814, "learning_rate": 3.436709013456283e-06, "loss": 0.0011, "reward": 3.3750000596046448, "reward_std": 0.2686738669872284, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 822 }, { "completion_length": 210.25000762939453, "epoch": 0.4405192024621972, "grad_norm": 1.234375, "kl": 0.02991744503378868, "learning_rate": 3.4323755297226157e-06, "loss": 0.0012, "reward": 2.371125027537346, "reward_std": 0.7365857362747192, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4127916656434536, "step": 823 }, { "completion_length": 156.7083396911621, "epoch": 0.4410544627325037, "grad_norm": 0.984375, "kl": 0.049160730093717575, "learning_rate": 3.4280387894591304e-06, "loss": 0.002, "reward": 3.4375000596046448, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 824 }, { "completion_length": 194.50000381469727, "epoch": 0.44158972300281013, "grad_norm": 1.7109375, "kl": 0.02425103122368455, "learning_rate": 3.423698807812863e-06, "loss": 0.001, "reward": 3.0, "reward_std": 0.8363019824028015, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 825 }, { "completion_length": 133.0416717529297, "epoch": 0.44212498327311656, "grad_norm": 2.15625, "kl": 0.03063865751028061, "learning_rate": 3.419355599942167e-06, "loss": 0.0012, "reward": 3.2500000596046448, "reward_std": 0.612372413277626, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 826 }, { "completion_length": 132.0416717529297, "epoch": 0.442660243543423, "grad_norm": 1.6171875, "kl": 0.036599946208298206, "learning_rate": 3.4150091810166676e-06, "loss": 0.0015, "reward": 2.5625000596046448, "reward_std": 0.4816259741783142, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 827 }, { "completion_length": 129.37500190734863, "epoch": 0.4431955038137294, "grad_norm": 1.5078125, "kl": 0.031639018561691046, "learning_rate": 3.410659566217202e-06, "loss": 0.0013, "reward": 2.4791666865348816, "reward_std": 0.5740348696708679, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 828 }, { "completion_length": 184.7916717529297, "epoch": 0.4437307640840359, "grad_norm": 1.546875, "kl": 0.03468810860067606, "learning_rate": 3.406306770735773e-06, "loss": 0.0014, "reward": 2.8125000596046448, "reward_std": 0.6695836298167706, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 829 }, { "completion_length": 168.00000762939453, "epoch": 0.4442660243543423, "grad_norm": 1.5390625, "kl": 0.04705773899331689, "learning_rate": 3.4019508097754912e-06, "loss": 0.0019, "reward": 2.375, "reward_std": 0.7058513760566711, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 830 }, { "completion_length": 181.20833587646484, "epoch": 0.44480128462464874, "grad_norm": 2.0625, "kl": 0.034845305141061544, "learning_rate": 3.3975916985505223e-06, "loss": 0.0014, "reward": 2.520833373069763, "reward_std": 0.929972916841507, "rewards/correctness_reward_func": 1.1666667088866234, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 831 }, { "completion_length": 138.08333778381348, "epoch": 0.44533654489495517, "grad_norm": 1.859375, "kl": 0.04686246067285538, "learning_rate": 3.3932294522860376e-06, "loss": 0.0019, "reward": 3.0937500596046448, "reward_std": 0.58812665194273, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 832 }, { "completion_length": 169.7083396911621, "epoch": 0.4458718051652616, "grad_norm": 1.9375, "kl": 0.03669201582670212, "learning_rate": 3.388864086218155e-06, "loss": 0.0015, "reward": 3.1406250596046448, "reward_std": 0.44707968831062317, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000074505806, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 833 }, { "completion_length": 137.4583396911621, "epoch": 0.446407065435568, "grad_norm": 5.03125, "kl": 0.09618548629805446, "learning_rate": 3.3844956155938915e-06, "loss": 0.0038, "reward": 2.333333373069763, "reward_std": 0.6804374605417252, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 834 }, { "completion_length": 238.25000762939453, "epoch": 0.4469423257058745, "grad_norm": 1.21875, "kl": 0.032248204573988914, "learning_rate": 3.380124055671106e-06, "loss": 0.0013, "reward": 2.666666716337204, "reward_std": 0.502664253115654, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 835 }, { "completion_length": 135.62500381469727, "epoch": 0.4474775859761809, "grad_norm": 1.21875, "kl": 0.03981878375634551, "learning_rate": 3.3757494217184493e-06, "loss": 0.0016, "reward": 3.2291666865348816, "reward_std": 0.30103103443980217, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 836 }, { "completion_length": 186.62500762939453, "epoch": 0.44801284624648735, "grad_norm": 1.7421875, "kl": 0.043279207311570644, "learning_rate": 3.371371729015307e-06, "loss": 0.0017, "reward": 3.0000001192092896, "reward_std": 0.4269207715988159, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 837 }, { "completion_length": 180.4583396911621, "epoch": 0.4485481065167938, "grad_norm": 1.5625, "kl": 0.027423355961218476, "learning_rate": 3.3669909928517476e-06, "loss": 0.0011, "reward": 3.1041667461395264, "reward_std": 0.7092793136835098, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 838 }, { "completion_length": 140.29166984558105, "epoch": 0.4490833667871002, "grad_norm": 1.90625, "kl": 0.033189952839165926, "learning_rate": 3.362607228528473e-06, "loss": 0.0013, "reward": 3.1250001192092896, "reward_std": 0.5809475630521774, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 839 }, { "completion_length": 161.12500190734863, "epoch": 0.4496186270574067, "grad_norm": 1.4765625, "kl": 0.03806134918704629, "learning_rate": 3.358220451356758e-06, "loss": 0.0015, "reward": 2.973958373069763, "reward_std": 0.2913762256503105, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 840 }, { "completion_length": 226.7500057220459, "epoch": 0.4501538873277131, "grad_norm": 1.0546875, "kl": 0.025143309962004423, "learning_rate": 3.3538306766584015e-06, "loss": 0.001, "reward": 2.833333373069763, "reward_std": 0.48273734748363495, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 841 }, { "completion_length": 165.00000381469727, "epoch": 0.45068914759801953, "grad_norm": 1.234375, "kl": 0.02779693342745304, "learning_rate": 3.349437919765673e-06, "loss": 0.0011, "reward": 2.895833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 842 }, { "completion_length": 141.9583396911621, "epoch": 0.45122440786832596, "grad_norm": 1.640625, "kl": 0.04394981171935797, "learning_rate": 3.345042196021257e-06, "loss": 0.0018, "reward": 3.1041667461395264, "reward_std": 0.5791352987289429, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 843 }, { "completion_length": 120.33333778381348, "epoch": 0.4517596681386324, "grad_norm": 1.0, "kl": 0.03951650392264128, "learning_rate": 3.340643520778201e-06, "loss": 0.0016, "reward": 2.8125, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 844 }, { "completion_length": 150.5416717529297, "epoch": 0.45229492840893887, "grad_norm": 1.609375, "kl": 0.03706312831491232, "learning_rate": 3.336241909399861e-06, "loss": 0.0015, "reward": 3.0000000596046448, "reward_std": 0.712575301527977, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 845 }, { "completion_length": 157.2500057220459, "epoch": 0.4528301886792453, "grad_norm": 1.1328125, "kl": 0.03455189196392894, "learning_rate": 3.331837377259847e-06, "loss": 0.0014, "reward": 2.9375000596046448, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 846 }, { "completion_length": 141.83333587646484, "epoch": 0.4533654489495517, "grad_norm": 1.4140625, "kl": 0.03348656743764877, "learning_rate": 3.327429939741971e-06, "loss": 0.0013, "reward": 2.770833373069763, "reward_std": 0.5251599848270416, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 847 }, { "completion_length": 200.54167366027832, "epoch": 0.45390070921985815, "grad_norm": 1.15625, "kl": 0.02685615699738264, "learning_rate": 3.3230196122401946e-06, "loss": 0.0011, "reward": 2.833333373069763, "reward_std": 0.5149645358324051, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 848 }, { "completion_length": 131.9583396911621, "epoch": 0.45443596949016457, "grad_norm": 1.65625, "kl": 0.025201458483934402, "learning_rate": 3.318606410158572e-06, "loss": 0.001, "reward": 3.0625000596046448, "reward_std": 0.6782456785440445, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 849 }, { "completion_length": 142.37500381469727, "epoch": 0.45497122976047105, "grad_norm": 0.80859375, "kl": 0.030992007814347744, "learning_rate": 3.3141903489111966e-06, "loss": 0.0012, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 850 }, { "completion_length": 181.41667366027832, "epoch": 0.4555064900307775, "grad_norm": 1.6328125, "kl": 0.03103213245049119, "learning_rate": 3.3097714439221477e-06, "loss": 0.0012, "reward": 3.1250001192092896, "reward_std": 0.6587194204330444, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 851 }, { "completion_length": 153.6250057220459, "epoch": 0.4560417503010839, "grad_norm": 1.1015625, "kl": 0.036851195618510246, "learning_rate": 3.3053497106254394e-06, "loss": 0.0015, "reward": 2.895833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 852 }, { "completion_length": 125.70833587646484, "epoch": 0.45657701057139033, "grad_norm": 1.078125, "kl": 0.022068005986511707, "learning_rate": 3.3009251644649637e-06, "loss": 0.0009, "reward": 2.9375, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 853 }, { "completion_length": 145.7083396911621, "epoch": 0.45711227084169676, "grad_norm": 1.921875, "kl": 0.029460490681231022, "learning_rate": 3.296497820894435e-06, "loss": 0.0012, "reward": 3.145833432674408, "reward_std": 0.6625833064317703, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 854 }, { "completion_length": 179.12500381469727, "epoch": 0.45764753111200324, "grad_norm": 1.90625, "kl": 0.03744522435590625, "learning_rate": 3.29206769537734e-06, "loss": 0.0015, "reward": 2.6875000298023224, "reward_std": 0.6332302503287792, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 855 }, { "completion_length": 143.7916717529297, "epoch": 0.45818279138230966, "grad_norm": 0.8125, "kl": 0.029497163370251656, "learning_rate": 3.287634803386882e-06, "loss": 0.0012, "reward": 3.0625, "reward_std": 0.22008520364761353, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 856 }, { "completion_length": 122.04166984558105, "epoch": 0.4587180516526161, "grad_norm": 1.5078125, "kl": 0.06524349935352802, "learning_rate": 3.283199160405926e-06, "loss": 0.0026, "reward": 2.854166716337204, "reward_std": 0.2648099809885025, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 857 }, { "completion_length": 160.7083396911621, "epoch": 0.4592533119229225, "grad_norm": 1.125, "kl": 0.03336925012990832, "learning_rate": 3.2787607819269473e-06, "loss": 0.0013, "reward": 3.270833373069763, "reward_std": 0.44672295451164246, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 858 }, { "completion_length": 140.87500190734863, "epoch": 0.45978857219322894, "grad_norm": 3.0, "kl": 0.0893707680515945, "learning_rate": 3.274319683451973e-06, "loss": 0.0036, "reward": 2.708333432674408, "reward_std": 0.6814524829387665, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 859 }, { "completion_length": 136.37500381469727, "epoch": 0.4603238324635354, "grad_norm": 1.5390625, "kl": 0.031437342055141926, "learning_rate": 3.269875880492532e-06, "loss": 0.0013, "reward": 3.2291667461395264, "reward_std": 0.5133540891110897, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 860 }, { "completion_length": 188.66667366027832, "epoch": 0.46085909273384185, "grad_norm": 1.0078125, "kl": 0.031875348184257746, "learning_rate": 3.2654293885695983e-06, "loss": 0.0013, "reward": 3.0468750596046448, "reward_std": 0.45117713510990143, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666679084301, "rewards/xmlcount_reward_func": 0.484375, "step": 861 }, { "completion_length": 163.8333396911621, "epoch": 0.46139435300414827, "grad_norm": 1.9453125, "kl": 0.05007671285420656, "learning_rate": 3.260980223213539e-06, "loss": 0.002, "reward": 2.8541667461395264, "reward_std": 0.7710062265396118, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 862 }, { "completion_length": 200.66667556762695, "epoch": 0.4619296132744547, "grad_norm": 2.078125, "kl": 0.03132034745067358, "learning_rate": 3.256528399964057e-06, "loss": 0.0013, "reward": 2.479166716337204, "reward_std": 0.9049717783927917, "rewards/correctness_reward_func": 1.0833333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 863 }, { "completion_length": 115.5000057220459, "epoch": 0.4624648735447611, "grad_norm": 1.9921875, "kl": 0.030180228408426046, "learning_rate": 3.252073934370142e-06, "loss": 0.0012, "reward": 3.2187500596046448, "reward_std": 0.3946995995938778, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 864 }, { "completion_length": 153.6666717529297, "epoch": 0.46300013381506755, "grad_norm": 2.3125, "kl": 0.029024141374975443, "learning_rate": 3.2476168419900066e-06, "loss": 0.0012, "reward": 3.0260416865348816, "reward_std": 0.7233164459466934, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.484375, "step": 865 }, { "completion_length": 142.66666984558105, "epoch": 0.46353539408537403, "grad_norm": 1.6171875, "kl": 0.03154592076316476, "learning_rate": 3.2431571383910445e-06, "loss": 0.0013, "reward": 3.020833373069763, "reward_std": 0.5285752415657043, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 866 }, { "completion_length": 164.1666717529297, "epoch": 0.46407065435568046, "grad_norm": 2.078125, "kl": 0.029656716156750917, "learning_rate": 3.238694839149764e-06, "loss": 0.0012, "reward": 3.208333432674408, "reward_std": 0.5643851272761822, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 867 }, { "completion_length": 142.33333778381348, "epoch": 0.4646059146259869, "grad_norm": 1.7578125, "kl": 0.03601653641089797, "learning_rate": 3.2342299598517444e-06, "loss": 0.0014, "reward": 2.625000089406967, "reward_std": 0.7309969216585159, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 868 }, { "completion_length": 169.3750057220459, "epoch": 0.4651411748962933, "grad_norm": 1.6484375, "kl": 0.02826155023649335, "learning_rate": 3.2297625160915735e-06, "loss": 0.0011, "reward": 3.0416667461395264, "reward_std": 0.6094035319983959, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 869 }, { "completion_length": 201.37500762939453, "epoch": 0.46567643516659973, "grad_norm": 1.6796875, "kl": 0.029593814630061388, "learning_rate": 3.2252925234727955e-06, "loss": 0.0012, "reward": 3.020833373069763, "reward_std": 0.7951613962650299, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 870 }, { "completion_length": 166.5000057220459, "epoch": 0.4662116954369062, "grad_norm": 1.21875, "kl": 0.03089164919219911, "learning_rate": 3.22081999760786e-06, "loss": 0.0012, "reward": 2.7708334028720856, "reward_std": 0.37377968057990074, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 871 }, { "completion_length": 167.00000762939453, "epoch": 0.46674695570721264, "grad_norm": 2.296875, "kl": 0.034221252892166376, "learning_rate": 3.216344954118061e-06, "loss": 0.0014, "reward": 2.8125000596046448, "reward_std": 0.7076172083616257, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 872 }, { "completion_length": 155.00000381469727, "epoch": 0.46728221597751907, "grad_norm": 1.6796875, "kl": 0.03355083800852299, "learning_rate": 3.211867408633488e-06, "loss": 0.0013, "reward": 2.7500000596046448, "reward_std": 0.7845312654972076, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 873 }, { "completion_length": 128.00000381469727, "epoch": 0.4678174762478255, "grad_norm": 1.296875, "kl": 0.05736191477626562, "learning_rate": 3.2073873767929693e-06, "loss": 0.0023, "reward": 3.458333373069763, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 874 }, { "completion_length": 215.1666717529297, "epoch": 0.4683527365181319, "grad_norm": 1.2578125, "kl": 0.02975275507196784, "learning_rate": 3.2029048742440166e-06, "loss": 0.0012, "reward": 2.6718750596046448, "reward_std": 0.9017057120800018, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 875 }, { "completion_length": 140.8333396911621, "epoch": 0.4688879967884384, "grad_norm": 1.796875, "kl": 0.031159482430666685, "learning_rate": 3.198419916642771e-06, "loss": 0.0012, "reward": 2.505208373069763, "reward_std": 0.7537505924701691, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 876 }, { "completion_length": 201.08333587646484, "epoch": 0.4694232570587448, "grad_norm": 1.5390625, "kl": 0.03459092229604721, "learning_rate": 3.1939325196539496e-06, "loss": 0.0014, "reward": 3.1666667461395264, "reward_std": 0.4727980047464371, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 877 }, { "completion_length": 154.3333396911621, "epoch": 0.46995851732905125, "grad_norm": 1.2734375, "kl": 0.03349528927356005, "learning_rate": 3.1894426989507877e-06, "loss": 0.0013, "reward": 3.2291666865348816, "reward_std": 0.35770072042942047, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 878 }, { "completion_length": 181.75, "epoch": 0.4704937775993577, "grad_norm": 1.7421875, "kl": 0.03606862062588334, "learning_rate": 3.1849504702149885e-06, "loss": 0.0014, "reward": 2.5416666865348816, "reward_std": 0.5717475526034832, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 879 }, { "completion_length": 147.33333778381348, "epoch": 0.4710290378696641, "grad_norm": 1.859375, "kl": 0.03201776463538408, "learning_rate": 3.180455849136664e-06, "loss": 0.0013, "reward": 2.9375000596046448, "reward_std": 0.9778521060943604, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 880 }, { "completion_length": 140.6666717529297, "epoch": 0.4715642981399706, "grad_norm": 1.3515625, "kl": 0.06457794364541769, "learning_rate": 3.175958851414281e-06, "loss": 0.0026, "reward": 3.2916667461395264, "reward_std": 0.31584101915359497, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 881 }, { "completion_length": 199.25000762939453, "epoch": 0.472099558410277, "grad_norm": 1.3203125, "kl": 0.041178013663738966, "learning_rate": 3.1714594927546094e-06, "loss": 0.0016, "reward": 2.588541716337204, "reward_std": 0.4356187731027603, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.29166666977107525, "rewards/xmlcount_reward_func": 0.484375, "step": 882 }, { "completion_length": 119.04166793823242, "epoch": 0.47263481868058344, "grad_norm": 0.8359375, "kl": 0.03467118879780173, "learning_rate": 3.1669577888726655e-06, "loss": 0.0014, "reward": 3.3125, "reward_std": 0.29315099120140076, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 883 }, { "completion_length": 109.58333587646484, "epoch": 0.47317007895088986, "grad_norm": 1.78125, "kl": 0.0753163555637002, "learning_rate": 3.162453755491655e-06, "loss": 0.003, "reward": 3.333333373069763, "reward_std": 0.29362983629107475, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 884 }, { "completion_length": 131.41667366027832, "epoch": 0.4737053392211963, "grad_norm": 1.7265625, "kl": 0.03353723953478038, "learning_rate": 3.1579474083429195e-06, "loss": 0.0013, "reward": 2.791666716337204, "reward_std": 0.7123230695724487, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 885 }, { "completion_length": 155.62500381469727, "epoch": 0.47424059949150277, "grad_norm": 1.3046875, "kl": 0.02384982886724174, "learning_rate": 3.153438763165884e-06, "loss": 0.001, "reward": 3.2291666865348816, "reward_std": 0.3248923234641552, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 886 }, { "completion_length": 155.2916717529297, "epoch": 0.4747758597618092, "grad_norm": 1.21875, "kl": 0.031146604102104902, "learning_rate": 3.1489278357079996e-06, "loss": 0.0012, "reward": 3.3125000596046448, "reward_std": 0.3071485310792923, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 887 }, { "completion_length": 145.25000190734863, "epoch": 0.4753111200321156, "grad_norm": 1.375, "kl": 0.07489143451675773, "learning_rate": 3.1444146417246875e-06, "loss": 0.003, "reward": 3.3125000596046448, "reward_std": 0.3071485310792923, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 888 }, { "completion_length": 202.62500762939453, "epoch": 0.47584638030242205, "grad_norm": 1.7421875, "kl": 0.021212580613791943, "learning_rate": 3.139899196979286e-06, "loss": 0.0008, "reward": 2.833333432674408, "reward_std": 0.9302727431058884, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 889 }, { "completion_length": 170.7916717529297, "epoch": 0.47638164057272847, "grad_norm": 1.5703125, "kl": 0.023964946623891592, "learning_rate": 3.1353815172429937e-06, "loss": 0.001, "reward": 3.083333373069763, "reward_std": 0.3776952736079693, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 890 }, { "completion_length": 131.9583339691162, "epoch": 0.47691690084303495, "grad_norm": 1.84375, "kl": 0.03578362660482526, "learning_rate": 3.130861618294817e-06, "loss": 0.0014, "reward": 3.270833373069763, "reward_std": 0.5613414198160172, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 891 }, { "completion_length": 169.7083339691162, "epoch": 0.4774521611133414, "grad_norm": 1.5546875, "kl": 0.027614878490567207, "learning_rate": 3.1263395159215125e-06, "loss": 0.0011, "reward": 2.9791666865348816, "reward_std": 0.4242093414068222, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 892 }, { "completion_length": 156.04167556762695, "epoch": 0.4779874213836478, "grad_norm": 1.21875, "kl": 0.03386624017730355, "learning_rate": 3.121815225917534e-06, "loss": 0.0014, "reward": 3.2500000596046448, "reward_std": 0.3181530348956585, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 893 }, { "completion_length": 134.5416717529297, "epoch": 0.47852268165395423, "grad_norm": 1.84375, "kl": 0.07315583759918809, "learning_rate": 3.1172887640849736e-06, "loss": 0.0029, "reward": 3.2291667461395264, "reward_std": 0.45845915377140045, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 894 }, { "completion_length": 138.25000381469727, "epoch": 0.47905794192426066, "grad_norm": 0.92578125, "kl": 0.01682589342817664, "learning_rate": 3.1127601462335106e-06, "loss": 0.0007, "reward": 2.833333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 895 }, { "completion_length": 148.54167366027832, "epoch": 0.4795932021945671, "grad_norm": 0.59375, "kl": 0.02296333061531186, "learning_rate": 3.108229388180355e-06, "loss": 0.0009, "reward": 3.458333373069763, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 896 }, { "completion_length": 151.12500381469727, "epoch": 0.48012846246487356, "grad_norm": 1.9765625, "kl": 0.03428164287470281, "learning_rate": 3.103696505750191e-06, "loss": 0.0014, "reward": 2.958333432674408, "reward_std": 0.94781294465065, "rewards/correctness_reward_func": 1.5833334028720856, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 897 }, { "completion_length": 121.00000190734863, "epoch": 0.48066372273518, "grad_norm": 1.421875, "kl": 0.026538813253864646, "learning_rate": 3.099161514775123e-06, "loss": 0.0011, "reward": 2.8125000596046448, "reward_std": 0.3092299550771713, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 898 }, { "completion_length": 182.9583396911621, "epoch": 0.4811989830054864, "grad_norm": 2.359375, "kl": 0.07469095708802342, "learning_rate": 3.094624431094621e-06, "loss": 0.003, "reward": 3.1250001192092896, "reward_std": 0.612732045352459, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666865348816, "rewards/xmlcount_reward_func": 0.5, "step": 899 }, { "completion_length": 153.87500762939453, "epoch": 0.48173424327579284, "grad_norm": 3.84375, "kl": 0.13914787722751498, "learning_rate": 3.0900852705554618e-06, "loss": 0.0056, "reward": 2.7916667461395264, "reward_std": 1.1055711507797241, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333507180214, "rewards/xmlcount_reward_func": 0.5, "step": 900 }, { "completion_length": 150.1666717529297, "epoch": 0.48226950354609927, "grad_norm": 1.4921875, "kl": 0.04241298232227564, "learning_rate": 3.085544049011679e-06, "loss": 0.0017, "reward": 3.2916666865348816, "reward_std": 0.4541241526603699, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 901 }, { "completion_length": 187.75000762939453, "epoch": 0.48280476381640575, "grad_norm": 1.765625, "kl": 0.03613791987299919, "learning_rate": 3.0810007823245016e-06, "loss": 0.0014, "reward": 2.3125000596046448, "reward_std": 1.1677038073539734, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333507180214, "rewards/xmlcount_reward_func": 0.5, "step": 902 }, { "completion_length": 177.7083396911621, "epoch": 0.48334002408671217, "grad_norm": 1.28125, "kl": 0.03979034349322319, "learning_rate": 3.0764554863623054e-06, "loss": 0.0016, "reward": 2.8541667461395264, "reward_std": 0.6266467720270157, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666753590107, "rewards/xmlcount_reward_func": 0.5, "step": 903 }, { "completion_length": 135.87500381469727, "epoch": 0.4838752843570186, "grad_norm": 0.498046875, "kl": 0.04881257377564907, "learning_rate": 3.07190817700055e-06, "loss": 0.002, "reward": 3.458333373069763, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 904 }, { "completion_length": 175.87500762939453, "epoch": 0.484410544627325, "grad_norm": 2.03125, "kl": 0.030201736837625504, "learning_rate": 3.0673588701217306e-06, "loss": 0.0012, "reward": 2.645833373069763, "reward_std": 0.6988043785095215, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 905 }, { "completion_length": 161.45833587646484, "epoch": 0.48494580489763145, "grad_norm": 1.4765625, "kl": 0.03659933200106025, "learning_rate": 3.062807581615317e-06, "loss": 0.0015, "reward": 2.723958432674408, "reward_std": 0.7829360365867615, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 906 }, { "completion_length": 156.5833396911621, "epoch": 0.48548106516793793, "grad_norm": 1.3125, "kl": 0.022064207587391138, "learning_rate": 3.058254327377701e-06, "loss": 0.0009, "reward": 3.0416667461395264, "reward_std": 0.5643851570785046, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 907 }, { "completion_length": 156.20833778381348, "epoch": 0.48601632543824436, "grad_norm": 0.66796875, "kl": 0.029581542825326324, "learning_rate": 3.053699123312141e-06, "loss": 0.0012, "reward": 2.9791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 908 }, { "completion_length": 198.3750057220459, "epoch": 0.4865515857085508, "grad_norm": 2.296875, "kl": 0.03689955791924149, "learning_rate": 3.0491419853287037e-06, "loss": 0.0015, "reward": 2.2916667070239782, "reward_std": 0.7524303048849106, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.375, "step": 909 }, { "completion_length": 142.6666717529297, "epoch": 0.4870868459788572, "grad_norm": 1.578125, "kl": 0.03775651101022959, "learning_rate": 3.044582929344212e-06, "loss": 0.0015, "reward": 2.8541667461395264, "reward_std": 0.5779038518667221, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 910 }, { "completion_length": 178.7083396911621, "epoch": 0.48762210624916363, "grad_norm": 1.265625, "kl": 0.034308540634810925, "learning_rate": 3.0400219712821864e-06, "loss": 0.0014, "reward": 3.145833432674408, "reward_std": 0.5230088979005814, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 911 }, { "completion_length": 150.12500381469727, "epoch": 0.4881573665194701, "grad_norm": 1.8671875, "kl": 0.028787806164473295, "learning_rate": 3.0354591270727936e-06, "loss": 0.0012, "reward": 2.583333343267441, "reward_std": 0.8096110820770264, "rewards/correctness_reward_func": 1.166666679084301, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 912 }, { "completion_length": 135.25000381469727, "epoch": 0.48869262678977654, "grad_norm": 2.046875, "kl": 0.027838943991810083, "learning_rate": 3.030894412652785e-06, "loss": 0.0011, "reward": 3.1666667461395264, "reward_std": 0.6664472222328186, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 913 }, { "completion_length": 181.5416717529297, "epoch": 0.48922788706008297, "grad_norm": 1.4765625, "kl": 0.036700944416224957, "learning_rate": 3.0263278439654465e-06, "loss": 0.0015, "reward": 2.8958334028720856, "reward_std": 0.541967298835516, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 914 }, { "completion_length": 141.37500190734863, "epoch": 0.4897631473303894, "grad_norm": 1.6015625, "kl": 0.017945259111002088, "learning_rate": 3.0217594369605373e-06, "loss": 0.0007, "reward": 3.041666716337204, "reward_std": 0.306186206638813, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 915 }, { "completion_length": 119.29167175292969, "epoch": 0.4902984076006958, "grad_norm": 0.9609375, "kl": 0.025332989636808634, "learning_rate": 3.0171892075942415e-06, "loss": 0.001, "reward": 3.333333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 916 }, { "completion_length": 210.2083396911621, "epoch": 0.4908336678710023, "grad_norm": 1.1640625, "kl": 0.022253695176914334, "learning_rate": 3.0126171718291045e-06, "loss": 0.0009, "reward": 2.6875000596046448, "reward_std": 0.6221463531255722, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 917 }, { "completion_length": 135.37500381469727, "epoch": 0.4913689281413087, "grad_norm": 1.46875, "kl": 0.023740992648527026, "learning_rate": 3.008043345633984e-06, "loss": 0.0009, "reward": 3.395833373069763, "reward_std": 0.25515517592430115, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 918 }, { "completion_length": 161.4166717529297, "epoch": 0.49190418841161515, "grad_norm": 1.5, "kl": 0.03153214603662491, "learning_rate": 3.0034677449839893e-06, "loss": 0.0013, "reward": 2.5625000596046448, "reward_std": 0.6113040260970592, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 919 }, { "completion_length": 194.62500381469727, "epoch": 0.4924394486819216, "grad_norm": 1.8515625, "kl": 0.02594477077946067, "learning_rate": 2.9988903858604275e-06, "loss": 0.001, "reward": 2.2343750298023224, "reward_std": 0.7281012237071991, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 920 }, { "completion_length": 183.25000381469727, "epoch": 0.492974708952228, "grad_norm": 6.125, "kl": 0.20129929389804602, "learning_rate": 2.9943112842507473e-06, "loss": 0.0081, "reward": 3.2916667461395264, "reward_std": 0.35817956551909447, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 921 }, { "completion_length": 167.5833396911621, "epoch": 0.4935099692225345, "grad_norm": 1.7421875, "kl": 0.021955529926344752, "learning_rate": 2.989730456148484e-06, "loss": 0.0009, "reward": 2.6875000596046448, "reward_std": 0.7304233312606812, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 922 }, { "completion_length": 174.16667556762695, "epoch": 0.4940452294928409, "grad_norm": 1.0703125, "kl": 0.03281415533274412, "learning_rate": 2.985147917553205e-06, "loss": 0.0013, "reward": 2.8593750298023224, "reward_std": 0.44327686727046967, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.484375, "step": 923 }, { "completion_length": 158.45833778381348, "epoch": 0.49458048976314734, "grad_norm": 2.046875, "kl": 0.025178374722599983, "learning_rate": 2.980563684470448e-06, "loss": 0.001, "reward": 2.770833373069763, "reward_std": 0.8783334791660309, "rewards/correctness_reward_func": 1.333333395421505, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 924 }, { "completion_length": 159.62500381469727, "epoch": 0.49511575003345376, "grad_norm": 1.3515625, "kl": 0.04505129624158144, "learning_rate": 2.975977772911671e-06, "loss": 0.0018, "reward": 2.9010416865348816, "reward_std": 0.1904354840517044, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 925 }, { "completion_length": 162.8333396911621, "epoch": 0.4956510103037602, "grad_norm": 1.6171875, "kl": 0.0264615248888731, "learning_rate": 2.9713901988941956e-06, "loss": 0.0011, "reward": 3.1875000596046448, "reward_std": 0.4816259741783142, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 926 }, { "completion_length": 146.7083396911621, "epoch": 0.4961862705740666, "grad_norm": 2.15625, "kl": 0.03129548905417323, "learning_rate": 2.9668009784411497e-06, "loss": 0.0013, "reward": 3.0416667461395264, "reward_std": 0.7602093182504177, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 927 }, { "completion_length": 191.08334350585938, "epoch": 0.4967215308443731, "grad_norm": 2.21875, "kl": 0.03684541070833802, "learning_rate": 2.9622101275814087e-06, "loss": 0.0015, "reward": 2.421875089406967, "reward_std": 0.6723825596272945, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 928 }, { "completion_length": 172.9583339691162, "epoch": 0.4972567911146795, "grad_norm": 5.625, "kl": 0.17009031027555466, "learning_rate": 2.9576176623495457e-06, "loss": 0.0068, "reward": 3.1041667461395264, "reward_std": 0.8021577149629593, "rewards/correctness_reward_func": 1.6666667461395264, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 929 }, { "completion_length": 155.3333396911621, "epoch": 0.49779205138498595, "grad_norm": 2.390625, "kl": 0.08845770079642534, "learning_rate": 2.9530235987857715e-06, "loss": 0.0035, "reward": 3.270833432674408, "reward_std": 0.37377968057990074, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 930 }, { "completion_length": 166.95833587646484, "epoch": 0.49832731165529237, "grad_norm": 1.65625, "kl": 0.028497768100351095, "learning_rate": 2.948427952935879e-06, "loss": 0.0011, "reward": 2.458333373069763, "reward_std": 0.600963905453682, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 931 }, { "completion_length": 166.4166717529297, "epoch": 0.4988625719255988, "grad_norm": 1.71875, "kl": 0.04362269816920161, "learning_rate": 2.943830740851189e-06, "loss": 0.0017, "reward": 2.7500000596046448, "reward_std": 0.6913313567638397, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 932 }, { "completion_length": 162.5416717529297, "epoch": 0.4993978321959053, "grad_norm": 1.9140625, "kl": 0.03401909116655588, "learning_rate": 2.939231978588491e-06, "loss": 0.0014, "reward": 3.1041667461395264, "reward_std": 0.7571656107902527, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 933 }, { "completion_length": 145.75, "epoch": 0.4999330924662117, "grad_norm": 0.9921875, "kl": 0.0415206546895206, "learning_rate": 2.934631682209989e-06, "loss": 0.0017, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 934 }, { "completion_length": 175.70833587646484, "epoch": 0.5004683527365181, "grad_norm": 0.67578125, "kl": 0.01767243049107492, "learning_rate": 2.930029867783246e-06, "loss": 0.0007, "reward": 3.125, "reward_std": 0.25, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 935 }, { "completion_length": 116.75000381469727, "epoch": 0.5010036130068246, "grad_norm": 1.296875, "kl": 0.029608782147988677, "learning_rate": 2.9254265513811274e-06, "loss": 0.0012, "reward": 3.333333373069763, "reward_std": 0.40824829041957855, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 936 }, { "completion_length": 165.5833396911621, "epoch": 0.501538873277131, "grad_norm": 1.7421875, "kl": 0.049315739423036575, "learning_rate": 2.920821749081744e-06, "loss": 0.002, "reward": 2.895833432674408, "reward_std": 0.473104827105999, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333507180214, "rewards/xmlcount_reward_func": 0.5, "step": 937 }, { "completion_length": 122.04166793823242, "epoch": 0.5020741335474375, "grad_norm": 1.6484375, "kl": 0.05571300070732832, "learning_rate": 2.9162154769683958e-06, "loss": 0.0022, "reward": 3.3750000596046448, "reward_std": 0.25129128620028496, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 938 }, { "completion_length": 126.45833778381348, "epoch": 0.5026093938177438, "grad_norm": 1.9140625, "kl": 0.034338406287133694, "learning_rate": 2.911607751129517e-06, "loss": 0.0014, "reward": 3.333333373069763, "reward_std": 0.40824829041957855, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 939 }, { "completion_length": 147.54166984558105, "epoch": 0.5031446540880503, "grad_norm": 1.765625, "kl": 0.0225078200455755, "learning_rate": 2.9069985876586206e-06, "loss": 0.0009, "reward": 2.6875000596046448, "reward_std": 0.566905565559864, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 940 }, { "completion_length": 143.0416717529297, "epoch": 0.5036799143583568, "grad_norm": 1.6953125, "kl": 0.041789953131228685, "learning_rate": 2.9023880026542383e-06, "loss": 0.0017, "reward": 3.0416667461395264, "reward_std": 0.5643851235508919, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 941 }, { "completion_length": 149.6666717529297, "epoch": 0.5042151746286632, "grad_norm": 2.171875, "kl": 0.026956678368151188, "learning_rate": 2.8977760122198697e-06, "loss": 0.0011, "reward": 2.625000089406967, "reward_std": 0.5094902031123638, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 942 }, { "completion_length": 119.25000381469727, "epoch": 0.5047504348989696, "grad_norm": 1.546875, "kl": 0.04554880363866687, "learning_rate": 2.89316263246392e-06, "loss": 0.0018, "reward": 3.1250000596046448, "reward_std": 0.555328756570816, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 943 }, { "completion_length": 123.87500381469727, "epoch": 0.505285695169276, "grad_norm": 1.5234375, "kl": 0.02432346437126398, "learning_rate": 2.8885478794996496e-06, "loss": 0.001, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 944 }, { "completion_length": 149.4166717529297, "epoch": 0.5058209554395825, "grad_norm": 0.091796875, "kl": 0.02958611771464348, "learning_rate": 2.883931769445114e-06, "loss": 0.0012, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 945 }, { "completion_length": 137.6250057220459, "epoch": 0.506356215709889, "grad_norm": 2.015625, "kl": 0.03613093541935086, "learning_rate": 2.879314318423108e-06, "loss": 0.0014, "reward": 2.5208334028720856, "reward_std": 0.6476409733295441, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 946 }, { "completion_length": 128.70833778381348, "epoch": 0.5068914759801953, "grad_norm": 1.421875, "kl": 0.024170507676899433, "learning_rate": 2.8746955425611122e-06, "loss": 0.001, "reward": 3.1666666865348816, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 947 }, { "completion_length": 138.3333396911621, "epoch": 0.5074267362505018, "grad_norm": 1.5234375, "kl": 0.0231890631839633, "learning_rate": 2.8700754579912315e-06, "loss": 0.0009, "reward": 3.3125000596046448, "reward_std": 0.4592793434858322, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 948 }, { "completion_length": 177.25000762939453, "epoch": 0.5079619965208082, "grad_norm": 1.546875, "kl": 0.027233313769102097, "learning_rate": 2.8654540808501447e-06, "loss": 0.0011, "reward": 2.536458373069763, "reward_std": 0.40045326575636864, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 949 }, { "completion_length": 170.1666717529297, "epoch": 0.5084972567911147, "grad_norm": 3.359375, "kl": 0.07531993184238672, "learning_rate": 2.8608314272790427e-06, "loss": 0.003, "reward": 2.770833432674408, "reward_std": 1.1262514144182205, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 950 }, { "completion_length": 123.58333778381348, "epoch": 0.5090325170614212, "grad_norm": 1.625, "kl": 0.03198196832090616, "learning_rate": 2.8562075134235757e-06, "loss": 0.0013, "reward": 3.2291667461395264, "reward_std": 0.5133541226387024, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 951 }, { "completion_length": 151.91666984558105, "epoch": 0.5095677773317275, "grad_norm": 1.03125, "kl": 0.03794420650228858, "learning_rate": 2.8515823554337973e-06, "loss": 0.0015, "reward": 3.1875000596046448, "reward_std": 0.3647233098745346, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 952 }, { "completion_length": 144.1666717529297, "epoch": 0.510103037602034, "grad_norm": 3.234375, "kl": 0.10791193041950464, "learning_rate": 2.846955969464103e-06, "loss": 0.0043, "reward": 3.2500000596046448, "reward_std": 0.3347994238138199, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 953 }, { "completion_length": 181.12500381469727, "epoch": 0.5106382978723404, "grad_norm": 1.5390625, "kl": 0.027862816117703915, "learning_rate": 2.8423283716731807e-06, "loss": 0.0011, "reward": 2.8750000596046448, "reward_std": 0.6469470970332623, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 954 }, { "completion_length": 138.75000381469727, "epoch": 0.5111735581426469, "grad_norm": 1.5703125, "kl": 0.04192983591929078, "learning_rate": 2.8376995782239486e-06, "loss": 0.0017, "reward": 3.0416666865348816, "reward_std": 0.5552270114421844, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 955 }, { "completion_length": 143.0416717529297, "epoch": 0.5117088184129533, "grad_norm": 2.078125, "kl": 0.05718976445496082, "learning_rate": 2.8330696052835017e-06, "loss": 0.0023, "reward": 3.145833373069763, "reward_std": 0.5989172980189323, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666865348816, "rewards/xmlcount_reward_func": 0.5, "step": 956 }, { "completion_length": 156.25000381469727, "epoch": 0.5122440786832597, "grad_norm": 1.859375, "kl": 0.033939515706151724, "learning_rate": 2.828438469023056e-06, "loss": 0.0014, "reward": 2.9375000596046448, "reward_std": 0.8021840006113052, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 957 }, { "completion_length": 138.04166793823242, "epoch": 0.5127793389535662, "grad_norm": 1.734375, "kl": 0.02495129080489278, "learning_rate": 2.8238061856178888e-06, "loss": 0.001, "reward": 3.2500000596046448, "reward_std": 0.46232306957244873, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 958 }, { "completion_length": 158.12500762939453, "epoch": 0.5133145992238726, "grad_norm": 1.9921875, "kl": 0.03997566644102335, "learning_rate": 2.8191727712472837e-06, "loss": 0.0016, "reward": 2.833333373069763, "reward_std": 0.7882219962775707, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 959 }, { "completion_length": 170.9583339691162, "epoch": 0.513849859494179, "grad_norm": 1.984375, "kl": 0.023815520806238055, "learning_rate": 2.8145382420944767e-06, "loss": 0.001, "reward": 1.8333333730697632, "reward_std": 0.7716152630746365, "rewards/correctness_reward_func": 0.4166666716337204, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 960 }, { "completion_length": 151.45833587646484, "epoch": 0.5143851197644855, "grad_norm": 1.09375, "kl": 0.030205977149307728, "learning_rate": 2.8099026143465952e-06, "loss": 0.0012, "reward": 3.1250000596046448, "reward_std": 0.2803870290517807, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 961 }, { "completion_length": 146.4583396911621, "epoch": 0.5149203800347919, "grad_norm": 1.5625, "kl": 0.02901528449729085, "learning_rate": 2.8052659041946063e-06, "loss": 0.0012, "reward": 2.7291667461395264, "reward_std": 0.45845916867256165, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 962 }, { "completion_length": 149.00000381469727, "epoch": 0.5154556403050984, "grad_norm": 1.6640625, "kl": 0.045791531912982464, "learning_rate": 2.8006281278332542e-06, "loss": 0.0018, "reward": 2.8541667461395264, "reward_std": 0.6371217519044876, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 963 }, { "completion_length": 131.41666793823242, "epoch": 0.5159909005754048, "grad_norm": 0.796875, "kl": 0.024918334558606148, "learning_rate": 2.795989301461009e-06, "loss": 0.001, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 964 }, { "completion_length": 178.00000381469727, "epoch": 0.5165261608457112, "grad_norm": 1.296875, "kl": 0.03127570729702711, "learning_rate": 2.7913494412800087e-06, "loss": 0.0013, "reward": 2.9635417461395264, "reward_std": 0.5725657343864441, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 965 }, { "completion_length": 165.16667366027832, "epoch": 0.5170614211160177, "grad_norm": 1.390625, "kl": 0.02652121242135763, "learning_rate": 2.786708563496002e-06, "loss": 0.0011, "reward": 3.1875000596046448, "reward_std": 0.5145338624715805, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 966 }, { "completion_length": 155.50000762939453, "epoch": 0.5175966813863241, "grad_norm": 1.0625, "kl": 0.028957795351743698, "learning_rate": 2.78206668431829e-06, "loss": 0.0012, "reward": 3.1041666865348816, "reward_std": 0.3266642391681671, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 967 }, { "completion_length": 171.50000381469727, "epoch": 0.5181319416566306, "grad_norm": 1.359375, "kl": 0.02106982236728072, "learning_rate": 2.7774238199596726e-06, "loss": 0.0008, "reward": 2.8125, "reward_std": 0.4269544184207916, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 968 }, { "completion_length": 180.08333587646484, "epoch": 0.5186672019269369, "grad_norm": 1.7109375, "kl": 0.017889021197333932, "learning_rate": 2.772779986636392e-06, "loss": 0.0007, "reward": 2.958333432674408, "reward_std": 0.743688777089119, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 969 }, { "completion_length": 106.16666793823242, "epoch": 0.5192024621972434, "grad_norm": 1.265625, "kl": 0.03550974791869521, "learning_rate": 2.768135200568073e-06, "loss": 0.0014, "reward": 3.395833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 970 }, { "completion_length": 165.04167366027832, "epoch": 0.5197377224675499, "grad_norm": 1.421875, "kl": 0.02399549330584705, "learning_rate": 2.7634894779776676e-06, "loss": 0.001, "reward": 3.1875000596046448, "reward_std": 0.4875549077987671, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 971 }, { "completion_length": 177.45833587646484, "epoch": 0.5202729827378563, "grad_norm": 1.734375, "kl": 0.04331376403570175, "learning_rate": 2.7588428350914014e-06, "loss": 0.0017, "reward": 2.770833373069763, "reward_std": 0.8424348831176758, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 972 }, { "completion_length": 149.16667366027832, "epoch": 0.5208082430081628, "grad_norm": 1.3203125, "kl": 0.05422828788869083, "learning_rate": 2.7541952881387115e-06, "loss": 0.0022, "reward": 3.2916667461395264, "reward_std": 0.3602609895169735, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 973 }, { "completion_length": 116.33333587646484, "epoch": 0.5213435032784691, "grad_norm": 1.6015625, "kl": 0.026882473845034838, "learning_rate": 2.7495468533521935e-06, "loss": 0.0011, "reward": 3.333333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 974 }, { "completion_length": 167.29166984558105, "epoch": 0.5218787635487756, "grad_norm": 1.59375, "kl": 0.0297106949146837, "learning_rate": 2.744897546967545e-06, "loss": 0.0012, "reward": 2.8125000596046448, "reward_std": 0.6229222267866135, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 975 }, { "completion_length": 141.7916717529297, "epoch": 0.522414023819082, "grad_norm": 1.3984375, "kl": 0.03204418160021305, "learning_rate": 2.7402473852235073e-06, "loss": 0.0013, "reward": 3.395833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 976 }, { "completion_length": 235.25000381469727, "epoch": 0.5229492840893885, "grad_norm": 1.078125, "kl": 0.027873071609064937, "learning_rate": 2.735596384361809e-06, "loss": 0.0011, "reward": 2.4166666865348816, "reward_std": 0.7820602059364319, "rewards/correctness_reward_func": 1.0000000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 977 }, { "completion_length": 134.4583396911621, "epoch": 0.5234845443596949, "grad_norm": 1.65625, "kl": 0.03450268576852977, "learning_rate": 2.730944560627109e-06, "loss": 0.0014, "reward": 3.020833432674408, "reward_std": 0.6229222267866135, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 978 }, { "completion_length": 173.29166793823242, "epoch": 0.5240198046300013, "grad_norm": 1.8828125, "kl": 0.040093475952744484, "learning_rate": 2.7262919302669405e-06, "loss": 0.0016, "reward": 2.312500089406967, "reward_std": 0.9042879492044449, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 979 }, { "completion_length": 144.16667366027832, "epoch": 0.5245550649003078, "grad_norm": 1.9609375, "kl": 0.024688265286386013, "learning_rate": 2.721638509531656e-06, "loss": 0.001, "reward": 3.083333432674408, "reward_std": 0.7205219864845276, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 980 }, { "completion_length": 162.4166717529297, "epoch": 0.5250903251706142, "grad_norm": 1.7578125, "kl": 0.04531784076243639, "learning_rate": 2.7169843146743658e-06, "loss": 0.0018, "reward": 2.9791667461395264, "reward_std": 0.5473008155822754, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 981 }, { "completion_length": 114.29166984558105, "epoch": 0.5256255854409206, "grad_norm": 1.8125, "kl": 0.041602776385843754, "learning_rate": 2.7123293619508855e-06, "loss": 0.0017, "reward": 2.7708334028720856, "reward_std": 0.3572172783315182, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 982 }, { "completion_length": 218.9166717529297, "epoch": 0.5261608457112271, "grad_norm": 1.3515625, "kl": 0.04277056595310569, "learning_rate": 2.7076736676196764e-06, "loss": 0.0017, "reward": 2.692708432674408, "reward_std": 0.719441369175911, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.31250000558793545, "rewards/xmlcount_reward_func": 0.484375, "step": 983 }, { "completion_length": 173.4583396911621, "epoch": 0.5266961059815335, "grad_norm": 2.03125, "kl": 0.032342477701604366, "learning_rate": 2.703017247941793e-06, "loss": 0.0013, "reward": 2.9166667461395264, "reward_std": 0.8832631707191467, "rewards/correctness_reward_func": 1.5000000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 984 }, { "completion_length": 142.9583396911621, "epoch": 0.52723136625184, "grad_norm": 0.94921875, "kl": 0.02848183922469616, "learning_rate": 2.6983601191808184e-06, "loss": 0.0011, "reward": 2.8750000596046448, "reward_std": 0.25129128992557526, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 985 }, { "completion_length": 134.70833778381348, "epoch": 0.5277666265221463, "grad_norm": 1.4296875, "kl": 0.024114880245178938, "learning_rate": 2.6937022976028176e-06, "loss": 0.001, "reward": 3.2291667461395264, "reward_std": 0.5133541226387024, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 986 }, { "completion_length": 166.91666793823242, "epoch": 0.5283018867924528, "grad_norm": 1.8359375, "kl": 0.045298111625015736, "learning_rate": 2.6890437994762716e-06, "loss": 0.0018, "reward": 2.395833358168602, "reward_std": 0.6988043487071991, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666753590107, "rewards/xmlcount_reward_func": 0.5, "step": 987 }, { "completion_length": 160.79166984558105, "epoch": 0.5288371470627593, "grad_norm": 1.90625, "kl": 0.029747297056019306, "learning_rate": 2.684384641072026e-06, "loss": 0.0012, "reward": 3.3125000596046448, "reward_std": 0.31970490515232086, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 988 }, { "completion_length": 122.95833396911621, "epoch": 0.5293724073330657, "grad_norm": 3.671875, "kl": 0.04499641829170287, "learning_rate": 2.6797248386632328e-06, "loss": 0.0018, "reward": 3.3854166865348816, "reward_std": 0.280670702457428, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 989 }, { "completion_length": 148.25000381469727, "epoch": 0.5299076676033722, "grad_norm": 2.078125, "kl": 0.09255302604287863, "learning_rate": 2.6750644085252926e-06, "loss": 0.0037, "reward": 3.1666667461395264, "reward_std": 0.4376493915915489, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.5, "step": 990 }, { "completion_length": 188.8750057220459, "epoch": 0.5304429278736785, "grad_norm": 0.9375, "kl": 0.03525672573596239, "learning_rate": 2.6704033669357986e-06, "loss": 0.0014, "reward": 2.895833343267441, "reward_std": 0.3776441812515259, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 991 }, { "completion_length": 123.04167175292969, "epoch": 0.530978188143985, "grad_norm": 2.140625, "kl": 0.06268464820459485, "learning_rate": 2.6657417301744796e-06, "loss": 0.0025, "reward": 2.770833432674408, "reward_std": 0.7469579875469208, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 992 }, { "completion_length": 125.41666984558105, "epoch": 0.5315134484142915, "grad_norm": 1.21875, "kl": 0.0292810145765543, "learning_rate": 2.6610795145231443e-06, "loss": 0.0012, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 993 }, { "completion_length": 141.0416717529297, "epoch": 0.5320487086845979, "grad_norm": 1.3671875, "kl": 0.030940232798457146, "learning_rate": 2.656416736265621e-06, "loss": 0.0012, "reward": 3.2916666865348816, "reward_std": 0.39777331054210663, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 994 }, { "completion_length": 143.00000190734863, "epoch": 0.5325839689549043, "grad_norm": 1.03125, "kl": 0.0226780385710299, "learning_rate": 2.6517534116877046e-06, "loss": 0.0009, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 995 }, { "completion_length": 205.6666717529297, "epoch": 0.5331192292252107, "grad_norm": 1.34375, "kl": 0.03955272724851966, "learning_rate": 2.647089557077099e-06, "loss": 0.0016, "reward": 2.7447916865348816, "reward_std": 0.5635670721530914, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666679084301, "rewards/xmlcount_reward_func": 0.453125, "step": 996 }, { "completion_length": 188.54167556762695, "epoch": 0.5336544894955172, "grad_norm": 1.1796875, "kl": 0.031185157131403685, "learning_rate": 2.6424251887233574e-06, "loss": 0.0012, "reward": 3.0416667461395264, "reward_std": 0.3827027641236782, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 997 }, { "completion_length": 168.875, "epoch": 0.5341897497658237, "grad_norm": 2.03125, "kl": 0.028122437419369817, "learning_rate": 2.6377603229178278e-06, "loss": 0.0011, "reward": 2.7291667461395264, "reward_std": 0.5599979385733604, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 998 }, { "completion_length": 115.91667175292969, "epoch": 0.53472501003613, "grad_norm": 1.015625, "kl": 0.021800895920023322, "learning_rate": 2.633094975953597e-06, "loss": 0.0009, "reward": 3.1666666865348816, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 999 }, { "completion_length": 180.4166717529297, "epoch": 0.5352602703064365, "grad_norm": 1.875, "kl": 0.04836212657392025, "learning_rate": 2.6284291641254308e-06, "loss": 0.0019, "reward": 2.2500000298023224, "reward_std": 1.158027172088623, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.3958333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1000 }, { "completion_length": 138.3750057220459, "epoch": 0.5357955305767429, "grad_norm": 1.5, "kl": 0.029096576385200024, "learning_rate": 2.62376290372972e-06, "loss": 0.0012, "reward": 3.333333373069763, "reward_std": 0.40824830532073975, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1001 }, { "completion_length": 160.2916717529297, "epoch": 0.5363307908470494, "grad_norm": 1.8671875, "kl": 0.06832502828910947, "learning_rate": 2.6190962110644215e-06, "loss": 0.0027, "reward": 3.0625000596046448, "reward_std": 0.6765787862241268, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1002 }, { "completion_length": 145.79166984558105, "epoch": 0.5368660511173559, "grad_norm": 1.3828125, "kl": 0.04142569610849023, "learning_rate": 2.6144291024290004e-06, "loss": 0.0017, "reward": 3.3125, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1003 }, { "completion_length": 175.95833587646484, "epoch": 0.5374013113876622, "grad_norm": 1.4375, "kl": 0.029265500139445066, "learning_rate": 2.6097615941243777e-06, "loss": 0.0012, "reward": 2.6875000596046448, "reward_std": 0.40223564952611923, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1004 }, { "completion_length": 116.12500381469727, "epoch": 0.5379365716579687, "grad_norm": 0.90625, "kl": 0.04694632440805435, "learning_rate": 2.605093702452868e-06, "loss": 0.0019, "reward": 3.3125, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1005 }, { "completion_length": 151.50000381469727, "epoch": 0.5384718319282751, "grad_norm": 1.9921875, "kl": 0.04991014767438173, "learning_rate": 2.600425443718127e-06, "loss": 0.002, "reward": 3.0625000596046448, "reward_std": 0.5599183700978756, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1006 }, { "completion_length": 146.8333396911621, "epoch": 0.5390070921985816, "grad_norm": 1.6640625, "kl": 0.03239775216206908, "learning_rate": 2.595756834225089e-06, "loss": 0.0013, "reward": 3.458333373069763, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1007 }, { "completion_length": 163.5833396911621, "epoch": 0.539542352468888, "grad_norm": 1.9609375, "kl": 0.03426534216850996, "learning_rate": 2.591087890279917e-06, "loss": 0.0014, "reward": 3.0000000596046448, "reward_std": 0.8296719007194042, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1008 }, { "completion_length": 170.04166793823242, "epoch": 0.5400776127391944, "grad_norm": 1.109375, "kl": 0.02527566161006689, "learning_rate": 2.58641862818994e-06, "loss": 0.001, "reward": 2.645833343267441, "reward_std": 0.4242093414068222, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1009 }, { "completion_length": 150.33333587646484, "epoch": 0.5406128730095009, "grad_norm": 1.59375, "kl": 0.027325558941811323, "learning_rate": 2.5817490642636e-06, "loss": 0.0011, "reward": 2.583333432674408, "reward_std": 0.7205219715833664, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1010 }, { "completion_length": 124.375, "epoch": 0.5411481332798073, "grad_norm": 1.2421875, "kl": 0.03881736192852259, "learning_rate": 2.5770792148103916e-06, "loss": 0.0016, "reward": 3.333333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1011 }, { "completion_length": 187.00000762939453, "epoch": 0.5416833935501137, "grad_norm": 1.796875, "kl": 0.04065545601770282, "learning_rate": 2.5724090961408066e-06, "loss": 0.0016, "reward": 2.3958333879709244, "reward_std": 0.3572172410786152, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 1012 }, { "completion_length": 121.04166984558105, "epoch": 0.5422186538204202, "grad_norm": 0.73828125, "kl": 0.039184169843792915, "learning_rate": 2.5677387245662782e-06, "loss": 0.0016, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1013 }, { "completion_length": 143.58333778381348, "epoch": 0.5427539140907266, "grad_norm": 1.703125, "kl": 0.03704654565081, "learning_rate": 2.5630681163991224e-06, "loss": 0.0015, "reward": 3.145833373069763, "reward_std": 0.5618248581886292, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1014 }, { "completion_length": 137.87500762939453, "epoch": 0.5432891743610331, "grad_norm": 1.46875, "kl": 0.026060293428599834, "learning_rate": 2.5583972879524817e-06, "loss": 0.001, "reward": 3.2291667461395264, "reward_std": 0.6634034961462021, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1015 }, { "completion_length": 150.66666984558105, "epoch": 0.5438244346313394, "grad_norm": 1.484375, "kl": 0.023670056369155645, "learning_rate": 2.5537262555402675e-06, "loss": 0.0009, "reward": 3.2291667461395264, "reward_std": 0.51335409283638, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1016 }, { "completion_length": 137.62500190734863, "epoch": 0.5443596949016459, "grad_norm": 1.6015625, "kl": 0.025291157886385918, "learning_rate": 2.5490550354771044e-06, "loss": 0.001, "reward": 3.395833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1017 }, { "completion_length": 199.79166984558105, "epoch": 0.5448949551719524, "grad_norm": 1.6953125, "kl": 0.04733401257544756, "learning_rate": 2.544383644078271e-06, "loss": 0.0019, "reward": 2.2916667461395264, "reward_std": 0.5906235836446285, "rewards/correctness_reward_func": 0.9166666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1018 }, { "completion_length": 163.5416717529297, "epoch": 0.5454302154422588, "grad_norm": 1.8046875, "kl": 0.019132951041683555, "learning_rate": 2.539712097659647e-06, "loss": 0.0008, "reward": 2.958333373069763, "reward_std": 0.47524039447307587, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1019 }, { "completion_length": 124.37500381469727, "epoch": 0.5459654757125653, "grad_norm": 1.6640625, "kl": 0.03320982772856951, "learning_rate": 2.5350404125376494e-06, "loss": 0.0013, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1020 }, { "completion_length": 171.2500057220459, "epoch": 0.5465007359828716, "grad_norm": 1.3125, "kl": 0.03306609811261296, "learning_rate": 2.530368605029185e-06, "loss": 0.0013, "reward": 2.645833373069763, "reward_std": 0.4741215407848358, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1021 }, { "completion_length": 127.37500381469727, "epoch": 0.5470359962531781, "grad_norm": 0.921875, "kl": 0.036080996971577406, "learning_rate": 2.5256966914515823e-06, "loss": 0.0014, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1022 }, { "completion_length": 154.00000381469727, "epoch": 0.5475712565234846, "grad_norm": 1.6328125, "kl": 0.028959017246961594, "learning_rate": 2.5210246881225448e-06, "loss": 0.0012, "reward": 3.4375000596046448, "reward_std": 0.11558076739311218, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1023 }, { "completion_length": 153.16667366027832, "epoch": 0.548106516793791, "grad_norm": 1.5, "kl": 0.04141217190772295, "learning_rate": 2.516352611360088e-06, "loss": 0.0017, "reward": 2.6875000596046448, "reward_std": 0.3894420526921749, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1024 }, { "completion_length": 179.12500381469727, "epoch": 0.5486417770640974, "grad_norm": 1.4453125, "kl": 0.02090016705915332, "learning_rate": 2.511680477482482e-06, "loss": 0.0008, "reward": 2.7291667461395264, "reward_std": 0.39121396839618683, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1025 }, { "completion_length": 168.7916717529297, "epoch": 0.5491770373344038, "grad_norm": 1.265625, "kl": 0.02719574049115181, "learning_rate": 2.5070083028082004e-06, "loss": 0.0011, "reward": 2.8333334028720856, "reward_std": 0.3707359693944454, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 1026 }, { "completion_length": 185.50000762939453, "epoch": 0.5497122976047103, "grad_norm": 1.59375, "kl": 0.02753805136308074, "learning_rate": 2.5023361036558546e-06, "loss": 0.0011, "reward": 2.208333432674408, "reward_std": 0.8469306528568268, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1027 }, { "completion_length": 128.08333587646484, "epoch": 0.5502475578750168, "grad_norm": 1.578125, "kl": 0.02586387423798442, "learning_rate": 2.497663896344146e-06, "loss": 0.001, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1028 }, { "completion_length": 165.0833396911621, "epoch": 0.5507828181453231, "grad_norm": 0.5546875, "kl": 0.02411092072725296, "learning_rate": 2.4929916971917995e-06, "loss": 0.001, "reward": 3.458333373069763, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1029 }, { "completion_length": 141.66666793823242, "epoch": 0.5513180784156296, "grad_norm": 0.72265625, "kl": 0.029878363013267517, "learning_rate": 2.4883195225175188e-06, "loss": 0.0012, "reward": 3.2291666865348816, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1030 }, { "completion_length": 142.4583339691162, "epoch": 0.551853338685936, "grad_norm": 1.421875, "kl": 0.048918829299509525, "learning_rate": 2.4836473886399133e-06, "loss": 0.002, "reward": 3.2500000596046448, "reward_std": 0.46232303977012634, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1031 }, { "completion_length": 184.1666717529297, "epoch": 0.5523885989562425, "grad_norm": 1.6484375, "kl": 0.027948823757469654, "learning_rate": 2.4789753118774552e-06, "loss": 0.0011, "reward": 3.208333432674408, "reward_std": 0.5643851608037949, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1032 }, { "completion_length": 175.29167556762695, "epoch": 0.552923859226549, "grad_norm": 1.265625, "kl": 0.028014506213366985, "learning_rate": 2.474303308548418e-06, "loss": 0.0011, "reward": 2.776041716337204, "reward_std": 0.5555943250656128, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 1033 }, { "completion_length": 156.00000381469727, "epoch": 0.5534591194968553, "grad_norm": 1.2109375, "kl": 0.047277290374040604, "learning_rate": 2.469631394970816e-06, "loss": 0.0019, "reward": 3.1666667461395264, "reward_std": 0.5163978338241577, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1034 }, { "completion_length": 179.62500762939453, "epoch": 0.5539943797671618, "grad_norm": 1.5234375, "kl": 0.03146013617515564, "learning_rate": 2.464959587462351e-06, "loss": 0.0013, "reward": 2.75, "reward_std": 0.6605896055698395, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1035 }, { "completion_length": 138.83333587646484, "epoch": 0.5545296400374682, "grad_norm": 1.6875, "kl": 0.018021578900516033, "learning_rate": 2.4602879023403547e-06, "loss": 0.0007, "reward": 3.2291667461395264, "reward_std": 0.45845916867256165, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1036 }, { "completion_length": 179.50000190734863, "epoch": 0.5550649003077747, "grad_norm": 1.7734375, "kl": 0.03059578686952591, "learning_rate": 2.4556163559217294e-06, "loss": 0.0012, "reward": 2.333333373069763, "reward_std": 0.8427640199661255, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1037 }, { "completion_length": 165.4166717529297, "epoch": 0.555600160578081, "grad_norm": 2.09375, "kl": 0.04378655459731817, "learning_rate": 2.4509449645228965e-06, "loss": 0.0018, "reward": 2.833333432674408, "reward_std": 0.3533533588051796, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666865348816, "rewards/xmlcount_reward_func": 0.5, "step": 1038 }, { "completion_length": 145.95833587646484, "epoch": 0.5561354208483875, "grad_norm": 1.390625, "kl": 0.030613688752055168, "learning_rate": 2.4462737444597337e-06, "loss": 0.0012, "reward": 3.125, "reward_std": 0.523861289024353, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1039 }, { "completion_length": 145.1666717529297, "epoch": 0.556670681118694, "grad_norm": 1.515625, "kl": 0.06425127293914557, "learning_rate": 2.441602712047519e-06, "loss": 0.0026, "reward": 2.645833373069763, "reward_std": 0.4741215407848358, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1040 }, { "completion_length": 143.87500381469727, "epoch": 0.5572059413890004, "grad_norm": 2.625, "kl": 0.04020787123590708, "learning_rate": 2.436931883600879e-06, "loss": 0.0016, "reward": 3.057291805744171, "reward_std": 0.6287723630666733, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333507180214, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 1041 }, { "completion_length": 164.70833587646484, "epoch": 0.5577412016593069, "grad_norm": 1.796875, "kl": 0.027084154076874256, "learning_rate": 2.432261275433722e-06, "loss": 0.0011, "reward": 2.6621667444705963, "reward_std": 0.6238786093890667, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.49549999833106995, "step": 1042 }, { "completion_length": 145.70834159851074, "epoch": 0.5582764619296132, "grad_norm": 1.8828125, "kl": 0.028898541815578938, "learning_rate": 2.427590903859194e-06, "loss": 0.0012, "reward": 2.791666716337204, "reward_std": 0.6409856230020523, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1043 }, { "completion_length": 157.3333396911621, "epoch": 0.5588117221999197, "grad_norm": 1.953125, "kl": 0.036141276359558105, "learning_rate": 2.4229207851896096e-06, "loss": 0.0014, "reward": 2.9947917461395264, "reward_std": 0.7928604781627655, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 1044 }, { "completion_length": 153.50000381469727, "epoch": 0.5593469824702262, "grad_norm": 1.7890625, "kl": 0.03226162260398269, "learning_rate": 2.4182509357364005e-06, "loss": 0.0013, "reward": 2.708333373069763, "reward_std": 0.5884110182523727, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1045 }, { "completion_length": 235.75000381469727, "epoch": 0.5598822427405326, "grad_norm": 1.9296875, "kl": 0.03827035194262862, "learning_rate": 2.4135813718100607e-06, "loss": 0.0015, "reward": 2.541666731238365, "reward_std": 0.43266692385077477, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333395421505, "rewards/xmlcount_reward_func": 0.4375, "step": 1046 }, { "completion_length": 141.6666717529297, "epoch": 0.560417503010839, "grad_norm": 2.015625, "kl": 0.034219959285110235, "learning_rate": 2.4089121097200836e-06, "loss": 0.0014, "reward": 3.2916667461395264, "reward_std": 0.5103103443980217, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1047 }, { "completion_length": 142.29166984558105, "epoch": 0.5609527632811454, "grad_norm": 2.265625, "kl": 0.03296273294836283, "learning_rate": 2.404243165774912e-06, "loss": 0.0013, "reward": 2.9166667461395264, "reward_std": 0.6358941905200481, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1048 }, { "completion_length": 183.08333587646484, "epoch": 0.5614880235514519, "grad_norm": 1.5625, "kl": 0.05004376173019409, "learning_rate": 2.3995745562818747e-06, "loss": 0.002, "reward": 2.704166680574417, "reward_std": 0.9476025104522705, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.433333333581686, "step": 1049 }, { "completion_length": 158.0416717529297, "epoch": 0.5620232838217584, "grad_norm": 0.84765625, "kl": 0.027449314016848803, "learning_rate": 2.3949062975471325e-06, "loss": 0.0011, "reward": 3.395833373069763, "reward_std": 0.25515517592430115, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1050 }, { "completion_length": 128.50000381469727, "epoch": 0.5625585440920647, "grad_norm": 1.4765625, "kl": 0.03116176975890994, "learning_rate": 2.390238405875623e-06, "loss": 0.0012, "reward": 3.2916666865348816, "reward_std": 0.2978862635791302, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1051 }, { "completion_length": 203.7916717529297, "epoch": 0.5630938043623712, "grad_norm": 1.7734375, "kl": 0.027977202786132693, "learning_rate": 2.385570897571001e-06, "loss": 0.0011, "reward": 2.8125000596046448, "reward_std": 0.6778688579797745, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1052 }, { "completion_length": 102.75000190734863, "epoch": 0.5636290646326776, "grad_norm": 1.4765625, "kl": 0.05541924946010113, "learning_rate": 2.3809037889355794e-06, "loss": 0.0022, "reward": 3.4166666865348816, "reward_std": 0.11949635669589043, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1053 }, { "completion_length": 144.45833778381348, "epoch": 0.5641643249029841, "grad_norm": 1.8125, "kl": 0.057408999651670456, "learning_rate": 2.3762370962702803e-06, "loss": 0.0023, "reward": 3.083333373069763, "reward_std": 0.5582601875066757, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1054 }, { "completion_length": 163.0833396911621, "epoch": 0.5646995851732906, "grad_norm": 1.0546875, "kl": 0.022327065002173185, "learning_rate": 2.371570835874569e-06, "loss": 0.0009, "reward": 3.395833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1055 }, { "completion_length": 190.25000381469727, "epoch": 0.5652348454435969, "grad_norm": 1.4609375, "kl": 0.0315181240439415, "learning_rate": 2.366905024046404e-06, "loss": 0.0013, "reward": 2.7916666865348816, "reward_std": 0.7279854267835617, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1056 }, { "completion_length": 148.12500381469727, "epoch": 0.5657701057139034, "grad_norm": 1.9375, "kl": 0.031283630756661296, "learning_rate": 2.3622396770821735e-06, "loss": 0.0013, "reward": 2.7916667461395264, "reward_std": 0.743688777089119, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1057 }, { "completion_length": 167.4583396911621, "epoch": 0.5663053659842098, "grad_norm": 1.796875, "kl": 0.04646863928064704, "learning_rate": 2.3575748112766434e-06, "loss": 0.0019, "reward": 2.708333373069763, "reward_std": 0.517269667237997, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1058 }, { "completion_length": 153.87500381469727, "epoch": 0.5668406262545163, "grad_norm": 1.296875, "kl": 0.032381411641836166, "learning_rate": 2.352910442922902e-06, "loss": 0.0013, "reward": 3.145833373069763, "reward_std": 0.4937000125646591, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1059 }, { "completion_length": 167.9583396911621, "epoch": 0.5673758865248227, "grad_norm": 1.5390625, "kl": 0.029462992679327726, "learning_rate": 2.348246588312296e-06, "loss": 0.0012, "reward": 2.333333373069763, "reward_std": 0.5401924960315228, "rewards/correctness_reward_func": 0.9166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1060 }, { "completion_length": 155.00000381469727, "epoch": 0.5679111467951291, "grad_norm": 1.3125, "kl": 0.02696467051282525, "learning_rate": 2.34358326373438e-06, "loss": 0.0011, "reward": 2.8125000596046448, "reward_std": 0.40438438951969147, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1061 }, { "completion_length": 154.04166793823242, "epoch": 0.5684464070654356, "grad_norm": 1.5390625, "kl": 0.03205305617302656, "learning_rate": 2.338920485476857e-06, "loss": 0.0013, "reward": 3.1666667461395264, "reward_std": 0.5222771726548672, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1062 }, { "completion_length": 151.37500190734863, "epoch": 0.568981667335742, "grad_norm": 1.34375, "kl": 0.04629839211702347, "learning_rate": 2.3342582698255204e-06, "loss": 0.0019, "reward": 3.1875000596046448, "reward_std": 0.35412560403347015, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1063 }, { "completion_length": 131.95833778381348, "epoch": 0.5695169276060484, "grad_norm": 1.640625, "kl": 0.03145950939506292, "learning_rate": 2.3295966330642018e-06, "loss": 0.0013, "reward": 2.3259166926145554, "reward_std": 0.12022912129759789, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.49258333444595337, "step": 1064 }, { "completion_length": 155.87500381469727, "epoch": 0.5700521878763549, "grad_norm": 1.765625, "kl": 0.03759356401860714, "learning_rate": 2.3249355914747078e-06, "loss": 0.0015, "reward": 2.8541667461395264, "reward_std": 0.9787831455469131, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1065 }, { "completion_length": 169.33333778381348, "epoch": 0.5705874481466613, "grad_norm": 2.265625, "kl": 0.023252596147358418, "learning_rate": 2.3202751613367676e-06, "loss": 0.0009, "reward": 2.145833432674408, "reward_std": 0.9450224339962006, "rewards/correctness_reward_func": 0.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1066 }, { "completion_length": 149.00000762939453, "epoch": 0.5711227084169678, "grad_norm": 99.0, "kl": 1.3318175182212144, "learning_rate": 2.3156153589279745e-06, "loss": 0.0533, "reward": 3.4444167017936707, "reward_std": 0.1361508071422577, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.48608332872390747, "step": 1067 }, { "completion_length": 137.58333778381348, "epoch": 0.5716579686872741, "grad_norm": 1.8828125, "kl": 0.03667767532169819, "learning_rate": 2.3109562005237284e-06, "loss": 0.0015, "reward": 3.2291666865348816, "reward_std": 0.4864138811826706, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1068 }, { "completion_length": 128.25000381469727, "epoch": 0.5721932289575806, "grad_norm": 3.453125, "kl": 0.0949320113286376, "learning_rate": 2.306297702397183e-06, "loss": 0.0038, "reward": 3.3125000596046448, "reward_std": 0.40438438951969147, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1069 }, { "completion_length": 163.62500190734863, "epoch": 0.5727284892278871, "grad_norm": 296.0, "kl": 4.624088962562382, "learning_rate": 2.301639880819183e-06, "loss": 0.185, "reward": 2.812500089406967, "reward_std": 0.9114490151405334, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4583333358168602, "step": 1070 }, { "completion_length": 137.37500190734863, "epoch": 0.5732637494981935, "grad_norm": 2.078125, "kl": 0.04243481811136007, "learning_rate": 2.296982752058208e-06, "loss": 0.0017, "reward": 3.3125000596046448, "reward_std": 0.4592793248593807, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1071 }, { "completion_length": 228.2916717529297, "epoch": 0.5737990097685, "grad_norm": 1.875, "kl": 0.04375831922516227, "learning_rate": 2.292326332380324e-06, "loss": 0.0018, "reward": 2.7968750596046448, "reward_std": 0.918915580958128, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000149011612, "rewards/xmlcount_reward_func": 0.484375, "step": 1072 }, { "completion_length": 146.1250057220459, "epoch": 0.5743342700388063, "grad_norm": 0.62890625, "kl": 0.04183990182355046, "learning_rate": 2.2876706380491153e-06, "loss": 0.0017, "reward": 3.3541666865348816, "reward_std": 0.18399503827095032, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1073 }, { "completion_length": 161.2916717529297, "epoch": 0.5748695303091128, "grad_norm": 2.03125, "kl": 0.08961763884872198, "learning_rate": 2.283015685325635e-06, "loss": 0.0036, "reward": 2.5000000596046448, "reward_std": 0.3347994200885296, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1074 }, { "completion_length": 137.58333587646484, "epoch": 0.5754047905794193, "grad_norm": 1.3203125, "kl": 0.04768128413707018, "learning_rate": 2.278361490468345e-06, "loss": 0.0019, "reward": 3.1666666865348816, "reward_std": 0.3546550087630749, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1075 }, { "completion_length": 150.29166984558105, "epoch": 0.5759400508497257, "grad_norm": 1.828125, "kl": 0.057581949047744274, "learning_rate": 2.2737080697330595e-06, "loss": 0.0023, "reward": 3.2916667461395264, "reward_std": 0.3602609895169735, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1076 }, { "completion_length": 121.45833587646484, "epoch": 0.5764753111200321, "grad_norm": 1.3984375, "kl": 0.05234599346294999, "learning_rate": 2.269055439372892e-06, "loss": 0.0021, "reward": 3.1666666865348816, "reward_std": 0.4779854416847229, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1077 }, { "completion_length": 190.4583396911621, "epoch": 0.5770105713903385, "grad_norm": 1.7890625, "kl": 0.028957413276657462, "learning_rate": 2.2644036156381923e-06, "loss": 0.0012, "reward": 2.5625000596046448, "reward_std": 1.0113781988620758, "rewards/correctness_reward_func": 1.1666667088866234, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1078 }, { "completion_length": 142.75000381469727, "epoch": 0.577545831660645, "grad_norm": 2.140625, "kl": 0.027713227085769176, "learning_rate": 2.2597526147764935e-06, "loss": 0.0011, "reward": 3.083333432674408, "reward_std": 0.5763519518077374, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1079 }, { "completion_length": 164.33333778381348, "epoch": 0.5780810919309515, "grad_norm": 1.5078125, "kl": 0.04261440085247159, "learning_rate": 2.255102453032456e-06, "loss": 0.0017, "reward": 3.309333384037018, "reward_std": 0.26535478234291077, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.49683333933353424, "step": 1080 }, { "completion_length": 135.12500190734863, "epoch": 0.5786163522012578, "grad_norm": 1.4375, "kl": 0.025709964334964752, "learning_rate": 2.250453146647807e-06, "loss": 0.001, "reward": 3.2291666865348816, "reward_std": 0.49727512896060944, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1081 }, { "completion_length": 152.2916717529297, "epoch": 0.5791516124715643, "grad_norm": 0.92578125, "kl": 0.025608718395233154, "learning_rate": 2.2458047118612894e-06, "loss": 0.001, "reward": 3.25, "reward_std": 0.273861289024353, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1082 }, { "completion_length": 161.4166717529297, "epoch": 0.5796868727418707, "grad_norm": 0.79296875, "kl": 0.027878669556230307, "learning_rate": 2.2411571649086e-06, "loss": 0.0011, "reward": 3.3125, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1083 }, { "completion_length": 148.3750057220459, "epoch": 0.5802221330121772, "grad_norm": 1.71875, "kl": 0.0236201249063015, "learning_rate": 2.236510522022333e-06, "loss": 0.0009, "reward": 2.8750000596046448, "reward_std": 0.743688777089119, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1084 }, { "completion_length": 140.45833778381348, "epoch": 0.5807573932824837, "grad_norm": 1.3515625, "kl": 0.05232280120253563, "learning_rate": 2.231864799431928e-06, "loss": 0.0021, "reward": 3.2291666865348816, "reward_std": 0.3248923271894455, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1085 }, { "completion_length": 188.9166717529297, "epoch": 0.58129265355279, "grad_norm": 1.4453125, "kl": 0.038821437396109104, "learning_rate": 2.227220013363608e-06, "loss": 0.0016, "reward": 2.8958334028720856, "reward_std": 0.5133540891110897, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1086 }, { "completion_length": 166.25000381469727, "epoch": 0.5818279138230965, "grad_norm": 1.5, "kl": 0.034582878928631544, "learning_rate": 2.2225761800403278e-06, "loss": 0.0014, "reward": 2.958333373069763, "reward_std": 0.6907386183738708, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.3958333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1087 }, { "completion_length": 129.37500381469727, "epoch": 0.5823631740934029, "grad_norm": 1.6484375, "kl": 0.04625696036964655, "learning_rate": 2.2179333156817114e-06, "loss": 0.0019, "reward": 3.0416666865348816, "reward_std": 0.506598636507988, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1088 }, { "completion_length": 159.70833778381348, "epoch": 0.5828984343637094, "grad_norm": 1.109375, "kl": 0.03190090577118099, "learning_rate": 2.2132914365039993e-06, "loss": 0.0013, "reward": 3.2291666865348816, "reward_std": 0.3248923271894455, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1089 }, { "completion_length": 121.00000190734863, "epoch": 0.5834336946340158, "grad_norm": 1.6875, "kl": 0.04292154032737017, "learning_rate": 2.208650558719992e-06, "loss": 0.0017, "reward": 3.2500000596046448, "reward_std": 0.46232306957244873, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1090 }, { "completion_length": 115.08333587646484, "epoch": 0.5839689549043222, "grad_norm": 1.8046875, "kl": 0.06790947215631604, "learning_rate": 2.2040106985389925e-06, "loss": 0.0027, "reward": 3.1875000596046448, "reward_std": 0.3323967605829239, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1091 }, { "completion_length": 139.66666984558105, "epoch": 0.5845042151746287, "grad_norm": 1.0625, "kl": 0.031267859041690826, "learning_rate": 2.1993718721667466e-06, "loss": 0.0013, "reward": 3.458333373069763, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1092 }, { "completion_length": 131.5833339691162, "epoch": 0.5850394754449351, "grad_norm": 0.6953125, "kl": 0.02611220208927989, "learning_rate": 2.194734095805395e-06, "loss": 0.001, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1093 }, { "completion_length": 159.3333396911621, "epoch": 0.5855747357152415, "grad_norm": 1.265625, "kl": 0.02461553202010691, "learning_rate": 2.1900973856534048e-06, "loss": 0.001, "reward": 2.458333373069763, "reward_std": 0.10206207260489464, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1094 }, { "completion_length": 131.3333396911621, "epoch": 0.586109995985548, "grad_norm": 7.03125, "kl": 0.26300066569820046, "learning_rate": 2.185461757905524e-06, "loss": 0.0105, "reward": 3.2916667461395264, "reward_std": 0.3602609857916832, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1095 }, { "completion_length": 136.79166793823242, "epoch": 0.5866452562558544, "grad_norm": 1.921875, "kl": 0.07489533023908734, "learning_rate": 2.1808272287527176e-06, "loss": 0.003, "reward": 2.4375000596046448, "reward_std": 0.6227572709321976, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1096 }, { "completion_length": 133.0833396911621, "epoch": 0.5871805165261609, "grad_norm": 1.359375, "kl": 0.04591457825154066, "learning_rate": 2.1761938143821116e-06, "loss": 0.0018, "reward": 3.083333373069763, "reward_std": 0.5515970289707184, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1097 }, { "completion_length": 145.33333778381348, "epoch": 0.5877157767964672, "grad_norm": 1.421875, "kl": 0.03769760578870773, "learning_rate": 2.1715615309769446e-06, "loss": 0.0015, "reward": 3.083333373069763, "reward_std": 0.37134991213679314, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1098 }, { "completion_length": 141.08333778381348, "epoch": 0.5882510370667737, "grad_norm": 1.140625, "kl": 0.02804331900551915, "learning_rate": 2.1669303947164983e-06, "loss": 0.0011, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1099 }, { "completion_length": 164.58333587646484, "epoch": 0.5887862973370801, "grad_norm": 1.859375, "kl": 0.040140153374522924, "learning_rate": 2.162300421776052e-06, "loss": 0.0016, "reward": 2.6041667461395264, "reward_std": 0.6310785673558712, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 1100 }, { "completion_length": 164.00000381469727, "epoch": 0.5893215576073866, "grad_norm": 1.15625, "kl": 0.04401758685708046, "learning_rate": 2.1576716283268206e-06, "loss": 0.0018, "reward": 2.854166716337204, "reward_std": 0.2648099809885025, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1101 }, { "completion_length": 128.50000381469727, "epoch": 0.5898568178776931, "grad_norm": 1.6328125, "kl": 0.034419551491737366, "learning_rate": 2.1530440305358972e-06, "loss": 0.0014, "reward": 3.083333373069763, "reward_std": 0.6821095794439316, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1102 }, { "completion_length": 131.7500057220459, "epoch": 0.5903920781479994, "grad_norm": 0.99609375, "kl": 0.03005096409469843, "learning_rate": 2.1484176445662035e-06, "loss": 0.0012, "reward": 3.25, "reward_std": 0.273861289024353, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1103 }, { "completion_length": 166.37500381469727, "epoch": 0.5909273384183059, "grad_norm": 1.46875, "kl": 0.06135753355920315, "learning_rate": 2.1437924865764247e-06, "loss": 0.0025, "reward": 3.2500001192092896, "reward_std": 0.4650702327489853, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1104 }, { "completion_length": 108.41666984558105, "epoch": 0.5914625986886123, "grad_norm": 3.53125, "kl": 0.05622281040996313, "learning_rate": 2.139168572720958e-06, "loss": 0.0022, "reward": 2.8750000596046448, "reward_std": 0.7841716818511486, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1105 }, { "completion_length": 157.3333396911621, "epoch": 0.5919978589589188, "grad_norm": 1.75, "kl": 0.04904384817928076, "learning_rate": 2.1345459191498565e-06, "loss": 0.002, "reward": 3.1875000596046448, "reward_std": 0.3572172783315182, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1106 }, { "completion_length": 129.29166984558105, "epoch": 0.5925331192292252, "grad_norm": 1.03125, "kl": 0.023244800977408886, "learning_rate": 2.1299245420087685e-06, "loss": 0.0009, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1107 }, { "completion_length": 126.33333587646484, "epoch": 0.5930683794995316, "grad_norm": 1.5703125, "kl": 0.029206049628555775, "learning_rate": 2.1253044574388886e-06, "loss": 0.0012, "reward": 3.2916667461395264, "reward_std": 0.3602609895169735, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1108 }, { "completion_length": 139.79166984558105, "epoch": 0.5936036397698381, "grad_norm": 1.2890625, "kl": 0.022994154831394553, "learning_rate": 2.1206856815768925e-06, "loss": 0.0009, "reward": 3.3541666865348816, "reward_std": 0.27258947491645813, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1109 }, { "completion_length": 173.4166717529297, "epoch": 0.5941389000401445, "grad_norm": 2.078125, "kl": 0.07352372910827398, "learning_rate": 2.1160682305548867e-06, "loss": 0.0029, "reward": 3.1041667461395264, "reward_std": 0.6647366434335709, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000111758709, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1110 }, { "completion_length": 128.04166984558105, "epoch": 0.594674160310451, "grad_norm": 2.078125, "kl": 0.05020967125892639, "learning_rate": 2.1114521205003512e-06, "loss": 0.002, "reward": 2.833333373069763, "reward_std": 0.5222772061824799, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1111 }, { "completion_length": 178.00000381469727, "epoch": 0.5952094205807574, "grad_norm": 2.015625, "kl": 0.050168720073997974, "learning_rate": 2.10683736753608e-06, "loss": 0.002, "reward": 2.7291667759418488, "reward_std": 0.8597998470067978, "rewards/correctness_reward_func": 1.333333395421505, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1112 }, { "completion_length": 153.00000381469727, "epoch": 0.5957446808510638, "grad_norm": 1.7578125, "kl": 0.025686150649562478, "learning_rate": 2.1022239877801316e-06, "loss": 0.001, "reward": 3.3281251192092896, "reward_std": 0.3285987824201584, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 1113 }, { "completion_length": 122.33333778381348, "epoch": 0.5962799411213703, "grad_norm": 1.84375, "kl": 0.0372623517177999, "learning_rate": 2.0976119973457625e-06, "loss": 0.0015, "reward": 3.270833373069763, "reward_std": 0.37377963587641716, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1114 }, { "completion_length": 133.83333587646484, "epoch": 0.5968152013916767, "grad_norm": 0.6953125, "kl": 0.029676989652216434, "learning_rate": 2.09300141234138e-06, "loss": 0.0012, "reward": 3.375, "reward_std": 0.19364917278289795, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1115 }, { "completion_length": 139.7083396911621, "epoch": 0.5973504616619831, "grad_norm": 1.296875, "kl": 0.02908479329198599, "learning_rate": 2.0883922488704835e-06, "loss": 0.0012, "reward": 3.3750000596046448, "reward_std": 0.25129128620028496, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1116 }, { "completion_length": 148.04166984558105, "epoch": 0.5978857219322896, "grad_norm": 2.1875, "kl": 0.04395298566669226, "learning_rate": 2.083784523031605e-06, "loss": 0.0018, "reward": 2.8750000596046448, "reward_std": 0.4449404589831829, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1117 }, { "completion_length": 138.58333587646484, "epoch": 0.598420982202596, "grad_norm": 1.6796875, "kl": 0.051251002587378025, "learning_rate": 2.079178250918257e-06, "loss": 0.0021, "reward": 2.958333432674408, "reward_std": 0.7584633976221085, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1118 }, { "completion_length": 179.33333778381348, "epoch": 0.5989562424729025, "grad_norm": 1.765625, "kl": 0.04658348159864545, "learning_rate": 2.074573448618874e-06, "loss": 0.0019, "reward": 2.312500089406967, "reward_std": 0.8353358805179596, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 1119 }, { "completion_length": 142.04166984558105, "epoch": 0.5994915027432088, "grad_norm": 1.1328125, "kl": 0.02434215135872364, "learning_rate": 2.069970132216754e-06, "loss": 0.001, "reward": 2.9791666865348816, "reward_std": 0.5050541460514069, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1120 }, { "completion_length": 161.62500762939453, "epoch": 0.6000267630135153, "grad_norm": 1.65625, "kl": 0.04097714927047491, "learning_rate": 2.0653683177900114e-06, "loss": 0.0016, "reward": 2.770833373069763, "reward_std": 0.8850989937782288, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.3958333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1121 }, { "completion_length": 114.45833778381348, "epoch": 0.6005620232838218, "grad_norm": 2.640625, "kl": 0.0499598728492856, "learning_rate": 2.06076802141151e-06, "loss": 0.002, "reward": 2.6875000298023224, "reward_std": 0.6272481828927994, "rewards/correctness_reward_func": 1.3333333358168602, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4583333432674408, "step": 1122 }, { "completion_length": 178.12500381469727, "epoch": 0.6010972835541282, "grad_norm": 1.9921875, "kl": 0.03528116596862674, "learning_rate": 2.0561692591488113e-06, "loss": 0.0014, "reward": 2.6458334028720856, "reward_std": 0.7926383912563324, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1123 }, { "completion_length": 162.5833396911621, "epoch": 0.6016325438244347, "grad_norm": 1.1796875, "kl": 0.025379335740581155, "learning_rate": 2.0515720470641216e-06, "loss": 0.001, "reward": 3.3125000596046448, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1124 }, { "completion_length": 147.6250057220459, "epoch": 0.602167804094741, "grad_norm": 1.96875, "kl": 0.024831503629684448, "learning_rate": 2.046976401214229e-06, "loss": 0.001, "reward": 2.708333373069763, "reward_std": 0.8290883004665375, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1125 }, { "completion_length": 176.50000381469727, "epoch": 0.6027030643650475, "grad_norm": 1.65625, "kl": 0.02572201332077384, "learning_rate": 2.042382337650455e-06, "loss": 0.001, "reward": 2.7916667461395264, "reward_std": 0.8075917363166809, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1126 }, { "completion_length": 158.50000762939453, "epoch": 0.603238324635354, "grad_norm": 2.203125, "kl": 0.06118709687143564, "learning_rate": 2.0377898724185926e-06, "loss": 0.0024, "reward": 2.895833432674408, "reward_std": 0.7121906578540802, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1127 }, { "completion_length": 198.8750114440918, "epoch": 0.6037735849056604, "grad_norm": 1.5546875, "kl": 0.019993150606751442, "learning_rate": 2.033199021558851e-06, "loss": 0.0008, "reward": 2.270833373069763, "reward_std": 0.7049218565225601, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1128 }, { "completion_length": 184.2916717529297, "epoch": 0.6043088451759668, "grad_norm": 1.6796875, "kl": 0.03804916702210903, "learning_rate": 2.028609801105805e-06, "loss": 0.0015, "reward": 2.1041667461395264, "reward_std": 0.6306373104453087, "rewards/correctness_reward_func": 0.7500000074505806, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 1129 }, { "completion_length": 145.25000381469727, "epoch": 0.6048441054462732, "grad_norm": 1.6640625, "kl": 0.08065536711364985, "learning_rate": 2.024022227088329e-06, "loss": 0.0032, "reward": 3.145833373069763, "reward_std": 0.6237796358764172, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1130 }, { "completion_length": 131.75000381469727, "epoch": 0.6053793657165797, "grad_norm": 1.4453125, "kl": 0.028958051931113005, "learning_rate": 2.0194363155295525e-06, "loss": 0.0012, "reward": 2.9791666865348816, "reward_std": 0.4242093414068222, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1131 }, { "completion_length": 149.3750057220459, "epoch": 0.6059146259868862, "grad_norm": 1.171875, "kl": 0.03974736947566271, "learning_rate": 2.014852082446796e-06, "loss": 0.0016, "reward": 3.1250000596046448, "reward_std": 0.2803870253264904, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1132 }, { "completion_length": 177.7500057220459, "epoch": 0.6064498862571925, "grad_norm": 1.3515625, "kl": 0.03239813074469566, "learning_rate": 2.010269543851516e-06, "loss": 0.0013, "reward": 3.208333373069763, "reward_std": 0.37592336162924767, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1133 }, { "completion_length": 158.87500762939453, "epoch": 0.606985146527499, "grad_norm": 1.609375, "kl": 0.037661376409232616, "learning_rate": 2.005688715749254e-06, "loss": 0.0015, "reward": 2.7291667461395264, "reward_std": 0.6971828788518906, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1134 }, { "completion_length": 172.54167938232422, "epoch": 0.6075204067978054, "grad_norm": 1.96875, "kl": 0.03389792237430811, "learning_rate": 2.0011096141395742e-06, "loss": 0.0014, "reward": 2.8541667461395264, "reward_std": 0.8515709191560745, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1135 }, { "completion_length": 168.83334350585938, "epoch": 0.6080556670681119, "grad_norm": 1.4140625, "kl": 0.03520802827551961, "learning_rate": 1.996532255016011e-06, "loss": 0.0014, "reward": 3.125, "reward_std": 0.493710458278656, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1136 }, { "completion_length": 171.5416717529297, "epoch": 0.6085909273384184, "grad_norm": 2.015625, "kl": 0.03871652204543352, "learning_rate": 1.9919566543660163e-06, "loss": 0.0015, "reward": 2.833333432674408, "reward_std": 0.6220272481441498, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1137 }, { "completion_length": 128.9583339691162, "epoch": 0.6091261876087247, "grad_norm": 1.1640625, "kl": 0.029786940664052963, "learning_rate": 1.9873828281708954e-06, "loss": 0.0012, "reward": 2.9393333196640015, "reward_std": 0.14860239997506142, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.48099999874830246, "step": 1138 }, { "completion_length": 169.66667366027832, "epoch": 0.6096614478790312, "grad_norm": 1.453125, "kl": 0.024774388410151005, "learning_rate": 1.9828107924057593e-06, "loss": 0.001, "reward": 2.3125000298023224, "reward_std": 0.3655807636678219, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1139 }, { "completion_length": 136.62500190734863, "epoch": 0.6101967081493376, "grad_norm": 2.40625, "kl": 0.037171173840761185, "learning_rate": 1.9782405630394635e-06, "loss": 0.0015, "reward": 2.9375000596046448, "reward_std": 0.889277458190918, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1140 }, { "completion_length": 138.8333396911621, "epoch": 0.6107319684196441, "grad_norm": 2.09375, "kl": 0.03229498118162155, "learning_rate": 1.9736721560345543e-06, "loss": 0.0013, "reward": 2.895833432674408, "reward_std": 0.866707444190979, "rewards/correctness_reward_func": 1.4166667312383652, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1141 }, { "completion_length": 122.66666984558105, "epoch": 0.6112672286899505, "grad_norm": 1.234375, "kl": 0.01973505667410791, "learning_rate": 1.9691055873472153e-06, "loss": 0.0008, "reward": 3.395833373069763, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1142 }, { "completion_length": 122.70833969116211, "epoch": 0.6118024889602569, "grad_norm": 2.125, "kl": 0.033139331731945276, "learning_rate": 1.9645408729272068e-06, "loss": 0.0013, "reward": 2.583333373069763, "reward_std": 0.5320602059364319, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1143 }, { "completion_length": 141.5833339691162, "epoch": 0.6123377492305634, "grad_norm": 1.8046875, "kl": 0.06788896024227142, "learning_rate": 1.959978028717814e-06, "loss": 0.0027, "reward": 3.041666716337204, "reward_std": 0.3872983306646347, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1144 }, { "completion_length": 157.5416717529297, "epoch": 0.6128730095008698, "grad_norm": 1.2421875, "kl": 0.02102912962436676, "learning_rate": 1.9554170706557897e-06, "loss": 0.0008, "reward": 3.2916666865348816, "reward_std": 0.4541241526603699, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1145 }, { "completion_length": 153.16666984558105, "epoch": 0.6134082697711762, "grad_norm": 1.8125, "kl": 0.05245727300643921, "learning_rate": 1.9508580146712967e-06, "loss": 0.0021, "reward": 3.1875000596046448, "reward_std": 0.5530414395034313, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 1146 }, { "completion_length": 143.50000381469727, "epoch": 0.6139435300414827, "grad_norm": 1.96875, "kl": 0.030988771468400955, "learning_rate": 1.94630087668786e-06, "loss": 0.0012, "reward": 2.9791666865348816, "reward_std": 0.7711364179849625, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1147 }, { "completion_length": 186.62500381469727, "epoch": 0.6144787903117891, "grad_norm": 0.97265625, "kl": 0.03605042304843664, "learning_rate": 1.9417456726222995e-06, "loss": 0.0014, "reward": 2.9375000298023224, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1148 }, { "completion_length": 126.6250057220459, "epoch": 0.6150140505820956, "grad_norm": 0.953125, "kl": 0.02856651460751891, "learning_rate": 1.9371924183846835e-06, "loss": 0.0011, "reward": 2.7916666865348816, "reward_std": 0.2813657224178314, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1149 }, { "completion_length": 169.2083396911621, "epoch": 0.6155493108524019, "grad_norm": 1.3984375, "kl": 0.01892117876559496, "learning_rate": 1.9326411298782706e-06, "loss": 0.0008, "reward": 3.208333373069763, "reward_std": 0.6018974483013153, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1150 }, { "completion_length": 125.70833587646484, "epoch": 0.6160845711227084, "grad_norm": 1.1328125, "kl": 0.03264283831231296, "learning_rate": 1.92809182299945e-06, "loss": 0.0013, "reward": 3.395833373069763, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1151 }, { "completion_length": 174.83333587646484, "epoch": 0.6166198313930149, "grad_norm": 1.078125, "kl": 0.03417334379628301, "learning_rate": 1.9235445136376954e-06, "loss": 0.0014, "reward": 2.9791667461395264, "reward_std": 0.4493577480316162, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1152 }, { "completion_length": 138.83333778381348, "epoch": 0.6171550916633213, "grad_norm": 2.125, "kl": 0.045399333350360394, "learning_rate": 1.9189992176754997e-06, "loss": 0.0018, "reward": 3.083333432674408, "reward_std": 0.7781640440225601, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1153 }, { "completion_length": 160.00000381469727, "epoch": 0.6176903519336278, "grad_norm": 1.6875, "kl": 0.02971332473680377, "learning_rate": 1.914455950988322e-06, "loss": 0.0012, "reward": 2.291666716337204, "reward_std": 0.5397901386022568, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1154 }, { "completion_length": 126.0000057220459, "epoch": 0.6182256122039341, "grad_norm": 1.0078125, "kl": 0.032387261278927326, "learning_rate": 1.909914729444539e-06, "loss": 0.0013, "reward": 3.083333343267441, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1155 }, { "completion_length": 129.66666984558105, "epoch": 0.6187608724742406, "grad_norm": 2.28125, "kl": 0.08170109568163753, "learning_rate": 1.9053755689053794e-06, "loss": 0.0033, "reward": 3.0416667461395264, "reward_std": 0.5094902068376541, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1156 }, { "completion_length": 132.75000381469727, "epoch": 0.6192961327445471, "grad_norm": 1.5078125, "kl": 0.036833798978477716, "learning_rate": 1.9008384852248775e-06, "loss": 0.0015, "reward": 3.145833373069763, "reward_std": 0.4592793583869934, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1157 }, { "completion_length": 170.83333778381348, "epoch": 0.6198313930148535, "grad_norm": 1.59375, "kl": 0.023054254008457065, "learning_rate": 1.89630349424981e-06, "loss": 0.0009, "reward": 2.6250000596046448, "reward_std": 0.7045579701662064, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1158 }, { "completion_length": 143.6250057220459, "epoch": 0.6203666532851599, "grad_norm": 1.5390625, "kl": 0.02555786306038499, "learning_rate": 1.8917706118196455e-06, "loss": 0.001, "reward": 2.7291666865348816, "reward_std": 0.3248923234641552, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1159 }, { "completion_length": 251.33334350585938, "epoch": 0.6209019135554663, "grad_norm": 1.6328125, "kl": 0.03249187534675002, "learning_rate": 1.8872398537664902e-06, "loss": 0.0013, "reward": 2.479166716337204, "reward_std": 1.0055317729711533, "rewards/correctness_reward_func": 1.1666667014360428, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1160 }, { "completion_length": 147.75000381469727, "epoch": 0.6214371738257728, "grad_norm": 1.609375, "kl": 0.023566798539832234, "learning_rate": 1.8827112359150277e-06, "loss": 0.0009, "reward": 3.0885416865348816, "reward_std": 0.498736172914505, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 1161 }, { "completion_length": 176.08333778381348, "epoch": 0.6219724340960792, "grad_norm": 1.359375, "kl": 0.02330430643633008, "learning_rate": 1.878184774082467e-06, "loss": 0.0009, "reward": 2.895833373069763, "reward_std": 0.6062580496072769, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1162 }, { "completion_length": 143.0833396911621, "epoch": 0.6225076943663856, "grad_norm": 1.078125, "kl": 0.04478012444451451, "learning_rate": 1.8736604840784884e-06, "loss": 0.0018, "reward": 3.395833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1163 }, { "completion_length": 131.45833587646484, "epoch": 0.6230429546366921, "grad_norm": 1.0546875, "kl": 0.03378752525895834, "learning_rate": 1.8691383817051833e-06, "loss": 0.0014, "reward": 2.458333343267441, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1164 }, { "completion_length": 167.25000381469727, "epoch": 0.6235782149069985, "grad_norm": 0.62890625, "kl": 0.017930781934410334, "learning_rate": 1.8646184827570074e-06, "loss": 0.0007, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1165 }, { "completion_length": 162.7500057220459, "epoch": 0.624113475177305, "grad_norm": 0.859375, "kl": 0.02851248439401388, "learning_rate": 1.8601008030207157e-06, "loss": 0.0011, "reward": 2.5625, "reward_std": 0.22008520364761353, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1166 }, { "completion_length": 180.1666717529297, "epoch": 0.6246487354476113, "grad_norm": 1.3203125, "kl": 0.03482568962499499, "learning_rate": 1.8555853582753136e-06, "loss": 0.0014, "reward": 2.895833373069763, "reward_std": 0.5618228912353516, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1167 }, { "completion_length": 119.83333778381348, "epoch": 0.6251839957179178, "grad_norm": 0.6640625, "kl": 0.03804555209353566, "learning_rate": 1.8510721642920015e-06, "loss": 0.0015, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1168 }, { "completion_length": 163.58334159851074, "epoch": 0.6257192559882243, "grad_norm": 1.4296875, "kl": 0.02223725477233529, "learning_rate": 1.8465612368341157e-06, "loss": 0.0009, "reward": 2.6666667461395264, "reward_std": 0.5163978338241577, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1169 }, { "completion_length": 158.29166793823242, "epoch": 0.6262545162585307, "grad_norm": 1.65625, "kl": 0.0354935098439455, "learning_rate": 1.8420525916570811e-06, "loss": 0.0014, "reward": 2.5625000298023224, "reward_std": 0.7091782838106155, "rewards/correctness_reward_func": 1.0833333507180214, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1170 }, { "completion_length": 195.50000762939453, "epoch": 0.6267897765288372, "grad_norm": 1.0390625, "kl": 0.027738153701648116, "learning_rate": 1.8375462445083464e-06, "loss": 0.0011, "reward": 2.8125, "reward_std": 0.2621144950389862, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1171 }, { "completion_length": 216.83334350585938, "epoch": 0.6273250367991435, "grad_norm": 1.5546875, "kl": 0.031087984796613455, "learning_rate": 1.8330422111273349e-06, "loss": 0.0012, "reward": 2.3333334028720856, "reward_std": 0.7801860384643078, "rewards/correctness_reward_func": 1.0000000074505806, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1172 }, { "completion_length": 181.62500381469727, "epoch": 0.62786029706945, "grad_norm": 1.6484375, "kl": 0.02746220026165247, "learning_rate": 1.828540507245391e-06, "loss": 0.0011, "reward": 2.5833334624767303, "reward_std": 0.7781641036272049, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1173 }, { "completion_length": 171.08333587646484, "epoch": 0.6283955573397565, "grad_norm": 1.3515625, "kl": 0.04072369076311588, "learning_rate": 1.8240411485857201e-06, "loss": 0.0016, "reward": 2.958333373069763, "reward_std": 0.5736270248889923, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1174 }, { "completion_length": 145.08333587646484, "epoch": 0.6289308176100629, "grad_norm": 1.25, "kl": 0.044766807463020086, "learning_rate": 1.8195441508633368e-06, "loss": 0.0018, "reward": 3.0625000298023224, "reward_std": 0.37377963587641716, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1175 }, { "completion_length": 165.70833587646484, "epoch": 0.6294660778803693, "grad_norm": 1.53125, "kl": 0.029149475507438183, "learning_rate": 1.8150495297850121e-06, "loss": 0.0012, "reward": 2.8750000596046448, "reward_std": 0.7602093517780304, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1176 }, { "completion_length": 134.58333778381348, "epoch": 0.6300013381506757, "grad_norm": 1.859375, "kl": 0.035790836438536644, "learning_rate": 1.8105573010492123e-06, "loss": 0.0014, "reward": 3.2916667461395264, "reward_std": 0.5103103779256344, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1177 }, { "completion_length": 179.50000381469727, "epoch": 0.6305365984209822, "grad_norm": 1.4453125, "kl": 0.0371116129681468, "learning_rate": 1.8060674803460514e-06, "loss": 0.0015, "reward": 2.5416666865348816, "reward_std": 0.7251393496990204, "rewards/correctness_reward_func": 1.0833333507180214, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1178 }, { "completion_length": 130.33333778381348, "epoch": 0.6310718586912887, "grad_norm": 2.25, "kl": 0.06264671497046947, "learning_rate": 1.80158008335723e-06, "loss": 0.0025, "reward": 3.2916667461395264, "reward_std": 0.4554154574871063, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666865348816, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1179 }, { "completion_length": 151.91666984558105, "epoch": 0.631607118961595, "grad_norm": 1.5078125, "kl": 0.027635585516691208, "learning_rate": 1.797095125755984e-06, "loss": 0.0011, "reward": 3.0625000596046448, "reward_std": 0.5653917640447617, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1180 }, { "completion_length": 135.50000381469727, "epoch": 0.6321423792319015, "grad_norm": 0.050537109375, "kl": 0.022512939991429448, "learning_rate": 1.7926126232070315e-06, "loss": 0.0009, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1181 }, { "completion_length": 195.8333396911621, "epoch": 0.6326776395022079, "grad_norm": 1.6796875, "kl": 0.0218606092967093, "learning_rate": 1.7881325913665127e-06, "loss": 0.0009, "reward": 2.192708373069763, "reward_std": 0.5448218882083893, "rewards/correctness_reward_func": 0.8333333730697632, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 1182 }, { "completion_length": 159.8333396911621, "epoch": 0.6332128997725144, "grad_norm": 1.2109375, "kl": 0.017797658685594797, "learning_rate": 1.78365504588194e-06, "loss": 0.0007, "reward": 3.333333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1183 }, { "completion_length": 175.8333396911621, "epoch": 0.6337481600428209, "grad_norm": 1.5078125, "kl": 0.03092574281617999, "learning_rate": 1.7791800023921412e-06, "loss": 0.0012, "reward": 2.520833432674408, "reward_std": 0.7812077701091766, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1184 }, { "completion_length": 119.50000381469727, "epoch": 0.6342834203131272, "grad_norm": 1.1171875, "kl": 0.035085609182715416, "learning_rate": 1.7747074765272047e-06, "loss": 0.0014, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1185 }, { "completion_length": 132.5000057220459, "epoch": 0.6348186805834337, "grad_norm": 1.640625, "kl": 0.03759643901139498, "learning_rate": 1.7702374839084275e-06, "loss": 0.0015, "reward": 3.145833373069763, "reward_std": 0.5513499081134796, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1186 }, { "completion_length": 138.00000381469727, "epoch": 0.6353539408537401, "grad_norm": 1.625, "kl": 0.04007141292095184, "learning_rate": 1.7657700401482564e-06, "loss": 0.0016, "reward": 2.833333373069763, "reward_std": 0.6394436359405518, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1187 }, { "completion_length": 115.25000190734863, "epoch": 0.6358892011240466, "grad_norm": 17.375, "kl": 0.3322620280086994, "learning_rate": 1.7613051608502365e-06, "loss": 0.0133, "reward": 3.333333373069763, "reward_std": 0.40824833512306213, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1188 }, { "completion_length": 198.8333396911621, "epoch": 0.636424461394353, "grad_norm": 1.9453125, "kl": 0.028016306459903717, "learning_rate": 1.7568428616089572e-06, "loss": 0.0011, "reward": 1.8958333730697632, "reward_std": 0.9316931664943695, "rewards/correctness_reward_func": 0.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1189 }, { "completion_length": 164.08333587646484, "epoch": 0.6369597216646594, "grad_norm": 1.859375, "kl": 0.042940919287502766, "learning_rate": 1.7523831580099938e-06, "loss": 0.0017, "reward": 3.0208334922790527, "reward_std": 0.7812078148126602, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1190 }, { "completion_length": 139.70833587646484, "epoch": 0.6374949819349659, "grad_norm": 1.3203125, "kl": 0.03346863482147455, "learning_rate": 1.747926065629859e-06, "loss": 0.0013, "reward": 3.3125000596046448, "reward_std": 0.4592793434858322, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1191 }, { "completion_length": 137.20833587646484, "epoch": 0.6380302422052723, "grad_norm": 1.5234375, "kl": 0.0404424169100821, "learning_rate": 1.743471600035943e-06, "loss": 0.0016, "reward": 3.2916666865348816, "reward_std": 0.4132891744375229, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1192 }, { "completion_length": 201.3333396911621, "epoch": 0.6385655024755788, "grad_norm": 1.6953125, "kl": 0.02739003114402294, "learning_rate": 1.7390197767864614e-06, "loss": 0.0011, "reward": 2.1106250286102295, "reward_std": 0.6114244014024734, "rewards/correctness_reward_func": 0.8333333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4231249988079071, "step": 1193 }, { "completion_length": 186.20833587646484, "epoch": 0.6391007627458852, "grad_norm": 1.8515625, "kl": 0.05962704448029399, "learning_rate": 1.7345706114304023e-06, "loss": 0.0024, "reward": 2.9791667461395264, "reward_std": 0.6494128406047821, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1194 }, { "completion_length": 167.62500381469727, "epoch": 0.6396360230161916, "grad_norm": 1.3046875, "kl": 0.024999674409627914, "learning_rate": 1.7301241195074683e-06, "loss": 0.001, "reward": 2.8750000596046448, "reward_std": 0.5809475183486938, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1195 }, { "completion_length": 153.2500057220459, "epoch": 0.6401712832864981, "grad_norm": 1.21875, "kl": 0.029375402722507715, "learning_rate": 1.725680316548028e-06, "loss": 0.0012, "reward": 2.9166666865348816, "reward_std": 0.5320602059364319, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1196 }, { "completion_length": 143.95833778381348, "epoch": 0.6407065435568045, "grad_norm": 1.828125, "kl": 0.03262898838147521, "learning_rate": 1.721239218073054e-06, "loss": 0.0013, "reward": 2.541666716337204, "reward_std": 0.7466593086719513, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1197 }, { "completion_length": 108.62500190734863, "epoch": 0.6412418038271109, "grad_norm": 1.078125, "kl": 0.021220164373517036, "learning_rate": 1.7168008395940738e-06, "loss": 0.0008, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1198 }, { "completion_length": 174.16667556762695, "epoch": 0.6417770640974174, "grad_norm": 1.546875, "kl": 0.03896147897467017, "learning_rate": 1.712365196613119e-06, "loss": 0.0016, "reward": 2.8541667461395264, "reward_std": 0.6673594415187836, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1199 }, { "completion_length": 170.5, "epoch": 0.6423123243677238, "grad_norm": 1.6796875, "kl": 0.037333715707063675, "learning_rate": 1.7079323046226612e-06, "loss": 0.0015, "reward": 2.9375000596046448, "reward_std": 0.4259376786649227, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1200 }, { "completion_length": 143.91666984558105, "epoch": 0.6428475846380303, "grad_norm": 1.6640625, "kl": 0.04931775387376547, "learning_rate": 1.7035021791055662e-06, "loss": 0.002, "reward": 2.9583334028720856, "reward_std": 0.45541542768478394, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1201 }, { "completion_length": 165.5833396911621, "epoch": 0.6433828449083366, "grad_norm": 1.3046875, "kl": 0.03186797956004739, "learning_rate": 1.6990748355350375e-06, "loss": 0.0013, "reward": 3.0416667461395264, "reward_std": 0.503996953368187, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1202 }, { "completion_length": 154.66666793823242, "epoch": 0.6439181051786431, "grad_norm": 1.6640625, "kl": 0.03737725364044309, "learning_rate": 1.6946502893745603e-06, "loss": 0.0015, "reward": 3.0312500596046448, "reward_std": 0.5864861123263836, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1203 }, { "completion_length": 174.5833339691162, "epoch": 0.6444533654489496, "grad_norm": 1.3671875, "kl": 0.04018306778743863, "learning_rate": 1.6902285560778529e-06, "loss": 0.0016, "reward": 3.2291666865348816, "reward_std": 0.49727514386177063, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1204 }, { "completion_length": 190.37500381469727, "epoch": 0.644988625719256, "grad_norm": 1.875, "kl": 0.049945867620408535, "learning_rate": 1.6858096510888048e-06, "loss": 0.002, "reward": 2.1250000447034836, "reward_std": 0.8531132750213146, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1205 }, { "completion_length": 123.08333587646484, "epoch": 0.6455238859895625, "grad_norm": 1.109375, "kl": 0.030124272685498, "learning_rate": 1.6813935898414286e-06, "loss": 0.0012, "reward": 2.958333343267441, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1206 }, { "completion_length": 134.2916717529297, "epoch": 0.6460591462598688, "grad_norm": 1.28125, "kl": 0.028698857873678207, "learning_rate": 1.676980387759806e-06, "loss": 0.0011, "reward": 3.333333373069763, "reward_std": 0.40824830532073975, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1207 }, { "completion_length": 182.00000381469727, "epoch": 0.6465944065301753, "grad_norm": 1.171875, "kl": 0.04143868666142225, "learning_rate": 1.6725700602580292e-06, "loss": 0.0017, "reward": 3.1041667461395264, "reward_std": 0.37377968057990074, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1208 }, { "completion_length": 154.37500381469727, "epoch": 0.6471296668004818, "grad_norm": 0.94921875, "kl": 0.03367770742624998, "learning_rate": 1.6681626227401542e-06, "loss": 0.0013, "reward": 2.9166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1209 }, { "completion_length": 169.62500381469727, "epoch": 0.6476649270707882, "grad_norm": 2.171875, "kl": 0.028921236284077168, "learning_rate": 1.6637580906001405e-06, "loss": 0.0012, "reward": 2.7135417461395264, "reward_std": 0.4438832513988018, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 1210 }, { "completion_length": 128.3750057220459, "epoch": 0.6482001873410946, "grad_norm": 1.46875, "kl": 0.029809471685439348, "learning_rate": 1.6593564792217995e-06, "loss": 0.0012, "reward": 3.3125000596046448, "reward_std": 0.309229951351881, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1211 }, { "completion_length": 142.0000057220459, "epoch": 0.648735447611401, "grad_norm": 1.9765625, "kl": 0.06288173329085112, "learning_rate": 1.6549578039787436e-06, "loss": 0.0025, "reward": 3.270833373069763, "reward_std": 0.33421211317181587, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1212 }, { "completion_length": 235.4166717529297, "epoch": 0.6492707078817075, "grad_norm": 1.8203125, "kl": 0.04126214608550072, "learning_rate": 1.650562080234327e-06, "loss": 0.0017, "reward": 2.119791716337204, "reward_std": 0.7860080450773239, "rewards/correctness_reward_func": 0.7500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000149011612, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 1213 }, { "completion_length": 149.6666717529297, "epoch": 0.649805968152014, "grad_norm": 2.203125, "kl": 0.03802600037306547, "learning_rate": 1.646169323341599e-06, "loss": 0.0015, "reward": 2.8333334028720856, "reward_std": 0.5740398876369, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4583333358168602, "step": 1214 }, { "completion_length": 123.33333587646484, "epoch": 0.6503412284223203, "grad_norm": 1.09375, "kl": 0.03457137290388346, "learning_rate": 1.641779548643243e-06, "loss": 0.0014, "reward": 3.395833373069763, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1215 }, { "completion_length": 149.7916717529297, "epoch": 0.6508764886926268, "grad_norm": 1.796875, "kl": 0.05280859861522913, "learning_rate": 1.6373927714715277e-06, "loss": 0.0021, "reward": 3.2916666865348816, "reward_std": 0.4256826154887676, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1216 }, { "completion_length": 111.58333396911621, "epoch": 0.6514117489629332, "grad_norm": 1.3515625, "kl": 0.026734239421784878, "learning_rate": 1.633009007148253e-06, "loss": 0.0011, "reward": 3.083333373069763, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1217 }, { "completion_length": 143.7916717529297, "epoch": 0.6519470092332397, "grad_norm": 1.828125, "kl": 0.03858533035963774, "learning_rate": 1.6286282709846947e-06, "loss": 0.0015, "reward": 3.2916666865348816, "reward_std": 0.39777331054210663, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1218 }, { "completion_length": 157.75000381469727, "epoch": 0.6524822695035462, "grad_norm": 2.0, "kl": 0.031744038220494986, "learning_rate": 1.624250578281551e-06, "loss": 0.0013, "reward": 2.7708334028720856, "reward_std": 0.8570526540279388, "rewards/correctness_reward_func": 1.333333395421505, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1219 }, { "completion_length": 122.45833396911621, "epoch": 0.6530175297738525, "grad_norm": 1.8828125, "kl": 0.047506920993328094, "learning_rate": 1.6198759443288941e-06, "loss": 0.0019, "reward": 3.1875000596046448, "reward_std": 0.577903788536787, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1220 }, { "completion_length": 149.37500190734863, "epoch": 0.653552790044159, "grad_norm": 1.34375, "kl": 0.0284970928914845, "learning_rate": 1.6155043844061092e-06, "loss": 0.0011, "reward": 3.2916666865348816, "reward_std": 0.39777331054210663, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1221 }, { "completion_length": 156.12500190734863, "epoch": 0.6540880503144654, "grad_norm": 1.6015625, "kl": 0.044686511624604464, "learning_rate": 1.6111359137818458e-06, "loss": 0.0018, "reward": 2.4791666865348816, "reward_std": 0.6236923336982727, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1222 }, { "completion_length": 123.66667175292969, "epoch": 0.6546233105847719, "grad_norm": 1.6171875, "kl": 0.01990818837657571, "learning_rate": 1.6067705477139637e-06, "loss": 0.0008, "reward": 2.7500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1223 }, { "completion_length": 203.3750057220459, "epoch": 0.6551585708550783, "grad_norm": 1.3046875, "kl": 0.02167564001865685, "learning_rate": 1.6024083014494777e-06, "loss": 0.0009, "reward": 2.739583343267441, "reward_std": 0.41847972571849823, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.46875, "step": 1224 }, { "completion_length": 201.0833396911621, "epoch": 0.6556938311253847, "grad_norm": 1.8359375, "kl": 0.06671202601864934, "learning_rate": 1.5980491902245094e-06, "loss": 0.0027, "reward": 2.729166716337204, "reward_std": 0.7667488753795624, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1225 }, { "completion_length": 188.45833587646484, "epoch": 0.6562290913956912, "grad_norm": 1.1953125, "kl": 0.038865368347615004, "learning_rate": 1.593693229264227e-06, "loss": 0.0016, "reward": 2.791666716337204, "reward_std": 0.45111703872680664, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5, "step": 1226 }, { "completion_length": 141.79166793823242, "epoch": 0.6567643516659976, "grad_norm": 3.140625, "kl": 0.08252203557640314, "learning_rate": 1.5893404337827986e-06, "loss": 0.0033, "reward": 2.6041667461395264, "reward_std": 0.9125833064317703, "rewards/correctness_reward_func": 1.166666716337204, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1227 }, { "completion_length": 169.87500381469727, "epoch": 0.657299611936304, "grad_norm": 1.203125, "kl": 0.01972397230565548, "learning_rate": 1.5849908189833341e-06, "loss": 0.0008, "reward": 3.0625, "reward_std": 0.5670122802257538, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1228 }, { "completion_length": 139.87500381469727, "epoch": 0.6578348722066104, "grad_norm": 0.96875, "kl": 0.027716852258890867, "learning_rate": 1.580644400057833e-06, "loss": 0.0011, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1229 }, { "completion_length": 183.6250057220459, "epoch": 0.6583701324769169, "grad_norm": 1.1015625, "kl": 0.030865561682730913, "learning_rate": 1.5763011921871377e-06, "loss": 0.0012, "reward": 3.2916667461395264, "reward_std": 0.4727980047464371, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1230 }, { "completion_length": 150.5833396911621, "epoch": 0.6589053927472234, "grad_norm": 1.5, "kl": 0.0449206349439919, "learning_rate": 1.57196121054087e-06, "loss": 0.0018, "reward": 2.583333373069763, "reward_std": 0.8052270114421844, "rewards/correctness_reward_func": 1.1666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1231 }, { "completion_length": 160.7500057220459, "epoch": 0.6594406530175297, "grad_norm": 1.109375, "kl": 0.023968304740265012, "learning_rate": 1.5676244702773852e-06, "loss": 0.001, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1232 }, { "completion_length": 174.12500381469727, "epoch": 0.6599759132878362, "grad_norm": 1.6796875, "kl": 0.036710976622998714, "learning_rate": 1.563290986543718e-06, "loss": 0.0015, "reward": 2.7135417461395264, "reward_std": 0.5892635434865952, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 1233 }, { "completion_length": 168.25000381469727, "epoch": 0.6605111735581426, "grad_norm": 1.703125, "kl": 0.05276072025299072, "learning_rate": 1.5589607744755269e-06, "loss": 0.0021, "reward": 3.145833373069763, "reward_std": 0.515580803155899, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1234 }, { "completion_length": 117.29167366027832, "epoch": 0.6610464338284491, "grad_norm": 0.1123046875, "kl": 0.04392884857952595, "learning_rate": 1.5546338491970476e-06, "loss": 0.0018, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1235 }, { "completion_length": 132.16666984558105, "epoch": 0.6615816940987556, "grad_norm": 0.8515625, "kl": 0.033093469217419624, "learning_rate": 1.5503102258210324e-06, "loss": 0.0013, "reward": 2.958333343267441, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1236 }, { "completion_length": 166.25000381469727, "epoch": 0.6621169543690619, "grad_norm": 1.0703125, "kl": 0.042634851299226284, "learning_rate": 1.5459899194486988e-06, "loss": 0.0017, "reward": 2.6875, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1237 }, { "completion_length": 176.2916717529297, "epoch": 0.6626522146393684, "grad_norm": 1.78125, "kl": 0.048626034520566463, "learning_rate": 1.5416729451696857e-06, "loss": 0.0019, "reward": 3.020833373069763, "reward_std": 0.7135948985815048, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1238 }, { "completion_length": 153.16666793823242, "epoch": 0.6631874749096748, "grad_norm": 1.34375, "kl": 0.05162359494715929, "learning_rate": 1.5373593180619875e-06, "loss": 0.0021, "reward": 3.1510417461395264, "reward_std": 0.42381148040294647, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 1239 }, { "completion_length": 149.58333587646484, "epoch": 0.6637227351799813, "grad_norm": 1.8046875, "kl": 0.04151236591860652, "learning_rate": 1.5330490531919132e-06, "loss": 0.0017, "reward": 3.3125000596046448, "reward_std": 0.4592793248593807, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1240 }, { "completion_length": 150.50000762939453, "epoch": 0.6642579954502877, "grad_norm": 2.5625, "kl": 0.055647075176239014, "learning_rate": 1.5287421656140233e-06, "loss": 0.0022, "reward": 2.770833432674408, "reward_std": 1.0548006296157837, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1241 }, { "completion_length": 174.7083396911621, "epoch": 0.6647932557205941, "grad_norm": 1.7578125, "kl": 0.03474069572985172, "learning_rate": 1.524438670371085e-06, "loss": 0.0014, "reward": 3.2187500596046448, "reward_std": 0.4192725531756878, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1242 }, { "completion_length": 169.9166717529297, "epoch": 0.6653285159909006, "grad_norm": 1.515625, "kl": 0.04177691554650664, "learning_rate": 1.5201385824940178e-06, "loss": 0.0017, "reward": 3.2291667461395264, "reward_std": 0.4352863281965256, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1243 }, { "completion_length": 165.12500762939453, "epoch": 0.665863776261207, "grad_norm": 1.9140625, "kl": 0.03382923407480121, "learning_rate": 1.515841917001839e-06, "loss": 0.0014, "reward": 2.6250001192092896, "reward_std": 0.8982227295637131, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1244 }, { "completion_length": 123.62500381469727, "epoch": 0.6663990365315134, "grad_norm": 1.984375, "kl": 0.04350885096937418, "learning_rate": 1.511548688901612e-06, "loss": 0.0017, "reward": 2.6875000596046448, "reward_std": 0.7840872555971146, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1245 }, { "completion_length": 153.6250057220459, "epoch": 0.6669342968018199, "grad_norm": 1.2265625, "kl": 0.020385520765557885, "learning_rate": 1.5072589131883959e-06, "loss": 0.0008, "reward": 2.895833373069763, "reward_std": 0.5524365305900574, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1246 }, { "completion_length": 150.91666984558105, "epoch": 0.6674695570721263, "grad_norm": 3.234375, "kl": 0.09925921354442835, "learning_rate": 1.502972604845189e-06, "loss": 0.004, "reward": 2.833333373069763, "reward_std": 0.6605667285621166, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1247 }, { "completion_length": 201.70833587646484, "epoch": 0.6680048173424328, "grad_norm": 1.4765625, "kl": 0.028366721235215664, "learning_rate": 1.4986897788428828e-06, "loss": 0.0011, "reward": 2.6875000596046448, "reward_std": 0.775413990020752, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1248 }, { "completion_length": 121.08333587646484, "epoch": 0.6685400776127391, "grad_norm": 2.140625, "kl": 0.0443681632168591, "learning_rate": 1.4944104501402028e-06, "loss": 0.0018, "reward": 2.3958334028720856, "reward_std": 0.6104944199323654, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1249 }, { "completion_length": 137.29166793823242, "epoch": 0.6690753378830456, "grad_norm": 60.75, "kl": 0.9240593300200999, "learning_rate": 1.4901346336836603e-06, "loss": 0.037, "reward": 3.270833373069763, "reward_std": 0.561341404914856, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1250 }, { "completion_length": 159.79167556762695, "epoch": 0.6696105981533521, "grad_norm": 1.4453125, "kl": 0.02926387684419751, "learning_rate": 1.4858623444075e-06, "loss": 0.0012, "reward": 2.5833334028720856, "reward_std": 0.5914224684238434, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1251 }, { "completion_length": 168.54166984558105, "epoch": 0.6701458584236585, "grad_norm": 1.6171875, "kl": 0.021817692555487156, "learning_rate": 1.4815935972336448e-06, "loss": 0.0009, "reward": 2.458333373069763, "reward_std": 0.4752403795719147, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1252 }, { "completion_length": 135.25000762939453, "epoch": 0.670681118693965, "grad_norm": 2.390625, "kl": 0.026987558696419, "learning_rate": 1.4773284070716504e-06, "loss": 0.0011, "reward": 2.8125000596046448, "reward_std": 0.7872153930366039, "rewards/correctness_reward_func": 1.333333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.02083333395421505, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1253 }, { "completion_length": 106.79166793823242, "epoch": 0.6712163789642713, "grad_norm": 2.046875, "kl": 0.0452488474547863, "learning_rate": 1.473066788818645e-06, "loss": 0.0018, "reward": 3.1979166865348816, "reward_std": 0.4014388881623745, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1254 }, { "completion_length": 176.70833587646484, "epoch": 0.6717516392345778, "grad_norm": 1.015625, "kl": 0.03923133295029402, "learning_rate": 1.4688087573592819e-06, "loss": 0.0016, "reward": 3.0260416865348816, "reward_std": 0.334529023617506, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.484375, "step": 1255 }, { "completion_length": 145.7916717529297, "epoch": 0.6722868995048843, "grad_norm": 0.875, "kl": 0.03602353483438492, "learning_rate": 1.4645543275656881e-06, "loss": 0.0014, "reward": 3.2916666865348816, "reward_std": 0.23273734748363495, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1256 }, { "completion_length": 235.12500381469727, "epoch": 0.6728221597751907, "grad_norm": 1.390625, "kl": 0.037159725558012724, "learning_rate": 1.4603035142974094e-06, "loss": 0.0015, "reward": 2.880208373069763, "reward_std": 0.9242165684700012, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 1257 }, { "completion_length": 128.8333339691162, "epoch": 0.6733574200454971, "grad_norm": 1.953125, "kl": 0.03066550148651004, "learning_rate": 1.4560563324013605e-06, "loss": 0.0012, "reward": 3.145833373069763, "reward_std": 0.5050541460514069, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1258 }, { "completion_length": 182.0416717529297, "epoch": 0.6738926803158035, "grad_norm": 1.203125, "kl": 0.02439832128584385, "learning_rate": 1.4518127967117737e-06, "loss": 0.001, "reward": 2.770833373069763, "reward_std": 0.6864498257637024, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.3958333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1259 }, { "completion_length": 138.5416717529297, "epoch": 0.67442794058611, "grad_norm": 1.8515625, "kl": 0.03161265095695853, "learning_rate": 1.4475729220501439e-06, "loss": 0.0013, "reward": 2.848958432674408, "reward_std": 0.7870994806289673, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 1260 }, { "completion_length": 138.70833587646484, "epoch": 0.6749632008564165, "grad_norm": 1.328125, "kl": 0.03222563769668341, "learning_rate": 1.4433367232251824e-06, "loss": 0.0013, "reward": 3.125, "reward_std": 0.523861289024353, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1261 }, { "completion_length": 180.16666793823242, "epoch": 0.6754984611267228, "grad_norm": 1.7265625, "kl": 0.048063420690596104, "learning_rate": 1.439104215032759e-06, "loss": 0.0019, "reward": 2.791666716337204, "reward_std": 0.3602609746158123, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1262 }, { "completion_length": 164.12500381469727, "epoch": 0.6760337213970293, "grad_norm": 1.390625, "kl": 0.02397937048226595, "learning_rate": 1.4348754122558533e-06, "loss": 0.001, "reward": 1.9375000298023224, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1263 }, { "completion_length": 130.12500190734863, "epoch": 0.6765689816673357, "grad_norm": 1.3203125, "kl": 0.036618311423808336, "learning_rate": 1.4306503296645052e-06, "loss": 0.0015, "reward": 3.1041666865348816, "reward_std": 0.5464507639408112, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1264 }, { "completion_length": 158.08333587646484, "epoch": 0.6771042419376422, "grad_norm": 2.21875, "kl": 0.04947941284626722, "learning_rate": 1.4264289820157579e-06, "loss": 0.002, "reward": 3.020833432674408, "reward_std": 0.7563454322516918, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 1265 }, { "completion_length": 151.7916717529297, "epoch": 0.6776395022079487, "grad_norm": 1.3984375, "kl": 0.026323188096284866, "learning_rate": 1.4222113840536124e-06, "loss": 0.0011, "reward": 3.1666667461395264, "reward_std": 0.5163978338241577, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1266 }, { "completion_length": 147.58333778381348, "epoch": 0.678174762478255, "grad_norm": 0.80078125, "kl": 0.022768684197217226, "learning_rate": 1.4179975505089715e-06, "loss": 0.0009, "reward": 3.375, "reward_std": 0.19364917278289795, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1267 }, { "completion_length": 131.75000190734863, "epoch": 0.6787100227485615, "grad_norm": 1.8046875, "kl": 0.03656624024733901, "learning_rate": 1.4137874960995898e-06, "loss": 0.0015, "reward": 3.020833373069763, "reward_std": 0.5051551908254623, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1268 }, { "completion_length": 153.70833778381348, "epoch": 0.6792452830188679, "grad_norm": 2.265625, "kl": 0.025506778620183468, "learning_rate": 1.4095812355300229e-06, "loss": 0.001, "reward": 2.9791667461395264, "reward_std": 0.7174782454967499, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1269 }, { "completion_length": 123.83333778381348, "epoch": 0.6797805432891744, "grad_norm": 1.9765625, "kl": 0.03775408584624529, "learning_rate": 1.4053787834915753e-06, "loss": 0.0015, "reward": 2.7291667461395264, "reward_std": 0.5133540891110897, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1270 }, { "completion_length": 166.8333396911621, "epoch": 0.6803158035594808, "grad_norm": 1.4921875, "kl": 0.03264944674447179, "learning_rate": 1.4011801546622483e-06, "loss": 0.0013, "reward": 2.7916667461395264, "reward_std": 0.5385859459638596, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1271 }, { "completion_length": 158.5833396911621, "epoch": 0.6808510638297872, "grad_norm": 0.671875, "kl": 0.02247608732432127, "learning_rate": 1.3969853637066939e-06, "loss": 0.0009, "reward": 3.0625, "reward_std": 0.22008520364761353, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1272 }, { "completion_length": 161.29166793823242, "epoch": 0.6813863241000937, "grad_norm": 1.5390625, "kl": 0.031257415656000376, "learning_rate": 1.3927944252761535e-06, "loss": 0.0013, "reward": 3.083333373069763, "reward_std": 0.64549720287323, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1273 }, { "completion_length": 126.25000381469727, "epoch": 0.6819215843704001, "grad_norm": 1.78125, "kl": 0.02825942961499095, "learning_rate": 1.3886073540084184e-06, "loss": 0.0011, "reward": 2.7500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1274 }, { "completion_length": 156.54166793823242, "epoch": 0.6824568446407066, "grad_norm": 1.9296875, "kl": 0.07228852156549692, "learning_rate": 1.3844241645277693e-06, "loss": 0.0029, "reward": 3.020833373069763, "reward_std": 0.6077007204294205, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 1275 }, { "completion_length": 170.8333339691162, "epoch": 0.682992104911013, "grad_norm": 1.90625, "kl": 0.049814184196293354, "learning_rate": 1.3802448714449284e-06, "loss": 0.002, "reward": 2.8645834028720856, "reward_std": 0.5899006687104702, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1276 }, { "completion_length": 140.91666984558105, "epoch": 0.6835273651813194, "grad_norm": 2.03125, "kl": 0.04045526869595051, "learning_rate": 1.3760694893570132e-06, "loss": 0.0016, "reward": 2.916666805744171, "reward_std": 0.8322388045489788, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1277 }, { "completion_length": 154.1666717529297, "epoch": 0.6840626254516259, "grad_norm": 1.0859375, "kl": 0.030904434388503432, "learning_rate": 1.3718980328474768e-06, "loss": 0.0012, "reward": 2.9375000596046448, "reward_std": 0.43299759924411774, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1278 }, { "completion_length": 186.0416717529297, "epoch": 0.6845978857219323, "grad_norm": 1.671875, "kl": 0.034098445903509855, "learning_rate": 1.3677305164860633e-06, "loss": 0.0014, "reward": 2.5104167759418488, "reward_std": 0.8771627657115459, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1279 }, { "completion_length": 172.50000381469727, "epoch": 0.6851331459922387, "grad_norm": 1.4609375, "kl": 0.03586054500192404, "learning_rate": 1.363566954828754e-06, "loss": 0.0014, "reward": 3.083333432674408, "reward_std": 0.8156764209270477, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1280 }, { "completion_length": 175.8333396911621, "epoch": 0.6856684062625452, "grad_norm": 2.640625, "kl": 0.025415783748030663, "learning_rate": 1.3594073624177176e-06, "loss": 0.001, "reward": 2.4791667461395264, "reward_std": 0.6265050880610943, "rewards/correctness_reward_func": 1.083333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1281 }, { "completion_length": 158.2083396911621, "epoch": 0.6862036665328516, "grad_norm": 1.171875, "kl": 0.035512601025402546, "learning_rate": 1.3552517537812614e-06, "loss": 0.0014, "reward": 2.9166666865348816, "reward_std": 0.11949636042118073, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1282 }, { "completion_length": 148.66667366027832, "epoch": 0.6867389268031581, "grad_norm": 1.4140625, "kl": 0.02693007607012987, "learning_rate": 1.3511001434337762e-06, "loss": 0.0011, "reward": 2.708333343267441, "reward_std": 0.4999281316995621, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1283 }, { "completion_length": 156.41666984558105, "epoch": 0.6872741870734644, "grad_norm": 1.5078125, "kl": 0.024214577628299594, "learning_rate": 1.3469525458756873e-06, "loss": 0.001, "reward": 3.1822917461395264, "reward_std": 0.5373664647340775, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 1284 }, { "completion_length": 147.25000762939453, "epoch": 0.6878094473437709, "grad_norm": 1.3203125, "kl": 0.04691416956484318, "learning_rate": 1.342808975593408e-06, "loss": 0.0019, "reward": 3.3125000596046448, "reward_std": 0.30922993645071983, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1285 }, { "completion_length": 201.95834350585938, "epoch": 0.6883447076140774, "grad_norm": 1.0234375, "kl": 0.03229131503030658, "learning_rate": 1.3386694470592815e-06, "loss": 0.0013, "reward": 2.052083343267441, "reward_std": 0.280670702457428, "rewards/correctness_reward_func": 0.5833333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1286 }, { "completion_length": 152.16666984558105, "epoch": 0.6888799678843838, "grad_norm": 1.046875, "kl": 0.0439167320728302, "learning_rate": 1.3345339747315367e-06, "loss": 0.0018, "reward": 2.78125, "reward_std": 0.2788088917732239, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1287 }, { "completion_length": 187.95833587646484, "epoch": 0.6894152281546903, "grad_norm": 1.546875, "kl": 0.01913665747269988, "learning_rate": 1.3304025730542342e-06, "loss": 0.0008, "reward": 2.270833373069763, "reward_std": 0.7110214680433273, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1288 }, { "completion_length": 113.16666793823242, "epoch": 0.6899504884249966, "grad_norm": 1.453125, "kl": 0.03505153767764568, "learning_rate": 1.3262752564572156e-06, "loss": 0.0014, "reward": 2.833333373069763, "reward_std": 0.5163978338241577, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1289 }, { "completion_length": 136.37500381469727, "epoch": 0.6904857486953031, "grad_norm": 1.7421875, "kl": 0.040967449080199, "learning_rate": 1.3221520393560594e-06, "loss": 0.0016, "reward": 3.020833373069763, "reward_std": 0.6062580458819866, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1290 }, { "completion_length": 155.9583396911621, "epoch": 0.6910210089656095, "grad_norm": 22.375, "kl": 0.3683767984621227, "learning_rate": 1.3180329361520195e-06, "loss": 0.0147, "reward": 3.130208373069763, "reward_std": 0.6834863424301147, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4375000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 1291 }, { "completion_length": 130.5000057220459, "epoch": 0.691556269235916, "grad_norm": 1.9609375, "kl": 0.07399945426732302, "learning_rate": 1.313917961231986e-06, "loss": 0.003, "reward": 3.3750000596046448, "reward_std": 0.306186206638813, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1292 }, { "completion_length": 136.9583396911621, "epoch": 0.6920915295062224, "grad_norm": 1.8828125, "kl": 0.051089849323034286, "learning_rate": 1.3098071289684271e-06, "loss": 0.002, "reward": 2.8541667461395264, "reward_std": 0.6297798566520214, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1293 }, { "completion_length": 137.45833587646484, "epoch": 0.6926267897765288, "grad_norm": 1.0859375, "kl": 0.04624842945486307, "learning_rate": 1.3057004537193424e-06, "loss": 0.0019, "reward": 2.958333373069763, "reward_std": 0.10206207260489464, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1294 }, { "completion_length": 140.70833587646484, "epoch": 0.6931620500468353, "grad_norm": 1.5625, "kl": 0.047750290017575026, "learning_rate": 1.3015979498282138e-06, "loss": 0.0019, "reward": 3.1250000596046448, "reward_std": 0.35120461508631706, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1295 }, { "completion_length": 142.58333778381348, "epoch": 0.6936973103171417, "grad_norm": 1.21875, "kl": 0.045999363996088505, "learning_rate": 1.297499631623952e-06, "loss": 0.0018, "reward": 3.270833373069763, "reward_std": 0.37377963587641716, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1296 }, { "completion_length": 173.8333396911621, "epoch": 0.6942325705874481, "grad_norm": 1.765625, "kl": 0.026084277778863907, "learning_rate": 1.2934055134208487e-06, "loss": 0.001, "reward": 2.7916667461395264, "reward_std": 0.7578418999910355, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1297 }, { "completion_length": 145.79166984558105, "epoch": 0.6947678308577546, "grad_norm": 1.671875, "kl": 0.03775433311238885, "learning_rate": 1.2893156095185261e-06, "loss": 0.0015, "reward": 3.333333373069763, "reward_std": 0.26872557401657104, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1298 }, { "completion_length": 155.33333778381348, "epoch": 0.695303091128061, "grad_norm": 0.8515625, "kl": 0.03690297156572342, "learning_rate": 1.2852299342018864e-06, "loss": 0.0015, "reward": 3.4375000596046448, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1299 }, { "completion_length": 167.95833587646484, "epoch": 0.6958383513983675, "grad_norm": 1.53125, "kl": 0.03322401223704219, "learning_rate": 1.2811485017410657e-06, "loss": 0.0013, "reward": 3.0000000596046448, "reward_std": 0.6986719593405724, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1300 }, { "completion_length": 172.8333396911621, "epoch": 0.6963736116686738, "grad_norm": 1.9140625, "kl": 0.055822163820266724, "learning_rate": 1.277071326391377e-06, "loss": 0.0022, "reward": 2.3958334028720856, "reward_std": 0.6410112343728542, "rewards/correctness_reward_func": 1.0000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1301 }, { "completion_length": 164.50000381469727, "epoch": 0.6969088719389803, "grad_norm": 2.1875, "kl": 0.0494797071442008, "learning_rate": 1.2729984223932655e-06, "loss": 0.002, "reward": 2.645833432674408, "reward_std": 0.9421651512384415, "rewards/correctness_reward_func": 1.2500000447034836, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1302 }, { "completion_length": 163.70833587646484, "epoch": 0.6974441322092868, "grad_norm": 1.0, "kl": 0.044104176107794046, "learning_rate": 1.2689298039722598e-06, "loss": 0.0018, "reward": 3.333333373069763, "reward_std": 0.3145497292280197, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1303 }, { "completion_length": 166.1250057220459, "epoch": 0.6979793924795932, "grad_norm": 1.90625, "kl": 0.02964206924661994, "learning_rate": 1.2648654853389163e-06, "loss": 0.0012, "reward": 2.5625000298023224, "reward_std": 0.7055750638246536, "rewards/correctness_reward_func": 1.1666666939854622, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1304 }, { "completion_length": 151.9166717529297, "epoch": 0.6985146527498997, "grad_norm": 1.7578125, "kl": 0.038047782611101866, "learning_rate": 1.2608054806887786e-06, "loss": 0.0015, "reward": 3.0416667461395264, "reward_std": 0.7594528906047344, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1305 }, { "completion_length": 139.12500381469727, "epoch": 0.699049913020206, "grad_norm": 0.59375, "kl": 0.03407254721969366, "learning_rate": 1.2567498042023187e-06, "loss": 0.0014, "reward": 3.395833373069763, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1306 }, { "completion_length": 144.04166793823242, "epoch": 0.6995851732905125, "grad_norm": 2.921875, "kl": 0.060761140659451485, "learning_rate": 1.2526984700448924e-06, "loss": 0.0024, "reward": 2.8854166865348816, "reward_std": 0.28067072853446007, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.4687500074505806, "step": 1307 }, { "completion_length": 164.7916717529297, "epoch": 0.700120433560819, "grad_norm": 1.8828125, "kl": 0.04494229191914201, "learning_rate": 1.2486514923666895e-06, "loss": 0.0018, "reward": 3.208333432674408, "reward_std": 0.5094902031123638, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1308 }, { "completion_length": 151.62500190734863, "epoch": 0.7006556938311254, "grad_norm": 1.59375, "kl": 0.039783548563718796, "learning_rate": 1.2446088853026824e-06, "loss": 0.0016, "reward": 3.083333373069763, "reward_std": 0.6293679773807526, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1309 }, { "completion_length": 137.04166984558105, "epoch": 0.7011909541014318, "grad_norm": 1.671875, "kl": 0.053418907802551985, "learning_rate": 1.2405706629725814e-06, "loss": 0.0021, "reward": 3.3541667461395264, "reward_std": 0.3572172746062279, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1310 }, { "completion_length": 155.6666717529297, "epoch": 0.7017262143717382, "grad_norm": 1.2578125, "kl": 0.04420957248657942, "learning_rate": 1.236536839480779e-06, "loss": 0.0018, "reward": 3.1250000596046448, "reward_std": 0.4554154574871063, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1311 }, { "completion_length": 154.08333778381348, "epoch": 0.7022614746420447, "grad_norm": 1.5546875, "kl": 0.0402086041867733, "learning_rate": 1.2325074289163038e-06, "loss": 0.0016, "reward": 3.0260416865348816, "reward_std": 0.495657354593277, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4635416716337204, "step": 1312 }, { "completion_length": 135.83333778381348, "epoch": 0.7027967349123512, "grad_norm": 1.8671875, "kl": 0.037908658385276794, "learning_rate": 1.2284824453527747e-06, "loss": 0.0015, "reward": 3.2291667461395264, "reward_std": 0.5133541226387024, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1313 }, { "completion_length": 129.8750057220459, "epoch": 0.7033319951826575, "grad_norm": 1.78125, "kl": 0.034716119058430195, "learning_rate": 1.2244619028483445e-06, "loss": 0.0014, "reward": 3.2500000596046448, "reward_std": 0.4623230807483196, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1314 }, { "completion_length": 146.0416717529297, "epoch": 0.703867255452964, "grad_norm": 0.8046875, "kl": 0.026491194032132626, "learning_rate": 1.2204458154456552e-06, "loss": 0.0011, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1315 }, { "completion_length": 110.41666984558105, "epoch": 0.7044025157232704, "grad_norm": 1.7265625, "kl": 0.037520342506468296, "learning_rate": 1.216434197171791e-06, "loss": 0.0015, "reward": 3.3750000596046448, "reward_std": 0.25129128620028496, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1316 }, { "completion_length": 148.79166984558105, "epoch": 0.7049377759935769, "grad_norm": 1.8125, "kl": 0.02962551638484001, "learning_rate": 1.2124270620382242e-06, "loss": 0.0012, "reward": 3.145833373069763, "reward_std": 0.5042977184057236, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1317 }, { "completion_length": 187.37500762939453, "epoch": 0.7054730362638834, "grad_norm": 0.671875, "kl": 0.023369870614260435, "learning_rate": 1.2084244240407692e-06, "loss": 0.0009, "reward": 2.9479166865348816, "reward_std": 0.3116655945777893, "rewards/correctness_reward_func": 1.5833333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 1318 }, { "completion_length": 152.66666984558105, "epoch": 0.7060082965341897, "grad_norm": 1.6484375, "kl": 0.03934650029987097, "learning_rate": 1.2044262971595336e-06, "loss": 0.0016, "reward": 3.3750000596046448, "reward_std": 0.3061862215399742, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1319 }, { "completion_length": 117.91666984558105, "epoch": 0.7065435568044962, "grad_norm": 2.3125, "kl": 0.07858934300020337, "learning_rate": 1.2004326953588672e-06, "loss": 0.0031, "reward": 3.0625000596046448, "reward_std": 0.6130734980106354, "rewards/correctness_reward_func": 1.666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1320 }, { "completion_length": 158.00000190734863, "epoch": 0.7070788170748026, "grad_norm": 1.671875, "kl": 0.016671715071424842, "learning_rate": 1.1964436325873186e-06, "loss": 0.0007, "reward": 3.0625000596046448, "reward_std": 0.6634034812450409, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1321 }, { "completion_length": 161.62500762939453, "epoch": 0.7076140773451091, "grad_norm": 2.40625, "kl": 0.04676167294383049, "learning_rate": 1.1924591227775785e-06, "loss": 0.0019, "reward": 2.1250000596046448, "reward_std": 0.9239676892757416, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1322 }, { "completion_length": 165.9166717529297, "epoch": 0.7081493376154155, "grad_norm": 1.7734375, "kl": 0.06114206160418689, "learning_rate": 1.18847917984644e-06, "loss": 0.0024, "reward": 2.520833373069763, "reward_std": 0.8347173631191254, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.3750000111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1323 }, { "completion_length": 121.00000190734863, "epoch": 0.7086845978857219, "grad_norm": 2.0625, "kl": 0.10707360180094838, "learning_rate": 1.1845038176947413e-06, "loss": 0.0043, "reward": 3.3541667461395264, "reward_std": 0.31970493495464325, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1324 }, { "completion_length": 150.4583396911621, "epoch": 0.7092198581560284, "grad_norm": 1.9140625, "kl": 0.030952117405831814, "learning_rate": 1.1805330502073227e-06, "loss": 0.0012, "reward": 3.3125000596046448, "reward_std": 0.40438438951969147, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1325 }, { "completion_length": 163.62500762939453, "epoch": 0.7097551184263348, "grad_norm": 1.734375, "kl": 0.04357670247554779, "learning_rate": 1.1765668912529774e-06, "loss": 0.0017, "reward": 2.895833432674408, "reward_std": 0.7646453753113747, "rewards/correctness_reward_func": 1.5000000447034836, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1326 }, { "completion_length": 217.95833587646484, "epoch": 0.7102903786966412, "grad_norm": 1.6796875, "kl": 0.036881398409605026, "learning_rate": 1.172605354684401e-06, "loss": 0.0015, "reward": 2.2864584028720856, "reward_std": 0.7588135302066803, "rewards/correctness_reward_func": 1.0000000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000074505806, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 1327 }, { "completion_length": 173.87500381469727, "epoch": 0.7108256389669477, "grad_norm": 1.8828125, "kl": 0.060357251670211554, "learning_rate": 1.1686484543381437e-06, "loss": 0.0024, "reward": 2.7812500596046448, "reward_std": 0.6692114621400833, "rewards/correctness_reward_func": 1.3333333656191826, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1328 }, { "completion_length": 194.2500057220459, "epoch": 0.7113608992372541, "grad_norm": 1.3671875, "kl": 0.05172150093130767, "learning_rate": 1.1646962040345664e-06, "loss": 0.0021, "reward": 2.8125, "reward_std": 0.5303218066692352, "rewards/correctness_reward_func": 1.416666679084301, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1329 }, { "completion_length": 143.8333396911621, "epoch": 0.7118961595075606, "grad_norm": 0.74609375, "kl": 0.045900904573500156, "learning_rate": 1.160748617577784e-06, "loss": 0.0018, "reward": 3.4791666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1330 }, { "completion_length": 154.83333778381348, "epoch": 0.712431419777867, "grad_norm": 2.078125, "kl": 0.03502502292394638, "learning_rate": 1.1568057087556256e-06, "loss": 0.0014, "reward": 2.9791667461395264, "reward_std": 0.8382776975631714, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1331 }, { "completion_length": 137.12500381469727, "epoch": 0.7129666800481734, "grad_norm": 2.453125, "kl": 0.03675575461238623, "learning_rate": 1.1528674913395807e-06, "loss": 0.0015, "reward": 2.520833358168602, "reward_std": 0.5290164612233639, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1332 }, { "completion_length": 166.6666717529297, "epoch": 0.7135019403184799, "grad_norm": 1.5, "kl": 0.03452045936137438, "learning_rate": 1.148933979084752e-06, "loss": 0.0014, "reward": 2.4375000596046448, "reward_std": 0.4259376786649227, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1333 }, { "completion_length": 131.58333778381348, "epoch": 0.7140372005887863, "grad_norm": 0.96875, "kl": 0.022100039292126894, "learning_rate": 1.1450051857298118e-06, "loss": 0.0009, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1334 }, { "completion_length": 175.00000762939453, "epoch": 0.7145724608590928, "grad_norm": 1.75, "kl": 0.03826391091570258, "learning_rate": 1.1410811249969475e-06, "loss": 0.0015, "reward": 2.9166667461395264, "reward_std": 0.8184719085693359, "rewards/correctness_reward_func": 1.5000000298023224, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1335 }, { "completion_length": 176.66666793823242, "epoch": 0.7151077211293991, "grad_norm": 2.21875, "kl": 0.049864266999065876, "learning_rate": 1.1371618105918177e-06, "loss": 0.002, "reward": 2.583333432674408, "reward_std": 0.6198784969747066, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1336 }, { "completion_length": 160.50000762939453, "epoch": 0.7156429813997056, "grad_norm": 1.9453125, "kl": 0.02226860891096294, "learning_rate": 1.1332472562035038e-06, "loss": 0.0009, "reward": 3.145833432674408, "reward_std": 0.6300618499517441, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 1337 }, { "completion_length": 158.87500762939453, "epoch": 0.7161782416700121, "grad_norm": 1.4375, "kl": 0.06685441732406616, "learning_rate": 1.1293374755044602e-06, "loss": 0.0027, "reward": 2.9375000596046448, "reward_std": 0.5262714438140392, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1338 }, { "completion_length": 147.00000762939453, "epoch": 0.7167135019403185, "grad_norm": 1.671875, "kl": 0.03872367646545172, "learning_rate": 1.1254324821504717e-06, "loss": 0.0015, "reward": 3.0312501192092896, "reward_std": 0.7034612894058228, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 1339 }, { "completion_length": 129.66667366027832, "epoch": 0.717248762210625, "grad_norm": 1.5234375, "kl": 0.03320115152746439, "learning_rate": 1.1215322897805984e-06, "loss": 0.0013, "reward": 2.8750000596046448, "reward_std": 0.4909362643957138, "rewards/correctness_reward_func": 1.4166666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1340 }, { "completion_length": 188.20833587646484, "epoch": 0.7177840224809313, "grad_norm": 1.8984375, "kl": 0.03837414178997278, "learning_rate": 1.117636912017133e-06, "loss": 0.0015, "reward": 2.7916667461395264, "reward_std": 0.9020005911588669, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1341 }, { "completion_length": 123.20833396911621, "epoch": 0.7183192827512378, "grad_norm": 1.0859375, "kl": 0.02904220810160041, "learning_rate": 1.1137463624655537e-06, "loss": 0.0012, "reward": 2.7916666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1342 }, { "completion_length": 163.12500381469727, "epoch": 0.7188545430215443, "grad_norm": 1.078125, "kl": 0.029067810624837875, "learning_rate": 1.1098606547144727e-06, "loss": 0.0012, "reward": 3.2135416865348816, "reward_std": 0.3886178582906723, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 1343 }, { "completion_length": 165.5833396911621, "epoch": 0.7193898032918506, "grad_norm": 1.0078125, "kl": 0.022788936970755458, "learning_rate": 1.105979802335594e-06, "loss": 0.0009, "reward": 2.708333373069763, "reward_std": 0.4289814233779907, "rewards/correctness_reward_func": 1.2500000074505806, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1344 }, { "completion_length": 141.4583396911621, "epoch": 0.7199250635621571, "grad_norm": 2.09375, "kl": 0.07417263370007277, "learning_rate": 1.1021038188836602e-06, "loss": 0.003, "reward": 2.9791667461395264, "reward_std": 0.6616143435239792, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 1345 }, { "completion_length": 158.79166984558105, "epoch": 0.7204603238324635, "grad_norm": 1.65625, "kl": 0.02835595328360796, "learning_rate": 1.09823271789641e-06, "loss": 0.0011, "reward": 2.958333373069763, "reward_std": 0.4752403795719147, "rewards/correctness_reward_func": 1.5000000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1346 }, { "completion_length": 155.25000381469727, "epoch": 0.72099558410277, "grad_norm": 1.015625, "kl": 0.04103809129446745, "learning_rate": 1.0943665128945277e-06, "loss": 0.0016, "reward": 3.0000000596046448, "reward_std": 0.6123724579811096, "rewards/correctness_reward_func": 1.5833333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1347 }, { "completion_length": 153.33333587646484, "epoch": 0.7215308443730765, "grad_norm": 1.515625, "kl": 0.05304192844778299, "learning_rate": 1.0905052173815974e-06, "loss": 0.0021, "reward": 3.0416667461395264, "reward_std": 0.5268727838993073, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1348 }, { "completion_length": 148.58333587646484, "epoch": 0.7220661046433828, "grad_norm": 1.546875, "kl": 0.05763786751776934, "learning_rate": 1.086648844844058e-06, "loss": 0.0023, "reward": 3.2291667461395264, "reward_std": 0.34831811115145683, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1349 }, { "completion_length": 144.29167366027832, "epoch": 0.7226013649136893, "grad_norm": 1.8671875, "kl": 0.026925162645056844, "learning_rate": 1.082797408751151e-06, "loss": 0.0011, "reward": 2.7916667461395264, "reward_std": 0.3602609895169735, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1350 }, { "completion_length": 137.0416717529297, "epoch": 0.7231366251839957, "grad_norm": 0.04931640625, "kl": 0.015334914904087782, "learning_rate": 1.0789509225548767e-06, "loss": 0.0006, "reward": 3.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1351 }, { "completion_length": 174.50000381469727, "epoch": 0.7236718854543022, "grad_norm": 1.3125, "kl": 0.024215523153543472, "learning_rate": 1.0751093996899486e-06, "loss": 0.001, "reward": 2.8750000298023224, "reward_std": 0.2686738669872284, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1352 }, { "completion_length": 125.04166984558105, "epoch": 0.7242071457246085, "grad_norm": 1.84375, "kl": 0.044260346330702305, "learning_rate": 1.0712728535737432e-06, "loss": 0.0018, "reward": 3.0625000596046448, "reward_std": 0.7091782838106155, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1353 }, { "completion_length": 155.08333587646484, "epoch": 0.724742405994915, "grad_norm": 1.3125, "kl": 0.029146920423954725, "learning_rate": 1.0674412976062538e-06, "loss": 0.0012, "reward": 3.083333373069763, "reward_std": 0.46232306957244873, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1354 }, { "completion_length": 143.54166793823242, "epoch": 0.7252776662652215, "grad_norm": 2.21875, "kl": 0.034341649152338505, "learning_rate": 1.063614745170047e-06, "loss": 0.0014, "reward": 2.916666716337204, "reward_std": 0.6123724281787872, "rewards/correctness_reward_func": 1.4166667088866234, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1355 }, { "completion_length": 143.12500762939453, "epoch": 0.7258129265355279, "grad_norm": 0.92578125, "kl": 0.04088182095438242, "learning_rate": 1.0597932096302114e-06, "loss": 0.0016, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1356 }, { "completion_length": 223.50000762939453, "epoch": 0.7263481868058344, "grad_norm": 1.7734375, "kl": 0.030694124288856983, "learning_rate": 1.0559767043343132e-06, "loss": 0.0012, "reward": 2.437500089406967, "reward_std": 0.9109830409288406, "rewards/correctness_reward_func": 1.0833333656191826, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4583333358168602, "step": 1357 }, { "completion_length": 158.5833396911621, "epoch": 0.7268834470761407, "grad_norm": 2.0, "kl": 0.045903034042567015, "learning_rate": 1.0521652426123504e-06, "loss": 0.0018, "reward": 2.854166805744171, "reward_std": 0.7812078148126602, "rewards/correctness_reward_func": 1.4166667014360428, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1358 }, { "completion_length": 147.08333587646484, "epoch": 0.7274187073464472, "grad_norm": 24.875, "kl": 0.4180497103370726, "learning_rate": 1.0483588377767028e-06, "loss": 0.0167, "reward": 2.8802084028720856, "reward_std": 0.67439004778862, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 1359 }, { "completion_length": 166.87500762939453, "epoch": 0.7279539676167537, "grad_norm": 1.65625, "kl": 0.0429176758043468, "learning_rate": 1.044557503122092e-06, "loss": 0.0017, "reward": 2.958333373069763, "reward_std": 0.6023809425532818, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1360 }, { "completion_length": 128.5416717529297, "epoch": 0.72848922788706, "grad_norm": 1.390625, "kl": 0.04165979754179716, "learning_rate": 1.0407612519255262e-06, "loss": 0.0017, "reward": 2.375, "reward_std": 0.5132361948490143, "rewards/correctness_reward_func": 0.916666679084301, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1361 }, { "completion_length": 171.9166717529297, "epoch": 0.7290244881573665, "grad_norm": 1.65625, "kl": 0.04312491184100509, "learning_rate": 1.0369700974462627e-06, "loss": 0.0017, "reward": 2.6666667461395264, "reward_std": 0.40296074748039246, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1362 }, { "completion_length": 184.04166984558105, "epoch": 0.7295597484276729, "grad_norm": 1.4921875, "kl": 0.029584042262285948, "learning_rate": 1.0331840529257544e-06, "loss": 0.0012, "reward": 2.9375000596046448, "reward_std": 0.6062580458819866, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1363 }, { "completion_length": 140.25000190734863, "epoch": 0.7300950086979794, "grad_norm": 2.28125, "kl": 0.04231497598811984, "learning_rate": 1.0294031315876072e-06, "loss": 0.0017, "reward": 3.208333432674408, "reward_std": 0.5094902068376541, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1364 }, { "completion_length": 153.08333778381348, "epoch": 0.7306302689682859, "grad_norm": 1.890625, "kl": 0.044678494334220886, "learning_rate": 1.0256273466375353e-06, "loss": 0.0018, "reward": 2.645833373069763, "reward_std": 0.7632530629634857, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1365 }, { "completion_length": 159.29166984558105, "epoch": 0.7311655292385922, "grad_norm": 1.71875, "kl": 0.03514173999428749, "learning_rate": 1.021856711263309e-06, "loss": 0.0014, "reward": 2.708333373069763, "reward_std": 0.6272146701812744, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1366 }, { "completion_length": 217.1666717529297, "epoch": 0.7317007895088987, "grad_norm": 0.83984375, "kl": 0.02654549153521657, "learning_rate": 1.0180912386347144e-06, "loss": 0.0011, "reward": 3.0885417461395264, "reward_std": 0.3606287091970444, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.484375, "step": 1367 }, { "completion_length": 152.83333778381348, "epoch": 0.7322360497792051, "grad_norm": 6.0625, "kl": 0.20837300829589367, "learning_rate": 1.014330941903508e-06, "loss": 0.0083, "reward": 3.0000001192092896, "reward_std": 0.7425693273544312, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1368 }, { "completion_length": 170.00000381469727, "epoch": 0.7327713100495116, "grad_norm": 1.3671875, "kl": 0.04278033087030053, "learning_rate": 1.0105758342033636e-06, "loss": 0.0017, "reward": 2.4166667461395264, "reward_std": 0.16661179810762405, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1369 }, { "completion_length": 118.66667175292969, "epoch": 0.733306570319818, "grad_norm": 26.0, "kl": 0.13473708741366863, "learning_rate": 1.0068259286498363e-06, "loss": 0.0054, "reward": 3.2916667461395264, "reward_std": 0.47279801592230797, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1370 }, { "completion_length": 120.79166984558105, "epoch": 0.7338418305901244, "grad_norm": 1.171875, "kl": 0.020003549987450242, "learning_rate": 1.0030812383403074e-06, "loss": 0.0008, "reward": 3.395833373069763, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1371 }, { "completion_length": 173.6666717529297, "epoch": 0.7343770908604309, "grad_norm": 1.5859375, "kl": 0.031346763018518686, "learning_rate": 9.993417763539438e-07, "loss": 0.0013, "reward": 2.6250000298023224, "reward_std": 0.6747233681380749, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000111758709, "rewards/xmlcount_reward_func": 0.5, "step": 1372 }, { "completion_length": 196.75000762939453, "epoch": 0.7349123511307373, "grad_norm": 1.7734375, "kl": 0.042542679235339165, "learning_rate": 9.956075557516535e-07, "loss": 0.0017, "reward": 2.583333432674408, "reward_std": 0.80763865634799, "rewards/correctness_reward_func": 1.2500000223517418, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 1373 }, { "completion_length": 139.20833778381348, "epoch": 0.7354476114010438, "grad_norm": 2.109375, "kl": 0.041438264306634665, "learning_rate": 9.918785895760348e-07, "loss": 0.0017, "reward": 2.770833432674408, "reward_std": 1.0852452516555786, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1374 }, { "completion_length": 145.62500381469727, "epoch": 0.7359828716713502, "grad_norm": 2.15625, "kl": 0.037300676107406616, "learning_rate": 9.881548908513358e-07, "loss": 0.0015, "reward": 3.1875000596046448, "reward_std": 0.6376042664051056, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1375 }, { "completion_length": 194.37500762939453, "epoch": 0.7365181319416566, "grad_norm": 1.8125, "kl": 0.042442481964826584, "learning_rate": 9.844364725834058e-07, "loss": 0.0017, "reward": 2.416666716337204, "reward_std": 0.5596308261156082, "rewards/correctness_reward_func": 1.0000000223517418, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1376 }, { "completion_length": 190.41666984558105, "epoch": 0.7370533922119631, "grad_norm": 1.53125, "kl": 0.02683873614296317, "learning_rate": 9.807233477596504e-07, "loss": 0.0011, "reward": 2.3541667461395264, "reward_std": 0.8361027240753174, "rewards/correctness_reward_func": 1.0000000074505806, "rewards/int_reward_func": 0.35416666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1377 }, { "completion_length": 136.83333587646484, "epoch": 0.7375886524822695, "grad_norm": 1.2109375, "kl": 0.027323594084009528, "learning_rate": 9.77015529348989e-07, "loss": 0.0011, "reward": 3.2500000596046448, "reward_std": 0.46232305467128754, "rewards/correctness_reward_func": 1.7500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1378 }, { "completion_length": 184.875, "epoch": 0.7381239127525759, "grad_norm": 1.2265625, "kl": 0.037914395332336426, "learning_rate": 9.733130303018051e-07, "loss": 0.0015, "reward": 2.380208358168602, "reward_std": 0.32088102027773857, "rewards/correctness_reward_func": 1.0833333358168602, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 1379 }, { "completion_length": 146.0833396911621, "epoch": 0.7386591730228824, "grad_norm": 8.0625, "kl": 0.25140155758708715, "learning_rate": 9.696158635499032e-07, "loss": 0.0101, "reward": 3.1875001192092896, "reward_std": 0.6730582565069199, "rewards/correctness_reward_func": 1.7500000596046448, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 1380 }, { "completion_length": 147.50000190734863, "epoch": 0.7391944332931888, "grad_norm": 1.125, "kl": 0.0396621716208756, "learning_rate": 9.659240420064647e-07, "loss": 0.0016, "reward": 3.395833373069763, "reward_std": 0.25515518710017204, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1381 }, { "completion_length": 135.12500381469727, "epoch": 0.7397296935634953, "grad_norm": 0.98828125, "kl": 0.03235420072451234, "learning_rate": 9.622375785660004e-07, "loss": 0.0013, "reward": 3.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1382 }, { "completion_length": 134.79166984558105, "epoch": 0.7402649538338016, "grad_norm": 1.9140625, "kl": 0.04798926878720522, "learning_rate": 9.585564861043087e-07, "loss": 0.0019, "reward": 3.3125000596046448, "reward_std": 0.459279328584671, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1383 }, { "completion_length": 158.12500762939453, "epoch": 0.7408002141041081, "grad_norm": 1.671875, "kl": 0.03119900869205594, "learning_rate": 9.548807774784264e-07, "loss": 0.0012, "reward": 2.9791667461395264, "reward_std": 0.5674288682639599, "rewards/correctness_reward_func": 1.5000000149011612, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1384 }, { "completion_length": 171.87500381469727, "epoch": 0.7413354743744146, "grad_norm": 2.046875, "kl": 0.04780485853552818, "learning_rate": 9.512104655265869e-07, "loss": 0.0019, "reward": 2.6041667461395264, "reward_std": 1.1054013073444366, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1385 }, { "completion_length": 121.33333778381348, "epoch": 0.741870734644721, "grad_norm": 1.0078125, "kl": 0.029458944220095873, "learning_rate": 9.475455630681745e-07, "loss": 0.0012, "reward": 3.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1386 }, { "completion_length": 133.41666984558105, "epoch": 0.7424059949150275, "grad_norm": 1.3359375, "kl": 0.03565332805737853, "learning_rate": 9.438860829036794e-07, "loss": 0.0014, "reward": 3.395833373069763, "reward_std": 0.20241357013583183, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 1387 }, { "completion_length": 145.0416717529297, "epoch": 0.7429412551853338, "grad_norm": 1.5859375, "kl": 0.04745309241116047, "learning_rate": 9.402320378146551e-07, "loss": 0.0019, "reward": 2.770833373069763, "reward_std": 0.4488043449819088, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1388 }, { "completion_length": 141.7500057220459, "epoch": 0.7434765154556403, "grad_norm": 1.34375, "kl": 0.04251825390383601, "learning_rate": 9.365834405636692e-07, "loss": 0.0017, "reward": 2.9166666865348816, "reward_std": 0.11949636042118073, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1389 }, { "completion_length": 164.12500381469727, "epoch": 0.7440117757259468, "grad_norm": 2.03125, "kl": 0.037888340186327696, "learning_rate": 9.329403038942617e-07, "loss": 0.0015, "reward": 3.0000001192092896, "reward_std": 0.5433852225542068, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1390 }, { "completion_length": 157.25000762939453, "epoch": 0.7445470359962532, "grad_norm": 2.375, "kl": 0.03995004156604409, "learning_rate": 9.293026405309033e-07, "loss": 0.0016, "reward": 1.791666716337204, "reward_std": 0.6524690836668015, "rewards/correctness_reward_func": 0.4166666716337204, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 1391 }, { "completion_length": 194.3333396911621, "epoch": 0.7450822962665596, "grad_norm": 2.109375, "kl": 0.041776688769459724, "learning_rate": 9.256704631789443e-07, "loss": 0.0017, "reward": 2.1041667759418488, "reward_std": 1.0553452968597412, "rewards/correctness_reward_func": 0.8333333507180214, "rewards/int_reward_func": 0.416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1392 }, { "completion_length": 139.20833778381348, "epoch": 0.745617556536866, "grad_norm": 1.953125, "kl": 0.046518485993146896, "learning_rate": 9.220437845245766e-07, "loss": 0.0019, "reward": 3.3750000596046448, "reward_std": 0.306186206638813, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1393 }, { "completion_length": 133.8333396911621, "epoch": 0.7461528168071725, "grad_norm": 2.375, "kl": 0.04975722776725888, "learning_rate": 9.184226172347854e-07, "loss": 0.002, "reward": 2.7916667461395264, "reward_std": 1.1091627776622772, "rewards/correctness_reward_func": 1.416666716337204, "rewards/int_reward_func": 0.416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1394 }, { "completion_length": 148.95833587646484, "epoch": 0.746688077077479, "grad_norm": 0.7578125, "kl": 0.029789446853101254, "learning_rate": 9.148069739573056e-07, "loss": 0.0012, "reward": 3.1666666865348816, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1395 }, { "completion_length": 185.4583396911621, "epoch": 0.7472233373477853, "grad_norm": 0.8828125, "kl": 0.03485368099063635, "learning_rate": 9.111968673205799e-07, "loss": 0.0014, "reward": 3.005208373069763, "reward_std": 0.34312520548701286, "rewards/correctness_reward_func": 1.6666666716337204, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.484375, "step": 1396 }, { "completion_length": 141.4583396911621, "epoch": 0.7477585976180918, "grad_norm": 1.34375, "kl": 0.03351549245417118, "learning_rate": 9.075923099337114e-07, "loss": 0.0013, "reward": 3.333333373069763, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 1.8333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1397 }, { "completion_length": 166.12500381469727, "epoch": 0.7482938578883982, "grad_norm": 1.5703125, "kl": 0.04391408711671829, "learning_rate": 9.039933143864216e-07, "loss": 0.0018, "reward": 3.3541666865348816, "reward_std": 0.22407886758446693, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 1398 }, { "completion_length": 139.04166984558105, "epoch": 0.7488291181587047, "grad_norm": 1.4609375, "kl": 0.03669837862253189, "learning_rate": 9.003998932490079e-07, "loss": 0.0015, "reward": 2.895833373069763, "reward_std": 0.25515519082546234, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 1399 }, { "completion_length": 167.7916717529297, "epoch": 0.7493643784290112, "grad_norm": 1.6328125, "kl": 0.032958225812762976, "learning_rate": 8.968120590722951e-07, "loss": 0.0013, "reward": 2.958333343267441, "reward_std": 0.572748601436615, "rewards/correctness_reward_func": 1.583333358168602, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1400 } ], "logging_steps": 1, "max_steps": 1868, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }