diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7493643784290112, + "eval_steps": 500, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 189.7083396911621, + "epoch": 0.0005352602703064365, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 2.6737967914438503e-08, + "loss": 0.0, + "reward": 1.494916707277298, + "reward_std": 0.7261392325162888, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.20833334140479565, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2032500095665455, + "step": 1 + }, + { + "completion_length": 170.7916717529297, + "epoch": 0.001070520540612873, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5.3475935828877005e-08, + "loss": 0.0, + "reward": 1.3800000250339508, + "reward_std": 1.1691229492425919, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.14583333767950535, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.15083333975053392, + "step": 2 + }, + { + "completion_length": 134.20833587646484, + "epoch": 0.0016057808109193096, + "grad_norm": 4.65625, + "kl": 0.0004565252238535322, + "learning_rate": 8.021390374331552e-08, + "loss": 0.0, + "reward": 1.666666716337204, + "reward_std": 0.9313502460718155, + "rewards/correctness_reward_func": 1.5833334028720856, + "rewards/int_reward_func": 0.02083333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.06250000046566129, + "step": 3 + }, + { + "completion_length": 165.6666717529297, + "epoch": 0.002141041081225746, + "grad_norm": 5.5625, + "kl": 0.0006007923657307401, + "learning_rate": 1.0695187165775401e-07, + "loss": 0.0, + "reward": 0.8437083810567856, + "reward_std": 0.7530571222305298, + "rewards/correctness_reward_func": 0.583333358168602, + "rewards/int_reward_func": 0.0833333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1770416758954525, + "step": 4 + }, + { + "completion_length": 177.95833587646484, + "epoch": 0.0026763013515321826, + "grad_norm": 8.3125, + "kl": 0.0006772031701984815, + "learning_rate": 1.3368983957219251e-07, + "loss": 0.0, + "reward": 1.058833360671997, + "reward_std": 0.9749192595481873, + "rewards/correctness_reward_func": 0.666666679084301, + "rewards/int_reward_func": 0.25000000931322575, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14216666715219617, + "step": 5 + }, + { + "completion_length": 136.00000381469727, + "epoch": 0.003211561621838619, + "grad_norm": 9.0625, + "kl": 0.0006230503495316952, + "learning_rate": 1.6042780748663104e-07, + "loss": 0.0, + "reward": 1.7279167473316193, + "reward_std": 0.7088067084550858, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.2291666753590107, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.14458333980292082, + "step": 6 + }, + { + "completion_length": 136.00000381469727, + "epoch": 0.0037468218921450553, + "grad_norm": 9.5, + "kl": 0.00030847315065329894, + "learning_rate": 1.8716577540106952e-07, + "loss": 0.0, + "reward": 1.7916666865348816, + "reward_std": 1.0116209387779236, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.1041666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.25000000558793545, + "step": 7 + }, + { + "completion_length": 206.1666717529297, + "epoch": 0.004282082162451492, + "grad_norm": 13.125, + "kl": 0.00030440252157859504, + "learning_rate": 2.1390374331550802e-07, + "loss": 0.0, + "reward": 1.0468750447034836, + "reward_std": 0.6654687821865082, + "rewards/correctness_reward_func": 0.8333333730697632, + "rewards/int_reward_func": 0.0833333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1302083395421505, + "step": 8 + }, + { + "completion_length": 155.33333587646484, + "epoch": 0.004817342432757929, + "grad_norm": 6.75, + "kl": 0.00041832496208371595, + "learning_rate": 2.4064171122994655e-07, + "loss": 0.0, + "reward": 1.9687500596046448, + "reward_std": 1.100151926279068, + "rewards/correctness_reward_func": 1.5833334028720856, + "rewards/int_reward_func": 0.18750000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.15625000279396772, + "step": 9 + }, + { + "completion_length": 102.37500286102295, + "epoch": 0.005352602703064365, + "grad_norm": 8.3125, + "kl": 0.0007533838943345472, + "learning_rate": 2.6737967914438503e-07, + "loss": 0.0, + "reward": 1.5795000493526459, + "reward_std": 1.121803194284439, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.14583333767950535, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10033333860337734, + "step": 10 + }, + { + "completion_length": 204.83333587646484, + "epoch": 0.005887862973370801, + "grad_norm": 11.25, + "kl": 0.00043151845966349356, + "learning_rate": 2.9411764705882356e-07, + "loss": 0.0, + "reward": 0.8431666977703571, + "reward_std": 0.6774237751960754, + "rewards/correctness_reward_func": 0.416666679084301, + "rewards/int_reward_func": 0.1250000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.25983333960175514, + "step": 11 + }, + { + "completion_length": 187.79166793823242, + "epoch": 0.006423123243677238, + "grad_norm": 9.375, + "kl": 0.0004325892587075941, + "learning_rate": 3.208556149732621e-07, + "loss": 0.0, + "reward": 1.3333333358168602, + "reward_std": 0.9049101024866104, + "rewards/correctness_reward_func": 1.166666679084301, + "rewards/int_reward_func": 0.06250000186264515, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 12 + }, + { + "completion_length": 132.83333778381348, + "epoch": 0.0069583835139836745, + "grad_norm": 3.46875, + "kl": 0.00037954464642098173, + "learning_rate": 3.4759358288770056e-07, + "loss": 0.0, + "reward": 1.2552083632908762, + "reward_std": 0.6861025653779507, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.08333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.08854166744276881, + "step": 13 + }, + { + "completion_length": 157.37500381469727, + "epoch": 0.007493643784290111, + "grad_norm": 11.1875, + "kl": 0.0005396545675466768, + "learning_rate": 3.7433155080213904e-07, + "loss": 0.0, + "reward": 1.3489583544433117, + "reward_std": 0.839453861117363, + "rewards/correctness_reward_func": 1.0833333730697632, + "rewards/int_reward_func": 0.1041666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.1406250037252903, + "step": 14 + }, + { + "completion_length": 130.66667079925537, + "epoch": 0.008028904054596548, + "grad_norm": 4.71875, + "kl": 0.000455528381280601, + "learning_rate": 4.0106951871657757e-07, + "loss": 0.0, + "reward": 0.9166666772216558, + "reward_std": 0.5038535855710506, + "rewards/correctness_reward_func": 0.7500000298023224, + "rewards/int_reward_func": 0.0416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1250000037252903, + "step": 15 + }, + { + "completion_length": 154.45833587646484, + "epoch": 0.008564164324902984, + "grad_norm": 27.125, + "kl": 0.0005314001718943473, + "learning_rate": 4.2780748663101604e-07, + "loss": 0.0, + "reward": 1.1155416816473007, + "reward_std": 0.8059945106506348, + "rewards/correctness_reward_func": 0.9166667014360428, + "rewards/int_reward_func": 0.0625, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.13637500535696745, + "step": 16 + }, + { + "completion_length": 215.0416774749756, + "epoch": 0.00909942459520942, + "grad_norm": 6.28125, + "kl": 0.0004003984504379332, + "learning_rate": 4.5454545454545457e-07, + "loss": 0.0, + "reward": 1.2968750596046448, + "reward_std": 1.074883759021759, + "rewards/correctness_reward_func": 0.833333358168602, + "rewards/int_reward_func": 0.1666666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2968750037252903, + "step": 17 + }, + { + "completion_length": 166.3333396911621, + "epoch": 0.009634684865515858, + "grad_norm": 6.1875, + "kl": 0.0005555601237574592, + "learning_rate": 4.812834224598931e-07, + "loss": 0.0, + "reward": 1.2804166674613953, + "reward_std": 0.611274242401123, + "rewards/correctness_reward_func": 0.9166666716337204, + "rewards/int_reward_func": 0.06250000186264515, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.28041666746139526, + "step": 18 + }, + { + "completion_length": 174.0833396911621, + "epoch": 0.010169945135822294, + "grad_norm": 10.4375, + "kl": 0.000547043266124092, + "learning_rate": 5.080213903743316e-07, + "loss": 0.0, + "reward": 1.40583336353302, + "reward_std": 1.315255880355835, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.1666666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1558333386783488, + "step": 19 + }, + { + "completion_length": 122.12500286102295, + "epoch": 0.01070520540612873, + "grad_norm": 4.65625, + "kl": 0.0004533547835308127, + "learning_rate": 5.347593582887701e-07, + "loss": 0.0, + "reward": 1.7031250298023224, + "reward_std": 0.8821883350610733, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.0416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.22395833861082792, + "step": 20 + }, + { + "completion_length": 169.2500057220459, + "epoch": 0.011240465676435166, + "grad_norm": 3.34375, + "kl": 0.0004296539118513465, + "learning_rate": 5.614973262032086e-07, + "loss": 0.0, + "reward": 1.5156250298023224, + "reward_std": 1.2371932864189148, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.1875000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2447916716337204, + "step": 21 + }, + { + "completion_length": 138.66666793823242, + "epoch": 0.011775725946741603, + "grad_norm": 7.4375, + "kl": 0.000820733854197897, + "learning_rate": 5.882352941176471e-07, + "loss": 0.0, + "reward": 1.129666696768254, + "reward_std": 0.876496072858572, + "rewards/correctness_reward_func": 1.0000000298023224, + "rewards/int_reward_func": 0.0416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0880000009201467, + "step": 22 + }, + { + "completion_length": 175.9583396911621, + "epoch": 0.01231098621704804, + "grad_norm": 2.484375, + "kl": 0.0003364777185197454, + "learning_rate": 6.149732620320856e-07, + "loss": 0.0, + "reward": 1.3780416967347264, + "reward_std": 0.7526774629950523, + "rewards/correctness_reward_func": 0.9166666716337204, + "rewards/int_reward_func": 0.2083333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25304167438298464, + "step": 23 + }, + { + "completion_length": 144.41667079925537, + "epoch": 0.012846246487354477, + "grad_norm": 22.125, + "kl": 0.0017973248832277022, + "learning_rate": 6.417112299465242e-07, + "loss": 0.0001, + "reward": 0.8507083356380463, + "reward_std": 0.7706352546811104, + "rewards/correctness_reward_func": 0.583333358168602, + "rewards/int_reward_func": 0.12500000186264515, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1423750054091215, + "step": 24 + }, + { + "completion_length": 148.9166717529297, + "epoch": 0.013381506757660913, + "grad_norm": 11.3125, + "kl": 0.000723773060599342, + "learning_rate": 6.684491978609627e-07, + "loss": 0.0, + "reward": 1.4583333730697632, + "reward_std": 1.0291605293750763, + "rewards/correctness_reward_func": 1.2500000447034836, + "rewards/int_reward_func": 0.0833333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1250000037252903, + "step": 25 + }, + { + "completion_length": 136.00000476837158, + "epoch": 0.013916767027967349, + "grad_norm": 4.59375, + "kl": 0.000691680412273854, + "learning_rate": 6.951871657754011e-07, + "loss": 0.0, + "reward": 0.6868333332240582, + "reward_std": 0.8856478333473206, + "rewards/correctness_reward_func": 0.5000000074505806, + "rewards/int_reward_func": 0.0625, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12433334020897746, + "step": 26 + }, + { + "completion_length": 249.1666717529297, + "epoch": 0.014452027298273785, + "grad_norm": 3.53125, + "kl": 0.0003578776722861221, + "learning_rate": 7.219251336898397e-07, + "loss": 0.0, + "reward": 1.854166716337204, + "reward_std": 0.9099880866706371, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.2708333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3333333386108279, + "step": 27 + }, + { + "completion_length": 184.70834350585938, + "epoch": 0.014987287568580221, + "grad_norm": 6.625, + "kl": 0.00038619608676526695, + "learning_rate": 7.486631016042781e-07, + "loss": 0.0, + "reward": 1.7500000596046448, + "reward_std": 0.82208052277565, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.1250000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2916666753590107, + "step": 28 + }, + { + "completion_length": 206.87500381469727, + "epoch": 0.01552254783888666, + "grad_norm": 2.53125, + "kl": 0.00043380017814342864, + "learning_rate": 7.754010695187167e-07, + "loss": 0.0, + "reward": 2.182291716337204, + "reward_std": 0.7352110594511032, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.2500000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.24479166977107525, + "step": 29 + }, + { + "completion_length": 228.37500381469727, + "epoch": 0.016057808109193095, + "grad_norm": 5.625, + "kl": 0.0007612094195792452, + "learning_rate": 8.021390374331551e-07, + "loss": 0.0, + "reward": 1.8489584177732468, + "reward_std": 0.6690051779150963, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.3333333469927311, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3489583469927311, + "step": 30 + }, + { + "completion_length": 204.12500381469727, + "epoch": 0.01659306837949953, + "grad_norm": 10.875, + "kl": 0.0005931528867222369, + "learning_rate": 8.288770053475937e-07, + "loss": 0.0, + "reward": 1.2760416865348816, + "reward_std": 0.7161198072135448, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.06250000186264515, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.13020833488553762, + "step": 31 + }, + { + "completion_length": 148.5833396911621, + "epoch": 0.017128328649805968, + "grad_norm": 12.8125, + "kl": 0.0014113030629232526, + "learning_rate": 8.556149732620321e-07, + "loss": 0.0001, + "reward": 1.6899999976158142, + "reward_std": 0.8904432207345963, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.1875000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25250000692903996, + "step": 32 + }, + { + "completion_length": 147.58333778381348, + "epoch": 0.017663588920112404, + "grad_norm": 13.75, + "kl": 0.0011093771463492885, + "learning_rate": 8.823529411764707e-07, + "loss": 0.0, + "reward": 1.3604583442211151, + "reward_std": 0.998242624104023, + "rewards/correctness_reward_func": 1.0000000298023224, + "rewards/int_reward_func": 0.12500000186264515, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2354583442211151, + "step": 33 + }, + { + "completion_length": 173.66667556762695, + "epoch": 0.01819884919041884, + "grad_norm": 7.875, + "kl": 0.0017010539158945903, + "learning_rate": 9.090909090909091e-07, + "loss": 0.0001, + "reward": 1.4270833730697632, + "reward_std": 0.8556555807590485, + "rewards/correctness_reward_func": 0.916666679084301, + "rewards/int_reward_func": 0.22916666977107525, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2812500037252903, + "step": 34 + }, + { + "completion_length": 175.37500190734863, + "epoch": 0.018734109460725276, + "grad_norm": 4.6875, + "kl": 0.0009809281909838319, + "learning_rate": 9.358288770053477e-07, + "loss": 0.0, + "reward": 1.3022500425577164, + "reward_std": 0.909055307507515, + "rewards/correctness_reward_func": 1.0833333507180214, + "rewards/int_reward_func": 0.0833333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.13558333739638329, + "step": 35 + }, + { + "completion_length": 146.16666984558105, + "epoch": 0.019269369731031716, + "grad_norm": 4.65625, + "kl": 0.0009264845575671643, + "learning_rate": 9.625668449197862e-07, + "loss": 0.0, + "reward": 1.401041716337204, + "reward_std": 0.5991803035140038, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.0625, + "rewards/soft_format_reward_func": 0.02083333395421505, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0677083358168602, + "step": 36 + }, + { + "completion_length": 145.58333778381348, + "epoch": 0.019804630001338152, + "grad_norm": 4.96875, + "kl": 0.0008276553271571174, + "learning_rate": 9.893048128342248e-07, + "loss": 0.0, + "reward": 1.9479166716337204, + "reward_std": 0.41099051013588905, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.2916666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3229166753590107, + "step": 37 + }, + { + "completion_length": 188.16667366027832, + "epoch": 0.020339890271644588, + "grad_norm": 4.6875, + "kl": 0.0018217733450001106, + "learning_rate": 1.0160427807486633e-06, + "loss": 0.0001, + "reward": 1.1983333826065063, + "reward_std": 0.7537417262792587, + "rewards/correctness_reward_func": 0.9166666716337204, + "rewards/int_reward_func": 0.0833333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.19833334162831306, + "step": 38 + }, + { + "completion_length": 146.79166984558105, + "epoch": 0.020875150541951024, + "grad_norm": 3.1875, + "kl": 0.0017429170693503693, + "learning_rate": 1.0427807486631017e-06, + "loss": 0.0001, + "reward": 1.6458333730697632, + "reward_std": 0.448615238070488, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.02083333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2916666828095913, + "step": 39 + }, + { + "completion_length": 131.62500762939453, + "epoch": 0.02141041081225746, + "grad_norm": 14.9375, + "kl": 0.0036563000176101923, + "learning_rate": 1.0695187165775401e-06, + "loss": 0.0001, + "reward": 1.3437500447034836, + "reward_std": 0.9089661091566086, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.0833333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.2395833358168602, + "step": 40 + }, + { + "completion_length": 167.45833778381348, + "epoch": 0.021945671082563897, + "grad_norm": 15.0, + "kl": 0.0035842061261064373, + "learning_rate": 1.0962566844919787e-06, + "loss": 0.0001, + "reward": 1.2604167088866234, + "reward_std": 0.7910265475511551, + "rewards/correctness_reward_func": 1.0833333507180214, + "rewards/int_reward_func": 0.0416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.13541666977107525, + "step": 41 + }, + { + "completion_length": 152.29167366027832, + "epoch": 0.022480931352870333, + "grad_norm": 7.8125, + "kl": 0.0029216272378107533, + "learning_rate": 1.1229946524064172e-06, + "loss": 0.0001, + "reward": 1.052083358168602, + "reward_std": 1.137702077627182, + "rewards/correctness_reward_func": 0.8333333358168602, + "rewards/int_reward_func": 0.12500000186264515, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.09375000279396772, + "step": 42 + }, + { + "completion_length": 212.2083396911621, + "epoch": 0.02301619162317677, + "grad_norm": 3.5625, + "kl": 0.0010128439316758886, + "learning_rate": 1.1497326203208558e-06, + "loss": 0.0, + "reward": 1.6354167461395264, + "reward_std": 0.8953899294137955, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.3125000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3229166679084301, + "step": 43 + }, + { + "completion_length": 185.75000381469727, + "epoch": 0.023551451893483205, + "grad_norm": 3.921875, + "kl": 0.0024199254185077734, + "learning_rate": 1.1764705882352942e-06, + "loss": 0.0001, + "reward": 2.192708373069763, + "reward_std": 1.02582186460495, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.3333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4218750074505806, + "step": 44 + }, + { + "completion_length": 194.12500381469727, + "epoch": 0.02408671216378964, + "grad_norm": 5.28125, + "kl": 0.002183001925004646, + "learning_rate": 1.2032085561497326e-06, + "loss": 0.0001, + "reward": 1.9635416865348816, + "reward_std": 0.8007803931832314, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.2708333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3593750149011612, + "step": 45 + }, + { + "completion_length": 169.20833587646484, + "epoch": 0.02462197243409608, + "grad_norm": 3.34375, + "kl": 0.002503075505956076, + "learning_rate": 1.2299465240641713e-06, + "loss": 0.0001, + "reward": 1.9375000596046448, + "reward_std": 0.9027109891176224, + "rewards/correctness_reward_func": 1.4166667312383652, + "rewards/int_reward_func": 0.20833334140479565, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3125000074505806, + "step": 46 + }, + { + "completion_length": 164.91667366027832, + "epoch": 0.025157232704402517, + "grad_norm": 4.65625, + "kl": 0.003346432466059923, + "learning_rate": 1.2566844919786097e-06, + "loss": 0.0001, + "reward": 1.6684584021568298, + "reward_std": 1.296816736459732, + "rewards/correctness_reward_func": 1.3333333879709244, + "rewards/int_reward_func": 0.14583333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.18929166719317436, + "step": 47 + }, + { + "completion_length": 136.95833587646484, + "epoch": 0.025692492974708953, + "grad_norm": 10.875, + "kl": 0.006269982142839581, + "learning_rate": 1.2834224598930483e-06, + "loss": 0.0003, + "reward": 2.161458432674408, + "reward_std": 0.7979137673974037, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.1875000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.307291679084301, + "step": 48 + }, + { + "completion_length": 153.45833587646484, + "epoch": 0.02622775324501539, + "grad_norm": 4.65625, + "kl": 0.006891902536153793, + "learning_rate": 1.3101604278074868e-06, + "loss": 0.0003, + "reward": 1.7187500596046448, + "reward_std": 1.0350174307823181, + "rewards/correctness_reward_func": 1.1666667088866234, + "rewards/int_reward_func": 0.20833333767950535, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.3020833395421505, + "step": 49 + }, + { + "completion_length": 197.20834350585938, + "epoch": 0.026763013515321826, + "grad_norm": 2.375, + "kl": 0.002022897358983755, + "learning_rate": 1.3368983957219254e-06, + "loss": 0.0001, + "reward": 2.1718750447034836, + "reward_std": 0.3917969614267349, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.33333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4218750074505806, + "step": 50 + }, + { + "completion_length": 212.04166793823242, + "epoch": 0.02729827378562826, + "grad_norm": 1.8671875, + "kl": 0.0011677205184241757, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.0, + "reward": 1.7864583656191826, + "reward_std": 0.6358048617839813, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.1875000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4322916716337204, + "step": 51 + }, + { + "completion_length": 174.5416717529297, + "epoch": 0.027833534055934698, + "grad_norm": 4.03125, + "kl": 0.004340659594163299, + "learning_rate": 1.3903743315508022e-06, + "loss": 0.0002, + "reward": 1.5617916658520699, + "reward_std": 0.4447403661906719, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.1041666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37429168075323105, + "step": 52 + }, + { + "completion_length": 193.70833778381348, + "epoch": 0.028368794326241134, + "grad_norm": 7.3125, + "kl": 0.002266606839839369, + "learning_rate": 1.4171122994652409e-06, + "loss": 0.0001, + "reward": 2.4166667461395264, + "reward_std": 0.43660441040992737, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.27083333767950535, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31250000558793545, + "step": 53 + }, + { + "completion_length": 150.16667366027832, + "epoch": 0.02890405459654757, + "grad_norm": 2.890625, + "kl": 0.010927497263764963, + "learning_rate": 1.4438502673796793e-06, + "loss": 0.0004, + "reward": 2.302083373069763, + "reward_std": 0.7531506419181824, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.2291666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3229166716337204, + "step": 54 + }, + { + "completion_length": 195.45834350585938, + "epoch": 0.029439314866854006, + "grad_norm": 2.265625, + "kl": 0.0035543091071303934, + "learning_rate": 1.4705882352941177e-06, + "loss": 0.0001, + "reward": 2.2359583973884583, + "reward_std": 0.7737347185611725, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.2500000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.3609583452343941, + "step": 55 + }, + { + "completion_length": 164.0416717529297, + "epoch": 0.029974575137160443, + "grad_norm": 2.0625, + "kl": 0.003154328849632293, + "learning_rate": 1.4973262032085562e-06, + "loss": 0.0001, + "reward": 2.208333432674408, + "reward_std": 0.8674917370080948, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.2500000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4375000074505806, + "step": 56 + }, + { + "completion_length": 158.16666793823242, + "epoch": 0.030509835407466882, + "grad_norm": 3.140625, + "kl": 0.003223880339646712, + "learning_rate": 1.5240641711229948e-06, + "loss": 0.0001, + "reward": 1.8229167610406876, + "reward_std": 0.4800337702035904, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.14583333767950535, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2604166753590107, + "step": 57 + }, + { + "completion_length": 200.20833587646484, + "epoch": 0.03104509567777332, + "grad_norm": 1.8125, + "kl": 0.0027400395192671567, + "learning_rate": 1.5508021390374334e-06, + "loss": 0.0001, + "reward": 1.9427084177732468, + "reward_std": 0.566053070127964, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.31250000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4427083432674408, + "step": 58 + }, + { + "completion_length": 160.0416717529297, + "epoch": 0.03158035594807975, + "grad_norm": 6.28125, + "kl": 0.006273803039221093, + "learning_rate": 1.5775401069518716e-06, + "loss": 0.0003, + "reward": 1.8489583730697632, + "reward_std": 0.9726917743682861, + "rewards/correctness_reward_func": 1.1666666939854622, + "rewards/int_reward_func": 0.2500000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4114583432674408, + "step": 59 + }, + { + "completion_length": 224.41667556762695, + "epoch": 0.03211561621838619, + "grad_norm": 11.5625, + "kl": 0.023900436048279516, + "learning_rate": 1.6042780748663103e-06, + "loss": 0.001, + "reward": 1.4687500447034836, + "reward_std": 0.720452331006527, + "rewards/correctness_reward_func": 0.8333333358168602, + "rewards/int_reward_func": 0.2500000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.385416679084301, + "step": 60 + }, + { + "completion_length": 205.50000381469727, + "epoch": 0.03265087648869262, + "grad_norm": 3.28125, + "kl": 0.0030198894964996725, + "learning_rate": 1.631016042780749e-06, + "loss": 0.0001, + "reward": 1.9166667759418488, + "reward_std": 0.9933225437998772, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.20833334140479565, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3750000149011612, + "step": 61 + }, + { + "completion_length": 153.8333396911621, + "epoch": 0.03318613675899906, + "grad_norm": 2.15625, + "kl": 0.004072973737493157, + "learning_rate": 1.6577540106951873e-06, + "loss": 0.0002, + "reward": 2.0937500596046448, + "reward_std": 0.8774870336055756, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.31250001303851604, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4479166716337204, + "step": 62 + }, + { + "completion_length": 174.0833396911621, + "epoch": 0.0337213970293055, + "grad_norm": 2.0625, + "kl": 0.0055269336444325745, + "learning_rate": 1.684491978609626e-06, + "loss": 0.0002, + "reward": 2.3186666667461395, + "reward_std": 0.7296848772093654, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.33333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4020000025629997, + "step": 63 + }, + { + "completion_length": 156.87500762939453, + "epoch": 0.034256657299611935, + "grad_norm": 3.140625, + "kl": 0.0023692850954830647, + "learning_rate": 1.7112299465240642e-06, + "loss": 0.0001, + "reward": 1.7187500894069672, + "reward_std": 0.7189934402704239, + "rewards/correctness_reward_func": 0.9166666716337204, + "rewards/int_reward_func": 0.3333333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4479166716337204, + "step": 64 + }, + { + "completion_length": 176.7916717529297, + "epoch": 0.034791917569918375, + "grad_norm": 10.375, + "kl": 0.04830963246058673, + "learning_rate": 1.7379679144385028e-06, + "loss": 0.0019, + "reward": 2.2812501192092896, + "reward_std": 0.8949191719293594, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.2916666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4687500074505806, + "step": 65 + }, + { + "completion_length": 180.9583396911621, + "epoch": 0.03532717784022481, + "grad_norm": 2.40625, + "kl": 0.003214933123672381, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.0001, + "reward": 2.5000000596046448, + "reward_std": 0.37490667030215263, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.06250000186264515, + "rewards/xmlcount_reward_func": 0.5, + "step": 66 + }, + { + "completion_length": 194.2083396911621, + "epoch": 0.03586243811053125, + "grad_norm": 2.34375, + "kl": 0.0023288802476599813, + "learning_rate": 1.7914438502673799e-06, + "loss": 0.0001, + "reward": 1.5208334028720856, + "reward_std": 0.7188220322132111, + "rewards/correctness_reward_func": 0.7500000298023224, + "rewards/int_reward_func": 0.2916666753590107, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 67 + }, + { + "completion_length": 159.66666984558105, + "epoch": 0.03639769838083768, + "grad_norm": 2.203125, + "kl": 0.0026101931143784896, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.0001, + "reward": 2.645833373069763, + "reward_std": 0.5910372547805309, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 68 + }, + { + "completion_length": 186.91666984558105, + "epoch": 0.03693295865114412, + "grad_norm": 2.34375, + "kl": 0.0025595282058930025, + "learning_rate": 1.8449197860962567e-06, + "loss": 0.0001, + "reward": 2.817708432674408, + "reward_std": 0.3789900913834572, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4843750074505806, + "step": 69 + }, + { + "completion_length": 159.37500381469727, + "epoch": 0.03746821892145055, + "grad_norm": 4.03125, + "kl": 0.014193891576724127, + "learning_rate": 1.8716577540106954e-06, + "loss": 0.0006, + "reward": 2.444666802883148, + "reward_std": 0.8164463341236115, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.3333333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.42383334040641785, + "step": 70 + }, + { + "completion_length": 164.5416717529297, + "epoch": 0.03800347919175699, + "grad_norm": 2.0625, + "kl": 0.006387478410033509, + "learning_rate": 1.898395721925134e-06, + "loss": 0.0003, + "reward": 2.380208432674408, + "reward_std": 0.7041773945093155, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.3958333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.484375, + "step": 71 + }, + { + "completion_length": 176.41666984558105, + "epoch": 0.03853873946206343, + "grad_norm": 2.6875, + "kl": 0.004877177358139306, + "learning_rate": 1.9251336898395724e-06, + "loss": 0.0002, + "reward": 1.932291716337204, + "reward_std": 0.6992241404950619, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.10416666977107525, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4739583432674408, + "step": 72 + }, + { + "completion_length": 179.87500381469727, + "epoch": 0.039073999732369864, + "grad_norm": 2.046875, + "kl": 0.0029720670718234032, + "learning_rate": 1.951871657754011e-06, + "loss": 0.0001, + "reward": 2.2083334624767303, + "reward_std": 0.7113956846296787, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.2083333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 73 + }, + { + "completion_length": 162.66666793823242, + "epoch": 0.039609260002676304, + "grad_norm": 1.421875, + "kl": 0.0038638823752989992, + "learning_rate": 1.9786096256684497e-06, + "loss": 0.0002, + "reward": 2.348958373069763, + "reward_std": 0.6586650460958481, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4739583358168602, + "step": 74 + }, + { + "completion_length": 151.29166793823242, + "epoch": 0.04014452027298274, + "grad_norm": 2.421875, + "kl": 0.005501059582456946, + "learning_rate": 2.0053475935828877e-06, + "loss": 0.0002, + "reward": 2.645833373069763, + "reward_std": 0.5747457854449749, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.35416667722165585, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.06250000186264515, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 75 + }, + { + "completion_length": 220.9583396911621, + "epoch": 0.040679780543289176, + "grad_norm": 1.421875, + "kl": 0.004471051681321114, + "learning_rate": 2.0320855614973265e-06, + "loss": 0.0002, + "reward": 1.9540833532810211, + "reward_std": 0.8409619331359863, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4749166667461395, + "step": 76 + }, + { + "completion_length": 202.75000381469727, + "epoch": 0.04121504081359561, + "grad_norm": 1.875, + "kl": 0.0032015527540352196, + "learning_rate": 2.058823529411765e-06, + "loss": 0.0001, + "reward": 2.229166716337204, + "reward_std": 0.8260998427867889, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.2291666753590107, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 77 + }, + { + "completion_length": 182.58333587646484, + "epoch": 0.04175030108390205, + "grad_norm": 2.3125, + "kl": 0.0037266534636728466, + "learning_rate": 2.0855614973262034e-06, + "loss": 0.0001, + "reward": 1.9532501101493835, + "reward_std": 0.4688983578234911, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.2916666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.474083349108696, + "step": 78 + }, + { + "completion_length": 196.0833396911621, + "epoch": 0.04228556135420848, + "grad_norm": 1.8125, + "kl": 0.0038231085636653006, + "learning_rate": 2.112299465240642e-06, + "loss": 0.0002, + "reward": 2.2470000088214874, + "reward_std": 0.6223583808168769, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.43450000137090683, + "step": 79 + }, + { + "completion_length": 172.58333587646484, + "epoch": 0.04282082162451492, + "grad_norm": 8.0, + "kl": 0.17783336297725327, + "learning_rate": 2.1390374331550802e-06, + "loss": 0.0071, + "reward": 2.3935834169387817, + "reward_std": 0.7652425169944763, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.31250000186264515, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0625, + "rewards/xmlcount_reward_func": 0.43525000661611557, + "step": 80 + }, + { + "completion_length": 182.7083396911621, + "epoch": 0.043356081894821354, + "grad_norm": 2.96875, + "kl": 0.010094487282913178, + "learning_rate": 2.165775401069519e-06, + "loss": 0.0004, + "reward": 2.4062501192092896, + "reward_std": 0.6189638450741768, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.447916679084301, + "step": 81 + }, + { + "completion_length": 200.50000381469727, + "epoch": 0.04389134216512779, + "grad_norm": 1.09375, + "kl": 0.004236105130985379, + "learning_rate": 2.1925133689839575e-06, + "loss": 0.0002, + "reward": 1.9768334031105042, + "reward_std": 0.46516112983226776, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.25000000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4768333435058594, + "step": 82 + }, + { + "completion_length": 247.54167556762695, + "epoch": 0.04442660243543423, + "grad_norm": 1.65625, + "kl": 0.011776420462410897, + "learning_rate": 2.219251336898396e-06, + "loss": 0.0005, + "reward": 1.7291666865348816, + "reward_std": 0.9394900351762772, + "rewards/correctness_reward_func": 1.0000000447034836, + "rewards/int_reward_func": 0.25000000931322575, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 83 + }, + { + "completion_length": 251.37500762939453, + "epoch": 0.044961862705740666, + "grad_norm": 1.5703125, + "kl": 0.005871386732906103, + "learning_rate": 2.2459893048128343e-06, + "loss": 0.0002, + "reward": 1.520833358168602, + "reward_std": 0.937641553580761, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.22916667349636555, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 84 + }, + { + "completion_length": 164.4583396911621, + "epoch": 0.045497122976047105, + "grad_norm": 1.828125, + "kl": 0.006025184877216816, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.0002, + "reward": 2.8541667461395264, + "reward_std": 0.1530931033194065, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.33333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 85 + }, + { + "completion_length": 175.12500381469727, + "epoch": 0.04603238324635354, + "grad_norm": 2.234375, + "kl": 0.012646633782424033, + "learning_rate": 2.2994652406417116e-06, + "loss": 0.0005, + "reward": 2.2291666865348816, + "reward_std": 0.8635273203253746, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.2083333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 86 + }, + { + "completion_length": 203.4166717529297, + "epoch": 0.04656764351665998, + "grad_norm": 2.09375, + "kl": 0.0035205732856411487, + "learning_rate": 2.32620320855615e-06, + "loss": 0.0001, + "reward": 2.2656250596046448, + "reward_std": 0.33494970947504044, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.291666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0625, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 87 + }, + { + "completion_length": 219.25000381469727, + "epoch": 0.04710290378696641, + "grad_norm": 2.09375, + "kl": 0.003432907979004085, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.0001, + "reward": 1.8906250596046448, + "reward_std": 0.9207641184329987, + "rewards/correctness_reward_func": 1.0833333730697632, + "rewards/int_reward_func": 0.2708333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 88 + }, + { + "completion_length": 223.16667556762695, + "epoch": 0.04763816405727285, + "grad_norm": 1.828125, + "kl": 0.010489805426914245, + "learning_rate": 2.379679144385027e-06, + "loss": 0.0004, + "reward": 1.9583333730697632, + "reward_std": 0.7567075043916702, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.2291666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4583333358168602, + "step": 89 + }, + { + "completion_length": 234.12500381469727, + "epoch": 0.04817342432757928, + "grad_norm": 1.828125, + "kl": 0.0042737985495477915, + "learning_rate": 2.4064171122994653e-06, + "loss": 0.0002, + "reward": 1.843416690826416, + "reward_std": 1.0440644323825836, + "rewards/correctness_reward_func": 1.0833333656191826, + "rewards/int_reward_func": 0.2708333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4684166759252548, + "step": 90 + }, + { + "completion_length": 215.37500762939453, + "epoch": 0.04870868459788572, + "grad_norm": 1.8671875, + "kl": 0.0036017470411024988, + "learning_rate": 2.433155080213904e-06, + "loss": 0.0001, + "reward": 2.145833432674408, + "reward_std": 0.8112322501838207, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.31250001303851604, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5, + "step": 91 + }, + { + "completion_length": 165.4166717529297, + "epoch": 0.04924394486819216, + "grad_norm": 2.515625, + "kl": 0.005282851168885827, + "learning_rate": 2.4598930481283426e-06, + "loss": 0.0002, + "reward": 2.489583432674408, + "reward_std": 0.8116736710071564, + "rewards/correctness_reward_func": 1.6666667461395264, + "rewards/int_reward_func": 0.33333334885537624, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 92 + }, + { + "completion_length": 229.0000114440918, + "epoch": 0.049779205138498595, + "grad_norm": 1.828125, + "kl": 0.0037543401995208114, + "learning_rate": 2.486631016042781e-06, + "loss": 0.0002, + "reward": 2.086958348751068, + "reward_std": 1.1980505138635635, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.3333333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.3994583375751972, + "step": 93 + }, + { + "completion_length": 240.16667556762695, + "epoch": 0.050314465408805034, + "grad_norm": 1.7578125, + "kl": 0.003991760429926217, + "learning_rate": 2.5133689839572194e-06, + "loss": 0.0002, + "reward": 1.7708334177732468, + "reward_std": 0.6722075343132019, + "rewards/correctness_reward_func": 0.916666679084301, + "rewards/int_reward_func": 0.3333333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 94 + }, + { + "completion_length": 186.3333396911621, + "epoch": 0.05084972567911147, + "grad_norm": 1.625, + "kl": 0.005014055874198675, + "learning_rate": 2.5401069518716583e-06, + "loss": 0.0002, + "reward": 2.7291666865348816, + "reward_std": 0.5318794921040535, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 95 + }, + { + "completion_length": 182.0416717529297, + "epoch": 0.051384985949417906, + "grad_norm": 1.9296875, + "kl": 0.01205193460918963, + "learning_rate": 2.5668449197860967e-06, + "loss": 0.0005, + "reward": 2.479166805744171, + "reward_std": 0.8848456591367722, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.3541666753590107, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.06250000186264515, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 96 + }, + { + "completion_length": 192.87500762939453, + "epoch": 0.05192024621972434, + "grad_norm": 1.8984375, + "kl": 0.0356319691054523, + "learning_rate": 2.5935828877005347e-06, + "loss": 0.0014, + "reward": 2.3541667461395264, + "reward_std": 0.4258173182606697, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.2291666753590107, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 97 + }, + { + "completion_length": 182.5833396911621, + "epoch": 0.05245550649003078, + "grad_norm": 2.9375, + "kl": 0.057575218263082206, + "learning_rate": 2.6203208556149735e-06, + "loss": 0.0023, + "reward": 2.504125028848648, + "reward_std": 0.500206220895052, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4832916706800461, + "step": 98 + }, + { + "completion_length": 216.29167556762695, + "epoch": 0.05299076676033721, + "grad_norm": 1.5703125, + "kl": 0.019477371592074633, + "learning_rate": 2.647058823529412e-06, + "loss": 0.0008, + "reward": 2.6666666865348816, + "reward_std": 0.39335764572024345, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 99 + }, + { + "completion_length": 229.62500762939453, + "epoch": 0.05352602703064365, + "grad_norm": 1.4375, + "kl": 0.010605788585962728, + "learning_rate": 2.673796791443851e-06, + "loss": 0.0004, + "reward": 1.7427083849906921, + "reward_std": 0.7021718323230743, + "rewards/correctness_reward_func": 0.9166667088866234, + "rewards/int_reward_func": 0.3333333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.47187500447034836, + "step": 100 + }, + { + "completion_length": 229.83334350585938, + "epoch": 0.054061287300950084, + "grad_norm": 2.09375, + "kl": 0.005640399642288685, + "learning_rate": 2.7005347593582892e-06, + "loss": 0.0002, + "reward": 1.8177084177732468, + "reward_std": 0.6098503544926643, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.2916666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.06250000186264515, + "rewards/xmlcount_reward_func": 0.4635416716337204, + "step": 101 + }, + { + "completion_length": 184.2916717529297, + "epoch": 0.05459654757125652, + "grad_norm": 1.7734375, + "kl": 0.01011388812912628, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.0004, + "reward": 2.645833432674408, + "reward_std": 0.4501614086329937, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.2083333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 102 + }, + { + "completion_length": 224.04166793823242, + "epoch": 0.05513180784156296, + "grad_norm": 1.7421875, + "kl": 0.0050743266474455595, + "learning_rate": 2.754010695187166e-06, + "loss": 0.0002, + "reward": 2.270833432674408, + "reward_std": 0.7563454322516918, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.3333333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 103 + }, + { + "completion_length": 146.45833778381348, + "epoch": 0.055667068111869396, + "grad_norm": 2.5, + "kl": 0.010326952033210546, + "learning_rate": 2.7807486631016045e-06, + "loss": 0.0004, + "reward": 2.7812500596046448, + "reward_std": 0.38120611757040024, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.31250001303851604, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.4687500074505806, + "step": 104 + }, + { + "completion_length": 169.58333587646484, + "epoch": 0.056202328382175835, + "grad_norm": 1.2265625, + "kl": 0.009152874117717147, + "learning_rate": 2.807486631016043e-06, + "loss": 0.0004, + "reward": 2.9375000596046448, + "reward_std": 0.11558076366782188, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.3750000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0625, + "rewards/xmlcount_reward_func": 0.5, + "step": 105 + }, + { + "completion_length": 233.33334350585938, + "epoch": 0.05673758865248227, + "grad_norm": 1.34375, + "kl": 0.01396864268463105, + "learning_rate": 2.8342245989304818e-06, + "loss": 0.0006, + "reward": 2.3437500298023224, + "reward_std": 0.6216080188751221, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4375000149011612, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.46875, + "step": 106 + }, + { + "completion_length": 188.00000381469727, + "epoch": 0.05727284892278871, + "grad_norm": 1.6328125, + "kl": 0.0061717041535303, + "learning_rate": 2.8609625668449198e-06, + "loss": 0.0002, + "reward": 2.5625000596046448, + "reward_std": 0.5238290876150131, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.3750000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 107 + }, + { + "completion_length": 231.29167556762695, + "epoch": 0.05780810919309514, + "grad_norm": 1.7890625, + "kl": 0.0035084771225228906, + "learning_rate": 2.8877005347593586e-06, + "loss": 0.0001, + "reward": 1.9696250259876251, + "reward_std": 0.20630286261439323, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.06250000186264515, + "rewards/xmlcount_reward_func": 0.49045833945274353, + "step": 108 + }, + { + "completion_length": 274.83333587646484, + "epoch": 0.05834336946340158, + "grad_norm": 1.765625, + "kl": 0.004193893808405846, + "learning_rate": 2.914438502673797e-06, + "loss": 0.0002, + "reward": 2.569666802883148, + "reward_std": 1.0057230442762375, + "rewards/correctness_reward_func": 1.5833334028720856, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0625, + "rewards/xmlcount_reward_func": 0.46549999713897705, + "step": 109 + }, + { + "completion_length": 167.7083339691162, + "epoch": 0.05887862973370801, + "grad_norm": 2.5625, + "kl": 0.011588132474571466, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.0005, + "reward": 1.5813333690166473, + "reward_std": 0.5862554460763931, + "rewards/correctness_reward_func": 0.9166666716337204, + "rewards/int_reward_func": 0.1666666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49800001084804535, + "step": 110 + }, + { + "completion_length": 201.16667938232422, + "epoch": 0.05941389000401445, + "grad_norm": 2.09375, + "kl": 0.008921175263822079, + "learning_rate": 2.9679144385026743e-06, + "loss": 0.0004, + "reward": 2.0729166865348816, + "reward_std": 0.8873137533664703, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.3333333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1041666679084301, + "rewards/xmlcount_reward_func": 0.4687500074505806, + "step": 111 + }, + { + "completion_length": 154.45833587646484, + "epoch": 0.059949150274320885, + "grad_norm": 15.75, + "kl": 0.042277290020138025, + "learning_rate": 2.9946524064171123e-06, + "loss": 0.0017, + "reward": 2.7500001192092896, + "reward_std": 0.6749640665948391, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.33333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.12500000186264515, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 112 + }, + { + "completion_length": 252.9166717529297, + "epoch": 0.060484410544627325, + "grad_norm": 1.2890625, + "kl": 0.008890356635674834, + "learning_rate": 3.0213903743315507e-06, + "loss": 0.0004, + "reward": 2.3437500596046448, + "reward_std": 0.6434758454561234, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.3750000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4479166716337204, + "step": 113 + }, + { + "completion_length": 218.0416717529297, + "epoch": 0.061019670814933764, + "grad_norm": 1.6328125, + "kl": 0.006781109143048525, + "learning_rate": 3.0481283422459896e-06, + "loss": 0.0003, + "reward": 2.557291716337204, + "reward_std": 0.5454855412244797, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.06250000186264515, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 114 + }, + { + "completion_length": 183.7916717529297, + "epoch": 0.0615549310852402, + "grad_norm": 1.84375, + "kl": 0.008323265705257654, + "learning_rate": 3.074866310160428e-06, + "loss": 0.0003, + "reward": 2.255208432674408, + "reward_std": 0.9933101981878281, + "rewards/correctness_reward_func": 1.2500000521540642, + "rewards/int_reward_func": 0.3333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1875000037252903, + "rewards/xmlcount_reward_func": 0.484375, + "step": 115 + }, + { + "completion_length": 224.0833396911621, + "epoch": 0.06209019135554664, + "grad_norm": 1.3828125, + "kl": 0.022750876378268003, + "learning_rate": 3.101604278074867e-06, + "loss": 0.0009, + "reward": 2.3177084028720856, + "reward_std": 0.6239343695342541, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.37500000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.484375, + "step": 116 + }, + { + "completion_length": 181.8333396911621, + "epoch": 0.06262545162585308, + "grad_norm": 1.8359375, + "kl": 0.00948640692513436, + "learning_rate": 3.128342245989305e-06, + "loss": 0.0004, + "reward": 2.4747501015663147, + "reward_std": 0.8183496445417404, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.2916666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.10416666977107525, + "rewards/xmlcount_reward_func": 0.49558334052562714, + "step": 117 + }, + { + "completion_length": 203.25000762939453, + "epoch": 0.0631607118961595, + "grad_norm": 1.328125, + "kl": 0.011673168744891882, + "learning_rate": 3.1550802139037433e-06, + "loss": 0.0005, + "reward": 2.5625000596046448, + "reward_std": 0.32106195017695427, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.37500000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 118 + }, + { + "completion_length": 259.3333396911621, + "epoch": 0.06369597216646594, + "grad_norm": 1.65625, + "kl": 0.009123387979343534, + "learning_rate": 3.181818181818182e-06, + "loss": 0.0004, + "reward": 2.166666716337204, + "reward_std": 0.8825219944119453, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 119 + }, + { + "completion_length": 189.75000381469727, + "epoch": 0.06423123243677238, + "grad_norm": 1.921875, + "kl": 0.009333281544968486, + "learning_rate": 3.2085561497326205e-06, + "loss": 0.0004, + "reward": 2.1666667461395264, + "reward_std": 0.7824205458164215, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 120 + }, + { + "completion_length": 217.87500762939453, + "epoch": 0.06476649270707882, + "grad_norm": 1.7265625, + "kl": 0.008323910529725254, + "learning_rate": 3.2352941176470594e-06, + "loss": 0.0003, + "reward": 2.3333334028720856, + "reward_std": 0.627492543309927, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 121 + }, + { + "completion_length": 212.75000762939453, + "epoch": 0.06530175297738525, + "grad_norm": 2.1875, + "kl": 0.00647055555600673, + "learning_rate": 3.262032085561498e-06, + "loss": 0.0003, + "reward": 2.708333373069763, + "reward_std": 0.8154087364673615, + "rewards/correctness_reward_func": 1.6666667461395264, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 122 + }, + { + "completion_length": 217.25000953674316, + "epoch": 0.06583701324769169, + "grad_norm": 2.125, + "kl": 0.010265512275509536, + "learning_rate": 3.288770053475936e-06, + "loss": 0.0004, + "reward": 2.333333373069763, + "reward_std": 0.6681104451417923, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.31250000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.10416666977107525, + "rewards/xmlcount_reward_func": 0.5, + "step": 123 + }, + { + "completion_length": 207.9583396911621, + "epoch": 0.06637227351799813, + "grad_norm": 1.671875, + "kl": 0.010341339744627476, + "learning_rate": 3.3155080213903747e-06, + "loss": 0.0004, + "reward": 2.5937500596046448, + "reward_std": 0.3593357726931572, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.39583333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 124 + }, + { + "completion_length": 206.87500381469727, + "epoch": 0.06690753378830457, + "grad_norm": 1.796875, + "kl": 0.008234906010329723, + "learning_rate": 3.342245989304813e-06, + "loss": 0.0003, + "reward": 2.4166667461395264, + "reward_std": 0.7896890789270401, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 125 + }, + { + "completion_length": 202.2083396911621, + "epoch": 0.067442794058611, + "grad_norm": 2.234375, + "kl": 0.011055209208279848, + "learning_rate": 3.368983957219252e-06, + "loss": 0.0004, + "reward": 2.4166667461395264, + "reward_std": 0.5647460781037807, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.3958333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 126 + }, + { + "completion_length": 163.1666717529297, + "epoch": 0.06797805432891743, + "grad_norm": 1.9765625, + "kl": 0.009094940614886582, + "learning_rate": 3.3957219251336904e-06, + "loss": 0.0004, + "reward": 2.3541667461395264, + "reward_std": 0.611662745475769, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1458333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 127 + }, + { + "completion_length": 168.9583396911621, + "epoch": 0.06851331459922387, + "grad_norm": 2.015625, + "kl": 0.011237279628403485, + "learning_rate": 3.4224598930481284e-06, + "loss": 0.0004, + "reward": 2.4375000447034836, + "reward_std": 0.6058737970888615, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.37500000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.14583333767950535, + "rewards/xmlcount_reward_func": 0.5, + "step": 128 + }, + { + "completion_length": 227.3333396911621, + "epoch": 0.06904857486953031, + "grad_norm": 1.71875, + "kl": 0.006947090849280357, + "learning_rate": 3.449197860962567e-06, + "loss": 0.0003, + "reward": 2.250000089406967, + "reward_std": 0.9643253535032272, + "rewards/correctness_reward_func": 1.3333333879709244, + "rewards/int_reward_func": 0.3958333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.02083333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 129 + }, + { + "completion_length": 212.9583396911621, + "epoch": 0.06958383513983675, + "grad_norm": 2.25, + "kl": 0.012483905302360654, + "learning_rate": 3.4759358288770056e-06, + "loss": 0.0005, + "reward": 2.4218750298023224, + "reward_std": 0.7965251952409744, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.3541666753590107, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.484375, + "step": 130 + }, + { + "completion_length": 178.8333396911621, + "epoch": 0.07011909541014318, + "grad_norm": 2.1875, + "kl": 0.013714013854041696, + "learning_rate": 3.5026737967914445e-06, + "loss": 0.0005, + "reward": 2.291666716337204, + "reward_std": 0.9178697988390923, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.2708333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.10416666977107525, + "rewards/xmlcount_reward_func": 0.5, + "step": 131 + }, + { + "completion_length": 182.875, + "epoch": 0.07065435568044962, + "grad_norm": 2.203125, + "kl": 0.007862797006964684, + "learning_rate": 3.529411764705883e-06, + "loss": 0.0003, + "reward": 2.6458334922790527, + "reward_std": 0.8817127346992493, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1875000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 132 + }, + { + "completion_length": 166.62500381469727, + "epoch": 0.07118961595075605, + "grad_norm": 1.46875, + "kl": 0.011107051279395819, + "learning_rate": 3.556149732620321e-06, + "loss": 0.0004, + "reward": 3.031000018119812, + "reward_std": 0.33994986675679684, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1666666679084301, + "rewards/xmlcount_reward_func": 0.4893333315849304, + "step": 133 + }, + { + "completion_length": 205.41667556762695, + "epoch": 0.0717248762210625, + "grad_norm": 1.78125, + "kl": 0.00937680620700121, + "learning_rate": 3.5828877005347597e-06, + "loss": 0.0004, + "reward": 2.395833373069763, + "reward_std": 1.0875979363918304, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.06250000186264515, + "rewards/xmlcount_reward_func": 0.5, + "step": 134 + }, + { + "completion_length": 139.54166984558105, + "epoch": 0.07226013649136893, + "grad_norm": 4.6875, + "kl": 0.08071585092693567, + "learning_rate": 3.609625668449198e-06, + "loss": 0.0032, + "reward": 2.8697917461395264, + "reward_std": 0.5514856986701488, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1250000037252903, + "rewards/xmlcount_reward_func": 0.4739583432674408, + "step": 135 + }, + { + "completion_length": 187.62500381469727, + "epoch": 0.07279539676167536, + "grad_norm": 1.34375, + "kl": 0.011190556921064854, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.0004, + "reward": 2.708333432674408, + "reward_std": 0.31584101915359497, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1250000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 136 + }, + { + "completion_length": 172.83333778381348, + "epoch": 0.0733306570319818, + "grad_norm": 1.765625, + "kl": 0.009673898573964834, + "learning_rate": 3.6631016042780754e-06, + "loss": 0.0004, + "reward": 2.7916666865348816, + "reward_std": 0.4644980877637863, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 137 + }, + { + "completion_length": 178.2083396911621, + "epoch": 0.07386591730228824, + "grad_norm": 2.34375, + "kl": 0.015270714182406664, + "learning_rate": 3.6898395721925134e-06, + "loss": 0.0006, + "reward": 2.7500001192092896, + "reward_std": 0.6503244712948799, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.3750000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1250000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 138 + }, + { + "completion_length": 171.83333587646484, + "epoch": 0.07440117757259468, + "grad_norm": 1.765625, + "kl": 0.013465502765029669, + "learning_rate": 3.716577540106952e-06, + "loss": 0.0005, + "reward": 2.7187500596046448, + "reward_std": 0.6593321561813354, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.08333333395421505, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 139 + }, + { + "completion_length": 187.0416717529297, + "epoch": 0.0749364378429011, + "grad_norm": 1.328125, + "kl": 0.011786214541643858, + "learning_rate": 3.7433155080213907e-06, + "loss": 0.0005, + "reward": 2.5729166865348816, + "reward_std": 0.141096293926239, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 140 + }, + { + "completion_length": 197.58333587646484, + "epoch": 0.07547169811320754, + "grad_norm": 2.09375, + "kl": 0.007584544597193599, + "learning_rate": 3.770053475935829e-06, + "loss": 0.0003, + "reward": 2.2031250596046448, + "reward_std": 0.8265210092067719, + "rewards/correctness_reward_func": 1.1666666939854622, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1041666679084301, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 141 + }, + { + "completion_length": 139.75, + "epoch": 0.07600695838351398, + "grad_norm": 2.546875, + "kl": 0.03859049454331398, + "learning_rate": 3.796791443850268e-06, + "loss": 0.0015, + "reward": 2.9791667461395264, + "reward_std": 0.48826754838228226, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.12500000186264515, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 142 + }, + { + "completion_length": 214.5, + "epoch": 0.07654221865382042, + "grad_norm": 1.4375, + "kl": 0.00880357634741813, + "learning_rate": 3.8235294117647055e-06, + "loss": 0.0004, + "reward": 2.375000089406967, + "reward_std": 0.8150961697101593, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0833333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 143 + }, + { + "completion_length": 151.37500381469727, + "epoch": 0.07707747892412686, + "grad_norm": 2.15625, + "kl": 0.012635418446734548, + "learning_rate": 3.850267379679145e-06, + "loss": 0.0005, + "reward": 2.7708334028720856, + "reward_std": 0.5846511572599411, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2500000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 144 + }, + { + "completion_length": 228.16667366027832, + "epoch": 0.07761273919443329, + "grad_norm": 1.8125, + "kl": 0.00903172290418297, + "learning_rate": 3.877005347593583e-06, + "loss": 0.0004, + "reward": 2.578125089406967, + "reward_std": 0.500914141535759, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.18750000186264515, + "rewards/xmlcount_reward_func": 0.453125, + "step": 145 + }, + { + "completion_length": 129.25000190734863, + "epoch": 0.07814799946473973, + "grad_norm": 2.21875, + "kl": 0.020013232016935945, + "learning_rate": 3.903743315508022e-06, + "loss": 0.0008, + "reward": 2.770833373069763, + "reward_std": 0.6944468766450882, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.3958333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1250000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 146 + }, + { + "completion_length": 173.00000190734863, + "epoch": 0.07868325973504617, + "grad_norm": 1.9140625, + "kl": 0.025609272299334407, + "learning_rate": 3.93048128342246e-06, + "loss": 0.001, + "reward": 2.733708381652832, + "reward_std": 0.5789778083562851, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.10416666977107525, + "rewards/xmlcount_reward_func": 0.46287500113248825, + "step": 147 + }, + { + "completion_length": 146.1250057220459, + "epoch": 0.07921852000535261, + "grad_norm": 1.859375, + "kl": 0.017028656788170338, + "learning_rate": 3.957219251336899e-06, + "loss": 0.0007, + "reward": 2.125000074505806, + "reward_std": 0.491043072193861, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.25000000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.20833334140479565, + "rewards/xmlcount_reward_func": 0.5, + "step": 148 + }, + { + "completion_length": 137.12500381469727, + "epoch": 0.07975378027565903, + "grad_norm": 2.390625, + "kl": 0.023823135998100042, + "learning_rate": 3.983957219251337e-06, + "loss": 0.001, + "reward": 3.083333432674408, + "reward_std": 0.2887342944741249, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.3958333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.18750000558793545, + "rewards/xmlcount_reward_func": 0.5, + "step": 149 + }, + { + "completion_length": 138.20833587646484, + "epoch": 0.08028904054596547, + "grad_norm": 1.90625, + "kl": 0.016617624554783106, + "learning_rate": 4.010695187165775e-06, + "loss": 0.0007, + "reward": 3.1250000596046448, + "reward_std": 0.4123322442173958, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2500000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 150 + }, + { + "completion_length": 196.87500381469727, + "epoch": 0.08082430081627191, + "grad_norm": 1.9609375, + "kl": 0.017903268802911043, + "learning_rate": 4.037433155080215e-06, + "loss": 0.0007, + "reward": 2.4166667461395264, + "reward_std": 0.8529610484838486, + "rewards/correctness_reward_func": 1.2500000521540642, + "rewards/int_reward_func": 0.416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.25000000931322575, + "rewards/xmlcount_reward_func": 0.5, + "step": 151 + }, + { + "completion_length": 203.6666717529297, + "epoch": 0.08135956108657835, + "grad_norm": 1.8515625, + "kl": 0.0095352737698704, + "learning_rate": 4.064171122994653e-06, + "loss": 0.0004, + "reward": 2.8125000596046448, + "reward_std": 0.7167538553476334, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1041666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 152 + }, + { + "completion_length": 156.12500381469727, + "epoch": 0.08189482135688479, + "grad_norm": 1.484375, + "kl": 0.012113512842915952, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.0005, + "reward": 3.1875000596046448, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 153 + }, + { + "completion_length": 140.0416717529297, + "epoch": 0.08243008162719122, + "grad_norm": 2.15625, + "kl": 0.02159164287149906, + "learning_rate": 4.11764705882353e-06, + "loss": 0.0009, + "reward": 3.0000001192092896, + "reward_std": 0.5643851608037949, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2916666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 154 + }, + { + "completion_length": 202.50000762939453, + "epoch": 0.08296534189749766, + "grad_norm": 1.59375, + "kl": 0.01972877373918891, + "learning_rate": 4.144385026737968e-06, + "loss": 0.0008, + "reward": 2.3750001192092896, + "reward_std": 0.6778506711125374, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1875000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 155 + }, + { + "completion_length": 165.33333778381348, + "epoch": 0.0835006021678041, + "grad_norm": 2.28125, + "kl": 0.028622428653761744, + "learning_rate": 4.171122994652407e-06, + "loss": 0.0011, + "reward": 2.282666653394699, + "reward_std": 1.0192717239260674, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.1458333358168602, + "rewards/xmlcount_reward_func": 0.4701666682958603, + "step": 156 + }, + { + "completion_length": 123.58333778381348, + "epoch": 0.08403586243811054, + "grad_norm": 2.53125, + "kl": 0.02671874687075615, + "learning_rate": 4.197860962566845e-06, + "loss": 0.0011, + "reward": 2.9166667461395264, + "reward_std": 0.8969832062721252, + "rewards/correctness_reward_func": 1.6666667461395264, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 157 + }, + { + "completion_length": 153.6666717529297, + "epoch": 0.08457112270841696, + "grad_norm": 2.359375, + "kl": 0.020442907931283116, + "learning_rate": 4.224598930481284e-06, + "loss": 0.0008, + "reward": 2.531250089406967, + "reward_std": 0.547564685344696, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2916666753590107, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 158 + }, + { + "completion_length": 143.62500381469727, + "epoch": 0.0851063829787234, + "grad_norm": 1.921875, + "kl": 0.030001087579876184, + "learning_rate": 4.251336898395722e-06, + "loss": 0.0012, + "reward": 2.8095834255218506, + "reward_std": 0.577202744781971, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333358168602, + "rewards/xmlcount_reward_func": 0.4970833361148834, + "step": 159 + }, + { + "completion_length": 143.125, + "epoch": 0.08564164324902984, + "grad_norm": 1.8046875, + "kl": 0.02288861945271492, + "learning_rate": 4.2780748663101604e-06, + "loss": 0.0009, + "reward": 2.8125, + "reward_std": 0.6792502254247665, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.25000000186264515, + "rewards/xmlcount_reward_func": 0.5, + "step": 160 + }, + { + "completion_length": 175.7083396911621, + "epoch": 0.08617690351933628, + "grad_norm": 2.078125, + "kl": 0.018278248608112335, + "learning_rate": 4.304812834224599e-06, + "loss": 0.0007, + "reward": 2.0726667046546936, + "reward_std": 0.8482229933142662, + "rewards/correctness_reward_func": 0.916666679084301, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2083333358168602, + "rewards/xmlcount_reward_func": 0.4893333315849304, + "step": 161 + }, + { + "completion_length": 148.04166984558105, + "epoch": 0.08671216378964271, + "grad_norm": 2.390625, + "kl": 0.025503937155008316, + "learning_rate": 4.331550802139038e-06, + "loss": 0.001, + "reward": 2.6666667461395264, + "reward_std": 0.6222646199166775, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.33333334140479565, + "rewards/xmlcount_reward_func": 0.5, + "step": 162 + }, + { + "completion_length": 148.5416717529297, + "epoch": 0.08724742405994915, + "grad_norm": 2.65625, + "kl": 0.03295175568200648, + "learning_rate": 4.3582887700534766e-06, + "loss": 0.0013, + "reward": 1.6875000298023224, + "reward_std": 0.28862859681248665, + "rewards/correctness_reward_func": 0.5, + "rewards/int_reward_func": 0.3125000149011612, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 163 + }, + { + "completion_length": 161.5416717529297, + "epoch": 0.08778268433025559, + "grad_norm": 1.140625, + "kl": 0.021444957936182618, + "learning_rate": 4.385026737967915e-06, + "loss": 0.0009, + "reward": 2.6875000596046448, + "reward_std": 0.496665894985199, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2708333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 164 + }, + { + "completion_length": 132.0833396911621, + "epoch": 0.08831794460056203, + "grad_norm": 4.125, + "kl": 0.08498809393495321, + "learning_rate": 4.411764705882353e-06, + "loss": 0.0034, + "reward": 3.161458373069763, + "reward_std": 0.6175251640379429, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.4739583358168602, + "step": 165 + }, + { + "completion_length": 119.50000190734863, + "epoch": 0.08885320487086847, + "grad_norm": 2.46875, + "kl": 0.04442449565976858, + "learning_rate": 4.438502673796792e-06, + "loss": 0.0018, + "reward": 2.7916667461395264, + "reward_std": 0.8464668020606041, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.354166679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 166 + }, + { + "completion_length": 150.41666793823242, + "epoch": 0.08938846514117489, + "grad_norm": 1.96875, + "kl": 0.020863166078925133, + "learning_rate": 4.46524064171123e-06, + "loss": 0.0008, + "reward": 2.958333432674408, + "reward_std": 0.7232691049575806, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 167 + }, + { + "completion_length": 132.20833778381348, + "epoch": 0.08992372541148133, + "grad_norm": 1.4296875, + "kl": 0.024739216547459364, + "learning_rate": 4.491978609625669e-06, + "loss": 0.001, + "reward": 3.333333373069763, + "reward_std": 0.12909945845603943, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 168 + }, + { + "completion_length": 172.33333778381348, + "epoch": 0.09045898568178777, + "grad_norm": 2.375, + "kl": 0.030473611317574978, + "learning_rate": 4.518716577540107e-06, + "loss": 0.0012, + "reward": 2.775750070810318, + "reward_std": 0.27488668262958527, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.33333334140479565, + "rewards/xmlcount_reward_func": 0.4424166679382324, + "step": 169 + }, + { + "completion_length": 114.875, + "epoch": 0.09099424595209421, + "grad_norm": 2.765625, + "kl": 0.032696583308279514, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.0013, + "reward": 3.036458432674408, + "reward_std": 0.6265082620084286, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 170 + }, + { + "completion_length": 186.7916717529297, + "epoch": 0.09152950622240064, + "grad_norm": 2.734375, + "kl": 0.05477259890176356, + "learning_rate": 4.572192513368984e-06, + "loss": 0.0022, + "reward": 2.830708384513855, + "reward_std": 0.8129361271858215, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333395421505, + "rewards/xmlcount_reward_func": 0.4348750039935112, + "step": 171 + }, + { + "completion_length": 132.04167366027832, + "epoch": 0.09206476649270708, + "grad_norm": 2.71875, + "kl": 0.06303630210459232, + "learning_rate": 4.598930481283423e-06, + "loss": 0.0025, + "reward": 3.1875000596046448, + "reward_std": 0.4721617363393307, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 172 + }, + { + "completion_length": 144.54167556762695, + "epoch": 0.09260002676301352, + "grad_norm": 2.28125, + "kl": 0.03534765588119626, + "learning_rate": 4.625668449197862e-06, + "loss": 0.0014, + "reward": 3.020833432674408, + "reward_std": 0.4301304928958416, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.354166679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 173 + }, + { + "completion_length": 110.5000057220459, + "epoch": 0.09313528703331996, + "grad_norm": 1.4765625, + "kl": 0.030329300556331873, + "learning_rate": 4.6524064171123e-06, + "loss": 0.0012, + "reward": 3.270833373069763, + "reward_std": 0.38332105800509453, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 174 + }, + { + "completion_length": 159.04166793823242, + "epoch": 0.0936705473036264, + "grad_norm": 1.34375, + "kl": 0.029189520981162786, + "learning_rate": 4.6791443850267385e-06, + "loss": 0.0012, + "reward": 3.0416666865348816, + "reward_std": 0.4541241377592087, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 175 + }, + { + "completion_length": 153.7083396911621, + "epoch": 0.09420580757393282, + "grad_norm": 2.1875, + "kl": 0.023249680642038584, + "learning_rate": 4.705882352941177e-06, + "loss": 0.0009, + "reward": 2.8125001192092896, + "reward_std": 0.8104839585721493, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250000186264515, + "rewards/xmlcount_reward_func": 0.5, + "step": 176 + }, + { + "completion_length": 196.79167556762695, + "epoch": 0.09474106784423926, + "grad_norm": 2.1875, + "kl": 0.023467288352549076, + "learning_rate": 4.732620320855615e-06, + "loss": 0.0009, + "reward": 2.2916666865348816, + "reward_std": 0.6971899420022964, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2708333395421505, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 177 + }, + { + "completion_length": 133.58334159851074, + "epoch": 0.0952763281145457, + "grad_norm": 2.3125, + "kl": 0.0329542844556272, + "learning_rate": 4.759358288770054e-06, + "loss": 0.0013, + "reward": 2.9010416865348816, + "reward_std": 0.6468523591756821, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.37500000558793545, + "rewards/xmlcount_reward_func": 0.484375, + "step": 178 + }, + { + "completion_length": 131.9166717529297, + "epoch": 0.09581158838485214, + "grad_norm": 1.71875, + "kl": 0.028344920370727777, + "learning_rate": 4.786096256684493e-06, + "loss": 0.0011, + "reward": 2.7760417461395264, + "reward_std": 0.6638420633971691, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 179 + }, + { + "completion_length": 168.2083396911621, + "epoch": 0.09634684865515857, + "grad_norm": 1.5546875, + "kl": 0.020674246130511165, + "learning_rate": 4.812834224598931e-06, + "loss": 0.0008, + "reward": 2.7291666865348816, + "reward_std": 0.8422547429800034, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2708333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 180 + }, + { + "completion_length": 99.12500190734863, + "epoch": 0.096882108925465, + "grad_norm": 2.015625, + "kl": 0.029507741332054138, + "learning_rate": 4.839572192513369e-06, + "loss": 0.0012, + "reward": 3.2812500596046448, + "reward_std": 0.3093881160020828, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4895833432674408, + "step": 181 + }, + { + "completion_length": 160.16667366027832, + "epoch": 0.09741736919577144, + "grad_norm": 1.171875, + "kl": 0.026030527194961905, + "learning_rate": 4.866310160427808e-06, + "loss": 0.001, + "reward": 2.958333373069763, + "reward_std": 0.35120461508631706, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2291666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 182 + }, + { + "completion_length": 178.00000381469727, + "epoch": 0.09795262946607788, + "grad_norm": 1.4140625, + "kl": 0.03037263359874487, + "learning_rate": 4.893048128342247e-06, + "loss": 0.0012, + "reward": 2.708333373069763, + "reward_std": 0.5456972420215607, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.37500000558793545, + "rewards/xmlcount_reward_func": 0.5, + "step": 183 + }, + { + "completion_length": 138.83333778381348, + "epoch": 0.09848788973638432, + "grad_norm": 2.375, + "kl": 0.024358084425330162, + "learning_rate": 4.919786096256685e-06, + "loss": 0.001, + "reward": 2.9479167461395264, + "reward_std": 0.669994905591011, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 184 + }, + { + "completion_length": 134.33333778381348, + "epoch": 0.09902315000669075, + "grad_norm": 0.8828125, + "kl": 0.027697773184627295, + "learning_rate": 4.9465240641711236e-06, + "loss": 0.0011, + "reward": 3.4791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 185 + }, + { + "completion_length": 141.70833587646484, + "epoch": 0.09955841027699719, + "grad_norm": 2.546875, + "kl": 0.04492489667609334, + "learning_rate": 4.973262032085562e-06, + "loss": 0.0018, + "reward": 2.4795000553131104, + "reward_std": 0.7360228151082993, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.33333334885537624, + "rewards/xmlcount_reward_func": 0.4794999957084656, + "step": 186 + }, + { + "completion_length": 104.79166793823242, + "epoch": 0.10009367054730363, + "grad_norm": 1.2109375, + "kl": 0.03745970083400607, + "learning_rate": 5e-06, + "loss": 0.0015, + "reward": 2.5, + "reward_std": 0.2622022032737732, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 187 + }, + { + "completion_length": 137.0833396911621, + "epoch": 0.10062893081761007, + "grad_norm": 2.5, + "kl": 0.030969139654189348, + "learning_rate": 4.999995634095768e-06, + "loss": 0.0012, + "reward": 3.2447916865348816, + "reward_std": 0.3546534702181816, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 188 + }, + { + "completion_length": 148.04166984558105, + "epoch": 0.1011641910879165, + "grad_norm": 2.359375, + "kl": 0.028358498588204384, + "learning_rate": 4.999982536398319e-06, + "loss": 0.0011, + "reward": 2.7916666865348816, + "reward_std": 0.6756742745637894, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2916666753590107, + "rewards/xmlcount_reward_func": 0.5, + "step": 189 + }, + { + "completion_length": 195.62500762939453, + "epoch": 0.10169945135822293, + "grad_norm": 1.875, + "kl": 0.0333657874725759, + "learning_rate": 4.9999607069534e-06, + "loss": 0.0013, + "reward": 1.7708334028720856, + "reward_std": 0.8301980048418045, + "rewards/correctness_reward_func": 0.666666679084301, + "rewards/int_reward_func": 0.3333333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2708333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 190 + }, + { + "completion_length": 203.08333587646484, + "epoch": 0.10223471162852937, + "grad_norm": 2.109375, + "kl": 0.021336913108825684, + "learning_rate": 4.999930145837254e-06, + "loss": 0.0009, + "reward": 2.447916716337204, + "reward_std": 0.7104772366583347, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.33333334885537624, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 191 + }, + { + "completion_length": 155.75000381469727, + "epoch": 0.10276997189883581, + "grad_norm": 2.3125, + "kl": 0.031167209148406982, + "learning_rate": 4.999890853156626e-06, + "loss": 0.0012, + "reward": 2.520833432674408, + "reward_std": 1.0309316962957382, + "rewards/correctness_reward_func": 1.0833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 192 + }, + { + "completion_length": 144.79166984558105, + "epoch": 0.10330523216914224, + "grad_norm": 2.765625, + "kl": 0.035610498394817114, + "learning_rate": 4.999842829048751e-06, + "loss": 0.0014, + "reward": 2.7500000596046448, + "reward_std": 0.9765221327543259, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 193 + }, + { + "completion_length": 208.25000381469727, + "epoch": 0.10384049243944868, + "grad_norm": 1.625, + "kl": 0.021585837937891483, + "learning_rate": 4.999786073681365e-06, + "loss": 0.0009, + "reward": 2.333333373069763, + "reward_std": 0.8039402067661285, + "rewards/correctness_reward_func": 1.0000000447034836, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.33333333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 194 + }, + { + "completion_length": 125.16667175292969, + "epoch": 0.10437575270975512, + "grad_norm": 1.8984375, + "kl": 0.028131581842899323, + "learning_rate": 4.9997205872526996e-06, + "loss": 0.0011, + "reward": 3.020833373069763, + "reward_std": 0.6476409286260605, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 195 + }, + { + "completion_length": 137.54166793823242, + "epoch": 0.10491101298006156, + "grad_norm": 1.9921875, + "kl": 0.03637383785098791, + "learning_rate": 4.9996463699914795e-06, + "loss": 0.0015, + "reward": 3.270833373069763, + "reward_std": 0.30922994017601013, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666753590107, + "rewards/xmlcount_reward_func": 0.5, + "step": 196 + }, + { + "completion_length": 167.3333396911621, + "epoch": 0.105446273250368, + "grad_norm": 2.078125, + "kl": 0.028204525355249643, + "learning_rate": 4.9995634221569264e-06, + "loss": 0.0011, + "reward": 2.9166667461395264, + "reward_std": 0.9102587252855301, + "rewards/correctness_reward_func": 1.5833334028720856, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 197 + }, + { + "completion_length": 151.83333778381348, + "epoch": 0.10598153352067442, + "grad_norm": 1.203125, + "kl": 0.025021064560860395, + "learning_rate": 4.9994717440387545e-06, + "loss": 0.001, + "reward": 2.9791666865348816, + "reward_std": 0.3012026697397232, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 198 + }, + { + "completion_length": 122.87500381469727, + "epoch": 0.10651679379098086, + "grad_norm": 3.109375, + "kl": 0.04365252796560526, + "learning_rate": 4.999371335957167e-06, + "loss": 0.0017, + "reward": 3.151041805744171, + "reward_std": 0.6672081500291824, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 199 + }, + { + "completion_length": 104.16667175292969, + "epoch": 0.1070520540612873, + "grad_norm": 3.3125, + "kl": 0.057026736438274384, + "learning_rate": 4.999262198262866e-06, + "loss": 0.0023, + "reward": 3.0000000596046448, + "reward_std": 0.5163978338241577, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 200 + }, + { + "completion_length": 141.33333778381348, + "epoch": 0.10758731433159374, + "grad_norm": 1.7109375, + "kl": 0.03623780608177185, + "learning_rate": 4.999144331337035e-06, + "loss": 0.0014, + "reward": 3.125, + "reward_std": 0.3869306445121765, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2916666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 201 + }, + { + "completion_length": 150.00000381469727, + "epoch": 0.10812257460190017, + "grad_norm": 1.828125, + "kl": 0.02583132265135646, + "learning_rate": 4.999017735591354e-06, + "loss": 0.001, + "reward": 2.895833373069763, + "reward_std": 0.6385845988988876, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 202 + }, + { + "completion_length": 125.04167175292969, + "epoch": 0.10865783487220661, + "grad_norm": 9.125, + "kl": 0.09583986504003406, + "learning_rate": 4.998882411467984e-06, + "loss": 0.0038, + "reward": 3.1718750596046448, + "reward_std": 0.6912017911672592, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 203 + }, + { + "completion_length": 160.25000762939453, + "epoch": 0.10919309514251305, + "grad_norm": 2.125, + "kl": 0.03802371025085449, + "learning_rate": 4.998738359439576e-06, + "loss": 0.0015, + "reward": 2.375000089406967, + "reward_std": 0.9705919325351715, + "rewards/correctness_reward_func": 1.0833333805203438, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.37500000558793545, + "rewards/xmlcount_reward_func": 0.5, + "step": 204 + }, + { + "completion_length": 111.91666793823242, + "epoch": 0.10972835541281949, + "grad_norm": 2.671875, + "kl": 0.04831968434154987, + "learning_rate": 4.998585580009266e-06, + "loss": 0.0019, + "reward": 2.8958334028720856, + "reward_std": 0.5064464919269085, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 205 + }, + { + "completion_length": 129.2916717529297, + "epoch": 0.11026361568312593, + "grad_norm": 2.078125, + "kl": 0.04842359526082873, + "learning_rate": 4.998424073710667e-06, + "loss": 0.0019, + "reward": 3.208333373069763, + "reward_std": 0.5172697007656097, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 206 + }, + { + "completion_length": 101.54166984558105, + "epoch": 0.11079887595343235, + "grad_norm": 4.8125, + "kl": 0.08093613479286432, + "learning_rate": 4.998253841107877e-06, + "loss": 0.0032, + "reward": 2.4947917461395264, + "reward_std": 0.8822575807571411, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 207 + }, + { + "completion_length": 138.5416717529297, + "epoch": 0.11133413622373879, + "grad_norm": 1.765625, + "kl": 0.03517375607043505, + "learning_rate": 4.998074882795473e-06, + "loss": 0.0014, + "reward": 3.083333373069763, + "reward_std": 0.5884110182523727, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 208 + }, + { + "completion_length": 124.20833969116211, + "epoch": 0.11186939649404523, + "grad_norm": 2.734375, + "kl": 0.04947302211076021, + "learning_rate": 4.997887199398504e-06, + "loss": 0.002, + "reward": 3.1250000596046448, + "reward_std": 0.7061345875263214, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 209 + }, + { + "completion_length": 103.00000381469727, + "epoch": 0.11240465676435167, + "grad_norm": 1.1796875, + "kl": 0.031111895572394133, + "learning_rate": 4.997690791572498e-06, + "loss": 0.0012, + "reward": 3.145833373069763, + "reward_std": 0.27857524156570435, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 210 + }, + { + "completion_length": 141.79166984558105, + "epoch": 0.1129399170346581, + "grad_norm": 3.609375, + "kl": 0.10411204094998538, + "learning_rate": 4.997485660003453e-06, + "loss": 0.0042, + "reward": 2.726250022649765, + "reward_std": 0.46303591132164, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333358168602, + "rewards/xmlcount_reward_func": 0.4970833361148834, + "step": 211 + }, + { + "completion_length": 107.3750057220459, + "epoch": 0.11347517730496454, + "grad_norm": 2.890625, + "kl": 0.048455359414219856, + "learning_rate": 4.997271805407836e-06, + "loss": 0.0019, + "reward": 3.2187500596046448, + "reward_std": 0.5164157301187515, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 212 + }, + { + "completion_length": 147.1250057220459, + "epoch": 0.11401043757527098, + "grad_norm": 2.359375, + "kl": 0.03473258297890425, + "learning_rate": 4.997049228532583e-06, + "loss": 0.0014, + "reward": 2.4375001192092896, + "reward_std": 0.9079003632068634, + "rewards/correctness_reward_func": 1.000000037252903, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 213 + }, + { + "completion_length": 119.20833396911621, + "epoch": 0.11454569784557742, + "grad_norm": 2.109375, + "kl": 0.060375045984983444, + "learning_rate": 4.996817930155094e-06, + "loss": 0.0024, + "reward": 2.8645834028720856, + "reward_std": 0.5899006575345993, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 214 + }, + { + "completion_length": 110.95833778381348, + "epoch": 0.11508095811588386, + "grad_norm": 2.09375, + "kl": 0.07388751022517681, + "learning_rate": 4.996577911083228e-06, + "loss": 0.003, + "reward": 3.0416667461395264, + "reward_std": 0.425990492105484, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 215 + }, + { + "completion_length": 151.2083339691162, + "epoch": 0.11561621838619028, + "grad_norm": 25.0, + "kl": 0.05382871255278587, + "learning_rate": 4.996329172155307e-06, + "loss": 0.0022, + "reward": 2.6562500596046448, + "reward_std": 0.5608328096568584, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4479166716337204, + "step": 216 + }, + { + "completion_length": 115.50000381469727, + "epoch": 0.11615147865649672, + "grad_norm": 3.375, + "kl": 0.07328876806423068, + "learning_rate": 4.996071714240108e-06, + "loss": 0.0029, + "reward": 3.020833343267441, + "reward_std": 0.30094872415065765, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 217 + }, + { + "completion_length": 164.125, + "epoch": 0.11668673892680316, + "grad_norm": 1.8203125, + "kl": 0.04349048109725118, + "learning_rate": 4.995805538236858e-06, + "loss": 0.0017, + "reward": 2.6510416865348816, + "reward_std": 0.46429676935076714, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 218 + }, + { + "completion_length": 148.95833587646484, + "epoch": 0.1172219991971096, + "grad_norm": 1.984375, + "kl": 0.028181300032883883, + "learning_rate": 4.995530645075237e-06, + "loss": 0.0011, + "reward": 2.3541667461395264, + "reward_std": 0.8104839585721493, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 219 + }, + { + "completion_length": 147.70833778381348, + "epoch": 0.11775725946741603, + "grad_norm": 1.8515625, + "kl": 0.027232277672737837, + "learning_rate": 4.9952470357153715e-06, + "loss": 0.0011, + "reward": 2.520833373069763, + "reward_std": 0.4765267074108124, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.3541666753590107, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 220 + }, + { + "completion_length": 135.66666793823242, + "epoch": 0.11829251973772247, + "grad_norm": 2.109375, + "kl": 0.03699612431228161, + "learning_rate": 4.9949547111478295e-06, + "loss": 0.0015, + "reward": 3.208333432674408, + "reward_std": 0.538923554122448, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 221 + }, + { + "completion_length": 153.8333396911621, + "epoch": 0.1188277800080289, + "grad_norm": 2.84375, + "kl": 0.09540390037000179, + "learning_rate": 4.994653672393622e-06, + "loss": 0.0038, + "reward": 2.8750000596046448, + "reward_std": 0.8296718895435333, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 222 + }, + { + "completion_length": 141.62500190734863, + "epoch": 0.11936304027833534, + "grad_norm": 2.765625, + "kl": 0.043064896017313004, + "learning_rate": 4.99434392050419e-06, + "loss": 0.0017, + "reward": 2.708333373069763, + "reward_std": 0.7524303495883942, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.39583333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 223 + }, + { + "completion_length": 135.00000381469727, + "epoch": 0.11989830054864177, + "grad_norm": 2.484375, + "kl": 0.04069590661674738, + "learning_rate": 4.994025456561415e-06, + "loss": 0.0016, + "reward": 3.083333432674408, + "reward_std": 0.7696890532970428, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 224 + }, + { + "completion_length": 177.62500190734863, + "epoch": 0.12043356081894821, + "grad_norm": 2.15625, + "kl": 0.03938570665195584, + "learning_rate": 4.993698281677603e-06, + "loss": 0.0016, + "reward": 2.4791667461395264, + "reward_std": 0.584863156080246, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.37500000558793545, + "rewards/xmlcount_reward_func": 0.5, + "step": 225 + }, + { + "completion_length": 154.7916717529297, + "epoch": 0.12096882108925465, + "grad_norm": 1.4375, + "kl": 0.023624973371624947, + "learning_rate": 4.993362396995484e-06, + "loss": 0.0009, + "reward": 2.968000113964081, + "reward_std": 0.7112085819244385, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.48883333057165146, + "step": 226 + }, + { + "completion_length": 204.00000381469727, + "epoch": 0.12150408135956109, + "grad_norm": 2.171875, + "kl": 0.0315577844157815, + "learning_rate": 4.993017803688211e-06, + "loss": 0.0013, + "reward": 1.9062500298023224, + "reward_std": 0.6422140449285507, + "rewards/correctness_reward_func": 0.6666666939854622, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250001303851604, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 227 + }, + { + "completion_length": 153.83333778381348, + "epoch": 0.12203934162986753, + "grad_norm": 1.890625, + "kl": 0.04619473172351718, + "learning_rate": 4.992664502959351e-06, + "loss": 0.0018, + "reward": 2.770833432674408, + "reward_std": 0.4689341187477112, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 228 + }, + { + "completion_length": 142.33333587646484, + "epoch": 0.12257460190017395, + "grad_norm": 2.296875, + "kl": 0.04707263316959143, + "learning_rate": 4.99230249604289e-06, + "loss": 0.0019, + "reward": 2.833333373069763, + "reward_std": 0.8086781054735184, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 229 + }, + { + "completion_length": 131.6666717529297, + "epoch": 0.1231098621704804, + "grad_norm": 1.703125, + "kl": 0.050373317673802376, + "learning_rate": 4.991931784203215e-06, + "loss": 0.002, + "reward": 3.0795000195503235, + "reward_std": 0.5383563190698624, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333395421505, + "rewards/xmlcount_reward_func": 0.4961666688323021, + "step": 230 + }, + { + "completion_length": 133.08333587646484, + "epoch": 0.12364512244078683, + "grad_norm": 1.4921875, + "kl": 0.03360940143465996, + "learning_rate": 4.991552368735119e-06, + "loss": 0.0013, + "reward": 2.7500000596046448, + "reward_std": 0.3624359928071499, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 231 + }, + { + "completion_length": 114.95833587646484, + "epoch": 0.12418038271109327, + "grad_norm": 3.0, + "kl": 0.09261773619800806, + "learning_rate": 4.991164250963799e-06, + "loss": 0.0037, + "reward": 3.083333432674408, + "reward_std": 0.7157893627882004, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 232 + }, + { + "completion_length": 146.91666984558105, + "epoch": 0.1247156429813997, + "grad_norm": 2.1875, + "kl": 0.03772323578596115, + "learning_rate": 4.990767432244839e-06, + "loss": 0.0015, + "reward": 2.9791667461395264, + "reward_std": 0.6879045069217682, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 233 + }, + { + "completion_length": 116.29166984558105, + "epoch": 0.12525090325170615, + "grad_norm": 2.09375, + "kl": 0.024761986453086138, + "learning_rate": 4.990361913964221e-06, + "loss": 0.001, + "reward": 2.4791667461395264, + "reward_std": 0.9406071752309799, + "rewards/correctness_reward_func": 1.000000037252903, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 234 + }, + { + "completion_length": 120.33333587646484, + "epoch": 0.12578616352201258, + "grad_norm": 1.5390625, + "kl": 0.02583275781944394, + "learning_rate": 4.989947697538305e-06, + "loss": 0.001, + "reward": 2.9791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 235 + }, + { + "completion_length": 147.62500381469727, + "epoch": 0.126321423792319, + "grad_norm": 2.46875, + "kl": 0.033338344655930996, + "learning_rate": 4.989524784413835e-06, + "loss": 0.0013, + "reward": 2.645833432674408, + "reward_std": 0.7206298857927322, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 236 + }, + { + "completion_length": 152.75000762939453, + "epoch": 0.12685668406262546, + "grad_norm": 1.9609375, + "kl": 0.028460218803957105, + "learning_rate": 4.98909317606793e-06, + "loss": 0.0011, + "reward": 2.7500000596046448, + "reward_std": 0.3881702572107315, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 237 + }, + { + "completion_length": 157.2916717529297, + "epoch": 0.12739194433293188, + "grad_norm": 2.03125, + "kl": 0.029248481849208474, + "learning_rate": 4.98865287400808e-06, + "loss": 0.0012, + "reward": 3.0104167461395264, + "reward_std": 0.6388503015041351, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.354166679084301, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 238 + }, + { + "completion_length": 136.70833587646484, + "epoch": 0.12792720460323834, + "grad_norm": 2.109375, + "kl": 0.038170427549630404, + "learning_rate": 4.988203879772136e-06, + "loss": 0.0015, + "reward": 2.854166716337204, + "reward_std": 0.6508470773696899, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 239 + }, + { + "completion_length": 131.41667366027832, + "epoch": 0.12846246487354476, + "grad_norm": 2.546875, + "kl": 0.03704935172572732, + "learning_rate": 4.987746194928311e-06, + "loss": 0.0015, + "reward": 2.8541667461395264, + "reward_std": 0.8091580979526043, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 240 + }, + { + "completion_length": 163.2083396911621, + "epoch": 0.1289977251438512, + "grad_norm": 1.796875, + "kl": 0.03925035800784826, + "learning_rate": 4.9872798210751725e-06, + "loss": 0.0016, + "reward": 2.4375000596046448, + "reward_std": 0.815828587859869, + "rewards/correctness_reward_func": 1.0000000074505806, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 241 + }, + { + "completion_length": 168.9583396911621, + "epoch": 0.12953298541415764, + "grad_norm": 1.703125, + "kl": 0.02837916323915124, + "learning_rate": 4.986804759841635e-06, + "loss": 0.0011, + "reward": 2.833333373069763, + "reward_std": 0.3236204944550991, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 242 + }, + { + "completion_length": 149.8750057220459, + "epoch": 0.13006824568446407, + "grad_norm": 1.8203125, + "kl": 0.06834348477423191, + "learning_rate": 4.986321012886956e-06, + "loss": 0.0027, + "reward": 3.070833384990692, + "reward_std": 0.3826806955039501, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.48750000447034836, + "step": 243 + }, + { + "completion_length": 139.2500057220459, + "epoch": 0.1306035059547705, + "grad_norm": 1.59375, + "kl": 0.040616510435938835, + "learning_rate": 4.98582858190073e-06, + "loss": 0.0016, + "reward": 2.895833373069763, + "reward_std": 0.5750639587640762, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.3333333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 244 + }, + { + "completion_length": 193.5416717529297, + "epoch": 0.13113876622507695, + "grad_norm": 1.625, + "kl": 0.030194721184670925, + "learning_rate": 4.985327468602881e-06, + "loss": 0.0012, + "reward": 2.802083432674408, + "reward_std": 0.816511832177639, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250000558793545, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 245 + }, + { + "completion_length": 137.37500381469727, + "epoch": 0.13167402649538337, + "grad_norm": 1.7265625, + "kl": 0.0411061723716557, + "learning_rate": 4.984817674743661e-06, + "loss": 0.0016, + "reward": 2.8125000596046448, + "reward_std": 0.6043404638767242, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 246 + }, + { + "completion_length": 145.75000381469727, + "epoch": 0.13220928676568983, + "grad_norm": 1.90625, + "kl": 0.039873102214187384, + "learning_rate": 4.984299202103638e-06, + "loss": 0.0016, + "reward": 2.9166666865348816, + "reward_std": 0.6238463968038559, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 247 + }, + { + "completion_length": 103.16666793823242, + "epoch": 0.13274454703599625, + "grad_norm": 2.109375, + "kl": 0.03916989779099822, + "learning_rate": 4.9837720524936935e-06, + "loss": 0.0016, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 248 + }, + { + "completion_length": 152.0833396911621, + "epoch": 0.13327980730630268, + "grad_norm": 1.4296875, + "kl": 0.022294364403933287, + "learning_rate": 4.983236227755015e-06, + "loss": 0.0009, + "reward": 2.833333373069763, + "reward_std": 0.7361843436956406, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 249 + }, + { + "completion_length": 140.79166984558105, + "epoch": 0.13381506757660913, + "grad_norm": 2.625, + "kl": 0.023324530571699142, + "learning_rate": 4.98269172975909e-06, + "loss": 0.0009, + "reward": 2.6666667461395264, + "reward_std": 0.7144345194101334, + "rewards/correctness_reward_func": 1.2500000596046448, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 250 + }, + { + "completion_length": 145.9583396911621, + "epoch": 0.13435032784691556, + "grad_norm": 1.8984375, + "kl": 0.0366662573069334, + "learning_rate": 4.982138560407701e-06, + "loss": 0.0015, + "reward": 3.208333373069763, + "reward_std": 0.4392816424369812, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 251 + }, + { + "completion_length": 163.04167556762695, + "epoch": 0.134885588117222, + "grad_norm": 1.828125, + "kl": 0.015792567282915115, + "learning_rate": 4.9815767216329145e-06, + "loss": 0.0006, + "reward": 2.5000000596046448, + "reward_std": 0.7361843585968018, + "rewards/correctness_reward_func": 1.0000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 252 + }, + { + "completion_length": 120.37500190734863, + "epoch": 0.13542084838752844, + "grad_norm": 1.671875, + "kl": 0.02686859155073762, + "learning_rate": 4.981006215397077e-06, + "loss": 0.0011, + "reward": 3.4479166865348816, + "reward_std": 0.12757760286331177, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 253 + }, + { + "completion_length": 132.12500381469727, + "epoch": 0.13595610865783486, + "grad_norm": 1.453125, + "kl": 0.029003456234931946, + "learning_rate": 4.980427043692809e-06, + "loss": 0.0012, + "reward": 3.0625, + "reward_std": 0.5670122802257538, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 254 + }, + { + "completion_length": 116.25000381469727, + "epoch": 0.13649136892814132, + "grad_norm": 2.375, + "kl": 0.03643447207286954, + "learning_rate": 4.979839208542999e-06, + "loss": 0.0015, + "reward": 3.2291667461395264, + "reward_std": 0.5133541040122509, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 255 + }, + { + "completion_length": 141.1666717529297, + "epoch": 0.13702662919844774, + "grad_norm": 2.078125, + "kl": 0.04122666455805302, + "learning_rate": 4.979242712000792e-06, + "loss": 0.0016, + "reward": 2.958333432674408, + "reward_std": 0.7841716669499874, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.39583333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 256 + }, + { + "completion_length": 116.50000381469727, + "epoch": 0.1375618894687542, + "grad_norm": 0.91015625, + "kl": 0.036849388387054205, + "learning_rate": 4.978637556149582e-06, + "loss": 0.0015, + "reward": 3.4791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 257 + }, + { + "completion_length": 118.50000381469727, + "epoch": 0.13809714973906062, + "grad_norm": 1.890625, + "kl": 0.032016648445278406, + "learning_rate": 4.978023743103017e-06, + "loss": 0.0013, + "reward": 3.145833373069763, + "reward_std": 0.5290164947509766, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 258 + }, + { + "completion_length": 184.2916717529297, + "epoch": 0.13863241000936705, + "grad_norm": 1.8359375, + "kl": 0.02645092085003853, + "learning_rate": 4.977401275004971e-06, + "loss": 0.0011, + "reward": 2.250000089406967, + "reward_std": 0.4426039345562458, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 259 + }, + { + "completion_length": 131.45833778381348, + "epoch": 0.1391676702796735, + "grad_norm": 2.015625, + "kl": 0.03592094453051686, + "learning_rate": 4.976770154029556e-06, + "loss": 0.0014, + "reward": 3.020833432674408, + "reward_std": 0.7555890046060085, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 260 + }, + { + "completion_length": 135.91666984558105, + "epoch": 0.13970293054997993, + "grad_norm": 2.765625, + "kl": 0.04309300798922777, + "learning_rate": 4.9761303823811004e-06, + "loss": 0.0017, + "reward": 2.333333432674408, + "reward_std": 0.89227694272995, + "rewards/correctness_reward_func": 1.0000000447034836, + "rewards/int_reward_func": 0.35416666977107525, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 261 + }, + { + "completion_length": 119.50000190734863, + "epoch": 0.14023819082028635, + "grad_norm": 0.06640625, + "kl": 0.04456313792616129, + "learning_rate": 4.975481962294152e-06, + "loss": 0.0018, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 262 + }, + { + "completion_length": 131.2083396911621, + "epoch": 0.1407734510905928, + "grad_norm": 1.6953125, + "kl": 0.035875370260328054, + "learning_rate": 4.974824896033462e-06, + "loss": 0.0014, + "reward": 2.447916716337204, + "reward_std": 0.4791714996099472, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 263 + }, + { + "completion_length": 141.12500381469727, + "epoch": 0.14130871136089923, + "grad_norm": 1.7890625, + "kl": 0.04122765874490142, + "learning_rate": 4.97415918589398e-06, + "loss": 0.0016, + "reward": 2.1041666865348816, + "reward_std": 0.7500797137618065, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.2916666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 264 + }, + { + "completion_length": 137.37500381469727, + "epoch": 0.14184397163120568, + "grad_norm": 2.171875, + "kl": 0.036459858529269695, + "learning_rate": 4.973484834200849e-06, + "loss": 0.0015, + "reward": 3.0416667461395264, + "reward_std": 0.743688777089119, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 265 + }, + { + "completion_length": 146.6250057220459, + "epoch": 0.1423792319015121, + "grad_norm": 1.84375, + "kl": 0.04969721753150225, + "learning_rate": 4.972801843309392e-06, + "loss": 0.002, + "reward": 2.7916667461395264, + "reward_std": 0.6460228823125362, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 266 + }, + { + "completion_length": 105.58333587646484, + "epoch": 0.14291449217181854, + "grad_norm": 1.0234375, + "kl": 0.0363903921097517, + "learning_rate": 4.972110215605108e-06, + "loss": 0.0015, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 267 + }, + { + "completion_length": 153.37500381469727, + "epoch": 0.143449752442125, + "grad_norm": 1.9609375, + "kl": 0.041106805205345154, + "learning_rate": 4.9714099535036606e-06, + "loss": 0.0016, + "reward": 3.3541667461395264, + "reward_std": 0.31970491632819176, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 268 + }, + { + "completion_length": 131.62500190734863, + "epoch": 0.14398501271243141, + "grad_norm": 1.4296875, + "kl": 0.044663478154689074, + "learning_rate": 4.970701059450872e-06, + "loss": 0.0018, + "reward": 3.145833373069763, + "reward_std": 0.2837683856487274, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 269 + }, + { + "completion_length": 105.79166984558105, + "epoch": 0.14452027298273787, + "grad_norm": 2.421875, + "kl": 0.03859696490690112, + "learning_rate": 4.969983535922712e-06, + "loss": 0.0015, + "reward": 3.145833373069763, + "reward_std": 0.7013992667198181, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 270 + }, + { + "completion_length": 165.45833587646484, + "epoch": 0.1450555332530443, + "grad_norm": 1.796875, + "kl": 0.03418656159192324, + "learning_rate": 4.9692573854252934e-06, + "loss": 0.0014, + "reward": 2.8541667461395264, + "reward_std": 0.6371217370033264, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 271 + }, + { + "completion_length": 124.70833778381348, + "epoch": 0.14559079352335072, + "grad_norm": 1.078125, + "kl": 0.03386909421533346, + "learning_rate": 4.968522610494858e-06, + "loss": 0.0014, + "reward": 3.333333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 272 + }, + { + "completion_length": 138.0833339691162, + "epoch": 0.14612605379365717, + "grad_norm": 2.171875, + "kl": 0.041910297237336636, + "learning_rate": 4.967779213697771e-06, + "loss": 0.0017, + "reward": 2.779083400964737, + "reward_std": 0.713197335600853, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.48741666972637177, + "step": 273 + }, + { + "completion_length": 120.08333969116211, + "epoch": 0.1466613140639636, + "grad_norm": 2.34375, + "kl": 0.04539045970886946, + "learning_rate": 4.967027197630513e-06, + "loss": 0.0018, + "reward": 2.7500001192092896, + "reward_std": 1.0729063749313354, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 274 + }, + { + "completion_length": 119.16666984558105, + "epoch": 0.14719657433427003, + "grad_norm": 2.1875, + "kl": 0.041658067144453526, + "learning_rate": 4.966266564919667e-06, + "loss": 0.0017, + "reward": 2.895833373069763, + "reward_std": 0.7781590074300766, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 275 + }, + { + "completion_length": 126.6250057220459, + "epoch": 0.14773183460457648, + "grad_norm": 1.578125, + "kl": 0.022411894984543324, + "learning_rate": 4.965497318221915e-06, + "loss": 0.0009, + "reward": 3.3125000596046448, + "reward_std": 0.459279328584671, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 276 + }, + { + "completion_length": 134.58333587646484, + "epoch": 0.1482670948748829, + "grad_norm": 1.1328125, + "kl": 0.03880942426621914, + "learning_rate": 4.964719460224019e-06, + "loss": 0.0016, + "reward": 2.9166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 277 + }, + { + "completion_length": 126.3750057220459, + "epoch": 0.14880235514518936, + "grad_norm": 1.3125, + "kl": 0.022782811895012856, + "learning_rate": 4.963932993642825e-06, + "loss": 0.0009, + "reward": 3.020833373069763, + "reward_std": 0.6005255281925201, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 278 + }, + { + "completion_length": 138.50000381469727, + "epoch": 0.14933761541549578, + "grad_norm": 2.34375, + "kl": 0.03208020143210888, + "learning_rate": 4.963137921225241e-06, + "loss": 0.0013, + "reward": 2.5416667461395264, + "reward_std": 0.5553287528455257, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.37500000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 279 + }, + { + "completion_length": 122.50000190734863, + "epoch": 0.1498728756858022, + "grad_norm": 0.83203125, + "kl": 0.029957736376672983, + "learning_rate": 4.962334245748237e-06, + "loss": 0.0012, + "reward": 3.2916666865348816, + "reward_std": 0.3227486312389374, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 280 + }, + { + "completion_length": 147.12500381469727, + "epoch": 0.15040813595610866, + "grad_norm": 2.21875, + "kl": 0.022831235080957413, + "learning_rate": 4.961521970018828e-06, + "loss": 0.0009, + "reward": 2.8125000596046448, + "reward_std": 0.8186086416244507, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 281 + }, + { + "completion_length": 140.50000381469727, + "epoch": 0.1509433962264151, + "grad_norm": 1.7265625, + "kl": 0.0313710174523294, + "learning_rate": 4.960701096874069e-06, + "loss": 0.0013, + "reward": 3.208333373069763, + "reward_std": 0.4854898601770401, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 282 + }, + { + "completion_length": 195.08333778381348, + "epoch": 0.15147865649672154, + "grad_norm": 1.7734375, + "kl": 0.025825404096394777, + "learning_rate": 4.959871629181043e-06, + "loss": 0.001, + "reward": 2.109375014901161, + "reward_std": 0.4769704192876816, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.484375, + "step": 283 + }, + { + "completion_length": 136.3333339691162, + "epoch": 0.15201391676702797, + "grad_norm": 0.99609375, + "kl": 0.0337559818290174, + "learning_rate": 4.95903356983685e-06, + "loss": 0.0014, + "reward": 3.2291666865348816, + "reward_std": 0.3001735806465149, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 284 + }, + { + "completion_length": 149.25000381469727, + "epoch": 0.1525491770373344, + "grad_norm": 1.5078125, + "kl": 0.04021261353045702, + "learning_rate": 4.958186921768601e-06, + "loss": 0.0016, + "reward": 2.645833373069763, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 285 + }, + { + "completion_length": 114.50000190734863, + "epoch": 0.15308443730764085, + "grad_norm": 1.9296875, + "kl": 0.049264345318078995, + "learning_rate": 4.957331687933402e-06, + "loss": 0.002, + "reward": 3.395833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 286 + }, + { + "completion_length": 147.0833396911621, + "epoch": 0.15361969757794727, + "grad_norm": 2.140625, + "kl": 0.04388295952230692, + "learning_rate": 4.956467871318349e-06, + "loss": 0.0018, + "reward": 2.208333373069763, + "reward_std": 1.1271048188209534, + "rewards/correctness_reward_func": 0.8333333507180214, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 287 + }, + { + "completion_length": 153.33333587646484, + "epoch": 0.15415495784825373, + "grad_norm": 1.6875, + "kl": 0.034068225882947445, + "learning_rate": 4.955595474940515e-06, + "loss": 0.0014, + "reward": 2.9375000596046448, + "reward_std": 0.5253209173679352, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 288 + }, + { + "completion_length": 149.25, + "epoch": 0.15469021811856015, + "grad_norm": 2.171875, + "kl": 0.04076318442821503, + "learning_rate": 4.954714501846938e-06, + "loss": 0.0016, + "reward": 3.083333432674408, + "reward_std": 0.6206349097192287, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 289 + }, + { + "completion_length": 154.7916717529297, + "epoch": 0.15522547838886658, + "grad_norm": 1.71875, + "kl": 0.03785711620002985, + "learning_rate": 4.9538249551146145e-06, + "loss": 0.0015, + "reward": 2.6250000596046448, + "reward_std": 0.8643502295017242, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 290 + }, + { + "completion_length": 149.54166793823242, + "epoch": 0.15576073865917303, + "grad_norm": 1.2578125, + "kl": 0.038248957600444555, + "learning_rate": 4.952926837850485e-06, + "loss": 0.0015, + "reward": 3.3541667461395264, + "reward_std": 0.31970493495464325, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 291 + }, + { + "completion_length": 165.25000381469727, + "epoch": 0.15629599892947946, + "grad_norm": 1.6875, + "kl": 0.027612535748630762, + "learning_rate": 4.9520201531914234e-06, + "loss": 0.0011, + "reward": 2.9166666865348816, + "reward_std": 0.69357730448246, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 292 + }, + { + "completion_length": 186.5416717529297, + "epoch": 0.15683125919978588, + "grad_norm": 1.7578125, + "kl": 0.03856555838137865, + "learning_rate": 4.9511049043042304e-06, + "loss": 0.0015, + "reward": 2.3125000596046448, + "reward_std": 0.9206569194793701, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 293 + }, + { + "completion_length": 145.58333587646484, + "epoch": 0.15736651947009234, + "grad_norm": 2.84375, + "kl": 0.0297771948389709, + "learning_rate": 4.950181094385616e-06, + "loss": 0.0012, + "reward": 2.6875000596046448, + "reward_std": 0.9492315649986267, + "rewards/correctness_reward_func": 1.2500000447034836, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 294 + }, + { + "completion_length": 124.66666984558105, + "epoch": 0.15790177974039876, + "grad_norm": 1.9921875, + "kl": 0.05197975039482117, + "learning_rate": 4.9492487266621925e-06, + "loss": 0.0021, + "reward": 3.3541667461395264, + "reward_std": 0.3023223206400871, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 295 + }, + { + "completion_length": 121.95833587646484, + "epoch": 0.15843704001070522, + "grad_norm": 1.7265625, + "kl": 0.04530602786689997, + "learning_rate": 4.948307804390462e-06, + "loss": 0.0018, + "reward": 3.1250000596046448, + "reward_std": 0.4909362643957138, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 296 + }, + { + "completion_length": 163.5833396911621, + "epoch": 0.15897230028101164, + "grad_norm": 1.671875, + "kl": 0.03576376102864742, + "learning_rate": 4.947358330856808e-06, + "loss": 0.0014, + "reward": 2.291666716337204, + "reward_std": 0.6950604170560837, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 297 + }, + { + "completion_length": 113.6250057220459, + "epoch": 0.15950756055131807, + "grad_norm": 1.875, + "kl": 0.03266171971336007, + "learning_rate": 4.946400309377477e-06, + "loss": 0.0014, + "reward": 3.4165834188461304, + "reward_std": 0.06475385317753535, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.4999166652560234, + "step": 298 + }, + { + "completion_length": 176.79166793823242, + "epoch": 0.16004282082162452, + "grad_norm": 1.65625, + "kl": 0.03471789788454771, + "learning_rate": 4.945433743298573e-06, + "loss": 0.0014, + "reward": 2.515500009059906, + "reward_std": 0.9147455990314484, + "rewards/correctness_reward_func": 1.2500000149011612, + "rewards/int_reward_func": 0.33333333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4946666657924652, + "step": 299 + }, + { + "completion_length": 159.7083339691162, + "epoch": 0.16057808109193095, + "grad_norm": 2.4375, + "kl": 0.061009292490780354, + "learning_rate": 4.944458635996045e-06, + "loss": 0.0024, + "reward": 2.6875001192092896, + "reward_std": 0.9677619636058807, + "rewards/correctness_reward_func": 1.333333395421505, + "rewards/int_reward_func": 0.4375000149011612, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 300 + }, + { + "completion_length": 129.8333396911621, + "epoch": 0.1611133413622374, + "grad_norm": 1.5078125, + "kl": 0.028242026921361685, + "learning_rate": 4.943474990875673e-06, + "loss": 0.0011, + "reward": 3.3125000596046448, + "reward_std": 0.4592793434858322, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 301 + }, + { + "completion_length": 148.87500381469727, + "epoch": 0.16164860163254383, + "grad_norm": 1.890625, + "kl": 0.030265600653365254, + "learning_rate": 4.942482811373056e-06, + "loss": 0.0012, + "reward": 3.0625001192092896, + "reward_std": 0.7166580855846405, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 302 + }, + { + "completion_length": 151.2500057220459, + "epoch": 0.16218386190285025, + "grad_norm": 2.296875, + "kl": 0.03763581905514002, + "learning_rate": 4.941482100953604e-06, + "loss": 0.0015, + "reward": 2.2916667461395264, + "reward_std": 0.6810928508639336, + "rewards/correctness_reward_func": 0.9166667088866234, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 303 + }, + { + "completion_length": 123.37500190734863, + "epoch": 0.1627191221731567, + "grad_norm": 1.9921875, + "kl": 0.03812553267925978, + "learning_rate": 4.940472863112521e-06, + "loss": 0.0015, + "reward": 2.2875834107398987, + "reward_std": 0.296846117824316, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4959166646003723, + "step": 304 + }, + { + "completion_length": 127.33333778381348, + "epoch": 0.16325438244346313, + "grad_norm": 1.203125, + "kl": 0.03971145674586296, + "learning_rate": 4.939455101374795e-06, + "loss": 0.0016, + "reward": 2.7291666865348816, + "reward_std": 0.3001735806465149, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 305 + }, + { + "completion_length": 131.66666984558105, + "epoch": 0.16378964271376958, + "grad_norm": 1.8671875, + "kl": 0.036954211071133614, + "learning_rate": 4.938428819295187e-06, + "loss": 0.0015, + "reward": 2.520833373069763, + "reward_std": 0.7971479445695877, + "rewards/correctness_reward_func": 1.0833333507180214, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 306 + }, + { + "completion_length": 168.5833396911621, + "epoch": 0.164324902984076, + "grad_norm": 1.6953125, + "kl": 0.02943408628925681, + "learning_rate": 4.937394020458216e-06, + "loss": 0.0012, + "reward": 3.1250001192092896, + "reward_std": 0.5554859936237335, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 307 + }, + { + "completion_length": 207.83333587646484, + "epoch": 0.16486016325438244, + "grad_norm": 1.859375, + "kl": 0.04282863391563296, + "learning_rate": 4.9363507084781495e-06, + "loss": 0.0017, + "reward": 2.286916732788086, + "reward_std": 0.894590687006712, + "rewards/correctness_reward_func": 1.0000000074505806, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.49525000900030136, + "step": 308 + }, + { + "completion_length": 135.37500381469727, + "epoch": 0.1653954235246889, + "grad_norm": 1.359375, + "kl": 0.028727824799716473, + "learning_rate": 4.935298886998986e-06, + "loss": 0.0011, + "reward": 3.375, + "reward_std": 0.25, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 309 + }, + { + "completion_length": 177.5833396911621, + "epoch": 0.16593068379499532, + "grad_norm": 1.390625, + "kl": 0.025000998750329018, + "learning_rate": 4.934238559694448e-06, + "loss": 0.001, + "reward": 2.7291667461395264, + "reward_std": 0.8388482332229614, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 310 + }, + { + "completion_length": 153.2916717529297, + "epoch": 0.16646594406530174, + "grad_norm": 1.3046875, + "kl": 0.03711768053472042, + "learning_rate": 4.9331697302679645e-06, + "loss": 0.0015, + "reward": 3.0000000298023224, + "reward_std": 0.2350260429084301, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 311 + }, + { + "completion_length": 102.04166793823242, + "epoch": 0.1670012043356082, + "grad_norm": 2.3125, + "kl": 0.04422319959849119, + "learning_rate": 4.932092402452662e-06, + "loss": 0.0018, + "reward": 3.3854166865348816, + "reward_std": 0.280670702457428, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 312 + }, + { + "completion_length": 133.75000190734863, + "epoch": 0.16753646460591462, + "grad_norm": 1.4375, + "kl": 0.03163261990994215, + "learning_rate": 4.931006580011348e-06, + "loss": 0.0013, + "reward": 3.333333373069763, + "reward_std": 0.40824829041957855, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 313 + }, + { + "completion_length": 100.00000381469727, + "epoch": 0.16807172487622107, + "grad_norm": 1.203125, + "kl": 0.04145035380497575, + "learning_rate": 4.9299122667365e-06, + "loss": 0.0017, + "reward": 3.3125, + "reward_std": 0.29315099120140076, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 314 + }, + { + "completion_length": 118.04166984558105, + "epoch": 0.1686069851465275, + "grad_norm": 0.0859375, + "kl": 0.028052533976733685, + "learning_rate": 4.928809466450252e-06, + "loss": 0.0011, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 315 + }, + { + "completion_length": 132.41666984558105, + "epoch": 0.16914224541683393, + "grad_norm": 2.125, + "kl": 0.06144689908251166, + "learning_rate": 4.927698183004379e-06, + "loss": 0.0025, + "reward": 2.6250000596046448, + "reward_std": 0.3296062760055065, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 316 + }, + { + "completion_length": 130.50000286102295, + "epoch": 0.16967750568714038, + "grad_norm": 1.625, + "kl": 0.0342918885871768, + "learning_rate": 4.926578420280288e-06, + "loss": 0.0014, + "reward": 3.020833373069763, + "reward_std": 0.841457188129425, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4375000149011612, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 317 + }, + { + "completion_length": 131.16666984558105, + "epoch": 0.1702127659574468, + "grad_norm": 1.5625, + "kl": 0.037405913695693016, + "learning_rate": 4.925450182189e-06, + "loss": 0.0015, + "reward": 3.208333373069763, + "reward_std": 0.6003471612930298, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 318 + }, + { + "completion_length": 192.0833396911621, + "epoch": 0.17074802622775326, + "grad_norm": 1.453125, + "kl": 0.020101528149098158, + "learning_rate": 4.924313472671139e-06, + "loss": 0.0008, + "reward": 2.2291666865348816, + "reward_std": 0.3248923234641552, + "rewards/correctness_reward_func": 0.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 319 + }, + { + "completion_length": 152.12500381469727, + "epoch": 0.17128328649805968, + "grad_norm": 1.0703125, + "kl": 0.04198923846706748, + "learning_rate": 4.923168295696917e-06, + "loss": 0.0017, + "reward": 2.854166716337204, + "reward_std": 0.4421939253807068, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 320 + }, + { + "completion_length": 149.20833778381348, + "epoch": 0.1718185467683661, + "grad_norm": 0.7421875, + "kl": 0.025474284309893847, + "learning_rate": 4.92201465526612e-06, + "loss": 0.001, + "reward": 3.1666666865348816, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 321 + }, + { + "completion_length": 157.66666793823242, + "epoch": 0.17235380703867256, + "grad_norm": 1.4921875, + "kl": 0.02526680286973715, + "learning_rate": 4.920852555408093e-06, + "loss": 0.001, + "reward": 2.895833373069763, + "reward_std": 0.680737629532814, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 322 + }, + { + "completion_length": 139.75000381469727, + "epoch": 0.172889067308979, + "grad_norm": 2.5, + "kl": 0.05358001682907343, + "learning_rate": 4.919682000181734e-06, + "loss": 0.0021, + "reward": 2.6041667759418488, + "reward_std": 0.7565664201974869, + "rewards/correctness_reward_func": 1.2500000596046448, + "rewards/int_reward_func": 0.4375000149011612, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 323 + }, + { + "completion_length": 145.37500190734863, + "epoch": 0.17342432757928541, + "grad_norm": 1.2734375, + "kl": 0.03602644964121282, + "learning_rate": 4.918502993675464e-06, + "loss": 0.0014, + "reward": 2.833333373069763, + "reward_std": 0.40824830532073975, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 324 + }, + { + "completion_length": 132.2916717529297, + "epoch": 0.17395958784959187, + "grad_norm": 1.296875, + "kl": 0.026516761165112257, + "learning_rate": 4.917315540007229e-06, + "loss": 0.0011, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 325 + }, + { + "completion_length": 135.6666717529297, + "epoch": 0.1744948481198983, + "grad_norm": 1.4453125, + "kl": 0.03170072380453348, + "learning_rate": 4.916119643324475e-06, + "loss": 0.0013, + "reward": 2.7916666865348816, + "reward_std": 0.3441820256412029, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 326 + }, + { + "completion_length": 112.7500057220459, + "epoch": 0.17503010839020475, + "grad_norm": 1.875, + "kl": 0.03724998049438, + "learning_rate": 4.91491530780414e-06, + "loss": 0.0015, + "reward": 3.145833432674408, + "reward_std": 0.6625833064317703, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 327 + }, + { + "completion_length": 136.5416717529297, + "epoch": 0.17556536866051117, + "grad_norm": 1.171875, + "kl": 0.029606230091303587, + "learning_rate": 4.913702537652634e-06, + "loss": 0.0012, + "reward": 3.25, + "reward_std": 0.273861289024353, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 328 + }, + { + "completion_length": 132.79166793823242, + "epoch": 0.1761006289308176, + "grad_norm": 1.3671875, + "kl": 0.03448170889168978, + "learning_rate": 4.912481337105827e-06, + "loss": 0.0014, + "reward": 3.1041666865348816, + "reward_std": 0.5164600908756256, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 329 + }, + { + "completion_length": 139.29166984558105, + "epoch": 0.17663588920112405, + "grad_norm": 2.0, + "kl": 0.03615036187693477, + "learning_rate": 4.911251710429034e-06, + "loss": 0.0014, + "reward": 2.3125000596046448, + "reward_std": 0.7174782603979111, + "rewards/correctness_reward_func": 0.833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 330 + }, + { + "completion_length": 109.29166793823242, + "epoch": 0.17717114947143048, + "grad_norm": 1.921875, + "kl": 0.03804372949525714, + "learning_rate": 4.910013661917004e-06, + "loss": 0.0015, + "reward": 2.9791667461395264, + "reward_std": 0.5674288682639599, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 331 + }, + { + "completion_length": 134.50000381469727, + "epoch": 0.17770640974173693, + "grad_norm": 0.95703125, + "kl": 0.051706235855817795, + "learning_rate": 4.908767195893894e-06, + "loss": 0.0021, + "reward": 3.274958372116089, + "reward_std": 0.36580392718315125, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4624583348631859, + "step": 332 + }, + { + "completion_length": 142.33333587646484, + "epoch": 0.17824167001204336, + "grad_norm": 2.078125, + "kl": 0.05041641462594271, + "learning_rate": 4.907512316713269e-06, + "loss": 0.002, + "reward": 2.020833373069763, + "reward_std": 0.6808377355337143, + "rewards/correctness_reward_func": 0.583333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 333 + }, + { + "completion_length": 145.45833587646484, + "epoch": 0.17877693028234978, + "grad_norm": 1.5390625, + "kl": 0.01903323526494205, + "learning_rate": 4.906249028758072e-06, + "loss": 0.0008, + "reward": 2.708333373069763, + "reward_std": 0.7714351117610931, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 334 + }, + { + "completion_length": 139.83333778381348, + "epoch": 0.17931219055265624, + "grad_norm": 2.203125, + "kl": 0.038641279097646475, + "learning_rate": 4.9049773364406185e-06, + "loss": 0.0015, + "reward": 2.958333432674408, + "reward_std": 0.8382464461028576, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 335 + }, + { + "completion_length": 149.8333396911621, + "epoch": 0.17984745082296266, + "grad_norm": 1.4296875, + "kl": 0.03192885918542743, + "learning_rate": 4.90369724420258e-06, + "loss": 0.0013, + "reward": 3.208333373069763, + "reward_std": 0.5020104348659515, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 336 + }, + { + "completion_length": 209.75000762939453, + "epoch": 0.18038271109326912, + "grad_norm": 0.69140625, + "kl": 0.02079350664280355, + "learning_rate": 4.902408756514964e-06, + "loss": 0.0008, + "reward": 2.8125, + "reward_std": 0.10458251088857651, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 337 + }, + { + "completion_length": 109.70833587646484, + "epoch": 0.18091797136357554, + "grad_norm": 1.359375, + "kl": 0.044482083059847355, + "learning_rate": 4.901111877878099e-06, + "loss": 0.0018, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 338 + }, + { + "completion_length": 119.83333778381348, + "epoch": 0.18145323163388197, + "grad_norm": 1.0, + "kl": 0.037186274304986, + "learning_rate": 4.899806612821626e-06, + "loss": 0.0015, + "reward": 3.2291666865348816, + "reward_std": 0.3001735806465149, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 339 + }, + { + "completion_length": 135.04166793823242, + "epoch": 0.18198849190418842, + "grad_norm": 2.140625, + "kl": 0.03643118590116501, + "learning_rate": 4.898492965904475e-06, + "loss": 0.0015, + "reward": 3.145833432674408, + "reward_std": 0.5953381061553955, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 340 + }, + { + "completion_length": 163.2916717529297, + "epoch": 0.18252375217449485, + "grad_norm": 1.0625, + "kl": 0.03810536675155163, + "learning_rate": 4.89717094171485e-06, + "loss": 0.0015, + "reward": 2.4375000596046448, + "reward_std": 0.11558076739311218, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 341 + }, + { + "completion_length": 110.87500381469727, + "epoch": 0.18305901244480127, + "grad_norm": 2.609375, + "kl": 0.08168017817661166, + "learning_rate": 4.8958405448702166e-06, + "loss": 0.0033, + "reward": 3.208333373069763, + "reward_std": 0.6582482904195786, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 342 + }, + { + "completion_length": 116.29166984558105, + "epoch": 0.18359427271510773, + "grad_norm": 0.8828125, + "kl": 0.030964480247348547, + "learning_rate": 4.894501780017281e-06, + "loss": 0.0012, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 343 + }, + { + "completion_length": 168.2916717529297, + "epoch": 0.18412953298541415, + "grad_norm": 1.6328125, + "kl": 0.03289065742865205, + "learning_rate": 4.893154651831982e-06, + "loss": 0.0013, + "reward": 2.7500000298023224, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 344 + }, + { + "completion_length": 129.58333778381348, + "epoch": 0.1846647932557206, + "grad_norm": 1.921875, + "kl": 0.03485443163663149, + "learning_rate": 4.891799165019462e-06, + "loss": 0.0014, + "reward": 2.958333432674408, + "reward_std": 0.8382464498281479, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 345 + }, + { + "completion_length": 120.33333396911621, + "epoch": 0.18520005352602703, + "grad_norm": 1.65625, + "kl": 0.04113559052348137, + "learning_rate": 4.890435324314064e-06, + "loss": 0.0016, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 346 + }, + { + "completion_length": 139.12500190734863, + "epoch": 0.18573531379633346, + "grad_norm": 2.71875, + "kl": 0.06854599853977561, + "learning_rate": 4.889063134479307e-06, + "loss": 0.0027, + "reward": 2.9375, + "reward_std": 0.770716518163681, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 347 + }, + { + "completion_length": 141.54166984558105, + "epoch": 0.1862705740666399, + "grad_norm": 0.94140625, + "kl": 0.02241353038698435, + "learning_rate": 4.887682600307868e-06, + "loss": 0.0009, + "reward": 3.208333373069763, + "reward_std": 0.33841101825237274, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 348 + }, + { + "completion_length": 114.79166984558105, + "epoch": 0.18680583433694634, + "grad_norm": 1.46875, + "kl": 0.03936735028401017, + "learning_rate": 4.886293726621572e-06, + "loss": 0.0016, + "reward": 2.895833373069763, + "reward_std": 0.25515517592430115, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 349 + }, + { + "completion_length": 118.91667175292969, + "epoch": 0.1873410946072528, + "grad_norm": 1.78125, + "kl": 0.04522312618792057, + "learning_rate": 4.884896518271371e-06, + "loss": 0.0018, + "reward": 2.833333373069763, + "reward_std": 0.40824827551841736, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 350 + }, + { + "completion_length": 135.08333778381348, + "epoch": 0.18787635487755922, + "grad_norm": 1.2734375, + "kl": 0.03455492667853832, + "learning_rate": 4.883490980137327e-06, + "loss": 0.0014, + "reward": 3.020833373069763, + "reward_std": 0.41942431032657623, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 351 + }, + { + "completion_length": 130.08333587646484, + "epoch": 0.18841161514786564, + "grad_norm": 2.734375, + "kl": 0.033602004405111074, + "learning_rate": 4.882077117128596e-06, + "loss": 0.0013, + "reward": 3.0000001192092896, + "reward_std": 0.9246461093425751, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 352 + }, + { + "completion_length": 150.91666793823242, + "epoch": 0.1889468754181721, + "grad_norm": 1.984375, + "kl": 0.031116609927266836, + "learning_rate": 4.88065493418341e-06, + "loss": 0.0012, + "reward": 2.6041666865348816, + "reward_std": 0.8001735806465149, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 353 + }, + { + "completion_length": 128.8333339691162, + "epoch": 0.18948213568847852, + "grad_norm": 1.734375, + "kl": 0.03071408625692129, + "learning_rate": 4.879224436269061e-06, + "loss": 0.0012, + "reward": 2.8333334028720856, + "reward_std": 0.6664472073316574, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 354 + }, + { + "completion_length": 179.3333396911621, + "epoch": 0.19001739595878495, + "grad_norm": 1.7265625, + "kl": 0.024944405537098646, + "learning_rate": 4.877785628381882e-06, + "loss": 0.001, + "reward": 2.995333433151245, + "reward_std": 0.6755668371915817, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666679084301, + "rewards/xmlcount_reward_func": 0.47450000047683716, + "step": 355 + }, + { + "completion_length": 173.66666793823242, + "epoch": 0.1905526562290914, + "grad_norm": 1.78125, + "kl": 0.03421852085739374, + "learning_rate": 4.8763385155472335e-06, + "loss": 0.0014, + "reward": 2.432291716337204, + "reward_std": 0.8869683742523193, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 356 + }, + { + "completion_length": 157.12500381469727, + "epoch": 0.19108791649939783, + "grad_norm": 2.125, + "kl": 0.04010986629873514, + "learning_rate": 4.874883102819477e-06, + "loss": 0.0016, + "reward": 3.020833432674408, + "reward_std": 0.6519949287176132, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 357 + }, + { + "completion_length": 165.7916717529297, + "epoch": 0.19162317676970428, + "grad_norm": 1.8203125, + "kl": 0.039810370188206434, + "learning_rate": 4.873419395281968e-06, + "loss": 0.0016, + "reward": 2.6875000596046448, + "reward_std": 0.623045951128006, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 358 + }, + { + "completion_length": 109.95833587646484, + "epoch": 0.1921584370400107, + "grad_norm": 1.4765625, + "kl": 0.04814850306138396, + "learning_rate": 4.871947398047031e-06, + "loss": 0.0019, + "reward": 3.3125000596046448, + "reward_std": 0.40438438951969147, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 359 + }, + { + "completion_length": 121.00000190734863, + "epoch": 0.19269369731031713, + "grad_norm": 2.6875, + "kl": 0.0602885982953012, + "learning_rate": 4.870467116255947e-06, + "loss": 0.0024, + "reward": 2.979166716337204, + "reward_std": 0.4592793136835098, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 360 + }, + { + "completion_length": 122.95833587646484, + "epoch": 0.19322895758062358, + "grad_norm": 1.9921875, + "kl": 0.044847925659269094, + "learning_rate": 4.86897855507893e-06, + "loss": 0.0018, + "reward": 2.895833432674408, + "reward_std": 0.7715530209243298, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 361 + }, + { + "completion_length": 154.25000190734863, + "epoch": 0.19376421785093, + "grad_norm": 1.8046875, + "kl": 0.025079205399379134, + "learning_rate": 4.867481719715112e-06, + "loss": 0.001, + "reward": 2.8695000410079956, + "reward_std": 0.31965839862823486, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4945000037550926, + "step": 362 + }, + { + "completion_length": 166.75000762939453, + "epoch": 0.19429947812123646, + "grad_norm": 1.421875, + "kl": 0.03631326276808977, + "learning_rate": 4.8659766153925244e-06, + "loss": 0.0015, + "reward": 3.020833373069763, + "reward_std": 0.5223132222890854, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 363 + }, + { + "completion_length": 160.00000762939453, + "epoch": 0.1948347383915429, + "grad_norm": 1.1484375, + "kl": 0.03517352696508169, + "learning_rate": 4.864463247368082e-06, + "loss": 0.0014, + "reward": 2.916666716337204, + "reward_std": 0.43686148524284363, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 364 + }, + { + "completion_length": 145.70833587646484, + "epoch": 0.19536999866184931, + "grad_norm": 2.203125, + "kl": 0.025485435500741005, + "learning_rate": 4.862941620927559e-06, + "loss": 0.001, + "reward": 2.7291667461395264, + "reward_std": 0.9913395643234253, + "rewards/correctness_reward_func": 1.2500000447034836, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 365 + }, + { + "completion_length": 103.95833396911621, + "epoch": 0.19590525893215577, + "grad_norm": 2.5625, + "kl": 0.05637668911367655, + "learning_rate": 4.861411741385578e-06, + "loss": 0.0023, + "reward": 3.3125000596046448, + "reward_std": 0.40438440442085266, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 366 + }, + { + "completion_length": 138.25000381469727, + "epoch": 0.1964405192024622, + "grad_norm": 2.28125, + "kl": 0.036815219558775425, + "learning_rate": 4.859873614085582e-06, + "loss": 0.0015, + "reward": 2.7916667461395264, + "reward_std": 0.7334393262863159, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 367 + }, + { + "completion_length": 123.00000190734863, + "epoch": 0.19697577947276865, + "grad_norm": 2.734375, + "kl": 0.06413549091666937, + "learning_rate": 4.8583272443998265e-06, + "loss": 0.0026, + "reward": 3.020833373069763, + "reward_std": 0.7092793434858322, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 368 + }, + { + "completion_length": 112.29166793823242, + "epoch": 0.19751103974307507, + "grad_norm": 0.107421875, + "kl": 0.04899565642699599, + "learning_rate": 4.856772637729352e-06, + "loss": 0.002, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 369 + }, + { + "completion_length": 134.91666793823242, + "epoch": 0.1980463000133815, + "grad_norm": 2.640625, + "kl": 0.08906554849818349, + "learning_rate": 4.8552097995039696e-06, + "loss": 0.0036, + "reward": 2.625000089406967, + "reward_std": 0.963577076792717, + "rewards/correctness_reward_func": 1.1666667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 370 + }, + { + "completion_length": 126.83333778381348, + "epoch": 0.19858156028368795, + "grad_norm": 0.88671875, + "kl": 0.02941274642944336, + "learning_rate": 4.853638735182241e-06, + "loss": 0.0012, + "reward": 3.0, + "reward_std": 0.19364917278289795, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 371 + }, + { + "completion_length": 126.54167175292969, + "epoch": 0.19911682055399438, + "grad_norm": 1.8515625, + "kl": 0.0400623818859458, + "learning_rate": 4.852059450251459e-06, + "loss": 0.0016, + "reward": 3.2291667461395264, + "reward_std": 0.5133540891110897, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 372 + }, + { + "completion_length": 148.29166793823242, + "epoch": 0.1996520808243008, + "grad_norm": 2.125, + "kl": 0.029276425717398524, + "learning_rate": 4.850471950227631e-06, + "loss": 0.0012, + "reward": 2.833333432674408, + "reward_std": 1.0537454932928085, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 373 + }, + { + "completion_length": 138.7500057220459, + "epoch": 0.20018734109460726, + "grad_norm": 0.71875, + "kl": 0.026868863962590694, + "learning_rate": 4.848876240655452e-06, + "loss": 0.0011, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 374 + }, + { + "completion_length": 152.9583396911621, + "epoch": 0.20072260136491368, + "grad_norm": 1.53125, + "kl": 0.027000244241207838, + "learning_rate": 4.847272327108298e-06, + "loss": 0.0011, + "reward": 2.9375000596046448, + "reward_std": 0.6154161691665649, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 375 + }, + { + "completion_length": 150.87500381469727, + "epoch": 0.20125786163522014, + "grad_norm": 1.6953125, + "kl": 0.03350049676373601, + "learning_rate": 4.845660215188192e-06, + "loss": 0.0013, + "reward": 2.5000000298023224, + "reward_std": 0.5425351560115814, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 376 + }, + { + "completion_length": 138.00000190734863, + "epoch": 0.20179312190552656, + "grad_norm": 1.6328125, + "kl": 0.03322783159092069, + "learning_rate": 4.844039910525797e-06, + "loss": 0.0013, + "reward": 2.583333432674408, + "reward_std": 0.6289348416030407, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 377 + }, + { + "completion_length": 146.33333587646484, + "epoch": 0.202328382175833, + "grad_norm": 1.359375, + "kl": 0.03654646477662027, + "learning_rate": 4.8424114187803885e-06, + "loss": 0.0015, + "reward": 2.8125000596046448, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 378 + }, + { + "completion_length": 161.54166984558105, + "epoch": 0.20286364244613944, + "grad_norm": 1.90625, + "kl": 0.032768722623586655, + "learning_rate": 4.8407747456398365e-06, + "loss": 0.0013, + "reward": 3.0000000596046448, + "reward_std": 0.5796062797307968, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 379 + }, + { + "completion_length": 135.62500190734863, + "epoch": 0.20339890271644587, + "grad_norm": 0.9453125, + "kl": 0.0361147103831172, + "learning_rate": 4.83912989682059e-06, + "loss": 0.0014, + "reward": 3.2291666865348816, + "reward_std": 0.3001735806465149, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 380 + }, + { + "completion_length": 123.00000190734863, + "epoch": 0.20393416298675232, + "grad_norm": 2.484375, + "kl": 0.030705954413861036, + "learning_rate": 4.837476878067649e-06, + "loss": 0.0012, + "reward": 3.2009167671203613, + "reward_std": 0.6777066141366959, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.49258333444595337, + "step": 381 + }, + { + "completion_length": 139.33333587646484, + "epoch": 0.20446942325705875, + "grad_norm": 1.140625, + "kl": 0.03536796988919377, + "learning_rate": 4.8358156951545515e-06, + "loss": 0.0014, + "reward": 3.333333373069763, + "reward_std": 0.2686738818883896, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 382 + }, + { + "completion_length": 151.5833396911621, + "epoch": 0.20500468352736517, + "grad_norm": 1.453125, + "kl": 0.02349434932693839, + "learning_rate": 4.834146353883349e-06, + "loss": 0.0009, + "reward": 2.6875000596046448, + "reward_std": 0.5792608670890331, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.37500000558793545, + "rewards/xmlcount_reward_func": 0.5, + "step": 383 + }, + { + "completion_length": 140.75000381469727, + "epoch": 0.20553994379767163, + "grad_norm": 1.8203125, + "kl": 0.02586292941123247, + "learning_rate": 4.832468860084591e-06, + "loss": 0.001, + "reward": 2.708333373069763, + "reward_std": 0.6942067444324493, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 384 + }, + { + "completion_length": 169.2083396911621, + "epoch": 0.20607520406797805, + "grad_norm": 1.9765625, + "kl": 0.040242417715489864, + "learning_rate": 4.830783219617296e-06, + "loss": 0.0016, + "reward": 2.7291667759418488, + "reward_std": 0.9853319078683853, + "rewards/correctness_reward_func": 1.3333333879709244, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 385 + }, + { + "completion_length": 163.58333587646484, + "epoch": 0.20661046433828448, + "grad_norm": 1.28125, + "kl": 0.039259279146790504, + "learning_rate": 4.829089438368944e-06, + "loss": 0.0016, + "reward": 2.9791666865348816, + "reward_std": 0.6364961266517639, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 386 + }, + { + "completion_length": 142.4166717529297, + "epoch": 0.20714572460859093, + "grad_norm": 1.59375, + "kl": 0.027523174416273832, + "learning_rate": 4.82738752225544e-06, + "loss": 0.0011, + "reward": 2.4375000596046448, + "reward_std": 0.7460914701223373, + "rewards/correctness_reward_func": 1.0833333507180214, + "rewards/int_reward_func": 0.39583333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 387 + }, + { + "completion_length": 154.37500381469727, + "epoch": 0.20768098487889736, + "grad_norm": 1.6875, + "kl": 0.05612138286232948, + "learning_rate": 4.825677477221109e-06, + "loss": 0.0022, + "reward": 2.4166667461395264, + "reward_std": 0.16661180183291435, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 388 + }, + { + "completion_length": 160.5000057220459, + "epoch": 0.2082162451492038, + "grad_norm": 1.8984375, + "kl": 0.04402688471600413, + "learning_rate": 4.823959309238665e-06, + "loss": 0.0018, + "reward": 2.395833373069763, + "reward_std": 0.6816265136003494, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 0.5, + "step": 389 + }, + { + "completion_length": 140.50000190734863, + "epoch": 0.20875150541951024, + "grad_norm": 1.7109375, + "kl": 0.034618140663951635, + "learning_rate": 4.822233024309193e-06, + "loss": 0.0014, + "reward": 2.708333373069763, + "reward_std": 0.37592335790395737, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 390 + }, + { + "completion_length": 113.45833587646484, + "epoch": 0.20928676568981666, + "grad_norm": 0.83203125, + "kl": 0.02321735117584467, + "learning_rate": 4.820498628462129e-06, + "loss": 0.0009, + "reward": 2.9166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 391 + }, + { + "completion_length": 123.66666984558105, + "epoch": 0.20982202596012312, + "grad_norm": 1.9296875, + "kl": 0.026443745708093047, + "learning_rate": 4.8187561277552376e-06, + "loss": 0.0011, + "reward": 3.2847084403038025, + "reward_std": 0.37730535864830017, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.49304167181253433, + "step": 392 + }, + { + "completion_length": 146.54166793823242, + "epoch": 0.21035728623042954, + "grad_norm": 2.046875, + "kl": 0.050067766569554806, + "learning_rate": 4.8170055282745915e-06, + "loss": 0.002, + "reward": 2.5000000596046448, + "reward_std": 0.6325703375041485, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 393 + }, + { + "completion_length": 143.58333587646484, + "epoch": 0.210892546500736, + "grad_norm": 1.9140625, + "kl": 0.04486254137009382, + "learning_rate": 4.815246836134551e-06, + "loss": 0.0018, + "reward": 2.8750000596046448, + "reward_std": 0.8515234887599945, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 394 + }, + { + "completion_length": 161.75000381469727, + "epoch": 0.21142780677104242, + "grad_norm": 2.453125, + "kl": 0.027609082404524088, + "learning_rate": 4.8134800574777415e-06, + "loss": 0.0011, + "reward": 2.8281250596046448, + "reward_std": 0.8165152966976166, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 395 + }, + { + "completion_length": 164.70833587646484, + "epoch": 0.21196306704134885, + "grad_norm": 1.625, + "kl": 0.053220887668430805, + "learning_rate": 4.811705198475032e-06, + "loss": 0.0021, + "reward": 3.145833373069763, + "reward_std": 0.5137150399386883, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.33333334140479565, + "rewards/xmlcount_reward_func": 0.5, + "step": 396 + }, + { + "completion_length": 140.75000381469727, + "epoch": 0.2124983273116553, + "grad_norm": 1.09375, + "kl": 0.03010843973606825, + "learning_rate": 4.809922265325513e-06, + "loss": 0.0012, + "reward": 2.7291666865348816, + "reward_std": 0.35721729323267937, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 397 + }, + { + "completion_length": 136.33333778381348, + "epoch": 0.21303358758196173, + "grad_norm": 1.3828125, + "kl": 0.027536609675735235, + "learning_rate": 4.808131264256479e-06, + "loss": 0.0011, + "reward": 2.645833373069763, + "reward_std": 0.5042977333068848, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 398 + }, + { + "completion_length": 153.8333396911621, + "epoch": 0.21356884785226818, + "grad_norm": 1.390625, + "kl": 0.03445200156420469, + "learning_rate": 4.806332201523399e-06, + "loss": 0.0014, + "reward": 2.708333373069763, + "reward_std": 0.564385175704956, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 399 + }, + { + "completion_length": 130.9166717529297, + "epoch": 0.2141041081225746, + "grad_norm": 1.8046875, + "kl": 0.055761674884706736, + "learning_rate": 4.804525083409902e-06, + "loss": 0.0022, + "reward": 3.3750000596046448, + "reward_std": 0.3061862215399742, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 400 + }, + { + "completion_length": 156.5833396911621, + "epoch": 0.21463936839288103, + "grad_norm": 1.796875, + "kl": 0.028924104291945696, + "learning_rate": 4.802709916227753e-06, + "loss": 0.0012, + "reward": 3.2916667461395264, + "reward_std": 0.3347994163632393, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 401 + }, + { + "completion_length": 98.45833587646484, + "epoch": 0.21517462866318748, + "grad_norm": 2.046875, + "kl": 0.07245343318209052, + "learning_rate": 4.800886706316828e-06, + "loss": 0.0029, + "reward": 3.208333373069763, + "reward_std": 0.5483061634004116, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 402 + }, + { + "completion_length": 154.0000057220459, + "epoch": 0.2157098889334939, + "grad_norm": 1.453125, + "kl": 0.02168190269730985, + "learning_rate": 4.7990554600450945e-06, + "loss": 0.0009, + "reward": 2.8750000298023224, + "reward_std": 0.4854898750782013, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 403 + }, + { + "completion_length": 132.45833587646484, + "epoch": 0.21624514920380034, + "grad_norm": 1.90625, + "kl": 0.033673313446342945, + "learning_rate": 4.79721618380859e-06, + "loss": 0.0013, + "reward": 2.65625, + "reward_std": 0.4188731759786606, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 404 + }, + { + "completion_length": 130.37500381469727, + "epoch": 0.2167804094741068, + "grad_norm": 1.1328125, + "kl": 0.026179906912148, + "learning_rate": 4.795368884031397e-06, + "loss": 0.001, + "reward": 3.2291666865348816, + "reward_std": 0.3248923234641552, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 405 + }, + { + "completion_length": 135.83334159851074, + "epoch": 0.21731566974441321, + "grad_norm": 1.90625, + "kl": 0.043639494106173515, + "learning_rate": 4.793513567165623e-06, + "loss": 0.0017, + "reward": 2.645833373069763, + "reward_std": 0.5155290961265564, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 406 + }, + { + "completion_length": 150.3333396911621, + "epoch": 0.21785093001471967, + "grad_norm": 1.28125, + "kl": 0.028193223755806684, + "learning_rate": 4.791650239691377e-06, + "loss": 0.0011, + "reward": 3.145833373069763, + "reward_std": 0.5290164947509766, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 407 + }, + { + "completion_length": 139.62500762939453, + "epoch": 0.2183861902850261, + "grad_norm": 1.1796875, + "kl": 0.04055926762521267, + "learning_rate": 4.7897789081167444e-06, + "loss": 0.0016, + "reward": 3.020833373069763, + "reward_std": 0.22181354090571404, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 408 + }, + { + "completion_length": 137.7083396911621, + "epoch": 0.21892145055533252, + "grad_norm": 1.5234375, + "kl": 0.03536925697699189, + "learning_rate": 4.787899578977772e-06, + "loss": 0.0014, + "reward": 2.6666666865348816, + "reward_std": 0.4518480896949768, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 409 + }, + { + "completion_length": 152.33333587646484, + "epoch": 0.21945671082563897, + "grad_norm": 1.96875, + "kl": 0.044180845376104116, + "learning_rate": 4.786012258838433e-06, + "loss": 0.0018, + "reward": 3.0000000596046448, + "reward_std": 0.48101906478405, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 410 + }, + { + "completion_length": 155.33333587646484, + "epoch": 0.2199919710959454, + "grad_norm": 1.921875, + "kl": 0.022907113656401634, + "learning_rate": 4.784116954290618e-06, + "loss": 0.0009, + "reward": 2.8750000596046448, + "reward_std": 0.7123230546712875, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 411 + }, + { + "completion_length": 136.7916717529297, + "epoch": 0.22052723136625185, + "grad_norm": 2.578125, + "kl": 0.05602648947387934, + "learning_rate": 4.782213671954099e-06, + "loss": 0.0022, + "reward": 2.708333373069763, + "reward_std": 0.5020104050636292, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 412 + }, + { + "completion_length": 172.08333587646484, + "epoch": 0.22106249163655828, + "grad_norm": 1.2265625, + "kl": 0.018192848656326532, + "learning_rate": 4.780302418476516e-06, + "loss": 0.0007, + "reward": 3.208333373069763, + "reward_std": 0.5020104013383389, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 413 + }, + { + "completion_length": 151.29167366027832, + "epoch": 0.2215977519068647, + "grad_norm": 1.0703125, + "kl": 0.04675913043320179, + "learning_rate": 4.778383200533349e-06, + "loss": 0.0019, + "reward": 2.9791666865348816, + "reward_std": 0.2601960562169552, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 414 + }, + { + "completion_length": 179.75000381469727, + "epoch": 0.22213301217717116, + "grad_norm": 1.640625, + "kl": 0.03261849144473672, + "learning_rate": 4.776456024827895e-06, + "loss": 0.0013, + "reward": 2.4375000596046448, + "reward_std": 0.8668341487646103, + "rewards/correctness_reward_func": 1.166666679084301, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.33333334140479565, + "rewards/xmlcount_reward_func": 0.5, + "step": 415 + }, + { + "completion_length": 165.37500381469727, + "epoch": 0.22266827244747758, + "grad_norm": 1.4453125, + "kl": 0.034663321916013956, + "learning_rate": 4.774520898091244e-06, + "loss": 0.0014, + "reward": 2.8125000596046448, + "reward_std": 0.5357958674430847, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.3750000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 416 + }, + { + "completion_length": 148.0416717529297, + "epoch": 0.223203532717784, + "grad_norm": 2.09375, + "kl": 0.038772601168602705, + "learning_rate": 4.772577827082261e-06, + "loss": 0.0016, + "reward": 2.687500089406967, + "reward_std": 0.7555890195071697, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 417 + }, + { + "completion_length": 135.08333778381348, + "epoch": 0.22373879298809046, + "grad_norm": 1.7109375, + "kl": 0.017822970170527697, + "learning_rate": 4.770626818587554e-06, + "loss": 0.0007, + "reward": 2.5416666865348816, + "reward_std": 0.2686738669872284, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 418 + }, + { + "completion_length": 207.33333778381348, + "epoch": 0.2242740532583969, + "grad_norm": 3.328125, + "kl": 0.03107347432523966, + "learning_rate": 4.768667879421457e-06, + "loss": 0.0012, + "reward": 2.3177084177732468, + "reward_std": 0.7329771295189857, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.354166679084301, + "rewards/xmlcount_reward_func": 0.484375, + "step": 419 + }, + { + "completion_length": 174.8333396911621, + "epoch": 0.22480931352870334, + "grad_norm": 1.9921875, + "kl": 0.05148722976446152, + "learning_rate": 4.7667010164260016e-06, + "loss": 0.0021, + "reward": 2.452416777610779, + "reward_std": 0.6645737141370773, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.354166679084301, + "rewards/xmlcount_reward_func": 0.49408333748579025, + "step": 420 + }, + { + "completion_length": 153.12500381469727, + "epoch": 0.22534457379900977, + "grad_norm": 1.8359375, + "kl": 0.0318888071924448, + "learning_rate": 4.764726236470897e-06, + "loss": 0.0013, + "reward": 2.604166716337204, + "reward_std": 0.6094112247228622, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 421 + }, + { + "completion_length": 151.2083396911621, + "epoch": 0.2258798340693162, + "grad_norm": 1.578125, + "kl": 0.03988643130287528, + "learning_rate": 4.762743546453503e-06, + "loss": 0.0016, + "reward": 2.458333373069763, + "reward_std": 0.556085180491209, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 422 + }, + { + "completion_length": 104.75000381469727, + "epoch": 0.22641509433962265, + "grad_norm": 1.6953125, + "kl": 0.02869172114878893, + "learning_rate": 4.760752953298807e-06, + "loss": 0.0011, + "reward": 3.0625000298023224, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 423 + }, + { + "completion_length": 148.50000762939453, + "epoch": 0.22695035460992907, + "grad_norm": 1.7109375, + "kl": 0.028270404785871506, + "learning_rate": 4.758754463959401e-06, + "loss": 0.0011, + "reward": 2.2850833535194397, + "reward_std": 0.49467696249485016, + "rewards/correctness_reward_func": 0.916666679084301, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4934166669845581, + "step": 424 + }, + { + "completion_length": 131.50000190734863, + "epoch": 0.22748561488023553, + "grad_norm": 1.359375, + "kl": 0.024721851106733084, + "learning_rate": 4.756748085415455e-06, + "loss": 0.001, + "reward": 3.083333373069763, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 425 + }, + { + "completion_length": 148.7916717529297, + "epoch": 0.22802087515054195, + "grad_norm": 1.3046875, + "kl": 0.026703921612352133, + "learning_rate": 4.754733824674694e-06, + "loss": 0.0011, + "reward": 2.6666666865348816, + "reward_std": 0.48720720410346985, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 426 + }, + { + "completion_length": 179.00000762939453, + "epoch": 0.22855613542084838, + "grad_norm": 1.7109375, + "kl": 0.034272789023816586, + "learning_rate": 4.752711688772375e-06, + "loss": 0.0014, + "reward": 2.577833414077759, + "reward_std": 0.4778827279806137, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.4945000037550926, + "step": 427 + }, + { + "completion_length": 124.58333969116211, + "epoch": 0.22909139569115483, + "grad_norm": 1.765625, + "kl": 0.039995139464735985, + "learning_rate": 4.750681684771257e-06, + "loss": 0.0016, + "reward": 3.020833373069763, + "reward_std": 0.6677707135677338, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 428 + }, + { + "completion_length": 157.37500381469727, + "epoch": 0.22962665596146126, + "grad_norm": 1.390625, + "kl": 0.023208866827189922, + "learning_rate": 4.748643819761585e-06, + "loss": 0.0009, + "reward": 3.208333373069763, + "reward_std": 0.48936041817069054, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 429 + }, + { + "completion_length": 123.29167175292969, + "epoch": 0.2301619162317677, + "grad_norm": 1.9140625, + "kl": 0.08427455788478255, + "learning_rate": 4.7465981008610555e-06, + "loss": 0.0034, + "reward": 3.208333373069763, + "reward_std": 0.35120461508631706, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 430 + }, + { + "completion_length": 142.08333587646484, + "epoch": 0.23069717650207414, + "grad_norm": 1.65625, + "kl": 0.03192599257454276, + "learning_rate": 4.7445445352148e-06, + "loss": 0.0013, + "reward": 3.1875000596046448, + "reward_std": 0.37377963587641716, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 431 + }, + { + "completion_length": 152.2916717529297, + "epoch": 0.23123243677238056, + "grad_norm": 1.296875, + "kl": 0.023425794672220945, + "learning_rate": 4.742483129995355e-06, + "loss": 0.0009, + "reward": 2.8750000596046448, + "reward_std": 0.25129128992557526, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 432 + }, + { + "completion_length": 202.2916717529297, + "epoch": 0.23176769704268702, + "grad_norm": 1.6328125, + "kl": 0.027119331993162632, + "learning_rate": 4.740413892402639e-06, + "loss": 0.0011, + "reward": 2.7500000596046448, + "reward_std": 0.6782792210578918, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.35416667722165585, + "rewards/xmlcount_reward_func": 0.5, + "step": 433 + }, + { + "completion_length": 162.50000190734863, + "epoch": 0.23230295731299344, + "grad_norm": 1.8671875, + "kl": 0.022649120073765516, + "learning_rate": 4.738336829663926e-06, + "loss": 0.0009, + "reward": 2.520833432674408, + "reward_std": 0.5779038220643997, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 434 + }, + { + "completion_length": 113.7500057220459, + "epoch": 0.23283821758329987, + "grad_norm": 1.7265625, + "kl": 0.0458745863288641, + "learning_rate": 4.736251949033823e-06, + "loss": 0.0018, + "reward": 3.1875, + "reward_std": 0.5431509912014008, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 435 + }, + { + "completion_length": 164.83333778381348, + "epoch": 0.23337347785360632, + "grad_norm": 1.7265625, + "kl": 0.03575094696134329, + "learning_rate": 4.734159257794239e-06, + "loss": 0.0014, + "reward": 2.7760417461395264, + "reward_std": 0.8451508581638336, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 436 + }, + { + "completion_length": 128.9583339691162, + "epoch": 0.23390873812391275, + "grad_norm": 1.859375, + "kl": 0.03346340823918581, + "learning_rate": 4.732058763254368e-06, + "loss": 0.0013, + "reward": 3.208333432674408, + "reward_std": 0.5094902031123638, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 437 + }, + { + "completion_length": 162.37500381469727, + "epoch": 0.2344439983942192, + "grad_norm": 2.328125, + "kl": 0.027165093226358294, + "learning_rate": 4.729950472750654e-06, + "loss": 0.0011, + "reward": 3.0000001192092896, + "reward_std": 0.8066200762987137, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666865348816, + "rewards/xmlcount_reward_func": 0.5, + "step": 438 + }, + { + "completion_length": 131.62500190734863, + "epoch": 0.23497925866452563, + "grad_norm": 1.8671875, + "kl": 0.07023448962718248, + "learning_rate": 4.7278343936467745e-06, + "loss": 0.0028, + "reward": 2.833333432674408, + "reward_std": 0.6454972475767136, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 439 + }, + { + "completion_length": 160.9166717529297, + "epoch": 0.23551451893483205, + "grad_norm": 1.5390625, + "kl": 0.04344167560338974, + "learning_rate": 4.725710533333608e-06, + "loss": 0.0017, + "reward": 3.0416666865348816, + "reward_std": 0.7441303730010986, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 440 + }, + { + "completion_length": 120.33333587646484, + "epoch": 0.2360497792051385, + "grad_norm": 2.046875, + "kl": 0.02986164903268218, + "learning_rate": 4.72357889922921e-06, + "loss": 0.0012, + "reward": 3.145833373069763, + "reward_std": 0.5042977184057236, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 441 + }, + { + "completion_length": 141.16666793823242, + "epoch": 0.23658503947544493, + "grad_norm": 1.4453125, + "kl": 0.025641448330134153, + "learning_rate": 4.72143949877879e-06, + "loss": 0.001, + "reward": 3.3125000596046448, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 442 + }, + { + "completion_length": 128.12500190734863, + "epoch": 0.23712029974575138, + "grad_norm": 1.0234375, + "kl": 0.03461711807176471, + "learning_rate": 4.719292339454682e-06, + "loss": 0.0014, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 443 + }, + { + "completion_length": 125.91666984558105, + "epoch": 0.2376555600160578, + "grad_norm": 1.8984375, + "kl": 0.03600315935909748, + "learning_rate": 4.71713742875632e-06, + "loss": 0.0014, + "reward": 2.833333373069763, + "reward_std": 0.40824827551841736, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 444 + }, + { + "completion_length": 123.54166984558105, + "epoch": 0.23819082028636424, + "grad_norm": 1.890625, + "kl": 0.05057946778833866, + "learning_rate": 4.714974774210209e-06, + "loss": 0.002, + "reward": 2.895833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 445 + }, + { + "completion_length": 139.8333396911621, + "epoch": 0.2387260805566707, + "grad_norm": 1.8125, + "kl": 0.04080198332667351, + "learning_rate": 4.712804383369905e-06, + "loss": 0.0016, + "reward": 2.770833432674408, + "reward_std": 0.45155154168605804, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 446 + }, + { + "completion_length": 134.375, + "epoch": 0.23926134082697711, + "grad_norm": 2.703125, + "kl": 0.030578581616282463, + "learning_rate": 4.710626263815982e-06, + "loss": 0.0012, + "reward": 2.958333373069763, + "reward_std": 0.9482545256614685, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 447 + }, + { + "completion_length": 144.37500381469727, + "epoch": 0.23979660109728354, + "grad_norm": 1.0625, + "kl": 0.03926865756511688, + "learning_rate": 4.7084404231560085e-06, + "loss": 0.0016, + "reward": 3.083333343267441, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 448 + }, + { + "completion_length": 129.58333587646484, + "epoch": 0.24033186136759, + "grad_norm": 2.34375, + "kl": 0.02997904270887375, + "learning_rate": 4.706246869024523e-06, + "loss": 0.0012, + "reward": 3.1197916865348816, + "reward_std": 0.49308109283447266, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 449 + }, + { + "completion_length": 121.41667175292969, + "epoch": 0.24086712163789642, + "grad_norm": 1.3203125, + "kl": 0.041862784419208765, + "learning_rate": 4.7040456090830015e-06, + "loss": 0.0017, + "reward": 2.979166716337204, + "reward_std": 0.25515517592430115, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.39583333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 450 + }, + { + "completion_length": 130.6250057220459, + "epoch": 0.24140238190820287, + "grad_norm": 2.21875, + "kl": 0.04182751849293709, + "learning_rate": 4.701836651019838e-06, + "loss": 0.0017, + "reward": 2.6875000596046448, + "reward_std": 0.7292841225862503, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 451 + }, + { + "completion_length": 126.20833778381348, + "epoch": 0.2419376421785093, + "grad_norm": 1.5625, + "kl": 0.06659243628382683, + "learning_rate": 4.69962000255031e-06, + "loss": 0.0027, + "reward": 3.1406250596046448, + "reward_std": 0.41231444105505943, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 452 + }, + { + "completion_length": 128.16666984558105, + "epoch": 0.24247290244881572, + "grad_norm": 1.7265625, + "kl": 0.025568378157913685, + "learning_rate": 4.697395671416559e-06, + "loss": 0.001, + "reward": 2.833333373069763, + "reward_std": 0.7361843436956406, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 453 + }, + { + "completion_length": 168.0416717529297, + "epoch": 0.24300816271912218, + "grad_norm": 1.890625, + "kl": 0.03563910163938999, + "learning_rate": 4.6951636653875576e-06, + "loss": 0.0014, + "reward": 2.5625000596046448, + "reward_std": 0.6485128216445446, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 454 + }, + { + "completion_length": 160.70833587646484, + "epoch": 0.2435434229894286, + "grad_norm": 1.9375, + "kl": 0.03328033583238721, + "learning_rate": 4.6929239922590856e-06, + "loss": 0.0013, + "reward": 2.708333373069763, + "reward_std": 0.6793645471334457, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 455 + }, + { + "completion_length": 158.08333778381348, + "epoch": 0.24407868325973506, + "grad_norm": 2.125, + "kl": 0.045512993820011616, + "learning_rate": 4.690676659853702e-06, + "loss": 0.0018, + "reward": 2.520833432674408, + "reward_std": 1.0850803554058075, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 456 + }, + { + "completion_length": 132.00000381469727, + "epoch": 0.24461394353004148, + "grad_norm": 1.421875, + "kl": 0.047892688773572445, + "learning_rate": 4.688421676020717e-06, + "loss": 0.0019, + "reward": 3.395833373069763, + "reward_std": 0.25515519082546234, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 457 + }, + { + "completion_length": 129.75000381469727, + "epoch": 0.2451492038003479, + "grad_norm": 1.9140625, + "kl": 0.02556005073711276, + "learning_rate": 4.686159048636165e-06, + "loss": 0.001, + "reward": 2.895833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 458 + }, + { + "completion_length": 121.41666984558105, + "epoch": 0.24568446407065436, + "grad_norm": 2.546875, + "kl": 0.047415590612217784, + "learning_rate": 4.683888785602778e-06, + "loss": 0.0019, + "reward": 2.7916667461395264, + "reward_std": 1.0333142131567001, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 459 + }, + { + "completion_length": 135.6250057220459, + "epoch": 0.2462197243409608, + "grad_norm": 1.875, + "kl": 0.05515039339661598, + "learning_rate": 4.681610894849957e-06, + "loss": 0.0022, + "reward": 2.9791667461395264, + "reward_std": 0.7915693670511246, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 460 + }, + { + "completion_length": 136.37500762939453, + "epoch": 0.24675498461126724, + "grad_norm": 2.015625, + "kl": 0.02769710123538971, + "learning_rate": 4.679325384333744e-06, + "loss": 0.0011, + "reward": 3.2291667461395264, + "reward_std": 0.5133541077375412, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 461 + }, + { + "completion_length": 138.70833587646484, + "epoch": 0.24729024488157367, + "grad_norm": 1.03125, + "kl": 0.029878970235586166, + "learning_rate": 4.677032262036794e-06, + "loss": 0.0012, + "reward": 3.145833373069763, + "reward_std": 0.5513499081134796, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 462 + }, + { + "completion_length": 175.41667556762695, + "epoch": 0.2478255051518801, + "grad_norm": 2.078125, + "kl": 0.042299624998122454, + "learning_rate": 4.674731535968351e-06, + "loss": 0.0017, + "reward": 2.708333373069763, + "reward_std": 0.7748701274394989, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 463 + }, + { + "completion_length": 220.66667556762695, + "epoch": 0.24836076542218655, + "grad_norm": 1.671875, + "kl": 0.038636271841824055, + "learning_rate": 4.6724232141642135e-06, + "loss": 0.0015, + "reward": 2.6927084028720856, + "reward_std": 0.5764480978250504, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250000558793545, + "rewards/xmlcount_reward_func": 0.484375, + "step": 464 + }, + { + "completion_length": 129.7916717529297, + "epoch": 0.24889602569249297, + "grad_norm": 1.515625, + "kl": 0.029603436589241028, + "learning_rate": 4.6701073046867106e-06, + "loss": 0.0012, + "reward": 2.9791667461395264, + "reward_std": 0.6551453582942486, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 465 + }, + { + "completion_length": 159.25000381469727, + "epoch": 0.2494312859627994, + "grad_norm": 2.265625, + "kl": 0.02957107638940215, + "learning_rate": 4.667783815624675e-06, + "loss": 0.0012, + "reward": 2.437500089406967, + "reward_std": 1.0319268852472305, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 466 + }, + { + "completion_length": 157.62500190734863, + "epoch": 0.24996654623310585, + "grad_norm": 1.5234375, + "kl": 0.03178174514323473, + "learning_rate": 4.66545275509341e-06, + "loss": 0.0013, + "reward": 3.0416667461395264, + "reward_std": 0.5337304323911667, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 467 + }, + { + "completion_length": 192.2083396911621, + "epoch": 0.2505018065034123, + "grad_norm": 1.3046875, + "kl": 0.033401607535779476, + "learning_rate": 4.663114131234666e-06, + "loss": 0.0013, + "reward": 2.479166716337204, + "reward_std": 0.7958995848894119, + "rewards/correctness_reward_func": 1.0833333507180214, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 468 + }, + { + "completion_length": 115.79166984558105, + "epoch": 0.25103706677371873, + "grad_norm": 1.703125, + "kl": 0.03035385813564062, + "learning_rate": 4.6607679522166085e-06, + "loss": 0.0012, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 469 + }, + { + "completion_length": 135.91666984558105, + "epoch": 0.25157232704402516, + "grad_norm": 2.09375, + "kl": 0.03181264130398631, + "learning_rate": 4.658414226233792e-06, + "loss": 0.0013, + "reward": 3.208333432674408, + "reward_std": 0.6595396101474762, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 470 + }, + { + "completion_length": 118.70833587646484, + "epoch": 0.2521075873143316, + "grad_norm": 1.5703125, + "kl": 0.038648287765681744, + "learning_rate": 4.656052961507131e-06, + "loss": 0.0015, + "reward": 3.145833373069763, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 471 + }, + { + "completion_length": 220.6250057220459, + "epoch": 0.252642847584638, + "grad_norm": 1.7265625, + "kl": 0.025634529069066048, + "learning_rate": 4.653684166283869e-06, + "loss": 0.001, + "reward": 2.2812500298023224, + "reward_std": 0.7136656455695629, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.46875, + "step": 472 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.2531781078549445, + "grad_norm": 0.65625, + "kl": 0.056579564698040485, + "learning_rate": 4.651307848837553e-06, + "loss": 0.0023, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 473 + }, + { + "completion_length": 138.66666793823242, + "epoch": 0.2537133681252509, + "grad_norm": 1.2265625, + "kl": 0.04535471182316542, + "learning_rate": 4.648924017468003e-06, + "loss": 0.0018, + "reward": 3.1666667461395264, + "reward_std": 0.34591545164585114, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 474 + }, + { + "completion_length": 190.79166984558105, + "epoch": 0.25424862839555734, + "grad_norm": 1.15625, + "kl": 0.01769639761187136, + "learning_rate": 4.646532680501282e-06, + "loss": 0.0007, + "reward": 2.875, + "reward_std": 0.5589051842689514, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.39583333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 475 + }, + { + "completion_length": 134.8750057220459, + "epoch": 0.25478388866586377, + "grad_norm": 1.109375, + "kl": 0.042043234687298536, + "learning_rate": 4.644133846289669e-06, + "loss": 0.0017, + "reward": 3.3125000596046448, + "reward_std": 0.3092299550771713, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 476 + }, + { + "completion_length": 171.7916717529297, + "epoch": 0.2553191489361702, + "grad_norm": 1.296875, + "kl": 0.03471332183107734, + "learning_rate": 4.641727523211627e-06, + "loss": 0.0014, + "reward": 3.208333373069763, + "reward_std": 0.306186243891716, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 477 + }, + { + "completion_length": 140.16666793823242, + "epoch": 0.2558544092064767, + "grad_norm": 1.6640625, + "kl": 0.032892528688535094, + "learning_rate": 4.6393137196717785e-06, + "loss": 0.0013, + "reward": 3.0625000596046448, + "reward_std": 0.6782456934452057, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 478 + }, + { + "completion_length": 146.08333778381348, + "epoch": 0.2563896694767831, + "grad_norm": 1.328125, + "kl": 0.03279207367449999, + "learning_rate": 4.63689244410087e-06, + "loss": 0.0013, + "reward": 3.1666666865348816, + "reward_std": 0.5933245718479156, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 479 + }, + { + "completion_length": 156.62500381469727, + "epoch": 0.2569249297470895, + "grad_norm": 1.484375, + "kl": 0.02493216237053275, + "learning_rate": 4.6344637049557495e-06, + "loss": 0.001, + "reward": 3.0416667461395264, + "reward_std": 0.5643851570785046, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 480 + }, + { + "completion_length": 140.91666984558105, + "epoch": 0.25746019001739595, + "grad_norm": 2.109375, + "kl": 0.05418694904074073, + "learning_rate": 4.632027510719329e-06, + "loss": 0.0022, + "reward": 3.2500000596046448, + "reward_std": 0.556186206638813, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 481 + }, + { + "completion_length": 151.50000381469727, + "epoch": 0.2579954502877024, + "grad_norm": 1.671875, + "kl": 0.043965504970401525, + "learning_rate": 4.629583869900562e-06, + "loss": 0.0018, + "reward": 3.0416667461395264, + "reward_std": 0.618552640080452, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 482 + }, + { + "completion_length": 149.75000381469727, + "epoch": 0.25853071055800886, + "grad_norm": 1.0546875, + "kl": 0.025529312435537577, + "learning_rate": 4.627132791034411e-06, + "loss": 0.001, + "reward": 3.270833373069763, + "reward_std": 0.16614501178264618, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.39583333395421505, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 483 + }, + { + "completion_length": 166.50000381469727, + "epoch": 0.2590659708283153, + "grad_norm": 1.3984375, + "kl": 0.028888692380860448, + "learning_rate": 4.624674282681814e-06, + "loss": 0.0012, + "reward": 2.8541666865348816, + "reward_std": 0.5258883386850357, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 484 + }, + { + "completion_length": 171.62500762939453, + "epoch": 0.2596012310986217, + "grad_norm": 1.53125, + "kl": 0.01867962582036853, + "learning_rate": 4.622208353429661e-06, + "loss": 0.0007, + "reward": 2.7500000596046448, + "reward_std": 0.7895646393299103, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 485 + }, + { + "completion_length": 124.7500057220459, + "epoch": 0.26013649136892814, + "grad_norm": 1.6015625, + "kl": 0.04171385709196329, + "learning_rate": 4.619735011890763e-06, + "loss": 0.0017, + "reward": 3.395833373069763, + "reward_std": 0.25515517592430115, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 486 + }, + { + "completion_length": 196.70833587646484, + "epoch": 0.26067175163923456, + "grad_norm": 1.59375, + "kl": 0.020838663913309574, + "learning_rate": 4.617254266703816e-06, + "loss": 0.0008, + "reward": 2.4791667461395264, + "reward_std": 0.6868235319852829, + "rewards/correctness_reward_func": 1.0000000447034836, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 487 + }, + { + "completion_length": 149.79166793823242, + "epoch": 0.261207011909541, + "grad_norm": 1.65625, + "kl": 0.05422941967844963, + "learning_rate": 4.614766126533378e-06, + "loss": 0.0022, + "reward": 2.9375000596046448, + "reward_std": 0.4713764898478985, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 488 + }, + { + "completion_length": 169.87500381469727, + "epoch": 0.26174227217984747, + "grad_norm": 1.9453125, + "kl": 0.057755385991185904, + "learning_rate": 4.612270600069833e-06, + "loss": 0.0023, + "reward": 2.3541667461395264, + "reward_std": 1.0590701699256897, + "rewards/correctness_reward_func": 1.000000037252903, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 489 + }, + { + "completion_length": 120.66667175292969, + "epoch": 0.2622775324501539, + "grad_norm": 1.25, + "kl": 0.031516329385340214, + "learning_rate": 4.609767696029365e-06, + "loss": 0.0013, + "reward": 3.0625000298023224, + "reward_std": 0.25515519082546234, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 490 + }, + { + "completion_length": 185.2083396911621, + "epoch": 0.2628127927204603, + "grad_norm": 2.203125, + "kl": 0.08892032504081726, + "learning_rate": 4.6072574231539255e-06, + "loss": 0.0036, + "reward": 2.6875000596046448, + "reward_std": 0.5697049051523209, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 491 + }, + { + "completion_length": 178.33333778381348, + "epoch": 0.26334805299076675, + "grad_norm": 1.5, + "kl": 0.039903036784380674, + "learning_rate": 4.604739790211203e-06, + "loss": 0.0016, + "reward": 2.500000089406967, + "reward_std": 0.9305254966020584, + "rewards/correctness_reward_func": 1.0833333805203438, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 492 + }, + { + "completion_length": 165.79167366027832, + "epoch": 0.26388331326107317, + "grad_norm": 1.578125, + "kl": 0.03498702170327306, + "learning_rate": 4.6022148059945945e-06, + "loss": 0.0014, + "reward": 2.895833432674408, + "reward_std": 0.49754732847213745, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 493 + }, + { + "completion_length": 137.45833778381348, + "epoch": 0.26441857353137965, + "grad_norm": 1.7421875, + "kl": 0.03628614521585405, + "learning_rate": 4.599682479323171e-06, + "loss": 0.0015, + "reward": 2.9166666865348816, + "reward_std": 0.572748601436615, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 494 + }, + { + "completion_length": 159.8333396911621, + "epoch": 0.2649538338016861, + "grad_norm": 1.9375, + "kl": 0.024191310163587332, + "learning_rate": 4.597142819041647e-06, + "loss": 0.001, + "reward": 2.6666667461395264, + "reward_std": 0.581819411367178, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 495 + }, + { + "completion_length": 141.66667366027832, + "epoch": 0.2654890940719925, + "grad_norm": 2.203125, + "kl": 0.06287940312176943, + "learning_rate": 4.594595834020355e-06, + "loss": 0.0025, + "reward": 3.145833432674408, + "reward_std": 0.6394104324281216, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 496 + }, + { + "completion_length": 170.41666793823242, + "epoch": 0.26602435434229893, + "grad_norm": 1.78125, + "kl": 0.03716146480292082, + "learning_rate": 4.5920415331552095e-06, + "loss": 0.0015, + "reward": 3.1250001192092896, + "reward_std": 0.7136143408715725, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 497 + }, + { + "completion_length": 183.25000762939453, + "epoch": 0.26655961461260536, + "grad_norm": 1.28125, + "kl": 0.0346057191491127, + "learning_rate": 4.589479925367676e-06, + "loss": 0.0014, + "reward": 3.1250000596046448, + "reward_std": 0.7477540075778961, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 498 + }, + { + "completion_length": 135.3333339691162, + "epoch": 0.26709487488291184, + "grad_norm": 1.6484375, + "kl": 0.037114487029612064, + "learning_rate": 4.586911019604742e-06, + "loss": 0.0015, + "reward": 2.8750000298023224, + "reward_std": 0.6255477480590343, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 499 + }, + { + "completion_length": 162.375, + "epoch": 0.26763013515321826, + "grad_norm": 1.90625, + "kl": 0.03033427568152547, + "learning_rate": 4.584334824838885e-06, + "loss": 0.0012, + "reward": 2.9322917461395264, + "reward_std": 0.5931039564311504, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.35416667722165585, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 500 + }, + { + "completion_length": 143.83333587646484, + "epoch": 0.2681653954235247, + "grad_norm": 1.8046875, + "kl": 0.025395017582923174, + "learning_rate": 4.581751350068041e-06, + "loss": 0.001, + "reward": 3.145833432674408, + "reward_std": 0.7174782603979111, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 501 + }, + { + "completion_length": 139.79166984558105, + "epoch": 0.2687006556938311, + "grad_norm": 1.8828125, + "kl": 0.04936455516144633, + "learning_rate": 4.579160604315572e-06, + "loss": 0.002, + "reward": 3.083333432674408, + "reward_std": 0.6196396760642529, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 502 + }, + { + "completion_length": 145.5416717529297, + "epoch": 0.26923591596413754, + "grad_norm": 1.484375, + "kl": 0.019221351481974125, + "learning_rate": 4.576562596630237e-06, + "loss": 0.0008, + "reward": 3.020833373069763, + "reward_std": 0.6095356941223145, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 503 + }, + { + "completion_length": 157.08333587646484, + "epoch": 0.269771176234444, + "grad_norm": 1.640625, + "kl": 0.031065318267792463, + "learning_rate": 4.573957336086158e-06, + "loss": 0.0012, + "reward": 3.0104167461395264, + "reward_std": 0.476203590631485, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 504 + }, + { + "completion_length": 190.79167366027832, + "epoch": 0.27030643650475045, + "grad_norm": 1.8671875, + "kl": 0.037793589755892754, + "learning_rate": 4.571344831782789e-06, + "loss": 0.0015, + "reward": 2.7500001192092896, + "reward_std": 0.9386454932391644, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.354166679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 505 + }, + { + "completion_length": 148.9583339691162, + "epoch": 0.2708416967750569, + "grad_norm": 1.5703125, + "kl": 0.021527512930333614, + "learning_rate": 4.568725092844886e-06, + "loss": 0.0009, + "reward": 2.708333373069763, + "reward_std": 0.5483061634004116, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 506 + }, + { + "completion_length": 173.25000762939453, + "epoch": 0.2713769570453633, + "grad_norm": 1.8671875, + "kl": 0.043221392668783665, + "learning_rate": 4.566098128422471e-06, + "loss": 0.0017, + "reward": 2.880208373069763, + "reward_std": 0.8724417686462402, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 507 + }, + { + "completion_length": 134.5416717529297, + "epoch": 0.2719122173156697, + "grad_norm": 2.078125, + "kl": 0.03808927442878485, + "learning_rate": 4.563463947690804e-06, + "loss": 0.0015, + "reward": 2.1875000298023224, + "reward_std": 0.61852215975523, + "rewards/correctness_reward_func": 0.8333333358168602, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 508 + }, + { + "completion_length": 138.91666793823242, + "epoch": 0.2724474775859762, + "grad_norm": 2.421875, + "kl": 0.04213061882182956, + "learning_rate": 4.5608225598503506e-06, + "loss": 0.0017, + "reward": 3.208333432674408, + "reward_std": 0.5036999098956585, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 509 + }, + { + "completion_length": 135.04166793823242, + "epoch": 0.27298273785628263, + "grad_norm": 0.96484375, + "kl": 0.03594761900603771, + "learning_rate": 4.558173974126749e-06, + "loss": 0.0014, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 510 + }, + { + "completion_length": 171.37500381469727, + "epoch": 0.27351799812658906, + "grad_norm": 2.28125, + "kl": 0.059704599902033806, + "learning_rate": 4.555518199770774e-06, + "loss": 0.0024, + "reward": 3.1250001192092896, + "reward_std": 0.7136143408715725, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 511 + }, + { + "completion_length": 124.08333587646484, + "epoch": 0.2740532583968955, + "grad_norm": 1.71875, + "kl": 0.03336996538564563, + "learning_rate": 4.552855246058313e-06, + "loss": 0.0013, + "reward": 3.0625000596046448, + "reward_std": 0.5830912441015244, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 512 + }, + { + "completion_length": 152.45833587646484, + "epoch": 0.2745885186672019, + "grad_norm": 1.5546875, + "kl": 0.03267518850043416, + "learning_rate": 4.550185122290324e-06, + "loss": 0.0013, + "reward": 3.1250000596046448, + "reward_std": 0.5268727391958237, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 513 + }, + { + "completion_length": 186.75000381469727, + "epoch": 0.2751237789375084, + "grad_norm": 1.1796875, + "kl": 0.025649972492828965, + "learning_rate": 4.547507837792814e-06, + "loss": 0.001, + "reward": 2.8437500596046448, + "reward_std": 0.33822767809033394, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 514 + }, + { + "completion_length": 208.9166717529297, + "epoch": 0.2756590392078148, + "grad_norm": 1.1171875, + "kl": 0.030423552729189396, + "learning_rate": 4.544823401916794e-06, + "loss": 0.0012, + "reward": 2.833333432674408, + "reward_std": 0.718227207660675, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 515 + }, + { + "completion_length": 184.45833587646484, + "epoch": 0.27619429947812124, + "grad_norm": 2.234375, + "kl": 0.100022466853261, + "learning_rate": 4.542131824038259e-06, + "loss": 0.004, + "reward": 2.5625000298023224, + "reward_std": 0.6425507217645645, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4375000149011612, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2916666753590107, + "rewards/xmlcount_reward_func": 0.5, + "step": 516 + }, + { + "completion_length": 155.20833778381348, + "epoch": 0.27672955974842767, + "grad_norm": 1.7109375, + "kl": 0.04028411163017154, + "learning_rate": 4.539433113558144e-06, + "loss": 0.0016, + "reward": 2.645833373069763, + "reward_std": 0.7378925532102585, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 517 + }, + { + "completion_length": 147.00000762939453, + "epoch": 0.2772648200187341, + "grad_norm": 1.0234375, + "kl": 0.027115366887301207, + "learning_rate": 4.536727279902299e-06, + "loss": 0.0011, + "reward": 2.7916666865348816, + "reward_std": 0.2813657224178314, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 518 + }, + { + "completion_length": 133.16666984558105, + "epoch": 0.2778000802890405, + "grad_norm": 2.03125, + "kl": 0.04532007407397032, + "learning_rate": 4.534014332521451e-06, + "loss": 0.0018, + "reward": 2.604166716337204, + "reward_std": 0.9020710438489914, + "rewards/correctness_reward_func": 1.1666667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 519 + }, + { + "completion_length": 125.70833778381348, + "epoch": 0.278335340559347, + "grad_norm": 0.2275390625, + "kl": 0.032215571496635675, + "learning_rate": 4.5312942808911775e-06, + "loss": 0.0013, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 520 + }, + { + "completion_length": 170.33333778381348, + "epoch": 0.2788706008296534, + "grad_norm": 1.484375, + "kl": 0.035013212356716394, + "learning_rate": 4.528567134511864e-06, + "loss": 0.0014, + "reward": 3.1875000596046448, + "reward_std": 0.502879124134779, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 521 + }, + { + "completion_length": 174.12500381469727, + "epoch": 0.27940586109995985, + "grad_norm": 2.328125, + "kl": 0.061464476864784956, + "learning_rate": 4.52583290290868e-06, + "loss": 0.0025, + "reward": 2.708333432674408, + "reward_std": 0.7608912438154221, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 522 + }, + { + "completion_length": 128.87500381469727, + "epoch": 0.2799411213702663, + "grad_norm": 1.71875, + "kl": 0.03376815561205149, + "learning_rate": 4.523091595631539e-06, + "loss": 0.0014, + "reward": 3.083333373069763, + "reward_std": 0.5320602059364319, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 523 + }, + { + "completion_length": 149.95833778381348, + "epoch": 0.2804763816405727, + "grad_norm": 2.046875, + "kl": 0.052414572797715664, + "learning_rate": 4.5203432222550705e-06, + "loss": 0.0021, + "reward": 2.6666667759418488, + "reward_std": 0.9600770622491837, + "rewards/correctness_reward_func": 1.2500000521540642, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 524 + }, + { + "completion_length": 121.5000057220459, + "epoch": 0.2810116419108792, + "grad_norm": 1.421875, + "kl": 0.04683410096913576, + "learning_rate": 4.517587792378581e-06, + "loss": 0.0019, + "reward": 3.2291667461395264, + "reward_std": 0.45845916867256165, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 525 + }, + { + "completion_length": 160.95833587646484, + "epoch": 0.2815469021811856, + "grad_norm": 1.78125, + "kl": 0.026103714015334845, + "learning_rate": 4.514825315626024e-06, + "loss": 0.001, + "reward": 2.083333432674408, + "reward_std": 0.789296954870224, + "rewards/correctness_reward_func": 0.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 526 + }, + { + "completion_length": 146.20833778381348, + "epoch": 0.28208216245149204, + "grad_norm": 1.6484375, + "kl": 0.03722587740048766, + "learning_rate": 4.51205580164597e-06, + "loss": 0.0015, + "reward": 3.1822917461395264, + "reward_std": 0.5032989047467709, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 527 + }, + { + "completion_length": 135.20833778381348, + "epoch": 0.28261742272179846, + "grad_norm": 2.25, + "kl": 0.03908930625766516, + "learning_rate": 4.509279260111563e-06, + "loss": 0.0016, + "reward": 3.2291667461395264, + "reward_std": 0.39121396839618683, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.3958333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 528 + }, + { + "completion_length": 154.16666793823242, + "epoch": 0.2831526829921049, + "grad_norm": 1.7265625, + "kl": 0.032162437215447426, + "learning_rate": 4.506495700720494e-06, + "loss": 0.0013, + "reward": 2.833333432674408, + "reward_std": 0.7331711798906326, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 529 + }, + { + "completion_length": 158.08334159851074, + "epoch": 0.28368794326241137, + "grad_norm": 1.109375, + "kl": 0.025485132355242968, + "learning_rate": 4.503705133194967e-06, + "loss": 0.001, + "reward": 2.895833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 530 + }, + { + "completion_length": 123.54166984558105, + "epoch": 0.2842232035327178, + "grad_norm": 0.07958984375, + "kl": 0.025541551876813173, + "learning_rate": 4.500907567281663e-06, + "loss": 0.001, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 531 + }, + { + "completion_length": 138.62500381469727, + "epoch": 0.2847584638030242, + "grad_norm": 1.7421875, + "kl": 0.037108127027750015, + "learning_rate": 4.498103012751704e-06, + "loss": 0.0015, + "reward": 2.7708334028720856, + "reward_std": 0.7820279747247696, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 532 + }, + { + "completion_length": 180.25000381469727, + "epoch": 0.28529372407333065, + "grad_norm": 2.15625, + "kl": 0.05364688206464052, + "learning_rate": 4.4952914794006255e-06, + "loss": 0.0021, + "reward": 3.1250001192092896, + "reward_std": 0.7430477403104305, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 533 + }, + { + "completion_length": 149.2500057220459, + "epoch": 0.28582898434363707, + "grad_norm": 1.40625, + "kl": 0.033399499487131834, + "learning_rate": 4.4924729770483346e-06, + "loss": 0.0013, + "reward": 2.958333373069763, + "reward_std": 0.5717262327671051, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 534 + }, + { + "completion_length": 136.2500057220459, + "epoch": 0.28636424461394355, + "grad_norm": 0.99609375, + "kl": 0.033650084398686886, + "learning_rate": 4.4896475155390796e-06, + "loss": 0.0013, + "reward": 2.9791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 535 + }, + { + "completion_length": 130.00000381469727, + "epoch": 0.28689950488425, + "grad_norm": 0.87890625, + "kl": 0.03125099744647741, + "learning_rate": 4.486815104741418e-06, + "loss": 0.0013, + "reward": 3.4375, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 536 + }, + { + "completion_length": 206.00000381469727, + "epoch": 0.2874347651545564, + "grad_norm": 1.671875, + "kl": 0.030830624978989363, + "learning_rate": 4.483975754548175e-06, + "loss": 0.0012, + "reward": 2.6250000596046448, + "reward_std": 1.0065668523311615, + "rewards/correctness_reward_func": 1.3333333879709244, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 537 + }, + { + "completion_length": 203.4583396911621, + "epoch": 0.28797002542486283, + "grad_norm": 2.140625, + "kl": 0.025279700057581067, + "learning_rate": 4.4811294748764175e-06, + "loss": 0.001, + "reward": 1.8645833432674408, + "reward_std": 0.8061150163412094, + "rewards/correctness_reward_func": 0.5833333432674408, + "rewards/int_reward_func": 0.3958333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.46875, + "step": 538 + }, + { + "completion_length": 162.6666717529297, + "epoch": 0.28850528569516926, + "grad_norm": 1.765625, + "kl": 0.06328402180224657, + "learning_rate": 4.478276275667411e-06, + "loss": 0.0025, + "reward": 2.5625000596046448, + "reward_std": 0.6288169585168362, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250001303851604, + "rewards/xmlcount_reward_func": 0.5, + "step": 539 + }, + { + "completion_length": 213.0833396911621, + "epoch": 0.28904054596547574, + "grad_norm": 1.6484375, + "kl": 0.02844400331377983, + "learning_rate": 4.475416166886593e-06, + "loss": 0.0011, + "reward": 2.5416667461395264, + "reward_std": 0.8251103907823563, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 540 + }, + { + "completion_length": 164.62500381469727, + "epoch": 0.28957580623578216, + "grad_norm": 1.4296875, + "kl": 0.0281327604316175, + "learning_rate": 4.4725491585235305e-06, + "loss": 0.0011, + "reward": 3.0625000298023224, + "reward_std": 0.25515517592430115, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 541 + }, + { + "completion_length": 147.04166984558105, + "epoch": 0.2901110665060886, + "grad_norm": 1.609375, + "kl": 0.04105441318824887, + "learning_rate": 4.4696752605918924e-06, + "loss": 0.0016, + "reward": 2.833333343267441, + "reward_std": 0.4771459996700287, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 542 + }, + { + "completion_length": 178.7916717529297, + "epoch": 0.290646326776395, + "grad_norm": 1.328125, + "kl": 0.02182354126125574, + "learning_rate": 4.466794483129409e-06, + "loss": 0.0009, + "reward": 2.875000089406967, + "reward_std": 0.5268727838993073, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 543 + }, + { + "completion_length": 140.83333778381348, + "epoch": 0.29118158704670144, + "grad_norm": 2.28125, + "kl": 0.02299963030964136, + "learning_rate": 4.463906836197838e-06, + "loss": 0.0009, + "reward": 3.028375029563904, + "reward_std": 0.8897148221731186, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4033749997615814, + "step": 544 + }, + { + "completion_length": 181.0416717529297, + "epoch": 0.2917168473170079, + "grad_norm": 1.40625, + "kl": 0.023073547054082155, + "learning_rate": 4.461012329882931e-06, + "loss": 0.0009, + "reward": 2.8125000298023224, + "reward_std": 0.3071485310792923, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 545 + }, + { + "completion_length": 180.70833587646484, + "epoch": 0.29225210758731435, + "grad_norm": 1.578125, + "kl": 0.03637926373630762, + "learning_rate": 4.4581109742944e-06, + "loss": 0.0015, + "reward": 2.9375000596046448, + "reward_std": 0.833091240376234, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 546 + }, + { + "completion_length": 149.62500381469727, + "epoch": 0.2927873678576208, + "grad_norm": 1.296875, + "kl": 0.02528524398803711, + "learning_rate": 4.455202779565876e-06, + "loss": 0.001, + "reward": 2.7291667461395264, + "reward_std": 0.5133541226387024, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 547 + }, + { + "completion_length": 162.5416717529297, + "epoch": 0.2933226281279272, + "grad_norm": 1.078125, + "kl": 0.046681386418640614, + "learning_rate": 4.452287755854879e-06, + "loss": 0.0019, + "reward": 3.458333373069763, + "reward_std": 0.10206207260489464, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 548 + }, + { + "completion_length": 131.45833778381348, + "epoch": 0.2938578883982336, + "grad_norm": 0.96875, + "kl": 0.0450198519974947, + "learning_rate": 4.449365913342781e-06, + "loss": 0.0018, + "reward": 3.4791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 549 + }, + { + "completion_length": 132.25000762939453, + "epoch": 0.29439314866854005, + "grad_norm": 1.609375, + "kl": 0.041971832513809204, + "learning_rate": 4.446437262234769e-06, + "loss": 0.0017, + "reward": 3.020833373069763, + "reward_std": 0.5409832894802094, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 550 + }, + { + "completion_length": 126.33333778381348, + "epoch": 0.29492840893884653, + "grad_norm": 0.546875, + "kl": 0.023539585061371326, + "learning_rate": 4.4435018127598115e-06, + "loss": 0.0009, + "reward": 2.9791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 551 + }, + { + "completion_length": 149.75000381469727, + "epoch": 0.29546366920915296, + "grad_norm": 2.546875, + "kl": 0.03675627941265702, + "learning_rate": 4.440559575170621e-06, + "loss": 0.0015, + "reward": 3.083333432674408, + "reward_std": 0.7781640253961086, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 552 + }, + { + "completion_length": 130.50000381469727, + "epoch": 0.2959989294794594, + "grad_norm": 1.4140625, + "kl": 0.037120907101780176, + "learning_rate": 4.437610559743621e-06, + "loss": 0.0015, + "reward": 2.916666716337204, + "reward_std": 0.503996953368187, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 553 + }, + { + "completion_length": 144.9583339691162, + "epoch": 0.2965341897497658, + "grad_norm": 1.671875, + "kl": 0.03651731228455901, + "learning_rate": 4.434654776778905e-06, + "loss": 0.0015, + "reward": 3.1666666865348816, + "reward_std": 0.5657404661178589, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 554 + }, + { + "completion_length": 146.04166984558105, + "epoch": 0.29706945002007223, + "grad_norm": 2.21875, + "kl": 0.023792235646396875, + "learning_rate": 4.431692236600206e-06, + "loss": 0.001, + "reward": 2.520833432674408, + "reward_std": 1.014427661895752, + "rewards/correctness_reward_func": 1.0833333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 555 + }, + { + "completion_length": 152.62500381469727, + "epoch": 0.2976047102903787, + "grad_norm": 1.6640625, + "kl": 0.05981873255223036, + "learning_rate": 4.428722949554858e-06, + "loss": 0.0024, + "reward": 2.8750000596046448, + "reward_std": 0.7758503705263138, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 556 + }, + { + "completion_length": 156.4583339691162, + "epoch": 0.29813997056068514, + "grad_norm": 1.5078125, + "kl": 0.029872288461774588, + "learning_rate": 4.4257469260137575e-06, + "loss": 0.0012, + "reward": 3.083333373069763, + "reward_std": 0.4854898601770401, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 557 + }, + { + "completion_length": 208.87500190734863, + "epoch": 0.29867523083099157, + "grad_norm": 3.40625, + "kl": 0.02115720184519887, + "learning_rate": 4.422764176371333e-06, + "loss": 0.0008, + "reward": 2.0928750336170197, + "reward_std": 0.3820215165615082, + "rewards/correctness_reward_func": 0.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.44704167544841766, + "step": 558 + }, + { + "completion_length": 151.4166717529297, + "epoch": 0.299210491101298, + "grad_norm": 1.75, + "kl": 0.030247972812503576, + "learning_rate": 4.419774711045505e-06, + "loss": 0.0012, + "reward": 3.208333432674408, + "reward_std": 0.6046446561813354, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 559 + }, + { + "completion_length": 144.12500381469727, + "epoch": 0.2997457513716044, + "grad_norm": 1.140625, + "kl": 0.03926022304221988, + "learning_rate": 4.416778540477646e-06, + "loss": 0.0016, + "reward": 3.3750000596046448, + "reward_std": 0.25129128620028496, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 560 + }, + { + "completion_length": 225.04167556762695, + "epoch": 0.3002810116419109, + "grad_norm": 1.75, + "kl": 0.03247348219156265, + "learning_rate": 4.413775675132553e-06, + "loss": 0.0013, + "reward": 2.260416716337204, + "reward_std": 0.8023487627506256, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2708333432674408, + "rewards/xmlcount_reward_func": 0.46875, + "step": 561 + }, + { + "completion_length": 163.58333778381348, + "epoch": 0.3008162719122173, + "grad_norm": 1.421875, + "kl": 0.02386433444917202, + "learning_rate": 4.4107661254984035e-06, + "loss": 0.001, + "reward": 2.958333373069763, + "reward_std": 0.6122723072767258, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 562 + }, + { + "completion_length": 164.20833778381348, + "epoch": 0.30135153218252375, + "grad_norm": 1.5546875, + "kl": 0.02886300766840577, + "learning_rate": 4.407749902086722e-06, + "loss": 0.0012, + "reward": 2.833333373069763, + "reward_std": 0.5685558170080185, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 563 + }, + { + "completion_length": 164.5416717529297, + "epoch": 0.3018867924528302, + "grad_norm": 1.9296875, + "kl": 0.03690410777926445, + "learning_rate": 4.404727015432343e-06, + "loss": 0.0015, + "reward": 2.3541666865348816, + "reward_std": 0.6257302761077881, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 564 + }, + { + "completion_length": 132.45833587646484, + "epoch": 0.3024220527231366, + "grad_norm": 1.7421875, + "kl": 0.04732774198055267, + "learning_rate": 4.401697476093372e-06, + "loss": 0.0019, + "reward": 3.0625000596046448, + "reward_std": 0.51335409283638, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 565 + }, + { + "completion_length": 136.58333587646484, + "epoch": 0.3029573129934431, + "grad_norm": 1.515625, + "kl": 0.052906479453668, + "learning_rate": 4.3986612946511535e-06, + "loss": 0.0021, + "reward": 3.395833373069763, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 566 + }, + { + "completion_length": 160.62500381469727, + "epoch": 0.3034925732637495, + "grad_norm": 1.5703125, + "kl": 0.056891399435698986, + "learning_rate": 4.395618481710229e-06, + "loss": 0.0023, + "reward": 3.3541667461395264, + "reward_std": 0.3023223280906677, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 567 + }, + { + "completion_length": 137.83333778381348, + "epoch": 0.30402783353405594, + "grad_norm": 1.359375, + "kl": 0.038504095282405615, + "learning_rate": 4.392569047898301e-06, + "loss": 0.0015, + "reward": 3.1250000596046448, + "reward_std": 0.5395646393299103, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 568 + }, + { + "completion_length": 150.66666793823242, + "epoch": 0.30456309380436236, + "grad_norm": 1.078125, + "kl": 0.035421257838606834, + "learning_rate": 4.3895130038662e-06, + "loss": 0.0014, + "reward": 2.895833373069763, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 569 + }, + { + "completion_length": 155.5833396911621, + "epoch": 0.3050983540746688, + "grad_norm": 1.8828125, + "kl": 0.031221465673297644, + "learning_rate": 4.386450360287842e-06, + "loss": 0.0012, + "reward": 2.8958334028720856, + "reward_std": 0.7271329909563065, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 570 + }, + { + "completion_length": 187.7916717529297, + "epoch": 0.30563361434497527, + "grad_norm": 1.4921875, + "kl": 0.02132181730121374, + "learning_rate": 4.383381127860194e-06, + "loss": 0.0009, + "reward": 2.666666731238365, + "reward_std": 0.5103103630244732, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 571 + }, + { + "completion_length": 140.2083396911621, + "epoch": 0.3061688746152817, + "grad_norm": 1.7734375, + "kl": 0.04399279458448291, + "learning_rate": 4.380305317303236e-06, + "loss": 0.0018, + "reward": 2.958333373069763, + "reward_std": 0.6023809425532818, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 572 + }, + { + "completion_length": 127.37500381469727, + "epoch": 0.3067041348855881, + "grad_norm": 1.921875, + "kl": 0.042581514455378056, + "learning_rate": 4.377222939359922e-06, + "loss": 0.0017, + "reward": 2.9791667461395264, + "reward_std": 0.9372647851705551, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 573 + }, + { + "completion_length": 153.6666717529297, + "epoch": 0.30723939515589455, + "grad_norm": 1.796875, + "kl": 0.028469674289226532, + "learning_rate": 4.374134004796147e-06, + "loss": 0.0011, + "reward": 2.750000089406967, + "reward_std": 0.7205219715833664, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 574 + }, + { + "completion_length": 124.58333587646484, + "epoch": 0.30777465542620097, + "grad_norm": 2.359375, + "kl": 0.03075863840058446, + "learning_rate": 4.371038524400706e-06, + "loss": 0.0012, + "reward": 3.208333432674408, + "reward_std": 0.5643851384520531, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 575 + }, + { + "completion_length": 111.6250057220459, + "epoch": 0.30830991569650745, + "grad_norm": 1.9765625, + "kl": 0.024300793651491404, + "learning_rate": 4.367936508985252e-06, + "loss": 0.001, + "reward": 3.0625000596046448, + "reward_std": 0.4334801435470581, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 576 + }, + { + "completion_length": 165.87500762939453, + "epoch": 0.3088451759668139, + "grad_norm": 1.6875, + "kl": 0.02696134801954031, + "learning_rate": 4.364827969384271e-06, + "loss": 0.0011, + "reward": 2.270833373069763, + "reward_std": 0.7186580300331116, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 577 + }, + { + "completion_length": 143.41666793823242, + "epoch": 0.3093804362371203, + "grad_norm": 2.03125, + "kl": 0.03807497629895806, + "learning_rate": 4.3617129164550294e-06, + "loss": 0.0015, + "reward": 3.0416667461395264, + "reward_std": 0.659539595246315, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 578 + }, + { + "completion_length": 141.33333587646484, + "epoch": 0.30991569650742673, + "grad_norm": 1.5859375, + "kl": 0.02221487811766565, + "learning_rate": 4.358591361077546e-06, + "loss": 0.0009, + "reward": 3.3125000596046448, + "reward_std": 0.3092299550771713, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 579 + }, + { + "completion_length": 154.37500381469727, + "epoch": 0.31045095677773316, + "grad_norm": 2.0625, + "kl": 0.031037595123052597, + "learning_rate": 4.355463314154551e-06, + "loss": 0.0012, + "reward": 3.1666667461395264, + "reward_std": 0.6664472073316574, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 580 + }, + { + "completion_length": 140.5833396911621, + "epoch": 0.3109862170480396, + "grad_norm": 0.98046875, + "kl": 0.028645613696426153, + "learning_rate": 4.352328786611446e-06, + "loss": 0.0011, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 581 + }, + { + "completion_length": 153.75000381469727, + "epoch": 0.31152147731834606, + "grad_norm": 1.5625, + "kl": 0.02715424238704145, + "learning_rate": 4.349187789396269e-06, + "loss": 0.0011, + "reward": 3.0416666865348816, + "reward_std": 0.2711162380874157, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 582 + }, + { + "completion_length": 135.45833778381348, + "epoch": 0.3120567375886525, + "grad_norm": 1.203125, + "kl": 0.02633284032344818, + "learning_rate": 4.346040333479655e-06, + "loss": 0.0011, + "reward": 2.5625000298023224, + "reward_std": 0.41247179359197617, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 583 + }, + { + "completion_length": 105.00000381469727, + "epoch": 0.3125919978589589, + "grad_norm": 6.46875, + "kl": 0.21342097874730825, + "learning_rate": 4.342886429854797e-06, + "loss": 0.0085, + "reward": 3.3750000596046448, + "reward_std": 0.306186206638813, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 584 + }, + { + "completion_length": 123.29167175292969, + "epoch": 0.31312725812926534, + "grad_norm": 1.8671875, + "kl": 0.023702097591012716, + "learning_rate": 4.339726089537406e-06, + "loss": 0.0009, + "reward": 3.270833432674408, + "reward_std": 0.5238290727138519, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 585 + }, + { + "completion_length": 143.58333587646484, + "epoch": 0.31366251839957177, + "grad_norm": 0.74609375, + "kl": 0.0426813792437315, + "learning_rate": 4.336559323565679e-06, + "loss": 0.0017, + "reward": 2.9791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 586 + }, + { + "completion_length": 149.5000057220459, + "epoch": 0.31419777866987825, + "grad_norm": 1.7890625, + "kl": 0.03509718319401145, + "learning_rate": 4.3333861430002525e-06, + "loss": 0.0014, + "reward": 2.812500089406967, + "reward_std": 0.7153968065977097, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 587 + }, + { + "completion_length": 109.33333587646484, + "epoch": 0.3147330389401847, + "grad_norm": 1.28125, + "kl": 0.03983949590474367, + "learning_rate": 4.330206558924168e-06, + "loss": 0.0016, + "reward": 3.395833373069763, + "reward_std": 0.25515520572662354, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 588 + }, + { + "completion_length": 123.66666984558105, + "epoch": 0.3152682992104911, + "grad_norm": 0.96875, + "kl": 0.05020787101238966, + "learning_rate": 4.327020582442834e-06, + "loss": 0.002, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 589 + }, + { + "completion_length": 152.875, + "epoch": 0.3158035594807975, + "grad_norm": 1.1796875, + "kl": 0.031143656466156244, + "learning_rate": 4.323828224683983e-06, + "loss": 0.0012, + "reward": 2.958333343267441, + "reward_std": 0.39777331054210663, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 590 + }, + { + "completion_length": 138.16666793823242, + "epoch": 0.31633881975110395, + "grad_norm": 1.4296875, + "kl": 0.030376011971384287, + "learning_rate": 4.320629496797642e-06, + "loss": 0.0012, + "reward": 2.8541666865348816, + "reward_std": 0.27258946001529694, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 591 + }, + { + "completion_length": 143.45833587646484, + "epoch": 0.31687408002141043, + "grad_norm": 2.25, + "kl": 0.0474525555036962, + "learning_rate": 4.317424409956078e-06, + "loss": 0.0019, + "reward": 3.208333432674408, + "reward_std": 0.5643851235508919, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 592 + }, + { + "completion_length": 135.00000190734863, + "epoch": 0.31740934029171686, + "grad_norm": 1.5703125, + "kl": 0.02484218031167984, + "learning_rate": 4.3142129753537755e-06, + "loss": 0.001, + "reward": 2.5000000298023224, + "reward_std": 0.40824829041957855, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 593 + }, + { + "completion_length": 186.83333587646484, + "epoch": 0.3179446005620233, + "grad_norm": 2.0, + "kl": 0.02980051003396511, + "learning_rate": 4.310995204207386e-06, + "loss": 0.0012, + "reward": 2.5625000596046448, + "reward_std": 0.8611014932394028, + "rewards/correctness_reward_func": 1.1666667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 594 + }, + { + "completion_length": 193.70833587646484, + "epoch": 0.3184798608323297, + "grad_norm": 1.6015625, + "kl": 0.03126836335286498, + "learning_rate": 4.307771107755695e-06, + "loss": 0.0013, + "reward": 2.6041666865348816, + "reward_std": 0.5614049583673477, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 595 + }, + { + "completion_length": 127.91667175292969, + "epoch": 0.31901512110263613, + "grad_norm": 2.5, + "kl": 0.041764695663005114, + "learning_rate": 4.304540697259578e-06, + "loss": 0.0017, + "reward": 2.802083432674408, + "reward_std": 0.999303549528122, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 596 + }, + { + "completion_length": 143.5833396911621, + "epoch": 0.3195503813729426, + "grad_norm": 2.078125, + "kl": 0.05798004241660237, + "learning_rate": 4.3013039840019675e-06, + "loss": 0.0023, + "reward": 2.5416667461395264, + "reward_std": 0.5643851384520531, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 597 + }, + { + "completion_length": 169.50000381469727, + "epoch": 0.32008564164324904, + "grad_norm": 1.6484375, + "kl": 0.019811555510386825, + "learning_rate": 4.298060979287807e-06, + "loss": 0.0008, + "reward": 2.7916667461395264, + "reward_std": 0.8075917363166809, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 598 + }, + { + "completion_length": 155.29166984558105, + "epoch": 0.32062090191355547, + "grad_norm": 2.015625, + "kl": 0.03277602745220065, + "learning_rate": 4.294811694444013e-06, + "loss": 0.0013, + "reward": 2.7500000596046448, + "reward_std": 0.8081966787576675, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666865348816, + "rewards/xmlcount_reward_func": 0.5, + "step": 599 + }, + { + "completion_length": 126.91666793823242, + "epoch": 0.3211561621838619, + "grad_norm": 1.9453125, + "kl": 0.03925598133355379, + "learning_rate": 4.29155614081944e-06, + "loss": 0.0016, + "reward": 2.8541667461395264, + "reward_std": 0.3023223280906677, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 600 + }, + { + "completion_length": 166.75000190734863, + "epoch": 0.3216914224541683, + "grad_norm": 1.828125, + "kl": 0.030431517399847507, + "learning_rate": 4.288294329784838e-06, + "loss": 0.0012, + "reward": 2.895833432674408, + "reward_std": 0.921602413058281, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 601 + }, + { + "completion_length": 142.83333587646484, + "epoch": 0.3222266827244748, + "grad_norm": 1.2109375, + "kl": 0.022962462157011032, + "learning_rate": 4.285026272732808e-06, + "loss": 0.0009, + "reward": 3.3125000596046448, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 602 + }, + { + "completion_length": 156.37500762939453, + "epoch": 0.3227619429947812, + "grad_norm": 0.5546875, + "kl": 0.020614446373656392, + "learning_rate": 4.28175198107777e-06, + "loss": 0.0008, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 603 + }, + { + "completion_length": 150.62500762939453, + "epoch": 0.32329720326508765, + "grad_norm": 1.015625, + "kl": 0.03634029906243086, + "learning_rate": 4.27847146625592e-06, + "loss": 0.0015, + "reward": 2.5416666865348816, + "reward_std": 0.2457980364561081, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 604 + }, + { + "completion_length": 159.4583396911621, + "epoch": 0.3238324635353941, + "grad_norm": 1.6328125, + "kl": 0.04880431201308966, + "learning_rate": 4.275184739725188e-06, + "loss": 0.002, + "reward": 3.0, + "reward_std": 0.773861289024353, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 605 + }, + { + "completion_length": 133.6250057220459, + "epoch": 0.3243677238057005, + "grad_norm": 2.03125, + "kl": 0.033091878052800894, + "learning_rate": 4.2718918129652e-06, + "loss": 0.0013, + "reward": 3.083333373069763, + "reward_std": 0.6821095645427704, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 606 + }, + { + "completion_length": 169.1666717529297, + "epoch": 0.324902984076007, + "grad_norm": 2.34375, + "kl": 0.04404675355181098, + "learning_rate": 4.26859269747724e-06, + "loss": 0.0018, + "reward": 2.9166667461395264, + "reward_std": 0.7273796200752258, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 607 + }, + { + "completion_length": 162.7916717529297, + "epoch": 0.3254382443463134, + "grad_norm": 1.375, + "kl": 0.05273099755868316, + "learning_rate": 4.265287404784204e-06, + "loss": 0.0021, + "reward": 2.7291666865348816, + "reward_std": 0.4782841205596924, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 608 + }, + { + "completion_length": 147.70833778381348, + "epoch": 0.32597350461661984, + "grad_norm": 1.671875, + "kl": 0.029417749494314194, + "learning_rate": 4.261975946430567e-06, + "loss": 0.0012, + "reward": 2.645833432674408, + "reward_std": 0.7949730753898621, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 609 + }, + { + "completion_length": 105.20833778381348, + "epoch": 0.32650876488692626, + "grad_norm": 1.125, + "kl": 0.039872271940112114, + "learning_rate": 4.258658333982335e-06, + "loss": 0.0016, + "reward": 3.375, + "reward_std": 0.25, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 610 + }, + { + "completion_length": 156.5833396911621, + "epoch": 0.3270440251572327, + "grad_norm": 1.625, + "kl": 0.05882585886865854, + "learning_rate": 4.255334579027013e-06, + "loss": 0.0024, + "reward": 2.7916667461395264, + "reward_std": 0.5643851608037949, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.39583333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 611 + }, + { + "completion_length": 147.9166717529297, + "epoch": 0.32757928542753917, + "grad_norm": 1.9609375, + "kl": 0.03394132852554321, + "learning_rate": 4.252004693173555e-06, + "loss": 0.0014, + "reward": 2.7500000596046448, + "reward_std": 0.6123724430799484, + "rewards/correctness_reward_func": 1.2500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 612 + }, + { + "completion_length": 117.95833587646484, + "epoch": 0.3281145456978456, + "grad_norm": 1.0625, + "kl": 0.04348599258810282, + "learning_rate": 4.2486686880523335e-06, + "loss": 0.0017, + "reward": 3.25, + "reward_std": 0.273861289024353, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 613 + }, + { + "completion_length": 147.58333778381348, + "epoch": 0.328649805968152, + "grad_norm": 1.765625, + "kl": 0.04881718289107084, + "learning_rate": 4.24532657531509e-06, + "loss": 0.002, + "reward": 2.958333432674408, + "reward_std": 0.5809475630521774, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 614 + }, + { + "completion_length": 118.87500381469727, + "epoch": 0.32918506623845845, + "grad_norm": 1.59375, + "kl": 0.03824450308457017, + "learning_rate": 4.2419783666349e-06, + "loss": 0.0015, + "reward": 3.2916667461395264, + "reward_std": 0.32274864614009857, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 615 + }, + { + "completion_length": 146.4583396911621, + "epoch": 0.32972032650876487, + "grad_norm": 1.609375, + "kl": 0.03492473717778921, + "learning_rate": 4.2386240737061315e-06, + "loss": 0.0014, + "reward": 3.1666667461395264, + "reward_std": 0.49322495982050896, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 616 + }, + { + "completion_length": 170.58333778381348, + "epoch": 0.3302555867790713, + "grad_norm": 1.4140625, + "kl": 0.04447829117998481, + "learning_rate": 4.2352637082443995e-06, + "loss": 0.0018, + "reward": 3.270833373069763, + "reward_std": 0.30349136516451836, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 617 + }, + { + "completion_length": 121.87500190734863, + "epoch": 0.3307908470493778, + "grad_norm": 1.421875, + "kl": 0.04443943314254284, + "learning_rate": 4.231897281986534e-06, + "loss": 0.0018, + "reward": 3.458333373069763, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 618 + }, + { + "completion_length": 144.87500381469727, + "epoch": 0.3313261073196842, + "grad_norm": 1.5390625, + "kl": 0.045205289497971535, + "learning_rate": 4.228524806690529e-06, + "loss": 0.0018, + "reward": 3.1875000596046448, + "reward_std": 0.49814651533961296, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 619 + }, + { + "completion_length": 182.50000381469727, + "epoch": 0.33186136758999063, + "grad_norm": 1.6953125, + "kl": 0.036972432397305965, + "learning_rate": 4.2251462941355075e-06, + "loss": 0.0015, + "reward": 3.1666666865348816, + "reward_std": 0.5194446891546249, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 620 + }, + { + "completion_length": 140.5416717529297, + "epoch": 0.33239662786029706, + "grad_norm": 1.7421875, + "kl": 0.059178040362894535, + "learning_rate": 4.22176175612168e-06, + "loss": 0.0024, + "reward": 2.7500000596046448, + "reward_std": 0.3624359965324402, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 621 + }, + { + "completion_length": 117.50000381469727, + "epoch": 0.3329318881306035, + "grad_norm": 1.765625, + "kl": 0.03811078518629074, + "learning_rate": 4.218371204470303e-06, + "loss": 0.0015, + "reward": 3.2291667461395264, + "reward_std": 0.5133541226387024, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 622 + }, + { + "completion_length": 106.12500381469727, + "epoch": 0.33346714840090996, + "grad_norm": 2.03125, + "kl": 0.03282386902719736, + "learning_rate": 4.214974651023632e-06, + "loss": 0.0013, + "reward": 3.302083373069763, + "reward_std": 0.48479484021663666, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 623 + }, + { + "completion_length": 151.79166793823242, + "epoch": 0.3340024086712164, + "grad_norm": 2.453125, + "kl": 0.03183559700846672, + "learning_rate": 4.211572107644891e-06, + "loss": 0.0013, + "reward": 2.8750000298023224, + "reward_std": 0.730063334107399, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 624 + }, + { + "completion_length": 188.20833587646484, + "epoch": 0.3345376689415228, + "grad_norm": 1.34375, + "kl": 0.030532730743288994, + "learning_rate": 4.208163586218223e-06, + "loss": 0.0012, + "reward": 2.520833373069763, + "reward_std": 0.5441423058509827, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666753590107, + "rewards/xmlcount_reward_func": 0.5, + "step": 625 + }, + { + "completion_length": 127.16666984558105, + "epoch": 0.33507292921182924, + "grad_norm": 1.5078125, + "kl": 0.05405988823622465, + "learning_rate": 4.204749098648651e-06, + "loss": 0.0022, + "reward": 3.0416666865348816, + "reward_std": 0.4541241377592087, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 626 + }, + { + "completion_length": 175.20833587646484, + "epoch": 0.33560818948213567, + "grad_norm": 1.5546875, + "kl": 0.044757843017578125, + "learning_rate": 4.201328656862033e-06, + "loss": 0.0018, + "reward": 2.520833343267441, + "reward_std": 0.3266642242670059, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 627 + }, + { + "completion_length": 160.41666793823242, + "epoch": 0.33614344975244215, + "grad_norm": 1.8046875, + "kl": 0.045004216488450766, + "learning_rate": 4.197902272805028e-06, + "loss": 0.0018, + "reward": 2.6875000596046448, + "reward_std": 0.880591869354248, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 628 + }, + { + "completion_length": 177.4583396911621, + "epoch": 0.3366787100227486, + "grad_norm": 1.6015625, + "kl": 0.04143283236771822, + "learning_rate": 4.194469958445048e-06, + "loss": 0.0017, + "reward": 2.6041667461395264, + "reward_std": 0.5357958674430847, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 629 + }, + { + "completion_length": 136.41666793823242, + "epoch": 0.337213970293055, + "grad_norm": 0.87109375, + "kl": 0.029942914377897978, + "learning_rate": 4.191031725770216e-06, + "loss": 0.0012, + "reward": 3.0625, + "reward_std": 0.1530931293964386, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 630 + }, + { + "completion_length": 119.37500190734863, + "epoch": 0.3377492305633614, + "grad_norm": 1.1484375, + "kl": 0.03256394062191248, + "learning_rate": 4.187587586789329e-06, + "loss": 0.0013, + "reward": 3.395833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 631 + }, + { + "completion_length": 137.6250057220459, + "epoch": 0.33828449083366785, + "grad_norm": 1.5, + "kl": 0.03323498251847923, + "learning_rate": 4.184137553531812e-06, + "loss": 0.0013, + "reward": 2.8750000596046448, + "reward_std": 0.3061862401664257, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 632 + }, + { + "completion_length": 151.33333587646484, + "epoch": 0.33881975110397433, + "grad_norm": 1.5234375, + "kl": 0.029517395421862602, + "learning_rate": 4.180681638047675e-06, + "loss": 0.0012, + "reward": 2.5625000596046448, + "reward_std": 0.5583724975585938, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 633 + }, + { + "completion_length": 144.87500381469727, + "epoch": 0.33935501137428076, + "grad_norm": 1.296875, + "kl": 0.03951950464397669, + "learning_rate": 4.177219852407477e-06, + "loss": 0.0016, + "reward": 3.3541666865348816, + "reward_std": 0.24468021094799042, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 634 + }, + { + "completion_length": 176.41666793823242, + "epoch": 0.3398902716445872, + "grad_norm": 1.703125, + "kl": 0.033194053918123245, + "learning_rate": 4.173752208702277e-06, + "loss": 0.0013, + "reward": 3.1250000596046448, + "reward_std": 0.510659247636795, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 0.5, + "step": 635 + }, + { + "completion_length": 135.20833587646484, + "epoch": 0.3404255319148936, + "grad_norm": 0.76953125, + "kl": 0.020830919034779072, + "learning_rate": 4.170278719043594e-06, + "loss": 0.0008, + "reward": 2.9791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 636 + }, + { + "completion_length": 181.95833587646484, + "epoch": 0.34096079218520003, + "grad_norm": 1.859375, + "kl": 0.03406862914562225, + "learning_rate": 4.1667993955633685e-06, + "loss": 0.0014, + "reward": 2.895833432674408, + "reward_std": 1.027670457959175, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 637 + }, + { + "completion_length": 167.0416717529297, + "epoch": 0.3414960524555065, + "grad_norm": 1.3046875, + "kl": 0.01679137465544045, + "learning_rate": 4.163314250413913e-06, + "loss": 0.0007, + "reward": 2.6666666865348816, + "reward_std": 0.5487253814935684, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 638 + }, + { + "completion_length": 158.00000381469727, + "epoch": 0.34203131272581294, + "grad_norm": 1.4453125, + "kl": 0.03285357216373086, + "learning_rate": 4.1598232957678784e-06, + "loss": 0.0013, + "reward": 2.9791667461395264, + "reward_std": 0.5357958823442459, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 639 + }, + { + "completion_length": 130.41666793823242, + "epoch": 0.34256657299611937, + "grad_norm": 2.375, + "kl": 0.06797064701095223, + "learning_rate": 4.1563265438182e-06, + "loss": 0.0027, + "reward": 2.8750001192092896, + "reward_std": 0.791929330676794, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 640 + }, + { + "completion_length": 139.9166717529297, + "epoch": 0.3431018332664258, + "grad_norm": 1.9140625, + "kl": 0.027372614247724414, + "learning_rate": 4.152824006778068e-06, + "loss": 0.0011, + "reward": 3.2916666865348816, + "reward_std": 0.39777331054210663, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 641 + }, + { + "completion_length": 139.33333778381348, + "epoch": 0.3436370935367322, + "grad_norm": 2.015625, + "kl": 0.03689718246459961, + "learning_rate": 4.149315696880873e-06, + "loss": 0.0015, + "reward": 3.1875000596046448, + "reward_std": 0.5993371978402138, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 642 + }, + { + "completion_length": 152.50000381469727, + "epoch": 0.3441723538070387, + "grad_norm": 1.375, + "kl": 0.03251838870346546, + "learning_rate": 4.145801626380174e-06, + "loss": 0.0013, + "reward": 2.833333373069763, + "reward_std": 0.29362983629107475, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 643 + }, + { + "completion_length": 148.33333587646484, + "epoch": 0.3447076140773451, + "grad_norm": 1.203125, + "kl": 0.03755293879657984, + "learning_rate": 4.142281807549644e-06, + "loss": 0.0015, + "reward": 3.145833373069763, + "reward_std": 0.3092299550771713, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 644 + }, + { + "completion_length": 176.54166793823242, + "epoch": 0.34524287434765155, + "grad_norm": 1.6328125, + "kl": 0.05265080649405718, + "learning_rate": 4.138756252683039e-06, + "loss": 0.0021, + "reward": 2.8125000596046448, + "reward_std": 0.6279504112899303, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 645 + }, + { + "completion_length": 162.41667366027832, + "epoch": 0.345778134617958, + "grad_norm": 1.4453125, + "kl": 0.05229492858052254, + "learning_rate": 4.135224974094145e-06, + "loss": 0.0021, + "reward": 2.9791667461395264, + "reward_std": 0.6616143435239792, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 646 + }, + { + "completion_length": 197.79167556762695, + "epoch": 0.3463133948882644, + "grad_norm": 1.9375, + "kl": 0.031138702295720577, + "learning_rate": 4.131687984116743e-06, + "loss": 0.0012, + "reward": 2.416666716337204, + "reward_std": 1.1080896109342575, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 647 + }, + { + "completion_length": 188.83333778381348, + "epoch": 0.34684865515857083, + "grad_norm": 1.359375, + "kl": 0.0418109823949635, + "learning_rate": 4.128145295104561e-06, + "loss": 0.0017, + "reward": 2.895833432674408, + "reward_std": 0.6499251537024975, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 648 + }, + { + "completion_length": 173.2916717529297, + "epoch": 0.3473839154288773, + "grad_norm": 1.0390625, + "kl": 0.042974590323865414, + "learning_rate": 4.124596919431229e-06, + "loss": 0.0017, + "reward": 3.4375000596046448, + "reward_std": 0.11558076366782188, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 649 + }, + { + "completion_length": 137.7083396911621, + "epoch": 0.34791917569918374, + "grad_norm": 1.0, + "kl": 0.03159995749592781, + "learning_rate": 4.1210428694902444e-06, + "loss": 0.0013, + "reward": 3.0625, + "reward_std": 0.22008520364761353, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 650 + }, + { + "completion_length": 195.58333587646484, + "epoch": 0.34845443596949016, + "grad_norm": 0.7109375, + "kl": 0.02461721864528954, + "learning_rate": 4.117483157694919e-06, + "loss": 0.001, + "reward": 2.4010416865348816, + "reward_std": 0.13465330004692078, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.484375, + "step": 651 + }, + { + "completion_length": 134.12500381469727, + "epoch": 0.3489896962397966, + "grad_norm": 1.5546875, + "kl": 0.036613046657294035, + "learning_rate": 4.113917796478342e-06, + "loss": 0.0015, + "reward": 3.020833373069763, + "reward_std": 0.5618228912353516, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 652 + }, + { + "completion_length": 202.5416717529297, + "epoch": 0.349524956510103, + "grad_norm": 1.5, + "kl": 0.03586659440770745, + "learning_rate": 4.110346798293334e-06, + "loss": 0.0014, + "reward": 3.0416667461395264, + "reward_std": 0.6170316934585571, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250001303851604, + "rewards/xmlcount_reward_func": 0.5, + "step": 653 + }, + { + "completion_length": 151.9166717529297, + "epoch": 0.3500602167804095, + "grad_norm": 2.046875, + "kl": 0.03458790061995387, + "learning_rate": 4.106770175612404e-06, + "loss": 0.0014, + "reward": 2.6041666865348816, + "reward_std": 0.8134097754955292, + "rewards/correctness_reward_func": 1.166666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 654 + }, + { + "completion_length": 176.4583396911621, + "epoch": 0.3505954770507159, + "grad_norm": 0.9609375, + "kl": 0.0380466477945447, + "learning_rate": 4.103187940927705e-06, + "loss": 0.0015, + "reward": 3.270833373069763, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 655 + }, + { + "completion_length": 177.83333587646484, + "epoch": 0.35113073732102235, + "grad_norm": 1.4921875, + "kl": 0.026743045775219798, + "learning_rate": 4.099600106750993e-06, + "loss": 0.0011, + "reward": 1.9166667312383652, + "reward_std": 0.5914224684238434, + "rewards/correctness_reward_func": 0.6666666865348816, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 656 + }, + { + "completion_length": 140.16666793823242, + "epoch": 0.35166599759132877, + "grad_norm": 1.890625, + "kl": 0.032465869560837746, + "learning_rate": 4.096006685613579e-06, + "loss": 0.0013, + "reward": 3.2500000596046448, + "reward_std": 0.4727980047464371, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 657 + }, + { + "completion_length": 150.33333587646484, + "epoch": 0.3522012578616352, + "grad_norm": 1.71875, + "kl": 0.036532831378281116, + "learning_rate": 4.09240769006629e-06, + "loss": 0.0015, + "reward": 3.0000000596046448, + "reward_std": 0.6798528283834457, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 658 + }, + { + "completion_length": 179.2916717529297, + "epoch": 0.3527365181319417, + "grad_norm": 1.1875, + "kl": 0.02686024410650134, + "learning_rate": 4.088803132679421e-06, + "loss": 0.0011, + "reward": 3.0000000298023224, + "reward_std": 0.29362983629107475, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 659 + }, + { + "completion_length": 168.7083396911621, + "epoch": 0.3532717784022481, + "grad_norm": 1.3046875, + "kl": 0.023333940654993057, + "learning_rate": 4.085193026042695e-06, + "loss": 0.0009, + "reward": 3.020833373069763, + "reward_std": 0.4421939253807068, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 660 + }, + { + "completion_length": 125.58333587646484, + "epoch": 0.35380703867255453, + "grad_norm": 2.375, + "kl": 0.06006305478513241, + "learning_rate": 4.081577382765215e-06, + "loss": 0.0024, + "reward": 2.645833373069763, + "reward_std": 0.7491974383592606, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 661 + }, + { + "completion_length": 138.58333587646484, + "epoch": 0.35434229894286096, + "grad_norm": 1.7109375, + "kl": 0.03619189281016588, + "learning_rate": 4.077956215475423e-06, + "loss": 0.0014, + "reward": 3.1250000596046448, + "reward_std": 0.3602609895169735, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 662 + }, + { + "completion_length": 185.08333587646484, + "epoch": 0.3548775592131674, + "grad_norm": 1.7265625, + "kl": 0.05054088030010462, + "learning_rate": 4.074329536821056e-06, + "loss": 0.002, + "reward": 2.4375000596046448, + "reward_std": 0.4713764898478985, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 663 + }, + { + "completion_length": 154.8333396911621, + "epoch": 0.35541281948347386, + "grad_norm": 1.8984375, + "kl": 0.03618910349905491, + "learning_rate": 4.070697359469097e-06, + "loss": 0.0014, + "reward": 3.1041667461395264, + "reward_std": 0.631978552788496, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 664 + }, + { + "completion_length": 130.0416717529297, + "epoch": 0.3559480797537803, + "grad_norm": 1.59375, + "kl": 0.02808321500197053, + "learning_rate": 4.067059696105738e-06, + "loss": 0.0011, + "reward": 2.7916666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 665 + }, + { + "completion_length": 164.5416717529297, + "epoch": 0.3564833400240867, + "grad_norm": 1.328125, + "kl": 0.026204224675893784, + "learning_rate": 4.063416559436332e-06, + "loss": 0.001, + "reward": 2.6250000298023224, + "reward_std": 0.43686148524284363, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 666 + }, + { + "completion_length": 125.91667175292969, + "epoch": 0.35701860029439314, + "grad_norm": 2.078125, + "kl": 0.04998402390629053, + "learning_rate": 4.059767962185346e-06, + "loss": 0.002, + "reward": 2.5416667461395264, + "reward_std": 0.6769221723079681, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 667 + }, + { + "completion_length": 153.04166984558105, + "epoch": 0.35755386056469957, + "grad_norm": 0.470703125, + "kl": 0.048956929706037045, + "learning_rate": 4.056113917096321e-06, + "loss": 0.002, + "reward": 3.4791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 668 + }, + { + "completion_length": 127.5000057220459, + "epoch": 0.35808912083500605, + "grad_norm": 1.0234375, + "kl": 0.032527790404856205, + "learning_rate": 4.052454436931826e-06, + "loss": 0.0013, + "reward": 3.333333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 669 + }, + { + "completion_length": 164.16666984558105, + "epoch": 0.3586243811053125, + "grad_norm": 2.203125, + "kl": 0.08344291755929589, + "learning_rate": 4.048789534473414e-06, + "loss": 0.0033, + "reward": 2.729166716337204, + "reward_std": 0.535199623554945, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 670 + }, + { + "completion_length": 149.4583396911621, + "epoch": 0.3591596413756189, + "grad_norm": 1.5, + "kl": 0.029162777587771416, + "learning_rate": 4.045119222521574e-06, + "loss": 0.0012, + "reward": 3.208333373069763, + "reward_std": 0.5483061634004116, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 671 + }, + { + "completion_length": 145.70833587646484, + "epoch": 0.3596949016459253, + "grad_norm": 2.3125, + "kl": 0.049506490118801594, + "learning_rate": 4.041443513895692e-06, + "loss": 0.002, + "reward": 3.0000000596046448, + "reward_std": 0.6572890840470791, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 672 + }, + { + "completion_length": 115.62500381469727, + "epoch": 0.36023016191623175, + "grad_norm": 1.6015625, + "kl": 0.04335561767220497, + "learning_rate": 4.037762421434e-06, + "loss": 0.0017, + "reward": 3.2500000596046448, + "reward_std": 0.46232306957244873, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 673 + }, + { + "completion_length": 171.8750057220459, + "epoch": 0.36076542218653823, + "grad_norm": 1.125, + "kl": 0.0296230330131948, + "learning_rate": 4.034075957993537e-06, + "loss": 0.0012, + "reward": 2.8750000596046448, + "reward_std": 0.5809475183486938, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 674 + }, + { + "completion_length": 135.70833587646484, + "epoch": 0.36130068245684466, + "grad_norm": 1.7734375, + "kl": 0.035232785856351256, + "learning_rate": 4.030384136450098e-06, + "loss": 0.0014, + "reward": 3.0625000596046448, + "reward_std": 0.5583724975585938, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 675 + }, + { + "completion_length": 114.75000190734863, + "epoch": 0.3618359427271511, + "grad_norm": 0.99609375, + "kl": 0.031613022554665804, + "learning_rate": 4.026686969698196e-06, + "loss": 0.0013, + "reward": 3.395833373069763, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 676 + }, + { + "completion_length": 153.5000057220459, + "epoch": 0.3623712029974575, + "grad_norm": 1.5, + "kl": 0.033104993868619204, + "learning_rate": 4.022984470651012e-06, + "loss": 0.0013, + "reward": 2.854166716337204, + "reward_std": 0.5922432988882065, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 677 + }, + { + "completion_length": 150.2500057220459, + "epoch": 0.36290646326776393, + "grad_norm": 1.609375, + "kl": 0.0589370196685195, + "learning_rate": 4.01927665224035e-06, + "loss": 0.0024, + "reward": 2.489583373069763, + "reward_std": 0.5041960887610912, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 678 + }, + { + "completion_length": 134.3750057220459, + "epoch": 0.36344172353807036, + "grad_norm": 0.9765625, + "kl": 0.026309417095035315, + "learning_rate": 4.015563527416596e-06, + "loss": 0.0011, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 679 + }, + { + "completion_length": 151.5833396911621, + "epoch": 0.36397698380837684, + "grad_norm": 1.359375, + "kl": 0.023342951899394393, + "learning_rate": 4.011845109148666e-06, + "loss": 0.0009, + "reward": 3.1666666865348816, + "reward_std": 0.4779854416847229, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 680 + }, + { + "completion_length": 138.29166984558105, + "epoch": 0.36451224407868327, + "grad_norm": 2.390625, + "kl": 0.02908670110628009, + "learning_rate": 4.0081214104239656e-06, + "loss": 0.0012, + "reward": 2.5625000298023224, + "reward_std": 0.6283334940671921, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 681 + }, + { + "completion_length": 164.4583396911621, + "epoch": 0.3650475043489897, + "grad_norm": 1.7421875, + "kl": 0.04680294170975685, + "learning_rate": 4.004392444248347e-06, + "loss": 0.0019, + "reward": 2.8906250596046448, + "reward_std": 0.8390896618366241, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 682 + }, + { + "completion_length": 113.66666793823242, + "epoch": 0.3655827646192961, + "grad_norm": 1.390625, + "kl": 0.03361522685736418, + "learning_rate": 4.000658223646057e-06, + "loss": 0.0013, + "reward": 3.2291666865348816, + "reward_std": 0.4509793668985367, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 683 + }, + { + "completion_length": 127.54166984558105, + "epoch": 0.36611802488960254, + "grad_norm": 0.87109375, + "kl": 0.035005373414605856, + "learning_rate": 3.996918761659694e-06, + "loss": 0.0014, + "reward": 2.895833373069763, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 684 + }, + { + "completion_length": 172.6250057220459, + "epoch": 0.366653285159909, + "grad_norm": 1.828125, + "kl": 0.027844190131872892, + "learning_rate": 3.993174071350164e-06, + "loss": 0.0011, + "reward": 2.6302084624767303, + "reward_std": 0.8579408079385757, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 685 + }, + { + "completion_length": 129.62500381469727, + "epoch": 0.36718854543021545, + "grad_norm": 1.4765625, + "kl": 0.02923845173791051, + "learning_rate": 3.989424165796637e-06, + "loss": 0.0012, + "reward": 3.083333373069763, + "reward_std": 0.46232306957244873, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 686 + }, + { + "completion_length": 129.66666984558105, + "epoch": 0.3677238057005219, + "grad_norm": 1.765625, + "kl": 0.0485138155054301, + "learning_rate": 3.985669058096493e-06, + "loss": 0.0019, + "reward": 3.2500000596046448, + "reward_std": 0.49983541294932365, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 687 + }, + { + "completion_length": 162.2500057220459, + "epoch": 0.3682590659708283, + "grad_norm": 1.5078125, + "kl": 0.022985886316746473, + "learning_rate": 3.981908761365286e-06, + "loss": 0.0009, + "reward": 2.442708373069763, + "reward_std": 0.4739741384983063, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 688 + }, + { + "completion_length": 156.87500381469727, + "epoch": 0.36879432624113473, + "grad_norm": 2.046875, + "kl": 0.02942904783412814, + "learning_rate": 3.978143288736692e-06, + "loss": 0.0012, + "reward": 2.5208334028720856, + "reward_std": 0.7050773799419403, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 689 + }, + { + "completion_length": 134.25000190734863, + "epoch": 0.3693295865114412, + "grad_norm": 2.765625, + "kl": 0.08899490907788277, + "learning_rate": 3.974372653362466e-06, + "loss": 0.0036, + "reward": 2.770833373069763, + "reward_std": 0.6551035642623901, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 690 + }, + { + "completion_length": 158.16666984558105, + "epoch": 0.36986484678174764, + "grad_norm": 1.96875, + "kl": 0.03482615575194359, + "learning_rate": 3.970596868412393e-06, + "loss": 0.0014, + "reward": 2.3750001192092896, + "reward_std": 0.7919293642044067, + "rewards/correctness_reward_func": 0.9166667014360428, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 691 + }, + { + "completion_length": 159.29166793823242, + "epoch": 0.37040010705205406, + "grad_norm": 1.984375, + "kl": 0.02983904629945755, + "learning_rate": 3.966815947074246e-06, + "loss": 0.0012, + "reward": 3.0833334922790527, + "reward_std": 0.9657258689403534, + "rewards/correctness_reward_func": 1.6666667461395264, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 692 + }, + { + "completion_length": 129.87500190734863, + "epoch": 0.3709353673223605, + "grad_norm": 1.71875, + "kl": 0.04448452312499285, + "learning_rate": 3.963029902553738e-06, + "loss": 0.0018, + "reward": 3.3541667461395264, + "reward_std": 0.31970490142703056, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 693 + }, + { + "completion_length": 164.91666793823242, + "epoch": 0.3714706275926669, + "grad_norm": 1.7109375, + "kl": 0.03725033439695835, + "learning_rate": 3.959238748074474e-06, + "loss": 0.0015, + "reward": 2.5833334028720856, + "reward_std": 0.4248107150197029, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 694 + }, + { + "completion_length": 150.75000762939453, + "epoch": 0.3720058878629734, + "grad_norm": 1.765625, + "kl": 0.04319385718554258, + "learning_rate": 3.955442496877908e-06, + "loss": 0.0017, + "reward": 2.7916667461395264, + "reward_std": 0.6950604021549225, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 695 + }, + { + "completion_length": 175.54167556762695, + "epoch": 0.3725411481332798, + "grad_norm": 1.234375, + "kl": 0.04713658872060478, + "learning_rate": 3.951641162223298e-06, + "loss": 0.0019, + "reward": 2.7916666865348816, + "reward_std": 0.29788626730442047, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 696 + }, + { + "completion_length": 144.95833778381348, + "epoch": 0.37307640840358625, + "grad_norm": 1.9765625, + "kl": 0.03795685060322285, + "learning_rate": 3.947834757387651e-06, + "loss": 0.0015, + "reward": 3.0416667461395264, + "reward_std": 0.7436887919902802, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 697 + }, + { + "completion_length": 155.7083396911621, + "epoch": 0.37361166867389267, + "grad_norm": 1.1171875, + "kl": 0.02909352071583271, + "learning_rate": 3.944023295665688e-06, + "loss": 0.0012, + "reward": 2.8125000596046448, + "reward_std": 0.4592793434858322, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 698 + }, + { + "completion_length": 118.83333587646484, + "epoch": 0.3741469289441991, + "grad_norm": 1.0078125, + "kl": 0.028855583164840937, + "learning_rate": 3.9402067903697894e-06, + "loss": 0.0012, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 699 + }, + { + "completion_length": 106.25000190734863, + "epoch": 0.3746821892145056, + "grad_norm": 1.4765625, + "kl": 0.04083001893013716, + "learning_rate": 3.936385254829953e-06, + "loss": 0.0016, + "reward": 2.895833373069763, + "reward_std": 0.11558076739311218, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 700 + }, + { + "completion_length": 141.2083396911621, + "epoch": 0.375217449484812, + "grad_norm": 1.1484375, + "kl": 0.04432675335556269, + "learning_rate": 3.932558702393746e-06, + "loss": 0.0018, + "reward": 3.020833373069763, + "reward_std": 0.5874452292919159, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 701 + }, + { + "completion_length": 122.20833587646484, + "epoch": 0.37575270975511843, + "grad_norm": 1.078125, + "kl": 0.02684123977087438, + "learning_rate": 3.928727146426258e-06, + "loss": 0.0011, + "reward": 2.7291666865348816, + "reward_std": 0.25515520572662354, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 702 + }, + { + "completion_length": 110.75000381469727, + "epoch": 0.37628797002542486, + "grad_norm": 2.09375, + "kl": 0.05112092103809118, + "learning_rate": 3.9248906003100514e-06, + "loss": 0.002, + "reward": 3.3125000596046448, + "reward_std": 0.40438438951969147, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 703 + }, + { + "completion_length": 182.45833778381348, + "epoch": 0.3768232302957313, + "grad_norm": 2.03125, + "kl": 0.027650201227515936, + "learning_rate": 3.921049077445124e-06, + "loss": 0.0011, + "reward": 2.005208373069763, + "reward_std": 0.39549052342772484, + "rewards/correctness_reward_func": 0.5833333358168602, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 704 + }, + { + "completion_length": 162.87500381469727, + "epoch": 0.37735849056603776, + "grad_norm": 0.65234375, + "kl": 0.019340375438332558, + "learning_rate": 3.91720259124885e-06, + "loss": 0.0008, + "reward": 3.125, + "reward_std": 0.2958039939403534, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 705 + }, + { + "completion_length": 141.70833587646484, + "epoch": 0.3778937508363442, + "grad_norm": 1.25, + "kl": 0.030775428283959627, + "learning_rate": 3.913351155155943e-06, + "loss": 0.0012, + "reward": 2.9375, + "reward_std": 0.6161879003047943, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 706 + }, + { + "completion_length": 200.62500381469727, + "epoch": 0.3784290111066506, + "grad_norm": 1.8203125, + "kl": 0.04161107540130615, + "learning_rate": 3.909494782618403e-06, + "loss": 0.0017, + "reward": 2.5208334028720856, + "reward_std": 0.7303955964744091, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 707 + }, + { + "completion_length": 110.66667175292969, + "epoch": 0.37896427137695704, + "grad_norm": 3.234375, + "kl": 0.061036181170493364, + "learning_rate": 3.905633487105474e-06, + "loss": 0.0024, + "reward": 3.3541667461395264, + "reward_std": 0.3572172410786152, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 708 + }, + { + "completion_length": 107.00000381469727, + "epoch": 0.37949953164726347, + "grad_norm": 1.6875, + "kl": 0.036092507652938366, + "learning_rate": 3.9017672821035915e-06, + "loss": 0.0014, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 709 + }, + { + "completion_length": 155.79167366027832, + "epoch": 0.3800347919175699, + "grad_norm": 1.5, + "kl": 0.025613056030124426, + "learning_rate": 3.897896181116341e-06, + "loss": 0.001, + "reward": 3.083333373069763, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 710 + }, + { + "completion_length": 146.33333587646484, + "epoch": 0.3805700521878764, + "grad_norm": 1.4296875, + "kl": 0.02829862991347909, + "learning_rate": 3.8940201976644065e-06, + "loss": 0.0011, + "reward": 2.916666716337204, + "reward_std": 0.16661179810762405, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 711 + }, + { + "completion_length": 146.33333778381348, + "epoch": 0.3811053124581828, + "grad_norm": 1.484375, + "kl": 0.02284587174654007, + "learning_rate": 3.890139345285527e-06, + "loss": 0.0009, + "reward": 2.7916666865348816, + "reward_std": 0.5202587842941284, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 712 + }, + { + "completion_length": 125.29166984558105, + "epoch": 0.3816405727284892, + "grad_norm": 0.62890625, + "kl": 0.02871770365163684, + "learning_rate": 3.886253637534447e-06, + "loss": 0.0011, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 713 + }, + { + "completion_length": 154.54166793823242, + "epoch": 0.38217583299879565, + "grad_norm": 1.8046875, + "kl": 0.0300689903087914, + "learning_rate": 3.882363087982868e-06, + "loss": 0.0012, + "reward": 3.0000000596046448, + "reward_std": 0.7361843585968018, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 714 + }, + { + "completion_length": 178.29167556762695, + "epoch": 0.3827110932691021, + "grad_norm": 1.2109375, + "kl": 0.02768306853249669, + "learning_rate": 3.878467710219402e-06, + "loss": 0.0011, + "reward": 2.6041666865348816, + "reward_std": 0.30103103816509247, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 715 + }, + { + "completion_length": 142.58333587646484, + "epoch": 0.38324635353940856, + "grad_norm": 1.5234375, + "kl": 0.029177965596318245, + "learning_rate": 3.874567517849529e-06, + "loss": 0.0012, + "reward": 3.083333373069763, + "reward_std": 0.5974817872047424, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 716 + }, + { + "completion_length": 146.62500762939453, + "epoch": 0.383781613809715, + "grad_norm": 1.4375, + "kl": 0.040057963225990534, + "learning_rate": 3.87066252449554e-06, + "loss": 0.0016, + "reward": 3.270833373069763, + "reward_std": 0.2837683819234371, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 717 + }, + { + "completion_length": 161.1666717529297, + "epoch": 0.3843168740800214, + "grad_norm": 1.34375, + "kl": 0.03427933529019356, + "learning_rate": 3.8667527437964974e-06, + "loss": 0.0014, + "reward": 2.8125000298023224, + "reward_std": 0.4592793434858322, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 718 + }, + { + "completion_length": 172.37500762939453, + "epoch": 0.38485213435032783, + "grad_norm": 1.7578125, + "kl": 0.03410201659426093, + "learning_rate": 3.8628381894081835e-06, + "loss": 0.0014, + "reward": 2.9791667461395264, + "reward_std": 0.8043745756149292, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 719 + }, + { + "completion_length": 143.0416717529297, + "epoch": 0.38538739462063426, + "grad_norm": 1.4921875, + "kl": 0.03300033137202263, + "learning_rate": 3.858918875003053e-06, + "loss": 0.0013, + "reward": 3.1250000596046448, + "reward_std": 0.5425351560115814, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 720 + }, + { + "completion_length": 142.12500381469727, + "epoch": 0.38592265489094074, + "grad_norm": 1.6171875, + "kl": 0.03706725034862757, + "learning_rate": 3.854994814270189e-06, + "loss": 0.0015, + "reward": 3.1875000596046448, + "reward_std": 0.7654656171798706, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 721 + }, + { + "completion_length": 126.95833396911621, + "epoch": 0.38645791516124717, + "grad_norm": 3.34375, + "kl": 0.09948566742241383, + "learning_rate": 3.851066020915248e-06, + "loss": 0.004, + "reward": 3.208333373069763, + "reward_std": 0.6018974781036377, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 722 + }, + { + "completion_length": 161.5833396911621, + "epoch": 0.3869931754315536, + "grad_norm": 1.78125, + "kl": 0.0415203096345067, + "learning_rate": 3.84713250866042e-06, + "loss": 0.0017, + "reward": 3.2916667461395264, + "reward_std": 0.45541542395949364, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 723 + }, + { + "completion_length": 150.625, + "epoch": 0.38752843570186, + "grad_norm": 2.0625, + "kl": 0.05728019238449633, + "learning_rate": 3.843194291244375e-06, + "loss": 0.0023, + "reward": 2.6041667461395264, + "reward_std": 0.6071162149310112, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 724 + }, + { + "completion_length": 126.62500190734863, + "epoch": 0.38806369597216644, + "grad_norm": 2.453125, + "kl": 0.049750881269574165, + "learning_rate": 3.839251382422217e-06, + "loss": 0.002, + "reward": 3.1875000596046448, + "reward_std": 0.6529284827411175, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 725 + }, + { + "completion_length": 122.04166984558105, + "epoch": 0.3885989562424729, + "grad_norm": 1.6484375, + "kl": 0.03970834193751216, + "learning_rate": 3.8353037959654344e-06, + "loss": 0.0016, + "reward": 3.1666667461395264, + "reward_std": 0.5163978338241577, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 726 + }, + { + "completion_length": 112.58333587646484, + "epoch": 0.38913421651277935, + "grad_norm": 2.4375, + "kl": 0.05409126076847315, + "learning_rate": 3.8313515456618565e-06, + "loss": 0.0022, + "reward": 3.3125000596046448, + "reward_std": 0.4592793248593807, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 727 + }, + { + "completion_length": 159.87500381469727, + "epoch": 0.3896694767830858, + "grad_norm": 1.78125, + "kl": 0.04191382694989443, + "learning_rate": 3.827394645315601e-06, + "loss": 0.0017, + "reward": 2.5416667461395264, + "reward_std": 0.6094035357236862, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 728 + }, + { + "completion_length": 200.4166717529297, + "epoch": 0.3902047370533922, + "grad_norm": 3.890625, + "kl": 0.15797852352261543, + "learning_rate": 3.823433108747024e-06, + "loss": 0.0063, + "reward": 2.2500000298023224, + "reward_std": 0.6845854222774506, + "rewards/correctness_reward_func": 1.0000000447034836, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 729 + }, + { + "completion_length": 120.58333587646484, + "epoch": 0.39073999732369863, + "grad_norm": 1.28125, + "kl": 0.03376244753599167, + "learning_rate": 3.819466949792677e-06, + "loss": 0.0014, + "reward": 3.1041666865348816, + "reward_std": 0.54645074903965, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 730 + }, + { + "completion_length": 121.66666793823242, + "epoch": 0.3912752575940051, + "grad_norm": 1.4375, + "kl": 0.02545109740458429, + "learning_rate": 3.81549618230526e-06, + "loss": 0.001, + "reward": 3.083333373069763, + "reward_std": 0.5320602059364319, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 731 + }, + { + "completion_length": 143.37500381469727, + "epoch": 0.39181051786431154, + "grad_norm": 1.6875, + "kl": 0.03742834506556392, + "learning_rate": 3.8115208201535603e-06, + "loss": 0.0015, + "reward": 2.583333373069763, + "reward_std": 0.5425351560115814, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 732 + }, + { + "completion_length": 136.87500381469727, + "epoch": 0.39234577813461796, + "grad_norm": 1.90625, + "kl": 0.02556539513170719, + "learning_rate": 3.8075408772224214e-06, + "loss": 0.001, + "reward": 2.9791667461395264, + "reward_std": 0.5674288719892502, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 733 + }, + { + "completion_length": 147.9583396911621, + "epoch": 0.3928810384049244, + "grad_norm": 1.59375, + "kl": 0.02737267129123211, + "learning_rate": 3.8035563674126818e-06, + "loss": 0.0011, + "reward": 3.333333373069763, + "reward_std": 0.40824827551841736, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 734 + }, + { + "completion_length": 182.4583396911621, + "epoch": 0.3934162986752308, + "grad_norm": 1.5, + "kl": 0.042339869774878025, + "learning_rate": 3.7995673046411336e-06, + "loss": 0.0017, + "reward": 2.8125, + "reward_std": 0.2621144950389862, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 735 + }, + { + "completion_length": 134.54166984558105, + "epoch": 0.3939515589455373, + "grad_norm": 2.453125, + "kl": 0.04328146809712052, + "learning_rate": 3.795573702840468e-06, + "loss": 0.0017, + "reward": 2.6875000596046448, + "reward_std": 0.729445070028305, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 736 + }, + { + "completion_length": 131.83333587646484, + "epoch": 0.3944868192158437, + "grad_norm": 1.8046875, + "kl": 0.046282849740237, + "learning_rate": 3.791575575959232e-06, + "loss": 0.0019, + "reward": 2.6250000596046448, + "reward_std": 0.7266311347484589, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 737 + }, + { + "completion_length": 142.70833587646484, + "epoch": 0.39502207948615015, + "grad_norm": 1.5703125, + "kl": 0.041007681749761105, + "learning_rate": 3.7875729379617766e-06, + "loss": 0.0016, + "reward": 2.9166666865348816, + "reward_std": 0.6185525953769684, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 738 + }, + { + "completion_length": 122.33333778381348, + "epoch": 0.39555733975645657, + "grad_norm": 1.8359375, + "kl": 0.037428571842610836, + "learning_rate": 3.7835658028282092e-06, + "loss": 0.0015, + "reward": 3.1250000596046448, + "reward_std": 0.5553287714719772, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 739 + }, + { + "completion_length": 162.91666793823242, + "epoch": 0.396092600026763, + "grad_norm": 1.65625, + "kl": 0.04300609743222594, + "learning_rate": 3.779554184554345e-06, + "loss": 0.0017, + "reward": 3.0000001192092896, + "reward_std": 0.60411436855793, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 740 + }, + { + "completion_length": 162.66666984558105, + "epoch": 0.3966278602970694, + "grad_norm": 1.1328125, + "kl": 0.02084392588585615, + "learning_rate": 3.7755380971516563e-06, + "loss": 0.0008, + "reward": 3.020833373069763, + "reward_std": 0.5409832894802094, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 741 + }, + { + "completion_length": 154.6250057220459, + "epoch": 0.3971631205673759, + "grad_norm": 2.359375, + "kl": 0.056923945900052786, + "learning_rate": 3.771517554647226e-06, + "loss": 0.0023, + "reward": 3.208333432674408, + "reward_std": 0.5449211224913597, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 742 + }, + { + "completion_length": 128.25000381469727, + "epoch": 0.39769838083768233, + "grad_norm": 0.83984375, + "kl": 0.035280851647257805, + "learning_rate": 3.7674925710836964e-06, + "loss": 0.0014, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 743 + }, + { + "completion_length": 134.4583396911621, + "epoch": 0.39823364110798876, + "grad_norm": 2.140625, + "kl": 0.03490069089457393, + "learning_rate": 3.7634631605192225e-06, + "loss": 0.0014, + "reward": 3.395833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 744 + }, + { + "completion_length": 141.95833778381348, + "epoch": 0.3987689013782952, + "grad_norm": 1.6796875, + "kl": 0.040149425622075796, + "learning_rate": 3.7594293370274193e-06, + "loss": 0.0016, + "reward": 3.4375000596046448, + "reward_std": 0.1530931070446968, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 745 + }, + { + "completion_length": 119.70833587646484, + "epoch": 0.3993041616486016, + "grad_norm": 0.97265625, + "kl": 0.0623900992795825, + "learning_rate": 3.7553911146973176e-06, + "loss": 0.0025, + "reward": 3.4791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 746 + }, + { + "completion_length": 154.41666984558105, + "epoch": 0.3998394219189081, + "grad_norm": 2.0625, + "kl": 0.030291962437331676, + "learning_rate": 3.7513485076333116e-06, + "loss": 0.0012, + "reward": 2.8385417461395264, + "reward_std": 0.8882782272994518, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 747 + }, + { + "completion_length": 129.2083396911621, + "epoch": 0.4003746821892145, + "grad_norm": 0.9375, + "kl": 0.04303775355219841, + "learning_rate": 3.747301529955108e-06, + "loss": 0.0017, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 748 + }, + { + "completion_length": 148.37500381469727, + "epoch": 0.40090994245952094, + "grad_norm": 1.3359375, + "kl": 0.04529419634491205, + "learning_rate": 3.743250195797682e-06, + "loss": 0.0018, + "reward": 2.5416666865348816, + "reward_std": 0.2686738818883896, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 749 + }, + { + "completion_length": 186.37500762939453, + "epoch": 0.40144520272982737, + "grad_norm": 1.6796875, + "kl": 0.03184348437935114, + "learning_rate": 3.739194519311221e-06, + "loss": 0.0013, + "reward": 2.8125000596046448, + "reward_std": 0.7753209173679352, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 750 + }, + { + "completion_length": 136.5000057220459, + "epoch": 0.4019804630001338, + "grad_norm": 1.6328125, + "kl": 0.02823848556727171, + "learning_rate": 3.735134514661083e-06, + "loss": 0.0011, + "reward": 2.9791667461395264, + "reward_std": 0.5674288682639599, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 751 + }, + { + "completion_length": 149.00000381469727, + "epoch": 0.4025157232704403, + "grad_norm": 1.8203125, + "kl": 0.03137681819498539, + "learning_rate": 3.7310701960277412e-06, + "loss": 0.0013, + "reward": 2.8541667461395264, + "reward_std": 0.7610780447721481, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 752 + }, + { + "completion_length": 122.08333396911621, + "epoch": 0.4030509835407467, + "grad_norm": 0.08154296875, + "kl": 0.02845725789666176, + "learning_rate": 3.7270015776067354e-06, + "loss": 0.0011, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 753 + }, + { + "completion_length": 193.6666717529297, + "epoch": 0.4035862438110531, + "grad_norm": 1.1484375, + "kl": 0.031050281133502722, + "learning_rate": 3.7229286736086238e-06, + "loss": 0.0012, + "reward": 2.520833343267441, + "reward_std": 0.4007553458213806, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 754 + }, + { + "completion_length": 158.5416717529297, + "epoch": 0.40412150408135955, + "grad_norm": 1.8359375, + "kl": 0.05140957282856107, + "learning_rate": 3.718851498258935e-06, + "loss": 0.0021, + "reward": 3.2291667461395264, + "reward_std": 0.3866970017552376, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250001303851604, + "rewards/xmlcount_reward_func": 0.5, + "step": 755 + }, + { + "completion_length": 152.6666717529297, + "epoch": 0.404656764351666, + "grad_norm": 1.375, + "kl": 0.027617693413048983, + "learning_rate": 3.714770065798114e-06, + "loss": 0.0011, + "reward": 3.1875000596046448, + "reward_std": 0.536520928144455, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 756 + }, + { + "completion_length": 126.50000381469727, + "epoch": 0.40519202462197246, + "grad_norm": 1.6171875, + "kl": 0.03795776842162013, + "learning_rate": 3.7106843904814754e-06, + "loss": 0.0015, + "reward": 3.333333373069763, + "reward_std": 0.40824829041957855, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 757 + }, + { + "completion_length": 149.50000190734863, + "epoch": 0.4057272848922789, + "grad_norm": 1.4765625, + "kl": 0.04007122712209821, + "learning_rate": 3.7065944865791528e-06, + "loss": 0.0016, + "reward": 3.3541667461395264, + "reward_std": 0.19615865871310234, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 758 + }, + { + "completion_length": 168.58333778381348, + "epoch": 0.4062625451625853, + "grad_norm": 1.234375, + "kl": 0.027591979131102562, + "learning_rate": 3.7025003683760485e-06, + "loss": 0.0011, + "reward": 2.958333373069763, + "reward_std": 0.6673498451709747, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 759 + }, + { + "completion_length": 209.04167556762695, + "epoch": 0.40679780543289173, + "grad_norm": 1.6953125, + "kl": 0.04418479232117534, + "learning_rate": 3.6984020501717864e-06, + "loss": 0.0018, + "reward": 2.1250000596046448, + "reward_std": 0.7144279181957245, + "rewards/correctness_reward_func": 0.916666679084301, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333432674408, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 760 + }, + { + "completion_length": 150.41666793823242, + "epoch": 0.40733306570319816, + "grad_norm": 2.359375, + "kl": 0.044540490955114365, + "learning_rate": 3.6942995462806574e-06, + "loss": 0.0018, + "reward": 3.083333432674408, + "reward_std": 0.7685092948377132, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 761 + }, + { + "completion_length": 171.1250057220459, + "epoch": 0.40786832597350464, + "grad_norm": 0.65234375, + "kl": 0.04095438402146101, + "learning_rate": 3.690192871031574e-06, + "loss": 0.0016, + "reward": 3.2916666865348816, + "reward_std": 0.2813657224178314, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 762 + }, + { + "completion_length": 134.0416717529297, + "epoch": 0.40840358624381107, + "grad_norm": 2.0, + "kl": 0.035194840747863054, + "learning_rate": 3.6860820387680145e-06, + "loss": 0.0014, + "reward": 2.8750000596046448, + "reward_std": 0.306186206638813, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 763 + }, + { + "completion_length": 173.16667366027832, + "epoch": 0.4089388465141175, + "grad_norm": 0.9609375, + "kl": 0.029444378335028887, + "learning_rate": 3.681967063847981e-06, + "loss": 0.0012, + "reward": 2.9479166865348816, + "reward_std": 0.3116655945777893, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.46875, + "step": 764 + }, + { + "completion_length": 127.33333778381348, + "epoch": 0.4094741067844239, + "grad_norm": 2.71875, + "kl": 0.06386626185849309, + "learning_rate": 3.6778479606439412e-06, + "loss": 0.0026, + "reward": 2.9791667461395264, + "reward_std": 0.7575252056121826, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 765 + }, + { + "completion_length": 178.8333396911621, + "epoch": 0.41000936705473034, + "grad_norm": 1.7890625, + "kl": 0.032990507781505585, + "learning_rate": 3.673724743542785e-06, + "loss": 0.0013, + "reward": 2.432291716337204, + "reward_std": 0.6304677873849869, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 766 + }, + { + "completion_length": 165.2500057220459, + "epoch": 0.4105446273250368, + "grad_norm": 1.4296875, + "kl": 0.028883651364594698, + "learning_rate": 3.669597426945768e-06, + "loss": 0.0012, + "reward": 2.8750000596046448, + "reward_std": 0.7516101598739624, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 767 + }, + { + "completion_length": 134.41666793823242, + "epoch": 0.41107988759534325, + "grad_norm": 1.203125, + "kl": 0.020841211080551147, + "learning_rate": 3.6654660252684643e-06, + "loss": 0.0008, + "reward": 3.395833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 768 + }, + { + "completion_length": 136.41666984558105, + "epoch": 0.4116151478656497, + "grad_norm": 1.1875, + "kl": 0.03158806264400482, + "learning_rate": 3.661330552940719e-06, + "loss": 0.0013, + "reward": 3.1875000596046448, + "reward_std": 0.4421939253807068, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 769 + }, + { + "completion_length": 143.41667366027832, + "epoch": 0.4121504081359561, + "grad_norm": 1.3984375, + "kl": 0.022135701961815357, + "learning_rate": 3.6571910244065927e-06, + "loss": 0.0009, + "reward": 2.9791666865348816, + "reward_std": 0.35721728205680847, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 770 + }, + { + "completion_length": 160.29167366027832, + "epoch": 0.41268566840626253, + "grad_norm": 1.3671875, + "kl": 0.033591088373214006, + "learning_rate": 3.6530474541243127e-06, + "loss": 0.0013, + "reward": 2.708333343267441, + "reward_std": 0.4999281316995621, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 771 + }, + { + "completion_length": 147.66666793823242, + "epoch": 0.41322092867656895, + "grad_norm": 0.07470703125, + "kl": 0.02724658139050007, + "learning_rate": 3.648899856566225e-06, + "loss": 0.0011, + "reward": 3.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 772 + }, + { + "completion_length": 126.41666984558105, + "epoch": 0.41375618894687544, + "grad_norm": 1.0234375, + "kl": 0.04276125319302082, + "learning_rate": 3.644748246218739e-06, + "loss": 0.0017, + "reward": 2.9791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 773 + }, + { + "completion_length": 128.54166984558105, + "epoch": 0.41429144921718186, + "grad_norm": 3.296875, + "kl": 0.10612674592994153, + "learning_rate": 3.6405926375822824e-06, + "loss": 0.0042, + "reward": 3.1875, + "reward_std": 0.36869701743125916, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 774 + }, + { + "completion_length": 150.33333778381348, + "epoch": 0.4148267094874883, + "grad_norm": 2.078125, + "kl": 0.04650158202275634, + "learning_rate": 3.636433045171247e-06, + "loss": 0.0019, + "reward": 2.8281250298023224, + "reward_std": 0.6533168256282806, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 775 + }, + { + "completion_length": 181.29167556762695, + "epoch": 0.4153619697577947, + "grad_norm": 1.6640625, + "kl": 0.019466244149953127, + "learning_rate": 3.6322694835139384e-06, + "loss": 0.0008, + "reward": 2.7083334028720856, + "reward_std": 0.659539595246315, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 776 + }, + { + "completion_length": 150.6666717529297, + "epoch": 0.41589723002810114, + "grad_norm": 1.7421875, + "kl": 0.03336867177858949, + "learning_rate": 3.6281019671525236e-06, + "loss": 0.0013, + "reward": 2.708333432674408, + "reward_std": 0.7875140719115734, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 777 + }, + { + "completion_length": 185.62500762939453, + "epoch": 0.4164324902984076, + "grad_norm": 1.6640625, + "kl": 0.030382550787180662, + "learning_rate": 3.6239305106429866e-06, + "loss": 0.0012, + "reward": 2.6250000298023224, + "reward_std": 0.32274864614009857, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 778 + }, + { + "completion_length": 130.12500190734863, + "epoch": 0.41696775056871405, + "grad_norm": 1.9921875, + "kl": 0.03384571289643645, + "learning_rate": 3.619755128555071e-06, + "loss": 0.0014, + "reward": 2.958333373069763, + "reward_std": 0.7672725319862366, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 779 + }, + { + "completion_length": 123.58333587646484, + "epoch": 0.41750301083902047, + "grad_norm": 1.5546875, + "kl": 0.0329477502964437, + "learning_rate": 3.6155758354722313e-06, + "loss": 0.0013, + "reward": 3.3125000596046448, + "reward_std": 0.40438438951969147, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 780 + }, + { + "completion_length": 155.29166793823242, + "epoch": 0.4180382711093269, + "grad_norm": 1.40625, + "kl": 0.023525531636551023, + "learning_rate": 3.6113926459915822e-06, + "loss": 0.0009, + "reward": 3.2916666865348816, + "reward_std": 0.4541241526603699, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 781 + }, + { + "completion_length": 167.0833396911621, + "epoch": 0.4185735313796333, + "grad_norm": 1.5234375, + "kl": 0.037068808916956186, + "learning_rate": 3.6072055747238465e-06, + "loss": 0.0015, + "reward": 2.875000089406967, + "reward_std": 0.5268727838993073, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 782 + }, + { + "completion_length": 151.9583396911621, + "epoch": 0.4191087916499398, + "grad_norm": 1.765625, + "kl": 0.06076545687392354, + "learning_rate": 3.603014636293307e-06, + "loss": 0.0024, + "reward": 2.9791666865348816, + "reward_std": 0.5683934539556503, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 783 + }, + { + "completion_length": 174.3750057220459, + "epoch": 0.41964405192024623, + "grad_norm": 1.5859375, + "kl": 0.027573922649025917, + "learning_rate": 3.598819845337752e-06, + "loss": 0.0011, + "reward": 2.8125000596046448, + "reward_std": 0.3092299550771713, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 784 + }, + { + "completion_length": 138.2916717529297, + "epoch": 0.42017931219055266, + "grad_norm": 1.8515625, + "kl": 0.028004382736980915, + "learning_rate": 3.594621216508426e-06, + "loss": 0.0011, + "reward": 2.8958334028720856, + "reward_std": 0.6634034961462021, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 785 + }, + { + "completion_length": 131.83333587646484, + "epoch": 0.4207145724608591, + "grad_norm": 0.71875, + "kl": 0.024483149405568838, + "learning_rate": 3.590418764469978e-06, + "loss": 0.001, + "reward": 3.4791666865348816, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 786 + }, + { + "completion_length": 118.62500381469727, + "epoch": 0.4212498327311655, + "grad_norm": 1.71875, + "kl": 0.029699893668293953, + "learning_rate": 3.586212503900411e-06, + "loss": 0.0012, + "reward": 3.145833373069763, + "reward_std": 0.5042977333068848, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 787 + }, + { + "completion_length": 117.00000381469727, + "epoch": 0.421785093001472, + "grad_norm": 1.3203125, + "kl": 0.03554026409983635, + "learning_rate": 3.582002449491029e-06, + "loss": 0.0014, + "reward": 3.3125000596046448, + "reward_std": 0.40438440442085266, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 788 + }, + { + "completion_length": 166.95833587646484, + "epoch": 0.4223203532717784, + "grad_norm": 1.6640625, + "kl": 0.044573254650458694, + "learning_rate": 3.5777886159463875e-06, + "loss": 0.0018, + "reward": 2.5000000596046448, + "reward_std": 0.7781640589237213, + "rewards/correctness_reward_func": 1.166666679084301, + "rewards/int_reward_func": 0.3750000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 789 + }, + { + "completion_length": 142.29166984558105, + "epoch": 0.42285561354208484, + "grad_norm": 1.2265625, + "kl": 0.04031791351735592, + "learning_rate": 3.573571017984242e-06, + "loss": 0.0016, + "reward": 2.958333373069763, + "reward_std": 0.5254304707050323, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 790 + }, + { + "completion_length": 172.25000762939453, + "epoch": 0.42339087381239127, + "grad_norm": 1.3671875, + "kl": 0.033897851360961795, + "learning_rate": 3.5693496703354956e-06, + "loss": 0.0014, + "reward": 2.895833373069763, + "reward_std": 0.47181354090571404, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 791 + }, + { + "completion_length": 145.50000381469727, + "epoch": 0.4239261340826977, + "grad_norm": 1.5234375, + "kl": 0.04099600203335285, + "learning_rate": 3.5651245877441476e-06, + "loss": 0.0016, + "reward": 3.395833373069763, + "reward_std": 0.1705273911356926, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 792 + }, + { + "completion_length": 123.83333969116211, + "epoch": 0.4244613943530042, + "grad_norm": 0.703125, + "kl": 0.03077300125733018, + "learning_rate": 3.560895784967242e-06, + "loss": 0.0012, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 793 + }, + { + "completion_length": 111.25000381469727, + "epoch": 0.4249966546233106, + "grad_norm": 2.125, + "kl": 0.0695495493710041, + "learning_rate": 3.5566632767748183e-06, + "loss": 0.0028, + "reward": 3.3125000596046448, + "reward_std": 0.3746515288949013, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 794 + }, + { + "completion_length": 136.08333778381348, + "epoch": 0.425531914893617, + "grad_norm": 2.0, + "kl": 0.036902827210724354, + "learning_rate": 3.552427077949856e-06, + "loss": 0.0015, + "reward": 2.7291666865348816, + "reward_std": 0.8262978196144104, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.39583333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 795 + }, + { + "completion_length": 133.45833587646484, + "epoch": 0.42606717516392345, + "grad_norm": 1.3671875, + "kl": 0.04546619579195976, + "learning_rate": 3.5481872032882276e-06, + "loss": 0.0018, + "reward": 2.583333373069763, + "reward_std": 0.33393850177526474, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 796 + }, + { + "completion_length": 107.29166984558105, + "epoch": 0.4266024354342299, + "grad_norm": 1.1328125, + "kl": 0.03557090531103313, + "learning_rate": 3.5439436675986403e-06, + "loss": 0.0014, + "reward": 3.1666666865348816, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 797 + }, + { + "completion_length": 141.91667366027832, + "epoch": 0.42713769570453636, + "grad_norm": 1.90625, + "kl": 0.03451378410682082, + "learning_rate": 3.539696485702592e-06, + "loss": 0.0014, + "reward": 3.2291667461395264, + "reward_std": 0.6085085272789001, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 798 + }, + { + "completion_length": 187.5416717529297, + "epoch": 0.4276729559748428, + "grad_norm": 1.5, + "kl": 0.029891248792409897, + "learning_rate": 3.535445672434313e-06, + "loss": 0.0012, + "reward": 2.479166716337204, + "reward_std": 0.9521069824695587, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 799 + }, + { + "completion_length": 153.5833396911621, + "epoch": 0.4282082162451492, + "grad_norm": 1.9296875, + "kl": 0.0294723529368639, + "learning_rate": 3.5311912426407185e-06, + "loss": 0.0012, + "reward": 3.1875000596046448, + "reward_std": 0.6529284864664078, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 800 + }, + { + "completion_length": 140.91667556762695, + "epoch": 0.42874347651545563, + "grad_norm": 1.8203125, + "kl": 0.030609098728746176, + "learning_rate": 3.526933211181356e-06, + "loss": 0.0012, + "reward": 2.958333373069763, + "reward_std": 0.8007340431213379, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 801 + }, + { + "completion_length": 180.5416717529297, + "epoch": 0.42927873678576206, + "grad_norm": 1.484375, + "kl": 0.025776101276278496, + "learning_rate": 3.5226715929283507e-06, + "loss": 0.001, + "reward": 2.8750000596046448, + "reward_std": 0.8291900753974915, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 802 + }, + { + "completion_length": 158.5833396911621, + "epoch": 0.4298139970560685, + "grad_norm": 0.9765625, + "kl": 0.03874217625707388, + "learning_rate": 3.5184064027663554e-06, + "loss": 0.0015, + "reward": 3.3750000596046448, + "reward_std": 0.25129128992557526, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 803 + }, + { + "completion_length": 126.08333778381348, + "epoch": 0.43034925732637497, + "grad_norm": 0.8984375, + "kl": 0.033212998416274786, + "learning_rate": 3.514137655592501e-06, + "loss": 0.0013, + "reward": 3.1875, + "reward_std": 0.25920552015304565, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 804 + }, + { + "completion_length": 136.16666793823242, + "epoch": 0.4308845175966814, + "grad_norm": 1.8203125, + "kl": 0.026766558177769184, + "learning_rate": 3.5098653663163405e-06, + "loss": 0.0011, + "reward": 3.1041667461395264, + "reward_std": 0.7366744130849838, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 805 + }, + { + "completion_length": 112.29166984558105, + "epoch": 0.4314197778669878, + "grad_norm": 1.25, + "kl": 0.03665527980774641, + "learning_rate": 3.505589549859798e-06, + "loss": 0.0015, + "reward": 3.2291667461395264, + "reward_std": 0.5133541226387024, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 806 + }, + { + "completion_length": 145.5416717529297, + "epoch": 0.43195503813729424, + "grad_norm": 2.03125, + "kl": 0.030686243437230587, + "learning_rate": 3.5013102211571182e-06, + "loss": 0.0012, + "reward": 2.770833373069763, + "reward_std": 0.6543844044208527, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.37500000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 807 + }, + { + "completion_length": 132.4166717529297, + "epoch": 0.43249029840760067, + "grad_norm": 0.9296875, + "kl": 0.05077707674354315, + "learning_rate": 3.497027395154811e-06, + "loss": 0.002, + "reward": 3.3541666865348816, + "reward_std": 0.24259880185127258, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 808 + }, + { + "completion_length": 177.4166717529297, + "epoch": 0.43302555867790715, + "grad_norm": 1.4140625, + "kl": 0.02278682473115623, + "learning_rate": 3.4927410868116047e-06, + "loss": 0.0009, + "reward": 3.0468750596046448, + "reward_std": 0.7866534292697906, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 809 + }, + { + "completion_length": 155.7916717529297, + "epoch": 0.4335608189482136, + "grad_norm": 2.03125, + "kl": 0.06975524500012398, + "learning_rate": 3.4884513110983886e-06, + "loss": 0.0028, + "reward": 3.020833373069763, + "reward_std": 0.6062580458819866, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 810 + }, + { + "completion_length": 143.50000762939453, + "epoch": 0.43409607921852, + "grad_norm": 1.5390625, + "kl": 0.03790471563115716, + "learning_rate": 3.484158082998162e-06, + "loss": 0.0015, + "reward": 3.2291667461395264, + "reward_std": 0.4352862983942032, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 811 + }, + { + "completion_length": 184.7083396911621, + "epoch": 0.43463133948882643, + "grad_norm": 1.6796875, + "kl": 0.042383064050227404, + "learning_rate": 3.4798614175059832e-06, + "loss": 0.0017, + "reward": 2.7500000596046448, + "reward_std": 0.9657306373119354, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 812 + }, + { + "completion_length": 193.20833778381348, + "epoch": 0.43516659975913285, + "grad_norm": 1.7109375, + "kl": 0.02804533112794161, + "learning_rate": 3.4755613296289152e-06, + "loss": 0.0011, + "reward": 1.8802083730697632, + "reward_std": 0.5338976383209229, + "rewards/correctness_reward_func": 0.5000000223517418, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.484375, + "step": 813 + }, + { + "completion_length": 142.04167366027832, + "epoch": 0.43570186002943934, + "grad_norm": 2.0625, + "kl": 0.03716372000053525, + "learning_rate": 3.4712578343859775e-06, + "loss": 0.0015, + "reward": 3.1875000596046448, + "reward_std": 0.42695439979434013, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 814 + }, + { + "completion_length": 163.08333587646484, + "epoch": 0.43623712029974576, + "grad_norm": 1.2265625, + "kl": 0.025884422473609447, + "learning_rate": 3.4669509468080874e-06, + "loss": 0.001, + "reward": 3.0000000596046448, + "reward_std": 0.5222772061824799, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 815 + }, + { + "completion_length": 135.9583396911621, + "epoch": 0.4367723805700522, + "grad_norm": 2.34375, + "kl": 0.02469735313206911, + "learning_rate": 3.4626406819380125e-06, + "loss": 0.001, + "reward": 2.9791667461395264, + "reward_std": 0.5674288682639599, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 816 + }, + { + "completion_length": 122.87500381469727, + "epoch": 0.4373076408403586, + "grad_norm": 0.047119140625, + "kl": 0.02518481481820345, + "learning_rate": 3.458327054830315e-06, + "loss": 0.001, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 817 + }, + { + "completion_length": 156.2500057220459, + "epoch": 0.43784290111066504, + "grad_norm": 1.5234375, + "kl": 0.037520273588597775, + "learning_rate": 3.4540100805513016e-06, + "loss": 0.0015, + "reward": 3.083333373069763, + "reward_std": 0.5320602059364319, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 818 + }, + { + "completion_length": 176.7500057220459, + "epoch": 0.4383781613809715, + "grad_norm": 1.203125, + "kl": 0.036197793669998646, + "learning_rate": 3.4496897741789693e-06, + "loss": 0.0014, + "reward": 2.958333343267441, + "reward_std": 0.4541241526603699, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 819 + }, + { + "completion_length": 165.25000381469727, + "epoch": 0.43891342165127795, + "grad_norm": 1.6875, + "kl": 0.03109767520800233, + "learning_rate": 3.445366150802953e-06, + "loss": 0.0012, + "reward": 2.708333373069763, + "reward_std": 0.7077522426843643, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 820 + }, + { + "completion_length": 202.75000381469727, + "epoch": 0.43944868192158437, + "grad_norm": 1.2890625, + "kl": 0.02838379517197609, + "learning_rate": 3.4410392255244727e-06, + "loss": 0.0011, + "reward": 2.541666716337204, + "reward_std": 0.3347994200885296, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 821 + }, + { + "completion_length": 139.4583396911621, + "epoch": 0.4399839421918908, + "grad_norm": 1.0390625, + "kl": 0.026547775603830814, + "learning_rate": 3.436709013456283e-06, + "loss": 0.0011, + "reward": 3.3750000596046448, + "reward_std": 0.2686738669872284, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 822 + }, + { + "completion_length": 210.25000762939453, + "epoch": 0.4405192024621972, + "grad_norm": 1.234375, + "kl": 0.02991744503378868, + "learning_rate": 3.4323755297226157e-06, + "loss": 0.0012, + "reward": 2.371125027537346, + "reward_std": 0.7365857362747192, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.4127916656434536, + "step": 823 + }, + { + "completion_length": 156.7083396911621, + "epoch": 0.4410544627325037, + "grad_norm": 0.984375, + "kl": 0.049160730093717575, + "learning_rate": 3.4280387894591304e-06, + "loss": 0.002, + "reward": 3.4375000596046448, + "reward_std": 0.11558076366782188, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 824 + }, + { + "completion_length": 194.50000381469727, + "epoch": 0.44158972300281013, + "grad_norm": 1.7109375, + "kl": 0.02425103122368455, + "learning_rate": 3.423698807812863e-06, + "loss": 0.001, + "reward": 3.0, + "reward_std": 0.8363019824028015, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 825 + }, + { + "completion_length": 133.0416717529297, + "epoch": 0.44212498327311656, + "grad_norm": 2.15625, + "kl": 0.03063865751028061, + "learning_rate": 3.419355599942167e-06, + "loss": 0.0012, + "reward": 3.2500000596046448, + "reward_std": 0.612372413277626, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 826 + }, + { + "completion_length": 132.0416717529297, + "epoch": 0.442660243543423, + "grad_norm": 1.6171875, + "kl": 0.036599946208298206, + "learning_rate": 3.4150091810166676e-06, + "loss": 0.0015, + "reward": 2.5625000596046448, + "reward_std": 0.4816259741783142, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 827 + }, + { + "completion_length": 129.37500190734863, + "epoch": 0.4431955038137294, + "grad_norm": 1.5078125, + "kl": 0.031639018561691046, + "learning_rate": 3.410659566217202e-06, + "loss": 0.0013, + "reward": 2.4791666865348816, + "reward_std": 0.5740348696708679, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 828 + }, + { + "completion_length": 184.7916717529297, + "epoch": 0.4437307640840359, + "grad_norm": 1.546875, + "kl": 0.03468810860067606, + "learning_rate": 3.406306770735773e-06, + "loss": 0.0014, + "reward": 2.8125000596046448, + "reward_std": 0.6695836298167706, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 829 + }, + { + "completion_length": 168.00000762939453, + "epoch": 0.4442660243543423, + "grad_norm": 1.5390625, + "kl": 0.04705773899331689, + "learning_rate": 3.4019508097754912e-06, + "loss": 0.0019, + "reward": 2.375, + "reward_std": 0.7058513760566711, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 830 + }, + { + "completion_length": 181.20833587646484, + "epoch": 0.44480128462464874, + "grad_norm": 2.0625, + "kl": 0.034845305141061544, + "learning_rate": 3.3975916985505223e-06, + "loss": 0.0014, + "reward": 2.520833373069763, + "reward_std": 0.929972916841507, + "rewards/correctness_reward_func": 1.1666667088866234, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 831 + }, + { + "completion_length": 138.08333778381348, + "epoch": 0.44533654489495517, + "grad_norm": 1.859375, + "kl": 0.04686246067285538, + "learning_rate": 3.3932294522860376e-06, + "loss": 0.0019, + "reward": 3.0937500596046448, + "reward_std": 0.58812665194273, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 832 + }, + { + "completion_length": 169.7083396911621, + "epoch": 0.4458718051652616, + "grad_norm": 1.9375, + "kl": 0.03669201582670212, + "learning_rate": 3.388864086218155e-06, + "loss": 0.0015, + "reward": 3.1406250596046448, + "reward_std": 0.44707968831062317, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000074505806, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 833 + }, + { + "completion_length": 137.4583396911621, + "epoch": 0.446407065435568, + "grad_norm": 5.03125, + "kl": 0.09618548629805446, + "learning_rate": 3.3844956155938915e-06, + "loss": 0.0038, + "reward": 2.333333373069763, + "reward_std": 0.6804374605417252, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 834 + }, + { + "completion_length": 238.25000762939453, + "epoch": 0.4469423257058745, + "grad_norm": 1.21875, + "kl": 0.032248204573988914, + "learning_rate": 3.380124055671106e-06, + "loss": 0.0013, + "reward": 2.666666716337204, + "reward_std": 0.502664253115654, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2916666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 835 + }, + { + "completion_length": 135.62500381469727, + "epoch": 0.4474775859761809, + "grad_norm": 1.21875, + "kl": 0.03981878375634551, + "learning_rate": 3.3757494217184493e-06, + "loss": 0.0016, + "reward": 3.2291666865348816, + "reward_std": 0.30103103443980217, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 836 + }, + { + "completion_length": 186.62500762939453, + "epoch": 0.44801284624648735, + "grad_norm": 1.7421875, + "kl": 0.043279207311570644, + "learning_rate": 3.371371729015307e-06, + "loss": 0.0017, + "reward": 3.0000001192092896, + "reward_std": 0.4269207715988159, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 837 + }, + { + "completion_length": 180.4583396911621, + "epoch": 0.4485481065167938, + "grad_norm": 1.5625, + "kl": 0.027423355961218476, + "learning_rate": 3.3669909928517476e-06, + "loss": 0.0011, + "reward": 3.1041667461395264, + "reward_std": 0.7092793136835098, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 838 + }, + { + "completion_length": 140.29166984558105, + "epoch": 0.4490833667871002, + "grad_norm": 1.90625, + "kl": 0.033189952839165926, + "learning_rate": 3.362607228528473e-06, + "loss": 0.0013, + "reward": 3.1250001192092896, + "reward_std": 0.5809475630521774, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 839 + }, + { + "completion_length": 161.12500190734863, + "epoch": 0.4496186270574067, + "grad_norm": 1.4765625, + "kl": 0.03806134918704629, + "learning_rate": 3.358220451356758e-06, + "loss": 0.0015, + "reward": 2.973958373069763, + "reward_std": 0.2913762256503105, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 840 + }, + { + "completion_length": 226.7500057220459, + "epoch": 0.4501538873277131, + "grad_norm": 1.0546875, + "kl": 0.025143309962004423, + "learning_rate": 3.3538306766584015e-06, + "loss": 0.001, + "reward": 2.833333373069763, + "reward_std": 0.48273734748363495, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 841 + }, + { + "completion_length": 165.00000381469727, + "epoch": 0.45068914759801953, + "grad_norm": 1.234375, + "kl": 0.02779693342745304, + "learning_rate": 3.349437919765673e-06, + "loss": 0.0011, + "reward": 2.895833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 842 + }, + { + "completion_length": 141.9583396911621, + "epoch": 0.45122440786832596, + "grad_norm": 1.640625, + "kl": 0.04394981171935797, + "learning_rate": 3.345042196021257e-06, + "loss": 0.0018, + "reward": 3.1041667461395264, + "reward_std": 0.5791352987289429, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 843 + }, + { + "completion_length": 120.33333778381348, + "epoch": 0.4517596681386324, + "grad_norm": 1.0, + "kl": 0.03951650392264128, + "learning_rate": 3.340643520778201e-06, + "loss": 0.0016, + "reward": 2.8125, + "reward_std": 0.246855229139328, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 844 + }, + { + "completion_length": 150.5416717529297, + "epoch": 0.45229492840893887, + "grad_norm": 1.609375, + "kl": 0.03706312831491232, + "learning_rate": 3.336241909399861e-06, + "loss": 0.0015, + "reward": 3.0000000596046448, + "reward_std": 0.712575301527977, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 845 + }, + { + "completion_length": 157.2500057220459, + "epoch": 0.4528301886792453, + "grad_norm": 1.1328125, + "kl": 0.03455189196392894, + "learning_rate": 3.331837377259847e-06, + "loss": 0.0014, + "reward": 2.9375000596046448, + "reward_std": 0.11558076366782188, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 846 + }, + { + "completion_length": 141.83333587646484, + "epoch": 0.4533654489495517, + "grad_norm": 1.4140625, + "kl": 0.03348656743764877, + "learning_rate": 3.327429939741971e-06, + "loss": 0.0013, + "reward": 2.770833373069763, + "reward_std": 0.5251599848270416, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 847 + }, + { + "completion_length": 200.54167366027832, + "epoch": 0.45390070921985815, + "grad_norm": 1.15625, + "kl": 0.02685615699738264, + "learning_rate": 3.3230196122401946e-06, + "loss": 0.0011, + "reward": 2.833333373069763, + "reward_std": 0.5149645358324051, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 848 + }, + { + "completion_length": 131.9583396911621, + "epoch": 0.45443596949016457, + "grad_norm": 1.65625, + "kl": 0.025201458483934402, + "learning_rate": 3.318606410158572e-06, + "loss": 0.001, + "reward": 3.0625000596046448, + "reward_std": 0.6782456785440445, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 849 + }, + { + "completion_length": 142.37500381469727, + "epoch": 0.45497122976047105, + "grad_norm": 0.80859375, + "kl": 0.030992007814347744, + "learning_rate": 3.3141903489111966e-06, + "loss": 0.0012, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 850 + }, + { + "completion_length": 181.41667366027832, + "epoch": 0.4555064900307775, + "grad_norm": 1.6328125, + "kl": 0.03103213245049119, + "learning_rate": 3.3097714439221477e-06, + "loss": 0.0012, + "reward": 3.1250001192092896, + "reward_std": 0.6587194204330444, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 851 + }, + { + "completion_length": 153.6250057220459, + "epoch": 0.4560417503010839, + "grad_norm": 1.1015625, + "kl": 0.036851195618510246, + "learning_rate": 3.3053497106254394e-06, + "loss": 0.0015, + "reward": 2.895833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 852 + }, + { + "completion_length": 125.70833587646484, + "epoch": 0.45657701057139033, + "grad_norm": 1.078125, + "kl": 0.022068005986511707, + "learning_rate": 3.3009251644649637e-06, + "loss": 0.0009, + "reward": 2.9375, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 853 + }, + { + "completion_length": 145.7083396911621, + "epoch": 0.45711227084169676, + "grad_norm": 1.921875, + "kl": 0.029460490681231022, + "learning_rate": 3.296497820894435e-06, + "loss": 0.0012, + "reward": 3.145833432674408, + "reward_std": 0.6625833064317703, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 854 + }, + { + "completion_length": 179.12500381469727, + "epoch": 0.45764753111200324, + "grad_norm": 1.90625, + "kl": 0.03744522435590625, + "learning_rate": 3.29206769537734e-06, + "loss": 0.0015, + "reward": 2.6875000298023224, + "reward_std": 0.6332302503287792, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.3958333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.37500000558793545, + "rewards/xmlcount_reward_func": 0.5, + "step": 855 + }, + { + "completion_length": 143.7916717529297, + "epoch": 0.45818279138230966, + "grad_norm": 0.8125, + "kl": 0.029497163370251656, + "learning_rate": 3.287634803386882e-06, + "loss": 0.0012, + "reward": 3.0625, + "reward_std": 0.22008520364761353, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 856 + }, + { + "completion_length": 122.04166984558105, + "epoch": 0.4587180516526161, + "grad_norm": 1.5078125, + "kl": 0.06524349935352802, + "learning_rate": 3.283199160405926e-06, + "loss": 0.0026, + "reward": 2.854166716337204, + "reward_std": 0.2648099809885025, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 857 + }, + { + "completion_length": 160.7083396911621, + "epoch": 0.4592533119229225, + "grad_norm": 1.125, + "kl": 0.03336925012990832, + "learning_rate": 3.2787607819269473e-06, + "loss": 0.0013, + "reward": 3.270833373069763, + "reward_std": 0.44672295451164246, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 858 + }, + { + "completion_length": 140.87500190734863, + "epoch": 0.45978857219322894, + "grad_norm": 3.0, + "kl": 0.0893707680515945, + "learning_rate": 3.274319683451973e-06, + "loss": 0.0036, + "reward": 2.708333432674408, + "reward_std": 0.6814524829387665, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 859 + }, + { + "completion_length": 136.37500381469727, + "epoch": 0.4603238324635354, + "grad_norm": 1.5390625, + "kl": 0.031437342055141926, + "learning_rate": 3.269875880492532e-06, + "loss": 0.0013, + "reward": 3.2291667461395264, + "reward_std": 0.5133540891110897, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 860 + }, + { + "completion_length": 188.66667366027832, + "epoch": 0.46085909273384185, + "grad_norm": 1.0078125, + "kl": 0.031875348184257746, + "learning_rate": 3.2654293885695983e-06, + "loss": 0.0013, + "reward": 3.0468750596046448, + "reward_std": 0.45117713510990143, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666679084301, + "rewards/xmlcount_reward_func": 0.484375, + "step": 861 + }, + { + "completion_length": 163.8333396911621, + "epoch": 0.46139435300414827, + "grad_norm": 1.9453125, + "kl": 0.05007671285420656, + "learning_rate": 3.260980223213539e-06, + "loss": 0.002, + "reward": 2.8541667461395264, + "reward_std": 0.7710062265396118, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 862 + }, + { + "completion_length": 200.66667556762695, + "epoch": 0.4619296132744547, + "grad_norm": 2.078125, + "kl": 0.03132034745067358, + "learning_rate": 3.256528399964057e-06, + "loss": 0.0013, + "reward": 2.479166716337204, + "reward_std": 0.9049717783927917, + "rewards/correctness_reward_func": 1.0833333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 863 + }, + { + "completion_length": 115.5000057220459, + "epoch": 0.4624648735447611, + "grad_norm": 1.9921875, + "kl": 0.030180228408426046, + "learning_rate": 3.252073934370142e-06, + "loss": 0.0012, + "reward": 3.2187500596046448, + "reward_std": 0.3946995995938778, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 864 + }, + { + "completion_length": 153.6666717529297, + "epoch": 0.46300013381506755, + "grad_norm": 2.3125, + "kl": 0.029024141374975443, + "learning_rate": 3.2476168419900066e-06, + "loss": 0.0012, + "reward": 3.0260416865348816, + "reward_std": 0.7233164459466934, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.484375, + "step": 865 + }, + { + "completion_length": 142.66666984558105, + "epoch": 0.46353539408537403, + "grad_norm": 1.6171875, + "kl": 0.03154592076316476, + "learning_rate": 3.2431571383910445e-06, + "loss": 0.0013, + "reward": 3.020833373069763, + "reward_std": 0.5285752415657043, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 866 + }, + { + "completion_length": 164.1666717529297, + "epoch": 0.46407065435568046, + "grad_norm": 2.078125, + "kl": 0.029656716156750917, + "learning_rate": 3.238694839149764e-06, + "loss": 0.0012, + "reward": 3.208333432674408, + "reward_std": 0.5643851272761822, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 867 + }, + { + "completion_length": 142.33333778381348, + "epoch": 0.4646059146259869, + "grad_norm": 1.7578125, + "kl": 0.03601653641089797, + "learning_rate": 3.2342299598517444e-06, + "loss": 0.0014, + "reward": 2.625000089406967, + "reward_std": 0.7309969216585159, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 868 + }, + { + "completion_length": 169.3750057220459, + "epoch": 0.4651411748962933, + "grad_norm": 1.6484375, + "kl": 0.02826155023649335, + "learning_rate": 3.2297625160915735e-06, + "loss": 0.0011, + "reward": 3.0416667461395264, + "reward_std": 0.6094035319983959, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 869 + }, + { + "completion_length": 201.37500762939453, + "epoch": 0.46567643516659973, + "grad_norm": 1.6796875, + "kl": 0.029593814630061388, + "learning_rate": 3.2252925234727955e-06, + "loss": 0.0012, + "reward": 3.020833373069763, + "reward_std": 0.7951613962650299, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 870 + }, + { + "completion_length": 166.5000057220459, + "epoch": 0.4662116954369062, + "grad_norm": 1.21875, + "kl": 0.03089164919219911, + "learning_rate": 3.22081999760786e-06, + "loss": 0.0012, + "reward": 2.7708334028720856, + "reward_std": 0.37377968057990074, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 871 + }, + { + "completion_length": 167.00000762939453, + "epoch": 0.46674695570721264, + "grad_norm": 2.296875, + "kl": 0.034221252892166376, + "learning_rate": 3.216344954118061e-06, + "loss": 0.0014, + "reward": 2.8125000596046448, + "reward_std": 0.7076172083616257, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 872 + }, + { + "completion_length": 155.00000381469727, + "epoch": 0.46728221597751907, + "grad_norm": 1.6796875, + "kl": 0.03355083800852299, + "learning_rate": 3.211867408633488e-06, + "loss": 0.0013, + "reward": 2.7500000596046448, + "reward_std": 0.7845312654972076, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 873 + }, + { + "completion_length": 128.00000381469727, + "epoch": 0.4678174762478255, + "grad_norm": 1.296875, + "kl": 0.05736191477626562, + "learning_rate": 3.2073873767929693e-06, + "loss": 0.0023, + "reward": 3.458333373069763, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 874 + }, + { + "completion_length": 215.1666717529297, + "epoch": 0.4683527365181319, + "grad_norm": 1.2578125, + "kl": 0.02975275507196784, + "learning_rate": 3.2029048742440166e-06, + "loss": 0.0012, + "reward": 2.6718750596046448, + "reward_std": 0.9017057120800018, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 875 + }, + { + "completion_length": 140.8333396911621, + "epoch": 0.4688879967884384, + "grad_norm": 1.796875, + "kl": 0.031159482430666685, + "learning_rate": 3.198419916642771e-06, + "loss": 0.0012, + "reward": 2.505208373069763, + "reward_std": 0.7537505924701691, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 876 + }, + { + "completion_length": 201.08333587646484, + "epoch": 0.4694232570587448, + "grad_norm": 1.5390625, + "kl": 0.03459092229604721, + "learning_rate": 3.1939325196539496e-06, + "loss": 0.0014, + "reward": 3.1666667461395264, + "reward_std": 0.4727980047464371, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 877 + }, + { + "completion_length": 154.3333396911621, + "epoch": 0.46995851732905125, + "grad_norm": 1.2734375, + "kl": 0.03349528927356005, + "learning_rate": 3.1894426989507877e-06, + "loss": 0.0013, + "reward": 3.2291666865348816, + "reward_std": 0.35770072042942047, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 878 + }, + { + "completion_length": 181.75, + "epoch": 0.4704937775993577, + "grad_norm": 1.7421875, + "kl": 0.03606862062588334, + "learning_rate": 3.1849504702149885e-06, + "loss": 0.0014, + "reward": 2.5416666865348816, + "reward_std": 0.5717475526034832, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 879 + }, + { + "completion_length": 147.33333778381348, + "epoch": 0.4710290378696641, + "grad_norm": 1.859375, + "kl": 0.03201776463538408, + "learning_rate": 3.180455849136664e-06, + "loss": 0.0013, + "reward": 2.9375000596046448, + "reward_std": 0.9778521060943604, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 880 + }, + { + "completion_length": 140.6666717529297, + "epoch": 0.4715642981399706, + "grad_norm": 1.3515625, + "kl": 0.06457794364541769, + "learning_rate": 3.175958851414281e-06, + "loss": 0.0026, + "reward": 3.2916667461395264, + "reward_std": 0.31584101915359497, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 881 + }, + { + "completion_length": 199.25000762939453, + "epoch": 0.472099558410277, + "grad_norm": 1.3203125, + "kl": 0.041178013663738966, + "learning_rate": 3.1714594927546094e-06, + "loss": 0.0016, + "reward": 2.588541716337204, + "reward_std": 0.4356187731027603, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.29166666977107525, + "rewards/xmlcount_reward_func": 0.484375, + "step": 882 + }, + { + "completion_length": 119.04166793823242, + "epoch": 0.47263481868058344, + "grad_norm": 0.8359375, + "kl": 0.03467118879780173, + "learning_rate": 3.1669577888726655e-06, + "loss": 0.0014, + "reward": 3.3125, + "reward_std": 0.29315099120140076, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 883 + }, + { + "completion_length": 109.58333587646484, + "epoch": 0.47317007895088986, + "grad_norm": 1.78125, + "kl": 0.0753163555637002, + "learning_rate": 3.162453755491655e-06, + "loss": 0.003, + "reward": 3.333333373069763, + "reward_std": 0.29362983629107475, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 884 + }, + { + "completion_length": 131.41667366027832, + "epoch": 0.4737053392211963, + "grad_norm": 1.7265625, + "kl": 0.03353723953478038, + "learning_rate": 3.1579474083429195e-06, + "loss": 0.0013, + "reward": 2.791666716337204, + "reward_std": 0.7123230695724487, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 885 + }, + { + "completion_length": 155.62500381469727, + "epoch": 0.47424059949150277, + "grad_norm": 1.3046875, + "kl": 0.02384982886724174, + "learning_rate": 3.153438763165884e-06, + "loss": 0.001, + "reward": 3.2291666865348816, + "reward_std": 0.3248923234641552, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 886 + }, + { + "completion_length": 155.2916717529297, + "epoch": 0.4747758597618092, + "grad_norm": 1.21875, + "kl": 0.031146604102104902, + "learning_rate": 3.1489278357079996e-06, + "loss": 0.0012, + "reward": 3.3125000596046448, + "reward_std": 0.3071485310792923, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 887 + }, + { + "completion_length": 145.25000190734863, + "epoch": 0.4753111200321156, + "grad_norm": 1.375, + "kl": 0.07489143451675773, + "learning_rate": 3.1444146417246875e-06, + "loss": 0.003, + "reward": 3.3125000596046448, + "reward_std": 0.3071485310792923, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 888 + }, + { + "completion_length": 202.62500762939453, + "epoch": 0.47584638030242205, + "grad_norm": 1.7421875, + "kl": 0.021212580613791943, + "learning_rate": 3.139899196979286e-06, + "loss": 0.0008, + "reward": 2.833333432674408, + "reward_std": 0.9302727431058884, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 889 + }, + { + "completion_length": 170.7916717529297, + "epoch": 0.47638164057272847, + "grad_norm": 1.5703125, + "kl": 0.023964946623891592, + "learning_rate": 3.1353815172429937e-06, + "loss": 0.001, + "reward": 3.083333373069763, + "reward_std": 0.3776952736079693, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 890 + }, + { + "completion_length": 131.9583339691162, + "epoch": 0.47691690084303495, + "grad_norm": 1.84375, + "kl": 0.03578362660482526, + "learning_rate": 3.130861618294817e-06, + "loss": 0.0014, + "reward": 3.270833373069763, + "reward_std": 0.5613414198160172, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 891 + }, + { + "completion_length": 169.7083339691162, + "epoch": 0.4774521611133414, + "grad_norm": 1.5546875, + "kl": 0.027614878490567207, + "learning_rate": 3.1263395159215125e-06, + "loss": 0.0011, + "reward": 2.9791666865348816, + "reward_std": 0.4242093414068222, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 892 + }, + { + "completion_length": 156.04167556762695, + "epoch": 0.4779874213836478, + "grad_norm": 1.21875, + "kl": 0.03386624017730355, + "learning_rate": 3.121815225917534e-06, + "loss": 0.0014, + "reward": 3.2500000596046448, + "reward_std": 0.3181530348956585, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 893 + }, + { + "completion_length": 134.5416717529297, + "epoch": 0.47852268165395423, + "grad_norm": 1.84375, + "kl": 0.07315583759918809, + "learning_rate": 3.1172887640849736e-06, + "loss": 0.0029, + "reward": 3.2291667461395264, + "reward_std": 0.45845915377140045, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 894 + }, + { + "completion_length": 138.25000381469727, + "epoch": 0.47905794192426066, + "grad_norm": 0.92578125, + "kl": 0.01682589342817664, + "learning_rate": 3.1127601462335106e-06, + "loss": 0.0007, + "reward": 2.833333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 895 + }, + { + "completion_length": 148.54167366027832, + "epoch": 0.4795932021945671, + "grad_norm": 0.59375, + "kl": 0.02296333061531186, + "learning_rate": 3.108229388180355e-06, + "loss": 0.0009, + "reward": 3.458333373069763, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 896 + }, + { + "completion_length": 151.12500381469727, + "epoch": 0.48012846246487356, + "grad_norm": 1.9765625, + "kl": 0.03428164287470281, + "learning_rate": 3.103696505750191e-06, + "loss": 0.0014, + "reward": 2.958333432674408, + "reward_std": 0.94781294465065, + "rewards/correctness_reward_func": 1.5833334028720856, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 897 + }, + { + "completion_length": 121.00000190734863, + "epoch": 0.48066372273518, + "grad_norm": 1.421875, + "kl": 0.026538813253864646, + "learning_rate": 3.099161514775123e-06, + "loss": 0.0011, + "reward": 2.8125000596046448, + "reward_std": 0.3092299550771713, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 898 + }, + { + "completion_length": 182.9583396911621, + "epoch": 0.4811989830054864, + "grad_norm": 2.359375, + "kl": 0.07469095708802342, + "learning_rate": 3.094624431094621e-06, + "loss": 0.003, + "reward": 3.1250001192092896, + "reward_std": 0.612732045352459, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666865348816, + "rewards/xmlcount_reward_func": 0.5, + "step": 899 + }, + { + "completion_length": 153.87500762939453, + "epoch": 0.48173424327579284, + "grad_norm": 3.84375, + "kl": 0.13914787722751498, + "learning_rate": 3.0900852705554618e-06, + "loss": 0.0056, + "reward": 2.7916667461395264, + "reward_std": 1.1055711507797241, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333507180214, + "rewards/xmlcount_reward_func": 0.5, + "step": 900 + }, + { + "completion_length": 150.1666717529297, + "epoch": 0.48226950354609927, + "grad_norm": 1.4921875, + "kl": 0.04241298232227564, + "learning_rate": 3.085544049011679e-06, + "loss": 0.0017, + "reward": 3.2916666865348816, + "reward_std": 0.4541241526603699, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 901 + }, + { + "completion_length": 187.75000762939453, + "epoch": 0.48280476381640575, + "grad_norm": 1.765625, + "kl": 0.03613791987299919, + "learning_rate": 3.0810007823245016e-06, + "loss": 0.0014, + "reward": 2.3125000596046448, + "reward_std": 1.1677038073539734, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333507180214, + "rewards/xmlcount_reward_func": 0.5, + "step": 902 + }, + { + "completion_length": 177.7083396911621, + "epoch": 0.48334002408671217, + "grad_norm": 1.28125, + "kl": 0.03979034349322319, + "learning_rate": 3.0764554863623054e-06, + "loss": 0.0016, + "reward": 2.8541667461395264, + "reward_std": 0.6266467720270157, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666753590107, + "rewards/xmlcount_reward_func": 0.5, + "step": 903 + }, + { + "completion_length": 135.87500381469727, + "epoch": 0.4838752843570186, + "grad_norm": 0.498046875, + "kl": 0.04881257377564907, + "learning_rate": 3.07190817700055e-06, + "loss": 0.002, + "reward": 3.458333373069763, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 904 + }, + { + "completion_length": 175.87500762939453, + "epoch": 0.484410544627325, + "grad_norm": 2.03125, + "kl": 0.030201736837625504, + "learning_rate": 3.0673588701217306e-06, + "loss": 0.0012, + "reward": 2.645833373069763, + "reward_std": 0.6988043785095215, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.3541666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 905 + }, + { + "completion_length": 161.45833587646484, + "epoch": 0.48494580489763145, + "grad_norm": 1.4765625, + "kl": 0.03659933200106025, + "learning_rate": 3.062807581615317e-06, + "loss": 0.0015, + "reward": 2.723958432674408, + "reward_std": 0.7829360365867615, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 906 + }, + { + "completion_length": 156.5833396911621, + "epoch": 0.48548106516793793, + "grad_norm": 1.3125, + "kl": 0.022064207587391138, + "learning_rate": 3.058254327377701e-06, + "loss": 0.0009, + "reward": 3.0416667461395264, + "reward_std": 0.5643851570785046, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 907 + }, + { + "completion_length": 156.20833778381348, + "epoch": 0.48601632543824436, + "grad_norm": 0.66796875, + "kl": 0.029581542825326324, + "learning_rate": 3.053699123312141e-06, + "loss": 0.0012, + "reward": 2.9791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 908 + }, + { + "completion_length": 198.3750057220459, + "epoch": 0.4865515857085508, + "grad_norm": 2.296875, + "kl": 0.03689955791924149, + "learning_rate": 3.0491419853287037e-06, + "loss": 0.0015, + "reward": 2.2916667070239782, + "reward_std": 0.7524303048849106, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.37500000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 0.375, + "step": 909 + }, + { + "completion_length": 142.6666717529297, + "epoch": 0.4870868459788572, + "grad_norm": 1.578125, + "kl": 0.03775651101022959, + "learning_rate": 3.044582929344212e-06, + "loss": 0.0015, + "reward": 2.8541667461395264, + "reward_std": 0.5779038518667221, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 910 + }, + { + "completion_length": 178.7083396911621, + "epoch": 0.48762210624916363, + "grad_norm": 1.265625, + "kl": 0.034308540634810925, + "learning_rate": 3.0400219712821864e-06, + "loss": 0.0014, + "reward": 3.145833432674408, + "reward_std": 0.5230088979005814, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 911 + }, + { + "completion_length": 150.12500381469727, + "epoch": 0.4881573665194701, + "grad_norm": 1.8671875, + "kl": 0.028787806164473295, + "learning_rate": 3.0354591270727936e-06, + "loss": 0.0012, + "reward": 2.583333343267441, + "reward_std": 0.8096110820770264, + "rewards/correctness_reward_func": 1.166666679084301, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 912 + }, + { + "completion_length": 135.25000381469727, + "epoch": 0.48869262678977654, + "grad_norm": 2.046875, + "kl": 0.027838943991810083, + "learning_rate": 3.030894412652785e-06, + "loss": 0.0011, + "reward": 3.1666667461395264, + "reward_std": 0.6664472222328186, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 913 + }, + { + "completion_length": 181.5416717529297, + "epoch": 0.48922788706008297, + "grad_norm": 1.4765625, + "kl": 0.036700944416224957, + "learning_rate": 3.0263278439654465e-06, + "loss": 0.0015, + "reward": 2.8958334028720856, + "reward_std": 0.541967298835516, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 914 + }, + { + "completion_length": 141.37500190734863, + "epoch": 0.4897631473303894, + "grad_norm": 1.6015625, + "kl": 0.017945259111002088, + "learning_rate": 3.0217594369605373e-06, + "loss": 0.0007, + "reward": 3.041666716337204, + "reward_std": 0.306186206638813, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 915 + }, + { + "completion_length": 119.29167175292969, + "epoch": 0.4902984076006958, + "grad_norm": 0.9609375, + "kl": 0.025332989636808634, + "learning_rate": 3.0171892075942415e-06, + "loss": 0.001, + "reward": 3.333333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 916 + }, + { + "completion_length": 210.2083396911621, + "epoch": 0.4908336678710023, + "grad_norm": 1.1640625, + "kl": 0.022253695176914334, + "learning_rate": 3.0126171718291045e-06, + "loss": 0.0009, + "reward": 2.6875000596046448, + "reward_std": 0.6221463531255722, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.37500000558793545, + "rewards/xmlcount_reward_func": 0.5, + "step": 917 + }, + { + "completion_length": 135.37500381469727, + "epoch": 0.4913689281413087, + "grad_norm": 1.46875, + "kl": 0.023740992648527026, + "learning_rate": 3.008043345633984e-06, + "loss": 0.0009, + "reward": 3.395833373069763, + "reward_std": 0.25515517592430115, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 918 + }, + { + "completion_length": 161.4166717529297, + "epoch": 0.49190418841161515, + "grad_norm": 1.5, + "kl": 0.03153214603662491, + "learning_rate": 3.0034677449839893e-06, + "loss": 0.0013, + "reward": 2.5625000596046448, + "reward_std": 0.6113040260970592, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 919 + }, + { + "completion_length": 194.62500381469727, + "epoch": 0.4924394486819216, + "grad_norm": 1.8515625, + "kl": 0.02594477077946067, + "learning_rate": 2.9988903858604275e-06, + "loss": 0.001, + "reward": 2.2343750298023224, + "reward_std": 0.7281012237071991, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 920 + }, + { + "completion_length": 183.25000381469727, + "epoch": 0.492974708952228, + "grad_norm": 6.125, + "kl": 0.20129929389804602, + "learning_rate": 2.9943112842507473e-06, + "loss": 0.0081, + "reward": 3.2916667461395264, + "reward_std": 0.35817956551909447, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 921 + }, + { + "completion_length": 167.5833396911621, + "epoch": 0.4935099692225345, + "grad_norm": 1.7421875, + "kl": 0.021955529926344752, + "learning_rate": 2.989730456148484e-06, + "loss": 0.0009, + "reward": 2.6875000596046448, + "reward_std": 0.7304233312606812, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 922 + }, + { + "completion_length": 174.16667556762695, + "epoch": 0.4940452294928409, + "grad_norm": 1.0703125, + "kl": 0.03281415533274412, + "learning_rate": 2.985147917553205e-06, + "loss": 0.0013, + "reward": 2.8593750298023224, + "reward_std": 0.44327686727046967, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.39583333395421505, + "rewards/xmlcount_reward_func": 0.484375, + "step": 923 + }, + { + "completion_length": 158.45833778381348, + "epoch": 0.49458048976314734, + "grad_norm": 2.046875, + "kl": 0.025178374722599983, + "learning_rate": 2.980563684470448e-06, + "loss": 0.001, + "reward": 2.770833373069763, + "reward_std": 0.8783334791660309, + "rewards/correctness_reward_func": 1.333333395421505, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 924 + }, + { + "completion_length": 159.62500381469727, + "epoch": 0.49511575003345376, + "grad_norm": 1.3515625, + "kl": 0.04505129624158144, + "learning_rate": 2.975977772911671e-06, + "loss": 0.0018, + "reward": 2.9010416865348816, + "reward_std": 0.1904354840517044, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 925 + }, + { + "completion_length": 162.8333396911621, + "epoch": 0.4956510103037602, + "grad_norm": 1.6171875, + "kl": 0.0264615248888731, + "learning_rate": 2.9713901988941956e-06, + "loss": 0.0011, + "reward": 3.1875000596046448, + "reward_std": 0.4816259741783142, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 926 + }, + { + "completion_length": 146.7083396911621, + "epoch": 0.4961862705740666, + "grad_norm": 2.15625, + "kl": 0.03129548905417323, + "learning_rate": 2.9668009784411497e-06, + "loss": 0.0013, + "reward": 3.0416667461395264, + "reward_std": 0.7602093182504177, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 927 + }, + { + "completion_length": 191.08334350585938, + "epoch": 0.4967215308443731, + "grad_norm": 2.21875, + "kl": 0.03684541070833802, + "learning_rate": 2.9622101275814087e-06, + "loss": 0.0015, + "reward": 2.421875089406967, + "reward_std": 0.6723825596272945, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 928 + }, + { + "completion_length": 172.9583339691162, + "epoch": 0.4972567911146795, + "grad_norm": 5.625, + "kl": 0.17009031027555466, + "learning_rate": 2.9576176623495457e-06, + "loss": 0.0068, + "reward": 3.1041667461395264, + "reward_std": 0.8021577149629593, + "rewards/correctness_reward_func": 1.6666667461395264, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 929 + }, + { + "completion_length": 155.3333396911621, + "epoch": 0.49779205138498595, + "grad_norm": 2.390625, + "kl": 0.08845770079642534, + "learning_rate": 2.9530235987857715e-06, + "loss": 0.0035, + "reward": 3.270833432674408, + "reward_std": 0.37377968057990074, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 930 + }, + { + "completion_length": 166.95833587646484, + "epoch": 0.49832731165529237, + "grad_norm": 1.65625, + "kl": 0.028497768100351095, + "learning_rate": 2.948427952935879e-06, + "loss": 0.0011, + "reward": 2.458333373069763, + "reward_std": 0.600963905453682, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 931 + }, + { + "completion_length": 166.4166717529297, + "epoch": 0.4988625719255988, + "grad_norm": 1.71875, + "kl": 0.04362269816920161, + "learning_rate": 2.943830740851189e-06, + "loss": 0.0017, + "reward": 2.7500000596046448, + "reward_std": 0.6913313567638397, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 932 + }, + { + "completion_length": 162.5416717529297, + "epoch": 0.4993978321959053, + "grad_norm": 1.9140625, + "kl": 0.03401909116655588, + "learning_rate": 2.939231978588491e-06, + "loss": 0.0014, + "reward": 3.1041667461395264, + "reward_std": 0.7571656107902527, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 933 + }, + { + "completion_length": 145.75, + "epoch": 0.4999330924662117, + "grad_norm": 0.9921875, + "kl": 0.0415206546895206, + "learning_rate": 2.934631682209989e-06, + "loss": 0.0017, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 934 + }, + { + "completion_length": 175.70833587646484, + "epoch": 0.5004683527365181, + "grad_norm": 0.67578125, + "kl": 0.01767243049107492, + "learning_rate": 2.930029867783246e-06, + "loss": 0.0007, + "reward": 3.125, + "reward_std": 0.25, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 935 + }, + { + "completion_length": 116.75000381469727, + "epoch": 0.5010036130068246, + "grad_norm": 1.296875, + "kl": 0.029608782147988677, + "learning_rate": 2.9254265513811274e-06, + "loss": 0.0012, + "reward": 3.333333373069763, + "reward_std": 0.40824829041957855, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 936 + }, + { + "completion_length": 165.5833396911621, + "epoch": 0.501538873277131, + "grad_norm": 1.7421875, + "kl": 0.049315739423036575, + "learning_rate": 2.920821749081744e-06, + "loss": 0.002, + "reward": 2.895833432674408, + "reward_std": 0.473104827105999, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333507180214, + "rewards/xmlcount_reward_func": 0.5, + "step": 937 + }, + { + "completion_length": 122.04166793823242, + "epoch": 0.5020741335474375, + "grad_norm": 1.6484375, + "kl": 0.05571300070732832, + "learning_rate": 2.9162154769683958e-06, + "loss": 0.0022, + "reward": 3.3750000596046448, + "reward_std": 0.25129128620028496, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 938 + }, + { + "completion_length": 126.45833778381348, + "epoch": 0.5026093938177438, + "grad_norm": 1.9140625, + "kl": 0.034338406287133694, + "learning_rate": 2.911607751129517e-06, + "loss": 0.0014, + "reward": 3.333333373069763, + "reward_std": 0.40824829041957855, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 939 + }, + { + "completion_length": 147.54166984558105, + "epoch": 0.5031446540880503, + "grad_norm": 1.765625, + "kl": 0.0225078200455755, + "learning_rate": 2.9069985876586206e-06, + "loss": 0.0009, + "reward": 2.6875000596046448, + "reward_std": 0.566905565559864, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 940 + }, + { + "completion_length": 143.0416717529297, + "epoch": 0.5036799143583568, + "grad_norm": 1.6953125, + "kl": 0.041789953131228685, + "learning_rate": 2.9023880026542383e-06, + "loss": 0.0017, + "reward": 3.0416667461395264, + "reward_std": 0.5643851235508919, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 941 + }, + { + "completion_length": 149.6666717529297, + "epoch": 0.5042151746286632, + "grad_norm": 2.171875, + "kl": 0.026956678368151188, + "learning_rate": 2.8977760122198697e-06, + "loss": 0.0011, + "reward": 2.625000089406967, + "reward_std": 0.5094902031123638, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.39583333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 942 + }, + { + "completion_length": 119.25000381469727, + "epoch": 0.5047504348989696, + "grad_norm": 1.546875, + "kl": 0.04554880363866687, + "learning_rate": 2.89316263246392e-06, + "loss": 0.0018, + "reward": 3.1250000596046448, + "reward_std": 0.555328756570816, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 943 + }, + { + "completion_length": 123.87500381469727, + "epoch": 0.505285695169276, + "grad_norm": 1.5234375, + "kl": 0.02432346437126398, + "learning_rate": 2.8885478794996496e-06, + "loss": 0.001, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 944 + }, + { + "completion_length": 149.4166717529297, + "epoch": 0.5058209554395825, + "grad_norm": 0.091796875, + "kl": 0.02958611771464348, + "learning_rate": 2.883931769445114e-06, + "loss": 0.0012, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 945 + }, + { + "completion_length": 137.6250057220459, + "epoch": 0.506356215709889, + "grad_norm": 2.015625, + "kl": 0.03613093541935086, + "learning_rate": 2.879314318423108e-06, + "loss": 0.0014, + "reward": 2.5208334028720856, + "reward_std": 0.6476409733295441, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 946 + }, + { + "completion_length": 128.70833778381348, + "epoch": 0.5068914759801953, + "grad_norm": 1.421875, + "kl": 0.024170507676899433, + "learning_rate": 2.8746955425611122e-06, + "loss": 0.001, + "reward": 3.1666666865348816, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 947 + }, + { + "completion_length": 138.3333396911621, + "epoch": 0.5074267362505018, + "grad_norm": 1.5234375, + "kl": 0.0231890631839633, + "learning_rate": 2.8700754579912315e-06, + "loss": 0.0009, + "reward": 3.3125000596046448, + "reward_std": 0.4592793434858322, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 948 + }, + { + "completion_length": 177.25000762939453, + "epoch": 0.5079619965208082, + "grad_norm": 1.546875, + "kl": 0.027233313769102097, + "learning_rate": 2.8654540808501447e-06, + "loss": 0.0011, + "reward": 2.536458373069763, + "reward_std": 0.40045326575636864, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 949 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.5084972567911147, + "grad_norm": 3.359375, + "kl": 0.07531993184238672, + "learning_rate": 2.8608314272790427e-06, + "loss": 0.003, + "reward": 2.770833432674408, + "reward_std": 1.1262514144182205, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 950 + }, + { + "completion_length": 123.58333778381348, + "epoch": 0.5090325170614212, + "grad_norm": 1.625, + "kl": 0.03198196832090616, + "learning_rate": 2.8562075134235757e-06, + "loss": 0.0013, + "reward": 3.2291667461395264, + "reward_std": 0.5133541226387024, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 951 + }, + { + "completion_length": 151.91666984558105, + "epoch": 0.5095677773317275, + "grad_norm": 1.03125, + "kl": 0.03794420650228858, + "learning_rate": 2.8515823554337973e-06, + "loss": 0.0015, + "reward": 3.1875000596046448, + "reward_std": 0.3647233098745346, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 952 + }, + { + "completion_length": 144.1666717529297, + "epoch": 0.510103037602034, + "grad_norm": 3.234375, + "kl": 0.10791193041950464, + "learning_rate": 2.846955969464103e-06, + "loss": 0.0043, + "reward": 3.2500000596046448, + "reward_std": 0.3347994238138199, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 953 + }, + { + "completion_length": 181.12500381469727, + "epoch": 0.5106382978723404, + "grad_norm": 1.5390625, + "kl": 0.027862816117703915, + "learning_rate": 2.8423283716731807e-06, + "loss": 0.0011, + "reward": 2.8750000596046448, + "reward_std": 0.6469470970332623, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 954 + }, + { + "completion_length": 138.75000381469727, + "epoch": 0.5111735581426469, + "grad_norm": 1.5703125, + "kl": 0.04192983591929078, + "learning_rate": 2.8376995782239486e-06, + "loss": 0.0017, + "reward": 3.0416666865348816, + "reward_std": 0.5552270114421844, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 955 + }, + { + "completion_length": 143.0416717529297, + "epoch": 0.5117088184129533, + "grad_norm": 2.078125, + "kl": 0.05718976445496082, + "learning_rate": 2.8330696052835017e-06, + "loss": 0.0023, + "reward": 3.145833373069763, + "reward_std": 0.5989172980189323, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666865348816, + "rewards/xmlcount_reward_func": 0.5, + "step": 956 + }, + { + "completion_length": 156.25000381469727, + "epoch": 0.5122440786832597, + "grad_norm": 1.859375, + "kl": 0.033939515706151724, + "learning_rate": 2.828438469023056e-06, + "loss": 0.0014, + "reward": 2.9375000596046448, + "reward_std": 0.8021840006113052, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 957 + }, + { + "completion_length": 138.04166793823242, + "epoch": 0.5127793389535662, + "grad_norm": 1.734375, + "kl": 0.02495129080489278, + "learning_rate": 2.8238061856178888e-06, + "loss": 0.001, + "reward": 3.2500000596046448, + "reward_std": 0.46232306957244873, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 958 + }, + { + "completion_length": 158.12500762939453, + "epoch": 0.5133145992238726, + "grad_norm": 1.9921875, + "kl": 0.03997566644102335, + "learning_rate": 2.8191727712472837e-06, + "loss": 0.0016, + "reward": 2.833333373069763, + "reward_std": 0.7882219962775707, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 959 + }, + { + "completion_length": 170.9583339691162, + "epoch": 0.513849859494179, + "grad_norm": 1.984375, + "kl": 0.023815520806238055, + "learning_rate": 2.8145382420944767e-06, + "loss": 0.001, + "reward": 1.8333333730697632, + "reward_std": 0.7716152630746365, + "rewards/correctness_reward_func": 0.4166666716337204, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 960 + }, + { + "completion_length": 151.45833587646484, + "epoch": 0.5143851197644855, + "grad_norm": 1.09375, + "kl": 0.030205977149307728, + "learning_rate": 2.8099026143465952e-06, + "loss": 0.0012, + "reward": 3.1250000596046448, + "reward_std": 0.2803870290517807, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 961 + }, + { + "completion_length": 146.4583396911621, + "epoch": 0.5149203800347919, + "grad_norm": 1.5625, + "kl": 0.02901528449729085, + "learning_rate": 2.8052659041946063e-06, + "loss": 0.0012, + "reward": 2.7291667461395264, + "reward_std": 0.45845916867256165, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 962 + }, + { + "completion_length": 149.00000381469727, + "epoch": 0.5154556403050984, + "grad_norm": 1.6640625, + "kl": 0.045791531912982464, + "learning_rate": 2.8006281278332542e-06, + "loss": 0.0018, + "reward": 2.8541667461395264, + "reward_std": 0.6371217519044876, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 963 + }, + { + "completion_length": 131.41666793823242, + "epoch": 0.5159909005754048, + "grad_norm": 0.796875, + "kl": 0.024918334558606148, + "learning_rate": 2.795989301461009e-06, + "loss": 0.001, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 964 + }, + { + "completion_length": 178.00000381469727, + "epoch": 0.5165261608457112, + "grad_norm": 1.296875, + "kl": 0.03127570729702711, + "learning_rate": 2.7913494412800087e-06, + "loss": 0.0013, + "reward": 2.9635417461395264, + "reward_std": 0.5725657343864441, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 965 + }, + { + "completion_length": 165.16667366027832, + "epoch": 0.5170614211160177, + "grad_norm": 1.390625, + "kl": 0.02652121242135763, + "learning_rate": 2.786708563496002e-06, + "loss": 0.0011, + "reward": 3.1875000596046448, + "reward_std": 0.5145338624715805, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 966 + }, + { + "completion_length": 155.50000762939453, + "epoch": 0.5175966813863241, + "grad_norm": 1.0625, + "kl": 0.028957795351743698, + "learning_rate": 2.78206668431829e-06, + "loss": 0.0012, + "reward": 3.1041666865348816, + "reward_std": 0.3266642391681671, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 967 + }, + { + "completion_length": 171.50000381469727, + "epoch": 0.5181319416566306, + "grad_norm": 1.359375, + "kl": 0.02106982236728072, + "learning_rate": 2.7774238199596726e-06, + "loss": 0.0008, + "reward": 2.8125, + "reward_std": 0.4269544184207916, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 968 + }, + { + "completion_length": 180.08333587646484, + "epoch": 0.5186672019269369, + "grad_norm": 1.7109375, + "kl": 0.017889021197333932, + "learning_rate": 2.772779986636392e-06, + "loss": 0.0007, + "reward": 2.958333432674408, + "reward_std": 0.743688777089119, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 969 + }, + { + "completion_length": 106.16666793823242, + "epoch": 0.5192024621972434, + "grad_norm": 1.265625, + "kl": 0.03550974791869521, + "learning_rate": 2.768135200568073e-06, + "loss": 0.0014, + "reward": 3.395833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 970 + }, + { + "completion_length": 165.04167366027832, + "epoch": 0.5197377224675499, + "grad_norm": 1.421875, + "kl": 0.02399549330584705, + "learning_rate": 2.7634894779776676e-06, + "loss": 0.001, + "reward": 3.1875000596046448, + "reward_std": 0.4875549077987671, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 971 + }, + { + "completion_length": 177.45833587646484, + "epoch": 0.5202729827378563, + "grad_norm": 1.734375, + "kl": 0.04331376403570175, + "learning_rate": 2.7588428350914014e-06, + "loss": 0.0017, + "reward": 2.770833373069763, + "reward_std": 0.8424348831176758, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 972 + }, + { + "completion_length": 149.16667366027832, + "epoch": 0.5208082430081628, + "grad_norm": 1.3203125, + "kl": 0.05422828788869083, + "learning_rate": 2.7541952881387115e-06, + "loss": 0.0022, + "reward": 3.2916667461395264, + "reward_std": 0.3602609895169735, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 973 + }, + { + "completion_length": 116.33333587646484, + "epoch": 0.5213435032784691, + "grad_norm": 1.6015625, + "kl": 0.026882473845034838, + "learning_rate": 2.7495468533521935e-06, + "loss": 0.0011, + "reward": 3.333333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 974 + }, + { + "completion_length": 167.29166984558105, + "epoch": 0.5218787635487756, + "grad_norm": 1.59375, + "kl": 0.0297106949146837, + "learning_rate": 2.744897546967545e-06, + "loss": 0.0012, + "reward": 2.8125000596046448, + "reward_std": 0.6229222267866135, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 975 + }, + { + "completion_length": 141.7916717529297, + "epoch": 0.522414023819082, + "grad_norm": 1.3984375, + "kl": 0.03204418160021305, + "learning_rate": 2.7402473852235073e-06, + "loss": 0.0013, + "reward": 3.395833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 976 + }, + { + "completion_length": 235.25000381469727, + "epoch": 0.5229492840893885, + "grad_norm": 1.078125, + "kl": 0.027873071609064937, + "learning_rate": 2.735596384361809e-06, + "loss": 0.0011, + "reward": 2.4166666865348816, + "reward_std": 0.7820602059364319, + "rewards/correctness_reward_func": 1.0000000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 977 + }, + { + "completion_length": 134.4583396911621, + "epoch": 0.5234845443596949, + "grad_norm": 1.65625, + "kl": 0.03450268576852977, + "learning_rate": 2.730944560627109e-06, + "loss": 0.0014, + "reward": 3.020833432674408, + "reward_std": 0.6229222267866135, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 978 + }, + { + "completion_length": 173.29166793823242, + "epoch": 0.5240198046300013, + "grad_norm": 1.8828125, + "kl": 0.040093475952744484, + "learning_rate": 2.7262919302669405e-06, + "loss": 0.0016, + "reward": 2.312500089406967, + "reward_std": 0.9042879492044449, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 979 + }, + { + "completion_length": 144.16667366027832, + "epoch": 0.5245550649003078, + "grad_norm": 1.9609375, + "kl": 0.024688265286386013, + "learning_rate": 2.721638509531656e-06, + "loss": 0.001, + "reward": 3.083333432674408, + "reward_std": 0.7205219864845276, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 980 + }, + { + "completion_length": 162.4166717529297, + "epoch": 0.5250903251706142, + "grad_norm": 1.7578125, + "kl": 0.04531784076243639, + "learning_rate": 2.7169843146743658e-06, + "loss": 0.0018, + "reward": 2.9791667461395264, + "reward_std": 0.5473008155822754, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 981 + }, + { + "completion_length": 114.29166984558105, + "epoch": 0.5256255854409206, + "grad_norm": 1.8125, + "kl": 0.041602776385843754, + "learning_rate": 2.7123293619508855e-06, + "loss": 0.0017, + "reward": 2.7708334028720856, + "reward_std": 0.3572172783315182, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.37500000558793545, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 982 + }, + { + "completion_length": 218.9166717529297, + "epoch": 0.5261608457112271, + "grad_norm": 1.3515625, + "kl": 0.04277056595310569, + "learning_rate": 2.7076736676196764e-06, + "loss": 0.0017, + "reward": 2.692708432674408, + "reward_std": 0.719441369175911, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.31250000558793545, + "rewards/xmlcount_reward_func": 0.484375, + "step": 983 + }, + { + "completion_length": 173.4583396911621, + "epoch": 0.5266961059815335, + "grad_norm": 2.03125, + "kl": 0.032342477701604366, + "learning_rate": 2.703017247941793e-06, + "loss": 0.0013, + "reward": 2.9166667461395264, + "reward_std": 0.8832631707191467, + "rewards/correctness_reward_func": 1.5000000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 984 + }, + { + "completion_length": 142.9583396911621, + "epoch": 0.52723136625184, + "grad_norm": 0.94921875, + "kl": 0.02848183922469616, + "learning_rate": 2.6983601191808184e-06, + "loss": 0.0011, + "reward": 2.8750000596046448, + "reward_std": 0.25129128992557526, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 985 + }, + { + "completion_length": 134.70833778381348, + "epoch": 0.5277666265221463, + "grad_norm": 1.4296875, + "kl": 0.024114880245178938, + "learning_rate": 2.6937022976028176e-06, + "loss": 0.001, + "reward": 3.2291667461395264, + "reward_std": 0.5133541226387024, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 986 + }, + { + "completion_length": 166.91666793823242, + "epoch": 0.5283018867924528, + "grad_norm": 1.8359375, + "kl": 0.045298111625015736, + "learning_rate": 2.6890437994762716e-06, + "loss": 0.0018, + "reward": 2.395833358168602, + "reward_std": 0.6988043487071991, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666753590107, + "rewards/xmlcount_reward_func": 0.5, + "step": 987 + }, + { + "completion_length": 160.79166984558105, + "epoch": 0.5288371470627593, + "grad_norm": 1.90625, + "kl": 0.029747297056019306, + "learning_rate": 2.684384641072026e-06, + "loss": 0.0012, + "reward": 3.3125000596046448, + "reward_std": 0.31970490515232086, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 988 + }, + { + "completion_length": 122.95833396911621, + "epoch": 0.5293724073330657, + "grad_norm": 3.671875, + "kl": 0.04499641829170287, + "learning_rate": 2.6797248386632328e-06, + "loss": 0.0018, + "reward": 3.3854166865348816, + "reward_std": 0.280670702457428, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 989 + }, + { + "completion_length": 148.25000381469727, + "epoch": 0.5299076676033722, + "grad_norm": 2.078125, + "kl": 0.09255302604287863, + "learning_rate": 2.6750644085252926e-06, + "loss": 0.0037, + "reward": 3.1666667461395264, + "reward_std": 0.4376493915915489, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.354166679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 990 + }, + { + "completion_length": 188.8750057220459, + "epoch": 0.5304429278736785, + "grad_norm": 0.9375, + "kl": 0.03525672573596239, + "learning_rate": 2.6704033669357986e-06, + "loss": 0.0014, + "reward": 2.895833343267441, + "reward_std": 0.3776441812515259, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 991 + }, + { + "completion_length": 123.04167175292969, + "epoch": 0.530978188143985, + "grad_norm": 2.140625, + "kl": 0.06268464820459485, + "learning_rate": 2.6657417301744796e-06, + "loss": 0.0025, + "reward": 2.770833432674408, + "reward_std": 0.7469579875469208, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 992 + }, + { + "completion_length": 125.41666984558105, + "epoch": 0.5315134484142915, + "grad_norm": 1.21875, + "kl": 0.0292810145765543, + "learning_rate": 2.6610795145231443e-06, + "loss": 0.0012, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 993 + }, + { + "completion_length": 141.0416717529297, + "epoch": 0.5320487086845979, + "grad_norm": 1.3671875, + "kl": 0.030940232798457146, + "learning_rate": 2.656416736265621e-06, + "loss": 0.0012, + "reward": 3.2916666865348816, + "reward_std": 0.39777331054210663, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 994 + }, + { + "completion_length": 143.00000190734863, + "epoch": 0.5325839689549043, + "grad_norm": 1.03125, + "kl": 0.0226780385710299, + "learning_rate": 2.6517534116877046e-06, + "loss": 0.0009, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 995 + }, + { + "completion_length": 205.6666717529297, + "epoch": 0.5331192292252107, + "grad_norm": 1.34375, + "kl": 0.03955272724851966, + "learning_rate": 2.647089557077099e-06, + "loss": 0.0016, + "reward": 2.7447916865348816, + "reward_std": 0.5635670721530914, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666679084301, + "rewards/xmlcount_reward_func": 0.453125, + "step": 996 + }, + { + "completion_length": 188.54167556762695, + "epoch": 0.5336544894955172, + "grad_norm": 1.1796875, + "kl": 0.031185157131403685, + "learning_rate": 2.6424251887233574e-06, + "loss": 0.0012, + "reward": 3.0416667461395264, + "reward_std": 0.3827027641236782, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 997 + }, + { + "completion_length": 168.875, + "epoch": 0.5341897497658237, + "grad_norm": 2.03125, + "kl": 0.028122437419369817, + "learning_rate": 2.6377603229178278e-06, + "loss": 0.0011, + "reward": 2.7291667461395264, + "reward_std": 0.5599979385733604, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 998 + }, + { + "completion_length": 115.91667175292969, + "epoch": 0.53472501003613, + "grad_norm": 1.015625, + "kl": 0.021800895920023322, + "learning_rate": 2.633094975953597e-06, + "loss": 0.0009, + "reward": 3.1666666865348816, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 999 + }, + { + "completion_length": 180.4166717529297, + "epoch": 0.5352602703064365, + "grad_norm": 1.875, + "kl": 0.04836212657392025, + "learning_rate": 2.6284291641254308e-06, + "loss": 0.0019, + "reward": 2.2500000298023224, + "reward_std": 1.158027172088623, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.3958333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1000 + }, + { + "completion_length": 138.3750057220459, + "epoch": 0.5357955305767429, + "grad_norm": 1.5, + "kl": 0.029096576385200024, + "learning_rate": 2.62376290372972e-06, + "loss": 0.0012, + "reward": 3.333333373069763, + "reward_std": 0.40824830532073975, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1001 + }, + { + "completion_length": 160.2916717529297, + "epoch": 0.5363307908470494, + "grad_norm": 1.8671875, + "kl": 0.06832502828910947, + "learning_rate": 2.6190962110644215e-06, + "loss": 0.0027, + "reward": 3.0625000596046448, + "reward_std": 0.6765787862241268, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1002 + }, + { + "completion_length": 145.79166984558105, + "epoch": 0.5368660511173559, + "grad_norm": 1.3828125, + "kl": 0.04142569610849023, + "learning_rate": 2.6144291024290004e-06, + "loss": 0.0017, + "reward": 3.3125, + "reward_std": 0.246855229139328, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1003 + }, + { + "completion_length": 175.95833587646484, + "epoch": 0.5374013113876622, + "grad_norm": 1.4375, + "kl": 0.029265500139445066, + "learning_rate": 2.6097615941243777e-06, + "loss": 0.0012, + "reward": 2.6875000596046448, + "reward_std": 0.40223564952611923, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1004 + }, + { + "completion_length": 116.12500381469727, + "epoch": 0.5379365716579687, + "grad_norm": 0.90625, + "kl": 0.04694632440805435, + "learning_rate": 2.605093702452868e-06, + "loss": 0.0019, + "reward": 3.3125, + "reward_std": 0.246855229139328, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1005 + }, + { + "completion_length": 151.50000381469727, + "epoch": 0.5384718319282751, + "grad_norm": 1.9921875, + "kl": 0.04991014767438173, + "learning_rate": 2.600425443718127e-06, + "loss": 0.002, + "reward": 3.0625000596046448, + "reward_std": 0.5599183700978756, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1006 + }, + { + "completion_length": 146.8333396911621, + "epoch": 0.5390070921985816, + "grad_norm": 1.6640625, + "kl": 0.03239775216206908, + "learning_rate": 2.595756834225089e-06, + "loss": 0.0013, + "reward": 3.458333373069763, + "reward_std": 0.10206207633018494, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1007 + }, + { + "completion_length": 163.5833396911621, + "epoch": 0.539542352468888, + "grad_norm": 1.9609375, + "kl": 0.03426534216850996, + "learning_rate": 2.591087890279917e-06, + "loss": 0.0014, + "reward": 3.0000000596046448, + "reward_std": 0.8296719007194042, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1008 + }, + { + "completion_length": 170.04166793823242, + "epoch": 0.5400776127391944, + "grad_norm": 1.109375, + "kl": 0.02527566161006689, + "learning_rate": 2.58641862818994e-06, + "loss": 0.001, + "reward": 2.645833343267441, + "reward_std": 0.4242093414068222, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1009 + }, + { + "completion_length": 150.33333587646484, + "epoch": 0.5406128730095009, + "grad_norm": 1.59375, + "kl": 0.027325558941811323, + "learning_rate": 2.5817490642636e-06, + "loss": 0.0011, + "reward": 2.583333432674408, + "reward_std": 0.7205219715833664, + "rewards/correctness_reward_func": 1.0833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1010 + }, + { + "completion_length": 124.375, + "epoch": 0.5411481332798073, + "grad_norm": 1.2421875, + "kl": 0.03881736192852259, + "learning_rate": 2.5770792148103916e-06, + "loss": 0.0016, + "reward": 3.333333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1011 + }, + { + "completion_length": 187.00000762939453, + "epoch": 0.5416833935501137, + "grad_norm": 1.796875, + "kl": 0.04065545601770282, + "learning_rate": 2.5724090961408066e-06, + "loss": 0.0016, + "reward": 2.3958333879709244, + "reward_std": 0.3572172410786152, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 1012 + }, + { + "completion_length": 121.04166984558105, + "epoch": 0.5422186538204202, + "grad_norm": 0.73828125, + "kl": 0.039184169843792915, + "learning_rate": 2.5677387245662782e-06, + "loss": 0.0016, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1013 + }, + { + "completion_length": 143.58333778381348, + "epoch": 0.5427539140907266, + "grad_norm": 1.703125, + "kl": 0.03704654565081, + "learning_rate": 2.5630681163991224e-06, + "loss": 0.0015, + "reward": 3.145833373069763, + "reward_std": 0.5618248581886292, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1014 + }, + { + "completion_length": 137.87500762939453, + "epoch": 0.5432891743610331, + "grad_norm": 1.46875, + "kl": 0.026060293428599834, + "learning_rate": 2.5583972879524817e-06, + "loss": 0.001, + "reward": 3.2291667461395264, + "reward_std": 0.6634034961462021, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1015 + }, + { + "completion_length": 150.66666984558105, + "epoch": 0.5438244346313394, + "grad_norm": 1.484375, + "kl": 0.023670056369155645, + "learning_rate": 2.5537262555402675e-06, + "loss": 0.0009, + "reward": 3.2291667461395264, + "reward_std": 0.51335409283638, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1016 + }, + { + "completion_length": 137.62500190734863, + "epoch": 0.5443596949016459, + "grad_norm": 1.6015625, + "kl": 0.025291157886385918, + "learning_rate": 2.5490550354771044e-06, + "loss": 0.001, + "reward": 3.395833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1017 + }, + { + "completion_length": 199.79166984558105, + "epoch": 0.5448949551719524, + "grad_norm": 1.6953125, + "kl": 0.04733401257544756, + "learning_rate": 2.544383644078271e-06, + "loss": 0.0019, + "reward": 2.2916667461395264, + "reward_std": 0.5906235836446285, + "rewards/correctness_reward_func": 0.9166666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1018 + }, + { + "completion_length": 163.5416717529297, + "epoch": 0.5454302154422588, + "grad_norm": 1.8046875, + "kl": 0.019132951041683555, + "learning_rate": 2.539712097659647e-06, + "loss": 0.0008, + "reward": 2.958333373069763, + "reward_std": 0.47524039447307587, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1019 + }, + { + "completion_length": 124.37500381469727, + "epoch": 0.5459654757125653, + "grad_norm": 1.6640625, + "kl": 0.03320982772856951, + "learning_rate": 2.5350404125376494e-06, + "loss": 0.0013, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1020 + }, + { + "completion_length": 171.2500057220459, + "epoch": 0.5465007359828716, + "grad_norm": 1.3125, + "kl": 0.03306609811261296, + "learning_rate": 2.530368605029185e-06, + "loss": 0.0013, + "reward": 2.645833373069763, + "reward_std": 0.4741215407848358, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1021 + }, + { + "completion_length": 127.37500381469727, + "epoch": 0.5470359962531781, + "grad_norm": 0.921875, + "kl": 0.036080996971577406, + "learning_rate": 2.5256966914515823e-06, + "loss": 0.0014, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1022 + }, + { + "completion_length": 154.00000381469727, + "epoch": 0.5475712565234846, + "grad_norm": 1.6328125, + "kl": 0.028959017246961594, + "learning_rate": 2.5210246881225448e-06, + "loss": 0.0012, + "reward": 3.4375000596046448, + "reward_std": 0.11558076739311218, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1023 + }, + { + "completion_length": 153.16667366027832, + "epoch": 0.548106516793791, + "grad_norm": 1.5, + "kl": 0.04141217190772295, + "learning_rate": 2.516352611360088e-06, + "loss": 0.0017, + "reward": 2.6875000596046448, + "reward_std": 0.3894420526921749, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1024 + }, + { + "completion_length": 179.12500381469727, + "epoch": 0.5486417770640974, + "grad_norm": 1.4453125, + "kl": 0.02090016705915332, + "learning_rate": 2.511680477482482e-06, + "loss": 0.0008, + "reward": 2.7291667461395264, + "reward_std": 0.39121396839618683, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1025 + }, + { + "completion_length": 168.7916717529297, + "epoch": 0.5491770373344038, + "grad_norm": 1.265625, + "kl": 0.02719574049115181, + "learning_rate": 2.5070083028082004e-06, + "loss": 0.0011, + "reward": 2.8333334028720856, + "reward_std": 0.3707359693944454, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 1026 + }, + { + "completion_length": 185.50000762939453, + "epoch": 0.5497122976047103, + "grad_norm": 1.59375, + "kl": 0.02753805136308074, + "learning_rate": 2.5023361036558546e-06, + "loss": 0.0011, + "reward": 2.208333432674408, + "reward_std": 0.8469306528568268, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1027 + }, + { + "completion_length": 128.08333587646484, + "epoch": 0.5502475578750168, + "grad_norm": 1.578125, + "kl": 0.02586387423798442, + "learning_rate": 2.497663896344146e-06, + "loss": 0.001, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1028 + }, + { + "completion_length": 165.0833396911621, + "epoch": 0.5507828181453231, + "grad_norm": 0.5546875, + "kl": 0.02411092072725296, + "learning_rate": 2.4929916971917995e-06, + "loss": 0.001, + "reward": 3.458333373069763, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1029 + }, + { + "completion_length": 141.66666793823242, + "epoch": 0.5513180784156296, + "grad_norm": 0.72265625, + "kl": 0.029878363013267517, + "learning_rate": 2.4883195225175188e-06, + "loss": 0.0012, + "reward": 3.2291666865348816, + "reward_std": 0.25515520572662354, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1030 + }, + { + "completion_length": 142.4583339691162, + "epoch": 0.551853338685936, + "grad_norm": 1.421875, + "kl": 0.048918829299509525, + "learning_rate": 2.4836473886399133e-06, + "loss": 0.002, + "reward": 3.2500000596046448, + "reward_std": 0.46232303977012634, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1031 + }, + { + "completion_length": 184.1666717529297, + "epoch": 0.5523885989562425, + "grad_norm": 1.6484375, + "kl": 0.027948823757469654, + "learning_rate": 2.4789753118774552e-06, + "loss": 0.0011, + "reward": 3.208333432674408, + "reward_std": 0.5643851608037949, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1032 + }, + { + "completion_length": 175.29167556762695, + "epoch": 0.552923859226549, + "grad_norm": 1.265625, + "kl": 0.028014506213366985, + "learning_rate": 2.474303308548418e-06, + "loss": 0.0011, + "reward": 2.776041716337204, + "reward_std": 0.5555943250656128, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1033 + }, + { + "completion_length": 156.00000381469727, + "epoch": 0.5534591194968553, + "grad_norm": 1.2109375, + "kl": 0.047277290374040604, + "learning_rate": 2.469631394970816e-06, + "loss": 0.0019, + "reward": 3.1666667461395264, + "reward_std": 0.5163978338241577, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1034 + }, + { + "completion_length": 179.62500762939453, + "epoch": 0.5539943797671618, + "grad_norm": 1.5234375, + "kl": 0.03146013617515564, + "learning_rate": 2.464959587462351e-06, + "loss": 0.0013, + "reward": 2.75, + "reward_std": 0.6605896055698395, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1035 + }, + { + "completion_length": 138.83333587646484, + "epoch": 0.5545296400374682, + "grad_norm": 1.6875, + "kl": 0.018021578900516033, + "learning_rate": 2.4602879023403547e-06, + "loss": 0.0007, + "reward": 3.2291667461395264, + "reward_std": 0.45845916867256165, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1036 + }, + { + "completion_length": 179.50000190734863, + "epoch": 0.5550649003077747, + "grad_norm": 1.7734375, + "kl": 0.03059578686952591, + "learning_rate": 2.4556163559217294e-06, + "loss": 0.0012, + "reward": 2.333333373069763, + "reward_std": 0.8427640199661255, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1037 + }, + { + "completion_length": 165.4166717529297, + "epoch": 0.555600160578081, + "grad_norm": 2.09375, + "kl": 0.04378655459731817, + "learning_rate": 2.4509449645228965e-06, + "loss": 0.0018, + "reward": 2.833333432674408, + "reward_std": 0.3533533588051796, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666865348816, + "rewards/xmlcount_reward_func": 0.5, + "step": 1038 + }, + { + "completion_length": 145.95833587646484, + "epoch": 0.5561354208483875, + "grad_norm": 1.390625, + "kl": 0.030613688752055168, + "learning_rate": 2.4462737444597337e-06, + "loss": 0.0012, + "reward": 3.125, + "reward_std": 0.523861289024353, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1039 + }, + { + "completion_length": 145.1666717529297, + "epoch": 0.556670681118694, + "grad_norm": 1.515625, + "kl": 0.06425127293914557, + "learning_rate": 2.441602712047519e-06, + "loss": 0.0026, + "reward": 2.645833373069763, + "reward_std": 0.4741215407848358, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1040 + }, + { + "completion_length": 143.87500381469727, + "epoch": 0.5572059413890004, + "grad_norm": 2.625, + "kl": 0.04020787123590708, + "learning_rate": 2.436931883600879e-06, + "loss": 0.0016, + "reward": 3.057291805744171, + "reward_std": 0.6287723630666733, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333507180214, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 1041 + }, + { + "completion_length": 164.70833587646484, + "epoch": 0.5577412016593069, + "grad_norm": 1.796875, + "kl": 0.027084154076874256, + "learning_rate": 2.432261275433722e-06, + "loss": 0.0011, + "reward": 2.6621667444705963, + "reward_std": 0.6238786093890667, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.49549999833106995, + "step": 1042 + }, + { + "completion_length": 145.70834159851074, + "epoch": 0.5582764619296132, + "grad_norm": 1.8828125, + "kl": 0.028898541815578938, + "learning_rate": 2.427590903859194e-06, + "loss": 0.0012, + "reward": 2.791666716337204, + "reward_std": 0.6409856230020523, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1043 + }, + { + "completion_length": 157.3333396911621, + "epoch": 0.5588117221999197, + "grad_norm": 1.953125, + "kl": 0.036141276359558105, + "learning_rate": 2.4229207851896096e-06, + "loss": 0.0014, + "reward": 2.9947917461395264, + "reward_std": 0.7928604781627655, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 1044 + }, + { + "completion_length": 153.50000381469727, + "epoch": 0.5593469824702262, + "grad_norm": 1.7890625, + "kl": 0.03226162260398269, + "learning_rate": 2.4182509357364005e-06, + "loss": 0.0013, + "reward": 2.708333373069763, + "reward_std": 0.5884110182523727, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1045 + }, + { + "completion_length": 235.75000381469727, + "epoch": 0.5598822427405326, + "grad_norm": 1.9296875, + "kl": 0.03827035194262862, + "learning_rate": 2.4135813718100607e-06, + "loss": 0.0015, + "reward": 2.541666731238365, + "reward_std": 0.43266692385077477, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.2708333395421505, + "rewards/xmlcount_reward_func": 0.4375, + "step": 1046 + }, + { + "completion_length": 141.6666717529297, + "epoch": 0.560417503010839, + "grad_norm": 2.015625, + "kl": 0.034219959285110235, + "learning_rate": 2.4089121097200836e-06, + "loss": 0.0014, + "reward": 3.2916667461395264, + "reward_std": 0.5103103443980217, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1047 + }, + { + "completion_length": 142.29166984558105, + "epoch": 0.5609527632811454, + "grad_norm": 2.265625, + "kl": 0.03296273294836283, + "learning_rate": 2.404243165774912e-06, + "loss": 0.0013, + "reward": 2.9166667461395264, + "reward_std": 0.6358941905200481, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1048 + }, + { + "completion_length": 183.08333587646484, + "epoch": 0.5614880235514519, + "grad_norm": 1.5625, + "kl": 0.05004376173019409, + "learning_rate": 2.3995745562818747e-06, + "loss": 0.002, + "reward": 2.704166680574417, + "reward_std": 0.9476025104522705, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.3958333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.433333333581686, + "step": 1049 + }, + { + "completion_length": 158.0416717529297, + "epoch": 0.5620232838217584, + "grad_norm": 0.84765625, + "kl": 0.027449314016848803, + "learning_rate": 2.3949062975471325e-06, + "loss": 0.0011, + "reward": 3.395833373069763, + "reward_std": 0.25515517592430115, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1050 + }, + { + "completion_length": 128.50000381469727, + "epoch": 0.5625585440920647, + "grad_norm": 1.4765625, + "kl": 0.03116176975890994, + "learning_rate": 2.390238405875623e-06, + "loss": 0.0012, + "reward": 3.2916666865348816, + "reward_std": 0.2978862635791302, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1051 + }, + { + "completion_length": 203.7916717529297, + "epoch": 0.5630938043623712, + "grad_norm": 1.7734375, + "kl": 0.027977202786132693, + "learning_rate": 2.385570897571001e-06, + "loss": 0.0011, + "reward": 2.8125000596046448, + "reward_std": 0.6778688579797745, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1052 + }, + { + "completion_length": 102.75000190734863, + "epoch": 0.5636290646326776, + "grad_norm": 1.4765625, + "kl": 0.05541924946010113, + "learning_rate": 2.3809037889355794e-06, + "loss": 0.0022, + "reward": 3.4166666865348816, + "reward_std": 0.11949635669589043, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 1053 + }, + { + "completion_length": 144.45833778381348, + "epoch": 0.5641643249029841, + "grad_norm": 1.8125, + "kl": 0.057408999651670456, + "learning_rate": 2.3762370962702803e-06, + "loss": 0.0023, + "reward": 3.083333373069763, + "reward_std": 0.5582601875066757, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1054 + }, + { + "completion_length": 163.0833396911621, + "epoch": 0.5646995851732906, + "grad_norm": 1.0546875, + "kl": 0.022327065002173185, + "learning_rate": 2.371570835874569e-06, + "loss": 0.0009, + "reward": 3.395833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1055 + }, + { + "completion_length": 190.25000381469727, + "epoch": 0.5652348454435969, + "grad_norm": 1.4609375, + "kl": 0.0315181240439415, + "learning_rate": 2.366905024046404e-06, + "loss": 0.0013, + "reward": 2.7916666865348816, + "reward_std": 0.7279854267835617, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1056 + }, + { + "completion_length": 148.12500381469727, + "epoch": 0.5657701057139034, + "grad_norm": 1.9375, + "kl": 0.031283630756661296, + "learning_rate": 2.3622396770821735e-06, + "loss": 0.0013, + "reward": 2.7916667461395264, + "reward_std": 0.743688777089119, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1057 + }, + { + "completion_length": 167.4583396911621, + "epoch": 0.5663053659842098, + "grad_norm": 1.796875, + "kl": 0.04646863928064704, + "learning_rate": 2.3575748112766434e-06, + "loss": 0.0019, + "reward": 2.708333373069763, + "reward_std": 0.517269667237997, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1058 + }, + { + "completion_length": 153.87500381469727, + "epoch": 0.5668406262545163, + "grad_norm": 1.296875, + "kl": 0.032381411641836166, + "learning_rate": 2.352910442922902e-06, + "loss": 0.0013, + "reward": 3.145833373069763, + "reward_std": 0.4937000125646591, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 1059 + }, + { + "completion_length": 167.9583396911621, + "epoch": 0.5673758865248227, + "grad_norm": 1.5390625, + "kl": 0.029462992679327726, + "learning_rate": 2.348246588312296e-06, + "loss": 0.0012, + "reward": 2.333333373069763, + "reward_std": 0.5401924960315228, + "rewards/correctness_reward_func": 0.9166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1060 + }, + { + "completion_length": 155.00000381469727, + "epoch": 0.5679111467951291, + "grad_norm": 1.3125, + "kl": 0.02696467051282525, + "learning_rate": 2.34358326373438e-06, + "loss": 0.0011, + "reward": 2.8125000596046448, + "reward_std": 0.40438438951969147, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1061 + }, + { + "completion_length": 154.04166793823242, + "epoch": 0.5684464070654356, + "grad_norm": 1.5390625, + "kl": 0.03205305617302656, + "learning_rate": 2.338920485476857e-06, + "loss": 0.0013, + "reward": 3.1666667461395264, + "reward_std": 0.5222771726548672, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1062 + }, + { + "completion_length": 151.37500190734863, + "epoch": 0.568981667335742, + "grad_norm": 1.34375, + "kl": 0.04629839211702347, + "learning_rate": 2.3342582698255204e-06, + "loss": 0.0019, + "reward": 3.1875000596046448, + "reward_std": 0.35412560403347015, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1063 + }, + { + "completion_length": 131.95833778381348, + "epoch": 0.5695169276060484, + "grad_norm": 1.640625, + "kl": 0.03145950939506292, + "learning_rate": 2.3295966330642018e-06, + "loss": 0.0013, + "reward": 2.3259166926145554, + "reward_std": 0.12022912129759789, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.3541666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.49258333444595337, + "step": 1064 + }, + { + "completion_length": 155.87500381469727, + "epoch": 0.5700521878763549, + "grad_norm": 1.765625, + "kl": 0.03759356401860714, + "learning_rate": 2.3249355914747078e-06, + "loss": 0.0015, + "reward": 2.8541667461395264, + "reward_std": 0.9787831455469131, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1065 + }, + { + "completion_length": 169.33333778381348, + "epoch": 0.5705874481466613, + "grad_norm": 2.265625, + "kl": 0.023252596147358418, + "learning_rate": 2.3202751613367676e-06, + "loss": 0.0009, + "reward": 2.145833432674408, + "reward_std": 0.9450224339962006, + "rewards/correctness_reward_func": 0.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1066 + }, + { + "completion_length": 149.00000762939453, + "epoch": 0.5711227084169678, + "grad_norm": 99.0, + "kl": 1.3318175182212144, + "learning_rate": 2.3156153589279745e-06, + "loss": 0.0533, + "reward": 3.4444167017936707, + "reward_std": 0.1361508071422577, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.48608332872390747, + "step": 1067 + }, + { + "completion_length": 137.58333778381348, + "epoch": 0.5716579686872741, + "grad_norm": 1.8828125, + "kl": 0.03667767532169819, + "learning_rate": 2.3109562005237284e-06, + "loss": 0.0015, + "reward": 3.2291666865348816, + "reward_std": 0.4864138811826706, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 1068 + }, + { + "completion_length": 128.25000381469727, + "epoch": 0.5721932289575806, + "grad_norm": 3.453125, + "kl": 0.0949320113286376, + "learning_rate": 2.306297702397183e-06, + "loss": 0.0038, + "reward": 3.3125000596046448, + "reward_std": 0.40438438951969147, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1069 + }, + { + "completion_length": 163.62500190734863, + "epoch": 0.5727284892278871, + "grad_norm": 296.0, + "kl": 4.624088962562382, + "learning_rate": 2.301639880819183e-06, + "loss": 0.185, + "reward": 2.812500089406967, + "reward_std": 0.9114490151405334, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4583333358168602, + "step": 1070 + }, + { + "completion_length": 137.37500190734863, + "epoch": 0.5732637494981935, + "grad_norm": 2.078125, + "kl": 0.04243481811136007, + "learning_rate": 2.296982752058208e-06, + "loss": 0.0017, + "reward": 3.3125000596046448, + "reward_std": 0.4592793248593807, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1071 + }, + { + "completion_length": 228.2916717529297, + "epoch": 0.5737990097685, + "grad_norm": 1.875, + "kl": 0.04375831922516227, + "learning_rate": 2.292326332380324e-06, + "loss": 0.0018, + "reward": 2.7968750596046448, + "reward_std": 0.918915580958128, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000149011612, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1072 + }, + { + "completion_length": 146.1250057220459, + "epoch": 0.5743342700388063, + "grad_norm": 0.62890625, + "kl": 0.04183990182355046, + "learning_rate": 2.2876706380491153e-06, + "loss": 0.0017, + "reward": 3.3541666865348816, + "reward_std": 0.18399503827095032, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 1073 + }, + { + "completion_length": 161.2916717529297, + "epoch": 0.5748695303091128, + "grad_norm": 2.03125, + "kl": 0.08961763884872198, + "learning_rate": 2.283015685325635e-06, + "loss": 0.0036, + "reward": 2.5000000596046448, + "reward_std": 0.3347994200885296, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1074 + }, + { + "completion_length": 137.58333587646484, + "epoch": 0.5754047905794193, + "grad_norm": 1.3203125, + "kl": 0.04768128413707018, + "learning_rate": 2.278361490468345e-06, + "loss": 0.0019, + "reward": 3.1666666865348816, + "reward_std": 0.3546550087630749, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1075 + }, + { + "completion_length": 150.29166984558105, + "epoch": 0.5759400508497257, + "grad_norm": 1.828125, + "kl": 0.057581949047744274, + "learning_rate": 2.2737080697330595e-06, + "loss": 0.0023, + "reward": 3.2916667461395264, + "reward_std": 0.3602609895169735, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1076 + }, + { + "completion_length": 121.45833587646484, + "epoch": 0.5764753111200321, + "grad_norm": 1.3984375, + "kl": 0.05234599346294999, + "learning_rate": 2.269055439372892e-06, + "loss": 0.0021, + "reward": 3.1666666865348816, + "reward_std": 0.4779854416847229, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1077 + }, + { + "completion_length": 190.4583396911621, + "epoch": 0.5770105713903385, + "grad_norm": 1.7890625, + "kl": 0.028957413276657462, + "learning_rate": 2.2644036156381923e-06, + "loss": 0.0012, + "reward": 2.5625000596046448, + "reward_std": 1.0113781988620758, + "rewards/correctness_reward_func": 1.1666667088866234, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1078 + }, + { + "completion_length": 142.75000381469727, + "epoch": 0.577545831660645, + "grad_norm": 2.140625, + "kl": 0.027713227085769176, + "learning_rate": 2.2597526147764935e-06, + "loss": 0.0011, + "reward": 3.083333432674408, + "reward_std": 0.5763519518077374, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1079 + }, + { + "completion_length": 164.33333778381348, + "epoch": 0.5780810919309515, + "grad_norm": 1.5078125, + "kl": 0.04261440085247159, + "learning_rate": 2.255102453032456e-06, + "loss": 0.0017, + "reward": 3.309333384037018, + "reward_std": 0.26535478234291077, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.49683333933353424, + "step": 1080 + }, + { + "completion_length": 135.12500190734863, + "epoch": 0.5786163522012578, + "grad_norm": 1.4375, + "kl": 0.025709964334964752, + "learning_rate": 2.250453146647807e-06, + "loss": 0.001, + "reward": 3.2291666865348816, + "reward_std": 0.49727512896060944, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1081 + }, + { + "completion_length": 152.2916717529297, + "epoch": 0.5791516124715643, + "grad_norm": 0.92578125, + "kl": 0.025608718395233154, + "learning_rate": 2.2458047118612894e-06, + "loss": 0.001, + "reward": 3.25, + "reward_std": 0.273861289024353, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1082 + }, + { + "completion_length": 161.4166717529297, + "epoch": 0.5796868727418707, + "grad_norm": 0.79296875, + "kl": 0.027878669556230307, + "learning_rate": 2.2411571649086e-06, + "loss": 0.0011, + "reward": 3.3125, + "reward_std": 0.246855229139328, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1083 + }, + { + "completion_length": 148.3750057220459, + "epoch": 0.5802221330121772, + "grad_norm": 1.71875, + "kl": 0.0236201249063015, + "learning_rate": 2.236510522022333e-06, + "loss": 0.0009, + "reward": 2.8750000596046448, + "reward_std": 0.743688777089119, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1084 + }, + { + "completion_length": 140.45833778381348, + "epoch": 0.5807573932824837, + "grad_norm": 1.3515625, + "kl": 0.05232280120253563, + "learning_rate": 2.231864799431928e-06, + "loss": 0.0021, + "reward": 3.2291666865348816, + "reward_std": 0.3248923271894455, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1085 + }, + { + "completion_length": 188.9166717529297, + "epoch": 0.58129265355279, + "grad_norm": 1.4453125, + "kl": 0.038821437396109104, + "learning_rate": 2.227220013363608e-06, + "loss": 0.0016, + "reward": 2.8958334028720856, + "reward_std": 0.5133540891110897, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1086 + }, + { + "completion_length": 166.25000381469727, + "epoch": 0.5818279138230965, + "grad_norm": 1.5, + "kl": 0.034582878928631544, + "learning_rate": 2.2225761800403278e-06, + "loss": 0.0014, + "reward": 2.958333373069763, + "reward_std": 0.6907386183738708, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.3958333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1087 + }, + { + "completion_length": 129.37500381469727, + "epoch": 0.5823631740934029, + "grad_norm": 1.6484375, + "kl": 0.04625696036964655, + "learning_rate": 2.2179333156817114e-06, + "loss": 0.0019, + "reward": 3.0416666865348816, + "reward_std": 0.506598636507988, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1088 + }, + { + "completion_length": 159.70833778381348, + "epoch": 0.5828984343637094, + "grad_norm": 1.109375, + "kl": 0.03190090577118099, + "learning_rate": 2.2132914365039993e-06, + "loss": 0.0013, + "reward": 3.2291666865348816, + "reward_std": 0.3248923271894455, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1089 + }, + { + "completion_length": 121.00000190734863, + "epoch": 0.5834336946340158, + "grad_norm": 1.6875, + "kl": 0.04292154032737017, + "learning_rate": 2.208650558719992e-06, + "loss": 0.0017, + "reward": 3.2500000596046448, + "reward_std": 0.46232306957244873, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1090 + }, + { + "completion_length": 115.08333587646484, + "epoch": 0.5839689549043222, + "grad_norm": 1.8046875, + "kl": 0.06790947215631604, + "learning_rate": 2.2040106985389925e-06, + "loss": 0.0027, + "reward": 3.1875000596046448, + "reward_std": 0.3323967605829239, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1091 + }, + { + "completion_length": 139.66666984558105, + "epoch": 0.5845042151746287, + "grad_norm": 1.0625, + "kl": 0.031267859041690826, + "learning_rate": 2.1993718721667466e-06, + "loss": 0.0013, + "reward": 3.458333373069763, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1092 + }, + { + "completion_length": 131.5833339691162, + "epoch": 0.5850394754449351, + "grad_norm": 0.6953125, + "kl": 0.02611220208927989, + "learning_rate": 2.194734095805395e-06, + "loss": 0.001, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1093 + }, + { + "completion_length": 159.3333396911621, + "epoch": 0.5855747357152415, + "grad_norm": 1.265625, + "kl": 0.02461553202010691, + "learning_rate": 2.1900973856534048e-06, + "loss": 0.001, + "reward": 2.458333373069763, + "reward_std": 0.10206207260489464, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1094 + }, + { + "completion_length": 131.3333396911621, + "epoch": 0.586109995985548, + "grad_norm": 7.03125, + "kl": 0.26300066569820046, + "learning_rate": 2.185461757905524e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 0.3602609857916832, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1095 + }, + { + "completion_length": 136.79166793823242, + "epoch": 0.5866452562558544, + "grad_norm": 1.921875, + "kl": 0.07489533023908734, + "learning_rate": 2.1808272287527176e-06, + "loss": 0.003, + "reward": 2.4375000596046448, + "reward_std": 0.6227572709321976, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1096 + }, + { + "completion_length": 133.0833396911621, + "epoch": 0.5871805165261609, + "grad_norm": 1.359375, + "kl": 0.04591457825154066, + "learning_rate": 2.1761938143821116e-06, + "loss": 0.0018, + "reward": 3.083333373069763, + "reward_std": 0.5515970289707184, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1097 + }, + { + "completion_length": 145.33333778381348, + "epoch": 0.5877157767964672, + "grad_norm": 1.421875, + "kl": 0.03769760578870773, + "learning_rate": 2.1715615309769446e-06, + "loss": 0.0015, + "reward": 3.083333373069763, + "reward_std": 0.37134991213679314, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1098 + }, + { + "completion_length": 141.08333778381348, + "epoch": 0.5882510370667737, + "grad_norm": 1.140625, + "kl": 0.02804331900551915, + "learning_rate": 2.1669303947164983e-06, + "loss": 0.0011, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1099 + }, + { + "completion_length": 164.58333587646484, + "epoch": 0.5887862973370801, + "grad_norm": 1.859375, + "kl": 0.040140153374522924, + "learning_rate": 2.162300421776052e-06, + "loss": 0.0016, + "reward": 2.6041667461395264, + "reward_std": 0.6310785673558712, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 1100 + }, + { + "completion_length": 164.00000381469727, + "epoch": 0.5893215576073866, + "grad_norm": 1.15625, + "kl": 0.04401758685708046, + "learning_rate": 2.1576716283268206e-06, + "loss": 0.0018, + "reward": 2.854166716337204, + "reward_std": 0.2648099809885025, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1101 + }, + { + "completion_length": 128.50000381469727, + "epoch": 0.5898568178776931, + "grad_norm": 1.6328125, + "kl": 0.034419551491737366, + "learning_rate": 2.1530440305358972e-06, + "loss": 0.0014, + "reward": 3.083333373069763, + "reward_std": 0.6821095794439316, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1102 + }, + { + "completion_length": 131.7500057220459, + "epoch": 0.5903920781479994, + "grad_norm": 0.99609375, + "kl": 0.03005096409469843, + "learning_rate": 2.1484176445662035e-06, + "loss": 0.0012, + "reward": 3.25, + "reward_std": 0.273861289024353, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1103 + }, + { + "completion_length": 166.37500381469727, + "epoch": 0.5909273384183059, + "grad_norm": 1.46875, + "kl": 0.06135753355920315, + "learning_rate": 2.1437924865764247e-06, + "loss": 0.0025, + "reward": 3.2500001192092896, + "reward_std": 0.4650702327489853, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1104 + }, + { + "completion_length": 108.41666984558105, + "epoch": 0.5914625986886123, + "grad_norm": 3.53125, + "kl": 0.05622281040996313, + "learning_rate": 2.139168572720958e-06, + "loss": 0.0022, + "reward": 2.8750000596046448, + "reward_std": 0.7841716818511486, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1105 + }, + { + "completion_length": 157.3333396911621, + "epoch": 0.5919978589589188, + "grad_norm": 1.75, + "kl": 0.04904384817928076, + "learning_rate": 2.1345459191498565e-06, + "loss": 0.002, + "reward": 3.1875000596046448, + "reward_std": 0.3572172783315182, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1106 + }, + { + "completion_length": 129.29166984558105, + "epoch": 0.5925331192292252, + "grad_norm": 1.03125, + "kl": 0.023244800977408886, + "learning_rate": 2.1299245420087685e-06, + "loss": 0.0009, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1107 + }, + { + "completion_length": 126.33333587646484, + "epoch": 0.5930683794995316, + "grad_norm": 1.5703125, + "kl": 0.029206049628555775, + "learning_rate": 2.1253044574388886e-06, + "loss": 0.0012, + "reward": 3.2916667461395264, + "reward_std": 0.3602609895169735, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1108 + }, + { + "completion_length": 139.79166984558105, + "epoch": 0.5936036397698381, + "grad_norm": 1.2890625, + "kl": 0.022994154831394553, + "learning_rate": 2.1206856815768925e-06, + "loss": 0.0009, + "reward": 3.3541666865348816, + "reward_std": 0.27258947491645813, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 1109 + }, + { + "completion_length": 173.4166717529297, + "epoch": 0.5941389000401445, + "grad_norm": 2.078125, + "kl": 0.07352372910827398, + "learning_rate": 2.1160682305548867e-06, + "loss": 0.0029, + "reward": 3.1041667461395264, + "reward_std": 0.6647366434335709, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000111758709, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1110 + }, + { + "completion_length": 128.04166984558105, + "epoch": 0.594674160310451, + "grad_norm": 2.078125, + "kl": 0.05020967125892639, + "learning_rate": 2.1114521205003512e-06, + "loss": 0.002, + "reward": 2.833333373069763, + "reward_std": 0.5222772061824799, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1111 + }, + { + "completion_length": 178.00000381469727, + "epoch": 0.5952094205807574, + "grad_norm": 2.015625, + "kl": 0.050168720073997974, + "learning_rate": 2.10683736753608e-06, + "loss": 0.002, + "reward": 2.7291667759418488, + "reward_std": 0.8597998470067978, + "rewards/correctness_reward_func": 1.333333395421505, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1112 + }, + { + "completion_length": 153.00000381469727, + "epoch": 0.5957446808510638, + "grad_norm": 1.7578125, + "kl": 0.025686150649562478, + "learning_rate": 2.1022239877801316e-06, + "loss": 0.001, + "reward": 3.3281251192092896, + "reward_std": 0.3285987824201584, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 1113 + }, + { + "completion_length": 122.33333778381348, + "epoch": 0.5962799411213703, + "grad_norm": 1.84375, + "kl": 0.0372623517177999, + "learning_rate": 2.0976119973457625e-06, + "loss": 0.0015, + "reward": 3.270833373069763, + "reward_std": 0.37377963587641716, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1114 + }, + { + "completion_length": 133.83333587646484, + "epoch": 0.5968152013916767, + "grad_norm": 0.6953125, + "kl": 0.029676989652216434, + "learning_rate": 2.09300141234138e-06, + "loss": 0.0012, + "reward": 3.375, + "reward_std": 0.19364917278289795, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1115 + }, + { + "completion_length": 139.7083396911621, + "epoch": 0.5973504616619831, + "grad_norm": 1.296875, + "kl": 0.02908479329198599, + "learning_rate": 2.0883922488704835e-06, + "loss": 0.0012, + "reward": 3.3750000596046448, + "reward_std": 0.25129128620028496, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1116 + }, + { + "completion_length": 148.04166984558105, + "epoch": 0.5978857219322896, + "grad_norm": 2.1875, + "kl": 0.04395298566669226, + "learning_rate": 2.083784523031605e-06, + "loss": 0.0018, + "reward": 2.8750000596046448, + "reward_std": 0.4449404589831829, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1117 + }, + { + "completion_length": 138.58333587646484, + "epoch": 0.598420982202596, + "grad_norm": 1.6796875, + "kl": 0.051251002587378025, + "learning_rate": 2.079178250918257e-06, + "loss": 0.0021, + "reward": 2.958333432674408, + "reward_std": 0.7584633976221085, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1118 + }, + { + "completion_length": 179.33333778381348, + "epoch": 0.5989562424729025, + "grad_norm": 1.765625, + "kl": 0.04658348159864545, + "learning_rate": 2.074573448618874e-06, + "loss": 0.0019, + "reward": 2.312500089406967, + "reward_std": 0.8353358805179596, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 1119 + }, + { + "completion_length": 142.04166984558105, + "epoch": 0.5994915027432088, + "grad_norm": 1.1328125, + "kl": 0.02434215135872364, + "learning_rate": 2.069970132216754e-06, + "loss": 0.001, + "reward": 2.9791666865348816, + "reward_std": 0.5050541460514069, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1120 + }, + { + "completion_length": 161.62500762939453, + "epoch": 0.6000267630135153, + "grad_norm": 1.65625, + "kl": 0.04097714927047491, + "learning_rate": 2.0653683177900114e-06, + "loss": 0.0016, + "reward": 2.770833373069763, + "reward_std": 0.8850989937782288, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.3958333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1121 + }, + { + "completion_length": 114.45833778381348, + "epoch": 0.6005620232838218, + "grad_norm": 2.640625, + "kl": 0.0499598728492856, + "learning_rate": 2.06076802141151e-06, + "loss": 0.002, + "reward": 2.6875000298023224, + "reward_std": 0.6272481828927994, + "rewards/correctness_reward_func": 1.3333333358168602, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4583333432674408, + "step": 1122 + }, + { + "completion_length": 178.12500381469727, + "epoch": 0.6010972835541282, + "grad_norm": 1.9921875, + "kl": 0.03528116596862674, + "learning_rate": 2.0561692591488113e-06, + "loss": 0.0014, + "reward": 2.6458334028720856, + "reward_std": 0.7926383912563324, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1123 + }, + { + "completion_length": 162.5833396911621, + "epoch": 0.6016325438244347, + "grad_norm": 1.1796875, + "kl": 0.025379335740581155, + "learning_rate": 2.0515720470641216e-06, + "loss": 0.001, + "reward": 3.3125000596046448, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1124 + }, + { + "completion_length": 147.6250057220459, + "epoch": 0.602167804094741, + "grad_norm": 1.96875, + "kl": 0.024831503629684448, + "learning_rate": 2.046976401214229e-06, + "loss": 0.001, + "reward": 2.708333373069763, + "reward_std": 0.8290883004665375, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1125 + }, + { + "completion_length": 176.50000381469727, + "epoch": 0.6027030643650475, + "grad_norm": 1.65625, + "kl": 0.02572201332077384, + "learning_rate": 2.042382337650455e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.8075917363166809, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1126 + }, + { + "completion_length": 158.50000762939453, + "epoch": 0.603238324635354, + "grad_norm": 2.203125, + "kl": 0.06118709687143564, + "learning_rate": 2.0377898724185926e-06, + "loss": 0.0024, + "reward": 2.895833432674408, + "reward_std": 0.7121906578540802, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1127 + }, + { + "completion_length": 198.8750114440918, + "epoch": 0.6037735849056604, + "grad_norm": 1.5546875, + "kl": 0.019993150606751442, + "learning_rate": 2.033199021558851e-06, + "loss": 0.0008, + "reward": 2.270833373069763, + "reward_std": 0.7049218565225601, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1128 + }, + { + "completion_length": 184.2916717529297, + "epoch": 0.6043088451759668, + "grad_norm": 1.6796875, + "kl": 0.03804916702210903, + "learning_rate": 2.028609801105805e-06, + "loss": 0.0015, + "reward": 2.1041667461395264, + "reward_std": 0.6306373104453087, + "rewards/correctness_reward_func": 0.7500000074505806, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 1129 + }, + { + "completion_length": 145.25000381469727, + "epoch": 0.6048441054462732, + "grad_norm": 1.6640625, + "kl": 0.08065536711364985, + "learning_rate": 2.024022227088329e-06, + "loss": 0.0032, + "reward": 3.145833373069763, + "reward_std": 0.6237796358764172, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1130 + }, + { + "completion_length": 131.75000381469727, + "epoch": 0.6053793657165797, + "grad_norm": 1.4453125, + "kl": 0.028958051931113005, + "learning_rate": 2.0194363155295525e-06, + "loss": 0.0012, + "reward": 2.9791666865348816, + "reward_std": 0.4242093414068222, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1131 + }, + { + "completion_length": 149.3750057220459, + "epoch": 0.6059146259868862, + "grad_norm": 1.171875, + "kl": 0.03974736947566271, + "learning_rate": 2.014852082446796e-06, + "loss": 0.0016, + "reward": 3.1250000596046448, + "reward_std": 0.2803870253264904, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1132 + }, + { + "completion_length": 177.7500057220459, + "epoch": 0.6064498862571925, + "grad_norm": 1.3515625, + "kl": 0.03239813074469566, + "learning_rate": 2.010269543851516e-06, + "loss": 0.0013, + "reward": 3.208333373069763, + "reward_std": 0.37592336162924767, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1133 + }, + { + "completion_length": 158.87500762939453, + "epoch": 0.606985146527499, + "grad_norm": 1.609375, + "kl": 0.037661376409232616, + "learning_rate": 2.005688715749254e-06, + "loss": 0.0015, + "reward": 2.7291667461395264, + "reward_std": 0.6971828788518906, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1134 + }, + { + "completion_length": 172.54167938232422, + "epoch": 0.6075204067978054, + "grad_norm": 1.96875, + "kl": 0.03389792237430811, + "learning_rate": 2.0011096141395742e-06, + "loss": 0.0014, + "reward": 2.8541667461395264, + "reward_std": 0.8515709191560745, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1135 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.6080556670681119, + "grad_norm": 1.4140625, + "kl": 0.03520802827551961, + "learning_rate": 1.996532255016011e-06, + "loss": 0.0014, + "reward": 3.125, + "reward_std": 0.493710458278656, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1136 + }, + { + "completion_length": 171.5416717529297, + "epoch": 0.6085909273384184, + "grad_norm": 2.015625, + "kl": 0.03871652204543352, + "learning_rate": 1.9919566543660163e-06, + "loss": 0.0015, + "reward": 2.833333432674408, + "reward_std": 0.6220272481441498, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1137 + }, + { + "completion_length": 128.9583339691162, + "epoch": 0.6091261876087247, + "grad_norm": 1.1640625, + "kl": 0.029786940664052963, + "learning_rate": 1.9873828281708954e-06, + "loss": 0.0012, + "reward": 2.9393333196640015, + "reward_std": 0.14860239997506142, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.48099999874830246, + "step": 1138 + }, + { + "completion_length": 169.66667366027832, + "epoch": 0.6096614478790312, + "grad_norm": 1.453125, + "kl": 0.024774388410151005, + "learning_rate": 1.9828107924057593e-06, + "loss": 0.001, + "reward": 2.3125000298023224, + "reward_std": 0.3655807636678219, + "rewards/correctness_reward_func": 0.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1139 + }, + { + "completion_length": 136.62500190734863, + "epoch": 0.6101967081493376, + "grad_norm": 2.40625, + "kl": 0.037171173840761185, + "learning_rate": 1.9782405630394635e-06, + "loss": 0.0015, + "reward": 2.9375000596046448, + "reward_std": 0.889277458190918, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1140 + }, + { + "completion_length": 138.8333396911621, + "epoch": 0.6107319684196441, + "grad_norm": 2.09375, + "kl": 0.03229498118162155, + "learning_rate": 1.9736721560345543e-06, + "loss": 0.0013, + "reward": 2.895833432674408, + "reward_std": 0.866707444190979, + "rewards/correctness_reward_func": 1.4166667312383652, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1141 + }, + { + "completion_length": 122.66666984558105, + "epoch": 0.6112672286899505, + "grad_norm": 1.234375, + "kl": 0.01973505667410791, + "learning_rate": 1.9691055873472153e-06, + "loss": 0.0008, + "reward": 3.395833373069763, + "reward_std": 0.25515520572662354, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1142 + }, + { + "completion_length": 122.70833969116211, + "epoch": 0.6118024889602569, + "grad_norm": 2.125, + "kl": 0.033139331731945276, + "learning_rate": 1.9645408729272068e-06, + "loss": 0.0013, + "reward": 2.583333373069763, + "reward_std": 0.5320602059364319, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1143 + }, + { + "completion_length": 141.5833339691162, + "epoch": 0.6123377492305634, + "grad_norm": 1.8046875, + "kl": 0.06788896024227142, + "learning_rate": 1.959978028717814e-06, + "loss": 0.0027, + "reward": 3.041666716337204, + "reward_std": 0.3872983306646347, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1144 + }, + { + "completion_length": 157.5416717529297, + "epoch": 0.6128730095008698, + "grad_norm": 1.2421875, + "kl": 0.02102912962436676, + "learning_rate": 1.9554170706557897e-06, + "loss": 0.0008, + "reward": 3.2916666865348816, + "reward_std": 0.4541241526603699, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1145 + }, + { + "completion_length": 153.16666984558105, + "epoch": 0.6134082697711762, + "grad_norm": 1.8125, + "kl": 0.05245727300643921, + "learning_rate": 1.9508580146712967e-06, + "loss": 0.0021, + "reward": 3.1875000596046448, + "reward_std": 0.5530414395034313, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 1146 + }, + { + "completion_length": 143.50000381469727, + "epoch": 0.6139435300414827, + "grad_norm": 1.96875, + "kl": 0.030988771468400955, + "learning_rate": 1.94630087668786e-06, + "loss": 0.0012, + "reward": 2.9791666865348816, + "reward_std": 0.7711364179849625, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1147 + }, + { + "completion_length": 186.62500381469727, + "epoch": 0.6144787903117891, + "grad_norm": 0.97265625, + "kl": 0.03605042304843664, + "learning_rate": 1.9417456726222995e-06, + "loss": 0.0014, + "reward": 2.9375000298023224, + "reward_std": 0.11558076366782188, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1148 + }, + { + "completion_length": 126.6250057220459, + "epoch": 0.6150140505820956, + "grad_norm": 0.953125, + "kl": 0.02856651460751891, + "learning_rate": 1.9371924183846835e-06, + "loss": 0.0011, + "reward": 2.7916666865348816, + "reward_std": 0.2813657224178314, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1149 + }, + { + "completion_length": 169.2083396911621, + "epoch": 0.6155493108524019, + "grad_norm": 1.3984375, + "kl": 0.01892117876559496, + "learning_rate": 1.9326411298782706e-06, + "loss": 0.0008, + "reward": 3.208333373069763, + "reward_std": 0.6018974483013153, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1150 + }, + { + "completion_length": 125.70833587646484, + "epoch": 0.6160845711227084, + "grad_norm": 1.1328125, + "kl": 0.03264283831231296, + "learning_rate": 1.92809182299945e-06, + "loss": 0.0013, + "reward": 3.395833373069763, + "reward_std": 0.25515517219901085, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1151 + }, + { + "completion_length": 174.83333587646484, + "epoch": 0.6166198313930149, + "grad_norm": 1.078125, + "kl": 0.03417334379628301, + "learning_rate": 1.9235445136376954e-06, + "loss": 0.0014, + "reward": 2.9791667461395264, + "reward_std": 0.4493577480316162, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1152 + }, + { + "completion_length": 138.83333778381348, + "epoch": 0.6171550916633213, + "grad_norm": 2.125, + "kl": 0.045399333350360394, + "learning_rate": 1.9189992176754997e-06, + "loss": 0.0018, + "reward": 3.083333432674408, + "reward_std": 0.7781640440225601, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1153 + }, + { + "completion_length": 160.00000381469727, + "epoch": 0.6176903519336278, + "grad_norm": 1.6875, + "kl": 0.02971332473680377, + "learning_rate": 1.914455950988322e-06, + "loss": 0.0012, + "reward": 2.291666716337204, + "reward_std": 0.5397901386022568, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.3333333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1154 + }, + { + "completion_length": 126.0000057220459, + "epoch": 0.6182256122039341, + "grad_norm": 1.0078125, + "kl": 0.032387261278927326, + "learning_rate": 1.909914729444539e-06, + "loss": 0.0013, + "reward": 3.083333343267441, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1155 + }, + { + "completion_length": 129.66666984558105, + "epoch": 0.6187608724742406, + "grad_norm": 2.28125, + "kl": 0.08170109568163753, + "learning_rate": 1.9053755689053794e-06, + "loss": 0.0033, + "reward": 3.0416667461395264, + "reward_std": 0.5094902068376541, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1156 + }, + { + "completion_length": 132.75000381469727, + "epoch": 0.6192961327445471, + "grad_norm": 1.5078125, + "kl": 0.036833798978477716, + "learning_rate": 1.9008384852248775e-06, + "loss": 0.0015, + "reward": 3.145833373069763, + "reward_std": 0.4592793583869934, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1157 + }, + { + "completion_length": 170.83333778381348, + "epoch": 0.6198313930148535, + "grad_norm": 1.59375, + "kl": 0.023054254008457065, + "learning_rate": 1.89630349424981e-06, + "loss": 0.0009, + "reward": 2.6250000596046448, + "reward_std": 0.7045579701662064, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.39583333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1158 + }, + { + "completion_length": 143.6250057220459, + "epoch": 0.6203666532851599, + "grad_norm": 1.5390625, + "kl": 0.02555786306038499, + "learning_rate": 1.8917706118196455e-06, + "loss": 0.001, + "reward": 2.7291666865348816, + "reward_std": 0.3248923234641552, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1159 + }, + { + "completion_length": 251.33334350585938, + "epoch": 0.6209019135554663, + "grad_norm": 1.6328125, + "kl": 0.03249187534675002, + "learning_rate": 1.8872398537664902e-06, + "loss": 0.0013, + "reward": 2.479166716337204, + "reward_std": 1.0055317729711533, + "rewards/correctness_reward_func": 1.1666667014360428, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3333333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1160 + }, + { + "completion_length": 147.75000381469727, + "epoch": 0.6214371738257728, + "grad_norm": 1.609375, + "kl": 0.023566798539832234, + "learning_rate": 1.8827112359150277e-06, + "loss": 0.0009, + "reward": 3.0885416865348816, + "reward_std": 0.498736172914505, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1161 + }, + { + "completion_length": 176.08333778381348, + "epoch": 0.6219724340960792, + "grad_norm": 1.359375, + "kl": 0.02330430643633008, + "learning_rate": 1.878184774082467e-06, + "loss": 0.0009, + "reward": 2.895833373069763, + "reward_std": 0.6062580496072769, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1162 + }, + { + "completion_length": 143.0833396911621, + "epoch": 0.6225076943663856, + "grad_norm": 1.078125, + "kl": 0.04478012444451451, + "learning_rate": 1.8736604840784884e-06, + "loss": 0.0018, + "reward": 3.395833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1163 + }, + { + "completion_length": 131.45833587646484, + "epoch": 0.6230429546366921, + "grad_norm": 1.0546875, + "kl": 0.03378752525895834, + "learning_rate": 1.8691383817051833e-06, + "loss": 0.0014, + "reward": 2.458333343267441, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1164 + }, + { + "completion_length": 167.25000381469727, + "epoch": 0.6235782149069985, + "grad_norm": 0.62890625, + "kl": 0.017930781934410334, + "learning_rate": 1.8646184827570074e-06, + "loss": 0.0007, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1165 + }, + { + "completion_length": 162.7500057220459, + "epoch": 0.624113475177305, + "grad_norm": 0.859375, + "kl": 0.02851248439401388, + "learning_rate": 1.8601008030207157e-06, + "loss": 0.0011, + "reward": 2.5625, + "reward_std": 0.22008520364761353, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1166 + }, + { + "completion_length": 180.1666717529297, + "epoch": 0.6246487354476113, + "grad_norm": 1.3203125, + "kl": 0.03482568962499499, + "learning_rate": 1.8555853582753136e-06, + "loss": 0.0014, + "reward": 2.895833373069763, + "reward_std": 0.5618228912353516, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.5, + "step": 1167 + }, + { + "completion_length": 119.83333778381348, + "epoch": 0.6251839957179178, + "grad_norm": 0.6640625, + "kl": 0.03804555209353566, + "learning_rate": 1.8510721642920015e-06, + "loss": 0.0015, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1168 + }, + { + "completion_length": 163.58334159851074, + "epoch": 0.6257192559882243, + "grad_norm": 1.4296875, + "kl": 0.02223725477233529, + "learning_rate": 1.8465612368341157e-06, + "loss": 0.0009, + "reward": 2.6666667461395264, + "reward_std": 0.5163978338241577, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1169 + }, + { + "completion_length": 158.29166793823242, + "epoch": 0.6262545162585307, + "grad_norm": 1.65625, + "kl": 0.0354935098439455, + "learning_rate": 1.8420525916570811e-06, + "loss": 0.0014, + "reward": 2.5625000298023224, + "reward_std": 0.7091782838106155, + "rewards/correctness_reward_func": 1.0833333507180214, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1170 + }, + { + "completion_length": 195.50000762939453, + "epoch": 0.6267897765288372, + "grad_norm": 1.0390625, + "kl": 0.027738153701648116, + "learning_rate": 1.8375462445083464e-06, + "loss": 0.0011, + "reward": 2.8125, + "reward_std": 0.2621144950389862, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1171 + }, + { + "completion_length": 216.83334350585938, + "epoch": 0.6273250367991435, + "grad_norm": 1.5546875, + "kl": 0.031087984796613455, + "learning_rate": 1.8330422111273349e-06, + "loss": 0.0012, + "reward": 2.3333334028720856, + "reward_std": 0.7801860384643078, + "rewards/correctness_reward_func": 1.0000000074505806, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1172 + }, + { + "completion_length": 181.62500381469727, + "epoch": 0.62786029706945, + "grad_norm": 1.6484375, + "kl": 0.02746220026165247, + "learning_rate": 1.828540507245391e-06, + "loss": 0.0011, + "reward": 2.5833334624767303, + "reward_std": 0.7781641036272049, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1173 + }, + { + "completion_length": 171.08333587646484, + "epoch": 0.6283955573397565, + "grad_norm": 1.3515625, + "kl": 0.04072369076311588, + "learning_rate": 1.8240411485857201e-06, + "loss": 0.0016, + "reward": 2.958333373069763, + "reward_std": 0.5736270248889923, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1174 + }, + { + "completion_length": 145.08333587646484, + "epoch": 0.6289308176100629, + "grad_norm": 1.25, + "kl": 0.044766807463020086, + "learning_rate": 1.8195441508633368e-06, + "loss": 0.0018, + "reward": 3.0625000298023224, + "reward_std": 0.37377963587641716, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1175 + }, + { + "completion_length": 165.70833587646484, + "epoch": 0.6294660778803693, + "grad_norm": 1.53125, + "kl": 0.029149475507438183, + "learning_rate": 1.8150495297850121e-06, + "loss": 0.0012, + "reward": 2.8750000596046448, + "reward_std": 0.7602093517780304, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1176 + }, + { + "completion_length": 134.58333778381348, + "epoch": 0.6300013381506757, + "grad_norm": 1.859375, + "kl": 0.035790836438536644, + "learning_rate": 1.8105573010492123e-06, + "loss": 0.0014, + "reward": 3.2916667461395264, + "reward_std": 0.5103103779256344, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1177 + }, + { + "completion_length": 179.50000381469727, + "epoch": 0.6305365984209822, + "grad_norm": 1.4453125, + "kl": 0.0371116129681468, + "learning_rate": 1.8060674803460514e-06, + "loss": 0.0015, + "reward": 2.5416666865348816, + "reward_std": 0.7251393496990204, + "rewards/correctness_reward_func": 1.0833333507180214, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1178 + }, + { + "completion_length": 130.33333778381348, + "epoch": 0.6310718586912887, + "grad_norm": 2.25, + "kl": 0.06264671497046947, + "learning_rate": 1.80158008335723e-06, + "loss": 0.0025, + "reward": 3.2916667461395264, + "reward_std": 0.4554154574871063, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666865348816, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1179 + }, + { + "completion_length": 151.91666984558105, + "epoch": 0.631607118961595, + "grad_norm": 1.5078125, + "kl": 0.027635585516691208, + "learning_rate": 1.797095125755984e-06, + "loss": 0.0011, + "reward": 3.0625000596046448, + "reward_std": 0.5653917640447617, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1180 + }, + { + "completion_length": 135.50000381469727, + "epoch": 0.6321423792319015, + "grad_norm": 0.050537109375, + "kl": 0.022512939991429448, + "learning_rate": 1.7926126232070315e-06, + "loss": 0.0009, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1181 + }, + { + "completion_length": 195.8333396911621, + "epoch": 0.6326776395022079, + "grad_norm": 1.6796875, + "kl": 0.0218606092967093, + "learning_rate": 1.7881325913665127e-06, + "loss": 0.0009, + "reward": 2.192708373069763, + "reward_std": 0.5448218882083893, + "rewards/correctness_reward_func": 0.8333333730697632, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1182 + }, + { + "completion_length": 159.8333396911621, + "epoch": 0.6332128997725144, + "grad_norm": 1.2109375, + "kl": 0.017797658685594797, + "learning_rate": 1.78365504588194e-06, + "loss": 0.0007, + "reward": 3.333333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1183 + }, + { + "completion_length": 175.8333396911621, + "epoch": 0.6337481600428209, + "grad_norm": 1.5078125, + "kl": 0.03092574281617999, + "learning_rate": 1.7791800023921412e-06, + "loss": 0.0012, + "reward": 2.520833432674408, + "reward_std": 0.7812077701091766, + "rewards/correctness_reward_func": 1.0833333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1184 + }, + { + "completion_length": 119.50000381469727, + "epoch": 0.6342834203131272, + "grad_norm": 1.1171875, + "kl": 0.035085609182715416, + "learning_rate": 1.7747074765272047e-06, + "loss": 0.0014, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1185 + }, + { + "completion_length": 132.5000057220459, + "epoch": 0.6348186805834337, + "grad_norm": 1.640625, + "kl": 0.03759643901139498, + "learning_rate": 1.7702374839084275e-06, + "loss": 0.0015, + "reward": 3.145833373069763, + "reward_std": 0.5513499081134796, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1186 + }, + { + "completion_length": 138.00000381469727, + "epoch": 0.6353539408537401, + "grad_norm": 1.625, + "kl": 0.04007141292095184, + "learning_rate": 1.7657700401482564e-06, + "loss": 0.0016, + "reward": 2.833333373069763, + "reward_std": 0.6394436359405518, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1187 + }, + { + "completion_length": 115.25000190734863, + "epoch": 0.6358892011240466, + "grad_norm": 17.375, + "kl": 0.3322620280086994, + "learning_rate": 1.7613051608502365e-06, + "loss": 0.0133, + "reward": 3.333333373069763, + "reward_std": 0.40824833512306213, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1188 + }, + { + "completion_length": 198.8333396911621, + "epoch": 0.636424461394353, + "grad_norm": 1.9453125, + "kl": 0.028016306459903717, + "learning_rate": 1.7568428616089572e-06, + "loss": 0.0011, + "reward": 1.8958333730697632, + "reward_std": 0.9316931664943695, + "rewards/correctness_reward_func": 0.5000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1189 + }, + { + "completion_length": 164.08333587646484, + "epoch": 0.6369597216646594, + "grad_norm": 1.859375, + "kl": 0.042940919287502766, + "learning_rate": 1.7523831580099938e-06, + "loss": 0.0017, + "reward": 3.0208334922790527, + "reward_std": 0.7812078148126602, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1190 + }, + { + "completion_length": 139.70833587646484, + "epoch": 0.6374949819349659, + "grad_norm": 1.3203125, + "kl": 0.03346863482147455, + "learning_rate": 1.747926065629859e-06, + "loss": 0.0013, + "reward": 3.3125000596046448, + "reward_std": 0.4592793434858322, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1191 + }, + { + "completion_length": 137.20833587646484, + "epoch": 0.6380302422052723, + "grad_norm": 1.5234375, + "kl": 0.0404424169100821, + "learning_rate": 1.743471600035943e-06, + "loss": 0.0016, + "reward": 3.2916666865348816, + "reward_std": 0.4132891744375229, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1192 + }, + { + "completion_length": 201.3333396911621, + "epoch": 0.6385655024755788, + "grad_norm": 1.6953125, + "kl": 0.02739003114402294, + "learning_rate": 1.7390197767864614e-06, + "loss": 0.0011, + "reward": 2.1106250286102295, + "reward_std": 0.6114244014024734, + "rewards/correctness_reward_func": 0.8333333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.4231249988079071, + "step": 1193 + }, + { + "completion_length": 186.20833587646484, + "epoch": 0.6391007627458852, + "grad_norm": 1.8515625, + "kl": 0.05962704448029399, + "learning_rate": 1.7345706114304023e-06, + "loss": 0.0024, + "reward": 2.9791667461395264, + "reward_std": 0.6494128406047821, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1194 + }, + { + "completion_length": 167.62500381469727, + "epoch": 0.6396360230161916, + "grad_norm": 1.3046875, + "kl": 0.024999674409627914, + "learning_rate": 1.7301241195074683e-06, + "loss": 0.001, + "reward": 2.8750000596046448, + "reward_std": 0.5809475183486938, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1195 + }, + { + "completion_length": 153.2500057220459, + "epoch": 0.6401712832864981, + "grad_norm": 1.21875, + "kl": 0.029375402722507715, + "learning_rate": 1.725680316548028e-06, + "loss": 0.0012, + "reward": 2.9166666865348816, + "reward_std": 0.5320602059364319, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1196 + }, + { + "completion_length": 143.95833778381348, + "epoch": 0.6407065435568045, + "grad_norm": 1.828125, + "kl": 0.03262898838147521, + "learning_rate": 1.721239218073054e-06, + "loss": 0.0013, + "reward": 2.541666716337204, + "reward_std": 0.7466593086719513, + "rewards/correctness_reward_func": 1.0833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1197 + }, + { + "completion_length": 108.62500190734863, + "epoch": 0.6412418038271109, + "grad_norm": 1.078125, + "kl": 0.021220164373517036, + "learning_rate": 1.7168008395940738e-06, + "loss": 0.0008, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1198 + }, + { + "completion_length": 174.16667556762695, + "epoch": 0.6417770640974174, + "grad_norm": 1.546875, + "kl": 0.03896147897467017, + "learning_rate": 1.712365196613119e-06, + "loss": 0.0016, + "reward": 2.8541667461395264, + "reward_std": 0.6673594415187836, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1199 + }, + { + "completion_length": 170.5, + "epoch": 0.6423123243677238, + "grad_norm": 1.6796875, + "kl": 0.037333715707063675, + "learning_rate": 1.7079323046226612e-06, + "loss": 0.0015, + "reward": 2.9375000596046448, + "reward_std": 0.4259376786649227, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1200 + }, + { + "completion_length": 143.91666984558105, + "epoch": 0.6428475846380303, + "grad_norm": 1.6640625, + "kl": 0.04931775387376547, + "learning_rate": 1.7035021791055662e-06, + "loss": 0.002, + "reward": 2.9583334028720856, + "reward_std": 0.45541542768478394, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1201 + }, + { + "completion_length": 165.5833396911621, + "epoch": 0.6433828449083366, + "grad_norm": 1.3046875, + "kl": 0.03186797956004739, + "learning_rate": 1.6990748355350375e-06, + "loss": 0.0013, + "reward": 3.0416667461395264, + "reward_std": 0.503996953368187, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1202 + }, + { + "completion_length": 154.66666793823242, + "epoch": 0.6439181051786431, + "grad_norm": 1.6640625, + "kl": 0.03737725364044309, + "learning_rate": 1.6946502893745603e-06, + "loss": 0.0015, + "reward": 3.0312500596046448, + "reward_std": 0.5864861123263836, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1203 + }, + { + "completion_length": 174.5833339691162, + "epoch": 0.6444533654489496, + "grad_norm": 1.3671875, + "kl": 0.04018306778743863, + "learning_rate": 1.6902285560778529e-06, + "loss": 0.0016, + "reward": 3.2291666865348816, + "reward_std": 0.49727514386177063, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1204 + }, + { + "completion_length": 190.37500381469727, + "epoch": 0.644988625719256, + "grad_norm": 1.875, + "kl": 0.049945867620408535, + "learning_rate": 1.6858096510888048e-06, + "loss": 0.002, + "reward": 2.1250000447034836, + "reward_std": 0.8531132750213146, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1205 + }, + { + "completion_length": 123.08333587646484, + "epoch": 0.6455238859895625, + "grad_norm": 1.109375, + "kl": 0.030124272685498, + "learning_rate": 1.6813935898414286e-06, + "loss": 0.0012, + "reward": 2.958333343267441, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1206 + }, + { + "completion_length": 134.2916717529297, + "epoch": 0.6460591462598688, + "grad_norm": 1.28125, + "kl": 0.028698857873678207, + "learning_rate": 1.676980387759806e-06, + "loss": 0.0011, + "reward": 3.333333373069763, + "reward_std": 0.40824830532073975, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1207 + }, + { + "completion_length": 182.00000381469727, + "epoch": 0.6465944065301753, + "grad_norm": 1.171875, + "kl": 0.04143868666142225, + "learning_rate": 1.6725700602580292e-06, + "loss": 0.0017, + "reward": 3.1041667461395264, + "reward_std": 0.37377968057990074, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1208 + }, + { + "completion_length": 154.37500381469727, + "epoch": 0.6471296668004818, + "grad_norm": 0.94921875, + "kl": 0.03367770742624998, + "learning_rate": 1.6681626227401542e-06, + "loss": 0.0013, + "reward": 2.9166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1209 + }, + { + "completion_length": 169.62500381469727, + "epoch": 0.6476649270707882, + "grad_norm": 2.171875, + "kl": 0.028921236284077168, + "learning_rate": 1.6637580906001405e-06, + "loss": 0.0012, + "reward": 2.7135417461395264, + "reward_std": 0.4438832513988018, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1210 + }, + { + "completion_length": 128.3750057220459, + "epoch": 0.6482001873410946, + "grad_norm": 1.46875, + "kl": 0.029809471685439348, + "learning_rate": 1.6593564792217995e-06, + "loss": 0.0012, + "reward": 3.3125000596046448, + "reward_std": 0.309229951351881, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1211 + }, + { + "completion_length": 142.0000057220459, + "epoch": 0.648735447611401, + "grad_norm": 1.9765625, + "kl": 0.06288173329085112, + "learning_rate": 1.6549578039787436e-06, + "loss": 0.0025, + "reward": 3.270833373069763, + "reward_std": 0.33421211317181587, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1212 + }, + { + "completion_length": 235.4166717529297, + "epoch": 0.6492707078817075, + "grad_norm": 1.8203125, + "kl": 0.04126214608550072, + "learning_rate": 1.650562080234327e-06, + "loss": 0.0017, + "reward": 2.119791716337204, + "reward_std": 0.7860080450773239, + "rewards/correctness_reward_func": 0.7500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000149011612, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 1213 + }, + { + "completion_length": 149.6666717529297, + "epoch": 0.649805968152014, + "grad_norm": 2.203125, + "kl": 0.03802600037306547, + "learning_rate": 1.646169323341599e-06, + "loss": 0.0015, + "reward": 2.8333334028720856, + "reward_std": 0.5740398876369, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.4583333358168602, + "step": 1214 + }, + { + "completion_length": 123.33333587646484, + "epoch": 0.6503412284223203, + "grad_norm": 1.09375, + "kl": 0.03457137290388346, + "learning_rate": 1.641779548643243e-06, + "loss": 0.0014, + "reward": 3.395833373069763, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1215 + }, + { + "completion_length": 149.7916717529297, + "epoch": 0.6508764886926268, + "grad_norm": 1.796875, + "kl": 0.05280859861522913, + "learning_rate": 1.6373927714715277e-06, + "loss": 0.0021, + "reward": 3.2916666865348816, + "reward_std": 0.4256826154887676, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1216 + }, + { + "completion_length": 111.58333396911621, + "epoch": 0.6514117489629332, + "grad_norm": 1.3515625, + "kl": 0.026734239421784878, + "learning_rate": 1.633009007148253e-06, + "loss": 0.0011, + "reward": 3.083333373069763, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1217 + }, + { + "completion_length": 143.7916717529297, + "epoch": 0.6519470092332397, + "grad_norm": 1.828125, + "kl": 0.03858533035963774, + "learning_rate": 1.6286282709846947e-06, + "loss": 0.0015, + "reward": 3.2916666865348816, + "reward_std": 0.39777331054210663, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1218 + }, + { + "completion_length": 157.75000381469727, + "epoch": 0.6524822695035462, + "grad_norm": 2.0, + "kl": 0.031744038220494986, + "learning_rate": 1.624250578281551e-06, + "loss": 0.0013, + "reward": 2.7708334028720856, + "reward_std": 0.8570526540279388, + "rewards/correctness_reward_func": 1.333333395421505, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1219 + }, + { + "completion_length": 122.45833396911621, + "epoch": 0.6530175297738525, + "grad_norm": 1.8828125, + "kl": 0.047506920993328094, + "learning_rate": 1.6198759443288941e-06, + "loss": 0.0019, + "reward": 3.1875000596046448, + "reward_std": 0.577903788536787, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1220 + }, + { + "completion_length": 149.37500190734863, + "epoch": 0.653552790044159, + "grad_norm": 1.34375, + "kl": 0.0284970928914845, + "learning_rate": 1.6155043844061092e-06, + "loss": 0.0011, + "reward": 3.2916666865348816, + "reward_std": 0.39777331054210663, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1221 + }, + { + "completion_length": 156.12500190734863, + "epoch": 0.6540880503144654, + "grad_norm": 1.6015625, + "kl": 0.044686511624604464, + "learning_rate": 1.6111359137818458e-06, + "loss": 0.0018, + "reward": 2.4791666865348816, + "reward_std": 0.6236923336982727, + "rewards/correctness_reward_func": 1.0833333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1222 + }, + { + "completion_length": 123.66667175292969, + "epoch": 0.6546233105847719, + "grad_norm": 1.6171875, + "kl": 0.01990818837657571, + "learning_rate": 1.6067705477139637e-06, + "loss": 0.0008, + "reward": 2.7500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1223 + }, + { + "completion_length": 203.3750057220459, + "epoch": 0.6551585708550783, + "grad_norm": 1.3046875, + "kl": 0.02167564001865685, + "learning_rate": 1.6024083014494777e-06, + "loss": 0.0009, + "reward": 2.739583343267441, + "reward_std": 0.41847972571849823, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.46875, + "step": 1224 + }, + { + "completion_length": 201.0833396911621, + "epoch": 0.6556938311253847, + "grad_norm": 1.8359375, + "kl": 0.06671202601864934, + "learning_rate": 1.5980491902245094e-06, + "loss": 0.0027, + "reward": 2.729166716337204, + "reward_std": 0.7667488753795624, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1225 + }, + { + "completion_length": 188.45833587646484, + "epoch": 0.6562290913956912, + "grad_norm": 1.1953125, + "kl": 0.038865368347615004, + "learning_rate": 1.593693229264227e-06, + "loss": 0.0016, + "reward": 2.791666716337204, + "reward_std": 0.45111703872680664, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 0.5, + "step": 1226 + }, + { + "completion_length": 141.79166793823242, + "epoch": 0.6567643516659976, + "grad_norm": 3.140625, + "kl": 0.08252203557640314, + "learning_rate": 1.5893404337827986e-06, + "loss": 0.0033, + "reward": 2.6041667461395264, + "reward_std": 0.9125833064317703, + "rewards/correctness_reward_func": 1.166666716337204, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1227 + }, + { + "completion_length": 169.87500381469727, + "epoch": 0.657299611936304, + "grad_norm": 1.203125, + "kl": 0.01972397230565548, + "learning_rate": 1.5849908189833341e-06, + "loss": 0.0008, + "reward": 3.0625, + "reward_std": 0.5670122802257538, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1228 + }, + { + "completion_length": 139.87500381469727, + "epoch": 0.6578348722066104, + "grad_norm": 0.96875, + "kl": 0.027716852258890867, + "learning_rate": 1.580644400057833e-06, + "loss": 0.0011, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1229 + }, + { + "completion_length": 183.6250057220459, + "epoch": 0.6583701324769169, + "grad_norm": 1.1015625, + "kl": 0.030865561682730913, + "learning_rate": 1.5763011921871377e-06, + "loss": 0.0012, + "reward": 3.2916667461395264, + "reward_std": 0.4727980047464371, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1230 + }, + { + "completion_length": 150.5833396911621, + "epoch": 0.6589053927472234, + "grad_norm": 1.5, + "kl": 0.0449206349439919, + "learning_rate": 1.57196121054087e-06, + "loss": 0.0018, + "reward": 2.583333373069763, + "reward_std": 0.8052270114421844, + "rewards/correctness_reward_func": 1.1666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1231 + }, + { + "completion_length": 160.7500057220459, + "epoch": 0.6594406530175297, + "grad_norm": 1.109375, + "kl": 0.023968304740265012, + "learning_rate": 1.5676244702773852e-06, + "loss": 0.001, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1232 + }, + { + "completion_length": 174.12500381469727, + "epoch": 0.6599759132878362, + "grad_norm": 1.6796875, + "kl": 0.036710976622998714, + "learning_rate": 1.563290986543718e-06, + "loss": 0.0015, + "reward": 2.7135417461395264, + "reward_std": 0.5892635434865952, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1233 + }, + { + "completion_length": 168.25000381469727, + "epoch": 0.6605111735581426, + "grad_norm": 1.703125, + "kl": 0.05276072025299072, + "learning_rate": 1.5589607744755269e-06, + "loss": 0.0021, + "reward": 3.145833373069763, + "reward_std": 0.515580803155899, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1234 + }, + { + "completion_length": 117.29167366027832, + "epoch": 0.6610464338284491, + "grad_norm": 0.1123046875, + "kl": 0.04392884857952595, + "learning_rate": 1.5546338491970476e-06, + "loss": 0.0018, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1235 + }, + { + "completion_length": 132.16666984558105, + "epoch": 0.6615816940987556, + "grad_norm": 0.8515625, + "kl": 0.033093469217419624, + "learning_rate": 1.5503102258210324e-06, + "loss": 0.0013, + "reward": 2.958333343267441, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1236 + }, + { + "completion_length": 166.25000381469727, + "epoch": 0.6621169543690619, + "grad_norm": 1.0703125, + "kl": 0.042634851299226284, + "learning_rate": 1.5459899194486988e-06, + "loss": 0.0017, + "reward": 2.6875, + "reward_std": 0.246855229139328, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1237 + }, + { + "completion_length": 176.2916717529297, + "epoch": 0.6626522146393684, + "grad_norm": 1.78125, + "kl": 0.048626034520566463, + "learning_rate": 1.5416729451696857e-06, + "loss": 0.0019, + "reward": 3.020833373069763, + "reward_std": 0.7135948985815048, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1238 + }, + { + "completion_length": 153.16666793823242, + "epoch": 0.6631874749096748, + "grad_norm": 1.34375, + "kl": 0.05162359494715929, + "learning_rate": 1.5373593180619875e-06, + "loss": 0.0021, + "reward": 3.1510417461395264, + "reward_std": 0.42381148040294647, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1239 + }, + { + "completion_length": 149.58333587646484, + "epoch": 0.6637227351799813, + "grad_norm": 1.8046875, + "kl": 0.04151236591860652, + "learning_rate": 1.5330490531919132e-06, + "loss": 0.0017, + "reward": 3.3125000596046448, + "reward_std": 0.4592793248593807, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1240 + }, + { + "completion_length": 150.50000762939453, + "epoch": 0.6642579954502877, + "grad_norm": 2.5625, + "kl": 0.055647075176239014, + "learning_rate": 1.5287421656140233e-06, + "loss": 0.0022, + "reward": 2.770833432674408, + "reward_std": 1.0548006296157837, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1241 + }, + { + "completion_length": 174.7083396911621, + "epoch": 0.6647932557205941, + "grad_norm": 1.7578125, + "kl": 0.03474069572985172, + "learning_rate": 1.524438670371085e-06, + "loss": 0.0014, + "reward": 3.2187500596046448, + "reward_std": 0.4192725531756878, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1242 + }, + { + "completion_length": 169.9166717529297, + "epoch": 0.6653285159909006, + "grad_norm": 1.515625, + "kl": 0.04177691554650664, + "learning_rate": 1.5201385824940178e-06, + "loss": 0.0017, + "reward": 3.2291667461395264, + "reward_std": 0.4352863281965256, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1243 + }, + { + "completion_length": 165.12500762939453, + "epoch": 0.665863776261207, + "grad_norm": 1.9140625, + "kl": 0.03382923407480121, + "learning_rate": 1.515841917001839e-06, + "loss": 0.0014, + "reward": 2.6250001192092896, + "reward_std": 0.8982227295637131, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1244 + }, + { + "completion_length": 123.62500381469727, + "epoch": 0.6663990365315134, + "grad_norm": 1.984375, + "kl": 0.04350885096937418, + "learning_rate": 1.511548688901612e-06, + "loss": 0.0017, + "reward": 2.6875000596046448, + "reward_std": 0.7840872555971146, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1245 + }, + { + "completion_length": 153.6250057220459, + "epoch": 0.6669342968018199, + "grad_norm": 1.2265625, + "kl": 0.020385520765557885, + "learning_rate": 1.5072589131883959e-06, + "loss": 0.0008, + "reward": 2.895833373069763, + "reward_std": 0.5524365305900574, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1246 + }, + { + "completion_length": 150.91666984558105, + "epoch": 0.6674695570721263, + "grad_norm": 3.234375, + "kl": 0.09925921354442835, + "learning_rate": 1.502972604845189e-06, + "loss": 0.004, + "reward": 2.833333373069763, + "reward_std": 0.6605667285621166, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1247 + }, + { + "completion_length": 201.70833587646484, + "epoch": 0.6680048173424328, + "grad_norm": 1.4765625, + "kl": 0.028366721235215664, + "learning_rate": 1.4986897788428828e-06, + "loss": 0.0011, + "reward": 2.6875000596046448, + "reward_std": 0.775413990020752, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1248 + }, + { + "completion_length": 121.08333587646484, + "epoch": 0.6685400776127391, + "grad_norm": 2.140625, + "kl": 0.0443681632168591, + "learning_rate": 1.4944104501402028e-06, + "loss": 0.0018, + "reward": 2.3958334028720856, + "reward_std": 0.6104944199323654, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1249 + }, + { + "completion_length": 137.29166793823242, + "epoch": 0.6690753378830456, + "grad_norm": 60.75, + "kl": 0.9240593300200999, + "learning_rate": 1.4901346336836603e-06, + "loss": 0.037, + "reward": 3.270833373069763, + "reward_std": 0.561341404914856, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1250 + }, + { + "completion_length": 159.79167556762695, + "epoch": 0.6696105981533521, + "grad_norm": 1.4453125, + "kl": 0.02926387684419751, + "learning_rate": 1.4858623444075e-06, + "loss": 0.0012, + "reward": 2.5833334028720856, + "reward_std": 0.5914224684238434, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1251 + }, + { + "completion_length": 168.54166984558105, + "epoch": 0.6701458584236585, + "grad_norm": 1.6171875, + "kl": 0.021817692555487156, + "learning_rate": 1.4815935972336448e-06, + "loss": 0.0009, + "reward": 2.458333373069763, + "reward_std": 0.4752403795719147, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1252 + }, + { + "completion_length": 135.25000762939453, + "epoch": 0.670681118693965, + "grad_norm": 2.390625, + "kl": 0.026987558696419, + "learning_rate": 1.4773284070716504e-06, + "loss": 0.0011, + "reward": 2.8125000596046448, + "reward_std": 0.7872153930366039, + "rewards/correctness_reward_func": 1.333333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.02083333395421505, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1253 + }, + { + "completion_length": 106.79166793823242, + "epoch": 0.6712163789642713, + "grad_norm": 2.046875, + "kl": 0.0452488474547863, + "learning_rate": 1.473066788818645e-06, + "loss": 0.0018, + "reward": 3.1979166865348816, + "reward_std": 0.4014388881623745, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1254 + }, + { + "completion_length": 176.70833587646484, + "epoch": 0.6717516392345778, + "grad_norm": 1.015625, + "kl": 0.03923133295029402, + "learning_rate": 1.4688087573592819e-06, + "loss": 0.0016, + "reward": 3.0260416865348816, + "reward_std": 0.334529023617506, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1255 + }, + { + "completion_length": 145.7916717529297, + "epoch": 0.6722868995048843, + "grad_norm": 0.875, + "kl": 0.03602353483438492, + "learning_rate": 1.4645543275656881e-06, + "loss": 0.0014, + "reward": 3.2916666865348816, + "reward_std": 0.23273734748363495, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1256 + }, + { + "completion_length": 235.12500381469727, + "epoch": 0.6728221597751907, + "grad_norm": 1.390625, + "kl": 0.037159725558012724, + "learning_rate": 1.4603035142974094e-06, + "loss": 0.0015, + "reward": 2.880208373069763, + "reward_std": 0.9242165684700012, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1257 + }, + { + "completion_length": 128.8333339691162, + "epoch": 0.6733574200454971, + "grad_norm": 1.953125, + "kl": 0.03066550148651004, + "learning_rate": 1.4560563324013605e-06, + "loss": 0.0012, + "reward": 3.145833373069763, + "reward_std": 0.5050541460514069, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1258 + }, + { + "completion_length": 182.0416717529297, + "epoch": 0.6738926803158035, + "grad_norm": 1.203125, + "kl": 0.02439832128584385, + "learning_rate": 1.4518127967117737e-06, + "loss": 0.001, + "reward": 2.770833373069763, + "reward_std": 0.6864498257637024, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.3958333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1259 + }, + { + "completion_length": 138.5416717529297, + "epoch": 0.67442794058611, + "grad_norm": 1.8515625, + "kl": 0.03161265095695853, + "learning_rate": 1.4475729220501439e-06, + "loss": 0.0013, + "reward": 2.848958432674408, + "reward_std": 0.7870994806289673, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 1260 + }, + { + "completion_length": 138.70833587646484, + "epoch": 0.6749632008564165, + "grad_norm": 1.328125, + "kl": 0.03222563769668341, + "learning_rate": 1.4433367232251824e-06, + "loss": 0.0013, + "reward": 3.125, + "reward_std": 0.523861289024353, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1261 + }, + { + "completion_length": 180.16666793823242, + "epoch": 0.6754984611267228, + "grad_norm": 1.7265625, + "kl": 0.048063420690596104, + "learning_rate": 1.439104215032759e-06, + "loss": 0.0019, + "reward": 2.791666716337204, + "reward_std": 0.3602609746158123, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1262 + }, + { + "completion_length": 164.12500381469727, + "epoch": 0.6760337213970293, + "grad_norm": 1.390625, + "kl": 0.02397937048226595, + "learning_rate": 1.4348754122558533e-06, + "loss": 0.001, + "reward": 1.9375000298023224, + "reward_std": 0.11558076366782188, + "rewards/correctness_reward_func": 0.5, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1263 + }, + { + "completion_length": 130.12500190734863, + "epoch": 0.6765689816673357, + "grad_norm": 1.3203125, + "kl": 0.036618311423808336, + "learning_rate": 1.4306503296645052e-06, + "loss": 0.0015, + "reward": 3.1041666865348816, + "reward_std": 0.5464507639408112, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1264 + }, + { + "completion_length": 158.08333587646484, + "epoch": 0.6771042419376422, + "grad_norm": 2.21875, + "kl": 0.04947941284626722, + "learning_rate": 1.4264289820157579e-06, + "loss": 0.002, + "reward": 3.020833432674408, + "reward_std": 0.7563454322516918, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 1265 + }, + { + "completion_length": 151.7916717529297, + "epoch": 0.6776395022079487, + "grad_norm": 1.3984375, + "kl": 0.026323188096284866, + "learning_rate": 1.4222113840536124e-06, + "loss": 0.0011, + "reward": 3.1666667461395264, + "reward_std": 0.5163978338241577, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1266 + }, + { + "completion_length": 147.58333778381348, + "epoch": 0.678174762478255, + "grad_norm": 0.80078125, + "kl": 0.022768684197217226, + "learning_rate": 1.4179975505089715e-06, + "loss": 0.0009, + "reward": 3.375, + "reward_std": 0.19364917278289795, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1267 + }, + { + "completion_length": 131.75000190734863, + "epoch": 0.6787100227485615, + "grad_norm": 1.8046875, + "kl": 0.03656624024733901, + "learning_rate": 1.4137874960995898e-06, + "loss": 0.0015, + "reward": 3.020833373069763, + "reward_std": 0.5051551908254623, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1268 + }, + { + "completion_length": 153.70833778381348, + "epoch": 0.6792452830188679, + "grad_norm": 2.265625, + "kl": 0.025506778620183468, + "learning_rate": 1.4095812355300229e-06, + "loss": 0.001, + "reward": 2.9791667461395264, + "reward_std": 0.7174782454967499, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1269 + }, + { + "completion_length": 123.83333778381348, + "epoch": 0.6797805432891744, + "grad_norm": 1.9765625, + "kl": 0.03775408584624529, + "learning_rate": 1.4053787834915753e-06, + "loss": 0.0015, + "reward": 2.7291667461395264, + "reward_std": 0.5133540891110897, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1270 + }, + { + "completion_length": 166.8333396911621, + "epoch": 0.6803158035594808, + "grad_norm": 1.4921875, + "kl": 0.03264944674447179, + "learning_rate": 1.4011801546622483e-06, + "loss": 0.0013, + "reward": 2.7916667461395264, + "reward_std": 0.5385859459638596, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1271 + }, + { + "completion_length": 158.5833396911621, + "epoch": 0.6808510638297872, + "grad_norm": 0.671875, + "kl": 0.02247608732432127, + "learning_rate": 1.3969853637066939e-06, + "loss": 0.0009, + "reward": 3.0625, + "reward_std": 0.22008520364761353, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1272 + }, + { + "completion_length": 161.29166793823242, + "epoch": 0.6813863241000937, + "grad_norm": 1.5390625, + "kl": 0.031257415656000376, + "learning_rate": 1.3927944252761535e-06, + "loss": 0.0013, + "reward": 3.083333373069763, + "reward_std": 0.64549720287323, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1273 + }, + { + "completion_length": 126.25000381469727, + "epoch": 0.6819215843704001, + "grad_norm": 1.78125, + "kl": 0.02825942961499095, + "learning_rate": 1.3886073540084184e-06, + "loss": 0.0011, + "reward": 2.7500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.2500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1274 + }, + { + "completion_length": 156.54166793823242, + "epoch": 0.6824568446407066, + "grad_norm": 1.9296875, + "kl": 0.07228852156549692, + "learning_rate": 1.3844241645277693e-06, + "loss": 0.0029, + "reward": 3.020833373069763, + "reward_std": 0.6077007204294205, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.5, + "step": 1275 + }, + { + "completion_length": 170.8333339691162, + "epoch": 0.682992104911013, + "grad_norm": 1.90625, + "kl": 0.049814184196293354, + "learning_rate": 1.3802448714449284e-06, + "loss": 0.002, + "reward": 2.8645834028720856, + "reward_std": 0.5899006687104702, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1276 + }, + { + "completion_length": 140.91666984558105, + "epoch": 0.6835273651813194, + "grad_norm": 2.03125, + "kl": 0.04045526869595051, + "learning_rate": 1.3760694893570132e-06, + "loss": 0.0016, + "reward": 2.916666805744171, + "reward_std": 0.8322388045489788, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1277 + }, + { + "completion_length": 154.1666717529297, + "epoch": 0.6840626254516259, + "grad_norm": 1.0859375, + "kl": 0.030904434388503432, + "learning_rate": 1.3718980328474768e-06, + "loss": 0.0012, + "reward": 2.9375000596046448, + "reward_std": 0.43299759924411774, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1278 + }, + { + "completion_length": 186.0416717529297, + "epoch": 0.6845978857219323, + "grad_norm": 1.671875, + "kl": 0.034098445903509855, + "learning_rate": 1.3677305164860633e-06, + "loss": 0.0014, + "reward": 2.5104167759418488, + "reward_std": 0.8771627657115459, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.3750000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1279 + }, + { + "completion_length": 172.50000381469727, + "epoch": 0.6851331459922387, + "grad_norm": 1.4609375, + "kl": 0.03586054500192404, + "learning_rate": 1.363566954828754e-06, + "loss": 0.0014, + "reward": 3.083333432674408, + "reward_std": 0.8156764209270477, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1280 + }, + { + "completion_length": 175.8333396911621, + "epoch": 0.6856684062625452, + "grad_norm": 2.640625, + "kl": 0.025415783748030663, + "learning_rate": 1.3594073624177176e-06, + "loss": 0.001, + "reward": 2.4791667461395264, + "reward_std": 0.6265050880610943, + "rewards/correctness_reward_func": 1.083333358168602, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1281 + }, + { + "completion_length": 158.2083396911621, + "epoch": 0.6862036665328516, + "grad_norm": 1.171875, + "kl": 0.035512601025402546, + "learning_rate": 1.3552517537812614e-06, + "loss": 0.0014, + "reward": 2.9166666865348816, + "reward_std": 0.11949636042118073, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1282 + }, + { + "completion_length": 148.66667366027832, + "epoch": 0.6867389268031581, + "grad_norm": 1.4140625, + "kl": 0.02693007607012987, + "learning_rate": 1.3511001434337762e-06, + "loss": 0.0011, + "reward": 2.708333343267441, + "reward_std": 0.4999281316995621, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1283 + }, + { + "completion_length": 156.41666984558105, + "epoch": 0.6872741870734644, + "grad_norm": 1.5078125, + "kl": 0.024214577628299594, + "learning_rate": 1.3469525458756873e-06, + "loss": 0.001, + "reward": 3.1822917461395264, + "reward_std": 0.5373664647340775, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 1284 + }, + { + "completion_length": 147.25000762939453, + "epoch": 0.6878094473437709, + "grad_norm": 1.3203125, + "kl": 0.04691416956484318, + "learning_rate": 1.342808975593408e-06, + "loss": 0.0019, + "reward": 3.3125000596046448, + "reward_std": 0.30922993645071983, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1285 + }, + { + "completion_length": 201.95834350585938, + "epoch": 0.6883447076140774, + "grad_norm": 1.0234375, + "kl": 0.03229131503030658, + "learning_rate": 1.3386694470592815e-06, + "loss": 0.0013, + "reward": 2.052083343267441, + "reward_std": 0.280670702457428, + "rewards/correctness_reward_func": 0.5833333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1286 + }, + { + "completion_length": 152.16666984558105, + "epoch": 0.6888799678843838, + "grad_norm": 1.046875, + "kl": 0.0439167320728302, + "learning_rate": 1.3345339747315367e-06, + "loss": 0.0018, + "reward": 2.78125, + "reward_std": 0.2788088917732239, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1287 + }, + { + "completion_length": 187.95833587646484, + "epoch": 0.6894152281546903, + "grad_norm": 1.546875, + "kl": 0.01913665747269988, + "learning_rate": 1.3304025730542342e-06, + "loss": 0.0008, + "reward": 2.270833373069763, + "reward_std": 0.7110214680433273, + "rewards/correctness_reward_func": 0.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1288 + }, + { + "completion_length": 113.16666793823242, + "epoch": 0.6899504884249966, + "grad_norm": 1.453125, + "kl": 0.03505153767764568, + "learning_rate": 1.3262752564572156e-06, + "loss": 0.0014, + "reward": 2.833333373069763, + "reward_std": 0.5163978338241577, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1289 + }, + { + "completion_length": 136.37500381469727, + "epoch": 0.6904857486953031, + "grad_norm": 1.7421875, + "kl": 0.040967449080199, + "learning_rate": 1.3221520393560594e-06, + "loss": 0.0016, + "reward": 3.020833373069763, + "reward_std": 0.6062580458819866, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1290 + }, + { + "completion_length": 155.9583396911621, + "epoch": 0.6910210089656095, + "grad_norm": 22.375, + "kl": 0.3683767984621227, + "learning_rate": 1.3180329361520195e-06, + "loss": 0.0147, + "reward": 3.130208373069763, + "reward_std": 0.6834863424301147, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4375000149011612, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1291 + }, + { + "completion_length": 130.5000057220459, + "epoch": 0.691556269235916, + "grad_norm": 1.9609375, + "kl": 0.07399945426732302, + "learning_rate": 1.313917961231986e-06, + "loss": 0.003, + "reward": 3.3750000596046448, + "reward_std": 0.306186206638813, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1292 + }, + { + "completion_length": 136.9583396911621, + "epoch": 0.6920915295062224, + "grad_norm": 1.8828125, + "kl": 0.051089849323034286, + "learning_rate": 1.3098071289684271e-06, + "loss": 0.002, + "reward": 2.8541667461395264, + "reward_std": 0.6297798566520214, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1293 + }, + { + "completion_length": 137.45833587646484, + "epoch": 0.6926267897765288, + "grad_norm": 1.0859375, + "kl": 0.04624842945486307, + "learning_rate": 1.3057004537193424e-06, + "loss": 0.0019, + "reward": 2.958333373069763, + "reward_std": 0.10206207260489464, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1294 + }, + { + "completion_length": 140.70833587646484, + "epoch": 0.6931620500468353, + "grad_norm": 1.5625, + "kl": 0.047750290017575026, + "learning_rate": 1.3015979498282138e-06, + "loss": 0.0019, + "reward": 3.1250000596046448, + "reward_std": 0.35120461508631706, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1295 + }, + { + "completion_length": 142.58333778381348, + "epoch": 0.6936973103171417, + "grad_norm": 1.21875, + "kl": 0.045999363996088505, + "learning_rate": 1.297499631623952e-06, + "loss": 0.0018, + "reward": 3.270833373069763, + "reward_std": 0.37377963587641716, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1296 + }, + { + "completion_length": 173.8333396911621, + "epoch": 0.6942325705874481, + "grad_norm": 1.765625, + "kl": 0.026084277778863907, + "learning_rate": 1.2934055134208487e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.7578418999910355, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1297 + }, + { + "completion_length": 145.79166984558105, + "epoch": 0.6947678308577546, + "grad_norm": 1.671875, + "kl": 0.03775433311238885, + "learning_rate": 1.2893156095185261e-06, + "loss": 0.0015, + "reward": 3.333333373069763, + "reward_std": 0.26872557401657104, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1298 + }, + { + "completion_length": 155.33333778381348, + "epoch": 0.695303091128061, + "grad_norm": 0.8515625, + "kl": 0.03690297156572342, + "learning_rate": 1.2852299342018864e-06, + "loss": 0.0015, + "reward": 3.4375000596046448, + "reward_std": 0.11558076366782188, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1299 + }, + { + "completion_length": 167.95833587646484, + "epoch": 0.6958383513983675, + "grad_norm": 1.53125, + "kl": 0.03322401223704219, + "learning_rate": 1.2811485017410657e-06, + "loss": 0.0013, + "reward": 3.0000000596046448, + "reward_std": 0.6986719593405724, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1300 + }, + { + "completion_length": 172.8333396911621, + "epoch": 0.6963736116686738, + "grad_norm": 1.9140625, + "kl": 0.055822163820266724, + "learning_rate": 1.277071326391377e-06, + "loss": 0.0022, + "reward": 2.3958334028720856, + "reward_std": 0.6410112343728542, + "rewards/correctness_reward_func": 1.0000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1301 + }, + { + "completion_length": 164.50000381469727, + "epoch": 0.6969088719389803, + "grad_norm": 2.1875, + "kl": 0.0494797071442008, + "learning_rate": 1.2729984223932655e-06, + "loss": 0.002, + "reward": 2.645833432674408, + "reward_std": 0.9421651512384415, + "rewards/correctness_reward_func": 1.2500000447034836, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1302 + }, + { + "completion_length": 163.70833587646484, + "epoch": 0.6974441322092868, + "grad_norm": 1.0, + "kl": 0.044104176107794046, + "learning_rate": 1.2689298039722598e-06, + "loss": 0.0018, + "reward": 3.333333373069763, + "reward_std": 0.3145497292280197, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1303 + }, + { + "completion_length": 166.1250057220459, + "epoch": 0.6979793924795932, + "grad_norm": 1.90625, + "kl": 0.02964206924661994, + "learning_rate": 1.2648654853389163e-06, + "loss": 0.0012, + "reward": 2.5625000298023224, + "reward_std": 0.7055750638246536, + "rewards/correctness_reward_func": 1.1666666939854622, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1304 + }, + { + "completion_length": 151.9166717529297, + "epoch": 0.6985146527498997, + "grad_norm": 1.7578125, + "kl": 0.038047782611101866, + "learning_rate": 1.2608054806887786e-06, + "loss": 0.0015, + "reward": 3.0416667461395264, + "reward_std": 0.7594528906047344, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1305 + }, + { + "completion_length": 139.12500381469727, + "epoch": 0.699049913020206, + "grad_norm": 0.59375, + "kl": 0.03407254721969366, + "learning_rate": 1.2567498042023187e-06, + "loss": 0.0014, + "reward": 3.395833373069763, + "reward_std": 0.25515520572662354, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1306 + }, + { + "completion_length": 144.04166793823242, + "epoch": 0.6995851732905125, + "grad_norm": 2.921875, + "kl": 0.060761140659451485, + "learning_rate": 1.2526984700448924e-06, + "loss": 0.0024, + "reward": 2.8854166865348816, + "reward_std": 0.28067072853446007, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.4687500074505806, + "step": 1307 + }, + { + "completion_length": 164.7916717529297, + "epoch": 0.700120433560819, + "grad_norm": 1.8828125, + "kl": 0.04494229191914201, + "learning_rate": 1.2486514923666895e-06, + "loss": 0.0018, + "reward": 3.208333432674408, + "reward_std": 0.5094902031123638, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1308 + }, + { + "completion_length": 151.62500190734863, + "epoch": 0.7006556938311254, + "grad_norm": 1.59375, + "kl": 0.039783548563718796, + "learning_rate": 1.2446088853026824e-06, + "loss": 0.0016, + "reward": 3.083333373069763, + "reward_std": 0.6293679773807526, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1309 + }, + { + "completion_length": 137.04166984558105, + "epoch": 0.7011909541014318, + "grad_norm": 1.671875, + "kl": 0.053418907802551985, + "learning_rate": 1.2405706629725814e-06, + "loss": 0.0021, + "reward": 3.3541667461395264, + "reward_std": 0.3572172746062279, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1310 + }, + { + "completion_length": 155.6666717529297, + "epoch": 0.7017262143717382, + "grad_norm": 1.2578125, + "kl": 0.04420957248657942, + "learning_rate": 1.236536839480779e-06, + "loss": 0.0018, + "reward": 3.1250000596046448, + "reward_std": 0.4554154574871063, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1311 + }, + { + "completion_length": 154.08333778381348, + "epoch": 0.7022614746420447, + "grad_norm": 1.5546875, + "kl": 0.0402086041867733, + "learning_rate": 1.2325074289163038e-06, + "loss": 0.0016, + "reward": 3.0260416865348816, + "reward_std": 0.495657354593277, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4635416716337204, + "step": 1312 + }, + { + "completion_length": 135.83333778381348, + "epoch": 0.7027967349123512, + "grad_norm": 1.8671875, + "kl": 0.037908658385276794, + "learning_rate": 1.2284824453527747e-06, + "loss": 0.0015, + "reward": 3.2291667461395264, + "reward_std": 0.5133541226387024, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1313 + }, + { + "completion_length": 129.8750057220459, + "epoch": 0.7033319951826575, + "grad_norm": 1.78125, + "kl": 0.034716119058430195, + "learning_rate": 1.2244619028483445e-06, + "loss": 0.0014, + "reward": 3.2500000596046448, + "reward_std": 0.4623230807483196, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1314 + }, + { + "completion_length": 146.0416717529297, + "epoch": 0.703867255452964, + "grad_norm": 0.8046875, + "kl": 0.026491194032132626, + "learning_rate": 1.2204458154456552e-06, + "loss": 0.0011, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1315 + }, + { + "completion_length": 110.41666984558105, + "epoch": 0.7044025157232704, + "grad_norm": 1.7265625, + "kl": 0.037520342506468296, + "learning_rate": 1.216434197171791e-06, + "loss": 0.0015, + "reward": 3.3750000596046448, + "reward_std": 0.25129128620028496, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1316 + }, + { + "completion_length": 148.79166984558105, + "epoch": 0.7049377759935769, + "grad_norm": 1.8125, + "kl": 0.02962551638484001, + "learning_rate": 1.2124270620382242e-06, + "loss": 0.0012, + "reward": 3.145833373069763, + "reward_std": 0.5042977184057236, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1317 + }, + { + "completion_length": 187.37500762939453, + "epoch": 0.7054730362638834, + "grad_norm": 0.671875, + "kl": 0.023369870614260435, + "learning_rate": 1.2084244240407692e-06, + "loss": 0.0009, + "reward": 2.9479166865348816, + "reward_std": 0.3116655945777893, + "rewards/correctness_reward_func": 1.5833333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 0.46875, + "step": 1318 + }, + { + "completion_length": 152.66666984558105, + "epoch": 0.7060082965341897, + "grad_norm": 1.6484375, + "kl": 0.03934650029987097, + "learning_rate": 1.2044262971595336e-06, + "loss": 0.0016, + "reward": 3.3750000596046448, + "reward_std": 0.3061862215399742, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1319 + }, + { + "completion_length": 117.91666984558105, + "epoch": 0.7065435568044962, + "grad_norm": 2.3125, + "kl": 0.07858934300020337, + "learning_rate": 1.2004326953588672e-06, + "loss": 0.0031, + "reward": 3.0625000596046448, + "reward_std": 0.6130734980106354, + "rewards/correctness_reward_func": 1.666666716337204, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1320 + }, + { + "completion_length": 158.00000190734863, + "epoch": 0.7070788170748026, + "grad_norm": 1.671875, + "kl": 0.016671715071424842, + "learning_rate": 1.1964436325873186e-06, + "loss": 0.0007, + "reward": 3.0625000596046448, + "reward_std": 0.6634034812450409, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1321 + }, + { + "completion_length": 161.62500762939453, + "epoch": 0.7076140773451091, + "grad_norm": 2.40625, + "kl": 0.04676167294383049, + "learning_rate": 1.1924591227775785e-06, + "loss": 0.0019, + "reward": 2.1250000596046448, + "reward_std": 0.9239676892757416, + "rewards/correctness_reward_func": 0.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1322 + }, + { + "completion_length": 165.9166717529297, + "epoch": 0.7081493376154155, + "grad_norm": 1.7734375, + "kl": 0.06114206160418689, + "learning_rate": 1.18847917984644e-06, + "loss": 0.0024, + "reward": 2.520833373069763, + "reward_std": 0.8347173631191254, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.3750000111758709, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1323 + }, + { + "completion_length": 121.00000190734863, + "epoch": 0.7086845978857219, + "grad_norm": 2.0625, + "kl": 0.10707360180094838, + "learning_rate": 1.1845038176947413e-06, + "loss": 0.0043, + "reward": 3.3541667461395264, + "reward_std": 0.31970493495464325, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1324 + }, + { + "completion_length": 150.4583396911621, + "epoch": 0.7092198581560284, + "grad_norm": 1.9140625, + "kl": 0.030952117405831814, + "learning_rate": 1.1805330502073227e-06, + "loss": 0.0012, + "reward": 3.3125000596046448, + "reward_std": 0.40438438951969147, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1325 + }, + { + "completion_length": 163.62500762939453, + "epoch": 0.7097551184263348, + "grad_norm": 1.734375, + "kl": 0.04357670247554779, + "learning_rate": 1.1765668912529774e-06, + "loss": 0.0017, + "reward": 2.895833432674408, + "reward_std": 0.7646453753113747, + "rewards/correctness_reward_func": 1.5000000447034836, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1326 + }, + { + "completion_length": 217.95833587646484, + "epoch": 0.7102903786966412, + "grad_norm": 1.6796875, + "kl": 0.036881398409605026, + "learning_rate": 1.172605354684401e-06, + "loss": 0.0015, + "reward": 2.2864584028720856, + "reward_std": 0.7588135302066803, + "rewards/correctness_reward_func": 1.0000000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3125000074505806, + "rewards/xmlcount_reward_func": 0.4947916716337204, + "step": 1327 + }, + { + "completion_length": 173.87500381469727, + "epoch": 0.7108256389669477, + "grad_norm": 1.8828125, + "kl": 0.060357251670211554, + "learning_rate": 1.1686484543381437e-06, + "loss": 0.0024, + "reward": 2.7812500596046448, + "reward_std": 0.6692114621400833, + "rewards/correctness_reward_func": 1.3333333656191826, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1328 + }, + { + "completion_length": 194.2500057220459, + "epoch": 0.7113608992372541, + "grad_norm": 1.3671875, + "kl": 0.05172150093130767, + "learning_rate": 1.1646962040345664e-06, + "loss": 0.0021, + "reward": 2.8125, + "reward_std": 0.5303218066692352, + "rewards/correctness_reward_func": 1.416666679084301, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1329 + }, + { + "completion_length": 143.8333396911621, + "epoch": 0.7118961595075606, + "grad_norm": 0.74609375, + "kl": 0.045900904573500156, + "learning_rate": 1.160748617577784e-06, + "loss": 0.0018, + "reward": 3.4791666865348816, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1330 + }, + { + "completion_length": 154.83333778381348, + "epoch": 0.712431419777867, + "grad_norm": 2.078125, + "kl": 0.03502502292394638, + "learning_rate": 1.1568057087556256e-06, + "loss": 0.0014, + "reward": 2.9791667461395264, + "reward_std": 0.8382776975631714, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1331 + }, + { + "completion_length": 137.12500381469727, + "epoch": 0.7129666800481734, + "grad_norm": 2.453125, + "kl": 0.03675575461238623, + "learning_rate": 1.1528674913395807e-06, + "loss": 0.0015, + "reward": 2.520833358168602, + "reward_std": 0.5290164612233639, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1332 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.7135019403184799, + "grad_norm": 1.5, + "kl": 0.03452045936137438, + "learning_rate": 1.148933979084752e-06, + "loss": 0.0014, + "reward": 2.4375000596046448, + "reward_std": 0.4259376786649227, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1333 + }, + { + "completion_length": 131.58333778381348, + "epoch": 0.7140372005887863, + "grad_norm": 0.96875, + "kl": 0.022100039292126894, + "learning_rate": 1.1450051857298118e-06, + "loss": 0.0009, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1334 + }, + { + "completion_length": 175.00000762939453, + "epoch": 0.7145724608590928, + "grad_norm": 1.75, + "kl": 0.03826391091570258, + "learning_rate": 1.1410811249969475e-06, + "loss": 0.0015, + "reward": 2.9166667461395264, + "reward_std": 0.8184719085693359, + "rewards/correctness_reward_func": 1.5000000298023224, + "rewards/int_reward_func": 0.4375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1335 + }, + { + "completion_length": 176.66666793823242, + "epoch": 0.7151077211293991, + "grad_norm": 2.21875, + "kl": 0.049864266999065876, + "learning_rate": 1.1371618105918177e-06, + "loss": 0.002, + "reward": 2.583333432674408, + "reward_std": 0.6198784969747066, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1336 + }, + { + "completion_length": 160.50000762939453, + "epoch": 0.7156429813997056, + "grad_norm": 1.9453125, + "kl": 0.02226860891096294, + "learning_rate": 1.1332472562035038e-06, + "loss": 0.0009, + "reward": 3.145833432674408, + "reward_std": 0.6300618499517441, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.5, + "step": 1337 + }, + { + "completion_length": 158.87500762939453, + "epoch": 0.7161782416700121, + "grad_norm": 1.4375, + "kl": 0.06685441732406616, + "learning_rate": 1.1293374755044602e-06, + "loss": 0.0027, + "reward": 2.9375000596046448, + "reward_std": 0.5262714438140392, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1338 + }, + { + "completion_length": 147.00000762939453, + "epoch": 0.7167135019403185, + "grad_norm": 1.671875, + "kl": 0.03872367646545172, + "learning_rate": 1.1254324821504717e-06, + "loss": 0.0015, + "reward": 3.0312501192092896, + "reward_std": 0.7034612894058228, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4895833358168602, + "step": 1339 + }, + { + "completion_length": 129.66667366027832, + "epoch": 0.717248762210625, + "grad_norm": 1.5234375, + "kl": 0.03320115152746439, + "learning_rate": 1.1215322897805984e-06, + "loss": 0.0013, + "reward": 2.8750000596046448, + "reward_std": 0.4909362643957138, + "rewards/correctness_reward_func": 1.4166666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1340 + }, + { + "completion_length": 188.20833587646484, + "epoch": 0.7177840224809313, + "grad_norm": 1.8984375, + "kl": 0.03837414178997278, + "learning_rate": 1.117636912017133e-06, + "loss": 0.0015, + "reward": 2.7916667461395264, + "reward_std": 0.9020005911588669, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1341 + }, + { + "completion_length": 123.20833396911621, + "epoch": 0.7183192827512378, + "grad_norm": 1.0859375, + "kl": 0.02904220810160041, + "learning_rate": 1.1137463624655537e-06, + "loss": 0.0012, + "reward": 2.7916666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1342 + }, + { + "completion_length": 163.12500381469727, + "epoch": 0.7188545430215443, + "grad_norm": 1.078125, + "kl": 0.029067810624837875, + "learning_rate": 1.1098606547144727e-06, + "loss": 0.0012, + "reward": 3.2135416865348816, + "reward_std": 0.3886178582906723, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1343 + }, + { + "completion_length": 165.5833396911621, + "epoch": 0.7193898032918506, + "grad_norm": 1.0078125, + "kl": 0.022788936970755458, + "learning_rate": 1.105979802335594e-06, + "loss": 0.0009, + "reward": 2.708333373069763, + "reward_std": 0.4289814233779907, + "rewards/correctness_reward_func": 1.2500000074505806, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1344 + }, + { + "completion_length": 141.4583396911621, + "epoch": 0.7199250635621571, + "grad_norm": 2.09375, + "kl": 0.07417263370007277, + "learning_rate": 1.1021038188836602e-06, + "loss": 0.003, + "reward": 2.9791667461395264, + "reward_std": 0.6616143435239792, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000149011612, + "rewards/xmlcount_reward_func": 0.5, + "step": 1345 + }, + { + "completion_length": 158.79166984558105, + "epoch": 0.7204603238324635, + "grad_norm": 1.65625, + "kl": 0.02835595328360796, + "learning_rate": 1.09823271789641e-06, + "loss": 0.0011, + "reward": 2.958333373069763, + "reward_std": 0.4752403795719147, + "rewards/correctness_reward_func": 1.5000000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1346 + }, + { + "completion_length": 155.25000381469727, + "epoch": 0.72099558410277, + "grad_norm": 1.015625, + "kl": 0.04103809129446745, + "learning_rate": 1.0943665128945277e-06, + "loss": 0.0016, + "reward": 3.0000000596046448, + "reward_std": 0.6123724579811096, + "rewards/correctness_reward_func": 1.5833333432674408, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1347 + }, + { + "completion_length": 153.33333587646484, + "epoch": 0.7215308443730765, + "grad_norm": 1.515625, + "kl": 0.05304192844778299, + "learning_rate": 1.0905052173815974e-06, + "loss": 0.0021, + "reward": 3.0416667461395264, + "reward_std": 0.5268727838993073, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1348 + }, + { + "completion_length": 148.58333587646484, + "epoch": 0.7220661046433828, + "grad_norm": 1.546875, + "kl": 0.05763786751776934, + "learning_rate": 1.086648844844058e-06, + "loss": 0.0023, + "reward": 3.2291667461395264, + "reward_std": 0.34831811115145683, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1349 + }, + { + "completion_length": 144.29167366027832, + "epoch": 0.7226013649136893, + "grad_norm": 1.8671875, + "kl": 0.026925162645056844, + "learning_rate": 1.082797408751151e-06, + "loss": 0.0011, + "reward": 2.7916667461395264, + "reward_std": 0.3602609895169735, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1350 + }, + { + "completion_length": 137.0416717529297, + "epoch": 0.7231366251839957, + "grad_norm": 0.04931640625, + "kl": 0.015334914904087782, + "learning_rate": 1.0789509225548767e-06, + "loss": 0.0006, + "reward": 3.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1351 + }, + { + "completion_length": 174.50000381469727, + "epoch": 0.7236718854543022, + "grad_norm": 1.3125, + "kl": 0.024215523153543472, + "learning_rate": 1.0751093996899486e-06, + "loss": 0.001, + "reward": 2.8750000298023224, + "reward_std": 0.2686738669872284, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1352 + }, + { + "completion_length": 125.04166984558105, + "epoch": 0.7242071457246085, + "grad_norm": 1.84375, + "kl": 0.044260346330702305, + "learning_rate": 1.0712728535737432e-06, + "loss": 0.0018, + "reward": 3.0625000596046448, + "reward_std": 0.7091782838106155, + "rewards/correctness_reward_func": 1.5833333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1353 + }, + { + "completion_length": 155.08333587646484, + "epoch": 0.724742405994915, + "grad_norm": 1.3125, + "kl": 0.029146920423954725, + "learning_rate": 1.0674412976062538e-06, + "loss": 0.0012, + "reward": 3.083333373069763, + "reward_std": 0.46232306957244873, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1354 + }, + { + "completion_length": 143.54166793823242, + "epoch": 0.7252776662652215, + "grad_norm": 2.21875, + "kl": 0.034341649152338505, + "learning_rate": 1.063614745170047e-06, + "loss": 0.0014, + "reward": 2.916666716337204, + "reward_std": 0.6123724281787872, + "rewards/correctness_reward_func": 1.4166667088866234, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1355 + }, + { + "completion_length": 143.12500762939453, + "epoch": 0.7258129265355279, + "grad_norm": 0.92578125, + "kl": 0.04088182095438242, + "learning_rate": 1.0597932096302114e-06, + "loss": 0.0016, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1356 + }, + { + "completion_length": 223.50000762939453, + "epoch": 0.7263481868058344, + "grad_norm": 1.7734375, + "kl": 0.030694124288856983, + "learning_rate": 1.0559767043343132e-06, + "loss": 0.0012, + "reward": 2.437500089406967, + "reward_std": 0.9109830409288406, + "rewards/correctness_reward_func": 1.0833333656191826, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4583333358168602, + "step": 1357 + }, + { + "completion_length": 158.5833396911621, + "epoch": 0.7268834470761407, + "grad_norm": 2.0, + "kl": 0.045903034042567015, + "learning_rate": 1.0521652426123504e-06, + "loss": 0.0018, + "reward": 2.854166805744171, + "reward_std": 0.7812078148126602, + "rewards/correctness_reward_func": 1.4166667014360428, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1358 + }, + { + "completion_length": 147.08333587646484, + "epoch": 0.7274187073464472, + "grad_norm": 24.875, + "kl": 0.4180497103370726, + "learning_rate": 1.0483588377767028e-06, + "loss": 0.0167, + "reward": 2.8802084028720856, + "reward_std": 0.67439004778862, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.3958333395421505, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1359 + }, + { + "completion_length": 166.87500762939453, + "epoch": 0.7279539676167537, + "grad_norm": 1.65625, + "kl": 0.0429176758043468, + "learning_rate": 1.044557503122092e-06, + "loss": 0.0017, + "reward": 2.958333373069763, + "reward_std": 0.6023809425532818, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1360 + }, + { + "completion_length": 128.5416717529297, + "epoch": 0.72848922788706, + "grad_norm": 1.390625, + "kl": 0.04165979754179716, + "learning_rate": 1.0407612519255262e-06, + "loss": 0.0017, + "reward": 2.375, + "reward_std": 0.5132361948490143, + "rewards/correctness_reward_func": 0.916666679084301, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1361 + }, + { + "completion_length": 171.9166717529297, + "epoch": 0.7290244881573665, + "grad_norm": 1.65625, + "kl": 0.04312491184100509, + "learning_rate": 1.0369700974462627e-06, + "loss": 0.0017, + "reward": 2.6666667461395264, + "reward_std": 0.40296074748039246, + "rewards/correctness_reward_func": 1.25, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1362 + }, + { + "completion_length": 184.04166984558105, + "epoch": 0.7295597484276729, + "grad_norm": 1.4921875, + "kl": 0.029584042262285948, + "learning_rate": 1.0331840529257544e-06, + "loss": 0.0012, + "reward": 2.9375000596046448, + "reward_std": 0.6062580458819866, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1363 + }, + { + "completion_length": 140.25000190734863, + "epoch": 0.7300950086979794, + "grad_norm": 2.28125, + "kl": 0.04231497598811984, + "learning_rate": 1.0294031315876072e-06, + "loss": 0.0017, + "reward": 3.208333432674408, + "reward_std": 0.5094902068376541, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1364 + }, + { + "completion_length": 153.08333778381348, + "epoch": 0.7306302689682859, + "grad_norm": 1.890625, + "kl": 0.044678494334220886, + "learning_rate": 1.0256273466375353e-06, + "loss": 0.0018, + "reward": 2.645833373069763, + "reward_std": 0.7632530629634857, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1365 + }, + { + "completion_length": 159.29166984558105, + "epoch": 0.7311655292385922, + "grad_norm": 1.71875, + "kl": 0.03514173999428749, + "learning_rate": 1.021856711263309e-06, + "loss": 0.0014, + "reward": 2.708333373069763, + "reward_std": 0.6272146701812744, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1366 + }, + { + "completion_length": 217.1666717529297, + "epoch": 0.7317007895088987, + "grad_norm": 0.83984375, + "kl": 0.02654549153521657, + "learning_rate": 1.0180912386347144e-06, + "loss": 0.0011, + "reward": 3.0885417461395264, + "reward_std": 0.3606287091970444, + "rewards/correctness_reward_func": 1.75, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000037252903, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1367 + }, + { + "completion_length": 152.83333778381348, + "epoch": 0.7322360497792051, + "grad_norm": 6.0625, + "kl": 0.20837300829589367, + "learning_rate": 1.014330941903508e-06, + "loss": 0.0083, + "reward": 3.0000001192092896, + "reward_std": 0.7425693273544312, + "rewards/correctness_reward_func": 1.6666666865348816, + "rewards/int_reward_func": 0.4583333432674408, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1368 + }, + { + "completion_length": 170.00000381469727, + "epoch": 0.7327713100495116, + "grad_norm": 1.3671875, + "kl": 0.04278033087030053, + "learning_rate": 1.0105758342033636e-06, + "loss": 0.0017, + "reward": 2.4166667461395264, + "reward_std": 0.16661179810762405, + "rewards/correctness_reward_func": 1.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1369 + }, + { + "completion_length": 118.66667175292969, + "epoch": 0.733306570319818, + "grad_norm": 26.0, + "kl": 0.13473708741366863, + "learning_rate": 1.0068259286498363e-06, + "loss": 0.0054, + "reward": 3.2916667461395264, + "reward_std": 0.47279801592230797, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.4375000074505806, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1370 + }, + { + "completion_length": 120.79166984558105, + "epoch": 0.7338418305901244, + "grad_norm": 1.171875, + "kl": 0.020003549987450242, + "learning_rate": 1.0030812383403074e-06, + "loss": 0.0008, + "reward": 3.395833373069763, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1371 + }, + { + "completion_length": 173.6666717529297, + "epoch": 0.7343770908604309, + "grad_norm": 1.5859375, + "kl": 0.031346763018518686, + "learning_rate": 9.993417763539438e-07, + "loss": 0.0013, + "reward": 2.6250000298023224, + "reward_std": 0.6747233681380749, + "rewards/correctness_reward_func": 1.3333333432674408, + "rewards/int_reward_func": 0.4166666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3750000111758709, + "rewards/xmlcount_reward_func": 0.5, + "step": 1372 + }, + { + "completion_length": 196.75000762939453, + "epoch": 0.7349123511307373, + "grad_norm": 1.7734375, + "kl": 0.042542679235339165, + "learning_rate": 9.956075557516535e-07, + "loss": 0.0017, + "reward": 2.583333432674408, + "reward_std": 0.80763865634799, + "rewards/correctness_reward_func": 1.2500000223517418, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.416666679084301, + "rewards/xmlcount_reward_func": 0.5, + "step": 1373 + }, + { + "completion_length": 139.20833778381348, + "epoch": 0.7354476114010438, + "grad_norm": 2.109375, + "kl": 0.041438264306634665, + "learning_rate": 9.918785895760348e-07, + "loss": 0.0017, + "reward": 2.770833432674408, + "reward_std": 1.0852452516555786, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1374 + }, + { + "completion_length": 145.62500381469727, + "epoch": 0.7359828716713502, + "grad_norm": 2.15625, + "kl": 0.037300676107406616, + "learning_rate": 9.881548908513358e-07, + "loss": 0.0015, + "reward": 3.1875000596046448, + "reward_std": 0.6376042664051056, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1375 + }, + { + "completion_length": 194.37500762939453, + "epoch": 0.7365181319416566, + "grad_norm": 1.8125, + "kl": 0.042442481964826584, + "learning_rate": 9.844364725834058e-07, + "loss": 0.0017, + "reward": 2.416666716337204, + "reward_std": 0.5596308261156082, + "rewards/correctness_reward_func": 1.0000000223517418, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1376 + }, + { + "completion_length": 190.41666984558105, + "epoch": 0.7370533922119631, + "grad_norm": 1.53125, + "kl": 0.02683873614296317, + "learning_rate": 9.807233477596504e-07, + "loss": 0.0011, + "reward": 2.3541667461395264, + "reward_std": 0.8361027240753174, + "rewards/correctness_reward_func": 1.0000000074505806, + "rewards/int_reward_func": 0.35416666977107525, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1377 + }, + { + "completion_length": 136.83333587646484, + "epoch": 0.7375886524822695, + "grad_norm": 1.2109375, + "kl": 0.027323594084009528, + "learning_rate": 9.77015529348989e-07, + "loss": 0.0011, + "reward": 3.2500000596046448, + "reward_std": 0.46232305467128754, + "rewards/correctness_reward_func": 1.7500000298023224, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1378 + }, + { + "completion_length": 184.875, + "epoch": 0.7381239127525759, + "grad_norm": 1.2265625, + "kl": 0.037914395332336426, + "learning_rate": 9.733130303018051e-07, + "loss": 0.0015, + "reward": 2.380208358168602, + "reward_std": 0.32088102027773857, + "rewards/correctness_reward_func": 1.0833333358168602, + "rewards/int_reward_func": 0.3541666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1379 + }, + { + "completion_length": 146.0833396911621, + "epoch": 0.7386591730228824, + "grad_norm": 8.0625, + "kl": 0.25140155758708715, + "learning_rate": 9.696158635499032e-07, + "loss": 0.0101, + "reward": 3.1875001192092896, + "reward_std": 0.6730582565069199, + "rewards/correctness_reward_func": 1.7500000596046448, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.5, + "step": 1380 + }, + { + "completion_length": 147.50000190734863, + "epoch": 0.7391944332931888, + "grad_norm": 1.125, + "kl": 0.0396621716208756, + "learning_rate": 9.659240420064647e-07, + "loss": 0.0016, + "reward": 3.395833373069763, + "reward_std": 0.25515518710017204, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1381 + }, + { + "completion_length": 135.12500381469727, + "epoch": 0.7397296935634953, + "grad_norm": 0.98828125, + "kl": 0.03235420072451234, + "learning_rate": 9.622375785660004e-07, + "loss": 0.0013, + "reward": 3.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1382 + }, + { + "completion_length": 134.79166984558105, + "epoch": 0.7402649538338016, + "grad_norm": 1.9140625, + "kl": 0.04798926878720522, + "learning_rate": 9.585564861043087e-07, + "loss": 0.0019, + "reward": 3.3125000596046448, + "reward_std": 0.459279328584671, + "rewards/correctness_reward_func": 1.8333333730697632, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1383 + }, + { + "completion_length": 158.12500762939453, + "epoch": 0.7408002141041081, + "grad_norm": 1.671875, + "kl": 0.03119900869205594, + "learning_rate": 9.548807774784264e-07, + "loss": 0.0012, + "reward": 2.9791667461395264, + "reward_std": 0.5674288682639599, + "rewards/correctness_reward_func": 1.5000000149011612, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1384 + }, + { + "completion_length": 171.87500381469727, + "epoch": 0.7413354743744146, + "grad_norm": 2.046875, + "kl": 0.04780485853552818, + "learning_rate": 9.512104655265869e-07, + "loss": 0.0019, + "reward": 2.6041667461395264, + "reward_std": 1.1054013073444366, + "rewards/correctness_reward_func": 1.1666666865348816, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1385 + }, + { + "completion_length": 121.33333778381348, + "epoch": 0.741870734644721, + "grad_norm": 1.0078125, + "kl": 0.029458944220095873, + "learning_rate": 9.475455630681745e-07, + "loss": 0.0012, + "reward": 3.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1386 + }, + { + "completion_length": 133.41666984558105, + "epoch": 0.7424059949150275, + "grad_norm": 1.3359375, + "kl": 0.03565332805737853, + "learning_rate": 9.438860829036794e-07, + "loss": 0.0014, + "reward": 3.395833373069763, + "reward_std": 0.20241357013583183, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4375000074505806, + "rewards/xmlcount_reward_func": 0.4791666716337204, + "step": 1387 + }, + { + "completion_length": 145.0416717529297, + "epoch": 0.7429412551853338, + "grad_norm": 1.5859375, + "kl": 0.04745309241116047, + "learning_rate": 9.402320378146551e-07, + "loss": 0.0019, + "reward": 2.770833373069763, + "reward_std": 0.4488043449819088, + "rewards/correctness_reward_func": 1.3333333730697632, + "rewards/int_reward_func": 0.4791666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1388 + }, + { + "completion_length": 141.7500057220459, + "epoch": 0.7434765154556403, + "grad_norm": 1.34375, + "kl": 0.04251825390383601, + "learning_rate": 9.365834405636692e-07, + "loss": 0.0017, + "reward": 2.9166666865348816, + "reward_std": 0.11949636042118073, + "rewards/correctness_reward_func": 1.5, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4166666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1389 + }, + { + "completion_length": 164.12500381469727, + "epoch": 0.7440117757259468, + "grad_norm": 2.03125, + "kl": 0.037888340186327696, + "learning_rate": 9.329403038942617e-07, + "loss": 0.0015, + "reward": 3.0000001192092896, + "reward_std": 0.5433852225542068, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1390 + }, + { + "completion_length": 157.25000762939453, + "epoch": 0.7445470359962532, + "grad_norm": 2.375, + "kl": 0.03995004156604409, + "learning_rate": 9.293026405309033e-07, + "loss": 0.0016, + "reward": 1.791666716337204, + "reward_std": 0.6524690836668015, + "rewards/correctness_reward_func": 0.4166666716337204, + "rewards/int_reward_func": 0.4166666716337204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333358168602, + "rewards/xmlcount_reward_func": 0.5, + "step": 1391 + }, + { + "completion_length": 194.3333396911621, + "epoch": 0.7450822962665596, + "grad_norm": 2.109375, + "kl": 0.041776688769459724, + "learning_rate": 9.256704631789443e-07, + "loss": 0.0017, + "reward": 2.1041667759418488, + "reward_std": 1.0553452968597412, + "rewards/correctness_reward_func": 0.8333333507180214, + "rewards/int_reward_func": 0.416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3541666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1392 + }, + { + "completion_length": 139.20833778381348, + "epoch": 0.745617556536866, + "grad_norm": 1.953125, + "kl": 0.046518485993146896, + "learning_rate": 9.220437845245766e-07, + "loss": 0.0019, + "reward": 3.3750000596046448, + "reward_std": 0.306186206638813, + "rewards/correctness_reward_func": 1.9166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1393 + }, + { + "completion_length": 133.8333396911621, + "epoch": 0.7461528168071725, + "grad_norm": 2.375, + "kl": 0.04975722776725888, + "learning_rate": 9.184226172347854e-07, + "loss": 0.002, + "reward": 2.7916667461395264, + "reward_std": 1.1091627776622772, + "rewards/correctness_reward_func": 1.416666716337204, + "rewards/int_reward_func": 0.416666679084301, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4583333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1394 + }, + { + "completion_length": 148.95833587646484, + "epoch": 0.746688077077479, + "grad_norm": 0.7578125, + "kl": 0.029789446853101254, + "learning_rate": 9.148069739573056e-07, + "loss": 0.0012, + "reward": 3.1666666865348816, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1395 + }, + { + "completion_length": 185.4583396911621, + "epoch": 0.7472233373477853, + "grad_norm": 0.8828125, + "kl": 0.03485368099063635, + "learning_rate": 9.111968673205799e-07, + "loss": 0.0014, + "reward": 3.005208373069763, + "reward_std": 0.34312520548701286, + "rewards/correctness_reward_func": 1.6666666716337204, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333395421505, + "rewards/xmlcount_reward_func": 0.484375, + "step": 1396 + }, + { + "completion_length": 141.4583396911621, + "epoch": 0.7477585976180918, + "grad_norm": 1.34375, + "kl": 0.03351549245417118, + "learning_rate": 9.075923099337114e-07, + "loss": 0.0013, + "reward": 3.333333373069763, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 1.8333333432674408, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1397 + }, + { + "completion_length": 166.12500381469727, + "epoch": 0.7482938578883982, + "grad_norm": 1.5703125, + "kl": 0.04391408711671829, + "learning_rate": 9.039933143864216e-07, + "loss": 0.0018, + "reward": 3.3541666865348816, + "reward_std": 0.22407886758446693, + "rewards/correctness_reward_func": 2.0, + "rewards/int_reward_func": 0.4583333358168602, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.3958333432674408, + "rewards/xmlcount_reward_func": 0.5, + "step": 1398 + }, + { + "completion_length": 139.04166984558105, + "epoch": 0.7488291181587047, + "grad_norm": 1.4609375, + "kl": 0.03669837862253189, + "learning_rate": 9.003998932490079e-07, + "loss": 0.0015, + "reward": 2.895833373069763, + "reward_std": 0.25515519082546234, + "rewards/correctness_reward_func": 1.4166666865348816, + "rewards/int_reward_func": 0.5, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.4791666716337204, + "rewards/xmlcount_reward_func": 0.5, + "step": 1399 + }, + { + "completion_length": 167.7916717529297, + "epoch": 0.7493643784290112, + "grad_norm": 1.6328125, + "kl": 0.032958225812762976, + "learning_rate": 8.968120590722951e-07, + "loss": 0.0013, + "reward": 2.958333343267441, + "reward_std": 0.572748601436615, + "rewards/correctness_reward_func": 1.583333358168602, + "rewards/int_reward_func": 0.3750000037252903, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.5, + "rewards/xmlcount_reward_func": 0.5, + "step": 1400 + } + ], + "logging_steps": 1, + "max_steps": 1868, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}