{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.403361344537815, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 239.5, "epoch": 0.008403361344537815, "grad_norm": 0.5275928378105164, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": -0.593999981880188, "reward_std": 0.1400071531534195, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.593999981880188, "step": 1 }, { "completion_length": 194.5, "epoch": 0.01680672268907563, "grad_norm": 0.76406329870224, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "reward": -0.4819999933242798, "reward_std": 0.4794183671474457, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4819999933242798, "step": 2 }, { "completion_length": 186.5, "epoch": 0.025210084033613446, "grad_norm": 0.7028466463088989, "kl": 0.000461863586679101, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": -0.44300001859664917, "reward_std": 0.12586501240730286, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44300001859664917, "step": 3 }, { "completion_length": 256.0, "epoch": 0.03361344537815126, "grad_norm": 0.002048119669780135, "kl": 0.00041455624159425497, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 4 }, { "completion_length": 183.5, "epoch": 0.04201680672268908, "grad_norm": 0.6061137318611145, "kl": 0.0004681727150455117, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.12549999356269836, "reward_std": 0.17748379707336426, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12549999356269836, "step": 5 }, { "completion_length": 256.0, "epoch": 0.05042016806722689, "grad_norm": 0.5085726976394653, "kl": 0.0003166799433529377, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": -0.3384999930858612, "reward_std": 0.6554879546165466, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3384999930858612, "step": 6 }, { "completion_length": 255.5, "epoch": 0.058823529411764705, "grad_norm": 0.620286762714386, "kl": 0.0006941207684576511, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": -1.0125000476837158, "reward_std": 0.04313350468873978, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -1.0125000476837158, "step": 7 }, { "completion_length": 256.0, "epoch": 0.06722689075630252, "grad_norm": 0.001316132373176515, "kl": 0.00032240129075944424, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 8 }, { "completion_length": 256.0, "epoch": 0.07563025210084033, "grad_norm": 0.46780896186828613, "kl": 0.00024260161444544792, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": -0.3084999918937683, "reward_std": 0.6130615472793579, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3084999918937683, "step": 9 }, { "completion_length": 239.5, "epoch": 0.08403361344537816, "grad_norm": 0.8744885921478271, "kl": 0.0007633959176018834, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": -0.3179999887943268, "reward_std": 0.6264966130256653, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3179999887943268, "step": 10 }, { "completion_length": 256.0, "epoch": 0.09243697478991597, "grad_norm": 0.4798508584499359, "kl": 0.00046879868023097515, "learning_rate": 5.5e-07, "loss": 0.0, "reward": -0.7990000247955322, "reward_std": 0.05939696356654167, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7990000247955322, "step": 11 }, { "completion_length": 256.0, "epoch": 0.10084033613445378, "grad_norm": 0.001967609627172351, "kl": 0.0005342272343114018, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 12 }, { "completion_length": 256.0, "epoch": 0.1092436974789916, "grad_norm": 0.5731886625289917, "kl": 0.0007240658160299063, "learning_rate": 6.5e-07, "loss": 0.0, "reward": -0.3619999885559082, "reward_std": 0.6887219548225403, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3619999885559082, "step": 13 }, { "completion_length": 150.0, "epoch": 0.11764705882352941, "grad_norm": 0.7903950810432434, "kl": 0.000604015018325299, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": -0.18399998545646667, "reward_std": 0.07778174430131912, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18399998545646667, "step": 14 }, { "completion_length": 256.0, "epoch": 0.12605042016806722, "grad_norm": 0.6017890572547913, "kl": 0.0004533551400527358, "learning_rate": 7.5e-07, "loss": 0.0, "reward": -0.375, "reward_std": 0.5303300619125366, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.375, "step": 15 }, { "completion_length": 256.0, "epoch": 0.13445378151260504, "grad_norm": 0.0037525801453739405, "kl": 0.0005867372965440154, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 16 }, { "completion_length": 245.5, "epoch": 0.14285714285714285, "grad_norm": 0.617506742477417, "kl": 0.000612594885751605, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 0.16949999332427979, "reward_std": 0.06293250620365143, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16949999332427979, "step": 17 }, { "completion_length": 256.0, "epoch": 0.15126050420168066, "grad_norm": 0.5883203148841858, "kl": 0.00043458084110170603, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": -0.40799999237060547, "reward_std": 0.7537758350372314, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40799999237060547, "step": 18 }, { "completion_length": 244.0, "epoch": 0.15966386554621848, "grad_norm": 0.6466447710990906, "kl": 0.0007290478679351509, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": -0.4740000069141388, "reward_std": 0.6703372597694397, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4740000069141388, "step": 19 }, { "completion_length": 178.0, "epoch": 0.16806722689075632, "grad_norm": 0.8394845724105835, "kl": 0.0006450924556702375, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": -0.46950000524520874, "reward_std": 0.35567471385002136, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.46950000524520874, "step": 20 }, { "completion_length": 199.5, "epoch": 0.17647058823529413, "grad_norm": 0.8017348051071167, "kl": 0.0007282905280590057, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": -0.45399999618530273, "reward_std": 0.642052948474884, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45399999618530273, "step": 21 }, { "completion_length": 114.5, "epoch": 0.18487394957983194, "grad_norm": 0.7468470931053162, "kl": 0.0005178428255021572, "learning_rate": 1.1e-06, "loss": 0.0, "reward": -0.18299999833106995, "reward_std": 0.025455841794610023, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18299999833106995, "step": 22 }, { "completion_length": 215.5, "epoch": 0.19327731092436976, "grad_norm": 0.6611573100090027, "kl": 0.000638445490039885, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": -0.6310000419616699, "reward_std": 0.1032375693321228, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6310000419616699, "step": 23 }, { "completion_length": 256.0, "epoch": 0.20168067226890757, "grad_norm": 0.4637366533279419, "kl": 0.0005411126185208559, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": -0.4194999933242798, "reward_std": 0.7700392603874207, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4194999933242798, "step": 24 }, { "completion_length": 247.0, "epoch": 0.21008403361344538, "grad_norm": 0.6701123714447021, "kl": 0.0005814994219690561, "learning_rate": 1.25e-06, "loss": 0.0, "reward": -0.840499997138977, "reward_std": 0.2538513243198395, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.840499997138977, "step": 25 }, { "completion_length": 206.0, "epoch": 0.2184873949579832, "grad_norm": 0.8757864832878113, "kl": 0.0005857151700183749, "learning_rate": 1.3e-06, "loss": 0.0, "reward": -0.6934999823570251, "reward_std": 0.3698168396949768, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6934999823570251, "step": 26 }, { "completion_length": 256.0, "epoch": 0.226890756302521, "grad_norm": 0.5860095024108887, "kl": 0.0003975172294303775, "learning_rate": 1.3500000000000002e-06, "loss": 0.0, "reward": -0.3149999976158142, "reward_std": 0.622253954410553, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3149999976158142, "step": 27 }, { "completion_length": 256.0, "epoch": 0.23529411764705882, "grad_norm": 0.5088208317756653, "kl": 0.0006337170489132404, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": -0.9915000200271606, "reward_std": 0.1237436980009079, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9915000200271606, "step": 28 }, { "completion_length": 221.5, "epoch": 0.24369747899159663, "grad_norm": 0.6378973126411438, "kl": 0.000504142080899328, "learning_rate": 1.45e-06, "loss": 0.0, "reward": -0.29649999737739563, "reward_std": 0.774281919002533, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.29649999737739563, "step": 29 }, { "completion_length": 256.0, "epoch": 0.25210084033613445, "grad_norm": 0.5039516687393188, "kl": 0.0005410774610936642, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 30 }, { "completion_length": 211.0, "epoch": 0.2605042016806723, "grad_norm": 0.7033306360244751, "kl": 0.0006368912872858346, "learning_rate": 1.5500000000000002e-06, "loss": 0.0, "reward": -0.6884999871253967, "reward_std": 0.4023437798023224, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6884999871253967, "step": 31 }, { "completion_length": 256.0, "epoch": 0.2689075630252101, "grad_norm": 0.003668795572593808, "kl": 0.0005547598702833056, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 32 }, { "completion_length": 256.0, "epoch": 0.2773109243697479, "grad_norm": 0.5562208294868469, "kl": 0.00030744855757802725, "learning_rate": 1.6500000000000003e-06, "loss": 0.0, "reward": -0.4000000059604645, "reward_std": 0.5656854510307312, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4000000059604645, "step": 33 }, { "completion_length": 256.0, "epoch": 0.2857142857142857, "grad_norm": 0.6205936074256897, "kl": 0.0006913439137861133, "learning_rate": 1.7000000000000002e-06, "loss": 0.0, "reward": -0.9294999837875366, "reward_std": 0.11808682978153229, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9294999837875366, "step": 34 }, { "completion_length": 217.0, "epoch": 0.29411764705882354, "grad_norm": 0.5948581695556641, "kl": 0.0006013556849211454, "learning_rate": 1.75e-06, "loss": 0.0, "reward": -0.18400000035762787, "reward_std": 0.43699201941490173, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18400000035762787, "step": 35 }, { "completion_length": 256.0, "epoch": 0.3025210084033613, "grad_norm": 0.0018553792033344507, "kl": 0.0003625560784712434, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 36 }, { "completion_length": 256.0, "epoch": 0.31092436974789917, "grad_norm": 0.5708751678466797, "kl": 0.00041932554449886084, "learning_rate": 1.85e-06, "loss": 0.0, "reward": -0.2930000126361847, "reward_std": 0.5911412835121155, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2930000126361847, "step": 37 }, { "completion_length": 178.0, "epoch": 0.31932773109243695, "grad_norm": 0.004888953175395727, "kl": 0.0005122203729115427, "learning_rate": 1.9000000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 38 }, { "completion_length": 157.0, "epoch": 0.3277310924369748, "grad_norm": 1.6002107858657837, "kl": 0.0006121749756857753, "learning_rate": 1.9500000000000004e-06, "loss": 0.0, "reward": -0.4440000057220459, "reward_std": 0.7580184936523438, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4440000057220459, "step": 39 }, { "completion_length": 256.0, "epoch": 0.33613445378151263, "grad_norm": 0.6525126099586487, "kl": 0.000592769356444478, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": -0.29750001430511475, "reward_std": 0.5975052118301392, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.29750001430511475, "step": 40 }, { "completion_length": 251.0, "epoch": 0.3445378151260504, "grad_norm": 0.6208035349845886, "kl": 0.00048222424811683595, "learning_rate": 2.05e-06, "loss": 0.0, "reward": -0.3569999933242798, "reward_std": 0.6816509366035461, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3569999933242798, "step": 41 }, { "completion_length": 256.0, "epoch": 0.35294117647058826, "grad_norm": 0.0017297575250267982, "kl": 0.0004967615241184831, "learning_rate": 2.1000000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 42 }, { "completion_length": 256.0, "epoch": 0.36134453781512604, "grad_norm": 0.6571947336196899, "kl": 0.0007757593411952257, "learning_rate": 2.15e-06, "loss": 0.0, "reward": -0.42649999260902405, "reward_std": 0.7799387574195862, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42649999260902405, "step": 43 }, { "completion_length": 187.5, "epoch": 0.3697478991596639, "grad_norm": 0.8692558407783508, "kl": 0.0004379293241072446, "learning_rate": 2.2e-06, "loss": 0.0, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 44 }, { "completion_length": 256.0, "epoch": 0.37815126050420167, "grad_norm": 0.0030803035479038954, "kl": 0.00040722498670220375, "learning_rate": 2.25e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 45 }, { "completion_length": 256.0, "epoch": 0.3865546218487395, "grad_norm": 0.5593839287757874, "kl": 0.0005106262397021055, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "reward": -0.4399999976158142, "reward_std": 0.7990306615829468, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4399999976158142, "step": 46 }, { "completion_length": 167.0, "epoch": 0.3949579831932773, "grad_norm": 0.8893198370933533, "kl": 0.0006667847046628594, "learning_rate": 2.35e-06, "loss": 0.0, "reward": -0.37849998474121094, "reward_std": 0.3372899293899536, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37849998474121094, "step": 47 }, { "completion_length": 171.0, "epoch": 0.40336134453781514, "grad_norm": 1.046221137046814, "kl": 0.0010377075523138046, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": -0.3774999976158142, "reward_std": 0.6950860023498535, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3774999976158142, "step": 48 }, { "completion_length": 256.0, "epoch": 0.4117647058823529, "grad_norm": 0.6182668805122375, "kl": 0.0004191758343949914, "learning_rate": 2.4500000000000003e-06, "loss": 0.0, "reward": -0.8385000228881836, "reward_std": 0.10960157215595245, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8385000228881836, "step": 49 }, { "completion_length": 225.5, "epoch": 0.42016806722689076, "grad_norm": 0.7239448428153992, "kl": 0.0007323765894398093, "learning_rate": 2.5e-06, "loss": 0.0, "reward": -0.715499997138977, "reward_std": 0.28920671343803406, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.715499997138977, "step": 50 }, { "completion_length": 256.0, "epoch": 0.42857142857142855, "grad_norm": 0.6164533495903015, "kl": 0.0005570960929617286, "learning_rate": 2.55e-06, "loss": 0.0, "reward": -0.4050000011920929, "reward_std": 0.5727564692497253, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4050000011920929, "step": 51 }, { "completion_length": 230.0, "epoch": 0.4369747899159664, "grad_norm": 0.8734904527664185, "kl": 0.0007351982640102506, "learning_rate": 2.6e-06, "loss": 0.0, "reward": -0.21950000524520874, "reward_std": 0.48719659447669983, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21950000524520874, "step": 52 }, { "completion_length": 256.0, "epoch": 0.44537815126050423, "grad_norm": 0.542896568775177, "kl": 0.0005189279327169061, "learning_rate": 2.6500000000000005e-06, "loss": 0.0, "reward": -0.382999986410141, "reward_std": 0.7184205055236816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.382999986410141, "step": 53 }, { "completion_length": 194.0, "epoch": 0.453781512605042, "grad_norm": 0.8967416286468506, "kl": 0.0008001920068636537, "learning_rate": 2.7000000000000004e-06, "loss": 0.0, "reward": -0.5579999685287476, "reward_std": 0.3592102527618408, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5579999685287476, "step": 54 }, { "completion_length": 241.0, "epoch": 0.46218487394957986, "grad_norm": 0.6024434566497803, "kl": 0.000739659764803946, "learning_rate": 2.7500000000000004e-06, "loss": 0.0, "reward": -0.3310000002384186, "reward_std": 0.6448813676834106, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3310000002384186, "step": 55 }, { "completion_length": 203.5, "epoch": 0.47058823529411764, "grad_norm": 0.6461113691329956, "kl": 0.000767363584600389, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "reward": -0.5499999523162842, "reward_std": 0.37052392959594727, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5499999523162842, "step": 56 }, { "completion_length": 256.0, "epoch": 0.4789915966386555, "grad_norm": 0.5811096429824829, "kl": 0.0007388386875391006, "learning_rate": 2.85e-06, "loss": 0.0, "reward": -0.9520000219345093, "reward_std": 0.08485277742147446, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9520000219345093, "step": 57 }, { "completion_length": 256.0, "epoch": 0.48739495798319327, "grad_norm": 0.5078489184379578, "kl": 0.00042089505586773157, "learning_rate": 2.9e-06, "loss": 0.0, "reward": -0.320499986410141, "reward_std": 0.45325541496276855, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.320499986410141, "step": 58 }, { "completion_length": 226.0, "epoch": 0.4957983193277311, "grad_norm": 0.5292133092880249, "kl": 0.0005924634169787169, "learning_rate": 2.95e-06, "loss": -0.0, "reward": -0.6325000524520874, "reward_std": 0.0007070977007970214, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6325000524520874, "step": 59 }, { "completion_length": 256.0, "epoch": 0.5042016806722689, "grad_norm": 0.6018537282943726, "kl": 0.0005097792018204927, "learning_rate": 3e-06, "loss": 0.0, "reward": -0.7179999947547913, "reward_std": 0.018384791910648346, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7179999947547913, "step": 60 }, { "completion_length": 252.0, "epoch": 0.5126050420168067, "grad_norm": 0.6424996256828308, "kl": 0.0004537454224191606, "learning_rate": 3.05e-06, "loss": 0.0, "reward": -0.21650001406669617, "reward_std": 0.4829539656639099, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21650001406669617, "step": 61 }, { "completion_length": 256.0, "epoch": 0.5210084033613446, "grad_norm": 0.6864486336708069, "kl": 0.0009388542966917157, "learning_rate": 3.1000000000000004e-06, "loss": 0.0, "reward": -0.7515000104904175, "reward_std": 0.09545940905809402, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7515000104904175, "step": 62 }, { "completion_length": 245.0, "epoch": 0.5294117647058824, "grad_norm": 0.5880638957023621, "kl": 0.0004731210065074265, "learning_rate": 3.1500000000000003e-06, "loss": 0.0, "reward": 0.30949997901916504, "reward_std": 0.2609224021434784, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.30949997901916504, "step": 63 }, { "completion_length": 250.0, "epoch": 0.5378151260504201, "grad_norm": 0.5453276038169861, "kl": 0.000731406849808991, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "reward": -0.9649999737739563, "reward_std": 0.1385929137468338, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9649999737739563, "step": 64 }, { "completion_length": 256.0, "epoch": 0.5462184873949579, "grad_norm": 0.6058622002601624, "kl": 0.0016077194595709443, "learning_rate": 3.2500000000000002e-06, "loss": 0.0001, "reward": -0.2750000059604645, "reward_std": 0.5656854510307312, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2750000059604645, "step": 65 }, { "completion_length": 256.0, "epoch": 0.5546218487394958, "grad_norm": 0.002362010069191456, "kl": 0.0003772533964365721, "learning_rate": 3.3000000000000006e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 66 }, { "completion_length": 214.0, "epoch": 0.5630252100840336, "grad_norm": 0.9808260798454285, "kl": 0.003824027720838785, "learning_rate": 3.3500000000000005e-06, "loss": 0.0002, "reward": -0.5684999823570251, "reward_std": 0.2580939531326294, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5684999823570251, "step": 67 }, { "completion_length": 185.5, "epoch": 0.5714285714285714, "grad_norm": 0.7452138662338257, "kl": 0.0008018913213163614, "learning_rate": 3.4000000000000005e-06, "loss": 0.0, "reward": -0.5065000057220459, "reward_std": 0.31466248631477356, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5065000057220459, "step": 68 }, { "completion_length": 256.0, "epoch": 0.5798319327731093, "grad_norm": 0.5869529843330383, "kl": 0.0005979267880320549, "learning_rate": 3.45e-06, "loss": 0.0, "reward": -0.3774999976158142, "reward_std": 0.7106423377990723, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3774999976158142, "step": 69 }, { "completion_length": 256.0, "epoch": 0.5882352941176471, "grad_norm": 0.009374188259243965, "kl": 0.0008036821382120252, "learning_rate": 3.5e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 70 }, { "completion_length": 256.0, "epoch": 0.5966386554621849, "grad_norm": 0.008580519817769527, "kl": 0.001065843622200191, "learning_rate": 3.5500000000000003e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 71 }, { "completion_length": 128.0, "epoch": 0.6050420168067226, "grad_norm": 0.8535614609718323, "kl": 0.0005846592830494046, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "reward": -0.24150000512599945, "reward_std": 0.02474873512983322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24150000512599945, "step": 72 }, { "completion_length": 256.0, "epoch": 0.6134453781512605, "grad_norm": 0.5905005931854248, "kl": 0.001255570212379098, "learning_rate": 3.65e-06, "loss": 0.0001, "reward": -0.3580000102519989, "reward_std": 0.6830651760101318, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3580000102519989, "step": 73 }, { "completion_length": 256.0, "epoch": 0.6218487394957983, "grad_norm": 0.0019246465526521206, "kl": 0.000560491462238133, "learning_rate": 3.7e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 74 }, { "completion_length": 256.0, "epoch": 0.6302521008403361, "grad_norm": 0.558110773563385, "kl": 0.0003808698384091258, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 75 }, { "completion_length": 256.0, "epoch": 0.6386554621848739, "grad_norm": 0.7088025808334351, "kl": 0.0016581136733293533, "learning_rate": 3.8000000000000005e-06, "loss": 0.0001, "reward": -0.21399998664855957, "reward_std": 0.6561950445175171, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21399998664855957, "step": 76 }, { "completion_length": 246.0, "epoch": 0.6470588235294118, "grad_norm": 0.43591105937957764, "kl": 0.0004626316367648542, "learning_rate": 3.85e-06, "loss": 0.0, "reward": -0.6894999742507935, "reward_std": 0.1633416712284088, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6894999742507935, "step": 77 }, { "completion_length": 256.0, "epoch": 0.6554621848739496, "grad_norm": 0.585048258304596, "kl": 0.001659805653616786, "learning_rate": 3.900000000000001e-06, "loss": 0.0001, "reward": -0.453000009059906, "reward_std": 0.8174154162406921, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.453000009059906, "step": 78 }, { "completion_length": 256.0, "epoch": 0.6638655462184874, "grad_norm": 0.5238794684410095, "kl": 0.0008105762535706162, "learning_rate": 3.95e-06, "loss": 0.0, "reward": -0.3555000126361847, "reward_std": 0.67952960729599, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3555000126361847, "step": 79 }, { "completion_length": 256.0, "epoch": 0.6722689075630253, "grad_norm": 0.57819664478302, "kl": 0.0004933737218379974, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": -0.398499995470047, "reward_std": 0.7403407096862793, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.398499995470047, "step": 80 }, { "completion_length": 214.5, "epoch": 0.680672268907563, "grad_norm": 0.6696988344192505, "kl": 0.001285638427361846, "learning_rate": 4.05e-06, "loss": 0.0001, "reward": -0.17949999868869781, "reward_std": 0.4306280016899109, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17949999868869781, "step": 81 }, { "completion_length": 256.0, "epoch": 0.6890756302521008, "grad_norm": 0.616509199142456, "kl": 0.0007264763116836548, "learning_rate": 4.1e-06, "loss": 0.0, "reward": -0.3149999976158142, "reward_std": 0.622253954410553, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3149999976158142, "step": 82 }, { "completion_length": 256.0, "epoch": 0.6974789915966386, "grad_norm": 0.5198426246643066, "kl": 0.0003054221160709858, "learning_rate": 4.15e-06, "loss": 0.0, "reward": -0.45249998569488525, "reward_std": 0.6399316191673279, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45249998569488525, "step": 83 }, { "completion_length": 256.0, "epoch": 0.7058823529411765, "grad_norm": 0.6182514429092407, "kl": 0.0015760199166834354, "learning_rate": 4.2000000000000004e-06, "loss": 0.0001, "reward": -0.31949999928474426, "reward_std": 0.6286179423332214, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31949999928474426, "step": 84 }, { "completion_length": 256.0, "epoch": 0.7142857142857143, "grad_norm": 0.7121881246566772, "kl": 0.001009255414828658, "learning_rate": 4.25e-06, "loss": 0.0, "reward": -0.3675000071525574, "reward_std": 0.6965001821517944, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3675000071525574, "step": 85 }, { "completion_length": 153.5, "epoch": 0.7226890756302521, "grad_norm": 0.8565239906311035, "kl": 0.0011221251916140318, "learning_rate": 4.3e-06, "loss": 0.0, "reward": -0.3135000169277191, "reward_std": 0.13788582384586334, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3135000169277191, "step": 86 }, { "completion_length": 256.0, "epoch": 0.7310924369747899, "grad_norm": 0.4781147539615631, "kl": 0.0005010729655623436, "learning_rate": 4.350000000000001e-06, "loss": 0.0, "reward": -0.2865000069141388, "reward_std": 0.5819488763809204, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2865000069141388, "step": 87 }, { "completion_length": 256.0, "epoch": 0.7394957983193278, "grad_norm": 0.6793462038040161, "kl": 0.0008797148475423455, "learning_rate": 4.4e-06, "loss": 0.0, "reward": -0.028999999165534973, "reward_std": 0.21778889000415802, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.028999999165534973, "step": 88 }, { "completion_length": 215.0, "epoch": 0.7478991596638656, "grad_norm": 0.7383250594139099, "kl": 0.0005435717757791281, "learning_rate": 4.450000000000001e-06, "loss": 0.0, "reward": 0.24549999833106995, "reward_std": 0.17041273415088654, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24549999833106995, "step": 89 }, { "completion_length": 230.0, "epoch": 0.7563025210084033, "grad_norm": 0.6734226942062378, "kl": 0.0016420461470261216, "learning_rate": 4.5e-06, "loss": 0.0001, "reward": -0.8875000476837158, "reward_std": 0.0855599120259285, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8875000476837158, "step": 90 }, { "completion_length": 153.5, "epoch": 0.7647058823529411, "grad_norm": 1.818917989730835, "kl": 0.0015077227726578712, "learning_rate": 4.5500000000000005e-06, "loss": 0.0001, "reward": -0.5035000443458557, "reward_std": 0.7035712599754333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5035000443458557, "step": 91 }, { "completion_length": 256.0, "epoch": 0.773109243697479, "grad_norm": 0.5302695631980896, "kl": 0.0005277631571516395, "learning_rate": 4.600000000000001e-06, "loss": 0.0, "reward": -0.45899999141693115, "reward_std": 0.649124026298523, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45899999141693115, "step": 92 }, { "completion_length": 240.0, "epoch": 0.7815126050420168, "grad_norm": 0.7379883527755737, "kl": 0.0011008679866790771, "learning_rate": 4.65e-06, "loss": 0.0, "reward": -0.22200000286102295, "reward_std": 0.4907321035861969, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22200000286102295, "step": 93 }, { "completion_length": 222.0, "epoch": 0.7899159663865546, "grad_norm": 0.5233021974563599, "kl": 0.0007702452130615711, "learning_rate": 4.7e-06, "loss": 0.0, "reward": -0.09049999713897705, "reward_std": 0.3047630190849304, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09049999713897705, "step": 94 }, { "completion_length": 253.5, "epoch": 0.7983193277310925, "grad_norm": 0.509674608707428, "kl": 0.0006963105406612158, "learning_rate": 4.75e-06, "loss": 0.0, "reward": -0.6540000438690186, "reward_std": 0.12162236124277115, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6540000438690186, "step": 95 }, { "completion_length": 214.5, "epoch": 0.8067226890756303, "grad_norm": 0.8177438378334045, "kl": 0.0023356762249022722, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "reward": -0.6359999775886536, "reward_std": 0.3846661150455475, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6359999775886536, "step": 96 }, { "completion_length": 256.0, "epoch": 0.8151260504201681, "grad_norm": 0.003622900927439332, "kl": 0.0007770108059048653, "learning_rate": 4.85e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 97 }, { "completion_length": 256.0, "epoch": 0.8235294117647058, "grad_norm": 0.5421973466873169, "kl": 0.0008327988907694817, "learning_rate": 4.9000000000000005e-06, "loss": 0.0, "reward": -0.8034999966621399, "reward_std": 0.0841456800699234, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8034999966621399, "step": 98 }, { "completion_length": 247.0, "epoch": 0.8319327731092437, "grad_norm": 0.6330018639564514, "kl": 0.0008103526197373867, "learning_rate": 4.95e-06, "loss": 0.0, "reward": -0.2720000147819519, "reward_std": 0.5614427924156189, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2720000147819519, "step": 99 }, { "completion_length": 244.0, "epoch": 0.8403361344537815, "grad_norm": 0.5343500971794128, "kl": 0.0006257236236706376, "learning_rate": 5e-06, "loss": 0.0, "reward": -0.628000020980835, "reward_std": 0.03676954284310341, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.628000020980835, "step": 100 }, { "completion_length": 170.0, "epoch": 0.8487394957983193, "grad_norm": 1.0495034456253052, "kl": 0.0058599854819476604, "learning_rate": 4.999984769144476e-06, "loss": 0.0002, "reward": -0.43549999594688416, "reward_std": 0.04313350468873978, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43549999594688416, "step": 101 }, { "completion_length": 256.0, "epoch": 0.8571428571428571, "grad_norm": 0.6612131595611572, "kl": 0.0008077158126980066, "learning_rate": 4.999939076763487e-06, "loss": 0.0, "reward": -0.9079999923706055, "reward_std": 0.03111271932721138, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9079999923706055, "step": 102 }, { "completion_length": 244.5, "epoch": 0.865546218487395, "grad_norm": 0.5679981112480164, "kl": 0.000541475135833025, "learning_rate": 4.999862923413781e-06, "loss": 0.0, "reward": -0.6825000047683716, "reward_std": 0.08273148536682129, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6825000047683716, "step": 103 }, { "completion_length": 256.0, "epoch": 0.8739495798319328, "grad_norm": 0.7941910624504089, "kl": 0.0008186640916392207, "learning_rate": 4.999756310023261e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 104 }, { "completion_length": 256.0, "epoch": 0.8823529411764706, "grad_norm": 0.8296816349029541, "kl": 0.0007855751318857074, "learning_rate": 4.9996192378909785e-06, "loss": 0.0, "reward": -0.3434999883174896, "reward_std": 0.6625590324401855, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3434999883174896, "step": 105 }, { "completion_length": 256.0, "epoch": 0.8907563025210085, "grad_norm": 0.00317971664480865, "kl": 0.0008002255344763398, "learning_rate": 4.999451708687114e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 106 }, { "completion_length": 256.0, "epoch": 0.8991596638655462, "grad_norm": 0.5608940124511719, "kl": 0.0009439628338441253, "learning_rate": 4.9992537244529585e-06, "loss": 0.0, "reward": -0.5224999785423279, "reward_std": 0.9157032370567322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5224999785423279, "step": 107 }, { "completion_length": 256.0, "epoch": 0.907563025210084, "grad_norm": 0.6995733380317688, "kl": 0.0017490392783656716, "learning_rate": 4.999025287600886e-06, "loss": 0.0001, "reward": -0.3149999976158142, "reward_std": 0.622253954410553, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3149999976158142, "step": 108 }, { "completion_length": 256.0, "epoch": 0.9159663865546218, "grad_norm": 0.0029912288300693035, "kl": 0.0006692197639495134, "learning_rate": 4.998766400914329e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 109 }, { "completion_length": 256.0, "epoch": 0.9243697478991597, "grad_norm": 0.0033117930870503187, "kl": 0.0008169824723154306, "learning_rate": 4.99847706754774e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 110 }, { "completion_length": 213.5, "epoch": 0.9327731092436975, "grad_norm": 0.005197931081056595, "kl": 0.0009741434478200972, "learning_rate": 4.998157291026553e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 111 }, { "completion_length": 256.0, "epoch": 0.9411764705882353, "grad_norm": 0.001984064467251301, "kl": 0.0005748765543103218, "learning_rate": 4.997807075247147e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 112 }, { "completion_length": 256.0, "epoch": 0.9495798319327731, "grad_norm": 0.6244724988937378, "kl": 0.0008410454029217362, "learning_rate": 4.997426424476787e-06, "loss": 0.0, "reward": -1.1699999570846558, "reward_std": 0.04666903614997864, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -1.1699999570846558, "step": 113 }, { "completion_length": 256.0, "epoch": 0.957983193277311, "grad_norm": 0.004670397378504276, "kl": 0.0005879481323063374, "learning_rate": 4.9970153433535855e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 114 }, { "completion_length": 239.0, "epoch": 0.9663865546218487, "grad_norm": 0.7702596783638, "kl": 0.0008258053567260504, "learning_rate": 4.9965738368864345e-06, "loss": 0.0, "reward": -0.30149999260902405, "reward_std": 0.6031620502471924, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30149999260902405, "step": 115 }, { "completion_length": 256.0, "epoch": 0.9747899159663865, "grad_norm": 0.5543686151504517, "kl": 0.0005617273272946477, "learning_rate": 4.996101910454953e-06, "loss": 0.0, "reward": -0.8005000352859497, "reward_std": 0.07000359892845154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8005000352859497, "step": 116 }, { "completion_length": 256.0, "epoch": 0.9831932773109243, "grad_norm": 0.5355740189552307, "kl": 0.0008423441322520375, "learning_rate": 4.995599569809414e-06, "loss": 0.0, "reward": -0.3384999930858612, "reward_std": 0.6554879546165466, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3384999930858612, "step": 117 }, { "completion_length": 256.0, "epoch": 0.9915966386554622, "grad_norm": 0.004311452619731426, "kl": 0.0008572986116632819, "learning_rate": 4.9950668210706795e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 118 }, { "completion_length": 256.0, "epoch": 1.0, "grad_norm": 0.7452813386917114, "kl": 0.013450833037495613, "learning_rate": 4.994503670730126e-06, "loss": 0.0005, "reward": -0.903499960899353, "reward_std": 0.0855599120259285, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.903499960899353, "step": 119 }, { "completion_length": 256.0, "epoch": 1.0084033613445378, "grad_norm": 0.5049381852149963, "kl": 0.0006096747238188982, "learning_rate": 4.993910125649561e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 120 }, { "completion_length": 205.5, "epoch": 1.0168067226890756, "grad_norm": 0.6632373332977295, "kl": 0.0013348809443414211, "learning_rate": 4.993286193061145e-06, "loss": 0.0001, "reward": -0.6130000352859497, "reward_std": 0.394565612077713, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6130000352859497, "step": 121 }, { "completion_length": 252.0, "epoch": 1.0252100840336134, "grad_norm": 0.5395358204841614, "kl": 0.0004946549888700247, "learning_rate": 4.992631880567301e-06, "loss": 0.0, "reward": -0.1860000044107437, "reward_std": 0.43982040882110596, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1860000044107437, "step": 122 }, { "completion_length": 224.5, "epoch": 1.0336134453781514, "grad_norm": 0.704481303691864, "kl": 0.0009251289302483201, "learning_rate": 4.991947196140619e-06, "loss": 0.0, "reward": -0.2554999887943268, "reward_std": 0.538108229637146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2554999887943268, "step": 123 }, { "completion_length": 220.0, "epoch": 1.0420168067226891, "grad_norm": 0.6415400505065918, "kl": 0.0014562567230314016, "learning_rate": 4.9912321481237616e-06, "loss": 0.0001, "reward": -0.07899999618530273, "reward_std": 0.2884995639324188, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07899999618530273, "step": 124 }, { "completion_length": 126.0, "epoch": 1.050420168067227, "grad_norm": 0.9022632837295532, "kl": 0.0010720144491642714, "learning_rate": 4.990486745229364e-06, "loss": 0.0, "reward": 0.05849999934434891, "reward_std": 0.2708218991756439, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05849999934434891, "step": 125 }, { "completion_length": 256.0, "epoch": 1.0588235294117647, "grad_norm": 0.006261729635298252, "kl": 0.001457626698538661, "learning_rate": 4.989710996539926e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 126 }, { "completion_length": 220.5, "epoch": 1.0672268907563025, "grad_norm": 0.7207704186439514, "kl": 0.001375878695398569, "learning_rate": 4.9889049115077e-06, "loss": 0.0001, "reward": -0.4559999704360962, "reward_std": 0.8216580152511597, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4559999704360962, "step": 127 }, { "completion_length": 256.0, "epoch": 1.0756302521008403, "grad_norm": 0.5352386832237244, "kl": 0.000541475135833025, "learning_rate": 4.988068499954578e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 128 }, { "completion_length": 256.0, "epoch": 1.084033613445378, "grad_norm": 0.5526212453842163, "kl": 0.0017692554974928498, "learning_rate": 4.987201772071971e-06, "loss": 0.0001, "reward": -0.33399999141693115, "reward_std": 0.649124026298523, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33399999141693115, "step": 129 }, { "completion_length": 256.0, "epoch": 1.092436974789916, "grad_norm": 0.5891320109367371, "kl": 0.000815674546174705, "learning_rate": 4.986304738420684e-06, "loss": 0.0, "reward": -0.398499995470047, "reward_std": 0.5635640621185303, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.398499995470047, "step": 130 }, { "completion_length": 256.0, "epoch": 1.1008403361344539, "grad_norm": 0.5917855501174927, "kl": 0.0008857346838340163, "learning_rate": 4.985377409930789e-06, "loss": 0.0, "reward": -0.3605000078678131, "reward_std": 0.6866006851196289, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3605000078678131, "step": 131 }, { "completion_length": 217.0, "epoch": 1.1092436974789917, "grad_norm": 0.5448706746101379, "kl": 0.0007243360159918666, "learning_rate": 4.984419797901491e-06, "loss": 0.0, "reward": -0.11399999260902405, "reward_std": 0.3379970192909241, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11399999260902405, "step": 132 }, { "completion_length": 256.0, "epoch": 1.1176470588235294, "grad_norm": 0.011198841966688633, "kl": 0.002690156688913703, "learning_rate": 4.983431914000991e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 133 }, { "completion_length": 256.0, "epoch": 1.1260504201680672, "grad_norm": 0.5734305381774902, "kl": 0.000893282936885953, "learning_rate": 4.9824137702663424e-06, "loss": 0.0, "reward": -0.3264999985694885, "reward_std": 0.638517439365387, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3264999985694885, "step": 134 }, { "completion_length": 256.0, "epoch": 1.134453781512605, "grad_norm": 0.00967981293797493, "kl": 0.0013201625552028418, "learning_rate": 4.981365379103306e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 135 }, { "completion_length": 216.0, "epoch": 1.1428571428571428, "grad_norm": 0.7859358191490173, "kl": 0.002926263026893139, "learning_rate": 4.980286753286196e-06, "loss": 0.0001, "reward": -0.34299999475479126, "reward_std": 0.6618519425392151, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34299999475479126, "step": 136 }, { "completion_length": 191.0, "epoch": 1.1512605042016806, "grad_norm": 0.6101590394973755, "kl": 0.0009017299162223935, "learning_rate": 4.979177905957726e-06, "loss": 0.0, "reward": -0.5954999923706055, "reward_std": 0.17041274905204773, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5954999923706055, "step": 137 }, { "completion_length": 256.0, "epoch": 1.1596638655462184, "grad_norm": 0.5506895184516907, "kl": 0.0014595092507079244, "learning_rate": 4.978038850628855e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 138 }, { "completion_length": 157.5, "epoch": 1.1680672268907564, "grad_norm": 1.3151723146438599, "kl": 0.0019490313716232777, "learning_rate": 4.9768696011786095e-06, "loss": 0.0001, "reward": -0.41999998688697815, "reward_std": 0.7226631045341492, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.41999998688697815, "step": 139 }, { "completion_length": 195.0, "epoch": 1.1764705882352942, "grad_norm": 0.0062357778660953045, "kl": 0.001764831249602139, "learning_rate": 4.975670171853926e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 140 }, { "completion_length": 256.0, "epoch": 1.184873949579832, "grad_norm": 0.007495187688618898, "kl": 0.001572138862684369, "learning_rate": 4.974440577269473e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 141 }, { "completion_length": 206.0, "epoch": 1.1932773109243697, "grad_norm": 0.38132038712501526, "kl": 0.0009923784527927637, "learning_rate": 4.973180832407471e-06, "loss": 0.0, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 142 }, { "completion_length": 77.0, "epoch": 1.2016806722689075, "grad_norm": 2.149287223815918, "kl": 0.0030675220768898726, "learning_rate": 4.971890952617515e-06, "loss": 0.0001, "reward": 0.008500002324581146, "reward_std": 0.21708177030086517, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.008500002324581146, "step": 143 }, { "completion_length": 256.0, "epoch": 1.2100840336134453, "grad_norm": 0.5770434141159058, "kl": 0.0015670701395720243, "learning_rate": 4.970570953616383e-06, "loss": 0.0001, "reward": -0.3330000042915344, "reward_std": 0.647709846496582, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3330000042915344, "step": 144 }, { "completion_length": 256.0, "epoch": 1.2184873949579833, "grad_norm": 0.009064174257218838, "kl": 0.0023067579604685307, "learning_rate": 4.9692208514878445e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 145 }, { "completion_length": 256.0, "epoch": 1.226890756302521, "grad_norm": 0.006435454357415438, "kl": 0.0014684547204524279, "learning_rate": 4.96784066268247e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 146 }, { "completion_length": 256.0, "epoch": 1.2352941176470589, "grad_norm": 0.010733702220022678, "kl": 0.0029022865928709507, "learning_rate": 4.966430404017424e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 147 }, { "completion_length": 256.0, "epoch": 1.2436974789915967, "grad_norm": 0.003568527987226844, "kl": 0.001038318034261465, "learning_rate": 4.964990092676263e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 148 }, { "completion_length": 219.0, "epoch": 1.2521008403361344, "grad_norm": 0.6777618527412415, "kl": 0.0034735635854303837, "learning_rate": 4.963519746208726e-06, "loss": 0.0001, "reward": -0.7095000147819519, "reward_std": 0.2595081627368927, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7095000147819519, "step": 149 }, { "completion_length": 256.0, "epoch": 1.2605042016806722, "grad_norm": 0.6326704621315002, "kl": 0.0055309138260781765, "learning_rate": 4.962019382530521e-06, "loss": 0.0002, "reward": -0.3935000002384186, "reward_std": 0.7332696914672852, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3935000002384186, "step": 150 }, { "completion_length": 256.0, "epoch": 1.26890756302521, "grad_norm": 0.0065620034001767635, "kl": 0.0019510899437591434, "learning_rate": 4.960489019923105e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 151 }, { "completion_length": 256.0, "epoch": 1.2773109243697478, "grad_norm": 0.5917569994926453, "kl": 0.0023294687271118164, "learning_rate": 4.958928677033465e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 152 }, { "completion_length": 256.0, "epoch": 1.2857142857142856, "grad_norm": 0.013220743276178837, "kl": 0.0028012192342430353, "learning_rate": 4.957338372873886e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 153 }, { "completion_length": 227.0, "epoch": 1.2941176470588236, "grad_norm": 0.6317896842956543, "kl": 0.000990564120002091, "learning_rate": 4.9557181268217225e-06, "loss": 0.0, "reward": -0.2395000010728836, "reward_std": 0.3387041389942169, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2395000010728836, "step": 154 }, { "completion_length": 256.0, "epoch": 1.3025210084033614, "grad_norm": 0.6344255208969116, "kl": 0.002380058169364929, "learning_rate": 4.9540679586191605e-06, "loss": 0.0001, "reward": -0.8610000014305115, "reward_std": 0.08485281467437744, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8610000014305115, "step": 155 }, { "completion_length": 210.0, "epoch": 1.3109243697478992, "grad_norm": 0.6320013403892517, "kl": 0.023301096633076668, "learning_rate": 4.9523878883729794e-06, "loss": 0.0009, "reward": -0.5799999833106995, "reward_std": 0.0664680227637291, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5799999833106995, "step": 156 }, { "completion_length": 73.5, "epoch": 1.319327731092437, "grad_norm": 1.7680063247680664, "kl": 0.004321013577282429, "learning_rate": 4.9506779365543054e-06, "loss": 0.0002, "reward": 0.002499997615814209, "reward_std": 0.18172644078731537, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.002499997615814209, "step": 157 }, { "completion_length": 256.0, "epoch": 1.3277310924369747, "grad_norm": 0.01590278185904026, "kl": 0.003557231044396758, "learning_rate": 4.94893812399836e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 158 }, { "completion_length": 148.5, "epoch": 1.3361344537815127, "grad_norm": 1.7092546224594116, "kl": 0.0027659109327942133, "learning_rate": 4.947168471904213e-06, "loss": 0.0001, "reward": -0.46799999475479126, "reward_std": 0.8386286497116089, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.46799999475479126, "step": 159 }, { "completion_length": 256.0, "epoch": 1.3445378151260505, "grad_norm": 0.6104798913002014, "kl": 0.0023875832557678223, "learning_rate": 4.9453690018345144e-06, "loss": 0.0001, "reward": -0.24900001287460327, "reward_std": 0.5289158821105957, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24900001287460327, "step": 160 }, { "completion_length": 256.0, "epoch": 1.3529411764705883, "grad_norm": 0.008323854766786098, "kl": 0.0023486739955842495, "learning_rate": 4.9435397357152406e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 161 }, { "completion_length": 256.0, "epoch": 1.361344537815126, "grad_norm": 0.01384750846773386, "kl": 0.004234651103615761, "learning_rate": 4.9416806958354206e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 162 }, { "completion_length": 256.0, "epoch": 1.3697478991596639, "grad_norm": 0.4482683539390564, "kl": 0.001261853496544063, "learning_rate": 4.939791904846869e-06, "loss": 0.0001, "reward": -0.24399998784065247, "reward_std": 0.5218448042869568, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24399998784065247, "step": 163 }, { "completion_length": 256.0, "epoch": 1.3781512605042017, "grad_norm": 0.011853448115289211, "kl": 0.0015549553791061044, "learning_rate": 4.937873385763909e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 164 }, { "completion_length": 170.0, "epoch": 1.3865546218487395, "grad_norm": 0.6615374684333801, "kl": 0.0015435302630066872, "learning_rate": 4.935925161963089e-06, "loss": 0.0001, "reward": -0.3850000202655792, "reward_std": 0.3436538875102997, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3850000202655792, "step": 165 }, { "completion_length": 256.0, "epoch": 1.3949579831932772, "grad_norm": 0.6419242024421692, "kl": 0.004639757331460714, "learning_rate": 4.933947257182901e-06, "loss": 0.0002, "reward": -0.34599998593330383, "reward_std": 0.6660946011543274, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34599998593330383, "step": 166 }, { "completion_length": 237.5, "epoch": 1.403361344537815, "grad_norm": 0.6443310976028442, "kl": 0.0020278198644518852, "learning_rate": 4.9319396955234925e-06, "loss": 0.0001, "reward": -0.375, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.375, "step": 167 }, { "completion_length": 212.5, "epoch": 1.4117647058823528, "grad_norm": 0.7147384285926819, "kl": 0.0019518407061696053, "learning_rate": 4.9299025014463665e-06, "loss": 0.0001, "reward": -0.20249998569488525, "reward_std": 0.4631549119949341, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20249998569488525, "step": 168 }, { "completion_length": 256.0, "epoch": 1.4201680672268908, "grad_norm": 0.00814531184732914, "kl": 0.0022223321720957756, "learning_rate": 4.92783569977409e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 169 }, { "completion_length": 256.0, "epoch": 1.4285714285714286, "grad_norm": 0.6234205961227417, "kl": 0.0028687326703220606, "learning_rate": 4.925739315689991e-06, "loss": 0.0001, "reward": -0.4320000112056732, "reward_std": 0.7877170443534851, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4320000112056732, "step": 170 }, { "completion_length": 256.0, "epoch": 1.4369747899159664, "grad_norm": 0.49408861994743347, "kl": 0.002177158836275339, "learning_rate": 4.923613374737848e-06, "loss": 0.0001, "reward": -0.3675000071525574, "reward_std": 0.6965001821517944, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3675000071525574, "step": 171 }, { "completion_length": 256.0, "epoch": 1.4453781512605042, "grad_norm": 0.5496301651000977, "kl": 0.0012339134700596333, "learning_rate": 4.921457902821578e-06, "loss": 0.0, "reward": -0.8660000562667847, "reward_std": 0.07495332509279251, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8660000562667847, "step": 172 }, { "completion_length": 189.0, "epoch": 1.453781512605042, "grad_norm": 0.8816677331924438, "kl": 0.0022327026817947626, "learning_rate": 4.9192729262049285e-06, "loss": 0.0001, "reward": -0.03999999910593033, "reward_std": 0.05656854063272476, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03999999910593033, "step": 173 }, { "completion_length": 256.0, "epoch": 1.46218487394958, "grad_norm": 0.007973435334861279, "kl": 0.0030176807194948196, "learning_rate": 4.917058471511149e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 174 }, { "completion_length": 256.0, "epoch": 1.4705882352941178, "grad_norm": 0.0035758838057518005, "kl": 0.001068331184796989, "learning_rate": 4.914814565722671e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 175 }, { "completion_length": 256.0, "epoch": 1.4789915966386555, "grad_norm": 0.012404424138367176, "kl": 0.003304045647382736, "learning_rate": 4.912541236180779e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 176 }, { "completion_length": 256.0, "epoch": 1.4873949579831933, "grad_norm": 0.5349675416946411, "kl": 0.0018023609882220626, "learning_rate": 4.910238510585275e-06, "loss": 0.0001, "reward": -0.3034999966621399, "reward_std": 0.605990469455719, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3034999966621399, "step": 177 }, { "completion_length": 234.0, "epoch": 1.495798319327731, "grad_norm": 0.7003433704376221, "kl": 0.002368425251916051, "learning_rate": 4.907906416994146e-06, "loss": 0.0001, "reward": -0.3499999940395355, "reward_std": 0.8499422669410706, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3499999940395355, "step": 178 }, { "completion_length": 240.0, "epoch": 1.504201680672269, "grad_norm": 0.5884004831314087, "kl": 0.002813747152686119, "learning_rate": 4.905544983823214e-06, "loss": 0.0001, "reward": -0.6740000247955322, "reward_std": 0.1456640213727951, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6740000247955322, "step": 179 }, { "completion_length": 256.0, "epoch": 1.5126050420168067, "grad_norm": 0.6206634640693665, "kl": 0.0015998086892068386, "learning_rate": 4.903154239845798e-06, "loss": 0.0001, "reward": -1.0184999704360962, "reward_std": 0.06717512011528015, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -1.0184999704360962, "step": 180 }, { "completion_length": 256.0, "epoch": 1.5210084033613445, "grad_norm": 0.006433509290218353, "kl": 0.001212632399983704, "learning_rate": 4.900734214192358e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 181 }, { "completion_length": 256.0, "epoch": 1.5294117647058822, "grad_norm": 0.7895285487174988, "kl": 0.0039134095422923565, "learning_rate": 4.898284936350144e-06, "loss": 0.0002, "reward": -0.8855000138282776, "reward_std": 0.18879756331443787, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8855000138282776, "step": 182 }, { "completion_length": 256.0, "epoch": 1.53781512605042, "grad_norm": 0.7201721668243408, "kl": 0.00224531558342278, "learning_rate": 4.8958064361628334e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 183 }, { "completion_length": 256.0, "epoch": 1.5462184873949578, "grad_norm": 0.004999098833650351, "kl": 0.0019176857313141227, "learning_rate": 4.893298743830168e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 184 }, { "completion_length": 256.0, "epoch": 1.5546218487394958, "grad_norm": 0.5888034105300903, "kl": 0.0026927320286631584, "learning_rate": 4.890761889907589e-06, "loss": 0.0001, "reward": -0.41749998927116394, "reward_std": 0.7672109007835388, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.41749998927116394, "step": 185 }, { "completion_length": 256.0, "epoch": 1.5630252100840336, "grad_norm": 0.6680886149406433, "kl": 0.0021667431574314833, "learning_rate": 4.888195905305859e-06, "loss": 0.0001, "reward": -0.4235000014305115, "reward_std": 0.7756960988044739, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4235000014305115, "step": 186 }, { "completion_length": 256.0, "epoch": 1.5714285714285714, "grad_norm": 0.6315932869911194, "kl": 0.004154995083808899, "learning_rate": 4.885600821290692e-06, "loss": 0.0002, "reward": -0.4169999957084656, "reward_std": 0.7665037512779236, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4169999957084656, "step": 187 }, { "completion_length": 256.0, "epoch": 1.5798319327731094, "grad_norm": 0.013168340548872948, "kl": 0.002807637443765998, "learning_rate": 4.882976669482368e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 188 }, { "completion_length": 256.0, "epoch": 1.5882352941176472, "grad_norm": 0.01260432880371809, "kl": 0.005540855694562197, "learning_rate": 4.880323481855347e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 189 }, { "completion_length": 256.0, "epoch": 1.596638655462185, "grad_norm": 0.48779165744781494, "kl": 0.0018089055083692074, "learning_rate": 4.8776412907378845e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 190 }, { "completion_length": 256.0, "epoch": 1.6050420168067228, "grad_norm": 0.8137795329093933, "kl": 0.003074652049690485, "learning_rate": 4.874930128811631e-06, "loss": 0.0001, "reward": -0.3154999911785126, "reward_std": 0.6229610443115234, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3154999911785126, "step": 191 }, { "completion_length": 256.0, "epoch": 1.6134453781512605, "grad_norm": 0.008933763951063156, "kl": 0.004218064248561859, "learning_rate": 4.8721900291112415e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 192 }, { "completion_length": 249.5, "epoch": 1.6218487394957983, "grad_norm": 0.625883162021637, "kl": 0.0013728686608374119, "learning_rate": 4.869421025023965e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 193 }, { "completion_length": 256.0, "epoch": 1.6302521008403361, "grad_norm": 0.0036960856523364782, "kl": 0.0007627500453963876, "learning_rate": 4.866623150289241e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 194 }, { "completion_length": 256.0, "epoch": 1.638655462184874, "grad_norm": 0.003923643380403519, "kl": 0.0012829414336010814, "learning_rate": 4.863796438998293e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 195 }, { "completion_length": 94.0, "epoch": 1.6470588235294117, "grad_norm": 1.2180060148239136, "kl": 0.0056479619815945625, "learning_rate": 4.860940925593703e-06, "loss": 0.0002, "reward": 0.010999999940395355, "reward_std": 0.16122034192085266, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.010999999940395355, "step": 196 }, { "completion_length": 256.0, "epoch": 1.6554621848739495, "grad_norm": 0.004531026817858219, "kl": 0.001344866119325161, "learning_rate": 4.858056644869002e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 197 }, { "completion_length": 256.0, "epoch": 1.6638655462184873, "grad_norm": 0.004335981793701649, "kl": 0.001122850924730301, "learning_rate": 4.855143631968242e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 198 }, { "completion_length": 256.0, "epoch": 1.6722689075630253, "grad_norm": 0.0052246092818677425, "kl": 0.0010601849062368274, "learning_rate": 4.852201922385564e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 199 }, { "completion_length": 256.0, "epoch": 1.680672268907563, "grad_norm": 0.00840233825147152, "kl": 0.0031644925475120544, "learning_rate": 4.849231551964771e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 200 }, { "completion_length": 256.0, "epoch": 1.6890756302521008, "grad_norm": 0.5011340379714966, "kl": 0.005810223985463381, "learning_rate": 4.84623255689889e-06, "loss": 0.0002, "reward": -0.32600000500679016, "reward_std": 0.6378102898597717, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32600000500679016, "step": 201 }, { "completion_length": 256.0, "epoch": 1.6974789915966386, "grad_norm": 0.5445303320884705, "kl": 0.017255699262022972, "learning_rate": 4.84320497372973e-06, "loss": 0.0007, "reward": -0.22299998998641968, "reward_std": 0.4921463131904602, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22299998998641968, "step": 202 }, { "completion_length": 256.0, "epoch": 1.7058823529411766, "grad_norm": 0.5509700179100037, "kl": 0.0029578409157693386, "learning_rate": 4.840148839347434e-06, "loss": 0.0001, "reward": 0.017999999225139618, "reward_std": 0.025455841794610023, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.017999999225139618, "step": 203 }, { "completion_length": 256.0, "epoch": 1.7142857142857144, "grad_norm": 0.6639806628227234, "kl": 0.0017400487558916211, "learning_rate": 4.837064190990036e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 204 }, { "completion_length": 256.0, "epoch": 1.7226890756302522, "grad_norm": 0.010254215449094772, "kl": 0.0018985836068168283, "learning_rate": 4.833951066243004e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 205 }, { "completion_length": 200.0, "epoch": 1.73109243697479, "grad_norm": 11.573519706726074, "kl": 0.9488453269004822, "learning_rate": 4.830809503038781e-06, "loss": 0.038, "reward": -0.5254999995231628, "reward_std": 0.45042699575424194, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5254999995231628, "step": 206 }, { "completion_length": 256.0, "epoch": 1.7394957983193278, "grad_norm": 0.6958323121070862, "kl": 0.017392966896295547, "learning_rate": 4.8276395396563215e-06, "loss": 0.0007, "reward": -0.35100001096725464, "reward_std": 0.6731656789779663, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35100001096725464, "step": 207 }, { "completion_length": 256.0, "epoch": 1.7478991596638656, "grad_norm": 0.013528794050216675, "kl": 0.002810065634548664, "learning_rate": 4.824441214720629e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 208 }, { "completion_length": 200.0, "epoch": 1.7563025210084033, "grad_norm": 0.9539812803268433, "kl": 0.011708428151905537, "learning_rate": 4.821214567202284e-06, "loss": 0.0005, "reward": 0.12549999356269836, "reward_std": 0.0007071082363836467, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12549999356269836, "step": 209 }, { "completion_length": 256.0, "epoch": 1.7647058823529411, "grad_norm": 0.6109620928764343, "kl": 0.006577869411557913, "learning_rate": 4.817959636416969e-06, "loss": 0.0003, "reward": -0.4754999876022339, "reward_std": 0.8492352366447449, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4754999876022339, "step": 210 }, { "completion_length": 256.0, "epoch": 1.773109243697479, "grad_norm": 0.6346261501312256, "kl": 0.01584402658045292, "learning_rate": 4.814676462024988e-06, "loss": 0.0006, "reward": -0.36899998784065247, "reward_std": 0.6986215114593506, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36899998784065247, "step": 211 }, { "completion_length": 256.0, "epoch": 1.7815126050420167, "grad_norm": 0.5287441611289978, "kl": 0.04332538694143295, "learning_rate": 4.811365084030784e-06, "loss": 0.0017, "reward": -0.351500004529953, "reward_std": 0.6738727688789368, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.351500004529953, "step": 212 }, { "completion_length": 256.0, "epoch": 1.7899159663865545, "grad_norm": 0.6578015685081482, "kl": 0.0032243668101727962, "learning_rate": 4.808025542782453e-06, "loss": 0.0001, "reward": -0.28700000047683716, "reward_std": 0.5826559662818909, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28700000047683716, "step": 213 }, { "completion_length": 256.0, "epoch": 1.7983193277310925, "grad_norm": 0.006729374639689922, "kl": 0.0019778632558882236, "learning_rate": 4.804657878971252e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 214 }, { "completion_length": 248.0, "epoch": 1.8067226890756303, "grad_norm": 0.8637768030166626, "kl": 0.00650005741044879, "learning_rate": 4.801262133631101e-06, "loss": 0.0003, "reward": -0.8569999933242798, "reward_std": 0.022627420723438263, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8569999933242798, "step": 215 }, { "completion_length": 220.0, "epoch": 1.815126050420168, "grad_norm": 1.1234307289123535, "kl": 0.04851701855659485, "learning_rate": 4.7978383481380865e-06, "loss": 0.0019, "reward": -0.7459999918937683, "reward_std": 0.23617365956306458, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7459999918937683, "step": 216 }, { "completion_length": 207.5, "epoch": 1.8235294117647058, "grad_norm": 0.7308142781257629, "kl": 0.004503697622567415, "learning_rate": 4.794386564209953e-06, "loss": 0.0002, "reward": -0.3190000057220459, "reward_std": 0.8061017394065857, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3190000057220459, "step": 217 }, { "completion_length": 256.0, "epoch": 1.8319327731092439, "grad_norm": 0.012931163422763348, "kl": 0.00362013909034431, "learning_rate": 4.790906823905599e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 218 }, { "completion_length": 256.0, "epoch": 1.8403361344537816, "grad_norm": 0.5851921439170837, "kl": 0.0019937106408178806, "learning_rate": 4.787399169624562e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 219 }, { "completion_length": 256.0, "epoch": 1.8487394957983194, "grad_norm": 0.5975505113601685, "kl": 0.0012822413118556142, "learning_rate": 4.783863644106502e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 220 }, { "completion_length": 150.0, "epoch": 1.8571428571428572, "grad_norm": 0.955122172832489, "kl": 0.005223740823566914, "learning_rate": 4.780300290430683e-06, "loss": 0.0002, "reward": -0.07900001108646393, "reward_std": 0.46669045090675354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07900001108646393, "step": 221 }, { "completion_length": 256.0, "epoch": 1.865546218487395, "grad_norm": 0.005906517617404461, "kl": 0.0009002086007967591, "learning_rate": 4.776709152015443e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 222 }, { "completion_length": 152.0, "epoch": 1.8739495798319328, "grad_norm": 0.7984880805015564, "kl": 0.002290993696078658, "learning_rate": 4.773090272617672e-06, "loss": 0.0001, "reward": -0.09600000083446503, "reward_std": 0.3125411868095398, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09600000083446503, "step": 223 }, { "completion_length": 245.5, "epoch": 1.8823529411764706, "grad_norm": 0.5694111585617065, "kl": 0.0007962716626934707, "learning_rate": 4.769443696332272e-06, "loss": 0.0, "reward": -0.7419999837875366, "reward_std": 0.04242640733718872, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7419999837875366, "step": 224 }, { "completion_length": 256.0, "epoch": 1.8907563025210083, "grad_norm": 0.6426722407341003, "kl": 0.005715792998671532, "learning_rate": 4.765769467591626e-06, "loss": 0.0002, "reward": -0.3540000021457672, "reward_std": 0.6774082779884338, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3540000021457672, "step": 225 }, { "completion_length": 256.0, "epoch": 1.8991596638655461, "grad_norm": 0.6013181805610657, "kl": 0.0018825461156666279, "learning_rate": 4.762067631165049e-06, "loss": 0.0001, "reward": -0.32199999690055847, "reward_std": 0.6321534514427185, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32199999690055847, "step": 226 }, { "completion_length": 243.5, "epoch": 1.907563025210084, "grad_norm": 0.7494937181472778, "kl": 0.0013611309695988894, "learning_rate": 4.7583382321582525e-06, "loss": 0.0001, "reward": -0.784500002861023, "reward_std": 0.06576092541217804, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.784500002861023, "step": 227 }, { "completion_length": 256.0, "epoch": 1.9159663865546217, "grad_norm": 0.008450777269899845, "kl": 0.0019867848604917526, "learning_rate": 4.754581316012785e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 228 }, { "completion_length": 256.0, "epoch": 1.9243697478991597, "grad_norm": 0.0046703568659722805, "kl": 0.0013717284891754389, "learning_rate": 4.750796928505484e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 229 }, { "completion_length": 256.0, "epoch": 1.9327731092436975, "grad_norm": 1.3561888933181763, "kl": 0.15282903611660004, "learning_rate": 4.746985115747918e-06, "loss": 0.0061, "reward": -0.26649999618530273, "reward_std": 0.5536645650863647, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26649999618530273, "step": 230 }, { "completion_length": 256.0, "epoch": 1.9411764705882353, "grad_norm": 0.005476224236190319, "kl": 0.0016411212272942066, "learning_rate": 4.743145924185821e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 231 }, { "completion_length": 132.5, "epoch": 1.949579831932773, "grad_norm": 6.810699462890625, "kl": 0.003355429507791996, "learning_rate": 4.7392794005985324e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 232 }, { "completion_length": 256.0, "epoch": 1.957983193277311, "grad_norm": 0.5207685828208923, "kl": 0.0020944359712302685, "learning_rate": 4.735385592098421e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 233 }, { "completion_length": 134.0, "epoch": 1.9663865546218489, "grad_norm": 0.007784485351294279, "kl": 0.003241205122321844, "learning_rate": 4.731464546130315e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 234 }, { "completion_length": 186.0, "epoch": 1.9747899159663866, "grad_norm": 0.8313072919845581, "kl": 0.006402643397450447, "learning_rate": 4.72751631047092e-06, "loss": 0.0003, "reward": -0.5040000081062317, "reward_std": 0.26304373145103455, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5040000081062317, "step": 235 }, { "completion_length": 217.0, "epoch": 1.9831932773109244, "grad_norm": 0.6790209412574768, "kl": 0.0015397178940474987, "learning_rate": 4.723540933228245e-06, "loss": 0.0001, "reward": -0.35749998688697815, "reward_std": 0.6823580265045166, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35749998688697815, "step": 236 }, { "completion_length": 221.0, "epoch": 1.9915966386554622, "grad_norm": 0.5055168867111206, "kl": 0.0011104149743914604, "learning_rate": 4.719538462841003e-06, "loss": 0.0, "reward": -0.3034999966621399, "reward_std": 0.605990469455719, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3034999966621399, "step": 237 }, { "completion_length": 170.5, "epoch": 2.0, "grad_norm": 1.3623337745666504, "kl": 0.022053195163607597, "learning_rate": 4.715508948078037e-06, "loss": 0.0009, "reward": -0.36550000309944153, "reward_std": 0.5876057744026184, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36550000309944153, "step": 238 }, { "completion_length": 256.0, "epoch": 2.008403361344538, "grad_norm": 0.5293018817901611, "kl": 0.0011760479537770152, "learning_rate": 4.71145243803771e-06, "loss": 0.0, "reward": -0.3815000057220459, "reward_std": 0.7162991762161255, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3815000057220459, "step": 239 }, { "completion_length": 256.0, "epoch": 2.0168067226890756, "grad_norm": 0.0042824759148061275, "kl": 0.0015157678863033652, "learning_rate": 4.707368982147318e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 240 }, { "completion_length": 202.5, "epoch": 2.0252100840336134, "grad_norm": 0.6972439885139465, "kl": 0.0013616853393614292, "learning_rate": 4.703258630162481e-06, "loss": 0.0001, "reward": -0.1704999953508377, "reward_std": 0.24112340807914734, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1704999953508377, "step": 241 }, { "completion_length": 256.0, "epoch": 2.033613445378151, "grad_norm": 0.4448752999305725, "kl": 0.0010828673839569092, "learning_rate": 4.699121432166542e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 242 }, { "completion_length": 155.5, "epoch": 2.042016806722689, "grad_norm": 0.8050215244293213, "kl": 0.04223792254924774, "learning_rate": 4.6949574385699514e-06, "loss": 0.0017, "reward": -0.39800000190734863, "reward_std": 0.04666903614997864, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.39800000190734863, "step": 243 }, { "completion_length": 246.5, "epoch": 2.0504201680672267, "grad_norm": 0.6864359974861145, "kl": 0.002507125260308385, "learning_rate": 4.690766700109659e-06, "loss": 0.0001, "reward": -0.4345000088214874, "reward_std": 0.7912524342536926, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4345000088214874, "step": 244 }, { "completion_length": 222.0, "epoch": 2.0588235294117645, "grad_norm": 0.7071066498756409, "kl": 0.00609601428732276, "learning_rate": 4.68654926784849e-06, "loss": 0.0002, "reward": -0.7045000195503235, "reward_std": 0.13505738973617554, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7045000195503235, "step": 245 }, { "completion_length": 206.0, "epoch": 2.0672268907563027, "grad_norm": 0.7734777927398682, "kl": 0.0027890377677977085, "learning_rate": 4.682305193174524e-06, "loss": 0.0001, "reward": 0.18849998712539673, "reward_std": 0.0883883386850357, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18849998712539673, "step": 246 }, { "completion_length": 236.5, "epoch": 2.0756302521008405, "grad_norm": 0.5881247520446777, "kl": 0.001509196707047522, "learning_rate": 4.6780345278004744e-06, "loss": 0.0001, "reward": -0.13950000703334808, "reward_std": 0.3740594983100891, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13950000703334808, "step": 247 }, { "completion_length": 256.0, "epoch": 2.0840336134453783, "grad_norm": 0.2591317892074585, "kl": 0.025479409843683243, "learning_rate": 4.673737323763048e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 248 }, { "completion_length": 256.0, "epoch": 2.092436974789916, "grad_norm": 0.015124778263270855, "kl": 0.0033195968717336655, "learning_rate": 4.669413633422322e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 249 }, { "completion_length": 43.0, "epoch": 2.100840336134454, "grad_norm": 0.04131443426012993, "kl": 0.010094933211803436, "learning_rate": 4.665063509461098e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 250 }, { "completion_length": 256.0, "epoch": 2.1092436974789917, "grad_norm": 0.0036555728875100613, "kl": 0.00161719077732414, "learning_rate": 4.6606870048842626e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 251 }, { "completion_length": 256.0, "epoch": 2.1176470588235294, "grad_norm": 0.012387734837830067, "kl": 0.0026844674721360207, "learning_rate": 4.656284173018144e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 252 }, { "completion_length": 256.0, "epoch": 2.1260504201680672, "grad_norm": 0.013242681510746479, "kl": 0.006332616787403822, "learning_rate": 4.65185506750986e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 253 }, { "completion_length": 241.0, "epoch": 2.134453781512605, "grad_norm": 0.7205686569213867, "kl": 0.0029884958639740944, "learning_rate": 4.6473997423266615e-06, "loss": 0.0001, "reward": -0.3659999966621399, "reward_std": 0.6943788528442383, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3659999966621399, "step": 254 }, { "completion_length": 256.0, "epoch": 2.142857142857143, "grad_norm": 0.5938934087753296, "kl": 0.002521747723221779, "learning_rate": 4.642918251755281e-06, "loss": 0.0001, "reward": -0.4034999907016754, "reward_std": 0.747411847114563, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4034999907016754, "step": 255 }, { "completion_length": 256.0, "epoch": 2.1512605042016806, "grad_norm": 0.016242576763033867, "kl": 0.0043385326862335205, "learning_rate": 4.638410650401267e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 256 }, { "completion_length": 256.0, "epoch": 2.1596638655462184, "grad_norm": 0.7453104853630066, "kl": 0.013045329600572586, "learning_rate": 4.633876993188319e-06, "loss": 0.0005, "reward": -0.4490000009536743, "reward_std": 0.8117585778236389, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4490000009536743, "step": 257 }, { "completion_length": 239.0, "epoch": 2.168067226890756, "grad_norm": 0.6283895373344421, "kl": 0.002278773346915841, "learning_rate": 4.62931733535762e-06, "loss": 0.0001, "reward": -0.273499995470047, "reward_std": 0.5635640621185303, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.273499995470047, "step": 258 }, { "completion_length": 256.0, "epoch": 2.176470588235294, "grad_norm": 0.5015798211097717, "kl": 0.002488459460437298, "learning_rate": 4.62473173246716e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 259 }, { "completion_length": 203.0, "epoch": 2.184873949579832, "grad_norm": 0.6714096665382385, "kl": 0.002742907963693142, "learning_rate": 4.620120240391065e-06, "loss": 0.0001, "reward": -0.7390000224113464, "reward_std": 0.43416354060173035, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7390000224113464, "step": 260 }, { "completion_length": 256.0, "epoch": 2.19327731092437, "grad_norm": 0.011858459562063217, "kl": 0.0030484909657388926, "learning_rate": 4.6154829153189105e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 261 }, { "completion_length": 199.5, "epoch": 2.2016806722689077, "grad_norm": 0.013294283300638199, "kl": 0.0040147313848137856, "learning_rate": 4.610819813755038e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 262 }, { "completion_length": 256.0, "epoch": 2.2100840336134455, "grad_norm": 0.5194240808486938, "kl": 0.0014988064067438245, "learning_rate": 4.60613099251787e-06, "loss": 0.0001, "reward": -0.32899999618530273, "reward_std": 0.642052948474884, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32899999618530273, "step": 263 }, { "completion_length": 256.0, "epoch": 2.2184873949579833, "grad_norm": 0.7569727301597595, "kl": 0.017942776903510094, "learning_rate": 4.601416508739211e-06, "loss": 0.0007, "reward": -0.3779999911785126, "reward_std": 0.7113494277000427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3779999911785126, "step": 264 }, { "completion_length": 256.0, "epoch": 2.226890756302521, "grad_norm": 0.006944690831005573, "kl": 0.001747514121234417, "learning_rate": 4.596676419863561e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 265 }, { "completion_length": 158.0, "epoch": 2.235294117647059, "grad_norm": 0.7987202405929565, "kl": 0.0036909261252731085, "learning_rate": 4.591910783647405e-06, "loss": 0.0001, "reward": -0.4024999737739563, "reward_std": 0.23263810575008392, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4024999737739563, "step": 266 }, { "completion_length": 256.0, "epoch": 2.2436974789915967, "grad_norm": 0.6113834977149963, "kl": 0.0024936848785728216, "learning_rate": 4.587119658158517e-06, "loss": 0.0001, "reward": -0.34850001335144043, "reward_std": 0.6696301102638245, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34850001335144043, "step": 267 }, { "completion_length": 256.0, "epoch": 2.2521008403361344, "grad_norm": 0.011432317085564137, "kl": 0.0018549493979662657, "learning_rate": 4.582303101775249e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 268 }, { "completion_length": 204.0, "epoch": 2.2605042016806722, "grad_norm": 0.6246111989021301, "kl": 0.0031767855398356915, "learning_rate": 4.577461173185821e-06, "loss": 0.0001, "reward": 0.04100000113248825, "reward_std": 0.11879393458366394, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04100000113248825, "step": 269 }, { "completion_length": 256.0, "epoch": 2.26890756302521, "grad_norm": 0.011164241470396519, "kl": 0.004347905516624451, "learning_rate": 4.572593931387604e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 270 }, { "completion_length": 256.0, "epoch": 2.277310924369748, "grad_norm": 0.5123689770698547, "kl": 0.0028823683969676495, "learning_rate": 4.567701435686405e-06, "loss": 0.0001, "reward": -0.38600000739097595, "reward_std": 0.722663164138794, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.38600000739097595, "step": 271 }, { "completion_length": 256.0, "epoch": 2.2857142857142856, "grad_norm": 0.671126127243042, "kl": 0.014719904400408268, "learning_rate": 4.562783745695738e-06, "loss": 0.0006, "reward": -0.9859999418258667, "reward_std": 0.009899494238197803, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9859999418258667, "step": 272 }, { "completion_length": 256.0, "epoch": 2.2941176470588234, "grad_norm": 0.7697193026542664, "kl": 0.005495861172676086, "learning_rate": 4.5578409213361055e-06, "loss": 0.0002, "reward": -0.934499979019165, "reward_std": 0.03464820981025696, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.934499979019165, "step": 273 }, { "completion_length": 145.0, "epoch": 2.302521008403361, "grad_norm": 0.8731741309165955, "kl": 0.005798987112939358, "learning_rate": 4.55287302283426e-06, "loss": 0.0002, "reward": 0.19949999451637268, "reward_std": 0.07283198833465576, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19949999451637268, "step": 274 }, { "completion_length": 256.0, "epoch": 2.310924369747899, "grad_norm": 0.7047728896141052, "kl": 0.00372782233171165, "learning_rate": 4.54788011072248e-06, "loss": 0.0001, "reward": -0.3474999964237213, "reward_std": 0.6682159304618835, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3474999964237213, "step": 275 }, { "completion_length": 240.0, "epoch": 2.3193277310924367, "grad_norm": 0.7794302701950073, "kl": 0.003310558618977666, "learning_rate": 4.542862245837821e-06, "loss": 0.0001, "reward": -0.3580000102519989, "reward_std": 0.6830651760101318, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3580000102519989, "step": 276 }, { "completion_length": 256.0, "epoch": 2.327731092436975, "grad_norm": 0.027459578588604927, "kl": 0.0025392756797373295, "learning_rate": 4.537819489321385e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 277 }, { "completion_length": 256.0, "epoch": 2.3361344537815127, "grad_norm": 0.009526317939162254, "kl": 0.00471924664452672, "learning_rate": 4.5327519026175694e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 278 }, { "completion_length": 256.0, "epoch": 2.3445378151260505, "grad_norm": 0.42400923371315, "kl": 0.007225835230201483, "learning_rate": 4.527659547473317e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 279 }, { "completion_length": 256.0, "epoch": 2.3529411764705883, "grad_norm": 0.5866290926933289, "kl": 0.0016281649004667997, "learning_rate": 4.522542485937369e-06, "loss": 0.0001, "reward": -0.35249999165534973, "reward_std": 0.6752869486808777, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35249999165534973, "step": 280 }, { "completion_length": 256.0, "epoch": 2.361344537815126, "grad_norm": 0.8482140898704529, "kl": 0.0025075054727494717, "learning_rate": 4.517400780359505e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 281 }, { "completion_length": 256.0, "epoch": 2.369747899159664, "grad_norm": 0.003956747241318226, "kl": 0.0018718561623245478, "learning_rate": 4.512234493389785e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 282 }, { "completion_length": 256.0, "epoch": 2.3781512605042017, "grad_norm": 0.41083571314811707, "kl": 0.0006332241464406252, "learning_rate": 4.507043687977787e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 283 }, { "completion_length": 256.0, "epoch": 2.3865546218487395, "grad_norm": 0.009478725492954254, "kl": 0.005353131797164679, "learning_rate": 4.501828427371834e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 284 }, { "completion_length": 256.0, "epoch": 2.3949579831932772, "grad_norm": 0.003968110308051109, "kl": 0.002520799171179533, "learning_rate": 4.496588775118232e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 285 }, { "completion_length": 256.0, "epoch": 2.403361344537815, "grad_norm": 0.5676983594894409, "kl": 0.003789010923355818, "learning_rate": 4.491324795060491e-06, "loss": 0.0002, "reward": -0.359499990940094, "reward_std": 0.6851864457130432, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.359499990940094, "step": 286 }, { "completion_length": 256.0, "epoch": 2.411764705882353, "grad_norm": 0.4427598714828491, "kl": 0.002515493892133236, "learning_rate": 4.4860365513385456e-06, "loss": 0.0001, "reward": -0.7990000247955322, "reward_std": 0.01697055622935295, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7990000247955322, "step": 287 }, { "completion_length": 156.5, "epoch": 2.4201680672268906, "grad_norm": 0.9057844281196594, "kl": 0.005625425837934017, "learning_rate": 4.4807241083879774e-06, "loss": 0.0002, "reward": -0.3474999964237213, "reward_std": 0.23829495906829834, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3474999964237213, "step": 288 }, { "completion_length": 68.5, "epoch": 2.4285714285714284, "grad_norm": 2.104400396347046, "kl": 0.01915288157761097, "learning_rate": 4.475387530939226e-06, "loss": 0.0008, "reward": 0.04399999976158142, "reward_std": 0.08061017841100693, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04399999976158142, "step": 289 }, { "completion_length": 244.5, "epoch": 2.4369747899159666, "grad_norm": 0.5761642456054688, "kl": 0.010869665071368217, "learning_rate": 4.470026884016805e-06, "loss": 0.0004, "reward": -0.3165000081062317, "reward_std": 0.7417550086975098, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3165000081062317, "step": 290 }, { "completion_length": 256.0, "epoch": 2.4453781512605044, "grad_norm": 0.6662463545799255, "kl": 0.00887353252619505, "learning_rate": 4.464642232938505e-06, "loss": 0.0004, "reward": -0.9695000648498535, "reward_std": 0.14212849736213684, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9695000648498535, "step": 291 }, { "completion_length": 251.0, "epoch": 2.453781512605042, "grad_norm": 0.6103852391242981, "kl": 0.0028417068533599377, "learning_rate": 4.4592336433146e-06, "loss": 0.0001, "reward": -0.28700000047683716, "reward_std": 0.5826559662818909, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28700000047683716, "step": 292 }, { "completion_length": 256.0, "epoch": 2.46218487394958, "grad_norm": 0.01107322983443737, "kl": 0.00474724592640996, "learning_rate": 4.453801181047047e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 293 }, { "completion_length": 126.5, "epoch": 2.4705882352941178, "grad_norm": 1.1153606176376343, "kl": 0.010184656828641891, "learning_rate": 4.448344912328686e-06, "loss": 0.0004, "reward": -0.010000001639127731, "reward_std": 0.08202438801527023, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.010000001639127731, "step": 294 }, { "completion_length": 256.0, "epoch": 2.4789915966386555, "grad_norm": 0.5084386467933655, "kl": 0.01594950258731842, "learning_rate": 4.442864903642428e-06, "loss": 0.0006, "reward": -0.40149998664855957, "reward_std": 0.7445834279060364, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40149998664855957, "step": 295 }, { "completion_length": 256.0, "epoch": 2.4873949579831933, "grad_norm": 0.012682821601629257, "kl": 0.004246335010975599, "learning_rate": 4.437361221760449e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 296 }, { "completion_length": 256.0, "epoch": 2.495798319327731, "grad_norm": 0.005470058415085077, "kl": 0.001264081452973187, "learning_rate": 4.431833933743378e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 297 }, { "completion_length": 256.0, "epoch": 2.504201680672269, "grad_norm": 0.5214419364929199, "kl": 0.0027658184990286827, "learning_rate": 4.426283106939474e-06, "loss": 0.0001, "reward": -0.2759999930858612, "reward_std": 0.5670996308326721, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2759999930858612, "step": 298 }, { "completion_length": 256.0, "epoch": 2.5126050420168067, "grad_norm": 0.0035535518545657396, "kl": 0.001303636352531612, "learning_rate": 4.420708808983809e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 299 }, { "completion_length": 256.0, "epoch": 2.5210084033613445, "grad_norm": 0.008862876333296299, "kl": 0.002121975179761648, "learning_rate": 4.415111107797445e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 300 }, { "completion_length": 256.0, "epoch": 2.5294117647058822, "grad_norm": 0.7624014616012573, "kl": 0.0030049625784158707, "learning_rate": 4.409490071586606e-06, "loss": 0.0001, "reward": -0.906499981880188, "reward_std": 0.13222895562648773, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.906499981880188, "step": 301 }, { "completion_length": 256.0, "epoch": 2.53781512605042, "grad_norm": 0.007261218968778849, "kl": 0.0033142161555588245, "learning_rate": 4.403845768841842e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 302 }, { "completion_length": 256.0, "epoch": 2.546218487394958, "grad_norm": 0.4996301829814911, "kl": 0.003614212153479457, "learning_rate": 4.398178268337202e-06, "loss": 0.0001, "reward": -0.42149999737739563, "reward_std": 0.7728676795959473, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42149999737739563, "step": 303 }, { "completion_length": 256.0, "epoch": 2.5546218487394956, "grad_norm": 0.007249684073030949, "kl": 0.0019305129535496235, "learning_rate": 4.3924876391293915e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 304 }, { "completion_length": 232.0, "epoch": 2.5630252100840334, "grad_norm": 0.5799632668495178, "kl": 0.004229840822517872, "learning_rate": 4.386773950556931e-06, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 305 }, { "completion_length": 256.0, "epoch": 2.571428571428571, "grad_norm": 0.6369227170944214, "kl": 0.013486362993717194, "learning_rate": 4.381037272239311e-06, "loss": 0.0005, "reward": -0.3880000114440918, "reward_std": 0.7254915833473206, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3880000114440918, "step": 306 }, { "completion_length": 256.0, "epoch": 2.5798319327731094, "grad_norm": 0.5972694158554077, "kl": 0.0018015628447756171, "learning_rate": 4.3752776740761495e-06, "loss": 0.0001, "reward": -0.5019999742507935, "reward_std": 1.0634886026382446, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5019999742507935, "step": 307 }, { "completion_length": 256.0, "epoch": 2.588235294117647, "grad_norm": 0.8835880756378174, "kl": 0.005852757021784782, "learning_rate": 4.36949522624633e-06, "loss": 0.0002, "reward": -0.2554999887943268, "reward_std": 0.538108229637146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2554999887943268, "step": 308 }, { "completion_length": 176.5, "epoch": 2.596638655462185, "grad_norm": 0.00904018059372902, "kl": 0.001535190036520362, "learning_rate": 4.3636899992071555e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 309 }, { "completion_length": 256.0, "epoch": 2.6050420168067228, "grad_norm": 0.01179829053580761, "kl": 0.005011396016925573, "learning_rate": 4.357862063693486e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 310 }, { "completion_length": 256.0, "epoch": 2.6134453781512605, "grad_norm": 0.011678867973387241, "kl": 0.0028532054275274277, "learning_rate": 4.352011490716875e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 311 }, { "completion_length": 256.0, "epoch": 2.6218487394957983, "grad_norm": 0.003083755960687995, "kl": 0.001561257871799171, "learning_rate": 4.346138351564711e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 312 }, { "completion_length": 245.0, "epoch": 2.630252100840336, "grad_norm": 0.6772823929786682, "kl": 0.0027272524312138557, "learning_rate": 4.340242717799337e-06, "loss": 0.0001, "reward": -0.25850000977516174, "reward_std": 0.5423508882522583, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25850000977516174, "step": 313 }, { "completion_length": 256.0, "epoch": 2.638655462184874, "grad_norm": 0.011639819480478764, "kl": 0.00468604639172554, "learning_rate": 4.334324661257191e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 314 }, { "completion_length": 256.0, "epoch": 2.6470588235294117, "grad_norm": 0.6226049065589905, "kl": 0.0023004449903964996, "learning_rate": 4.328384254047927e-06, "loss": 0.0001, "reward": -0.2809999883174896, "reward_std": 0.574170708656311, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2809999883174896, "step": 315 }, { "completion_length": 256.0, "epoch": 2.6554621848739495, "grad_norm": 0.002207543235272169, "kl": 0.00077222497202456, "learning_rate": 4.322421568553529e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 316 }, { "completion_length": 256.0, "epoch": 2.6638655462184873, "grad_norm": 0.5875176787376404, "kl": 0.0009305324638262391, "learning_rate": 4.316436677427441e-06, "loss": 0.0, "reward": -0.3334999978542328, "reward_std": 0.4716402292251587, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3334999978542328, "step": 317 }, { "completion_length": 206.5, "epoch": 2.6722689075630255, "grad_norm": 0.6272808909416199, "kl": 0.002387699903920293, "learning_rate": 4.3104296535936695e-06, "loss": 0.0001, "reward": -0.3959999978542328, "reward_std": 0.7368053197860718, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3959999978542328, "step": 318 }, { "completion_length": 256.0, "epoch": 2.6806722689075633, "grad_norm": 0.00793696753680706, "kl": 0.0024345996789634228, "learning_rate": 4.3044005702459055e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 319 }, { "completion_length": 180.0, "epoch": 2.689075630252101, "grad_norm": 1.0982134342193604, "kl": 0.02084909938275814, "learning_rate": 4.2983495008466285e-06, "loss": 0.0008, "reward": -0.4039999842643738, "reward_std": 0.03111269883811474, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4039999842643738, "step": 320 }, { "completion_length": 199.5, "epoch": 2.697478991596639, "grad_norm": 1.096677303314209, "kl": 0.031950175762176514, "learning_rate": 4.2922765191262075e-06, "loss": 0.0013, "reward": -0.7064999938011169, "reward_std": 0.3358757197856903, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7064999938011169, "step": 321 }, { "completion_length": 256.0, "epoch": 2.7058823529411766, "grad_norm": 0.006275282707065344, "kl": 0.002352240029722452, "learning_rate": 4.286181699082008e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 322 }, { "completion_length": 256.0, "epoch": 2.7142857142857144, "grad_norm": 0.009646447375416756, "kl": 0.002331285970285535, "learning_rate": 4.280065114977492e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 323 }, { "completion_length": 202.5, "epoch": 2.722689075630252, "grad_norm": 0.7079194188117981, "kl": 0.006085951346904039, "learning_rate": 4.273926841341303e-06, "loss": 0.0002, "reward": -0.6035000085830688, "reward_std": 0.2637508511543274, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6035000085830688, "step": 324 }, { "completion_length": 217.5, "epoch": 2.73109243697479, "grad_norm": 0.6229531764984131, "kl": 0.0038437368348240852, "learning_rate": 4.267766952966369e-06, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 325 }, { "completion_length": 256.0, "epoch": 2.7394957983193278, "grad_norm": 0.4797521233558655, "kl": 0.002125072292983532, "learning_rate": 4.261585524908987e-06, "loss": 0.0001, "reward": -0.3974999785423279, "reward_std": 0.9157032370567322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3974999785423279, "step": 326 }, { "completion_length": 256.0, "epoch": 2.7478991596638656, "grad_norm": 0.009559920988976955, "kl": 0.0029674177058041096, "learning_rate": 4.255382632487907e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 327 }, { "completion_length": 256.0, "epoch": 2.7563025210084033, "grad_norm": 0.6199681758880615, "kl": 0.002928075147792697, "learning_rate": 4.249158351283414e-06, "loss": 0.0001, "reward": -0.2824999988079071, "reward_std": 0.5762920379638672, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2824999988079071, "step": 328 }, { "completion_length": 220.0, "epoch": 2.764705882352941, "grad_norm": 0.6078872680664062, "kl": 0.004412318579852581, "learning_rate": 4.242912757136412e-06, "loss": 0.0002, "reward": -0.3059999942779541, "reward_std": 0.6095260381698608, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3059999942779541, "step": 329 }, { "completion_length": 256.0, "epoch": 2.773109243697479, "grad_norm": 0.01108414027839899, "kl": 0.0037416876293718815, "learning_rate": 4.236645926147493e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 330 }, { "completion_length": 256.0, "epoch": 2.7815126050420167, "grad_norm": 0.006040671374648809, "kl": 0.0028744344599545, "learning_rate": 4.230357934676017e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 331 }, { "completion_length": 256.0, "epoch": 2.7899159663865545, "grad_norm": 0.6114116311073303, "kl": 0.005102443043142557, "learning_rate": 4.224048859339175e-06, "loss": 0.0002, "reward": -0.41600000858306885, "reward_std": 0.7650895714759827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.41600000858306885, "step": 332 }, { "completion_length": 139.0, "epoch": 2.7983193277310923, "grad_norm": 2.4217255115509033, "kl": 0.01080203615128994, "learning_rate": 4.217718777011058e-06, "loss": 0.0004, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 333 }, { "completion_length": 95.5, "epoch": 2.80672268907563, "grad_norm": 2.105055332183838, "kl": 0.014417234808206558, "learning_rate": 4.211367764821722e-06, "loss": 0.0006, "reward": -0.15850000083446503, "reward_std": 0.4009295701980591, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15850000083446503, "step": 334 }, { "completion_length": 256.0, "epoch": 2.815126050420168, "grad_norm": 0.6110443472862244, "kl": 0.0022071583662182093, "learning_rate": 4.204995900156247e-06, "loss": 0.0001, "reward": -0.34450000524520874, "reward_std": 0.5748777985572815, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34450000524520874, "step": 335 }, { "completion_length": 256.0, "epoch": 2.8235294117647056, "grad_norm": 0.5463011860847473, "kl": 0.001141121843829751, "learning_rate": 4.198603260653792e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 336 }, { "completion_length": 179.0, "epoch": 2.831932773109244, "grad_norm": 0.6351363062858582, "kl": 0.001277949078939855, "learning_rate": 4.192189924206652e-06, "loss": 0.0001, "reward": -0.16600000858306885, "reward_std": 0.14707821607589722, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16600000858306885, "step": 337 }, { "completion_length": 256.0, "epoch": 2.8403361344537816, "grad_norm": 0.6723249554634094, "kl": 0.002534241182729602, "learning_rate": 4.185755968959308e-06, "loss": 0.0001, "reward": -0.42399999499320984, "reward_std": 0.7764032483100891, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42399999499320984, "step": 338 }, { "completion_length": 256.0, "epoch": 2.8487394957983194, "grad_norm": 0.7602297067642212, "kl": 0.0012256702175363898, "learning_rate": 4.179301473307476e-06, "loss": 0.0, "reward": -0.8495000004768372, "reward_std": 0.05303304269909859, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8495000004768372, "step": 339 }, { "completion_length": 256.0, "epoch": 2.857142857142857, "grad_norm": 0.4763251543045044, "kl": 0.0009420029819011688, "learning_rate": 4.172826515897146e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 340 }, { "completion_length": 256.0, "epoch": 2.865546218487395, "grad_norm": 0.004100496415048838, "kl": 0.0008467439329251647, "learning_rate": 4.166331175623631e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 341 }, { "completion_length": 250.0, "epoch": 2.8739495798319328, "grad_norm": 0.6433167457580566, "kl": 0.003306224476546049, "learning_rate": 4.159815531630604e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 342 }, { "completion_length": 107.0, "epoch": 2.8823529411764706, "grad_norm": 1.1251318454742432, "kl": 0.005019153468310833, "learning_rate": 4.15327966330913e-06, "loss": 0.0002, "reward": 0.09849999845027924, "reward_std": 0.037476662546396255, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09849999845027924, "step": 343 }, { "completion_length": 235.5, "epoch": 2.8907563025210083, "grad_norm": 0.790351927280426, "kl": 0.006928089074790478, "learning_rate": 4.146723650296701e-06, "loss": 0.0003, "reward": -0.2919999957084656, "reward_std": 0.5897270441055298, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2919999957084656, "step": 344 }, { "completion_length": 194.0, "epoch": 2.899159663865546, "grad_norm": 0.6330944895744324, "kl": 0.0028837714344263077, "learning_rate": 4.140147572476269e-06, "loss": 0.0001, "reward": -0.4789999723434448, "reward_std": 0.032526895403862, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4789999723434448, "step": 345 }, { "completion_length": 256.0, "epoch": 2.907563025210084, "grad_norm": 0.01184325572103262, "kl": 0.002479660790413618, "learning_rate": 4.133551509975264e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 346 }, { "completion_length": 235.0, "epoch": 2.9159663865546217, "grad_norm": 0.778275191783905, "kl": 0.0030368894804269075, "learning_rate": 4.126935543164628e-06, "loss": 0.0001, "reward": -0.7599999904632568, "reward_std": 0.19940412044525146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7599999904632568, "step": 347 }, { "completion_length": 256.0, "epoch": 2.92436974789916, "grad_norm": 0.4334414303302765, "kl": 0.0027941816952079535, "learning_rate": 4.120299752657828e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 348 }, { "completion_length": 145.0, "epoch": 2.9327731092436977, "grad_norm": 0.8820681571960449, "kl": 0.014411094598472118, "learning_rate": 4.113644219309877e-06, "loss": 0.0006, "reward": -0.09200000017881393, "reward_std": 0.13010765612125397, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09200000017881393, "step": 349 }, { "completion_length": 244.0, "epoch": 2.9411764705882355, "grad_norm": 0.6002141833305359, "kl": 0.0013799238950014114, "learning_rate": 4.106969024216348e-06, "loss": 0.0001, "reward": -0.3345000147819519, "reward_std": 0.6498311758041382, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3345000147819519, "step": 350 }, { "completion_length": 202.0, "epoch": 2.9495798319327733, "grad_norm": 0.9096584320068359, "kl": 0.007384518161416054, "learning_rate": 4.1002742487123896e-06, "loss": 0.0003, "reward": -0.40299999713897705, "reward_std": 0.7467047572135925, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40299999713897705, "step": 351 }, { "completion_length": 256.0, "epoch": 2.957983193277311, "grad_norm": 0.01192541979253292, "kl": 0.0025444000493735075, "learning_rate": 4.093559974371725e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 352 }, { "completion_length": 204.5, "epoch": 2.966386554621849, "grad_norm": 0.9316684603691101, "kl": 0.004638626705855131, "learning_rate": 4.086826283005669e-06, "loss": 0.0002, "reward": -0.1469999998807907, "reward_std": 0.3846661150455475, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1469999998807907, "step": 353 }, { "completion_length": 250.0, "epoch": 2.9747899159663866, "grad_norm": 0.5309674739837646, "kl": 0.0023688192013651133, "learning_rate": 4.080073256662128e-06, "loss": 0.0001, "reward": -0.3544999957084656, "reward_std": 0.6781154274940491, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3544999957084656, "step": 354 }, { "completion_length": 253.0, "epoch": 2.9831932773109244, "grad_norm": 0.6676669716835022, "kl": 0.0043645258992910385, "learning_rate": 4.073300977624594e-06, "loss": 0.0002, "reward": -0.34450000524520874, "reward_std": 0.6639732718467712, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34450000524520874, "step": 355 }, { "completion_length": 256.0, "epoch": 2.991596638655462, "grad_norm": 0.019139230251312256, "kl": 0.005125146359205246, "learning_rate": 4.066509528411151e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 356 }, { "completion_length": 256.0, "epoch": 3.0, "grad_norm": 0.006308732088655233, "kl": 0.0015887669287621975, "learning_rate": 4.059698991773466e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 357 }, { "completion_length": 256.0, "epoch": 3.008403361344538, "grad_norm": 0.010206181555986404, "kl": 0.0026014517061412334, "learning_rate": 4.052869450695776e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 358 }, { "completion_length": 256.0, "epoch": 3.0168067226890756, "grad_norm": 0.620117723941803, "kl": 0.010548936203122139, "learning_rate": 4.046020988393886e-06, "loss": 0.0004, "reward": -0.46050000190734863, "reward_std": 0.6512453556060791, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.46050000190734863, "step": 359 }, { "completion_length": 256.0, "epoch": 3.0252100840336134, "grad_norm": 0.7680322527885437, "kl": 0.010939342901110649, "learning_rate": 4.039153688314146e-06, "loss": 0.0004, "reward": -0.2644999921321869, "reward_std": 0.7276129126548767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2644999921321869, "step": 360 }, { "completion_length": 256.0, "epoch": 3.033613445378151, "grad_norm": 0.011280158534646034, "kl": 0.00543932244181633, "learning_rate": 4.032267634132442e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 361 }, { "completion_length": 213.5, "epoch": 3.042016806722689, "grad_norm": 0.8873797655105591, "kl": 0.008191918954253197, "learning_rate": 4.02536290975317e-06, "loss": 0.0003, "reward": -0.7115000486373901, "reward_std": 0.3655742108821869, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7115000486373901, "step": 362 }, { "completion_length": 205.0, "epoch": 3.0504201680672267, "grad_norm": 0.6964902281761169, "kl": 0.0036789490841329098, "learning_rate": 4.018439599308217e-06, "loss": 0.0001, "reward": -0.3854999840259552, "reward_std": 0.9001469612121582, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3854999840259552, "step": 363 }, { "completion_length": 234.5, "epoch": 3.0588235294117645, "grad_norm": 0.6929089426994324, "kl": 0.005453826859593391, "learning_rate": 4.011497787155938e-06, "loss": 0.0002, "reward": -0.778499960899353, "reward_std": 0.2340523600578308, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.778499960899353, "step": 364 }, { "completion_length": 256.0, "epoch": 3.0672268907563027, "grad_norm": 0.007410152815282345, "kl": 0.0021636453457176685, "learning_rate": 4.0045375578801216e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 365 }, { "completion_length": 256.0, "epoch": 3.0756302521008405, "grad_norm": 0.5539571046829224, "kl": 0.0027411708142608404, "learning_rate": 3.997558996288965e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 366 }, { "completion_length": 253.5, "epoch": 3.0840336134453783, "grad_norm": 0.5638790726661682, "kl": 0.0028606303967535496, "learning_rate": 3.9905621874140396e-06, "loss": 0.0001, "reward": -0.28850001096725464, "reward_std": 0.584777295589447, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28850001096725464, "step": 367 }, { "completion_length": 256.0, "epoch": 3.092436974789916, "grad_norm": 0.5408520102500916, "kl": 0.004088674206286669, "learning_rate": 3.983547216509254e-06, "loss": 0.0002, "reward": -0.37700000405311584, "reward_std": 0.709935188293457, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37700000405311584, "step": 368 }, { "completion_length": 256.0, "epoch": 3.100840336134454, "grad_norm": 0.01376523356884718, "kl": 0.0038217129185795784, "learning_rate": 3.976514169049814e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 369 }, { "completion_length": 176.0, "epoch": 3.1092436974789917, "grad_norm": 1.0740207433700562, "kl": 0.010070715099573135, "learning_rate": 3.969463130731183e-06, "loss": 0.0004, "reward": -0.28550001978874207, "reward_std": 0.7587255239486694, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28550001978874207, "step": 370 }, { "completion_length": 248.5, "epoch": 3.1176470588235294, "grad_norm": 0.6407089233398438, "kl": 0.0026423814706504345, "learning_rate": 3.96239418746804e-06, "loss": 0.0001, "reward": -0.1770000010728836, "reward_std": 0.4270924925804138, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1770000010728836, "step": 371 }, { "completion_length": 215.5, "epoch": 3.1260504201680672, "grad_norm": 0.6876877546310425, "kl": 0.011822579428553581, "learning_rate": 3.955307425393224e-06, "loss": 0.0005, "reward": -0.46700000762939453, "reward_std": 0.6604377627372742, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.46700000762939453, "step": 372 }, { "completion_length": 231.0, "epoch": 3.134453781512605, "grad_norm": 0.5824630856513977, "kl": 0.004953335504978895, "learning_rate": 3.948202930856697e-06, "loss": 0.0002, "reward": -0.37400001287460327, "reward_std": 0.7056925892829895, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37400001287460327, "step": 373 }, { "completion_length": 256.0, "epoch": 3.142857142857143, "grad_norm": 0.013719064183533192, "kl": 0.005780086852610111, "learning_rate": 3.941080790424483e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 374 }, { "completion_length": 120.5, "epoch": 3.1512605042016806, "grad_norm": 1.8440769910812378, "kl": 0.013958006165921688, "learning_rate": 3.933941090877615e-06, "loss": 0.0006, "reward": -0.15399999916553497, "reward_std": 0.39456555247306824, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15399999916553497, "step": 375 }, { "completion_length": 120.0, "epoch": 3.1596638655462184, "grad_norm": 0.9358043670654297, "kl": 0.006174763664603233, "learning_rate": 3.92678391921108e-06, "loss": 0.0002, "reward": 0.08449999988079071, "reward_std": 0.05727564916014671, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08449999988079071, "step": 376 }, { "completion_length": 256.0, "epoch": 3.168067226890756, "grad_norm": 0.012176509946584702, "kl": 0.003637885209172964, "learning_rate": 3.9196093626327535e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 377 }, { "completion_length": 109.5, "epoch": 3.176470588235294, "grad_norm": 1.1400877237319946, "kl": 0.01294402964413166, "learning_rate": 3.912417508562345e-06, "loss": 0.0005, "reward": -0.2615000009536743, "reward_std": 0.050204578787088394, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2615000009536743, "step": 378 }, { "completion_length": 169.5, "epoch": 3.184873949579832, "grad_norm": 0.8784559369087219, "kl": 0.0021055168472230434, "learning_rate": 3.905208444630326e-06, "loss": 0.0001, "reward": -0.4129999876022339, "reward_std": 0.4511341452598572, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4129999876022339, "step": 379 }, { "completion_length": 256.0, "epoch": 3.19327731092437, "grad_norm": 0.00910105463117361, "kl": 0.004807943478226662, "learning_rate": 3.897982258676867e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 380 }, { "completion_length": 256.0, "epoch": 3.2016806722689077, "grad_norm": 0.5080274939537048, "kl": 0.005599964410066605, "learning_rate": 3.890739038750763e-06, "loss": 0.0002, "reward": -0.7590000033378601, "reward_std": 0.004242670256644487, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7590000033378601, "step": 381 }, { "completion_length": 256.0, "epoch": 3.2100840336134455, "grad_norm": 0.009184548631310463, "kl": 0.003193704877048731, "learning_rate": 3.88347887310836e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 382 }, { "completion_length": 256.0, "epoch": 3.2184873949579833, "grad_norm": 0.5082441568374634, "kl": 0.0018902915762737393, "learning_rate": 3.876201850212489e-06, "loss": 0.0001, "reward": -0.44749999046325684, "reward_std": 0.8096372485160828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44749999046325684, "step": 383 }, { "completion_length": 256.0, "epoch": 3.226890756302521, "grad_norm": 0.6721200346946716, "kl": 0.004076722078025341, "learning_rate": 3.868908058731376e-06, "loss": 0.0002, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 384 }, { "completion_length": 164.0, "epoch": 3.235294117647059, "grad_norm": 0.9340514540672302, "kl": 0.00798124261200428, "learning_rate": 3.861597587537568e-06, "loss": 0.0003, "reward": 0.01449999213218689, "reward_std": 0.334461510181427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01449999213218689, "step": 385 }, { "completion_length": 256.0, "epoch": 3.2436974789915967, "grad_norm": 0.533520519733429, "kl": 0.002210881793871522, "learning_rate": 3.85427052570685e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 386 }, { "completion_length": 256.0, "epoch": 3.2521008403361344, "grad_norm": 0.017914986237883568, "kl": 0.0029222287703305483, "learning_rate": 3.846926962517158e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 387 }, { "completion_length": 240.5, "epoch": 3.2605042016806722, "grad_norm": 0.0051656486466526985, "kl": 0.0025048735551536083, "learning_rate": 3.839566987447492e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 388 }, { "completion_length": 256.0, "epoch": 3.26890756302521, "grad_norm": 0.01239511277526617, "kl": 0.00431419350206852, "learning_rate": 3.832190690176825e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 389 }, { "completion_length": 256.0, "epoch": 3.277310924369748, "grad_norm": 0.005407815799117088, "kl": 0.0031800023280084133, "learning_rate": 3.824798160583012e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 390 }, { "completion_length": 256.0, "epoch": 3.2857142857142856, "grad_norm": 0.011482949368655682, "kl": 0.004352348856627941, "learning_rate": 3.817389488741694e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 391 }, { "completion_length": 226.5, "epoch": 3.2941176470588234, "grad_norm": 0.7392163872718811, "kl": 0.0037689288146793842, "learning_rate": 3.8099647649251984e-06, "loss": 0.0002, "reward": -0.7045000195503235, "reward_std": 0.21991020441055298, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7045000195503235, "step": 392 }, { "completion_length": 256.0, "epoch": 3.302521008403361, "grad_norm": 0.008813586086034775, "kl": 0.0030356249772012234, "learning_rate": 3.802524079601442e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 393 }, { "completion_length": 160.5, "epoch": 3.310924369747899, "grad_norm": 1.3345425128936768, "kl": 0.01419767551124096, "learning_rate": 3.795067523432826e-06, "loss": 0.0006, "reward": -0.3474999964237213, "reward_std": 0.6144757866859436, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3474999964237213, "step": 394 }, { "completion_length": 256.0, "epoch": 3.3193277310924367, "grad_norm": 0.013871869072318077, "kl": 0.007474452257156372, "learning_rate": 3.787595187275136e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 395 }, { "completion_length": 202.5, "epoch": 3.327731092436975, "grad_norm": 0.8885118961334229, "kl": 0.07740353792905807, "learning_rate": 3.780107162176429e-06, "loss": 0.0031, "reward": -0.1705000102519989, "reward_std": 0.5960910320281982, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1705000102519989, "step": 396 }, { "completion_length": 256.0, "epoch": 3.3361344537815127, "grad_norm": 0.011270146816968918, "kl": 0.005660077556967735, "learning_rate": 3.772603539375929e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 397 }, { "completion_length": 242.5, "epoch": 3.3445378151260505, "grad_norm": 0.6401829719543457, "kl": 0.00367830041795969, "learning_rate": 3.7650844103029093e-06, "loss": 0.0001, "reward": -0.38600000739097595, "reward_std": 0.722663164138794, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.38600000739097595, "step": 398 }, { "completion_length": 160.5, "epoch": 3.3529411764705883, "grad_norm": 1.7849326133728027, "kl": 0.059879012405872345, "learning_rate": 3.7575498665755884e-06, "loss": 0.0024, "reward": -0.24799999594688416, "reward_std": 0.6830651760101318, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24799999594688416, "step": 399 }, { "completion_length": 256.0, "epoch": 3.361344537815126, "grad_norm": 0.6120083332061768, "kl": 0.0013382586184889078, "learning_rate": 3.7500000000000005e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 400 }, { "completion_length": 256.0, "epoch": 3.369747899159664, "grad_norm": 0.015587490051984787, "kl": 0.004276780411601067, "learning_rate": 3.742434902568889e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 401 }, { "completion_length": 227.5, "epoch": 3.3781512605042017, "grad_norm": 0.6446884870529175, "kl": 0.006575770676136017, "learning_rate": 3.7348546664605777e-06, "loss": 0.0003, "reward": -0.7000000476837158, "reward_std": 0.055154334753751755, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7000000476837158, "step": 402 }, { "completion_length": 204.0, "epoch": 3.3865546218487395, "grad_norm": 0.7291668653488159, "kl": 0.012783960439264774, "learning_rate": 3.7272593840378526e-06, "loss": 0.0005, "reward": -0.2175000011920929, "reward_std": 0.48436814546585083, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2175000011920929, "step": 403 }, { "completion_length": 256.0, "epoch": 3.3949579831932772, "grad_norm": 0.008016711100935936, "kl": 0.0020481166429817677, "learning_rate": 3.7196491478468322e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 404 }, { "completion_length": 256.0, "epoch": 3.403361344537815, "grad_norm": 0.6153783202171326, "kl": 0.016261091455817223, "learning_rate": 3.7120240506158433e-06, "loss": 0.0007, "reward": -0.7935000061988831, "reward_std": 0.06858935952186584, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7935000061988831, "step": 405 }, { "completion_length": 256.0, "epoch": 3.411764705882353, "grad_norm": 0.5767812728881836, "kl": 0.006565110757946968, "learning_rate": 3.7043841852542884e-06, "loss": 0.0003, "reward": -0.9350000023841858, "reward_std": 0.14566397666931152, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9350000023841858, "step": 406 }, { "completion_length": 256.0, "epoch": 3.4201680672268906, "grad_norm": 0.5669788718223572, "kl": 0.004698153585195541, "learning_rate": 3.6967296448515176e-06, "loss": 0.0002, "reward": -0.3504999876022339, "reward_std": 0.6724585294723511, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3504999876022339, "step": 407 }, { "completion_length": 256.0, "epoch": 3.4285714285714284, "grad_norm": 0.009885992854833603, "kl": 0.0025288877077400684, "learning_rate": 3.689060522675689e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 408 }, { "completion_length": 256.0, "epoch": 3.4369747899159666, "grad_norm": 0.5568239688873291, "kl": 0.0027218936011195183, "learning_rate": 3.6813769121726356e-06, "loss": 0.0001, "reward": -0.36250001192092896, "reward_std": 0.6894291639328003, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36250001192092896, "step": 409 }, { "completion_length": 256.0, "epoch": 3.4453781512605044, "grad_norm": 0.0174005888402462, "kl": 0.004320329520851374, "learning_rate": 3.6736789069647273e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 410 }, { "completion_length": 200.0, "epoch": 3.453781512605042, "grad_norm": 0.8975045084953308, "kl": 0.09174253791570663, "learning_rate": 3.6659666008497287e-06, "loss": 0.0037, "reward": -0.03400000184774399, "reward_std": 0.22485996782779694, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03400000184774399, "step": 411 }, { "completion_length": 256.0, "epoch": 3.46218487394958, "grad_norm": 0.5174078941345215, "kl": 0.004888634197413921, "learning_rate": 3.658240087799655e-06, "loss": 0.0002, "reward": -0.2305000126361847, "reward_std": 0.502752959728241, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2305000126361847, "step": 412 }, { "completion_length": 256.0, "epoch": 3.4705882352941178, "grad_norm": 0.008453840389847755, "kl": 0.0026917089708149433, "learning_rate": 3.6504994619596295e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 413 }, { "completion_length": 217.5, "epoch": 3.4789915966386555, "grad_norm": 0.013955477625131607, "kl": 0.003794711083173752, "learning_rate": 3.642744817646736e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 414 }, { "completion_length": 256.0, "epoch": 3.4873949579831933, "grad_norm": 0.005449412390589714, "kl": 0.0017005063127726316, "learning_rate": 3.634976249348867e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 415 }, { "completion_length": 213.0, "epoch": 3.495798319327731, "grad_norm": 0.6553326845169067, "kl": 0.007478952873498201, "learning_rate": 3.627193851723577e-06, "loss": 0.0003, "reward": -0.687000036239624, "reward_std": 0.4907321035861969, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.687000036239624, "step": 416 }, { "completion_length": 256.0, "epoch": 3.504201680672269, "grad_norm": 0.576215922832489, "kl": 0.0042199972085654736, "learning_rate": 3.6193977195969243e-06, "loss": 0.0002, "reward": -0.35600000619888306, "reward_std": 0.6802366971969604, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35600000619888306, "step": 417 }, { "completion_length": 256.0, "epoch": 3.5126050420168067, "grad_norm": 0.010783681645989418, "kl": 0.0027378108352422714, "learning_rate": 3.611587947962319e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 418 }, { "completion_length": 238.5, "epoch": 3.5210084033613445, "grad_norm": 0.6422294974327087, "kl": 0.0025938504841178656, "learning_rate": 3.6037646319793635e-06, "loss": 0.0001, "reward": -0.22299998998641968, "reward_std": 0.4921463131904602, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22299998998641968, "step": 419 }, { "completion_length": 256.0, "epoch": 3.5294117647058822, "grad_norm": 0.010222544893622398, "kl": 0.0034360236022621393, "learning_rate": 3.595927866972694e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 420 }, { "completion_length": 248.5, "epoch": 3.53781512605042, "grad_norm": 0.5986461043357849, "kl": 0.007497750222682953, "learning_rate": 3.5880777484308193e-06, "loss": 0.0003, "reward": -0.8730000257492065, "reward_std": 0.08626705408096313, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8730000257492065, "step": 421 }, { "completion_length": 256.0, "epoch": 3.546218487394958, "grad_norm": 0.006208761129528284, "kl": 0.0022212634794414043, "learning_rate": 3.5802143720049565e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 422 }, { "completion_length": 215.0, "epoch": 3.5546218487394956, "grad_norm": 0.6153379082679749, "kl": 0.003703085705637932, "learning_rate": 3.5723378335078653e-06, "loss": 0.0001, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 423 }, { "completion_length": 235.5, "epoch": 3.5630252100840334, "grad_norm": 0.5790783166885376, "kl": 0.011395820416510105, "learning_rate": 3.564448228912682e-06, "loss": 0.0005, "reward": -0.273499995470047, "reward_std": 0.5635640621185303, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.273499995470047, "step": 424 }, { "completion_length": 144.0, "epoch": 3.571428571428571, "grad_norm": 0.023058855906128883, "kl": 0.007052963133901358, "learning_rate": 3.556545654351749e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 425 }, { "completion_length": 249.0, "epoch": 3.5798319327731094, "grad_norm": 0.5849809050559998, "kl": 0.0022892202250659466, "learning_rate": 3.5486302061154433e-06, "loss": 0.0001, "reward": -0.4424999952316284, "reward_std": 0.8025661706924438, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4424999952316284, "step": 426 }, { "completion_length": 256.0, "epoch": 3.588235294117647, "grad_norm": 0.5420903563499451, "kl": 0.002459827810525894, "learning_rate": 3.5407019806510035e-06, "loss": 0.0001, "reward": -0.3400000035762787, "reward_std": 0.8343860507011414, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3400000035762787, "step": 427 }, { "completion_length": 234.5, "epoch": 3.596638655462185, "grad_norm": 0.6688557863235474, "kl": 0.0029880732763558626, "learning_rate": 3.532761074561355e-06, "loss": 0.0001, "reward": -0.20500001311302185, "reward_std": 0.6434671878814697, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20500001311302185, "step": 428 }, { "completion_length": 256.0, "epoch": 3.6050420168067228, "grad_norm": 0.006107234861701727, "kl": 0.0017487696604803205, "learning_rate": 3.524807584603932e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 429 }, { "completion_length": 256.0, "epoch": 3.6134453781512605, "grad_norm": 0.004454105626791716, "kl": 0.0011575918179005384, "learning_rate": 3.516841607689501e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 430 }, { "completion_length": 240.5, "epoch": 3.6218487394957983, "grad_norm": 0.8281577229499817, "kl": 0.0020803946536034346, "learning_rate": 3.5088632408809757e-06, "loss": 0.0001, "reward": -0.17249999940395355, "reward_std": 0.42072856426239014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17249999940395355, "step": 431 }, { "completion_length": 214.5, "epoch": 3.630252100840336, "grad_norm": 0.6902633309364319, "kl": 0.00645034946501255, "learning_rate": 3.5008725813922383e-06, "loss": 0.0003, "reward": -0.34700000286102295, "reward_std": 0.8456997275352478, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34700000286102295, "step": 432 }, { "completion_length": 256.0, "epoch": 3.638655462184874, "grad_norm": 0.006780046038329601, "kl": 0.0024617090821266174, "learning_rate": 3.4928697265869516e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 433 }, { "completion_length": 256.0, "epoch": 3.6470588235294117, "grad_norm": 0.0071501401253044605, "kl": 0.0015779532259330153, "learning_rate": 3.4848547739773782e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 434 }, { "completion_length": 184.5, "epoch": 3.6554621848739495, "grad_norm": 0.5768211483955383, "kl": 0.007431807927787304, "learning_rate": 3.476827821223184e-06, "loss": 0.0003, "reward": -0.18900001049041748, "reward_std": 0.622253954410553, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18900001049041748, "step": 435 }, { "completion_length": 207.5, "epoch": 3.6638655462184873, "grad_norm": 0.617514431476593, "kl": 0.0020341938361525536, "learning_rate": 3.4687889661302577e-06, "loss": 0.0001, "reward": 0.18849998712539673, "reward_std": 0.0883883386850357, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18849998712539673, "step": 436 }, { "completion_length": 256.0, "epoch": 3.6722689075630255, "grad_norm": 0.005510859191417694, "kl": 0.001895026070997119, "learning_rate": 3.460738306649509e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 437 }, { "completion_length": 256.0, "epoch": 3.6806722689075633, "grad_norm": 0.02060985565185547, "kl": 0.005827664397656918, "learning_rate": 3.452675940875686e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 438 }, { "completion_length": 256.0, "epoch": 3.689075630252101, "grad_norm": 0.005921047646552324, "kl": 0.002219037152826786, "learning_rate": 3.4446019670461684e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 439 }, { "completion_length": 256.0, "epoch": 3.697478991596639, "grad_norm": 0.005769258365035057, "kl": 0.00171941751614213, "learning_rate": 3.436516483539781e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 440 }, { "completion_length": 256.0, "epoch": 3.7058823529411766, "grad_norm": 0.7345437407493591, "kl": 0.010616779327392578, "learning_rate": 3.4284195888755877e-06, "loss": 0.0004, "reward": -0.304500013589859, "reward_std": 0.6074047684669495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.304500013589859, "step": 441 }, { "completion_length": 256.0, "epoch": 3.7142857142857144, "grad_norm": 0.621379017829895, "kl": 0.0015812116907909513, "learning_rate": 3.4203113817116955e-06, "loss": 0.0001, "reward": -0.42250001430511475, "reward_std": 0.774281919002533, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42250001430511475, "step": 442 }, { "completion_length": 240.0, "epoch": 3.722689075630252, "grad_norm": 0.01309260819107294, "kl": 0.0027149072848260403, "learning_rate": 3.412191960844049e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 443 }, { "completion_length": 256.0, "epoch": 3.73109243697479, "grad_norm": 0.009527934715151787, "kl": 0.0024723124224692583, "learning_rate": 3.4040614252052305e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 444 }, { "completion_length": 53.0, "epoch": 3.7394957983193278, "grad_norm": 0.026197509840130806, "kl": 0.003858855227008462, "learning_rate": 3.39591987386325e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 445 }, { "completion_length": 256.0, "epoch": 3.7478991596638656, "grad_norm": 0.5904631614685059, "kl": 0.0022262653801590204, "learning_rate": 3.387767406020343e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 446 }, { "completion_length": 256.0, "epoch": 3.7563025210084033, "grad_norm": 0.006075546145439148, "kl": 0.002579500200226903, "learning_rate": 3.3796041210117545e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 447 }, { "completion_length": 256.0, "epoch": 3.764705882352941, "grad_norm": 0.011111943051218987, "kl": 0.0016852518310770392, "learning_rate": 3.3714301183045382e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 448 }, { "completion_length": 256.0, "epoch": 3.773109243697479, "grad_norm": 0.6621344685554504, "kl": 0.07119501382112503, "learning_rate": 3.3632454974963368e-06, "loss": 0.0028, "reward": -0.3319999873638153, "reward_std": 0.6462955474853516, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3319999873638153, "step": 449 }, { "completion_length": 244.0, "epoch": 3.7815126050420167, "grad_norm": 0.591526985168457, "kl": 0.0027746185660362244, "learning_rate": 3.3550503583141726e-06, "loss": 0.0001, "reward": -0.367000013589859, "reward_std": 0.695793092250824, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.367000013589859, "step": 450 }, { "completion_length": 256.0, "epoch": 3.7899159663865545, "grad_norm": 0.008372858166694641, "kl": 0.0015924713807180524, "learning_rate": 3.346844800613229e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 451 }, { "completion_length": 141.0, "epoch": 3.7983193277310923, "grad_norm": 0.9579232931137085, "kl": 0.00345963379368186, "learning_rate": 3.338628924375638e-06, "loss": 0.0001, "reward": -0.18700000643730164, "reward_std": 0.44123464822769165, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18700000643730164, "step": 452 }, { "completion_length": 256.0, "epoch": 3.80672268907563, "grad_norm": 0.6012850403785706, "kl": 0.004852938465774059, "learning_rate": 3.3304028297092583e-06, "loss": 0.0002, "reward": -0.29899999499320984, "reward_std": 0.5996265411376953, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.29899999499320984, "step": 453 }, { "completion_length": 198.0, "epoch": 3.815126050420168, "grad_norm": 0.8106396198272705, "kl": 0.004517472814768553, "learning_rate": 3.3221666168464584e-06, "loss": 0.0002, "reward": -0.5885000228881836, "reward_std": 0.3104199171066284, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5885000228881836, "step": 454 }, { "completion_length": 256.0, "epoch": 3.8235294117647056, "grad_norm": 0.005074457265436649, "kl": 0.0016099036438390613, "learning_rate": 3.313920386142892e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 455 }, { "completion_length": 256.0, "epoch": 3.831932773109244, "grad_norm": 0.012825529091060162, "kl": 0.003774377517402172, "learning_rate": 3.3056642380762783e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 456 }, { "completion_length": 220.0, "epoch": 3.8403361344537816, "grad_norm": 0.6539475917816162, "kl": 0.0042293258011341095, "learning_rate": 3.2973982732451753e-06, "loss": 0.0002, "reward": -0.11900000274181366, "reward_std": 0.866912841796875, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11900000274181366, "step": 457 }, { "completion_length": 256.0, "epoch": 3.8487394957983194, "grad_norm": 0.49199798703193665, "kl": 0.0014096302911639214, "learning_rate": 3.2891225923677565e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 458 }, { "completion_length": 256.0, "epoch": 3.857142857142857, "grad_norm": 0.012088433839380741, "kl": 0.003585636615753174, "learning_rate": 3.280837296280582e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 459 }, { "completion_length": 233.0, "epoch": 3.865546218487395, "grad_norm": 0.5284253358840942, "kl": 0.004656251985579729, "learning_rate": 3.272542485937369e-06, "loss": 0.0002, "reward": -0.2084999978542328, "reward_std": 0.4716402292251587, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2084999978542328, "step": 460 }, { "completion_length": 256.0, "epoch": 3.8739495798319328, "grad_norm": 0.0030368613079190254, "kl": 0.00120296492241323, "learning_rate": 3.2642382624077647e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 461 }, { "completion_length": 256.0, "epoch": 3.8823529411764706, "grad_norm": 0.010796008631587029, "kl": 0.002590796211734414, "learning_rate": 3.2559247268761117e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 462 }, { "completion_length": 256.0, "epoch": 3.8907563025210083, "grad_norm": 0.010541989468038082, "kl": 0.003159753978252411, "learning_rate": 3.247601980640217e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 463 }, { "completion_length": 245.5, "epoch": 3.899159663865546, "grad_norm": 0.5083385705947876, "kl": 0.003537024836987257, "learning_rate": 3.2392701251101172e-06, "loss": 0.0001, "reward": -0.9225000143051147, "reward_std": 0.0855599120259285, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9225000143051147, "step": 464 }, { "completion_length": 256.0, "epoch": 3.907563025210084, "grad_norm": 0.005627671256661415, "kl": 0.0021470231004059315, "learning_rate": 3.230929261806842e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 465 }, { "completion_length": 256.0, "epoch": 3.9159663865546217, "grad_norm": 0.5799672603607178, "kl": 0.009040579199790955, "learning_rate": 3.222579492361179e-06, "loss": 0.0004, "reward": -0.34599998593330383, "reward_std": 0.6660946011543274, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34599998593330383, "step": 466 }, { "completion_length": 256.0, "epoch": 3.92436974789916, "grad_norm": 0.0069773755967617035, "kl": 0.0021656134631484747, "learning_rate": 3.214220918512434e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 467 }, { "completion_length": 256.0, "epoch": 3.9327731092436977, "grad_norm": 0.01754561997950077, "kl": 0.005703547038137913, "learning_rate": 3.205853642107192e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 468 }, { "completion_length": 256.0, "epoch": 3.9411764705882355, "grad_norm": 0.6702913641929626, "kl": 0.0025955908931791782, "learning_rate": 3.1974777650980737e-06, "loss": 0.0001, "reward": -0.3174999952316284, "reward_std": 0.6257895231246948, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3174999952316284, "step": 469 }, { "completion_length": 256.0, "epoch": 3.9495798319327733, "grad_norm": 0.011906447820365429, "kl": 0.003325114957988262, "learning_rate": 3.189093389542498e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 470 }, { "completion_length": 256.0, "epoch": 3.957983193277311, "grad_norm": 0.004596076440066099, "kl": 0.002527457196265459, "learning_rate": 3.180700617601436e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 471 }, { "completion_length": 242.0, "epoch": 3.966386554621849, "grad_norm": 0.34630173444747925, "kl": 0.005784347653388977, "learning_rate": 3.1722995515381644e-06, "loss": 0.0002, "reward": -0.07899999618530273, "reward_std": 0.2884995639324188, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07899999618530273, "step": 472 }, { "completion_length": 256.0, "epoch": 3.9747899159663866, "grad_norm": 0.41311922669410706, "kl": 0.0028857660945504904, "learning_rate": 3.1638902937170224e-06, "loss": 0.0001, "reward": -0.23100000619888306, "reward_std": 0.5034600496292114, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.23100000619888306, "step": 473 }, { "completion_length": 256.0, "epoch": 3.9831932773109244, "grad_norm": 0.007859251461923122, "kl": 0.002586069516837597, "learning_rate": 3.155472946602162e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 474 }, { "completion_length": 175.0, "epoch": 3.991596638655462, "grad_norm": 0.6734972596168518, "kl": 0.006680778227746487, "learning_rate": 3.147047612756302e-06, "loss": 0.0003, "reward": -0.406000018119812, "reward_std": 0.05656855180859566, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.406000018119812, "step": 475 }, { "completion_length": 256.0, "epoch": 4.0, "grad_norm": 0.6870183944702148, "kl": 0.016689736396074295, "learning_rate": 3.1386143948394764e-06, "loss": 0.0007, "reward": -0.32249999046325684, "reward_std": 0.632860541343689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32249999046325684, "step": 476 }, { "completion_length": 256.0, "epoch": 4.008403361344538, "grad_norm": 0.42786309123039246, "kl": 0.003426614450290799, "learning_rate": 3.130173395607785e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 477 }, { "completion_length": 256.0, "epoch": 4.016806722689076, "grad_norm": 0.5951551795005798, "kl": 0.015506581403315067, "learning_rate": 3.121724717912138e-06, "loss": 0.0006, "reward": -0.39250001311302185, "reward_std": 0.731855571269989, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.39250001311302185, "step": 478 }, { "completion_length": 256.0, "epoch": 4.025210084033613, "grad_norm": 0.011244276538491249, "kl": 0.001748478738591075, "learning_rate": 3.1132684646970068e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 479 }, { "completion_length": 256.0, "epoch": 4.033613445378151, "grad_norm": 0.00740152457728982, "kl": 0.002155178226530552, "learning_rate": 3.1048047389991693e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 480 }, { "completion_length": 225.5, "epoch": 4.042016806722689, "grad_norm": 0.5986968874931335, "kl": 0.0012694902252405882, "learning_rate": 3.0963336439464527e-06, "loss": 0.0001, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 481 }, { "completion_length": 256.0, "epoch": 4.050420168067227, "grad_norm": 0.5260491967201233, "kl": 0.006204525008797646, "learning_rate": 3.087855282756475e-06, "loss": 0.0002, "reward": -0.9929999709129333, "reward_std": 0.08061014860868454, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9929999709129333, "step": 482 }, { "completion_length": 256.0, "epoch": 4.0588235294117645, "grad_norm": 0.016300002112984657, "kl": 0.0045607611536979675, "learning_rate": 3.079369758735393e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 483 }, { "completion_length": 256.0, "epoch": 4.067226890756302, "grad_norm": 0.6481316685676575, "kl": 0.003185984678566456, "learning_rate": 3.0708771752766397e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 484 }, { "completion_length": 256.0, "epoch": 4.07563025210084, "grad_norm": 0.6258234977722168, "kl": 0.009114600718021393, "learning_rate": 3.062377635859663e-06, "loss": 0.0004, "reward": -0.3140000104904175, "reward_std": 0.6208397746086121, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3140000104904175, "step": 485 }, { "completion_length": 227.0, "epoch": 4.084033613445378, "grad_norm": 0.008745075203478336, "kl": 0.0033639816101640463, "learning_rate": 3.053871244048669e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 486 }, { "completion_length": 256.0, "epoch": 4.092436974789916, "grad_norm": 0.5776768326759338, "kl": 0.008605859242379665, "learning_rate": 3.045358103491357e-06, "loss": 0.0003, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 487 }, { "completion_length": 256.0, "epoch": 4.100840336134453, "grad_norm": 0.021547332406044006, "kl": 0.00245258049108088, "learning_rate": 3.0368383179176584e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 488 }, { "completion_length": 256.0, "epoch": 4.109243697478991, "grad_norm": 0.6776788830757141, "kl": 0.01352140586823225, "learning_rate": 3.0283119911384724e-06, "loss": 0.0005, "reward": -0.3630000054836273, "reward_std": 0.6901362538337708, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3630000054836273, "step": 489 }, { "completion_length": 256.0, "epoch": 4.117647058823529, "grad_norm": 0.025309573858976364, "kl": 0.004147625062614679, "learning_rate": 3.019779227044398e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 490 }, { "completion_length": 256.0, "epoch": 4.126050420168067, "grad_norm": 0.00681652594357729, "kl": 0.002827833406627178, "learning_rate": 3.0112401296044756e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 491 }, { "completion_length": 52.5, "epoch": 4.1344537815126055, "grad_norm": 1.9969899654388428, "kl": 0.007922270335257053, "learning_rate": 3.002694802864912e-06, "loss": 0.0003, "reward": 0.029500000178813934, "reward_std": 0.13505738973617554, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.029500000178813934, "step": 492 }, { "completion_length": 51.0, "epoch": 4.142857142857143, "grad_norm": 2.0606791973114014, "kl": 0.02262522280216217, "learning_rate": 2.9941433509478157e-06, "loss": 0.0009, "reward": 0.04749999940395355, "reward_std": 0.10960154980421066, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04749999940395355, "step": 493 }, { "completion_length": 256.0, "epoch": 4.151260504201681, "grad_norm": 0.540189802646637, "kl": 0.015345863066613674, "learning_rate": 2.98558587804993e-06, "loss": 0.0006, "reward": -0.30250000953674316, "reward_std": 0.42779961228370667, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30250000953674316, "step": 494 }, { "completion_length": 256.0, "epoch": 4.159663865546219, "grad_norm": 0.5708104372024536, "kl": 0.011887428350746632, "learning_rate": 2.9770224884413625e-06, "loss": 0.0005, "reward": -0.359499990940094, "reward_std": 0.861963152885437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.359499990940094, "step": 495 }, { "completion_length": 256.0, "epoch": 4.168067226890757, "grad_norm": 0.5899463295936584, "kl": 0.0040596853941679, "learning_rate": 2.9684532864643123e-06, "loss": 0.0002, "reward": -0.43800002336502075, "reward_std": 0.7962023019790649, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43800002336502075, "step": 496 }, { "completion_length": 219.5, "epoch": 4.176470588235294, "grad_norm": 0.5904958844184875, "kl": 0.006381301209330559, "learning_rate": 2.9598783765318005e-06, "loss": 0.0003, "reward": -0.6360000371932983, "reward_std": 0.14707821607589722, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6360000371932983, "step": 497 }, { "completion_length": 192.0, "epoch": 4.184873949579832, "grad_norm": 0.8021066784858704, "kl": 0.008141201920807362, "learning_rate": 2.9512978631264006e-06, "loss": 0.0003, "reward": 0.25049999356269836, "reward_std": 0.17748379707336426, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25049999356269836, "step": 498 }, { "completion_length": 181.5, "epoch": 4.19327731092437, "grad_norm": 0.038161519914865494, "kl": 0.013759121298789978, "learning_rate": 2.942711850798959e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 499 }, { "completion_length": 256.0, "epoch": 4.201680672268908, "grad_norm": 0.5676648616790771, "kl": 0.0027714762836694717, "learning_rate": 2.9341204441673267e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 500 }, { "completion_length": 180.0, "epoch": 4.2100840336134455, "grad_norm": 0.9129006862640381, "kl": 0.00931185856461525, "learning_rate": 2.9255237479150815e-06, "loss": 0.0004, "reward": -0.5715000033378601, "reward_std": 0.1590990424156189, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5715000033378601, "step": 501 }, { "completion_length": 181.0, "epoch": 4.218487394957983, "grad_norm": 0.7892071604728699, "kl": 0.007072310894727707, "learning_rate": 2.9169218667902562e-06, "loss": 0.0003, "reward": -0.22100001573562622, "reward_std": 0.6675087809562683, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22100001573562622, "step": 502 }, { "completion_length": 256.0, "epoch": 4.226890756302521, "grad_norm": 0.008051041513681412, "kl": 0.002584027126431465, "learning_rate": 2.908314905604056e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 503 }, { "completion_length": 189.5, "epoch": 4.235294117647059, "grad_norm": 0.8589924573898315, "kl": 0.004768620245158672, "learning_rate": 2.8997029692295875e-06, "loss": 0.0002, "reward": -0.5295000076293945, "reward_std": 0.49285343289375305, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5295000076293945, "step": 504 }, { "completion_length": 256.0, "epoch": 4.243697478991597, "grad_norm": 0.013881309889256954, "kl": 0.003596859984099865, "learning_rate": 2.8910861626005774e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 505 }, { "completion_length": 256.0, "epoch": 4.2521008403361344, "grad_norm": 0.00672138761729002, "kl": 0.003465231042355299, "learning_rate": 2.8824645907100957e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 506 }, { "completion_length": 198.0, "epoch": 4.260504201680672, "grad_norm": 0.6329745650291443, "kl": 0.0028259491082280874, "learning_rate": 2.8738383586092745e-06, "loss": 0.0001, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 507 }, { "completion_length": 174.5, "epoch": 4.26890756302521, "grad_norm": 1.059665560722351, "kl": 0.004569211509078741, "learning_rate": 2.8652075714060296e-06, "loss": 0.0002, "reward": -0.028499998152256012, "reward_std": 0.21708177030086517, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.028499998152256012, "step": 508 }, { "completion_length": 256.0, "epoch": 4.277310924369748, "grad_norm": 0.5572677254676819, "kl": 0.01483019907027483, "learning_rate": 2.8565723342637797e-06, "loss": 0.0006, "reward": -0.7979999780654907, "reward_std": 0.07495332509279251, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7979999780654907, "step": 509 }, { "completion_length": 157.0, "epoch": 4.285714285714286, "grad_norm": 0.8018320798873901, "kl": 0.006128108128905296, "learning_rate": 2.847932752400164e-06, "loss": 0.0002, "reward": 0.04849999397993088, "reward_std": 0.28637823462486267, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04849999397993088, "step": 510 }, { "completion_length": 256.0, "epoch": 4.294117647058823, "grad_norm": 0.7099005579948425, "kl": 0.019516263157129288, "learning_rate": 2.8392889310857615e-06, "loss": 0.0008, "reward": -0.7315000295639038, "reward_std": 0.004949768073856831, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7315000295639038, "step": 511 }, { "completion_length": 256.0, "epoch": 4.302521008403361, "grad_norm": 0.014580655843019485, "kl": 0.005779143888503313, "learning_rate": 2.8306409756428067e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 512 }, { "completion_length": 202.5, "epoch": 4.310924369747899, "grad_norm": 0.5508143305778503, "kl": 0.007292843423783779, "learning_rate": 2.8219889914439073e-06, "loss": 0.0003, "reward": -0.2849999964237213, "reward_std": 0.7580184936523438, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2849999964237213, "step": 513 }, { "completion_length": 195.0, "epoch": 4.319327731092437, "grad_norm": 0.7418153882026672, "kl": 0.01197185181081295, "learning_rate": 2.813333083910761e-06, "loss": 0.0005, "reward": -0.25, "reward_std": 0.5303300619125366, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25, "step": 514 }, { "completion_length": 157.5, "epoch": 4.3277310924369745, "grad_norm": 0.7607116103172302, "kl": 0.01135020237416029, "learning_rate": 2.804673358512869e-06, "loss": 0.0005, "reward": -0.15450000762939453, "reward_std": 0.5734636187553406, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15450000762939453, "step": 515 }, { "completion_length": 256.0, "epoch": 4.336134453781512, "grad_norm": 0.008331523276865482, "kl": 0.0036601859610527754, "learning_rate": 2.7960099207662535e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 516 }, { "completion_length": 256.0, "epoch": 4.34453781512605, "grad_norm": 0.5470165610313416, "kl": 0.021114123985171318, "learning_rate": 2.7873428762321667e-06, "loss": 0.0008, "reward": -0.4320000112056732, "reward_std": 0.7877170443534851, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4320000112056732, "step": 517 }, { "completion_length": 256.0, "epoch": 4.352941176470588, "grad_norm": 0.006100235506892204, "kl": 0.00229791272431612, "learning_rate": 2.778672330515814e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 518 }, { "completion_length": 233.5, "epoch": 4.361344537815126, "grad_norm": 0.9826712608337402, "kl": 0.06345396488904953, "learning_rate": 2.769998389265057e-06, "loss": 0.0025, "reward": -0.7450000047683716, "reward_std": 0.09899494051933289, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7450000047683716, "step": 519 }, { "completion_length": 146.5, "epoch": 4.369747899159664, "grad_norm": 0.9437168836593628, "kl": 0.008699828758835793, "learning_rate": 2.761321158169134e-06, "loss": 0.0003, "reward": -0.19299998879432678, "reward_std": 0.4497199058532715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19299998879432678, "step": 520 }, { "completion_length": 256.0, "epoch": 4.378151260504202, "grad_norm": 0.006568343378603458, "kl": 0.006292395759373903, "learning_rate": 2.752640742957366e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 521 }, { "completion_length": 242.0, "epoch": 4.38655462184874, "grad_norm": 0.5708332657814026, "kl": 0.013393187895417213, "learning_rate": 2.743957249397874e-06, "loss": 0.0005, "reward": -0.6805000305175781, "reward_std": 0.027577146887779236, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6805000305175781, "step": 522 }, { "completion_length": 256.0, "epoch": 4.394957983193278, "grad_norm": 0.09955497831106186, "kl": 0.013105189427733421, "learning_rate": 2.7352707832962865e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 523 }, { "completion_length": 256.0, "epoch": 4.4033613445378155, "grad_norm": 0.008908540941774845, "kl": 0.006214002147316933, "learning_rate": 2.726581450494451e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 524 }, { "completion_length": 256.0, "epoch": 4.411764705882353, "grad_norm": 0.007475841790437698, "kl": 0.006965978071093559, "learning_rate": 2.717889356869146e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 525 }, { "completion_length": 256.0, "epoch": 4.420168067226891, "grad_norm": 0.012098866514861584, "kl": 0.0033868062309920788, "learning_rate": 2.70919460833079e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 526 }, { "completion_length": 256.0, "epoch": 4.428571428571429, "grad_norm": 0.004870280157774687, "kl": 0.0029334190767258406, "learning_rate": 2.700497310822147e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 527 }, { "completion_length": 238.0, "epoch": 4.436974789915967, "grad_norm": 0.6467201113700867, "kl": 0.002818512497469783, "learning_rate": 2.6917975703170466e-06, "loss": 0.0001, "reward": -0.3230000138282776, "reward_std": 0.6335676908493042, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3230000138282776, "step": 528 }, { "completion_length": 256.0, "epoch": 4.445378151260504, "grad_norm": 0.021824827417731285, "kl": 0.00567677803337574, "learning_rate": 2.6830954928190795e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 529 }, { "completion_length": 256.0, "epoch": 4.453781512605042, "grad_norm": 0.0046065435744822025, "kl": 0.002741157542914152, "learning_rate": 2.6743911843603134e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 530 }, { "completion_length": 225.5, "epoch": 4.46218487394958, "grad_norm": 0.7079954147338867, "kl": 0.07021471112966537, "learning_rate": 2.6656847510000013e-06, "loss": 0.0028, "reward": -0.5579999685287476, "reward_std": 0.06363959610462189, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5579999685287476, "step": 531 }, { "completion_length": 256.0, "epoch": 4.470588235294118, "grad_norm": 0.48517870903015137, "kl": 0.0027150376699864864, "learning_rate": 2.6569762988232838e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 532 }, { "completion_length": 256.0, "epoch": 4.4789915966386555, "grad_norm": 0.010521008633077145, "kl": 0.005374213680624962, "learning_rate": 2.6482659339399047e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 533 }, { "completion_length": 256.0, "epoch": 4.487394957983193, "grad_norm": 0.6420186161994934, "kl": 0.031904011964797974, "learning_rate": 2.63955376248291e-06, "loss": 0.0013, "reward": -0.7710000276565552, "reward_std": 0.13293609023094177, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7710000276565552, "step": 534 }, { "completion_length": 256.0, "epoch": 4.495798319327731, "grad_norm": 0.5174649953842163, "kl": 0.024672798812389374, "learning_rate": 2.6308398906073603e-06, "loss": 0.001, "reward": -0.48649999499320984, "reward_std": 0.6880149245262146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.48649999499320984, "step": 535 }, { "completion_length": 222.0, "epoch": 4.504201680672269, "grad_norm": 0.7617587447166443, "kl": 0.011859811842441559, "learning_rate": 2.6221244244890336e-06, "loss": 0.0005, "reward": -0.24399998784065247, "reward_std": 0.5218448042869568, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24399998784065247, "step": 536 }, { "completion_length": 256.0, "epoch": 4.512605042016807, "grad_norm": 0.02986934222280979, "kl": 0.005451165605336428, "learning_rate": 2.613407470323134e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 537 }, { "completion_length": 244.0, "epoch": 4.5210084033613445, "grad_norm": 0.5694108009338379, "kl": 0.013516446575522423, "learning_rate": 2.604689134322999e-06, "loss": 0.0005, "reward": -0.5170000195503235, "reward_std": 0.5402295589447021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5170000195503235, "step": 538 }, { "completion_length": 256.0, "epoch": 4.529411764705882, "grad_norm": 0.5198803544044495, "kl": 0.012814712710678577, "learning_rate": 2.5959695227188e-06, "loss": 0.0005, "reward": -0.4620000123977661, "reward_std": 0.830143392086029, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4620000123977661, "step": 539 }, { "completion_length": 199.5, "epoch": 4.53781512605042, "grad_norm": 0.7477623820304871, "kl": 0.0011860495433211327, "learning_rate": 2.587248741756253e-06, "loss": 0.0, "reward": -0.18199999630451202, "reward_std": 0.25738686323165894, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18199999630451202, "step": 540 }, { "completion_length": 256.0, "epoch": 4.546218487394958, "grad_norm": 0.018783127889037132, "kl": 0.00780004495754838, "learning_rate": 2.578526897695321e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 541 }, { "completion_length": 256.0, "epoch": 4.554621848739496, "grad_norm": 0.010993880219757557, "kl": 0.0034092948772013187, "learning_rate": 2.569804096808923e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 542 }, { "completion_length": 153.0, "epoch": 4.563025210084033, "grad_norm": 0.8213419318199158, "kl": 0.00616165017709136, "learning_rate": 2.5610804453816333e-06, "loss": 0.0002, "reward": -0.21900001168251038, "reward_std": 0.486489474773407, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21900001168251038, "step": 543 }, { "completion_length": 256.0, "epoch": 4.571428571428571, "grad_norm": 0.0030665313825011253, "kl": 0.0013890988193452358, "learning_rate": 2.5523560497083927e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 544 }, { "completion_length": 256.0, "epoch": 4.579831932773109, "grad_norm": 0.7139566540718079, "kl": 0.05103938281536102, "learning_rate": 2.543631016093209e-06, "loss": 0.002, "reward": -0.9599999785423279, "reward_std": 0.13152185082435608, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9599999785423279, "step": 545 }, { "completion_length": 256.0, "epoch": 4.588235294117647, "grad_norm": 0.0118913808837533, "kl": 0.003797386074438691, "learning_rate": 2.5349054508478636e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 546 }, { "completion_length": 256.0, "epoch": 4.5966386554621845, "grad_norm": 0.46333226561546326, "kl": 0.0070401267148554325, "learning_rate": 2.526179460290615e-06, "loss": 0.0003, "reward": -0.33399999141693115, "reward_std": 0.649124026298523, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33399999141693115, "step": 547 }, { "completion_length": 256.0, "epoch": 4.605042016806722, "grad_norm": 0.008937676437199116, "kl": 0.0028999093919992447, "learning_rate": 2.517453150744904e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 548 }, { "completion_length": 256.0, "epoch": 4.61344537815126, "grad_norm": 0.018326664343476295, "kl": 0.006212379317730665, "learning_rate": 2.5087266285380597e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 549 }, { "completion_length": 256.0, "epoch": 4.621848739495798, "grad_norm": 0.005397371482104063, "kl": 0.002161954762414098, "learning_rate": 2.5e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 550 }, { "completion_length": 233.5, "epoch": 4.630252100840336, "grad_norm": 0.4854866862297058, "kl": 0.008914257399737835, "learning_rate": 2.4912733714619415e-06, "loss": 0.0004, "reward": -0.34700000286102295, "reward_std": 0.7551900148391724, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34700000286102295, "step": 551 }, { "completion_length": 228.5, "epoch": 4.6386554621848735, "grad_norm": 0.528438925743103, "kl": 0.005926819983869791, "learning_rate": 2.482546849255096e-06, "loss": 0.0002, "reward": -0.3319999873638153, "reward_std": 0.6462955474853516, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3319999873638153, "step": 552 }, { "completion_length": 256.0, "epoch": 4.647058823529412, "grad_norm": 0.010168900713324547, "kl": 0.0038402751088142395, "learning_rate": 2.4738205397093863e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 553 }, { "completion_length": 256.0, "epoch": 4.65546218487395, "grad_norm": 0.5756246447563171, "kl": 0.00415441207587719, "learning_rate": 2.4650945491521372e-06, "loss": 0.0002, "reward": -0.6809999942779541, "reward_std": 0.022627420723438263, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6809999942779541, "step": 554 }, { "completion_length": 256.0, "epoch": 4.663865546218488, "grad_norm": 0.0077059464529156685, "kl": 0.002210419625043869, "learning_rate": 2.4563689839067913e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 555 }, { "completion_length": 256.0, "epoch": 4.6722689075630255, "grad_norm": 0.01575803942978382, "kl": 0.003592760767787695, "learning_rate": 2.447643950291608e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 556 }, { "completion_length": 256.0, "epoch": 4.680672268907563, "grad_norm": 0.5559489727020264, "kl": 0.002091438975185156, "learning_rate": 2.4389195546183676e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 557 }, { "completion_length": 168.0, "epoch": 4.689075630252101, "grad_norm": 1.5937094688415527, "kl": 0.034451764076948166, "learning_rate": 2.4301959031910785e-06, "loss": 0.0014, "reward": -0.43400001525878906, "reward_std": 0.7990306615829468, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43400001525878906, "step": 558 }, { "completion_length": 256.0, "epoch": 4.697478991596639, "grad_norm": 0.011952197179198265, "kl": 0.005816806107759476, "learning_rate": 2.4214731023046795e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 559 }, { "completion_length": 256.0, "epoch": 4.705882352941177, "grad_norm": 0.012767662294209003, "kl": 0.004122850019484758, "learning_rate": 2.4127512582437486e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 560 }, { "completion_length": 207.5, "epoch": 4.714285714285714, "grad_norm": 0.8387289047241211, "kl": 0.03393634036183357, "learning_rate": 2.4040304772812002e-06, "loss": 0.0014, "reward": -0.5824999809265137, "reward_std": 0.30476298928260803, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5824999809265137, "step": 561 }, { "completion_length": 256.0, "epoch": 4.722689075630252, "grad_norm": 0.01276953425258398, "kl": 0.0034609478898346424, "learning_rate": 2.3953108656770018e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 562 }, { "completion_length": 256.0, "epoch": 4.73109243697479, "grad_norm": 0.009723175317049026, "kl": 0.0030459724366664886, "learning_rate": 2.3865925296768658e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 563 }, { "completion_length": 127.0, "epoch": 4.739495798319328, "grad_norm": 1.307532548904419, "kl": 0.005245089530944824, "learning_rate": 2.377875575510967e-06, "loss": 0.0002, "reward": 0.2784999907016754, "reward_std": 0.03889087587594986, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2784999907016754, "step": 564 }, { "completion_length": 129.5, "epoch": 4.7478991596638656, "grad_norm": 15.24959945678711, "kl": 0.05322512239217758, "learning_rate": 2.3691601093926406e-06, "loss": 0.0021, "reward": -0.3855000138282776, "reward_std": 0.5451793670654297, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3855000138282776, "step": 565 }, { "completion_length": 222.5, "epoch": 4.756302521008403, "grad_norm": 0.763205885887146, "kl": 0.007272263988852501, "learning_rate": 2.3604462375170905e-06, "loss": 0.0003, "reward": -0.2084999978542328, "reward_std": 0.4716402292251587, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2084999978542328, "step": 566 }, { "completion_length": 256.0, "epoch": 4.764705882352941, "grad_norm": 0.4926777482032776, "kl": 0.0019459699979051948, "learning_rate": 2.3517340660600965e-06, "loss": 0.0001, "reward": -0.3499999940395355, "reward_std": 0.6717514395713806, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3499999940395355, "step": 567 }, { "completion_length": 256.0, "epoch": 4.773109243697479, "grad_norm": 0.012514820322394371, "kl": 0.00306853000074625, "learning_rate": 2.3430237011767166e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 568 }, { "completion_length": 227.5, "epoch": 4.781512605042017, "grad_norm": 0.6817553043365479, "kl": 0.0031129277776926756, "learning_rate": 2.3343152490000004e-06, "loss": 0.0001, "reward": -0.2775000035762787, "reward_std": 0.5692209601402283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2775000035762787, "step": 569 }, { "completion_length": 256.0, "epoch": 4.7899159663865545, "grad_norm": 0.02174249477684498, "kl": 0.0025351992808282375, "learning_rate": 2.325608815639687e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 570 }, { "completion_length": 256.0, "epoch": 4.798319327731092, "grad_norm": 0.009012720547616482, "kl": 0.0027656378224492073, "learning_rate": 2.3169045071809217e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 571 }, { "completion_length": 256.0, "epoch": 4.80672268907563, "grad_norm": 0.5388506650924683, "kl": 0.002020736690610647, "learning_rate": 2.3082024296829538e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 572 }, { "completion_length": 226.5, "epoch": 4.815126050420168, "grad_norm": 0.6150339245796204, "kl": 0.003654666244983673, "learning_rate": 2.2995026891778533e-06, "loss": 0.0001, "reward": -0.3384999930858612, "reward_std": 0.6554879546165466, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3384999930858612, "step": 573 }, { "completion_length": 252.0, "epoch": 4.823529411764706, "grad_norm": 0.5614678263664246, "kl": 0.0026605622842907906, "learning_rate": 2.290805391669212e-06, "loss": 0.0001, "reward": -0.6955000162124634, "reward_std": 0.09687364846467972, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6955000162124634, "step": 574 }, { "completion_length": 256.0, "epoch": 4.831932773109243, "grad_norm": 0.003448512637987733, "kl": 0.0015153777785599232, "learning_rate": 2.2821106431308546e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 575 }, { "completion_length": 256.0, "epoch": 4.840336134453781, "grad_norm": 0.010500079952180386, "kl": 0.0018715003971010447, "learning_rate": 2.2734185495055503e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 576 }, { "completion_length": 256.0, "epoch": 4.848739495798319, "grad_norm": 0.006332644261419773, "kl": 0.0014892284525558352, "learning_rate": 2.2647292167037143e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 577 }, { "completion_length": 209.0, "epoch": 4.857142857142857, "grad_norm": 0.507714569568634, "kl": 0.0017313383286818862, "learning_rate": 2.256042750602127e-06, "loss": 0.0001, "reward": -0.1379999965429306, "reward_std": 0.37193819880485535, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1379999965429306, "step": 578 }, { "completion_length": 155.0, "epoch": 4.865546218487395, "grad_norm": 1.1372926235198975, "kl": 0.007865375839173794, "learning_rate": 2.2473592570426343e-06, "loss": 0.0003, "reward": -0.37700000405311584, "reward_std": 0.709935188293457, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37700000405311584, "step": 579 }, { "completion_length": 254.5, "epoch": 4.873949579831933, "grad_norm": 0.570155918598175, "kl": 0.0029672663658857346, "learning_rate": 2.238678841830867e-06, "loss": 0.0001, "reward": -0.35600000619888306, "reward_std": 0.6802366971969604, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35600000619888306, "step": 580 }, { "completion_length": 256.0, "epoch": 4.882352941176471, "grad_norm": 0.03282388299703598, "kl": 0.0032851826399564743, "learning_rate": 2.230001610734943e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 581 }, { "completion_length": 256.0, "epoch": 4.890756302521009, "grad_norm": 0.4947333037853241, "kl": 0.004140259698033333, "learning_rate": 2.2213276694841866e-06, "loss": 0.0002, "reward": -0.4325000047683716, "reward_std": 0.7884240746498108, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4325000047683716, "step": 582 }, { "completion_length": 256.0, "epoch": 4.899159663865547, "grad_norm": 0.578187882900238, "kl": 0.005852097645401955, "learning_rate": 2.212657123767834e-06, "loss": 0.0002, "reward": -0.44300001859664917, "reward_std": 0.8032733201980591, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44300001859664917, "step": 583 }, { "completion_length": 256.0, "epoch": 4.907563025210084, "grad_norm": 0.005204138811677694, "kl": 0.0012599499896168709, "learning_rate": 2.2039900792337477e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 584 }, { "completion_length": 256.0, "epoch": 4.915966386554622, "grad_norm": 0.592668354511261, "kl": 0.003313436871394515, "learning_rate": 2.195326641487132e-06, "loss": 0.0001, "reward": -0.42649999260902405, "reward_std": 0.7799387574195862, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42649999260902405, "step": 585 }, { "completion_length": 256.0, "epoch": 4.92436974789916, "grad_norm": 0.0030842137057334185, "kl": 0.0014734728028997779, "learning_rate": 2.186666916089239e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 586 }, { "completion_length": 256.0, "epoch": 4.932773109243698, "grad_norm": 0.004968688357621431, "kl": 0.002114580012857914, "learning_rate": 2.1780110085560935e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 587 }, { "completion_length": 256.0, "epoch": 4.9411764705882355, "grad_norm": 0.004097595810890198, "kl": 0.0012440209975466132, "learning_rate": 2.1693590243571937e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 588 }, { "completion_length": 256.0, "epoch": 4.949579831932773, "grad_norm": 0.4121953845024109, "kl": 0.0027124364860355854, "learning_rate": 2.1607110689142393e-06, "loss": 0.0001, "reward": -0.3084999918937683, "reward_std": 0.6130615472793579, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3084999918937683, "step": 589 }, { "completion_length": 256.0, "epoch": 4.957983193277311, "grad_norm": 0.6206503510475159, "kl": 0.015319441445171833, "learning_rate": 2.1520672475998374e-06, "loss": 0.0006, "reward": -0.4480000138282776, "reward_std": 0.810344398021698, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4480000138282776, "step": 590 }, { "completion_length": 256.0, "epoch": 4.966386554621849, "grad_norm": 0.016505403444170952, "kl": 0.002997180912643671, "learning_rate": 2.143427665736221e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 591 }, { "completion_length": 256.0, "epoch": 4.974789915966387, "grad_norm": 0.00553141999989748, "kl": 0.001822871621698141, "learning_rate": 2.134792428593971e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 592 }, { "completion_length": 256.0, "epoch": 4.983193277310924, "grad_norm": 0.004651006776839495, "kl": 0.0014130686176940799, "learning_rate": 2.1261616413907267e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 593 }, { "completion_length": 256.0, "epoch": 4.991596638655462, "grad_norm": 0.015994032844901085, "kl": 0.0021564895287156105, "learning_rate": 2.117535409289905e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 594 }, { "completion_length": 256.0, "epoch": 5.0, "grad_norm": 0.6122642159461975, "kl": 0.02493193931877613, "learning_rate": 2.1089138373994226e-06, "loss": 0.001, "reward": -0.35899999737739563, "reward_std": 0.6844793558120728, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35899999737739563, "step": 595 }, { "completion_length": 256.0, "epoch": 5.008403361344538, "grad_norm": 0.5950356125831604, "kl": 0.006423520855605602, "learning_rate": 2.1002970307704134e-06, "loss": 0.0003, "reward": -0.9235000014305115, "reward_std": 0.31041988730430603, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9235000014305115, "step": 596 }, { "completion_length": 139.0, "epoch": 5.016806722689076, "grad_norm": 0.8671441674232483, "kl": 0.0031137759797275066, "learning_rate": 2.0916850943959453e-06, "loss": 0.0001, "reward": -0.24700000882148743, "reward_std": 0.10182338207960129, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24700000882148743, "step": 597 }, { "completion_length": 256.0, "epoch": 5.025210084033613, "grad_norm": 0.006848654244095087, "kl": 0.002302038948982954, "learning_rate": 2.0830781332097446e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 598 }, { "completion_length": 256.0, "epoch": 5.033613445378151, "grad_norm": 0.007818521931767464, "kl": 0.002072400413453579, "learning_rate": 2.0744762520849193e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 599 }, { "completion_length": 256.0, "epoch": 5.042016806722689, "grad_norm": 0.5224281549453735, "kl": 0.0019844090566039085, "learning_rate": 2.0658795558326745e-06, "loss": 0.0001, "reward": -0.32600000500679016, "reward_std": 0.6378102898597717, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32600000500679016, "step": 600 }, { "completion_length": 256.0, "epoch": 5.050420168067227, "grad_norm": 0.0074386028572916985, "kl": 0.0015872814692556858, "learning_rate": 2.0572881492010423e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 601 }, { "completion_length": 255.0, "epoch": 5.0588235294117645, "grad_norm": 0.5245713591575623, "kl": 0.004142221063375473, "learning_rate": 2.0487021368736002e-06, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 602 }, { "completion_length": 168.5, "epoch": 5.067226890756302, "grad_norm": 1.117664098739624, "kl": 0.0038326880894601345, "learning_rate": 2.0401216234682e-06, "loss": 0.0002, "reward": -0.4490000009536743, "reward_std": 0.6660946011543274, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4490000009536743, "step": 603 }, { "completion_length": 256.0, "epoch": 5.07563025210084, "grad_norm": 0.014528336934745312, "kl": 0.0029943487606942654, "learning_rate": 2.031546713535688e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 604 }, { "completion_length": 241.5, "epoch": 5.084033613445378, "grad_norm": 0.004044847097247839, "kl": 0.0013671403285115957, "learning_rate": 2.022977511558638e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 605 }, { "completion_length": 256.0, "epoch": 5.092436974789916, "grad_norm": 0.01155867800116539, "kl": 0.002078444231301546, "learning_rate": 2.0144141219500707e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 606 }, { "completion_length": 256.0, "epoch": 5.100840336134453, "grad_norm": 0.004797025583684444, "kl": 0.00226387195289135, "learning_rate": 2.0058566490521848e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 607 }, { "completion_length": 256.0, "epoch": 5.109243697478991, "grad_norm": 0.006030118092894554, "kl": 0.0012524562189355493, "learning_rate": 1.997305197135089e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 608 }, { "completion_length": 256.0, "epoch": 5.117647058823529, "grad_norm": 0.010605152696371078, "kl": 0.0023506400175392628, "learning_rate": 1.9887598703955244e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 609 }, { "completion_length": 256.0, "epoch": 5.126050420168067, "grad_norm": 0.5450811386108398, "kl": 0.0013597803190350533, "learning_rate": 1.9802207729556023e-06, "loss": 0.0001, "reward": -0.3244999945163727, "reward_std": 0.6356889605522156, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3244999945163727, "step": 610 }, { "completion_length": 241.5, "epoch": 5.1344537815126055, "grad_norm": 0.5708850622177124, "kl": 0.003428893629461527, "learning_rate": 1.971688008861529e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 611 }, { "completion_length": 226.0, "epoch": 5.142857142857143, "grad_norm": 0.008298719301819801, "kl": 0.0023929046001285315, "learning_rate": 1.963161682082342e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 612 }, { "completion_length": 256.0, "epoch": 5.151260504201681, "grad_norm": 0.010914776474237442, "kl": 0.0034842113964259624, "learning_rate": 1.9546418965086444e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 613 }, { "completion_length": 256.0, "epoch": 5.159663865546219, "grad_norm": 0.00912941712886095, "kl": 0.0018775518983602524, "learning_rate": 1.946128755951332e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 614 }, { "completion_length": 256.0, "epoch": 5.168067226890757, "grad_norm": 0.008125953376293182, "kl": 0.0020890063606202602, "learning_rate": 1.937622364140338e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 615 }, { "completion_length": 256.0, "epoch": 5.176470588235294, "grad_norm": 0.5825814008712769, "kl": 0.003179593477398157, "learning_rate": 1.9291228247233607e-06, "loss": 0.0001, "reward": -0.3375000059604645, "reward_std": 0.6540737748146057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3375000059604645, "step": 616 }, { "completion_length": 256.0, "epoch": 5.184873949579832, "grad_norm": 0.006850538309663534, "kl": 0.0014647808857262135, "learning_rate": 1.9206302412646074e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 617 }, { "completion_length": 256.0, "epoch": 5.19327731092437, "grad_norm": 0.0038992990739643574, "kl": 0.001176977064460516, "learning_rate": 1.912144717243525e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 618 }, { "completion_length": 256.0, "epoch": 5.201680672268908, "grad_norm": 0.014090382494032383, "kl": 0.0029204320162534714, "learning_rate": 1.9036663560535484e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 619 }, { "completion_length": 256.0, "epoch": 5.2100840336134455, "grad_norm": 0.5477524399757385, "kl": 0.0037951688282191753, "learning_rate": 1.895195261000831e-06, "loss": 0.0002, "reward": -0.30300000309944153, "reward_std": 0.6052833795547485, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30300000309944153, "step": 620 }, { "completion_length": 256.0, "epoch": 5.218487394957983, "grad_norm": 0.4071335792541504, "kl": 0.0034616105258464813, "learning_rate": 1.8867315353029937e-06, "loss": 0.0001, "reward": -0.6464999914169312, "reward_std": 0.04737617447972298, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6464999914169312, "step": 621 }, { "completion_length": 198.5, "epoch": 5.226890756302521, "grad_norm": 0.7220665812492371, "kl": 0.0020745659712702036, "learning_rate": 1.8782752820878636e-06, "loss": 0.0001, "reward": -0.351500004529953, "reward_std": 0.8520635962486267, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.351500004529953, "step": 622 }, { "completion_length": 256.0, "epoch": 5.235294117647059, "grad_norm": 0.5121845006942749, "kl": 0.0014675403945147991, "learning_rate": 1.8698266043922159e-06, "loss": 0.0001, "reward": -0.31200000643730164, "reward_std": 0.6180113554000854, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31200000643730164, "step": 623 }, { "completion_length": 256.0, "epoch": 5.243697478991597, "grad_norm": 0.006294391583651304, "kl": 0.001778844860382378, "learning_rate": 1.8613856051605242e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 624 }, { "completion_length": 256.0, "epoch": 5.2521008403361344, "grad_norm": 0.5268535614013672, "kl": 0.0026525366120040417, "learning_rate": 1.852952387243698e-06, "loss": 0.0001, "reward": -0.9919999837875366, "reward_std": 0.07495328038930893, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9919999837875366, "step": 625 }, { "completion_length": 256.0, "epoch": 5.260504201680672, "grad_norm": 0.6965689063072205, "kl": 0.0022122974041849375, "learning_rate": 1.8445270533978387e-06, "loss": 0.0001, "reward": -0.35749998688697815, "reward_std": 0.6823580265045166, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35749998688697815, "step": 626 }, { "completion_length": 256.0, "epoch": 5.26890756302521, "grad_norm": 0.00528307631611824, "kl": 0.001154672703705728, "learning_rate": 1.836109706282978e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 627 }, { "completion_length": 256.0, "epoch": 5.277310924369748, "grad_norm": 0.008031424134969711, "kl": 0.0014007267309352756, "learning_rate": 1.827700448461836e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 628 }, { "completion_length": 256.0, "epoch": 5.285714285714286, "grad_norm": 0.008406974375247955, "kl": 0.0024969615042209625, "learning_rate": 1.8192993823985643e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 629 }, { "completion_length": 256.0, "epoch": 5.294117647058823, "grad_norm": 0.006517631467431784, "kl": 0.0018244747770950198, "learning_rate": 1.8109066104575023e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 630 }, { "completion_length": 139.0, "epoch": 5.302521008403361, "grad_norm": 4.421841621398926, "kl": 0.06257651746273041, "learning_rate": 1.8025222349019273e-06, "loss": 0.0025, "reward": -0.19449999928474426, "reward_std": 0.5890199542045593, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19449999928474426, "step": 631 }, { "completion_length": 256.0, "epoch": 5.310924369747899, "grad_norm": 0.004162381868809462, "kl": 0.0012691657757386565, "learning_rate": 1.7941463578928088e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 632 }, { "completion_length": 256.0, "epoch": 5.319327731092437, "grad_norm": 0.6795893907546997, "kl": 0.0040610902942717075, "learning_rate": 1.7857790814875665e-06, "loss": 0.0002, "reward": -0.34700000286102295, "reward_std": 0.6675087809562683, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34700000286102295, "step": 633 }, { "completion_length": 246.5, "epoch": 5.3277310924369745, "grad_norm": 0.6703924536705017, "kl": 0.007514665834605694, "learning_rate": 1.7774205076388207e-06, "loss": 0.0003, "reward": -0.25600001215934753, "reward_std": 0.5388153791427612, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25600001215934753, "step": 634 }, { "completion_length": 256.0, "epoch": 5.336134453781512, "grad_norm": 0.6755226254463196, "kl": 0.012044209986925125, "learning_rate": 1.7690707381931585e-06, "loss": 0.0005, "reward": -0.9120000004768372, "reward_std": 0.007071061059832573, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9120000004768372, "step": 635 }, { "completion_length": 256.0, "epoch": 5.34453781512605, "grad_norm": 0.006453452631831169, "kl": 0.0018289199797436595, "learning_rate": 1.7607298748898844e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 636 }, { "completion_length": 242.0, "epoch": 5.352941176470588, "grad_norm": 0.7364490628242493, "kl": 0.008783997967839241, "learning_rate": 1.7523980193597837e-06, "loss": 0.0004, "reward": -0.3174999952316284, "reward_std": 0.6257895231246948, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3174999952316284, "step": 637 }, { "completion_length": 256.0, "epoch": 5.361344537815126, "grad_norm": 0.012246452271938324, "kl": 0.0018720311345532537, "learning_rate": 1.744075273123889e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 638 }, { "completion_length": 256.0, "epoch": 5.369747899159664, "grad_norm": 0.007929048500955105, "kl": 0.002464049030095339, "learning_rate": 1.735761737592236e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 639 }, { "completion_length": 256.0, "epoch": 5.378151260504202, "grad_norm": 0.002886024536564946, "kl": 0.0009612165158614516, "learning_rate": 1.7274575140626318e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 640 }, { "completion_length": 256.0, "epoch": 5.38655462184874, "grad_norm": 0.5241102576255798, "kl": 0.003354104468598962, "learning_rate": 1.7191627037194187e-06, "loss": 0.0001, "reward": -0.8579999804496765, "reward_std": 0.05232590436935425, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8579999804496765, "step": 641 }, { "completion_length": 256.0, "epoch": 5.394957983193278, "grad_norm": 0.4399980902671814, "kl": 0.004751862026751041, "learning_rate": 1.7108774076322443e-06, "loss": 0.0002, "reward": -0.21399998664855957, "reward_std": 0.4794183671474457, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21399998664855957, "step": 642 }, { "completion_length": 256.0, "epoch": 5.4033613445378155, "grad_norm": 0.5501796007156372, "kl": 0.0023467349819839, "learning_rate": 1.702601726754825e-06, "loss": 0.0001, "reward": -0.5584999918937683, "reward_std": 0.9666149616241455, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5584999918937683, "step": 643 }, { "completion_length": 186.5, "epoch": 5.411764705882353, "grad_norm": 0.7515340447425842, "kl": 0.005701517220586538, "learning_rate": 1.6943357619237227e-06, "loss": 0.0002, "reward": -0.14399999380111694, "reward_std": 0.5586143136024475, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14399999380111694, "step": 644 }, { "completion_length": 256.0, "epoch": 5.420168067226891, "grad_norm": 0.01598629169166088, "kl": 0.004346916917711496, "learning_rate": 1.686079613857109e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 645 }, { "completion_length": 256.0, "epoch": 5.428571428571429, "grad_norm": 0.0043999552726745605, "kl": 0.0015848834300413728, "learning_rate": 1.677833383153542e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 646 }, { "completion_length": 256.0, "epoch": 5.436974789915967, "grad_norm": 0.5508384108543396, "kl": 0.011822369880974293, "learning_rate": 1.6695971702907425e-06, "loss": 0.0005, "reward": -0.36399999260902405, "reward_std": 0.6915504336357117, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36399999260902405, "step": 647 }, { "completion_length": 157.0, "epoch": 5.445378151260504, "grad_norm": 0.12430807948112488, "kl": 0.011089656502008438, "learning_rate": 1.661371075624363e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 648 }, { "completion_length": 247.5, "epoch": 5.453781512605042, "grad_norm": 0.5922226309776306, "kl": 0.005108335986733437, "learning_rate": 1.6531551993867717e-06, "loss": 0.0002, "reward": -0.3619999885559082, "reward_std": 0.6887219548225403, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3619999885559082, "step": 649 }, { "completion_length": 256.0, "epoch": 5.46218487394958, "grad_norm": 0.019858427345752716, "kl": 0.004609990399330854, "learning_rate": 1.6449496416858285e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 650 }, { "completion_length": 149.5, "epoch": 5.470588235294118, "grad_norm": 0.876549243927002, "kl": 0.008264636620879173, "learning_rate": 1.6367545025036634e-06, "loss": 0.0003, "reward": -0.3790000081062317, "reward_std": 0.18384775519371033, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3790000081062317, "step": 651 }, { "completion_length": 56.0, "epoch": 5.4789915966386555, "grad_norm": 1.6507885456085205, "kl": 0.011998582631349564, "learning_rate": 1.6285698816954626e-06, "loss": 0.0005, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 652 }, { "completion_length": 98.0, "epoch": 5.487394957983193, "grad_norm": 2.379565715789795, "kl": 0.051044050604104996, "learning_rate": 1.6203958789882457e-06, "loss": 0.002, "reward": -0.11699999868869781, "reward_std": 0.2503158152103424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11699999868869781, "step": 653 }, { "completion_length": 256.0, "epoch": 5.495798319327731, "grad_norm": 0.6574161648750305, "kl": 0.005354372784495354, "learning_rate": 1.612232593979658e-06, "loss": 0.0002, "reward": -0.3334999978542328, "reward_std": 0.6484169363975525, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3334999978542328, "step": 654 }, { "completion_length": 256.0, "epoch": 5.504201680672269, "grad_norm": 0.4538904130458832, "kl": 0.0018920748261734843, "learning_rate": 1.6040801261367494e-06, "loss": 0.0001, "reward": -0.3409999907016754, "reward_std": 0.6590235233306885, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3409999907016754, "step": 655 }, { "completion_length": 216.5, "epoch": 5.512605042016807, "grad_norm": 0.8500025272369385, "kl": 0.007286286912858486, "learning_rate": 1.5959385747947697e-06, "loss": 0.0003, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 656 }, { "completion_length": 215.5, "epoch": 5.5210084033613445, "grad_norm": 0.5545524954795837, "kl": 0.0038545397110283375, "learning_rate": 1.5878080391559507e-06, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 657 }, { "completion_length": 256.0, "epoch": 5.529411764705882, "grad_norm": 0.012700149789452553, "kl": 0.002550204051658511, "learning_rate": 1.5796886182883053e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 658 }, { "completion_length": 256.0, "epoch": 5.53781512605042, "grad_norm": 0.012130150571465492, "kl": 0.004246685188263655, "learning_rate": 1.5715804111244138e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 659 }, { "completion_length": 187.0, "epoch": 5.546218487394958, "grad_norm": 0.03365745395421982, "kl": 0.00521458825096488, "learning_rate": 1.56348351646022e-06, "loss": 0.0002, "reward": 0.25099998712539673, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25099998712539673, "step": 660 }, { "completion_length": 245.0, "epoch": 5.554621848739496, "grad_norm": 0.5525546073913574, "kl": 0.0056045386008918285, "learning_rate": 1.5553980329538326e-06, "loss": 0.0002, "reward": -0.33550000190734863, "reward_std": 0.6512453556060791, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33550000190734863, "step": 661 }, { "completion_length": 233.0, "epoch": 5.563025210084033, "grad_norm": 0.6135679483413696, "kl": 0.014821572229266167, "learning_rate": 1.547324059124315e-06, "loss": 0.0006, "reward": -0.8330000042915344, "reward_std": 0.16122037172317505, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8330000042915344, "step": 662 }, { "completion_length": 256.0, "epoch": 5.571428571428571, "grad_norm": 0.6066795587539673, "kl": 0.005018861498683691, "learning_rate": 1.539261693350491e-06, "loss": 0.0002, "reward": -0.41499999165534973, "reward_std": 0.7636752724647522, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.41499999165534973, "step": 663 }, { "completion_length": 256.0, "epoch": 5.579831932773109, "grad_norm": 0.0032927177380770445, "kl": 0.0011893229093402624, "learning_rate": 1.5312110338697427e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 664 }, { "completion_length": 256.0, "epoch": 5.588235294117647, "grad_norm": 0.011188259348273277, "kl": 0.0032275754492729902, "learning_rate": 1.5231721787768162e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 665 }, { "completion_length": 256.0, "epoch": 5.5966386554621845, "grad_norm": 0.010389181785285473, "kl": 0.0034175002947449684, "learning_rate": 1.5151452260226224e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 666 }, { "completion_length": 256.0, "epoch": 5.605042016806722, "grad_norm": 0.008956738747656345, "kl": 0.003683708142489195, "learning_rate": 1.5071302734130488e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 667 }, { "completion_length": 256.0, "epoch": 5.61344537815126, "grad_norm": 0.010918783955276012, "kl": 0.002675094874575734, "learning_rate": 1.4991274186077632e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 668 }, { "completion_length": 182.5, "epoch": 5.621848739495798, "grad_norm": 0.9802809953689575, "kl": 0.010041233152151108, "learning_rate": 1.491136759119025e-06, "loss": 0.0004, "reward": -0.11550000309944153, "reward_std": 0.3401183485984802, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11550000309944153, "step": 669 }, { "completion_length": 256.0, "epoch": 5.630252100840336, "grad_norm": 0.005028929095715284, "kl": 0.001274311332963407, "learning_rate": 1.4831583923105e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 670 }, { "completion_length": 158.0, "epoch": 5.6386554621848735, "grad_norm": 1.168184757232666, "kl": 0.011130757629871368, "learning_rate": 1.4751924153960681e-06, "loss": 0.0004, "reward": -0.242000013589859, "reward_std": 0.5190163850784302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.242000013589859, "step": 671 }, { "completion_length": 256.0, "epoch": 5.647058823529412, "grad_norm": 0.004574757535010576, "kl": 0.0011274329153820872, "learning_rate": 1.467238925438646e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 672 }, { "completion_length": 256.0, "epoch": 5.65546218487395, "grad_norm": 1.5507034063339233, "kl": 0.08142305910587311, "learning_rate": 1.4592980193489975e-06, "loss": 0.0033, "reward": -0.781000018119812, "reward_std": 0.04242640733718872, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.781000018119812, "step": 673 }, { "completion_length": 256.0, "epoch": 5.663865546218488, "grad_norm": 0.931879460811615, "kl": 0.07405450940132141, "learning_rate": 1.4513697938845571e-06, "loss": 0.003, "reward": -0.3995000123977661, "reward_std": 0.9185317158699036, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3995000123977661, "step": 674 }, { "completion_length": 256.0, "epoch": 5.6722689075630255, "grad_norm": 0.012074096128344536, "kl": 0.0016379408771172166, "learning_rate": 1.443454345648252e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 675 }, { "completion_length": 74.5, "epoch": 5.680672268907563, "grad_norm": 2.315338134765625, "kl": 0.00615721195936203, "learning_rate": 1.4355517710873184e-06, "loss": 0.0002, "reward": -0.038999997079372406, "reward_std": 0.23193103075027466, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.038999997079372406, "step": 676 }, { "completion_length": 255.0, "epoch": 5.689075630252101, "grad_norm": 0.6039393544197083, "kl": 0.027395591139793396, "learning_rate": 1.4276621664921358e-06, "loss": 0.0011, "reward": -0.4649999737739563, "reward_std": 0.8343859910964966, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4649999737739563, "step": 677 }, { "completion_length": 256.0, "epoch": 5.697478991596639, "grad_norm": 0.0075947134755551815, "kl": 0.0025249538011848927, "learning_rate": 1.419785627995044e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 678 }, { "completion_length": 256.0, "epoch": 5.705882352941177, "grad_norm": 0.003034507157281041, "kl": 0.0010610503377392888, "learning_rate": 1.4119222515691817e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 679 }, { "completion_length": 256.0, "epoch": 5.714285714285714, "grad_norm": 0.6727738976478577, "kl": 0.0030765181872993708, "learning_rate": 1.4040721330273063e-06, "loss": 0.0001, "reward": -0.28349998593330383, "reward_std": 0.5777062177658081, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28349998593330383, "step": 680 }, { "completion_length": 110.0, "epoch": 5.722689075630252, "grad_norm": 1.5226813554763794, "kl": 0.02017168700695038, "learning_rate": 1.3962353680206372e-06, "loss": 0.0008, "reward": 0.12549999356269836, "reward_std": 0.17748379707336426, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12549999356269836, "step": 681 }, { "completion_length": 256.0, "epoch": 5.73109243697479, "grad_norm": 0.00672097597271204, "kl": 0.0018473025411367416, "learning_rate": 1.388412052037682e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 682 }, { "completion_length": 256.0, "epoch": 5.739495798319328, "grad_norm": 0.6360077261924744, "kl": 0.0024665307719260454, "learning_rate": 1.380602280403076e-06, "loss": 0.0001, "reward": -0.25200000405311584, "reward_std": 0.533158540725708, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25200000405311584, "step": 683 }, { "completion_length": 103.0, "epoch": 5.7478991596638656, "grad_norm": 1.7815911769866943, "kl": 0.015566820278763771, "learning_rate": 1.3728061482764238e-06, "loss": 0.0006, "reward": -0.20500001311302185, "reward_std": 0.4666905105113983, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20500001311302185, "step": 684 }, { "completion_length": 256.0, "epoch": 5.756302521008403, "grad_norm": 0.6177946329116821, "kl": 0.008974204771220684, "learning_rate": 1.3650237506511333e-06, "loss": 0.0004, "reward": -0.3070000112056732, "reward_std": 0.6109402775764465, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3070000112056732, "step": 685 }, { "completion_length": 224.5, "epoch": 5.764705882352941, "grad_norm": 0.6230205297470093, "kl": 0.0029948404990136623, "learning_rate": 1.3572551823532654e-06, "loss": 0.0001, "reward": -0.22949999570846558, "reward_std": 0.5013387203216553, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22949999570846558, "step": 686 }, { "completion_length": 167.0, "epoch": 5.773109243697479, "grad_norm": 0.6922351121902466, "kl": 0.03655628114938736, "learning_rate": 1.349500538040371e-06, "loss": 0.0015, "reward": -0.5164999961853027, "reward_std": 0.3358757197856903, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5164999961853027, "step": 687 }, { "completion_length": 137.5, "epoch": 5.781512605042017, "grad_norm": 0.8064650893211365, "kl": 0.003309395397081971, "learning_rate": 1.3417599122003464e-06, "loss": 0.0001, "reward": -0.07200001180171967, "reward_std": 0.4567910134792328, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07200001180171967, "step": 688 }, { "completion_length": 256.0, "epoch": 5.7899159663865545, "grad_norm": 0.009353609755635262, "kl": 0.002712616231292486, "learning_rate": 1.3340333991502723e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 689 }, { "completion_length": 223.5, "epoch": 5.798319327731092, "grad_norm": 0.6402768492698669, "kl": 0.007981081493198872, "learning_rate": 1.3263210930352737e-06, "loss": 0.0003, "reward": -0.0754999965429306, "reward_std": 0.28354981541633606, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0754999965429306, "step": 690 }, { "completion_length": 256.0, "epoch": 5.80672268907563, "grad_norm": 0.007863535545766354, "kl": 0.0034077505115419626, "learning_rate": 1.3186230878273654e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 691 }, { "completion_length": 256.0, "epoch": 5.815126050420168, "grad_norm": 0.4858497083187103, "kl": 0.003121765796095133, "learning_rate": 1.3109394773243117e-06, "loss": 0.0001, "reward": -0.296999990940094, "reward_std": 0.5967981219291687, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.296999990940094, "step": 692 }, { "completion_length": 241.5, "epoch": 5.823529411764706, "grad_norm": 0.5817645192146301, "kl": 0.006237810477614403, "learning_rate": 1.3032703551484832e-06, "loss": 0.0002, "reward": -0.17350000143051147, "reward_std": 0.42214274406433105, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17350000143051147, "step": 693 }, { "completion_length": 228.0, "epoch": 5.831932773109243, "grad_norm": 0.6947620511054993, "kl": 0.00694384565576911, "learning_rate": 1.2956158147457116e-06, "loss": 0.0003, "reward": -0.8945000171661377, "reward_std": 0.08414571732282639, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8945000171661377, "step": 694 }, { "completion_length": 256.0, "epoch": 5.840336134453781, "grad_norm": 0.021973401308059692, "kl": 0.005194809287786484, "learning_rate": 1.2879759493841577e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 695 }, { "completion_length": 155.5, "epoch": 5.848739495798319, "grad_norm": 0.712433397769928, "kl": 0.006424563936889172, "learning_rate": 1.280350852153168e-06, "loss": 0.0003, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 696 }, { "completion_length": 256.0, "epoch": 5.857142857142857, "grad_norm": 0.012401039712131023, "kl": 0.005644943565130234, "learning_rate": 1.272740615962148e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 697 }, { "completion_length": 149.0, "epoch": 5.865546218487395, "grad_norm": 0.009321320801973343, "kl": 0.0042940061539411545, "learning_rate": 1.2651453335394232e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 698 }, { "completion_length": 157.0, "epoch": 5.873949579831933, "grad_norm": 0.6894354224205017, "kl": 0.004654036834836006, "learning_rate": 1.2575650974311118e-06, "loss": 0.0002, "reward": -0.030500009655952454, "reward_std": 0.3981010913848877, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.030500009655952454, "step": 699 }, { "completion_length": 219.5, "epoch": 5.882352941176471, "grad_norm": 0.5325128436088562, "kl": 0.011226415634155273, "learning_rate": 1.2500000000000007e-06, "loss": 0.0004, "reward": -0.21050000190734863, "reward_std": 0.65265953540802, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21050000190734863, "step": 700 }, { "completion_length": 256.0, "epoch": 5.890756302521009, "grad_norm": 0.014634798280894756, "kl": 0.0048136357218027115, "learning_rate": 1.2424501334244124e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 701 }, { "completion_length": 182.5, "epoch": 5.899159663865547, "grad_norm": 0.7921615242958069, "kl": 0.03729894384741783, "learning_rate": 1.234915589697091e-06, "loss": 0.0015, "reward": -0.1574999988079071, "reward_std": 0.3995153307914734, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1574999988079071, "step": 702 }, { "completion_length": 256.0, "epoch": 5.907563025210084, "grad_norm": 0.008826153352856636, "kl": 0.004548550583422184, "learning_rate": 1.2273964606240718e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 703 }, { "completion_length": 256.0, "epoch": 5.915966386554622, "grad_norm": 0.5038639307022095, "kl": 0.011412066407501698, "learning_rate": 1.2198928378235717e-06, "loss": 0.0005, "reward": -0.3084999918937683, "reward_std": 0.6130615472793579, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3084999918937683, "step": 704 }, { "completion_length": 202.0, "epoch": 5.92436974789916, "grad_norm": 0.0173216313123703, "kl": 0.005657494068145752, "learning_rate": 1.2124048127248644e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 705 }, { "completion_length": 256.0, "epoch": 5.932773109243698, "grad_norm": 0.019604891538619995, "kl": 0.010336339473724365, "learning_rate": 1.204932476567175e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 706 }, { "completion_length": 182.0, "epoch": 5.9411764705882355, "grad_norm": 0.9846315383911133, "kl": 0.02360432781279087, "learning_rate": 1.19747592039856e-06, "loss": 0.0009, "reward": -0.5184999704360962, "reward_std": 0.5112382173538208, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5184999704360962, "step": 707 }, { "completion_length": 256.0, "epoch": 5.949579831932773, "grad_norm": 0.49586087465286255, "kl": 0.010822747834026814, "learning_rate": 1.1900352350748026e-06, "loss": 0.0004, "reward": -0.45649999380111694, "reward_std": 0.8223651647567749, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45649999380111694, "step": 708 }, { "completion_length": 256.0, "epoch": 5.957983193277311, "grad_norm": 0.5020821690559387, "kl": 0.011972316540777683, "learning_rate": 1.1826105112583061e-06, "loss": 0.0005, "reward": -0.4244999885559082, "reward_std": 0.7771103382110596, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4244999885559082, "step": 709 }, { "completion_length": 256.0, "epoch": 5.966386554621849, "grad_norm": 0.6312204003334045, "kl": 0.003103027120232582, "learning_rate": 1.1752018394169882e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 710 }, { "completion_length": 256.0, "epoch": 5.974789915966387, "grad_norm": 0.011970453895628452, "kl": 0.004626747686415911, "learning_rate": 1.1678093098231748e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 711 }, { "completion_length": 256.0, "epoch": 5.983193277310924, "grad_norm": 0.4105314314365387, "kl": 0.006742691621184349, "learning_rate": 1.160433012552508e-06, "loss": 0.0003, "reward": -0.2615000009536743, "reward_std": 0.5465935468673706, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2615000009536743, "step": 712 }, { "completion_length": 244.0, "epoch": 5.991596638655462, "grad_norm": 0.7031504511833191, "kl": 0.009322759695351124, "learning_rate": 1.1530730374828422e-06, "loss": 0.0004, "reward": -0.18150000274181366, "reward_std": 0.4334564805030823, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18150000274181366, "step": 713 }, { "completion_length": 122.0, "epoch": 6.0, "grad_norm": 2.9383797645568848, "kl": 0.013033779338002205, "learning_rate": 1.1457294742931508e-06, "loss": 0.0005, "reward": -0.1875, "reward_std": 0.4419417381286621, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1875, "step": 714 }, { "completion_length": 237.5, "epoch": 6.008403361344538, "grad_norm": 0.7259985208511353, "kl": 0.04795249551534653, "learning_rate": 1.1384024124624324e-06, "loss": 0.0019, "reward": -0.36250001192092896, "reward_std": 0.6894291639328003, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36250001192092896, "step": 715 }, { "completion_length": 120.5, "epoch": 6.016806722689076, "grad_norm": 1.2028225660324097, "kl": 0.022861093282699585, "learning_rate": 1.1310919412686248e-06, "loss": 0.0009, "reward": -0.26899999380111694, "reward_std": 0.26304373145103455, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26899999380111694, "step": 716 }, { "completion_length": 256.0, "epoch": 6.025210084033613, "grad_norm": 0.5726842880249023, "kl": 0.0036780391819775105, "learning_rate": 1.1237981497875112e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 717 }, { "completion_length": 256.0, "epoch": 6.033613445378151, "grad_norm": 0.00352221867069602, "kl": 0.001355575630441308, "learning_rate": 1.11652112689164e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 718 }, { "completion_length": 256.0, "epoch": 6.042016806722689, "grad_norm": 0.04709786921739578, "kl": 0.007329372689127922, "learning_rate": 1.109260961249238e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 719 }, { "completion_length": 246.5, "epoch": 6.050420168067227, "grad_norm": 0.44879260659217834, "kl": 0.004294008016586304, "learning_rate": 1.1020177413231334e-06, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 720 }, { "completion_length": 245.0, "epoch": 6.0588235294117645, "grad_norm": 0.505556046962738, "kl": 0.0029648493509739637, "learning_rate": 1.0947915553696742e-06, "loss": 0.0001, "reward": 0.06650000065565109, "reward_std": 0.08273149281740189, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06650000065565109, "step": 721 }, { "completion_length": 256.0, "epoch": 6.067226890756302, "grad_norm": 0.003361519891768694, "kl": 0.0013052797876298428, "learning_rate": 1.0875824914376555e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 722 }, { "completion_length": 256.0, "epoch": 6.07563025210084, "grad_norm": 0.006267227232456207, "kl": 0.003005072707310319, "learning_rate": 1.0803906373672477e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 723 }, { "completion_length": 237.0, "epoch": 6.084033613445378, "grad_norm": 0.562684953212738, "kl": 0.0018892684020102024, "learning_rate": 1.073216080788921e-06, "loss": 0.0001, "reward": -0.6165000200271606, "reward_std": 0.14495687186717987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6165000200271606, "step": 724 }, { "completion_length": 224.0, "epoch": 6.092436974789916, "grad_norm": 0.7824398279190063, "kl": 0.012675427831709385, "learning_rate": 1.0660589091223854e-06, "loss": 0.0005, "reward": -0.7565000057220459, "reward_std": 0.20718227326869965, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7565000057220459, "step": 725 }, { "completion_length": 256.0, "epoch": 6.100840336134453, "grad_norm": 0.48440858721733093, "kl": 0.0044737220741808414, "learning_rate": 1.0589192095755172e-06, "loss": 0.0002, "reward": -0.3610000014305115, "reward_std": 0.6873077750205994, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3610000014305115, "step": 726 }, { "completion_length": 256.0, "epoch": 6.109243697478991, "grad_norm": 0.013015295378863811, "kl": 0.0033608628436923027, "learning_rate": 1.0517970691433035e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 727 }, { "completion_length": 256.0, "epoch": 6.117647058823529, "grad_norm": 0.011052263900637627, "kl": 0.004382177256047726, "learning_rate": 1.0446925746067768e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 728 }, { "completion_length": 178.0, "epoch": 6.126050420168067, "grad_norm": 0.9416030645370483, "kl": 0.004531122278422117, "learning_rate": 1.0376058125319614e-06, "loss": 0.0002, "reward": -0.04150000214576721, "reward_std": 0.41365745663642883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04150000214576721, "step": 729 }, { "completion_length": 245.5, "epoch": 6.1344537815126055, "grad_norm": 0.571601390838623, "kl": 0.003431119956076145, "learning_rate": 1.0305368692688175e-06, "loss": 0.0001, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 730 }, { "completion_length": 256.0, "epoch": 6.142857142857143, "grad_norm": 0.006776020396500826, "kl": 0.0025139208883047104, "learning_rate": 1.0234858309501864e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 731 }, { "completion_length": 196.5, "epoch": 6.151260504201681, "grad_norm": 0.7153845429420471, "kl": 0.03527991473674774, "learning_rate": 1.0164527834907468e-06, "loss": 0.0014, "reward": -0.5770000219345093, "reward_std": 0.16263453662395477, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5770000219345093, "step": 732 }, { "completion_length": 256.0, "epoch": 6.159663865546219, "grad_norm": 0.837486982345581, "kl": 0.01189825776964426, "learning_rate": 1.0094378125859602e-06, "loss": 0.0005, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 733 }, { "completion_length": 231.5, "epoch": 6.168067226890757, "grad_norm": 0.6288163065910339, "kl": 0.00710428599268198, "learning_rate": 1.0024410037110358e-06, "loss": 0.0003, "reward": -0.2394999861717224, "reward_std": 0.5154808163642883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2394999861717224, "step": 734 }, { "completion_length": 256.0, "epoch": 6.176470588235294, "grad_norm": 0.6063850522041321, "kl": 0.00333959748968482, "learning_rate": 9.95462442119879e-07, "loss": 0.0001, "reward": -0.4465000033378601, "reward_std": 0.8082230687141418, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4465000033378601, "step": 735 }, { "completion_length": 256.0, "epoch": 6.184873949579832, "grad_norm": 0.01657797023653984, "kl": 0.006196672562509775, "learning_rate": 9.88502212844063e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 736 }, { "completion_length": 146.0, "epoch": 6.19327731092437, "grad_norm": 0.9086479544639587, "kl": 0.004014875739812851, "learning_rate": 9.815604006917839e-07, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 737 }, { "completion_length": 256.0, "epoch": 6.201680672268908, "grad_norm": 0.01317609567195177, "kl": 0.00501845870167017, "learning_rate": 9.746370902468311e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 738 }, { "completion_length": 199.0, "epoch": 6.2100840336134455, "grad_norm": 0.6969436407089233, "kl": 0.014748670160770416, "learning_rate": 9.677323658675594e-07, "loss": 0.0006, "reward": -0.140500009059906, "reward_std": 0.5536645650863647, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.140500009059906, "step": 739 }, { "completion_length": 221.0, "epoch": 6.218487394957983, "grad_norm": 0.5184924602508545, "kl": 0.002622560365125537, "learning_rate": 9.608463116858544e-07, "loss": 0.0001, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 740 }, { "completion_length": 256.0, "epoch": 6.226890756302521, "grad_norm": 0.009187981486320496, "kl": 0.0032067603897303343, "learning_rate": 9.53979011606115e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 741 }, { "completion_length": 256.0, "epoch": 6.235294117647059, "grad_norm": 0.008707324042916298, "kl": 0.00342814764007926, "learning_rate": 9.471305493042243e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 742 }, { "completion_length": 232.5, "epoch": 6.243697478991597, "grad_norm": 0.6686004996299744, "kl": 0.009194644168019295, "learning_rate": 9.403010082265351e-07, "loss": 0.0004, "reward": -0.14650000631809235, "reward_std": 0.38395896553993225, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14650000631809235, "step": 743 }, { "completion_length": 256.0, "epoch": 6.2521008403361344, "grad_norm": 0.021924443542957306, "kl": 0.012007695622742176, "learning_rate": 9.334904715888496e-07, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 744 }, { "completion_length": 256.0, "epoch": 6.260504201680672, "grad_norm": 0.002990507986396551, "kl": 0.0010730262147262692, "learning_rate": 9.266990223754069e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 745 }, { "completion_length": 256.0, "epoch": 6.26890756302521, "grad_norm": 0.6480677127838135, "kl": 0.004312966484576464, "learning_rate": 9.199267433378728e-07, "loss": 0.0002, "reward": -0.8769999742507935, "reward_std": 0.09050964564085007, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8769999742507935, "step": 746 }, { "completion_length": 256.0, "epoch": 6.277310924369748, "grad_norm": 0.010696593672037125, "kl": 0.004897059872746468, "learning_rate": 9.131737169943314e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 747 }, { "completion_length": 256.0, "epoch": 6.285714285714286, "grad_norm": 0.01653791218996048, "kl": 0.009470137767493725, "learning_rate": 9.064400256282757e-07, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 748 }, { "completion_length": 256.0, "epoch": 6.294117647058823, "grad_norm": 0.6533827185630798, "kl": 0.01002437248826027, "learning_rate": 8.99725751287611e-07, "loss": 0.0004, "reward": -0.47099998593330383, "reward_std": 0.6660946011543274, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.47099998593330383, "step": 749 }, { "completion_length": 256.0, "epoch": 6.302521008403361, "grad_norm": 0.008934236131608486, "kl": 0.0033149244263768196, "learning_rate": 8.930309757836517e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 750 }, { "completion_length": 139.0, "epoch": 6.310924369747899, "grad_norm": 1.1490325927734375, "kl": 0.020123077556490898, "learning_rate": 8.863557806901233e-07, "loss": 0.0008, "reward": -0.30799999833106995, "reward_std": 0.6123544573783875, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30799999833106995, "step": 751 }, { "completion_length": 149.0, "epoch": 6.319327731092437, "grad_norm": 0.03675248101353645, "kl": 0.007232178933918476, "learning_rate": 8.797002473421729e-07, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 752 }, { "completion_length": 256.0, "epoch": 6.3277310924369745, "grad_norm": 0.013182894326746464, "kl": 0.00494842603802681, "learning_rate": 8.73064456835373e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 753 }, { "completion_length": 256.0, "epoch": 6.336134453781512, "grad_norm": 0.014147670939564705, "kl": 0.0061243632808327675, "learning_rate": 8.664484900247363e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 754 }, { "completion_length": 256.0, "epoch": 6.34453781512605, "grad_norm": 0.006377843674272299, "kl": 0.0027168327942490578, "learning_rate": 8.598524275237321e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 755 }, { "completion_length": 256.0, "epoch": 6.352941176470588, "grad_norm": 0.008408810943365097, "kl": 0.003699813038110733, "learning_rate": 8.532763497032987e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 756 }, { "completion_length": 256.0, "epoch": 6.361344537815126, "grad_norm": 0.5117924213409424, "kl": 0.013309975154697895, "learning_rate": 8.467203366908708e-07, "loss": 0.0005, "reward": -0.2639999985694885, "reward_std": 0.5501290559768677, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2639999985694885, "step": 757 }, { "completion_length": 256.0, "epoch": 6.369747899159664, "grad_norm": 0.4230453073978424, "kl": 0.006381504237651825, "learning_rate": 8.40184468369396e-07, "loss": 0.0003, "reward": -0.3034999966621399, "reward_std": 0.605990469455719, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3034999966621399, "step": 758 }, { "completion_length": 256.0, "epoch": 6.378151260504202, "grad_norm": 0.004177461843937635, "kl": 0.0017452118918299675, "learning_rate": 8.336688243763691e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 759 }, { "completion_length": 256.0, "epoch": 6.38655462184874, "grad_norm": 0.003631402738392353, "kl": 0.0017506132135167718, "learning_rate": 8.271734841028553e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 760 }, { "completion_length": 256.0, "epoch": 6.394957983193278, "grad_norm": 0.011303781531751156, "kl": 0.006895197555422783, "learning_rate": 8.206985266925249e-07, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 761 }, { "completion_length": 256.0, "epoch": 6.4033613445378155, "grad_norm": 0.571503221988678, "kl": 0.04198170080780983, "learning_rate": 8.142440310406923e-07, "loss": 0.0017, "reward": -0.8504999876022339, "reward_std": 0.014849262312054634, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.8504999876022339, "step": 762 }, { "completion_length": 240.5, "epoch": 6.411764705882353, "grad_norm": 0.010639565996825695, "kl": 0.005928839556872845, "learning_rate": 8.078100757933486e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 763 }, { "completion_length": 213.5, "epoch": 6.420168067226891, "grad_norm": 3.4670283794403076, "kl": 0.19231735169887543, "learning_rate": 8.013967393462094e-07, "loss": 0.0077, "reward": -0.5099999904632568, "reward_std": 0.09758072346448898, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5099999904632568, "step": 764 }, { "completion_length": 223.0, "epoch": 6.428571428571429, "grad_norm": 0.5275664925575256, "kl": 0.006972578354179859, "learning_rate": 7.950040998437541e-07, "loss": 0.0003, "reward": -0.4440000057220459, "reward_std": 0.3040558993816376, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4440000057220459, "step": 765 }, { "completion_length": 256.0, "epoch": 6.436974789915967, "grad_norm": 0.524572491645813, "kl": 0.02105892449617386, "learning_rate": 7.886322351782782e-07, "loss": 0.0008, "reward": -0.3824999928474426, "reward_std": 0.7177133560180664, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3824999928474426, "step": 766 }, { "completion_length": 256.0, "epoch": 6.445378151260504, "grad_norm": 0.6218293905258179, "kl": 0.008284558542072773, "learning_rate": 7.822812229889429e-07, "loss": 0.0003, "reward": -0.3474999964237213, "reward_std": 0.6682159304618835, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3474999964237213, "step": 767 }, { "completion_length": 256.0, "epoch": 6.453781512605042, "grad_norm": 0.010237136855721474, "kl": 0.003328056540340185, "learning_rate": 7.759511406608255e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 768 }, { "completion_length": 256.0, "epoch": 6.46218487394958, "grad_norm": 0.509272038936615, "kl": 0.02391972206532955, "learning_rate": 7.696420653239834e-07, "loss": 0.001, "reward": -0.2549999952316284, "reward_std": 0.5374011397361755, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2549999952316284, "step": 769 }, { "completion_length": 256.0, "epoch": 6.470588235294118, "grad_norm": 0.509153425693512, "kl": 0.0027676376048475504, "learning_rate": 7.633540738525066e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 770 }, { "completion_length": 175.0, "epoch": 6.4789915966386555, "grad_norm": 0.8896048069000244, "kl": 0.005227989982813597, "learning_rate": 7.57087242863589e-07, "loss": 0.0002, "reward": 0.019499998539686203, "reward_std": 0.14919953048229218, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019499998539686203, "step": 771 }, { "completion_length": 157.5, "epoch": 6.487394957983193, "grad_norm": 0.9457676410675049, "kl": 0.009507875889539719, "learning_rate": 7.508416487165862e-07, "loss": 0.0004, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 772 }, { "completion_length": 256.0, "epoch": 6.495798319327731, "grad_norm": 0.006431178655475378, "kl": 0.0024382672272622585, "learning_rate": 7.44617367512094e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 773 }, { "completion_length": 256.0, "epoch": 6.504201680672269, "grad_norm": 0.008566629141569138, "kl": 0.004861902445554733, "learning_rate": 7.384144750910133e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 774 }, { "completion_length": 256.0, "epoch": 6.512605042016807, "grad_norm": 0.5371254086494446, "kl": 0.004006564617156982, "learning_rate": 7.322330470336314e-07, "loss": 0.0002, "reward": -0.45100000500679016, "reward_std": 0.6378102898597717, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45100000500679016, "step": 775 }, { "completion_length": 224.0, "epoch": 6.5210084033613445, "grad_norm": 0.6450855731964111, "kl": 0.008334334008395672, "learning_rate": 7.260731586586983e-07, "loss": 0.0003, "reward": -0.2290000021457672, "reward_std": 0.6788225173950195, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2290000021457672, "step": 776 }, { "completion_length": 256.0, "epoch": 6.529411764705882, "grad_norm": 0.541738748550415, "kl": 0.01820124313235283, "learning_rate": 7.199348850225091e-07, "loss": 0.0007, "reward": -0.320499986410141, "reward_std": 0.6300321221351624, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.320499986410141, "step": 777 }, { "completion_length": 256.0, "epoch": 6.53781512605042, "grad_norm": 0.649634838104248, "kl": 0.024110881611704826, "learning_rate": 7.138183009179922e-07, "loss": 0.001, "reward": -0.2709999978542328, "reward_std": 0.7368053197860718, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2709999978542328, "step": 778 }, { "completion_length": 50.5, "epoch": 6.546218487394958, "grad_norm": 2.506558418273926, "kl": 0.026342321187257767, "learning_rate": 7.077234808737932e-07, "loss": 0.0011, "reward": 0.04649999737739563, "reward_std": 0.11101576685905457, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04649999737739563, "step": 779 }, { "completion_length": 256.0, "epoch": 6.554621848739496, "grad_norm": 0.6712095141410828, "kl": 0.0030131973326206207, "learning_rate": 7.016504991533727e-07, "loss": 0.0001, "reward": -0.1599999964237213, "reward_std": 0.40305086970329285, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1599999964237213, "step": 780 }, { "completion_length": 155.5, "epoch": 6.563025210084033, "grad_norm": 1.2200813293457031, "kl": 0.06478404998779297, "learning_rate": 6.955994297540947e-07, "loss": 0.0026, "reward": -0.3070000112056732, "reward_std": 0.6109402775764465, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3070000112056732, "step": 781 }, { "completion_length": 179.5, "epoch": 6.571428571428571, "grad_norm": 0.6607686877250671, "kl": 0.02946791797876358, "learning_rate": 6.895703464063319e-07, "loss": 0.0012, "reward": -0.5464999675750732, "reward_std": 0.010606633499264717, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5464999675750732, "step": 782 }, { "completion_length": 150.0, "epoch": 6.579831932773109, "grad_norm": 0.8828802704811096, "kl": 0.011935793794691563, "learning_rate": 6.835633225725604e-07, "loss": 0.0005, "reward": -0.07250000536441803, "reward_std": 0.45749804377555847, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07250000536441803, "step": 783 }, { "completion_length": 256.0, "epoch": 6.588235294117647, "grad_norm": 0.004505544435232878, "kl": 0.0023014158941805363, "learning_rate": 6.775784314464717e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 784 }, { "completion_length": 256.0, "epoch": 6.5966386554621845, "grad_norm": 0.010366393253207207, "kl": 0.003555539296939969, "learning_rate": 6.716157459520739e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 785 }, { "completion_length": 256.0, "epoch": 6.605042016806722, "grad_norm": 0.556668221950531, "kl": 0.007973147556185722, "learning_rate": 6.656753387428089e-07, "loss": 0.0003, "reward": -0.3019999861717224, "reward_std": 0.4270924925804138, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3019999861717224, "step": 786 }, { "completion_length": 164.5, "epoch": 6.61344537815126, "grad_norm": 0.03622039407491684, "kl": 0.007887744344770908, "learning_rate": 6.597572822006643e-07, "loss": 0.0003, "reward": 0.25099998712539673, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25099998712539673, "step": 787 }, { "completion_length": 256.0, "epoch": 6.621848739495798, "grad_norm": 0.018309690058231354, "kl": 0.0033962568268179893, "learning_rate": 6.538616484352902e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 788 }, { "completion_length": 249.0, "epoch": 6.630252100840336, "grad_norm": 0.5331225991249084, "kl": 0.027490515261888504, "learning_rate": 6.479885092831251e-07, "loss": 0.0011, "reward": -0.35100001096725464, "reward_std": 0.6731656789779663, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35100001096725464, "step": 789 }, { "completion_length": 256.0, "epoch": 6.6386554621848735, "grad_norm": 0.009605168364942074, "kl": 0.004034414887428284, "learning_rate": 6.421379363065142e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 790 }, { "completion_length": 256.0, "epoch": 6.647058823529412, "grad_norm": 0.011906580068171024, "kl": 0.004115937277674675, "learning_rate": 6.363100007928447e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 791 }, { "completion_length": 256.0, "epoch": 6.65546218487395, "grad_norm": 0.4698775112628937, "kl": 0.0022636624053120613, "learning_rate": 6.305047737536707e-07, "loss": 0.0001, "reward": -0.2759999930858612, "reward_std": 0.5670996308326721, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2759999930858612, "step": 792 }, { "completion_length": 190.5, "epoch": 6.663865546218488, "grad_norm": 1.165305733680725, "kl": 0.0589740052819252, "learning_rate": 6.247223259238511e-07, "loss": 0.0024, "reward": -0.5444999933242798, "reward_std": 0.06293249130249023, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5444999933242798, "step": 793 }, { "completion_length": 256.0, "epoch": 6.6722689075630255, "grad_norm": 0.4969509541988373, "kl": 0.012743293307721615, "learning_rate": 6.189627277606894e-07, "loss": 0.0005, "reward": -1.0199999809265137, "reward_std": 0.0721248909831047, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -1.0199999809265137, "step": 794 }, { "completion_length": 99.0, "epoch": 6.680672268907563, "grad_norm": 6.010295391082764, "kl": 0.1238475814461708, "learning_rate": 6.1322604944307e-07, "loss": 0.005, "reward": 0.17249999940395355, "reward_std": 0.06717514246702194, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17249999940395355, "step": 795 }, { "completion_length": 256.0, "epoch": 6.689075630252101, "grad_norm": 0.008045363239943981, "kl": 0.0024950504302978516, "learning_rate": 6.075123608706093e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 796 }, { "completion_length": 256.0, "epoch": 6.697478991596639, "grad_norm": 0.01814122498035431, "kl": 0.004750444088131189, "learning_rate": 6.01821731662798e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 797 }, { "completion_length": 202.5, "epoch": 6.705882352941177, "grad_norm": 0.656705379486084, "kl": 0.007144136354327202, "learning_rate": 5.961542311581586e-07, "loss": 0.0003, "reward": -0.3154999911785126, "reward_std": 0.6229610443115234, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3154999911785126, "step": 798 }, { "completion_length": 256.0, "epoch": 6.714285714285714, "grad_norm": 0.011661670170724392, "kl": 0.003132264129817486, "learning_rate": 5.905099284133953e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 799 }, { "completion_length": 163.0, "epoch": 6.722689075630252, "grad_norm": 0.6220264434814453, "kl": 0.004242799244821072, "learning_rate": 5.848888922025553e-07, "loss": 0.0002, "reward": -0.0020000040531158447, "reward_std": 0.3577960133552551, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0020000040531158447, "step": 800 }, { "completion_length": 256.0, "epoch": 6.73109243697479, "grad_norm": 0.5267773866653442, "kl": 0.020459719002246857, "learning_rate": 5.792911910161922e-07, "loss": 0.0008, "reward": -0.3440000116825104, "reward_std": 0.6632661819458008, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3440000116825104, "step": 801 }, { "completion_length": 256.0, "epoch": 6.739495798319328, "grad_norm": 0.006925770081579685, "kl": 0.004409216810017824, "learning_rate": 5.737168930605272e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 802 }, { "completion_length": 256.0, "epoch": 6.7478991596638656, "grad_norm": 0.0067673311568796635, "kl": 0.002394068520516157, "learning_rate": 5.681660662566225e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 803 }, { "completion_length": 256.0, "epoch": 6.756302521008403, "grad_norm": 0.7026253342628479, "kl": 0.012659819796681404, "learning_rate": 5.626387782395512e-07, "loss": 0.0005, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 804 }, { "completion_length": 256.0, "epoch": 6.764705882352941, "grad_norm": 0.004658992867916822, "kl": 0.0025986130349338055, "learning_rate": 5.571350963575728e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 805 }, { "completion_length": 256.0, "epoch": 6.773109243697479, "grad_norm": 0.6520683169364929, "kl": 0.004111369140446186, "learning_rate": 5.516550876713142e-07, "loss": 0.0002, "reward": -0.40299999713897705, "reward_std": 0.7467047572135925, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40299999713897705, "step": 806 }, { "completion_length": 256.0, "epoch": 6.781512605042017, "grad_norm": 0.010695988312363625, "kl": 0.008855653926730156, "learning_rate": 5.461988189529529e-07, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 807 }, { "completion_length": 256.0, "epoch": 6.7899159663865545, "grad_norm": 0.019782986491918564, "kl": 0.00821752566844225, "learning_rate": 5.407663566854008e-07, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 808 }, { "completion_length": 203.0, "epoch": 6.798319327731092, "grad_norm": 0.5762162804603577, "kl": 0.008301862515509129, "learning_rate": 5.353577670614951e-07, "loss": 0.0003, "reward": -0.7250000238418579, "reward_std": 0.2121320366859436, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7250000238418579, "step": 809 }, { "completion_length": 256.0, "epoch": 6.80672268907563, "grad_norm": 0.5279088020324707, "kl": 0.003776387544348836, "learning_rate": 5.299731159831953e-07, "loss": 0.0002, "reward": -0.3294999897480011, "reward_std": 0.6427600383758545, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3294999897480011, "step": 810 }, { "completion_length": 256.0, "epoch": 6.815126050420168, "grad_norm": 0.5377076268196106, "kl": 0.002843294758349657, "learning_rate": 5.24612469060774e-07, "loss": 0.0001, "reward": -0.33899998664855957, "reward_std": 0.6561950445175171, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33899998664855957, "step": 811 }, { "completion_length": 256.0, "epoch": 6.823529411764706, "grad_norm": 0.565001904964447, "kl": 0.030575403943657875, "learning_rate": 5.192758916120236e-07, "loss": 0.0012, "reward": -0.21250000596046448, "reward_std": 0.4772970974445343, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21250000596046448, "step": 812 }, { "completion_length": 140.0, "epoch": 6.831932773109243, "grad_norm": 1.4325076341629028, "kl": 0.016184376552700996, "learning_rate": 5.139634486614544e-07, "loss": 0.0006, "reward": -0.18400000035762787, "reward_std": 0.2715289890766144, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18400000035762787, "step": 813 }, { "completion_length": 256.0, "epoch": 6.840336134453781, "grad_norm": 0.009593709371984005, "kl": 0.0035222882870584726, "learning_rate": 5.086752049395094e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 814 }, { "completion_length": 216.0, "epoch": 6.848739495798319, "grad_norm": 0.6310991048812866, "kl": 0.02070564590394497, "learning_rate": 5.034112248817685e-07, "loss": 0.0008, "reward": -0.7280000448226929, "reward_std": 0.253144234418869, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7280000448226929, "step": 815 }, { "completion_length": 256.0, "epoch": 6.857142857142857, "grad_norm": 0.6817052364349365, "kl": 0.024473652243614197, "learning_rate": 4.981715726281666e-07, "loss": 0.001, "reward": -0.3919999897480011, "reward_std": 0.731148362159729, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3919999897480011, "step": 816 }, { "completion_length": 256.0, "epoch": 6.865546218487395, "grad_norm": 0.5794046521186829, "kl": 0.002642120001837611, "learning_rate": 4.929563120222142e-07, "loss": 0.0001, "reward": -0.4599999785423279, "reward_std": 0.8273149132728577, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4599999785423279, "step": 817 }, { "completion_length": 256.0, "epoch": 6.873949579831933, "grad_norm": 0.011674066074192524, "kl": 0.0027495785616338253, "learning_rate": 4.87765506610215e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 818 }, { "completion_length": 253.0, "epoch": 6.882352941176471, "grad_norm": 5.304121494293213, "kl": 0.4329560697078705, "learning_rate": 4.825992196404958e-07, "loss": 0.0173, "reward": -0.484000027179718, "reward_std": 0.8612560629844666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.484000027179718, "step": 819 }, { "completion_length": 228.0, "epoch": 6.890756302521009, "grad_norm": 0.6880410313606262, "kl": 0.005567362532019615, "learning_rate": 4.774575140626317e-07, "loss": 0.0002, "reward": -0.7430000305175781, "reward_std": 0.2093035727739334, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7430000305175781, "step": 820 }, { "completion_length": 256.0, "epoch": 6.899159663865547, "grad_norm": 0.003968758974224329, "kl": 0.001497837365604937, "learning_rate": 4.7234045252668393e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 821 }, { "completion_length": 256.0, "epoch": 6.907563025210084, "grad_norm": 0.6042339205741882, "kl": 0.010911931283771992, "learning_rate": 4.672480973824312e-07, "loss": 0.0004, "reward": -0.3109999895095825, "reward_std": 0.6165971159934998, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3109999895095825, "step": 822 }, { "completion_length": 212.0, "epoch": 6.915966386554622, "grad_norm": 0.009960735216736794, "kl": 0.005593709647655487, "learning_rate": 4.6218051067861423e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 823 }, { "completion_length": 256.0, "epoch": 6.92436974789916, "grad_norm": 0.6370019316673279, "kl": 0.004327800590544939, "learning_rate": 4.5713775416217884e-07, "loss": 0.0002, "reward": -0.4180000126361847, "reward_std": 0.7679179310798645, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4180000126361847, "step": 824 }, { "completion_length": 44.5, "epoch": 6.932773109243698, "grad_norm": 0.06477323174476624, "kl": 0.028526708483695984, "learning_rate": 4.5211988927752026e-07, "loss": 0.0011, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 825 }, { "completion_length": 256.0, "epoch": 6.9411764705882355, "grad_norm": 0.5130132436752319, "kl": 0.0034560030326247215, "learning_rate": 4.4712697716573994e-07, "loss": 0.0001, "reward": -0.31150001287460327, "reward_std": 0.617304265499115, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31150001287460327, "step": 826 }, { "completion_length": 256.0, "epoch": 6.949579831932773, "grad_norm": 0.6017119288444519, "kl": 0.006125107407569885, "learning_rate": 4.421590786638952e-07, "loss": 0.0002, "reward": -0.3610000014305115, "reward_std": 0.6873077750205994, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3610000014305115, "step": 827 }, { "completion_length": 256.0, "epoch": 6.957983193277311, "grad_norm": 0.6031603217124939, "kl": 0.017231328412890434, "learning_rate": 4.372162543042624e-07, "loss": 0.0007, "reward": -0.38999998569488525, "reward_std": 0.7283200025558472, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.38999998569488525, "step": 828 }, { "completion_length": 192.0, "epoch": 6.966386554621849, "grad_norm": 0.7236344814300537, "kl": 0.0034490018151700497, "learning_rate": 4.3229856431359516e-07, "loss": 0.0001, "reward": -0.15850001573562622, "reward_std": 0.5791204571723938, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15850001573562622, "step": 829 }, { "completion_length": 256.0, "epoch": 6.974789915966387, "grad_norm": 0.012784859165549278, "kl": 0.003877812996506691, "learning_rate": 4.27406068612396e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 830 }, { "completion_length": 256.0, "epoch": 6.983193277310924, "grad_norm": 0.46145594120025635, "kl": 0.0019320343853905797, "learning_rate": 4.225388268141797e-07, "loss": 0.0001, "reward": -0.2809999883174896, "reward_std": 0.574170708656311, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2809999883174896, "step": 831 }, { "completion_length": 190.5, "epoch": 6.991596638655462, "grad_norm": 0.8973815441131592, "kl": 0.017684273421764374, "learning_rate": 4.1769689822475147e-07, "loss": 0.0007, "reward": -0.5580000281333923, "reward_std": 0.43274936079978943, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5580000281333923, "step": 832 }, { "completion_length": 57.5, "epoch": 7.0, "grad_norm": 1.8832446336746216, "kl": 0.012708373367786407, "learning_rate": 4.12880341841484e-07, "loss": 0.0005, "reward": 0.0364999920129776, "reward_std": 0.3033488094806671, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0364999920129776, "step": 833 }, { "completion_length": 231.5, "epoch": 7.008403361344538, "grad_norm": 0.6110868453979492, "kl": 0.009430555626749992, "learning_rate": 4.0808921635259595e-07, "loss": 0.0004, "reward": -0.18299999833106995, "reward_std": 0.43557778000831604, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18299999833106995, "step": 834 }, { "completion_length": 256.0, "epoch": 7.016806722689076, "grad_norm": 0.029916997998952866, "kl": 0.007502890191972256, "learning_rate": 4.033235801364402e-07, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 835 }, { "completion_length": 244.0, "epoch": 7.025210084033613, "grad_norm": 0.7315782308578491, "kl": 0.004216735251247883, "learning_rate": 3.9858349126078945e-07, "loss": 0.0002, "reward": -0.48350000381469727, "reward_std": 0.8605489730834961, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.48350000381469727, "step": 836 }, { "completion_length": 235.0, "epoch": 7.033613445378151, "grad_norm": 0.5841196179389954, "kl": 0.0414726696908474, "learning_rate": 3.938690074821314e-07, "loss": 0.0017, "reward": -0.31049999594688416, "reward_std": 0.6158900260925293, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31049999594688416, "step": 837 }, { "completion_length": 256.0, "epoch": 7.042016806722689, "grad_norm": 0.48704537749290466, "kl": 0.007111344486474991, "learning_rate": 3.891801862449629e-07, "loss": 0.0003, "reward": -0.3564999997615814, "reward_std": 0.6809438467025757, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3564999997615814, "step": 838 }, { "completion_length": 256.0, "epoch": 7.050420168067227, "grad_norm": 0.5021942853927612, "kl": 0.004938130732625723, "learning_rate": 3.8451708468109026e-07, "loss": 0.0002, "reward": -0.3569999933242798, "reward_std": 0.6816509366035461, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3569999933242798, "step": 839 }, { "completion_length": 94.0, "epoch": 7.0588235294117645, "grad_norm": 1.3161895275115967, "kl": 0.03398154303431511, "learning_rate": 3.798797596089351e-07, "loss": 0.0014, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 840 }, { "completion_length": 256.0, "epoch": 7.067226890756302, "grad_norm": 0.010297746397554874, "kl": 0.004153572488576174, "learning_rate": 3.7526826753284065e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 841 }, { "completion_length": 256.0, "epoch": 7.07563025210084, "grad_norm": 0.0044279214926064014, "kl": 0.002501305192708969, "learning_rate": 3.7068266464238085e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 842 }, { "completion_length": 256.0, "epoch": 7.084033613445378, "grad_norm": 0.010479076765477657, "kl": 0.004240111447870731, "learning_rate": 3.661230068116811e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 843 }, { "completion_length": 172.0, "epoch": 7.092436974789916, "grad_norm": 1.001501441001892, "kl": 0.006351079326122999, "learning_rate": 3.615893495987335e-07, "loss": 0.0003, "reward": -0.43549999594688416, "reward_std": 0.5918483138084412, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43549999594688416, "step": 844 }, { "completion_length": 256.0, "epoch": 7.100840336134453, "grad_norm": 0.0040421015582978725, "kl": 0.0020869565196335316, "learning_rate": 3.5708174824471947e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 845 }, { "completion_length": 256.0, "epoch": 7.109243697478991, "grad_norm": 0.00995827466249466, "kl": 0.004835959058254957, "learning_rate": 3.5260025767333894e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 846 }, { "completion_length": 256.0, "epoch": 7.117647058823529, "grad_norm": 0.4672341048717499, "kl": 0.004569198936223984, "learning_rate": 3.481449324901412e-07, "loss": 0.0002, "reward": -0.35199999809265137, "reward_std": 0.6745798587799072, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35199999809265137, "step": 847 }, { "completion_length": 233.5, "epoch": 7.126050420168067, "grad_norm": 0.6362568140029907, "kl": 0.00882687047123909, "learning_rate": 3.4371582698185636e-07, "loss": 0.0004, "reward": -0.6369999647140503, "reward_std": 0.04666907712817192, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6369999647140503, "step": 848 }, { "completion_length": 167.5, "epoch": 7.1344537815126055, "grad_norm": 0.8341497182846069, "kl": 0.04337848350405693, "learning_rate": 3.393129951157384e-07, "loss": 0.0017, "reward": -0.4334999918937683, "reward_std": 0.0898025631904602, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4334999918937683, "step": 849 }, { "completion_length": 163.0, "epoch": 7.142857142857143, "grad_norm": 0.8482690453529358, "kl": 0.008726051077246666, "learning_rate": 3.3493649053890325e-07, "loss": 0.0003, "reward": -0.1054999977350235, "reward_std": 0.3259762227535248, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1054999977350235, "step": 850 }, { "completion_length": 256.0, "epoch": 7.151260504201681, "grad_norm": 0.4721309244632721, "kl": 0.01990387588739395, "learning_rate": 3.3058636657767927e-07, "loss": 0.0008, "reward": -0.2394999861717224, "reward_std": 0.5154808163642883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2394999861717224, "step": 851 }, { "completion_length": 163.5, "epoch": 7.159663865546219, "grad_norm": 1.177316427230835, "kl": 0.011796743609011173, "learning_rate": 3.262626762369525e-07, "loss": 0.0005, "reward": -0.2770000100135803, "reward_std": 0.5685138702392578, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2770000100135803, "step": 852 }, { "completion_length": 256.0, "epoch": 7.168067226890757, "grad_norm": 0.012400349602103233, "kl": 0.003964672796428204, "learning_rate": 3.219654721995266e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 853 }, { "completion_length": 166.0, "epoch": 7.176470588235294, "grad_norm": 0.7118536829948425, "kl": 0.008020586334168911, "learning_rate": 3.176948068254762e-07, "loss": 0.0003, "reward": -0.29750001430511475, "reward_std": 0.07283199578523636, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.29750001430511475, "step": 854 }, { "completion_length": 157.5, "epoch": 7.184873949579832, "grad_norm": 0.6159313917160034, "kl": 0.009370331652462482, "learning_rate": 3.134507321515107e-07, "loss": 0.0004, "reward": -0.44050002098083496, "reward_std": 0.0007071187719702721, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44050002098083496, "step": 855 }, { "completion_length": 256.0, "epoch": 7.19327731092437, "grad_norm": 0.6859822273254395, "kl": 0.0030977455899119377, "learning_rate": 3.092332998903416e-07, "loss": 0.0001, "reward": -0.3095000088214874, "reward_std": 0.6144757866859436, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3095000088214874, "step": 856 }, { "completion_length": 256.0, "epoch": 7.201680672268908, "grad_norm": 0.5166124701499939, "kl": 0.0036242003552615643, "learning_rate": 3.050425614300487e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 857 }, { "completion_length": 256.0, "epoch": 7.2100840336134455, "grad_norm": 0.018992919474840164, "kl": 0.007829979993402958, "learning_rate": 3.0087856783345916e-07, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 858 }, { "completion_length": 256.0, "epoch": 7.218487394957983, "grad_norm": 0.07560814172029495, "kl": 0.008474218659102917, "learning_rate": 2.967413698375196e-07, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 859 }, { "completion_length": 256.0, "epoch": 7.226890756302521, "grad_norm": 0.016609979793429375, "kl": 0.005581196863204241, "learning_rate": 2.9263101785268253e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 860 }, { "completion_length": 234.5, "epoch": 7.235294117647059, "grad_norm": 0.975088357925415, "kl": 0.015155334025621414, "learning_rate": 2.8854756196229017e-07, "loss": 0.0006, "reward": -0.7424999475479126, "reward_std": 0.10535889863967896, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7424999475479126, "step": 861 }, { "completion_length": 256.0, "epoch": 7.243697478991597, "grad_norm": 0.8236193656921387, "kl": 0.005577279254794121, "learning_rate": 2.844910519219632e-07, "loss": 0.0002, "reward": -0.28999999165534973, "reward_std": 0.5868986248970032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28999999165534973, "step": 862 }, { "completion_length": 256.0, "epoch": 7.2521008403361344, "grad_norm": 0.015439441427588463, "kl": 0.003621380077674985, "learning_rate": 2.8046153715899695e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 863 }, { "completion_length": 256.0, "epoch": 7.260504201680672, "grad_norm": 0.011661780066788197, "kl": 0.0027811862528324127, "learning_rate": 2.764590667717562e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 864 }, { "completion_length": 256.0, "epoch": 7.26890756302521, "grad_norm": 0.4703422486782074, "kl": 0.0032042653765529394, "learning_rate": 2.7248368952908055e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 865 }, { "completion_length": 241.0, "epoch": 7.277310924369748, "grad_norm": 0.6923658847808838, "kl": 0.011267962865531445, "learning_rate": 2.6853545386968607e-07, "loss": 0.0005, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 866 }, { "completion_length": 235.5, "epoch": 7.285714285714286, "grad_norm": 0.6271227598190308, "kl": 0.002097474178299308, "learning_rate": 2.6461440790157974e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 867 }, { "completion_length": 256.0, "epoch": 7.294117647058823, "grad_norm": 0.007775813806802034, "kl": 0.0018853155197575688, "learning_rate": 2.6072059940146775e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 868 }, { "completion_length": 256.0, "epoch": 7.302521008403361, "grad_norm": 0.007238820660859346, "kl": 0.003193457843735814, "learning_rate": 2.568540758141791e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 869 }, { "completion_length": 256.0, "epoch": 7.310924369747899, "grad_norm": 0.004498471040278673, "kl": 0.001412153709679842, "learning_rate": 2.53014884252083e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 870 }, { "completion_length": 256.0, "epoch": 7.319327731092437, "grad_norm": 0.005220589227974415, "kl": 0.0012616411549970508, "learning_rate": 2.492030714945162e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 871 }, { "completion_length": 212.0, "epoch": 7.3277310924369745, "grad_norm": 0.008997468277812004, "kl": 0.003111438825726509, "learning_rate": 2.454186839872158e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 872 }, { "completion_length": 246.0, "epoch": 7.336134453781512, "grad_norm": 0.5159225463867188, "kl": 0.007171745877712965, "learning_rate": 2.4166176784174795e-07, "loss": 0.0003, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 873 }, { "completion_length": 256.0, "epoch": 7.34453781512605, "grad_norm": 0.005670048296451569, "kl": 0.002156604314222932, "learning_rate": 2.3793236883495164e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 874 }, { "completion_length": 131.5, "epoch": 7.352941176470588, "grad_norm": 0.9532727599143982, "kl": 0.0062611764296889305, "learning_rate": 2.3423053240837518e-07, "loss": 0.0003, "reward": -0.296999990940094, "reward_std": 0.23900209367275238, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.296999990940094, "step": 875 }, { "completion_length": 256.0, "epoch": 7.361344537815126, "grad_norm": 0.003809628775343299, "kl": 0.0012634317390620708, "learning_rate": 2.3055630366772857e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 876 }, { "completion_length": 256.0, "epoch": 7.369747899159664, "grad_norm": 0.005012346897274256, "kl": 0.0016640073154121637, "learning_rate": 2.269097273823287e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 877 }, { "completion_length": 160.0, "epoch": 7.378151260504202, "grad_norm": 1.2174485921859741, "kl": 0.008502379059791565, "learning_rate": 2.2329084798455747e-07, "loss": 0.0003, "reward": -0.3865000009536743, "reward_std": 0.6144757866859436, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3865000009536743, "step": 878 }, { "completion_length": 256.0, "epoch": 7.38655462184874, "grad_norm": 0.008353069424629211, "kl": 0.002911319024860859, "learning_rate": 2.1969970956931762e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 879 }, { "completion_length": 256.0, "epoch": 7.394957983193278, "grad_norm": 0.01508881151676178, "kl": 0.003996263723820448, "learning_rate": 2.1613635589349756e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 880 }, { "completion_length": 256.0, "epoch": 7.4033613445378155, "grad_norm": 0.0064295330084860325, "kl": 0.0021054691169410944, "learning_rate": 2.1260083037543817e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 881 }, { "completion_length": 235.5, "epoch": 7.411764705882353, "grad_norm": 0.0056160870008170605, "kl": 0.0026067430153489113, "learning_rate": 2.0909317609440093e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 882 }, { "completion_length": 256.0, "epoch": 7.420168067226891, "grad_norm": 0.0077072326093912125, "kl": 0.0034582684747874737, "learning_rate": 2.0561343579004716e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 883 }, { "completion_length": 256.0, "epoch": 7.428571428571429, "grad_norm": 0.5656645894050598, "kl": 0.003780920058488846, "learning_rate": 2.0216165186191406e-07, "loss": 0.0002, "reward": -0.42800000309944153, "reward_std": 0.7820600271224976, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42800000309944153, "step": 884 }, { "completion_length": 256.0, "epoch": 7.436974789915967, "grad_norm": 0.5816947817802429, "kl": 0.043425820767879486, "learning_rate": 1.9873786636889908e-07, "loss": 0.0017, "reward": -0.43949997425079346, "reward_std": 0.7983235120773315, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43949997425079346, "step": 885 }, { "completion_length": 256.0, "epoch": 7.445378151260504, "grad_norm": 0.6333321928977966, "kl": 0.03733671084046364, "learning_rate": 1.95342121028749e-07, "loss": 0.0015, "reward": -0.6729999780654907, "reward_std": 0.06081119924783707, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6729999780654907, "step": 886 }, { "completion_length": 256.0, "epoch": 7.453781512605042, "grad_norm": 0.6152923107147217, "kl": 0.005459221079945564, "learning_rate": 1.9197445721754777e-07, "loss": 0.0002, "reward": -0.26899999380111694, "reward_std": 0.5572001338005066, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26899999380111694, "step": 887 }, { "completion_length": 256.0, "epoch": 7.46218487394958, "grad_norm": 0.008185365237295628, "kl": 0.002387643326073885, "learning_rate": 1.8863491596921745e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 888 }, { "completion_length": 256.0, "epoch": 7.470588235294118, "grad_norm": 0.009611280634999275, "kl": 0.0037066475488245487, "learning_rate": 1.8532353797501318e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 889 }, { "completion_length": 256.0, "epoch": 7.4789915966386555, "grad_norm": 0.5290879607200623, "kl": 0.004673336166888475, "learning_rate": 1.8204036358303173e-07, "loss": 0.0002, "reward": -0.32100000977516174, "reward_std": 0.6307392716407776, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32100000977516174, "step": 890 }, { "completion_length": 200.0, "epoch": 7.487394957983193, "grad_norm": 1.0965220928192139, "kl": 0.006951375864446163, "learning_rate": 1.787854327977162e-07, "loss": 0.0003, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 891 }, { "completion_length": 203.0, "epoch": 7.495798319327731, "grad_norm": 0.7327089309692383, "kl": 0.009635976515710354, "learning_rate": 1.7555878527937164e-07, "loss": 0.0004, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 892 }, { "completion_length": 256.0, "epoch": 7.504201680672269, "grad_norm": 0.0030513787642121315, "kl": 0.0011630826629698277, "learning_rate": 1.7236046034367959e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 893 }, { "completion_length": 256.0, "epoch": 7.512605042016807, "grad_norm": 0.010899591259658337, "kl": 0.003470415947958827, "learning_rate": 1.6919049696121957e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 894 }, { "completion_length": 256.0, "epoch": 7.5210084033613445, "grad_norm": 0.012333949096500874, "kl": 0.003963698633015156, "learning_rate": 1.6604893375699594e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 895 }, { "completion_length": 195.0, "epoch": 7.529411764705882, "grad_norm": 0.6648133397102356, "kl": 0.011089630424976349, "learning_rate": 1.629358090099639e-07, "loss": 0.0004, "reward": -0.42500001192092896, "reward_std": 0.15839192271232605, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42500001192092896, "step": 896 }, { "completion_length": 178.5, "epoch": 7.53781512605042, "grad_norm": 0.8782094717025757, "kl": 0.021921800449490547, "learning_rate": 1.5985116065256683e-07, "loss": 0.0009, "reward": -0.2580000162124634, "reward_std": 0.7198346853256226, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2580000162124634, "step": 897 }, { "completion_length": 231.5, "epoch": 7.546218487394958, "grad_norm": 0.6536224484443665, "kl": 0.005121362395584583, "learning_rate": 1.567950262702714e-07, "loss": 0.0002, "reward": -0.289000004529953, "reward_std": 0.5854844450950623, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.289000004529953, "step": 898 }, { "completion_length": 256.0, "epoch": 7.554621848739496, "grad_norm": 0.010603158734738827, "kl": 0.004285288974642754, "learning_rate": 1.5376744310111019e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 899 }, { "completion_length": 236.5, "epoch": 7.563025210084033, "grad_norm": 0.009380782954394817, "kl": 0.0045014675706624985, "learning_rate": 1.507684480352292e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 900 }, { "completion_length": 256.0, "epoch": 7.571428571428571, "grad_norm": 0.5132765769958496, "kl": 0.0019136006012558937, "learning_rate": 1.4779807761443638e-07, "loss": 0.0001, "reward": -0.3569999933242798, "reward_std": 0.6816509366035461, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3569999933242798, "step": 901 }, { "completion_length": 256.0, "epoch": 7.579831932773109, "grad_norm": 0.009072122164070606, "kl": 0.004538253415375948, "learning_rate": 1.4485636803175828e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 902 }, { "completion_length": 256.0, "epoch": 7.588235294117647, "grad_norm": 0.606788694858551, "kl": 0.005488838069140911, "learning_rate": 1.419433551309976e-07, "loss": 0.0002, "reward": -0.3319999873638153, "reward_std": 0.6462955474853516, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3319999873638153, "step": 903 }, { "completion_length": 256.0, "epoch": 7.5966386554621845, "grad_norm": 0.5269256234169006, "kl": 0.0028442475013434887, "learning_rate": 1.3905907440629752e-07, "loss": 0.0001, "reward": -0.3580000102519989, "reward_std": 0.6830651760101318, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3580000102519989, "step": 904 }, { "completion_length": 252.5, "epoch": 7.605042016806722, "grad_norm": 0.0063902847468853, "kl": 0.003648000303655863, "learning_rate": 1.362035610017079e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 905 }, { "completion_length": 256.0, "epoch": 7.61344537815126, "grad_norm": 0.5393877029418945, "kl": 0.004754668101668358, "learning_rate": 1.3337684971075932e-07, "loss": 0.0002, "reward": -0.2529999911785126, "reward_std": 0.5345727205276489, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2529999911785126, "step": 906 }, { "completion_length": 147.5, "epoch": 7.621848739495798, "grad_norm": 1.0672441720962524, "kl": 0.007533932104706764, "learning_rate": 1.305789749760361e-07, "loss": 0.0003, "reward": -0.015000000596046448, "reward_std": 0.3761807680130005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.015000000596046448, "step": 907 }, { "completion_length": 256.0, "epoch": 7.630252100840336, "grad_norm": 0.5441468358039856, "kl": 0.002529418095946312, "learning_rate": 1.278099708887587e-07, "loss": 0.0001, "reward": -0.3154999911785126, "reward_std": 0.6229610443115234, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3154999911785126, "step": 908 }, { "completion_length": 256.0, "epoch": 7.6386554621848735, "grad_norm": 0.5252127051353455, "kl": 0.0020951409824192524, "learning_rate": 1.2506987118836912e-07, "loss": 0.0001, "reward": -0.4059999883174896, "reward_std": 0.7509474158287048, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4059999883174896, "step": 909 }, { "completion_length": 192.0, "epoch": 7.647058823529412, "grad_norm": 0.8923895955085754, "kl": 0.04129071906208992, "learning_rate": 1.223587092621162e-07, "loss": 0.0017, "reward": -0.1459999978542328, "reward_std": 0.5614427328109741, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1459999978542328, "step": 910 }, { "completion_length": 194.5, "epoch": 7.65546218487395, "grad_norm": 0.7333325147628784, "kl": 0.011591121554374695, "learning_rate": 1.1967651814465353e-07, "loss": 0.0005, "reward": -0.11100000143051147, "reward_std": 0.33375439047813416, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11100000143051147, "step": 911 }, { "completion_length": 256.0, "epoch": 7.663865546218488, "grad_norm": 0.5285476446151733, "kl": 0.001850559958256781, "learning_rate": 1.1702333051763271e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 912 }, { "completion_length": 256.0, "epoch": 7.6722689075630255, "grad_norm": 0.006056373007595539, "kl": 0.00204530730843544, "learning_rate": 1.1439917870930795e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 913 }, { "completion_length": 256.0, "epoch": 7.680672268907563, "grad_norm": 0.014933819882571697, "kl": 0.0025124228559434414, "learning_rate": 1.1180409469414094e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 914 }, { "completion_length": 256.0, "epoch": 7.689075630252101, "grad_norm": 0.008476823568344116, "kl": 0.0024698949418962, "learning_rate": 1.0923811009241142e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 915 }, { "completion_length": 239.0, "epoch": 7.697478991596639, "grad_norm": 0.574554443359375, "kl": 0.02115931734442711, "learning_rate": 1.067012561698319e-07, "loss": 0.0008, "reward": -0.36649999022483826, "reward_std": 0.6950859427452087, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36649999022483826, "step": 916 }, { "completion_length": 218.5, "epoch": 7.705882352941177, "grad_norm": 0.009269939735531807, "kl": 0.004942473489791155, "learning_rate": 1.041935638371669e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 917 }, { "completion_length": 151.5, "epoch": 7.714285714285714, "grad_norm": 0.7297642827033997, "kl": 0.007256344426423311, "learning_rate": 1.0171506364985622e-07, "loss": 0.0003, "reward": 0.13349999487400055, "reward_std": 0.16617007553577423, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13349999487400055, "step": 918 }, { "completion_length": 256.0, "epoch": 7.722689075630252, "grad_norm": 0.003777906997129321, "kl": 0.0024043142329901457, "learning_rate": 9.926578580764234e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 919 }, { "completion_length": 256.0, "epoch": 7.73109243697479, "grad_norm": 0.0249546617269516, "kl": 0.0034575865138322115, "learning_rate": 9.684576015420277e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 920 }, { "completion_length": 256.0, "epoch": 7.739495798319328, "grad_norm": 0.011834201402962208, "kl": 0.003938496112823486, "learning_rate": 9.445501617678654e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 921 }, { "completion_length": 159.5, "epoch": 7.7478991596638656, "grad_norm": 0.791739821434021, "kl": 0.005662995390594006, "learning_rate": 9.209358300585474e-08, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 922 }, { "completion_length": 256.0, "epoch": 7.756302521008403, "grad_norm": 0.5295429229736328, "kl": 0.003137825755402446, "learning_rate": 8.9761489414725e-08, "loss": 0.0001, "reward": -0.4819999933242798, "reward_std": 0.8584276437759399, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4819999933242798, "step": 923 }, { "completion_length": 201.5, "epoch": 7.764705882352941, "grad_norm": 0.018620681017637253, "kl": 0.010227223858237267, "learning_rate": 8.745876381922147e-08, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 924 }, { "completion_length": 256.0, "epoch": 7.773109243697479, "grad_norm": 0.0033163861371576786, "kl": 0.0012920665321871638, "learning_rate": 8.518543427732951e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 925 }, { "completion_length": 195.5, "epoch": 7.781512605042017, "grad_norm": 0.7901667356491089, "kl": 0.007847260683774948, "learning_rate": 8.294152848885156e-08, "loss": 0.0003, "reward": -0.6304999589920044, "reward_std": 0.5055813193321228, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6304999589920044, "step": 926 }, { "completion_length": 256.0, "epoch": 7.7899159663865545, "grad_norm": 0.014759136363863945, "kl": 0.005496338941156864, "learning_rate": 8.072707379507217e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 927 }, { "completion_length": 256.0, "epoch": 7.798319327731092, "grad_norm": 0.003393803955987096, "kl": 0.0009367265738546848, "learning_rate": 7.854209717842231e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 928 }, { "completion_length": 158.5, "epoch": 7.80672268907563, "grad_norm": 0.7283389568328857, "kl": 0.006240527611225843, "learning_rate": 7.638662526215284e-08, "loss": 0.0002, "reward": -0.49150002002716064, "reward_std": 0.06434670835733414, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.49150002002716064, "step": 929 }, { "completion_length": 256.0, "epoch": 7.815126050420168, "grad_norm": 0.0054249269887804985, "kl": 0.002426996361464262, "learning_rate": 7.426068431000883e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 930 }, { "completion_length": 183.0, "epoch": 7.823529411764706, "grad_norm": 0.7362488508224487, "kl": 0.007468066643923521, "learning_rate": 7.216430022591009e-08, "loss": 0.0003, "reward": -0.04349999874830246, "reward_std": 0.23829498887062073, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04349999874830246, "step": 931 }, { "completion_length": 242.0, "epoch": 7.831932773109243, "grad_norm": 0.6201578974723816, "kl": 0.002790957223623991, "learning_rate": 7.009749855363457e-08, "loss": 0.0001, "reward": -0.2150000035762787, "reward_std": 0.48083260655403137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2150000035762787, "step": 932 }, { "completion_length": 67.0, "epoch": 7.840336134453781, "grad_norm": 2.6520168781280518, "kl": 0.12839721143245697, "learning_rate": 6.806030447650879e-08, "loss": 0.0051, "reward": 0.06450000405311584, "reward_std": 0.03606244921684265, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06450000405311584, "step": 933 }, { "completion_length": 183.0, "epoch": 7.848739495798319, "grad_norm": 1.0091286897659302, "kl": 0.01233973540365696, "learning_rate": 6.605274281709929e-08, "loss": 0.0005, "reward": 0.18599998950958252, "reward_std": 0.08626702427864075, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18599998950958252, "step": 934 }, { "completion_length": 256.0, "epoch": 7.857142857142857, "grad_norm": 0.019948719069361687, "kl": 0.009058884344995022, "learning_rate": 6.407483803691216e-08, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 935 }, { "completion_length": 181.5, "epoch": 7.865546218487395, "grad_norm": 1.5445952415466309, "kl": 0.0402054600417614, "learning_rate": 6.212661423609184e-08, "loss": 0.0016, "reward": -0.45350000262260437, "reward_std": 0.5282087922096252, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45350000262260437, "step": 936 }, { "completion_length": 206.0, "epoch": 7.873949579831933, "grad_norm": 0.6447483897209167, "kl": 0.004122576676309109, "learning_rate": 6.020809515313141e-08, "loss": 0.0002, "reward": 0.12799999117851257, "reward_std": 0.004242639057338238, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12799999117851257, "step": 937 }, { "completion_length": 220.0, "epoch": 7.882352941176471, "grad_norm": 0.7472499012947083, "kl": 0.00691410293802619, "learning_rate": 5.83193041645802e-08, "loss": 0.0003, "reward": -0.16349999606609344, "reward_std": 0.4080006182193756, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16349999606609344, "step": 938 }, { "completion_length": 256.0, "epoch": 7.890756302521009, "grad_norm": 0.0075671011582016945, "kl": 0.0023281523026525974, "learning_rate": 5.6460264284760316e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 939 }, { "completion_length": 211.0, "epoch": 7.899159663865547, "grad_norm": 0.4569738209247589, "kl": 0.0049009304493665695, "learning_rate": 5.463099816548578e-08, "loss": 0.0002, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 940 }, { "completion_length": 243.5, "epoch": 7.907563025210084, "grad_norm": 0.6349002122879028, "kl": 0.010202537290751934, "learning_rate": 5.283152809578751e-08, "loss": 0.0004, "reward": -0.296999990940094, "reward_std": 0.5967981219291687, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.296999990940094, "step": 941 }, { "completion_length": 256.0, "epoch": 7.915966386554622, "grad_norm": 0.005640604067593813, "kl": 0.0029828068800270557, "learning_rate": 5.106187600163987e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 942 }, { "completion_length": 228.5, "epoch": 7.92436974789916, "grad_norm": 0.8448273539543152, "kl": 0.02215358056128025, "learning_rate": 4.932206344569562e-08, "loss": 0.0009, "reward": -0.7050000429153442, "reward_std": 0.02121322602033615, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7050000429153442, "step": 943 }, { "completion_length": 177.5, "epoch": 7.932773109243698, "grad_norm": 1.315392255783081, "kl": 0.04788925126194954, "learning_rate": 4.761211162702117e-08, "loss": 0.0019, "reward": -0.5509999990463257, "reward_std": 0.19940409064292908, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5509999990463257, "step": 944 }, { "completion_length": 256.0, "epoch": 7.9411764705882355, "grad_norm": 0.010312262922525406, "kl": 0.00437077134847641, "learning_rate": 4.593204138084006e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 945 }, { "completion_length": 256.0, "epoch": 7.949579831932773, "grad_norm": 0.006933620665222406, "kl": 0.0017679043812677264, "learning_rate": 4.428187317827848e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 946 }, { "completion_length": 244.5, "epoch": 7.957983193277311, "grad_norm": 0.6817045211791992, "kl": 0.013074960559606552, "learning_rate": 4.26616271261146e-08, "loss": 0.0005, "reward": 0.18799999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18799999356269836, "step": 947 }, { "completion_length": 256.0, "epoch": 7.966386554621849, "grad_norm": 0.014362471178174019, "kl": 0.005483715794980526, "learning_rate": 4.1071322966535487e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 948 }, { "completion_length": 256.0, "epoch": 7.974789915966387, "grad_norm": 0.015236345119774342, "kl": 0.004321468062698841, "learning_rate": 3.95109800768953e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 949 }, { "completion_length": 256.0, "epoch": 7.983193277310924, "grad_norm": 0.00633437093347311, "kl": 0.002448908518999815, "learning_rate": 3.798061746947995e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 950 }, { "completion_length": 256.0, "epoch": 7.991596638655462, "grad_norm": 0.41104018688201904, "kl": 0.002387253101915121, "learning_rate": 3.648025379127479e-08, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 951 }, { "completion_length": 256.0, "epoch": 8.0, "grad_norm": 0.011408498510718346, "kl": 0.004092681687325239, "learning_rate": 3.5009907323737826e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 952 }, { "completion_length": 226.0, "epoch": 8.008403361344538, "grad_norm": 0.6938554644584656, "kl": 0.008812161162495613, "learning_rate": 3.3569595982576584e-08, "loss": 0.0004, "reward": -0.5889999866485596, "reward_std": 0.36062443256378174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5889999866485596, "step": 953 }, { "completion_length": 236.5, "epoch": 8.016806722689076, "grad_norm": 0.6452879905700684, "kl": 0.005862862803041935, "learning_rate": 3.2159337317530234e-08, "loss": 0.0002, "reward": -0.6924999952316284, "reward_std": 0.15061374008655548, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6924999952316284, "step": 954 }, { "completion_length": 256.0, "epoch": 8.025210084033613, "grad_norm": 0.5379961729049683, "kl": 0.043372198939323425, "learning_rate": 3.077914851215585e-08, "loss": 0.0017, "reward": -0.3580000102519989, "reward_std": 0.6830651760101318, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3580000102519989, "step": 955 }, { "completion_length": 167.5, "epoch": 8.033613445378151, "grad_norm": 0.9522702097892761, "kl": 0.04721365496516228, "learning_rate": 2.9429046383618042e-08, "loss": 0.0019, "reward": -0.18799999356269836, "reward_std": 0.44264882802963257, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18799999356269836, "step": 956 }, { "completion_length": 43.5, "epoch": 8.042016806722689, "grad_norm": 2.2643589973449707, "kl": 0.022106220945715904, "learning_rate": 2.810904738248549e-08, "loss": 0.0009, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 957 }, { "completion_length": 256.0, "epoch": 8.050420168067227, "grad_norm": 0.6165892481803894, "kl": 0.0037282672710716724, "learning_rate": 2.681916759252917e-08, "loss": 0.0001, "reward": -0.5914999842643738, "reward_std": 0.8365073204040527, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5914999842643738, "step": 958 }, { "completion_length": 256.0, "epoch": 8.058823529411764, "grad_norm": 0.010814170353114605, "kl": 0.002629430266097188, "learning_rate": 2.555942273052753e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 959 }, { "completion_length": 151.0, "epoch": 8.067226890756302, "grad_norm": 0.018711211159825325, "kl": 0.01015492994338274, "learning_rate": 2.4329828146074096e-08, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 960 }, { "completion_length": 256.0, "epoch": 8.07563025210084, "grad_norm": 0.00661367317661643, "kl": 0.0013579712249338627, "learning_rate": 2.313039882139101e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 961 }, { "completion_length": 256.0, "epoch": 8.084033613445378, "grad_norm": 0.006882958579808474, "kl": 0.004617562051862478, "learning_rate": 2.1961149371145795e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 962 }, { "completion_length": 256.0, "epoch": 8.092436974789916, "grad_norm": 0.5232301354408264, "kl": 0.0020907833240926266, "learning_rate": 2.082209404227403e-08, "loss": 0.0001, "reward": -0.42899999022483826, "reward_std": 0.783474326133728, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42899999022483826, "step": 963 }, { "completion_length": 256.0, "epoch": 8.100840336134453, "grad_norm": 0.012954057194292545, "kl": 0.0037056063301861286, "learning_rate": 1.9713246713805588e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 964 }, { "completion_length": 256.0, "epoch": 8.109243697478991, "grad_norm": 0.004450289532542229, "kl": 0.0018706879345700145, "learning_rate": 1.8634620896695044e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 965 }, { "completion_length": 256.0, "epoch": 8.117647058823529, "grad_norm": 0.009391557425260544, "kl": 0.0025061871856451035, "learning_rate": 1.7586229733657646e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 966 }, { "completion_length": 256.0, "epoch": 8.126050420168067, "grad_norm": 0.017030984163284302, "kl": 0.003493981435894966, "learning_rate": 1.6568085999008886e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 967 }, { "completion_length": 256.0, "epoch": 8.134453781512605, "grad_norm": 0.5420168042182922, "kl": 0.01134351547807455, "learning_rate": 1.5580202098509078e-08, "loss": 0.0005, "reward": -0.34049999713897705, "reward_std": 0.6583163738250732, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34049999713897705, "step": 968 }, { "completion_length": 248.0, "epoch": 8.142857142857142, "grad_norm": 0.5313448309898376, "kl": 0.004319202620536089, "learning_rate": 1.4622590069211517e-08, "loss": 0.0002, "reward": -0.18199999630451202, "reward_std": 0.43416354060173035, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18199999630451202, "step": 969 }, { "completion_length": 256.0, "epoch": 8.15126050420168, "grad_norm": 0.6526890993118286, "kl": 0.008917930535972118, "learning_rate": 1.3695261579316776e-08, "loss": 0.0004, "reward": -0.5024999976158142, "reward_std": 0.8874189853668213, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5024999976158142, "step": 970 }, { "completion_length": 193.0, "epoch": 8.159663865546218, "grad_norm": 0.6922760605812073, "kl": 0.0030572679825127125, "learning_rate": 1.2798227928029483e-08, "loss": 0.0001, "reward": -0.02799999713897705, "reward_std": 0.21637468039989471, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02799999713897705, "step": 971 }, { "completion_length": 256.0, "epoch": 8.168067226890756, "grad_norm": 0.009784630499780178, "kl": 0.002773221582174301, "learning_rate": 1.193150004542204e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 972 }, { "completion_length": 256.0, "epoch": 8.176470588235293, "grad_norm": 0.5121588110923767, "kl": 0.005751563236117363, "learning_rate": 1.109508849230001e-08, "loss": 0.0002, "reward": -0.38100001215934753, "reward_std": 0.7155921459197998, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.38100001215934753, "step": 973 }, { "completion_length": 187.0, "epoch": 8.184873949579831, "grad_norm": 1.1339529752731323, "kl": 0.003388491924852133, "learning_rate": 1.0289003460074165e-08, "loss": 0.0001, "reward": 0.05250000208616257, "reward_std": 0.10253047943115234, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05250000208616257, "step": 974 }, { "completion_length": 256.0, "epoch": 8.193277310924369, "grad_norm": 0.5041466355323792, "kl": 0.003873729147017002, "learning_rate": 9.513254770636138e-09, "loss": 0.0002, "reward": -0.36250001192092896, "reward_std": 0.6894291639328003, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36250001192092896, "step": 975 }, { "completion_length": 235.0, "epoch": 8.201680672268907, "grad_norm": 0.5857989192008972, "kl": 0.004832597449421883, "learning_rate": 8.767851876239075e-09, "loss": 0.0002, "reward": -0.25200000405311584, "reward_std": 0.533158540725708, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25200000405311584, "step": 976 }, { "completion_length": 179.0, "epoch": 8.210084033613445, "grad_norm": 0.81879723072052, "kl": 0.015948977321386337, "learning_rate": 8.052803859382174e-09, "loss": 0.0006, "reward": -0.5034999847412109, "reward_std": 0.3429467976093292, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5034999847412109, "step": 977 }, { "completion_length": 256.0, "epoch": 8.218487394957982, "grad_norm": 0.007117292378097773, "kl": 0.0018334102351218462, "learning_rate": 7.368119432699383e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 978 }, { "completion_length": 256.0, "epoch": 8.22689075630252, "grad_norm": 0.013268759474158287, "kl": 0.003231668844819069, "learning_rate": 6.7138069388547614e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 979 }, { "completion_length": 235.5, "epoch": 8.235294117647058, "grad_norm": 0.6646103262901306, "kl": 0.011293033137917519, "learning_rate": 6.089874350439507e-09, "loss": 0.0005, "reward": -0.30000001192092896, "reward_std": 0.601040780544281, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30000001192092896, "step": 980 }, { "completion_length": 256.0, "epoch": 8.243697478991596, "grad_norm": 0.011299612000584602, "kl": 0.002753609325736761, "learning_rate": 5.4963292698750896e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 981 }, { "completion_length": 256.0, "epoch": 8.252100840336134, "grad_norm": 0.00898385513573885, "kl": 0.0037319217808544636, "learning_rate": 4.933178929321103e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 982 }, { "completion_length": 218.0, "epoch": 8.260504201680673, "grad_norm": 0.5402487516403198, "kl": 0.002601149957627058, "learning_rate": 4.400430190586724e-09, "loss": 0.0001, "reward": -0.1615000069141388, "reward_std": 0.405172199010849, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1615000069141388, "step": 983 }, { "completion_length": 256.0, "epoch": 8.268907563025211, "grad_norm": 0.008241538889706135, "kl": 0.004483451135456562, "learning_rate": 3.8980895450474455e-09, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 984 }, { "completion_length": 256.0, "epoch": 8.277310924369749, "grad_norm": 0.00334729952737689, "kl": 0.0018546670908108354, "learning_rate": 3.4261631135654174e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 985 }, { "completion_length": 256.0, "epoch": 8.285714285714286, "grad_norm": 0.008701927028596401, "kl": 0.0024595223367214203, "learning_rate": 2.984656646415063e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 986 }, { "completion_length": 227.0, "epoch": 8.294117647058824, "grad_norm": 0.006082863546907902, "kl": 0.0024291672743856907, "learning_rate": 2.573575523213412e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 987 }, { "completion_length": 256.0, "epoch": 8.302521008403362, "grad_norm": 0.011156154796481133, "kl": 0.0029956665821373463, "learning_rate": 2.192924752854042e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 988 }, { "completion_length": 256.0, "epoch": 8.3109243697479, "grad_norm": 0.008423501625657082, "kl": 0.0025845151394605637, "learning_rate": 1.842708973447127e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 989 }, { "completion_length": 225.0, "epoch": 8.319327731092438, "grad_norm": 1.32600736618042, "kl": 0.08641031384468079, "learning_rate": 1.5229324522605949e-09, "loss": 0.0035, "reward": -0.7210000157356262, "reward_std": 0.22768840193748474, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7210000157356262, "step": 990 }, { "completion_length": 256.0, "epoch": 8.327731092436975, "grad_norm": 0.010838591493666172, "kl": 0.0026756543666124344, "learning_rate": 1.2335990856710001e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 991 }, { "completion_length": 116.5, "epoch": 8.336134453781513, "grad_norm": 1.522036075592041, "kl": 0.02132032997906208, "learning_rate": 9.747123991141193e-10, "loss": 0.0009, "reward": -0.2240000069141388, "reward_std": 0.4935605525970459, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2240000069141388, "step": 992 }, { "completion_length": 256.0, "epoch": 8.344537815126051, "grad_norm": 0.0033922113943845034, "kl": 0.001225098967552185, "learning_rate": 7.462755470422078e-10, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 993 }, { "completion_length": 181.5, "epoch": 8.352941176470589, "grad_norm": 0.9405512809753418, "kl": 0.02734358422458172, "learning_rate": 5.48291312886251e-10, "loss": 0.0011, "reward": 0.005499999970197678, "reward_std": 0.16899850964546204, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005499999970197678, "step": 994 }, { "completion_length": 256.0, "epoch": 8.361344537815127, "grad_norm": 0.5429350733757019, "kl": 0.004600170534104109, "learning_rate": 3.8076210902182607e-10, "loss": 0.0002, "reward": -0.28600001335144043, "reward_std": 0.58124178647995, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28600001335144043, "step": 995 }, { "completion_length": 207.0, "epoch": 8.369747899159664, "grad_norm": 1.0061851739883423, "kl": 0.007485842797905207, "learning_rate": 2.43689976739403e-10, "loss": 0.0003, "reward": -0.09450000524520874, "reward_std": 0.31041988730430603, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09450000524520874, "step": 996 }, { "completion_length": 256.0, "epoch": 8.378151260504202, "grad_norm": 0.5545046329498291, "kl": 0.0031090022530406713, "learning_rate": 1.3707658621964216e-10, "loss": 0.0001, "reward": -0.3700000047683716, "reward_std": 0.7000357508659363, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3700000047683716, "step": 997 }, { "completion_length": 256.0, "epoch": 8.38655462184874, "grad_norm": 0.011756294406950474, "kl": 0.0036002714186906815, "learning_rate": 6.092323651313293e-11, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 998 }, { "completion_length": 256.0, "epoch": 8.394957983193278, "grad_norm": 0.0063680424354970455, "kl": 0.001808455679565668, "learning_rate": 1.5230855524017708e-11, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 999 }, { "completion_length": 220.5, "epoch": 8.403361344537815, "grad_norm": 0.7211830615997314, "kl": 0.0077142007648944855, "learning_rate": 0.0, "loss": 0.0003, "reward": -0.20149999856948853, "reward_std": 0.46174073219299316, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20149999856948853, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }