llama3.2-1b-Open-R1-GRPO-test0 / trainer_state.json
hyunseoki's picture
Model save
e9c8320 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 27,
"global_step": 267,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 92.16071701049805,
"epoch": 0.003745318352059925,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.3455178663134575,
"reward_std": 0.7725450992584229,
"rewards/correctness_reward_func": 0.191964291036129,
"rewards/int_reward_func": 0.2812500149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1276964358985424,
"step": 1
},
{
"completion_length": 99.06696891784668,
"epoch": 0.00749063670411985,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.38210269808769226,
"reward_std": 0.8393888622522354,
"rewards/correctness_reward_func": 0.1964285783469677,
"rewards/int_reward_func": 0.2600446492433548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0743705378845334,
"step": 2
},
{
"completion_length": 95.87054061889648,
"epoch": 0.011235955056179775,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.24831698834896088,
"reward_std": 0.7660860866308212,
"rewards/correctness_reward_func": 0.12053571734577417,
"rewards/int_reward_func": 0.2421875149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11440625134855509,
"step": 3
},
{
"completion_length": 100.84152221679688,
"epoch": 0.0149812734082397,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.3874709792435169,
"reward_std": 0.8373937755823135,
"rewards/correctness_reward_func": 0.2187500074505806,
"rewards/int_reward_func": 0.251116082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08239509297709446,
"step": 4
},
{
"completion_length": 107.81696701049805,
"epoch": 0.018726591760299626,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.3461473397910595,
"reward_std": 0.8639847934246063,
"rewards/correctness_reward_func": 0.2187500111758709,
"rewards/int_reward_func": 0.2834821566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15608482621610165,
"step": 5
},
{
"completion_length": 88.96428871154785,
"epoch": 0.02247191011235955,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.23790179379284382,
"reward_std": 0.8017762005329132,
"rewards/correctness_reward_func": 0.14285715389996767,
"rewards/int_reward_func": 0.2310267947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13598215393722057,
"step": 6
},
{
"completion_length": 86.72768211364746,
"epoch": 0.026217228464419477,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.33704913780093193,
"reward_std": 0.786924734711647,
"rewards/correctness_reward_func": 0.2053571492433548,
"rewards/int_reward_func": 0.2667410895228386,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1350491177290678,
"step": 7
},
{
"completion_length": 87.54687881469727,
"epoch": 0.0299625468164794,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.390026792883873,
"reward_std": 0.7708619683980942,
"rewards/correctness_reward_func": 0.2053571492433548,
"rewards/int_reward_func": 0.2354910857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.050821430049836636,
"step": 8
},
{
"completion_length": 87.82366561889648,
"epoch": 0.033707865168539325,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.30632367357611656,
"reward_std": 0.8439056426286697,
"rewards/correctness_reward_func": 0.1741071492433548,
"rewards/int_reward_func": 0.2477678656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11555134132504463,
"step": 9
},
{
"completion_length": 95.44196891784668,
"epoch": 0.03745318352059925,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.36654020100831985,
"reward_std": 0.7821808308362961,
"rewards/correctness_reward_func": 0.1830357238650322,
"rewards/int_reward_func": 0.263392873108387,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07988839386962354,
"step": 10
},
{
"completion_length": 88.40178871154785,
"epoch": 0.04119850187265917,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.3682924136519432,
"reward_std": 0.8412070125341415,
"rewards/correctness_reward_func": 0.2098214440047741,
"rewards/int_reward_func": 0.2600446604192257,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10157366236671805,
"step": 11
},
{
"completion_length": 96.40178871154785,
"epoch": 0.0449438202247191,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.3570691980421543,
"reward_std": 0.831629067659378,
"rewards/correctness_reward_func": 0.20535715110599995,
"rewards/int_reward_func": 0.2633928693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1116808035876602,
"step": 12
},
{
"completion_length": 98.67634582519531,
"epoch": 0.04868913857677903,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.31250446662306786,
"reward_std": 0.7651553750038147,
"rewards/correctness_reward_func": 0.1696428656578064,
"rewards/int_reward_func": 0.2343750074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09151339251548052,
"step": 13
},
{
"completion_length": 94.42634201049805,
"epoch": 0.052434456928838954,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.35333259031176567,
"reward_std": 0.8554573208093643,
"rewards/correctness_reward_func": 0.2232142984867096,
"rewards/int_reward_func": 0.2645089477300644,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13439063727855682,
"step": 14
},
{
"completion_length": 95.30580711364746,
"epoch": 0.056179775280898875,
"grad_norm": 0.7303056716918945,
"kl": 0.0,
"learning_rate": 1.8518518518518518e-07,
"loss": 0.0,
"reward": 0.3904196694493294,
"reward_std": 0.8479138016700745,
"rewards/correctness_reward_func": 0.2098214440047741,
"rewards/int_reward_func": 0.2544642984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07386607304215431,
"step": 15
},
{
"completion_length": 98.70089721679688,
"epoch": 0.0599250936329588,
"grad_norm": 0.6679372191429138,
"kl": 0.0,
"learning_rate": 3.7037037037037036e-07,
"loss": -0.0,
"reward": 0.26223884522914886,
"reward_std": 0.8470287472009659,
"rewards/correctness_reward_func": 0.1830357201397419,
"rewards/int_reward_func": 0.243303582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.16410045325756073,
"step": 16
},
{
"completion_length": 111.6004524230957,
"epoch": 0.06367041198501873,
"grad_norm": 0.8377946019172668,
"kl": 8.493661880493164e-07,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0,
"reward": 0.27681921795010567,
"reward_std": 0.8451116383075714,
"rewards/correctness_reward_func": 0.1875000037252903,
"rewards/int_reward_func": 0.2488839365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1595647381618619,
"step": 17
},
{
"completion_length": 99.96875381469727,
"epoch": 0.06741573033707865,
"grad_norm": 0.9236070513725281,
"kl": 0.00010453164577484131,
"learning_rate": 7.407407407407407e-07,
"loss": 0.0,
"reward": 0.31320536509156227,
"reward_std": 0.8322850167751312,
"rewards/correctness_reward_func": 0.191964291036129,
"rewards/int_reward_func": 0.2455357275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12429465167224407,
"step": 18
},
{
"completion_length": 98.84598731994629,
"epoch": 0.07116104868913857,
"grad_norm": 0.9007355570793152,
"kl": 0.0017764568328857422,
"learning_rate": 9.259259259259259e-07,
"loss": 0.0001,
"reward": 0.2539866119623184,
"reward_std": 0.8283544480800629,
"rewards/correctness_reward_func": 0.1741071492433548,
"rewards/int_reward_func": 0.227678582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1477991035208106,
"step": 19
},
{
"completion_length": 89.91964721679688,
"epoch": 0.0749063670411985,
"grad_norm": 0.8131362199783325,
"kl": 0.009290695190429688,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0004,
"reward": 0.4152901992201805,
"reward_std": 0.7349574714899063,
"rewards/correctness_reward_func": 0.2232142947614193,
"rewards/int_reward_func": 0.258928582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0668526804074645,
"step": 20
},
{
"completion_length": 92.15402221679688,
"epoch": 0.07865168539325842,
"grad_norm": 0.8349559307098389,
"kl": 0.055389404296875,
"learning_rate": 1.2962962962962962e-06,
"loss": 0.0022,
"reward": 0.33069421350955963,
"reward_std": 0.7918245047330856,
"rewards/correctness_reward_func": 0.19196429289877415,
"rewards/int_reward_func": 0.2656250074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12689509708434343,
"step": 21
},
{
"completion_length": 90.09821891784668,
"epoch": 0.08239700374531835,
"grad_norm": 1.059292197227478,
"kl": 0.13580322265625,
"learning_rate": 1.4814814814814815e-06,
"loss": 0.0054,
"reward": 0.3621741235256195,
"reward_std": 0.8592714816331863,
"rewards/correctness_reward_func": 0.2187500149011612,
"rewards/int_reward_func": 0.2533482164144516,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10992411337792873,
"step": 22
},
{
"completion_length": 90.31473541259766,
"epoch": 0.08614232209737828,
"grad_norm": 0.9306014776229858,
"kl": 0.22802734375,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0091,
"reward": 0.27021654695272446,
"reward_std": 0.7604184001684189,
"rewards/correctness_reward_func": 0.1428571492433548,
"rewards/int_reward_func": 0.251116082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12375670112669468,
"step": 23
},
{
"completion_length": 85.87277221679688,
"epoch": 0.0898876404494382,
"grad_norm": 1.0507615804672241,
"kl": 0.26690673828125,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.0107,
"reward": 0.2973214313387871,
"reward_std": 0.72261543571949,
"rewards/correctness_reward_func": 0.13839286379516125,
"rewards/int_reward_func": 0.2455357275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08660715073347092,
"step": 24
},
{
"completion_length": 93.05580711364746,
"epoch": 0.09363295880149813,
"grad_norm": 1.3314857482910156,
"kl": 0.27276611328125,
"learning_rate": 2.037037037037037e-06,
"loss": 0.0109,
"reward": 0.27803125604987144,
"reward_std": 0.80119389295578,
"rewards/correctness_reward_func": 0.1741071529686451,
"rewards/int_reward_func": 0.2310267947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1271026823669672,
"step": 25
},
{
"completion_length": 94.82589721679688,
"epoch": 0.09737827715355805,
"grad_norm": 1.0931949615478516,
"kl": 0.29876708984375,
"learning_rate": 2.222222222222222e-06,
"loss": 0.012,
"reward": 0.31898215785622597,
"reward_std": 0.861026868224144,
"rewards/correctness_reward_func": 0.1785714365541935,
"rewards/int_reward_func": 0.2611607238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.12186607345938683,
"step": 26
},
{
"completion_length": 102.84821891784668,
"epoch": 0.10112359550561797,
"grad_norm": 0.9510552883148193,
"kl": 0.35101318359375,
"learning_rate": 2.4074074074074075e-06,
"loss": 0.014,
"reward": 0.2714241296052933,
"reward_std": 0.7749656587839127,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.2444196566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13817412219941616,
"step": 27
},
{
"completion_length": 112.87277412414551,
"epoch": 0.10486891385767791,
"grad_norm": 0.8143340945243835,
"kl": 0.4764404296875,
"learning_rate": 2.5925925925925925e-06,
"loss": 0.0191,
"reward": 0.23841295577585697,
"reward_std": 0.7267381250858307,
"rewards/correctness_reward_func": 0.12500000186264515,
"rewards/int_reward_func": 0.2544642947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14105134829878807,
"step": 28
},
{
"completion_length": 91.91071891784668,
"epoch": 0.10861423220973783,
"grad_norm": 1.2296696901321411,
"kl": 0.5228271484375,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0209,
"reward": 0.33742189407348633,
"reward_std": 0.8348551988601685,
"rewards/correctness_reward_func": 0.1919642984867096,
"rewards/int_reward_func": 0.2533482238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10789063014090061,
"step": 29
},
{
"completion_length": 88.34152030944824,
"epoch": 0.11235955056179775,
"grad_norm": 0.7870422601699829,
"kl": 0.50323486328125,
"learning_rate": 2.962962962962963e-06,
"loss": 0.0201,
"reward": 0.29978572577238083,
"reward_std": 0.8016993254423141,
"rewards/correctness_reward_func": 0.160714291036129,
"rewards/int_reward_func": 0.2645089402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1254375111311674,
"step": 30
},
{
"completion_length": 103.2433090209961,
"epoch": 0.11610486891385768,
"grad_norm": 1.320225715637207,
"kl": 0.71630859375,
"learning_rate": 3.1481481481481483e-06,
"loss": 0.0286,
"reward": 0.31782814115285873,
"reward_std": 0.8109631538391113,
"rewards/correctness_reward_func": 0.191964291036129,
"rewards/int_reward_func": 0.2477678693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12190402112901211,
"step": 31
},
{
"completion_length": 102.47321891784668,
"epoch": 0.1198501872659176,
"grad_norm": 1.1958893537521362,
"kl": 0.6319580078125,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0253,
"reward": 0.3368035778403282,
"reward_std": 0.8891346454620361,
"rewards/correctness_reward_func": 0.2187500149011612,
"rewards/int_reward_func": 0.2767857313156128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15873214416205883,
"step": 32
},
{
"completion_length": 110.43080711364746,
"epoch": 0.12359550561797752,
"grad_norm": 0.900262176990509,
"kl": 0.5997314453125,
"learning_rate": 3.5185185185185187e-06,
"loss": 0.024,
"reward": 0.2625982239842415,
"reward_std": 0.814198911190033,
"rewards/correctness_reward_func": 0.16071429196745157,
"rewards/int_reward_func": 0.2321428656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1302589364349842,
"step": 33
},
{
"completion_length": 96.2433090209961,
"epoch": 0.12734082397003746,
"grad_norm": 0.8053016662597656,
"kl": 0.5283203125,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.0211,
"reward": 0.2832053676247597,
"reward_std": 0.773906797170639,
"rewards/correctness_reward_func": 0.1830357238650322,
"rewards/int_reward_func": 0.2421875111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14201787114143372,
"step": 34
},
{
"completion_length": 99.74107360839844,
"epoch": 0.13108614232209737,
"grad_norm": 0.7698966860771179,
"kl": 0.5833740234375,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0233,
"reward": 0.3971473351120949,
"reward_std": 0.8169043958187103,
"rewards/correctness_reward_func": 0.2366071529686451,
"rewards/int_reward_func": 0.2667410857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10620089713484049,
"step": 35
},
{
"completion_length": 91.65402030944824,
"epoch": 0.1348314606741573,
"grad_norm": 0.6963524222373962,
"kl": 0.6768798828125,
"learning_rate": 4.074074074074074e-06,
"loss": 0.0271,
"reward": 0.2977009005844593,
"reward_std": 0.9012245386838913,
"rewards/correctness_reward_func": 0.2232142947614193,
"rewards/int_reward_func": 0.2455357238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1710491143167019,
"step": 36
},
{
"completion_length": 86.2120590209961,
"epoch": 0.13857677902621723,
"grad_norm": 0.6502078771591187,
"kl": 0.7315673828125,
"learning_rate": 4.2592592592592596e-06,
"loss": 0.0293,
"reward": 0.4257053807377815,
"reward_std": 0.7627889215946198,
"rewards/correctness_reward_func": 0.2053571492433548,
"rewards/int_reward_func": 0.2611607238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.04081250121816993,
"step": 37
},
{
"completion_length": 88.02232551574707,
"epoch": 0.14232209737827714,
"grad_norm": 0.7598965764045715,
"kl": 0.7764892578125,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0311,
"reward": 0.26222768798470497,
"reward_std": 0.7311272174119949,
"rewards/correctness_reward_func": 0.1383928656578064,
"rewards/int_reward_func": 0.2466517947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12281697243452072,
"step": 38
},
{
"completion_length": 88.76116561889648,
"epoch": 0.14606741573033707,
"grad_norm": 0.9369046688079834,
"kl": 0.779052734375,
"learning_rate": 4.62962962962963e-06,
"loss": 0.0312,
"reward": 0.2584107182919979,
"reward_std": 0.8221316933631897,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.2466518022119999,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15453572571277618,
"step": 39
},
{
"completion_length": 88.61607551574707,
"epoch": 0.149812734082397,
"grad_norm": 0.6541325449943542,
"kl": 0.7415771484375,
"learning_rate": 4.814814814814815e-06,
"loss": 0.0297,
"reward": 0.338582631200552,
"reward_std": 0.773887574672699,
"rewards/correctness_reward_func": 0.191964291036129,
"rewards/int_reward_func": 0.2522321529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10561384446918964,
"step": 40
},
{
"completion_length": 93.6473274230957,
"epoch": 0.15355805243445692,
"grad_norm": 0.7286244630813599,
"kl": 0.772216796875,
"learning_rate": 5e-06,
"loss": 0.0309,
"reward": 0.2527187615633011,
"reward_std": 0.7823167890310287,
"rewards/correctness_reward_func": 0.1473214328289032,
"rewards/int_reward_func": 0.2533482275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14795089792460203,
"step": 41
},
{
"completion_length": 102.1004524230957,
"epoch": 0.15730337078651685,
"grad_norm": 0.6125639081001282,
"kl": 0.7431640625,
"learning_rate": 4.999785818935018e-06,
"loss": 0.0297,
"reward": 0.372944213449955,
"reward_std": 0.8073680251836777,
"rewards/correctness_reward_func": 0.2098214365541935,
"rewards/int_reward_func": 0.279017873108387,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11589509434998035,
"step": 42
},
{
"completion_length": 95.78794860839844,
"epoch": 0.16104868913857678,
"grad_norm": 0.7778175473213196,
"kl": 0.887451171875,
"learning_rate": 4.999143312438893e-06,
"loss": 0.0355,
"reward": 0.3458884060382843,
"reward_std": 0.8312461376190186,
"rewards/correctness_reward_func": 0.2008928656578064,
"rewards/int_reward_func": 0.2421875149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09719196986407042,
"step": 43
},
{
"completion_length": 104.68080711364746,
"epoch": 0.1647940074906367,
"grad_norm": 0.9053827524185181,
"kl": 0.83203125,
"learning_rate": 4.998072590601808e-06,
"loss": 0.0333,
"reward": 0.28239064663648605,
"reward_std": 0.7673767507076263,
"rewards/correctness_reward_func": 0.16517857648432255,
"rewards/int_reward_func": 0.2354910783469677,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11827902123332024,
"step": 44
},
{
"completion_length": 91.47768211364746,
"epoch": 0.16853932584269662,
"grad_norm": 0.7150729894638062,
"kl": 0.8095703125,
"learning_rate": 4.9965738368864345e-06,
"loss": 0.0324,
"reward": 0.4368147626519203,
"reward_std": 0.8043892681598663,
"rewards/correctness_reward_func": 0.2455357238650322,
"rewards/int_reward_func": 0.2879464365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09666741825640202,
"step": 45
},
{
"completion_length": 106.02009391784668,
"epoch": 0.17228464419475656,
"grad_norm": 0.7475388050079346,
"kl": 0.9344482421875,
"learning_rate": 4.994647308096509e-06,
"loss": 0.0374,
"reward": 0.2440937664359808,
"reward_std": 0.7551652044057846,
"rewards/correctness_reward_func": 0.1339285783469677,
"rewards/int_reward_func": 0.2388392947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1286741215735674,
"step": 46
},
{
"completion_length": 94.04018211364746,
"epoch": 0.1760299625468165,
"grad_norm": 0.6236558556556702,
"kl": 0.8037109375,
"learning_rate": 4.992293334332821e-06,
"loss": 0.0322,
"reward": 0.37872322648763657,
"reward_std": 0.8807232677936554,
"rewards/correctness_reward_func": 0.2678571492433548,
"rewards/int_reward_func": 0.2589285857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14806250110268593,
"step": 47
},
{
"completion_length": 92.28571701049805,
"epoch": 0.1797752808988764,
"grad_norm": 0.6654737591743469,
"kl": 0.818115234375,
"learning_rate": 4.989512318936654e-06,
"loss": 0.0327,
"reward": 0.39247100055217743,
"reward_std": 0.743692010641098,
"rewards/correctness_reward_func": 0.1919642947614193,
"rewards/int_reward_func": 0.2578125149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.057305806782096624,
"step": 48
},
{
"completion_length": 93.03571891784668,
"epoch": 0.18352059925093633,
"grad_norm": 0.7210425734519958,
"kl": 0.825927734375,
"learning_rate": 4.986304738420684e-06,
"loss": 0.033,
"reward": 0.35212278366088867,
"reward_std": 0.8072675913572311,
"rewards/correctness_reward_func": 0.2187500149011612,
"rewards/int_reward_func": 0.2488839402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11551116220653057,
"step": 49
},
{
"completion_length": 94.02902221679688,
"epoch": 0.18726591760299627,
"grad_norm": 0.6812981963157654,
"kl": 0.92724609375,
"learning_rate": 4.982671142387316e-06,
"loss": 0.0371,
"reward": 0.2549062632024288,
"reward_std": 0.91233891248703,
"rewards/correctness_reward_func": 0.1830357238650322,
"rewards/int_reward_func": 0.2466517984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.17478126287460327,
"step": 50
},
{
"completion_length": 98.83259391784668,
"epoch": 0.19101123595505617,
"grad_norm": 0.7230132818222046,
"kl": 0.9453125,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0378,
"reward": 0.3085335083305836,
"reward_std": 0.7170540690422058,
"rewards/correctness_reward_func": 0.13392858020961285,
"rewards/int_reward_func": 0.2600446529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08543973276391625,
"step": 51
},
{
"completion_length": 92.35491371154785,
"epoch": 0.1947565543071161,
"grad_norm": 0.7230132818222046,
"kl": 1.035888671875,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0414,
"reward": 0.39346206933259964,
"reward_std": 0.7441791445016861,
"rewards/correctness_reward_func": 0.1741071529686451,
"rewards/int_reward_func": 0.2522321566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.03287723264656961,
"step": 52
},
{
"completion_length": 89.34821701049805,
"epoch": 0.19850187265917604,
"grad_norm": 0.620324969291687,
"kl": 0.957275390625,
"learning_rate": 4.974128467049177e-06,
"loss": 0.0383,
"reward": 0.3490491136908531,
"reward_std": 0.747399315237999,
"rewards/correctness_reward_func": 0.1830357164144516,
"rewards/int_reward_func": 0.2522321604192257,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.08733483403921127,
"step": 53
},
{
"completion_length": 98.08928871154785,
"epoch": 0.20224719101123595,
"grad_norm": 0.617904782295227,
"kl": 1.1640625,
"learning_rate": 4.9692208514878445e-06,
"loss": 0.0466,
"reward": 0.21967187896370888,
"reward_std": 0.7784698009490967,
"rewards/correctness_reward_func": 0.14285714738070965,
"rewards/int_reward_func": 0.2232142984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14639955759048462,
"step": 54
},
{
"completion_length": 90.87946891784668,
"epoch": 0.20599250936329588,
"grad_norm": 0.6099480390548706,
"kl": 1.119140625,
"learning_rate": 4.963890147645195e-06,
"loss": 0.0448,
"reward": 0.3465201109647751,
"reward_std": 0.8060361593961716,
"rewards/correctness_reward_func": 0.1919642947614193,
"rewards/int_reward_func": 0.266741082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11218527238816023,
"step": 55
},
{
"completion_length": 90.78795051574707,
"epoch": 0.20973782771535582,
"grad_norm": 0.6998101472854614,
"kl": 1.171875,
"learning_rate": 4.958137268909887e-06,
"loss": 0.0469,
"reward": 0.3585915267467499,
"reward_std": 0.7672727555036545,
"rewards/correctness_reward_func": 0.191964291036129,
"rewards/int_reward_func": 0.2488839365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08225669572129846,
"step": 56
},
{
"completion_length": 91.67857551574707,
"epoch": 0.21348314606741572,
"grad_norm": 0.8839861154556274,
"kl": 1.13525390625,
"learning_rate": 4.9519632010080765e-06,
"loss": 0.0454,
"reward": 0.3334464356303215,
"reward_std": 0.7685143500566483,
"rewards/correctness_reward_func": 0.1741071529686451,
"rewards/int_reward_func": 0.2578125074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09847322292625904,
"step": 57
},
{
"completion_length": 81.05804061889648,
"epoch": 0.21722846441947566,
"grad_norm": 0.5016757249832153,
"kl": 1.03076171875,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.0412,
"reward": 0.4211518168449402,
"reward_std": 0.8437229245901108,
"rewards/correctness_reward_func": 0.2187500149011612,
"rewards/int_reward_func": 0.2779017947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0755000039935112,
"step": 58
},
{
"completion_length": 101.41518211364746,
"epoch": 0.2209737827715356,
"grad_norm": 0.6329123377799988,
"kl": 1.089111328125,
"learning_rate": 4.938355801271282e-06,
"loss": 0.0436,
"reward": 0.35140402615070343,
"reward_std": 0.7762987017631531,
"rewards/correctness_reward_func": 0.1785714402794838,
"rewards/int_reward_func": 0.2656250149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09279241226613522,
"step": 59
},
{
"completion_length": 105.19420051574707,
"epoch": 0.2247191011235955,
"grad_norm": 0.6004884839057922,
"kl": 1.02001953125,
"learning_rate": 4.930924800994192e-06,
"loss": 0.0408,
"reward": 0.2473437450826168,
"reward_std": 0.7411400526762009,
"rewards/correctness_reward_func": 0.1428571492433548,
"rewards/int_reward_func": 0.2466517984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14216518588364124,
"step": 60
},
{
"completion_length": 98.32589530944824,
"epoch": 0.22846441947565543,
"grad_norm": 0.8213242292404175,
"kl": 1.04638671875,
"learning_rate": 4.923077274266886e-06,
"loss": 0.0419,
"reward": 0.29397991858422756,
"reward_std": 0.7984266579151154,
"rewards/correctness_reward_func": 0.17857143841683865,
"rewards/int_reward_func": 0.2500000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13459152355790138,
"step": 61
},
{
"completion_length": 95.77009201049805,
"epoch": 0.23220973782771537,
"grad_norm": 0.7614482641220093,
"kl": 0.9754638671875,
"learning_rate": 4.914814565722671e-06,
"loss": 0.039,
"reward": 0.25892411917448044,
"reward_std": 0.6874004900455475,
"rewards/correctness_reward_func": 0.1383928582072258,
"rewards/int_reward_func": 0.251116082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13058483134955168,
"step": 62
},
{
"completion_length": 88.84375381469727,
"epoch": 0.23595505617977527,
"grad_norm": 0.6162322759628296,
"kl": 0.9012451171875,
"learning_rate": 4.906138091134118e-06,
"loss": 0.0361,
"reward": 0.4282499924302101,
"reward_std": 0.867719978094101,
"rewards/correctness_reward_func": 0.2321428656578064,
"rewards/int_reward_func": 0.2801339402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08402678836137056,
"step": 63
},
{
"completion_length": 98.63393211364746,
"epoch": 0.2397003745318352,
"grad_norm": 0.7493047118186951,
"kl": 0.9737548828125,
"learning_rate": 4.897049337170483e-06,
"loss": 0.0389,
"reward": 0.31915403716266155,
"reward_std": 0.7978127002716064,
"rewards/correctness_reward_func": 0.2098214365541935,
"rewards/int_reward_func": 0.2488839402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1395513443276286,
"step": 64
},
{
"completion_length": 99.77232551574707,
"epoch": 0.24344569288389514,
"grad_norm": 0.55640709400177,
"kl": 0.95849609375,
"learning_rate": 4.887549861142967e-06,
"loss": 0.0383,
"reward": 0.25655804201960564,
"reward_std": 0.746478259563446,
"rewards/correctness_reward_func": 0.1339285783469677,
"rewards/int_reward_func": 0.2767857313156128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1541562583297491,
"step": 65
},
{
"completion_length": 90.46205711364746,
"epoch": 0.24719101123595505,
"grad_norm": 0.5036749243736267,
"kl": 0.875732421875,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.035,
"reward": 0.3458192050457001,
"reward_std": 0.8146399855613708,
"rewards/correctness_reward_func": 0.2187500074505806,
"rewards/int_reward_func": 0.2578125111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13074330985546112,
"step": 66
},
{
"completion_length": 109.09821891784668,
"epoch": 0.250936329588015,
"grad_norm": 0.5405407547950745,
"kl": 0.9388427734375,
"learning_rate": 4.867325323737765e-06,
"loss": 0.0376,
"reward": 0.2163794655352831,
"reward_std": 0.695435032248497,
"rewards/correctness_reward_func": 0.1250000074505806,
"rewards/int_reward_func": 0.2299107238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13853124901652336,
"step": 67
},
{
"completion_length": 96.1495590209961,
"epoch": 0.2546816479400749,
"grad_norm": 0.7321078777313232,
"kl": 0.9552001953125,
"learning_rate": 4.856603727730446e-06,
"loss": 0.0382,
"reward": 0.3762388601899147,
"reward_std": 0.8388219773769379,
"rewards/correctness_reward_func": 0.2366071492433548,
"rewards/int_reward_func": 0.2600446566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12041295692324638,
"step": 68
},
{
"completion_length": 94.25893211364746,
"epoch": 0.25842696629213485,
"grad_norm": 0.5617873668670654,
"kl": 0.9840087890625,
"learning_rate": 4.845478339806211e-06,
"loss": 0.0394,
"reward": 0.3388616181910038,
"reward_std": 0.8814976066350937,
"rewards/correctness_reward_func": 0.2276785857975483,
"rewards/int_reward_func": 0.2689732275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15779018122702837,
"step": 69
},
{
"completion_length": 100.38839721679688,
"epoch": 0.26217228464419473,
"grad_norm": 0.7381689548492432,
"kl": 1.1729736328125,
"learning_rate": 4.833951066243004e-06,
"loss": 0.0469,
"reward": 0.3259017989039421,
"reward_std": 0.7590171247720718,
"rewards/correctness_reward_func": 0.165178582072258,
"rewards/int_reward_func": 0.2566964328289032,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09597321972250938,
"step": 70
},
{
"completion_length": 108.24777412414551,
"epoch": 0.26591760299625467,
"grad_norm": 0.7399603724479675,
"kl": 1.2216796875,
"learning_rate": 4.822023882179811e-06,
"loss": 0.0489,
"reward": 0.138060272205621,
"reward_std": 0.8277581036090851,
"rewards/correctness_reward_func": 0.098214291036129,
"rewards/int_reward_func": 0.2321428656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.19229689799249172,
"step": 71
},
{
"completion_length": 91.00893211364746,
"epoch": 0.2696629213483146,
"grad_norm": 0.49646005034446716,
"kl": 0.997802734375,
"learning_rate": 4.809698831278217e-06,
"loss": 0.0399,
"reward": 0.31739287078380585,
"reward_std": 0.821430504322052,
"rewards/correctness_reward_func": 0.1964285783469677,
"rewards/int_reward_func": 0.258928582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.137964291498065,
"step": 72
},
{
"completion_length": 89.22545051574707,
"epoch": 0.27340823970037453,
"grad_norm": 0.5307531952857971,
"kl": 0.9476318359375,
"learning_rate": 4.796978025372247e-06,
"loss": 0.0379,
"reward": 0.3167254589498043,
"reward_std": 0.8111777305603027,
"rewards/correctness_reward_func": 0.16071429662406445,
"rewards/int_reward_func": 0.2678571566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11184598784893751,
"step": 73
},
{
"completion_length": 108.57813262939453,
"epoch": 0.27715355805243447,
"grad_norm": 0.9819021224975586,
"kl": 1.1944580078125,
"learning_rate": 4.783863644106502e-06,
"loss": 0.0478,
"reward": 0.43339288234710693,
"reward_std": 0.805885374546051,
"rewards/correctness_reward_func": 0.2410714402794838,
"rewards/int_reward_func": 0.2633928656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07107143104076385,
"step": 74
},
{
"completion_length": 97.66071891784668,
"epoch": 0.2808988764044944,
"grad_norm": 0.5349671244621277,
"kl": 0.9276123046875,
"learning_rate": 4.770357934562704e-06,
"loss": 0.0371,
"reward": 0.25291070714592934,
"reward_std": 0.7776944190263748,
"rewards/correctness_reward_func": 0.1517857201397419,
"rewards/int_reward_func": 0.2321428656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13101786747574806,
"step": 75
},
{
"completion_length": 99.77455711364746,
"epoch": 0.2846441947565543,
"grad_norm": 0.4924392104148865,
"kl": 0.9676513671875,
"learning_rate": 4.7564632108746524e-06,
"loss": 0.0387,
"reward": 0.29688840731978416,
"reward_std": 0.7467798590660095,
"rewards/correctness_reward_func": 0.1651785746216774,
"rewards/int_reward_func": 0.2377232313156128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10601340420544147,
"step": 76
},
{
"completion_length": 108.47545051574707,
"epoch": 0.2883895131086142,
"grad_norm": 0.45954495668411255,
"kl": 0.9312744140625,
"learning_rate": 4.742181853831721e-06,
"loss": 0.0372,
"reward": 0.2200825996696949,
"reward_std": 0.7668928056955338,
"rewards/correctness_reward_func": 0.1428571492433548,
"rewards/int_reward_func": 0.2377232201397419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1604977697134018,
"step": 77
},
{
"completion_length": 108.40848731994629,
"epoch": 0.29213483146067415,
"grad_norm": 0.5183126330375671,
"kl": 0.931640625,
"learning_rate": 4.72751631047092e-06,
"loss": 0.0373,
"reward": 0.26953795552253723,
"reward_std": 0.7980407774448395,
"rewards/correctness_reward_func": 0.1830357238650322,
"rewards/int_reward_func": 0.2544642984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.16796205937862396,
"step": 78
},
{
"completion_length": 92.30357551574707,
"epoch": 0.2958801498127341,
"grad_norm": 0.6169615983963013,
"kl": 0.814453125,
"learning_rate": 4.712469093657605e-06,
"loss": 0.0326,
"reward": 0.3473794758319855,
"reward_std": 0.7407716810703278,
"rewards/correctness_reward_func": 0.1875000074505806,
"rewards/int_reward_func": 0.2455357313156128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08565625501796603,
"step": 79
},
{
"completion_length": 96.31920051574707,
"epoch": 0.299625468164794,
"grad_norm": 0.5736718773841858,
"kl": 0.82373046875,
"learning_rate": 4.697042781654913e-06,
"loss": 0.0329,
"reward": 0.3164888694882393,
"reward_std": 0.8256205767393112,
"rewards/correctness_reward_func": 0.160714291036129,
"rewards/int_reward_func": 0.2734375074505806,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11877902504056692,
"step": 80
},
{
"completion_length": 95.03348731994629,
"epoch": 0.30337078651685395,
"grad_norm": 0.6198201179504395,
"kl": 0.7996826171875,
"learning_rate": 4.681240017681994e-06,
"loss": 0.032,
"reward": 0.31822992861270905,
"reward_std": 0.7278983741998672,
"rewards/correctness_reward_func": 0.1562500074505806,
"rewards/int_reward_func": 0.2500000111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08802009373903275,
"step": 81
},
{
"completion_length": 94.79911041259766,
"epoch": 0.30711610486891383,
"grad_norm": 0.4967269003391266,
"kl": 0.791748046875,
"learning_rate": 4.665063509461098e-06,
"loss": 0.0317,
"reward": 0.37110715731978416,
"reward_std": 0.8040148764848709,
"rewards/correctness_reward_func": 0.2633928693830967,
"rewards/int_reward_func": 0.2477678693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1400535786524415,
"step": 82
},
{
"completion_length": 93.06696891784668,
"epoch": 0.31086142322097376,
"grad_norm": 0.5414807796478271,
"kl": 0.799072265625,
"learning_rate": 4.648516028753632e-06,
"loss": 0.032,
"reward": 0.3070870563387871,
"reward_std": 0.9167025238275528,
"rewards/correctness_reward_func": 0.2187500074505806,
"rewards/int_reward_func": 0.2500000149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.16166296042501926,
"step": 83
},
{
"completion_length": 85.48661231994629,
"epoch": 0.3146067415730337,
"grad_norm": 0.6058946251869202,
"kl": 0.8023681640625,
"learning_rate": 4.631600410885231e-06,
"loss": 0.0321,
"reward": 0.31676117703318596,
"reward_std": 0.8016230016946793,
"rewards/correctness_reward_func": 0.1785714365541935,
"rewards/int_reward_func": 0.2232142947614193,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08614062378183007,
"step": 84
},
{
"completion_length": 98.87947082519531,
"epoch": 0.31835205992509363,
"grad_norm": 0.5493951439857483,
"kl": 0.810302734375,
"learning_rate": 4.614319554259934e-06,
"loss": 0.0324,
"reward": 0.26373885199427605,
"reward_std": 0.7828188389539719,
"rewards/correctness_reward_func": 0.1428571492433548,
"rewards/int_reward_func": 0.2421875149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12130581215023994,
"step": 85
},
{
"completion_length": 98.88170051574707,
"epoch": 0.32209737827715357,
"grad_norm": 0.5060502290725708,
"kl": 0.8116455078125,
"learning_rate": 4.596676419863561e-06,
"loss": 0.0325,
"reward": 0.37905358523130417,
"reward_std": 0.7987204343080521,
"rewards/correctness_reward_func": 0.1964285783469677,
"rewards/int_reward_func": 0.266741082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08411608170717955,
"step": 86
},
{
"completion_length": 95.37723731994629,
"epoch": 0.3258426966292135,
"grad_norm": 0.45160311460494995,
"kl": 0.814697265625,
"learning_rate": 4.578674030756364e-06,
"loss": 0.0326,
"reward": 0.3752902075648308,
"reward_std": 0.7861279100179672,
"rewards/correctness_reward_func": 0.2142857238650322,
"rewards/int_reward_func": 0.2566964402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09569196694064885,
"step": 87
},
{
"completion_length": 88.16964530944824,
"epoch": 0.3295880149812734,
"grad_norm": 0.4520312249660492,
"kl": 0.8048095703125,
"learning_rate": 4.560315471555039e-06,
"loss": 0.0322,
"reward": 0.40060270577669144,
"reward_std": 0.827767089009285,
"rewards/correctness_reward_func": 0.2410714402794838,
"rewards/int_reward_func": 0.2399553656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0804241057485342,
"step": 88
},
{
"completion_length": 90.47098541259766,
"epoch": 0.3333333333333333,
"grad_norm": 0.4482274651527405,
"kl": 0.802490234375,
"learning_rate": 4.541603887904198e-06,
"loss": 0.0321,
"reward": 0.46391965448856354,
"reward_std": 0.8666775524616241,
"rewards/correctness_reward_func": 0.2812500149011612,
"rewards/int_reward_func": 0.2845982313156128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1019285786896944,
"step": 89
},
{
"completion_length": 99.05357360839844,
"epoch": 0.33707865168539325,
"grad_norm": 0.48688769340515137,
"kl": 0.8892822265625,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0356,
"reward": 0.32227010279893875,
"reward_std": 0.7231635600328445,
"rewards/correctness_reward_func": 0.1741071492433548,
"rewards/int_reward_func": 0.2455357238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09737277776002884,
"step": 90
},
{
"completion_length": 96.9062557220459,
"epoch": 0.3408239700374532,
"grad_norm": 0.713897168636322,
"kl": 0.83984375,
"learning_rate": 4.503134531727652e-06,
"loss": 0.0336,
"reward": 0.3822232261300087,
"reward_std": 0.8144369274377823,
"rewards/correctness_reward_func": 0.2455357275903225,
"rewards/int_reward_func": 0.258928582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1222410760819912,
"step": 91
},
{
"completion_length": 92.32589530944824,
"epoch": 0.3445692883895131,
"grad_norm": 0.48127949237823486,
"kl": 0.8839111328125,
"learning_rate": 4.4833833507280884e-06,
"loss": 0.0354,
"reward": 0.2896517887711525,
"reward_std": 0.8091708421707153,
"rewards/correctness_reward_func": 0.160714291036129,
"rewards/int_reward_func": 0.2455357238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11659822333604097,
"step": 92
},
{
"completion_length": 113.4062557220459,
"epoch": 0.34831460674157305,
"grad_norm": 0.5383365154266357,
"kl": 1.0166015625,
"learning_rate": 4.463292327201862e-06,
"loss": 0.0407,
"reward": 0.2778482399880886,
"reward_std": 0.7528630048036575,
"rewards/correctness_reward_func": 0.1607142947614193,
"rewards/int_reward_func": 0.2388393022119999,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12170535884797573,
"step": 93
},
{
"completion_length": 90.40179061889648,
"epoch": 0.352059925093633,
"grad_norm": 0.47072696685791016,
"kl": 0.8814697265625,
"learning_rate": 4.442864903642428e-06,
"loss": 0.0353,
"reward": 0.3879285827279091,
"reward_std": 0.7762220501899719,
"rewards/correctness_reward_func": 0.2321428693830967,
"rewards/int_reward_func": 0.2377232275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08193750027567148,
"step": 94
},
{
"completion_length": 100.53795051574707,
"epoch": 0.35580524344569286,
"grad_norm": 0.5055387020111084,
"kl": 0.9866943359375,
"learning_rate": 4.422104580183649e-06,
"loss": 0.0395,
"reward": 0.27919645234942436,
"reward_std": 0.8693763017654419,
"rewards/correctness_reward_func": 0.1875000074505806,
"rewards/int_reward_func": 0.2455357238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15383929014205933,
"step": 95
},
{
"completion_length": 98.15402221679688,
"epoch": 0.3595505617977528,
"grad_norm": 0.5205227732658386,
"kl": 1.0927734375,
"learning_rate": 4.401014914000078e-06,
"loss": 0.0437,
"reward": 0.30646876618266106,
"reward_std": 0.8004807382822037,
"rewards/correctness_reward_func": 0.1830357201397419,
"rewards/int_reward_func": 0.238839291036129,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11540625896304846,
"step": 96
},
{
"completion_length": 92.7410774230957,
"epoch": 0.36329588014981273,
"grad_norm": 0.5007465481758118,
"kl": 1.0205078125,
"learning_rate": 4.379599518697444e-06,
"loss": 0.0408,
"reward": 0.4242701083421707,
"reward_std": 0.900767520070076,
"rewards/correctness_reward_func": 0.2321428693830967,
"rewards/int_reward_func": 0.279017873108387,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08689062856137753,
"step": 97
},
{
"completion_length": 95.44196891784668,
"epoch": 0.36704119850187267,
"grad_norm": 0.7685222029685974,
"kl": 1.154541015625,
"learning_rate": 4.357862063693486e-06,
"loss": 0.0462,
"reward": 0.3419933207333088,
"reward_std": 0.7976544201374054,
"rewards/correctness_reward_func": 0.1919642947614193,
"rewards/int_reward_func": 0.2522321529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10220312792807817,
"step": 98
},
{
"completion_length": 97.05803871154785,
"epoch": 0.3707865168539326,
"grad_norm": 0.48678070306777954,
"kl": 1.0458984375,
"learning_rate": 4.335806273589214e-06,
"loss": 0.0418,
"reward": 0.32839956879615784,
"reward_std": 0.7194608449935913,
"rewards/correctness_reward_func": 0.1562500074505806,
"rewards/int_reward_func": 0.2488839402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07673437846824527,
"step": 99
},
{
"completion_length": 89.83259391784668,
"epoch": 0.37453183520599254,
"grad_norm": 0.4802840054035187,
"kl": 0.9891357421875,
"learning_rate": 4.313435927530719e-06,
"loss": 0.0396,
"reward": 0.323910728096962,
"reward_std": 0.8017723858356476,
"rewards/correctness_reward_func": 0.1741071492433548,
"rewards/int_reward_func": 0.2377232238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08791964408010244,
"step": 100
},
{
"completion_length": 92.61607551574707,
"epoch": 0.3782771535580524,
"grad_norm": 0.6329229474067688,
"kl": 0.9765625,
"learning_rate": 4.290754858561636e-06,
"loss": 0.0391,
"reward": 0.3065357282757759,
"reward_std": 0.7879298776388168,
"rewards/correctness_reward_func": 0.165178582072258,
"rewards/int_reward_func": 0.2678571566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12650001049041748,
"step": 101
},
{
"completion_length": 92.51116371154785,
"epoch": 0.38202247191011235,
"grad_norm": 0.5261896848678589,
"kl": 0.99658203125,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0399,
"reward": 0.3286317139863968,
"reward_std": 0.7215069979429245,
"rewards/correctness_reward_func": 0.1473214365541935,
"rewards/int_reward_func": 0.2611607275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07985044876113534,
"step": 102
},
{
"completion_length": 104.85937881469727,
"epoch": 0.3857677902621723,
"grad_norm": 0.4765637516975403,
"kl": 1.0252685546875,
"learning_rate": 4.244476149604201e-06,
"loss": 0.041,
"reward": 0.3370089456439018,
"reward_std": 0.7856772691011429,
"rewards/correctness_reward_func": 0.1785714402794838,
"rewards/int_reward_func": 0.2533482313156128,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09602678846567869,
"step": 103
},
{
"completion_length": 92.55804061889648,
"epoch": 0.3895131086142322,
"grad_norm": 0.5377345681190491,
"kl": 0.9796142578125,
"learning_rate": 4.220886439234385e-06,
"loss": 0.0392,
"reward": 0.3739665374159813,
"reward_std": 0.8817652761936188,
"rewards/correctness_reward_func": 0.2232142947614193,
"rewards/int_reward_func": 0.2354910857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0847388431429863,
"step": 104
},
{
"completion_length": 102.57366561889648,
"epoch": 0.39325842696629215,
"grad_norm": 0.5319781303405762,
"kl": 1.1024169921875,
"learning_rate": 4.197001863832355e-06,
"loss": 0.0441,
"reward": 0.33513617515563965,
"reward_std": 0.7641059011220932,
"rewards/correctness_reward_func": 0.1964285783469677,
"rewards/int_reward_func": 0.2544642984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11575670912861824,
"step": 105
},
{
"completion_length": 95.90625381469727,
"epoch": 0.3970037453183521,
"grad_norm": 0.5436156392097473,
"kl": 1.011474609375,
"learning_rate": 4.172826515897146e-06,
"loss": 0.0405,
"reward": 0.3867567144334316,
"reward_std": 0.7985697090625763,
"rewards/correctness_reward_func": 0.2142857275903225,
"rewards/int_reward_func": 0.2700893059372902,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.09873438253998756,
"step": 106
},
{
"completion_length": 89.44866561889648,
"epoch": 0.40074906367041196,
"grad_norm": 0.47593066096305847,
"kl": 1.074462890625,
"learning_rate": 4.1483645377501726e-06,
"loss": 0.043,
"reward": 0.36116072721779346,
"reward_std": 0.8109498172998428,
"rewards/correctness_reward_func": 0.19642857648432255,
"rewards/int_reward_func": 0.2332589365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.06852679001167417,
"step": 107
},
{
"completion_length": 98.08482551574707,
"epoch": 0.4044943820224719,
"grad_norm": 0.47593066096305847,
"kl": NaN,
"learning_rate": 4.1483645377501726e-06,
"loss": 0.042,
"reward": 0.3247567042708397,
"reward_std": 0.756167471408844,
"rewards/correctness_reward_func": 0.165178582072258,
"rewards/int_reward_func": 0.2488839402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08930580969899893,
"step": 108
},
{
"completion_length": 96.78571701049805,
"epoch": 0.40823970037453183,
"grad_norm": 0.46640312671661377,
"kl": 1.106689453125,
"learning_rate": 4.123620120825459e-06,
"loss": 0.0443,
"reward": 0.3182366043329239,
"reward_std": 0.8166698515415192,
"rewards/correctness_reward_func": 0.20982143469154835,
"rewards/int_reward_func": 0.2500000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.141584824770689,
"step": 109
},
{
"completion_length": 103.3504524230957,
"epoch": 0.41198501872659177,
"grad_norm": 0.4819093346595764,
"kl": 1.209228515625,
"learning_rate": 4.098597504951462e-06,
"loss": 0.0484,
"reward": 0.45162054151296616,
"reward_std": 0.9206108599901199,
"rewards/correctness_reward_func": 0.3080357313156128,
"rewards/int_reward_func": 0.2801339402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1365491133183241,
"step": 110
},
{
"completion_length": 95.90402412414551,
"epoch": 0.4157303370786517,
"grad_norm": 0.48896247148513794,
"kl": 1.0751953125,
"learning_rate": 4.073300977624594e-06,
"loss": 0.043,
"reward": 0.2652589473873377,
"reward_std": 0.7796717882156372,
"rewards/correctness_reward_func": 0.160714291036129,
"rewards/int_reward_func": 0.2533482201397419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14880357310175896,
"step": 111
},
{
"completion_length": 90.68527221679688,
"epoch": 0.41947565543071164,
"grad_norm": 0.4631924033164978,
"kl": 1.094970703125,
"learning_rate": 4.047734873274586e-06,
"loss": 0.0438,
"reward": 0.35960714891552925,
"reward_std": 0.7207369059324265,
"rewards/correctness_reward_func": 0.17410715529695153,
"rewards/int_reward_func": 0.2845982238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09909821674227715,
"step": 112
},
{
"completion_length": 97.99330711364746,
"epoch": 0.4232209737827715,
"grad_norm": 0.5063531398773193,
"kl": 1.1484375,
"learning_rate": 4.021903572521802e-06,
"loss": 0.0459,
"reward": 0.41493305563926697,
"reward_std": 0.8120662122964859,
"rewards/correctness_reward_func": 0.2410714402794838,
"rewards/int_reward_func": 0.2566964365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.08395089209079742,
"step": 113
},
{
"completion_length": 85.47991561889648,
"epoch": 0.42696629213483145,
"grad_norm": 0.4622070789337158,
"kl": 1.0574951171875,
"learning_rate": 3.995811501426648e-06,
"loss": 0.0423,
"reward": 0.3168504536151886,
"reward_std": 0.7642460912466049,
"rewards/correctness_reward_func": 0.1875000149011612,
"rewards/int_reward_func": 0.2633928656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1340424194931984,
"step": 114
},
{
"completion_length": 94.50000381469727,
"epoch": 0.4307116104868914,
"grad_norm": 0.4713800251483917,
"kl": 1.068359375,
"learning_rate": 3.969463130731183e-06,
"loss": 0.0427,
"reward": 0.3659776858985424,
"reward_std": 0.891373872756958,
"rewards/correctness_reward_func": 0.2678571492433548,
"rewards/int_reward_func": 0.2444196566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14629911817610264,
"step": 115
},
{
"completion_length": 96.42187881469727,
"epoch": 0.4344569288389513,
"grad_norm": 0.527915894985199,
"kl": 1.056884765625,
"learning_rate": 3.942862975093085e-06,
"loss": 0.0423,
"reward": 0.36224332079291344,
"reward_std": 0.8091815561056137,
"rewards/correctness_reward_func": 0.2232142984867096,
"rewards/int_reward_func": 0.2477678693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1087388452142477,
"step": 116
},
{
"completion_length": 95.6093807220459,
"epoch": 0.43820224719101125,
"grad_norm": 0.5208942294120789,
"kl": 1.090576171875,
"learning_rate": 3.916015592312083e-06,
"loss": 0.0436,
"reward": 0.27370089665055275,
"reward_std": 0.8227901756763458,
"rewards/correctness_reward_func": 0.1741071529686451,
"rewards/int_reward_func": 0.254464291036129,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15487053990364075,
"step": 117
},
{
"completion_length": 105.62054061889648,
"epoch": 0.4419475655430712,
"grad_norm": 0.47664541006088257,
"kl": 1.18017578125,
"learning_rate": 3.888925582549006e-06,
"loss": 0.0472,
"reward": 0.28062277287244797,
"reward_std": 0.8103707134723663,
"rewards/correctness_reward_func": 0.1830357201397419,
"rewards/int_reward_func": 0.2455357201397419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14794866368174553,
"step": 118
},
{
"completion_length": 94.9598274230957,
"epoch": 0.44569288389513106,
"grad_norm": 0.5299546122550964,
"kl": 1.0885009765625,
"learning_rate": 3.861597587537568e-06,
"loss": 0.0435,
"reward": 0.2977410815656185,
"reward_std": 0.7406027764081955,
"rewards/correctness_reward_func": 0.1517857201397419,
"rewards/int_reward_func": 0.2254464402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07949107605963945,
"step": 119
},
{
"completion_length": 98.04911422729492,
"epoch": 0.449438202247191,
"grad_norm": 0.4211517870426178,
"kl": 1.2236328125,
"learning_rate": 3.83403628978903e-06,
"loss": 0.0489,
"reward": 0.2801852785050869,
"reward_std": 0.8073955476284027,
"rewards/correctness_reward_func": 0.1607142947614193,
"rewards/int_reward_func": 0.247767873108387,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12829688470810652,
"step": 120
},
{
"completion_length": 96.47098731994629,
"epoch": 0.45318352059925093,
"grad_norm": 0.4674068093299866,
"kl": 1.210693359375,
"learning_rate": 3.806246411789872e-06,
"loss": 0.0484,
"reward": 0.35595760494470596,
"reward_std": 0.8347803801298141,
"rewards/correctness_reward_func": 0.2098214402794838,
"rewards/int_reward_func": 0.2544642947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10832812543958426,
"step": 121
},
{
"completion_length": 97.76116561889648,
"epoch": 0.45692883895131087,
"grad_norm": 0.7582751512527466,
"kl": 1.22021484375,
"learning_rate": 3.77823271519263e-06,
"loss": 0.0488,
"reward": 0.35320091247558594,
"reward_std": 0.7386835068464279,
"rewards/correctness_reward_func": 0.1964285746216774,
"rewards/int_reward_func": 0.2600446529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10327232570853084,
"step": 122
},
{
"completion_length": 95.66071891784668,
"epoch": 0.4606741573033708,
"grad_norm": 0.7582751512527466,
"kl": NaN,
"learning_rate": 3.77823271519263e-06,
"loss": 0.046,
"reward": 0.31566742807626724,
"reward_std": 0.8195231109857559,
"rewards/correctness_reward_func": 0.1696428656578064,
"rewards/int_reward_func": 0.2366071492433548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09058259081211872,
"step": 123
},
{
"completion_length": 93.24330711364746,
"epoch": 0.46441947565543074,
"grad_norm": 0.49510565400123596,
"kl": 1.078125,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0431,
"reward": 0.3665379509329796,
"reward_std": 0.9554053395986557,
"rewards/correctness_reward_func": 0.2366071566939354,
"rewards/int_reward_func": 0.2622767947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1323459828272462,
"step": 124
},
{
"completion_length": 97.34152221679688,
"epoch": 0.4681647940074906,
"grad_norm": 0.6952568888664246,
"kl": 1.18701171875,
"learning_rate": 3.721553103742388e-06,
"loss": 0.0475,
"reward": 0.34241294860839844,
"reward_std": 0.8731215000152588,
"rewards/correctness_reward_func": 0.2053571529686451,
"rewards/int_reward_func": 0.2678571566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13080134615302086,
"step": 125
},
{
"completion_length": 99.09821891784668,
"epoch": 0.47191011235955055,
"grad_norm": 0.46382632851600647,
"kl": 1.170654296875,
"learning_rate": 3.6928969006490212e-06,
"loss": 0.0468,
"reward": 0.31666965037584305,
"reward_std": 0.7566796094179153,
"rewards/correctness_reward_func": 0.1517857201397419,
"rewards/int_reward_func": 0.2723214440047741,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10743750259280205,
"step": 126
},
{
"completion_length": 91.62277412414551,
"epoch": 0.4756554307116105,
"grad_norm": 0.47625091671943665,
"kl": 0.992431640625,
"learning_rate": 3.664036300812779e-06,
"loss": 0.0397,
"reward": 0.3642299249768257,
"reward_std": 0.951588049530983,
"rewards/correctness_reward_func": 0.2410714402794838,
"rewards/int_reward_func": 0.2712053656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14804688468575478,
"step": 127
},
{
"completion_length": 100.61607360839844,
"epoch": 0.4794007490636704,
"grad_norm": 0.5847791433334351,
"kl": 1.225830078125,
"learning_rate": 3.634976249348867e-06,
"loss": 0.049,
"reward": 0.42196429520845413,
"reward_std": 0.8636786490678787,
"rewards/correctness_reward_func": 0.2455357238650322,
"rewards/int_reward_func": 0.2533482201397419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07691965182311833,
"step": 128
},
{
"completion_length": 98.10714530944824,
"epoch": 0.48314606741573035,
"grad_norm": 0.44887715578079224,
"kl": 1.186767578125,
"learning_rate": 3.6057217255475034e-06,
"loss": 0.0475,
"reward": 0.2306763455271721,
"reward_std": 0.745768278837204,
"rewards/correctness_reward_func": 0.1339285783469677,
"rewards/int_reward_func": 0.2399553619325161,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14320759288966656,
"step": 129
},
{
"completion_length": 92.93750381469727,
"epoch": 0.4868913857677903,
"grad_norm": 0.44887715578079224,
"kl": NaN,
"learning_rate": 3.6057217255475034e-06,
"loss": 0.0481,
"reward": 0.2954799123108387,
"reward_std": 0.7976376265287399,
"rewards/correctness_reward_func": 0.1517857238650322,
"rewards/int_reward_func": 0.2566964328289032,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11300223972648382,
"step": 130
},
{
"completion_length": 102.15178871154785,
"epoch": 0.49063670411985016,
"grad_norm": 0.417877733707428,
"kl": 1.196044921875,
"learning_rate": 3.5762777420207382e-06,
"loss": 0.0478,
"reward": 0.29901787638664246,
"reward_std": 0.7817905694246292,
"rewards/correctness_reward_func": 0.1785714328289032,
"rewards/int_reward_func": 0.2656250149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14517857134342194,
"step": 131
},
{
"completion_length": 99.67187881469727,
"epoch": 0.4943820224719101,
"grad_norm": 0.492396742105484,
"kl": 1.19775390625,
"learning_rate": 3.5466493438435707e-06,
"loss": 0.0479,
"reward": 0.2629486694931984,
"reward_std": 0.8080793470144272,
"rewards/correctness_reward_func": 0.1562500074505806,
"rewards/int_reward_func": 0.2645089402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1578102707862854,
"step": 132
},
{
"completion_length": 97.48214721679688,
"epoch": 0.49812734082397003,
"grad_norm": 0.5177117586135864,
"kl": 1.177490234375,
"learning_rate": 3.516841607689501e-06,
"loss": 0.0471,
"reward": 0.2645267955958843,
"reward_std": 0.7447800785303116,
"rewards/correctness_reward_func": 0.1562500074505806,
"rewards/int_reward_func": 0.2455357275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13725892454385757,
"step": 133
},
{
"completion_length": 95.82366561889648,
"epoch": 0.50187265917603,
"grad_norm": 0.5745740532875061,
"kl": 1.138916015625,
"learning_rate": 3.486859640960668e-06,
"loss": 0.0456,
"reward": 0.2982388660311699,
"reward_std": 0.8590549826622009,
"rewards/correctness_reward_func": 0.1785714328289032,
"rewards/int_reward_func": 0.2522321492433548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1325647421181202,
"step": 134
},
{
"completion_length": 91.2567024230957,
"epoch": 0.5056179775280899,
"grad_norm": 0.6395649313926697,
"kl": 1.1143798828125,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.0446,
"reward": 0.3479754514992237,
"reward_std": 0.8721490353345871,
"rewards/correctness_reward_func": 0.2008928693830967,
"rewards/int_reward_func": 0.2466517947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09956920135300606,
"step": 135
},
{
"completion_length": 92.41295051574707,
"epoch": 0.5093632958801498,
"grad_norm": 0.6169349551200867,
"kl": 1.121826171875,
"learning_rate": 3.426393593774591e-06,
"loss": 0.0449,
"reward": 0.26565179601311684,
"reward_std": 0.7934366017580032,
"rewards/correctness_reward_func": 0.1339285783469677,
"rewards/int_reward_func": 0.2433035857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11158036068081856,
"step": 136
},
{
"completion_length": 96.50223541259766,
"epoch": 0.5131086142322098,
"grad_norm": 0.5668321251869202,
"kl": 1.08935546875,
"learning_rate": 3.39591987386325e-06,
"loss": 0.0436,
"reward": 0.32920314325019717,
"reward_std": 0.7672218978404999,
"rewards/correctness_reward_func": 0.20982143515720963,
"rewards/int_reward_func": 0.2421875074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12280581146478653,
"step": 137
},
{
"completion_length": 94.98661231994629,
"epoch": 0.5168539325842697,
"grad_norm": 0.5555436611175537,
"kl": 1.1796875,
"learning_rate": 3.3652926426937327e-06,
"loss": 0.0472,
"reward": 0.3373348340392113,
"reward_std": 0.8585019558668137,
"rewards/correctness_reward_func": 0.1964285783469677,
"rewards/int_reward_func": 0.2566964440047741,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1157901817932725,
"step": 138
},
{
"completion_length": 100.43527221679688,
"epoch": 0.5205992509363296,
"grad_norm": 0.5975056290626526,
"kl": 1.116455078125,
"learning_rate": 3.3345171480844275e-06,
"loss": 0.0447,
"reward": 0.33340851217508316,
"reward_std": 0.8432840257883072,
"rewards/correctness_reward_func": 0.196428582072258,
"rewards/int_reward_func": 0.2488839402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11190402135252953,
"step": 139
},
{
"completion_length": 110.37946701049805,
"epoch": 0.5243445692883895,
"grad_norm": 0.5592173337936401,
"kl": 1.2734375,
"learning_rate": 3.303598663257904e-06,
"loss": 0.0509,
"reward": 0.3411696571856737,
"reward_std": 0.8827303797006607,
"rewards/correctness_reward_func": 0.2232142947614193,
"rewards/int_reward_func": 0.2600446529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14208929613232613,
"step": 140
},
{
"completion_length": 104.59375381469727,
"epoch": 0.5280898876404494,
"grad_norm": 0.5417644381523132,
"kl": 1.274169921875,
"learning_rate": 3.272542485937369e-06,
"loss": 0.051,
"reward": 0.367312528192997,
"reward_std": 0.7450851798057556,
"rewards/correctness_reward_func": 0.20089286752045155,
"rewards/int_reward_func": 0.2578125149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09139286354184151,
"step": 141
},
{
"completion_length": 86.5714340209961,
"epoch": 0.5318352059925093,
"grad_norm": 0.5099084973335266,
"kl": 1.106689453125,
"learning_rate": 3.2413539374389275e-06,
"loss": 0.0443,
"reward": 0.41947099566459656,
"reward_std": 0.7897130697965622,
"rewards/correctness_reward_func": 0.2187500111758709,
"rewards/int_reward_func": 0.258928582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.05820759106427431,
"step": 142
},
{
"completion_length": 97.21875381469727,
"epoch": 0.5355805243445693,
"grad_norm": 0.41567039489746094,
"kl": 1.190185546875,
"learning_rate": 3.2100383617598075e-06,
"loss": 0.0476,
"reward": 0.2790111724752933,
"reward_std": 0.7919286489486694,
"rewards/correctness_reward_func": 0.16964286752045155,
"rewards/int_reward_func": 0.2444196566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13505134359002113,
"step": 143
},
{
"completion_length": 92.24553871154785,
"epoch": 0.5393258426966292,
"grad_norm": 0.5126021504402161,
"kl": 1.135009765625,
"learning_rate": 3.1786011246626858e-06,
"loss": 0.0454,
"reward": 0.33713172376155853,
"reward_std": 0.759757861495018,
"rewards/correctness_reward_func": 0.1741071492433548,
"rewards/int_reward_func": 0.2578125074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.09590401872992516,
"step": 144
},
{
"completion_length": 92.36384391784668,
"epoch": 0.5430711610486891,
"grad_norm": 0.4743898808956146,
"kl": 1.125732421875,
"learning_rate": 3.147047612756302e-06,
"loss": 0.045,
"reward": 0.3338058143854141,
"reward_std": 0.7495080679655075,
"rewards/correctness_reward_func": 0.1741071492433548,
"rewards/int_reward_func": 0.2399553656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.0813727667555213,
"step": 145
},
{
"completion_length": 96.77902030944824,
"epoch": 0.5468164794007491,
"grad_norm": 0.5349624156951904,
"kl": 1.19140625,
"learning_rate": 3.115383232572483e-06,
"loss": 0.0476,
"reward": 0.3870870769023895,
"reward_std": 0.8124971240758896,
"rewards/correctness_reward_func": 0.2232142984867096,
"rewards/int_reward_func": 0.2533482275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08947544917464256,
"step": 146
},
{
"completion_length": 100.25670051574707,
"epoch": 0.550561797752809,
"grad_norm": 0.3959732949733734,
"kl": 1.1826171875,
"learning_rate": 3.0836134096397642e-06,
"loss": 0.0473,
"reward": 0.3632299154996872,
"reward_std": 0.8887846767902374,
"rewards/correctness_reward_func": 0.2053571529686451,
"rewards/int_reward_func": 0.266741082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10886830929666758,
"step": 147
},
{
"completion_length": 95.57366561889648,
"epoch": 0.5543071161048689,
"grad_norm": 0.5033745765686035,
"kl": 1.179931640625,
"learning_rate": 3.051743587553754e-06,
"loss": 0.0472,
"reward": 0.33835939317941666,
"reward_std": 0.7949195951223373,
"rewards/correctness_reward_func": 0.1607142947614193,
"rewards/int_reward_func": 0.2466517947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.07012277067406103,
"step": 148
},
{
"completion_length": 88.57143211364746,
"epoch": 0.5580524344569289,
"grad_norm": 0.4301639795303345,
"kl": 1.153564453125,
"learning_rate": 3.019779227044398e-06,
"loss": 0.0462,
"reward": 0.26906250417232513,
"reward_std": 0.7477044612169266,
"rewards/correctness_reward_func": 0.1607142947614193,
"rewards/int_reward_func": 0.2578125037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14946428686380386,
"step": 149
},
{
"completion_length": 104.3437557220459,
"epoch": 0.5617977528089888,
"grad_norm": 0.49258914589881897,
"kl": 1.2353515625,
"learning_rate": 2.9877258050403214e-06,
"loss": 0.0494,
"reward": 0.3209241144359112,
"reward_std": 0.821040615439415,
"rewards/correctness_reward_func": 0.1919642947614193,
"rewards/int_reward_func": 0.2444196529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11545982770621777,
"step": 150
},
{
"completion_length": 97.84821891784668,
"epoch": 0.5655430711610487,
"grad_norm": 0.6680523753166199,
"kl": 1.210205078125,
"learning_rate": 2.9555888137303695e-06,
"loss": 0.0484,
"reward": 0.4514397457242012,
"reward_std": 0.8471043556928635,
"rewards/correctness_reward_func": 0.2455357238650322,
"rewards/int_reward_func": 0.2767857238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07088169828057289,
"step": 151
},
{
"completion_length": 95.99777221679688,
"epoch": 0.5692883895131086,
"grad_norm": 0.44427061080932617,
"kl": 1.162353515625,
"learning_rate": 2.9233737596225616e-06,
"loss": 0.0465,
"reward": 0.3017522394657135,
"reward_std": 0.7615446895360947,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.2600446566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12347099208272994,
"step": 152
},
{
"completion_length": 89.79687881469727,
"epoch": 0.5730337078651685,
"grad_norm": 0.3936914801597595,
"kl": 1.1171875,
"learning_rate": 2.8910861626005774e-06,
"loss": 0.0447,
"reward": 0.4248616322875023,
"reward_std": 0.8152914345264435,
"rewards/correctness_reward_func": 0.2410714328289032,
"rewards/int_reward_func": 0.286830373108387,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10304017923772335,
"step": 153
},
{
"completion_length": 90.57589530944824,
"epoch": 0.5767790262172284,
"grad_norm": 0.43302392959594727,
"kl": 1.1453857421875,
"learning_rate": 2.858731554977948e-06,
"loss": 0.0458,
"reward": 0.3075290396809578,
"reward_std": 0.7411210238933563,
"rewards/correctness_reward_func": 0.16071429289877415,
"rewards/int_reward_func": 0.2410714402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09425670048221946,
"step": 154
},
{
"completion_length": 88.81920051574707,
"epoch": 0.5805243445692884,
"grad_norm": 0.4523748457431793,
"kl": 1.1572265625,
"learning_rate": 2.82631548055013e-06,
"loss": 0.0463,
"reward": 0.3362343907356262,
"reward_std": 0.8344388753175735,
"rewards/correctness_reward_func": 0.2232142947614193,
"rewards/int_reward_func": 0.2220982238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10907812882214785,
"step": 155
},
{
"completion_length": 92.31920051574707,
"epoch": 0.5842696629213483,
"grad_norm": 0.7492924928665161,
"kl": 1.13818359375,
"learning_rate": 2.7938434936445946e-06,
"loss": 0.0455,
"reward": 0.2696942128241062,
"reward_std": 0.7281434237957001,
"rewards/correctness_reward_func": 0.1785714365541935,
"rewards/int_reward_func": 0.2421875074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1510647376999259,
"step": 156
},
{
"completion_length": 91.05580711364746,
"epoch": 0.5880149812734082,
"grad_norm": 0.5227993130683899,
"kl": 1.126220703125,
"learning_rate": 2.761321158169134e-06,
"loss": 0.045,
"reward": 0.2759977802634239,
"reward_std": 0.9335441738367081,
"rewards/correctness_reward_func": 0.2098214402794838,
"rewards/int_reward_func": 0.2388392984867096,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.17377902194857597,
"step": 157
},
{
"completion_length": 95.73437881469727,
"epoch": 0.5917602996254682,
"grad_norm": 0.462187260389328,
"kl": 1.1458740234375,
"learning_rate": 2.7287540466585067e-06,
"loss": 0.0458,
"reward": 0.3727143071591854,
"reward_std": 0.8857483267784119,
"rewards/correctness_reward_func": 0.2544642947614193,
"rewards/int_reward_func": 0.2656250111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14737500809133053,
"step": 158
},
{
"completion_length": 93.68750381469727,
"epoch": 0.5955056179775281,
"grad_norm": 0.5321673154830933,
"kl": 1.202392578125,
"learning_rate": 2.696147739319613e-06,
"loss": 0.0481,
"reward": 0.41518306732177734,
"reward_std": 0.7417058497667313,
"rewards/correctness_reward_func": 0.2276785783469677,
"rewards/int_reward_func": 0.2678571492433548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08035267796367407,
"step": 159
},
{
"completion_length": 93.1093807220459,
"epoch": 0.599250936329588,
"grad_norm": 0.4620046615600586,
"kl": 1.119873046875,
"learning_rate": 2.663507823075358e-06,
"loss": 0.0448,
"reward": 0.32153796777129173,
"reward_std": 0.8575054854154587,
"rewards/correctness_reward_func": 0.2053571529686451,
"rewards/int_reward_func": 0.2500000149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13381919264793396,
"step": 160
},
{
"completion_length": 100.61161041259766,
"epoch": 0.602996254681648,
"grad_norm": 0.4500925838947296,
"kl": 1.206298828125,
"learning_rate": 2.6308398906073603e-06,
"loss": 0.0483,
"reward": 0.3319799229502678,
"reward_std": 0.7600451558828354,
"rewards/correctness_reward_func": 0.1830357201397419,
"rewards/int_reward_func": 0.2544642947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10552009008824825,
"step": 161
},
{
"completion_length": 97.59598731994629,
"epoch": 0.6067415730337079,
"grad_norm": 0.4500925838947296,
"kl": NaN,
"learning_rate": 2.6308398906073603e-06,
"loss": 0.0447,
"reward": 0.2094486728310585,
"reward_std": 0.7504701465368271,
"rewards/correctness_reward_func": 0.12500000558793545,
"rewards/int_reward_func": 0.2321428693830967,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14881027303636074,
"step": 162
},
{
"completion_length": 94.28125381469727,
"epoch": 0.6104868913857678,
"grad_norm": 0.4558067321777344,
"kl": 1.154296875,
"learning_rate": 2.5981495393976718e-06,
"loss": 0.0462,
"reward": 0.26968081295490265,
"reward_std": 0.8480332493782043,
"rewards/correctness_reward_func": 0.1562500074505806,
"rewards/int_reward_func": 0.2410714440047741,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1287567038089037,
"step": 163
},
{
"completion_length": 98.24107551574707,
"epoch": 0.6142322097378277,
"grad_norm": 0.4498756229877472,
"kl": 1.177734375,
"learning_rate": 2.5654423707696834e-06,
"loss": 0.0471,
"reward": 0.42699556052684784,
"reward_std": 0.8207688927650452,
"rewards/correctness_reward_func": 0.2544642984867096,
"rewards/int_reward_func": 0.2667410783469677,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09420982468873262,
"step": 164
},
{
"completion_length": 93.59152221679688,
"epoch": 0.6179775280898876,
"grad_norm": 0.40786251425743103,
"kl": 1.1708984375,
"learning_rate": 2.5327239889283613e-06,
"loss": 0.0468,
"reward": 0.28490403294563293,
"reward_std": 0.7075008153915405,
"rewards/correctness_reward_func": 0.1562500037252903,
"rewards/int_reward_func": 0.2399553693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11130134668201208,
"step": 165
},
{
"completion_length": 97.06920051574707,
"epoch": 0.6217228464419475,
"grad_norm": 0.4559285640716553,
"kl": 1.146484375,
"learning_rate": 2.5e-06,
"loss": 0.0459,
"reward": 0.36037053912878036,
"reward_std": 0.7710148096084595,
"rewards/correctness_reward_func": 0.2098214365541935,
"rewards/int_reward_func": 0.2488839328289032,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09833482303656638,
"step": 166
},
{
"completion_length": 102.25223731994629,
"epoch": 0.6254681647940075,
"grad_norm": 0.4517359137535095,
"kl": 1.250732421875,
"learning_rate": 2.4672760110716395e-06,
"loss": 0.05,
"reward": 0.3416629731655121,
"reward_std": 0.7625188678503036,
"rewards/correctness_reward_func": 0.2098214402794838,
"rewards/int_reward_func": 0.2600446529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12820313312113285,
"step": 167
},
{
"completion_length": 106.65402412414551,
"epoch": 0.6292134831460674,
"grad_norm": 0.5974560379981995,
"kl": 1.312255859375,
"learning_rate": 2.434557629230318e-06,
"loss": 0.0525,
"reward": 0.3028549253940582,
"reward_std": 0.6672599911689758,
"rewards/correctness_reward_func": 0.12946429336443543,
"rewards/int_reward_func": 0.2622767947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08888616785407066,
"step": 168
},
{
"completion_length": 101.56696701049805,
"epoch": 0.6329588014981273,
"grad_norm": 0.5514973402023315,
"kl": 1.394287109375,
"learning_rate": 2.4018504606023295e-06,
"loss": 0.0558,
"reward": 0.3376808315515518,
"reward_std": 0.7231378108263016,
"rewards/correctness_reward_func": 0.1696428693830967,
"rewards/int_reward_func": 0.2555803619325161,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08754241955466568,
"step": 169
},
{
"completion_length": 96.87054061889648,
"epoch": 0.6367041198501873,
"grad_norm": 0.47777560353279114,
"kl": 1.227783203125,
"learning_rate": 2.3691601093926406e-06,
"loss": 0.0491,
"reward": 0.33633705973625183,
"reward_std": 0.7033251821994781,
"rewards/correctness_reward_func": 0.1696428656578064,
"rewards/int_reward_func": 0.2500000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.08442188054323196,
"step": 170
},
{
"completion_length": 98.07366371154785,
"epoch": 0.6404494382022472,
"grad_norm": 0.47777560353279114,
"kl": NaN,
"learning_rate": 2.3691601093926406e-06,
"loss": 0.0531,
"reward": 0.35397323966026306,
"reward_std": 0.8121795952320099,
"rewards/correctness_reward_func": 0.196428582072258,
"rewards/int_reward_func": 0.2600446492433548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10250000539235771,
"step": 171
},
{
"completion_length": 95.59821891784668,
"epoch": 0.6441947565543071,
"grad_norm": 0.47777560353279114,
"kl": NaN,
"learning_rate": 2.3691601093926406e-06,
"loss": 0.053,
"reward": 0.43456026911735535,
"reward_std": 0.9222677648067474,
"rewards/correctness_reward_func": 0.2857142984867096,
"rewards/int_reward_func": 0.2633928768336773,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1145468857139349,
"step": 172
},
{
"completion_length": 90.60714721679688,
"epoch": 0.6479400749063671,
"grad_norm": 0.4840092957019806,
"kl": 1.194580078125,
"learning_rate": 2.3364921769246423e-06,
"loss": 0.0478,
"reward": 0.27813393622636795,
"reward_std": 0.899843841791153,
"rewards/correctness_reward_func": 0.2053571529686451,
"rewards/int_reward_func": 0.2455357238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.17275893315672874,
"step": 173
},
{
"completion_length": 97.23214912414551,
"epoch": 0.651685393258427,
"grad_norm": 0.5476865172386169,
"kl": 1.3369140625,
"learning_rate": 2.3038522606803882e-06,
"loss": 0.0535,
"reward": 0.3434709906578064,
"reward_std": 0.7968785911798477,
"rewards/correctness_reward_func": 0.1919642947614193,
"rewards/int_reward_func": 0.2566964402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10518973506987095,
"step": 174
},
{
"completion_length": 100.55357551574707,
"epoch": 0.6554307116104869,
"grad_norm": 0.44576528668403625,
"kl": 1.319091796875,
"learning_rate": 2.271245953341494e-06,
"loss": 0.0528,
"reward": 0.3407433070242405,
"reward_std": 0.7602152675390244,
"rewards/correctness_reward_func": 0.160714291036129,
"rewards/int_reward_func": 0.2488839440047741,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0688549093902111,
"step": 175
},
{
"completion_length": 94.70089721679688,
"epoch": 0.6591760299625468,
"grad_norm": 0.4624294340610504,
"kl": 1.206298828125,
"learning_rate": 2.238678841830867e-06,
"loss": 0.0483,
"reward": 0.3417031392455101,
"reward_std": 0.8133516311645508,
"rewards/correctness_reward_func": 0.1919642984867096,
"rewards/int_reward_func": 0.258928582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10918973386287689,
"step": 176
},
{
"completion_length": 90.06473541259766,
"epoch": 0.6629213483146067,
"grad_norm": 0.4517357349395752,
"kl": 1.264404296875,
"learning_rate": 2.2061565063554063e-06,
"loss": 0.0506,
"reward": 0.2323437575250864,
"reward_std": 0.7963760495185852,
"rewards/correctness_reward_func": 0.1428571492433548,
"rewards/int_reward_func": 0.2388392984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14935269264969975,
"step": 177
},
{
"completion_length": 95.54464721679688,
"epoch": 0.6666666666666666,
"grad_norm": 0.4517357349395752,
"kl": NaN,
"learning_rate": 2.2061565063554063e-06,
"loss": 0.0465,
"reward": 0.33185046166181564,
"reward_std": 0.7673598080873489,
"rewards/correctness_reward_func": 0.15625000558793545,
"rewards/int_reward_func": 0.2633928693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08779241424053907,
"step": 178
},
{
"completion_length": 95.11830711364746,
"epoch": 0.6704119850187266,
"grad_norm": 0.4517357349395752,
"kl": NaN,
"learning_rate": 2.2061565063554063e-06,
"loss": 0.0485,
"reward": 0.3244776912033558,
"reward_std": 0.7535363733768463,
"rewards/correctness_reward_func": 0.15178572200238705,
"rewards/int_reward_func": 0.2533482201397419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08065625140443444,
"step": 179
},
{
"completion_length": 92.10937881469727,
"epoch": 0.6741573033707865,
"grad_norm": 0.43252718448638916,
"kl": 1.205078125,
"learning_rate": 2.173684519449872e-06,
"loss": 0.0482,
"reward": 0.3435089588165283,
"reward_std": 0.6813161820173264,
"rewards/correctness_reward_func": 0.1919642984867096,
"rewards/int_reward_func": 0.2354910895228386,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08394643478095531,
"step": 180
},
{
"completion_length": 92.46652221679688,
"epoch": 0.6779026217228464,
"grad_norm": 0.4764062166213989,
"kl": 1.170166015625,
"learning_rate": 2.1412684450220524e-06,
"loss": 0.0468,
"reward": 0.40853575617074966,
"reward_std": 0.8349853605031967,
"rewards/correctness_reward_func": 0.2500000111758709,
"rewards/int_reward_func": 0.2600446566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10150893498212099,
"step": 181
},
{
"completion_length": 97.13616371154785,
"epoch": 0.6816479400749064,
"grad_norm": 0.4394303262233734,
"kl": 1.2177734375,
"learning_rate": 2.1089138373994226e-06,
"loss": 0.0487,
"reward": 0.3534955531358719,
"reward_std": 0.7782185822725296,
"rewards/correctness_reward_func": 0.2008928693830967,
"rewards/int_reward_func": 0.271205373108387,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11860269121825695,
"step": 182
},
{
"completion_length": 93.8035774230957,
"epoch": 0.6853932584269663,
"grad_norm": 0.4350711405277252,
"kl": 1.158935546875,
"learning_rate": 2.0766262403774388e-06,
"loss": 0.0464,
"reward": 0.29088394716382027,
"reward_std": 0.8285562247037888,
"rewards/correctness_reward_func": 0.1741071455180645,
"rewards/int_reward_func": 0.2321428656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11536607332527637,
"step": 183
},
{
"completion_length": 102.04687881469727,
"epoch": 0.6891385767790262,
"grad_norm": 0.5000995993614197,
"kl": 1.3017578125,
"learning_rate": 2.0444111862696313e-06,
"loss": 0.0521,
"reward": 0.33771875873208046,
"reward_std": 0.786424919962883,
"rewards/correctness_reward_func": 0.1785714328289032,
"rewards/int_reward_func": 0.2645089328289032,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10536161065101624,
"step": 184
},
{
"completion_length": 84.60714530944824,
"epoch": 0.6928838951310862,
"grad_norm": 0.4194977283477783,
"kl": 1.094482421875,
"learning_rate": 2.01227419495968e-06,
"loss": 0.0438,
"reward": 0.2823236584663391,
"reward_std": 0.8382576406002045,
"rewards/correctness_reward_func": 0.1741071529686451,
"rewards/int_reward_func": 0.2343750149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1261584870517254,
"step": 185
},
{
"completion_length": 101.93527221679688,
"epoch": 0.6966292134831461,
"grad_norm": 0.5226723551750183,
"kl": 1.428466796875,
"learning_rate": 1.9802207729556023e-06,
"loss": 0.0571,
"reward": 0.3646428808569908,
"reward_std": 0.7940028458833694,
"rewards/correctness_reward_func": 0.20089286752045155,
"rewards/int_reward_func": 0.2645089402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1007589353248477,
"step": 186
},
{
"completion_length": 89.75893211364746,
"epoch": 0.700374531835206,
"grad_norm": 0.5314286351203918,
"kl": 1.265625,
"learning_rate": 1.9482564124462478e-06,
"loss": 0.0506,
"reward": 0.34281474351882935,
"reward_std": 0.7245265394449234,
"rewards/correctness_reward_func": 0.1875000111758709,
"rewards/int_reward_func": 0.2500000111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09468527138233185,
"step": 187
},
{
"completion_length": 89.30357360839844,
"epoch": 0.704119850187266,
"grad_norm": 0.4597249925136566,
"kl": 1.21875,
"learning_rate": 1.9163865903602374e-06,
"loss": 0.0488,
"reward": 0.3371138572692871,
"reward_std": 0.7971315979957581,
"rewards/correctness_reward_func": 0.1964285783469677,
"rewards/int_reward_func": 0.2410714328289032,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10038616601377726,
"step": 188
},
{
"completion_length": 98.16071891784668,
"epoch": 0.7078651685393258,
"grad_norm": 0.4473523497581482,
"kl": 1.35888671875,
"learning_rate": 1.8846167674275175e-06,
"loss": 0.0544,
"reward": 0.2626696489751339,
"reward_std": 0.6907573491334915,
"rewards/correctness_reward_func": 0.1383928619325161,
"rewards/int_reward_func": 0.2555803619325161,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13130357582122087,
"step": 189
},
{
"completion_length": 102.61830711364746,
"epoch": 0.7116104868913857,
"grad_norm": 0.4473523497581482,
"kl": NaN,
"learning_rate": 1.8846167674275175e-06,
"loss": 0.054,
"reward": 0.3266384117305279,
"reward_std": 0.8520393073558807,
"rewards/correctness_reward_func": 0.1919642947614193,
"rewards/int_reward_func": 0.2600446566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12537054996937513,
"step": 190
},
{
"completion_length": 98.22098731994629,
"epoch": 0.7153558052434457,
"grad_norm": 0.6242156624794006,
"kl": 1.2861328125,
"learning_rate": 1.852952387243698e-06,
"loss": 0.0514,
"reward": 0.2729821652173996,
"reward_std": 0.7455658465623856,
"rewards/correctness_reward_func": 0.1785714365541935,
"rewards/int_reward_func": 0.2343750074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1399642862379551,
"step": 191
},
{
"completion_length": 95.50446701049805,
"epoch": 0.7191011235955056,
"grad_norm": 0.5410141348838806,
"kl": 1.269287109375,
"learning_rate": 1.8213988753373147e-06,
"loss": 0.0508,
"reward": 0.3193794898688793,
"reward_std": 0.8921961933374405,
"rewards/correctness_reward_func": 0.2187500074505806,
"rewards/int_reward_func": 0.2488839402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14825446717441082,
"step": 192
},
{
"completion_length": 95.50893211364746,
"epoch": 0.7228464419475655,
"grad_norm": 0.5410141348838806,
"kl": NaN,
"learning_rate": 1.8213988753373147e-06,
"loss": 0.0494,
"reward": 0.4274576008319855,
"reward_std": 0.8424459546804428,
"rewards/correctness_reward_func": 0.2500000074505806,
"rewards/int_reward_func": 0.2555803656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07812276761978865,
"step": 193
},
{
"completion_length": 100.1406307220459,
"epoch": 0.7265917602996255,
"grad_norm": 0.5630261301994324,
"kl": 1.331298828125,
"learning_rate": 1.7899616382401935e-06,
"loss": 0.0532,
"reward": 0.24447321146726608,
"reward_std": 0.8442487269639969,
"rewards/correctness_reward_func": 0.1517857201397419,
"rewards/int_reward_func": 0.2455357275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15284822601825,
"step": 194
},
{
"completion_length": 98.96429061889648,
"epoch": 0.7303370786516854,
"grad_norm": 0.5630261301994324,
"kl": NaN,
"learning_rate": 1.7899616382401935e-06,
"loss": 0.0485,
"reward": 0.3635468855500221,
"reward_std": 0.8013804405927658,
"rewards/correctness_reward_func": 0.1875000074505806,
"rewards/int_reward_func": 0.2544642984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07841741375159472,
"step": 195
},
{
"completion_length": 96.05804252624512,
"epoch": 0.7340823970037453,
"grad_norm": 0.4605935513973236,
"kl": 1.296875,
"learning_rate": 1.758646062561073e-06,
"loss": 0.0519,
"reward": 0.30311162024736404,
"reward_std": 0.7245951294898987,
"rewards/correctness_reward_func": 0.13839286379516125,
"rewards/int_reward_func": 0.2433035783469677,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07858482515439391,
"step": 196
},
{
"completion_length": 99.58036041259766,
"epoch": 0.7378277153558053,
"grad_norm": 0.5969142913818359,
"kl": 1.305419921875,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.0522,
"reward": 0.26581921428442,
"reward_std": 0.8090188354253769,
"rewards/correctness_reward_func": 0.15625000931322575,
"rewards/int_reward_func": 0.2310267984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12145759037230164,
"step": 197
},
{
"completion_length": 91.8995590209961,
"epoch": 0.7415730337078652,
"grad_norm": 0.527574360370636,
"kl": 1.245849609375,
"learning_rate": 1.6964013367420967e-06,
"loss": 0.0498,
"reward": 0.38672323897480965,
"reward_std": 0.7834379523992538,
"rewards/correctness_reward_func": 0.2232142947614193,
"rewards/int_reward_func": 0.2689732238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10546428337693214,
"step": 198
},
{
"completion_length": 98.9375057220459,
"epoch": 0.7453183520599251,
"grad_norm": 0.5717042684555054,
"kl": 1.277587890625,
"learning_rate": 1.665482851915573e-06,
"loss": 0.0511,
"reward": 0.24759376049041748,
"reward_std": 0.6963834911584854,
"rewards/correctness_reward_func": 0.1116071492433548,
"rewards/int_reward_func": 0.2343750111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09838839736767113,
"step": 199
},
{
"completion_length": 96.61830711364746,
"epoch": 0.7490636704119851,
"grad_norm": 0.5991680026054382,
"kl": 1.228271484375,
"learning_rate": 1.634707357306267e-06,
"loss": 0.0491,
"reward": 0.3660937622189522,
"reward_std": 0.8573340475559235,
"rewards/correctness_reward_func": 0.2366071492433548,
"rewards/int_reward_func": 0.2488839365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11939733009785414,
"step": 200
},
{
"completion_length": 106.84152221679688,
"epoch": 0.7528089887640449,
"grad_norm": 0.5178794860839844,
"kl": 1.34326171875,
"learning_rate": 1.6040801261367494e-06,
"loss": 0.0537,
"reward": 0.3160022422671318,
"reward_std": 0.8164055794477463,
"rewards/correctness_reward_func": 0.196428582072258,
"rewards/int_reward_func": 0.2622767984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14270313642919064,
"step": 201
},
{
"completion_length": 90.1875057220459,
"epoch": 0.7565543071161048,
"grad_norm": 0.5120040774345398,
"kl": 1.171142578125,
"learning_rate": 1.5736064062254094e-06,
"loss": 0.0468,
"reward": 0.2789843790233135,
"reward_std": 0.8016841560602188,
"rewards/correctness_reward_func": 0.1562500074505806,
"rewards/int_reward_func": 0.251116082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12838170863687992,
"step": 202
},
{
"completion_length": 90.62277030944824,
"epoch": 0.7602996254681648,
"grad_norm": 0.4355852007865906,
"kl": 1.2255859375,
"learning_rate": 1.5432914190872757e-06,
"loss": 0.049,
"reward": 0.34145762026309967,
"reward_std": 0.745452344417572,
"rewards/correctness_reward_func": 0.1696428656578064,
"rewards/int_reward_func": 0.2645089402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09269420150667429,
"step": 203
},
{
"completion_length": 96.64955711364746,
"epoch": 0.7640449438202247,
"grad_norm": 0.49031803011894226,
"kl": 1.14404296875,
"learning_rate": 1.5131403590393323e-06,
"loss": 0.0458,
"reward": 0.3124375157058239,
"reward_std": 0.7880858033895493,
"rewards/correctness_reward_func": 0.1875000074505806,
"rewards/int_reward_func": 0.2343750074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10943750524893403,
"step": 204
},
{
"completion_length": 90.63616561889648,
"epoch": 0.7677902621722846,
"grad_norm": 0.48222535848617554,
"kl": 1.188720703125,
"learning_rate": 1.4831583923105e-06,
"loss": 0.0475,
"reward": 0.372580386698246,
"reward_std": 0.8582592159509659,
"rewards/correctness_reward_func": 0.2187500074505806,
"rewards/int_reward_func": 0.2522321566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0984017881564796,
"step": 205
},
{
"completion_length": 99.03348922729492,
"epoch": 0.7715355805243446,
"grad_norm": 0.4410792589187622,
"kl": 1.247314453125,
"learning_rate": 1.4533506561564305e-06,
"loss": 0.0499,
"reward": 0.33057814463973045,
"reward_std": 0.7788708359003067,
"rewards/correctness_reward_func": 0.2142857238650322,
"rewards/int_reward_func": 0.2433035857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1270111622288823,
"step": 206
},
{
"completion_length": 103.51786041259766,
"epoch": 0.7752808988764045,
"grad_norm": 0.5613678693771362,
"kl": 1.181640625,
"learning_rate": 1.4237222579792618e-06,
"loss": 0.0473,
"reward": 0.26221875846385956,
"reward_std": 0.7947122156620026,
"rewards/correctness_reward_func": 0.14732143469154835,
"rewards/int_reward_func": 0.2421875074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12729018926620483,
"step": 207
},
{
"completion_length": 92.02232551574707,
"epoch": 0.7790262172284644,
"grad_norm": 0.6175336241722107,
"kl": 1.016357421875,
"learning_rate": 1.3942782744524974e-06,
"loss": 0.0406,
"reward": 0.2860134020447731,
"reward_std": 0.7233386188745499,
"rewards/correctness_reward_func": 0.1339285783469677,
"rewards/int_reward_func": 0.2276785783469677,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.075593750923872,
"step": 208
},
{
"completion_length": 103.3125057220459,
"epoch": 0.7827715355805244,
"grad_norm": 0.4679234027862549,
"kl": 1.18408203125,
"learning_rate": 1.3650237506511333e-06,
"loss": 0.0474,
"reward": 0.28701116889715195,
"reward_std": 0.8643300235271454,
"rewards/correctness_reward_func": 0.2098214365541935,
"rewards/int_reward_func": 0.2578125111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.18062277510762215,
"step": 209
},
{
"completion_length": 102.34598541259766,
"epoch": 0.7865168539325843,
"grad_norm": 0.5165115594863892,
"kl": 1.170654296875,
"learning_rate": 1.3359636991872215e-06,
"loss": 0.0468,
"reward": 0.30597545951604843,
"reward_std": 0.6952601373195648,
"rewards/correctness_reward_func": 0.1428571492433548,
"rewards/int_reward_func": 0.2500000111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08688170462846756,
"step": 210
},
{
"completion_length": 114.28572082519531,
"epoch": 0.7902621722846442,
"grad_norm": 0.4335125982761383,
"kl": 1.242919921875,
"learning_rate": 1.307103099350979e-06,
"loss": 0.0497,
"reward": 0.2512656319886446,
"reward_std": 0.8131757378578186,
"rewards/correctness_reward_func": 0.1517857201397419,
"rewards/int_reward_func": 0.2377232238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1382433008402586,
"step": 211
},
{
"completion_length": 101.02009391784668,
"epoch": 0.7940074906367042,
"grad_norm": 0.5146499276161194,
"kl": 1.20947265625,
"learning_rate": 1.2784468962576136e-06,
"loss": 0.0484,
"reward": 0.27424776926636696,
"reward_std": 0.76705102622509,
"rewards/correctness_reward_func": 0.13392858020961285,
"rewards/int_reward_func": 0.2377232201397419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09740401990711689,
"step": 212
},
{
"completion_length": 108.00000381469727,
"epoch": 0.797752808988764,
"grad_norm": 0.46153557300567627,
"kl": 1.201416015625,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.048,
"reward": 0.2503928691148758,
"reward_std": 0.8307089358568192,
"rewards/correctness_reward_func": 0.1875000074505806,
"rewards/int_reward_func": 0.2410714365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1781785748898983,
"step": 213
},
{
"completion_length": 93.59598731994629,
"epoch": 0.8014981273408239,
"grad_norm": 0.49990415573120117,
"kl": 1.1107177734375,
"learning_rate": 1.2217672848073702e-06,
"loss": 0.0444,
"reward": 0.3425290137529373,
"reward_std": 0.8552941530942917,
"rewards/correctness_reward_func": 0.2187500074505806,
"rewards/int_reward_func": 0.2645089477300644,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1418459787964821,
"step": 214
},
{
"completion_length": 100.52009201049805,
"epoch": 0.8052434456928839,
"grad_norm": 0.46211493015289307,
"kl": 1.141845703125,
"learning_rate": 1.193753588210128e-06,
"loss": 0.0457,
"reward": 0.2656339444220066,
"reward_std": 0.7493345886468887,
"rewards/correctness_reward_func": 0.15625000931322575,
"rewards/int_reward_func": 0.2767857275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1674017831683159,
"step": 215
},
{
"completion_length": 90.06473541259766,
"epoch": 0.8089887640449438,
"grad_norm": 0.4925229549407959,
"kl": 1.171630859375,
"learning_rate": 1.1659637102109713e-06,
"loss": 0.0469,
"reward": 0.31793973594903946,
"reward_std": 0.8032208532094955,
"rewards/correctness_reward_func": 0.1741071529686451,
"rewards/int_reward_func": 0.2433035857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09947098419070244,
"step": 216
},
{
"completion_length": 93.21428871154785,
"epoch": 0.8127340823970037,
"grad_norm": 0.4107387363910675,
"kl": 1.096923828125,
"learning_rate": 1.1384024124624324e-06,
"loss": 0.0439,
"reward": 0.2808660827577114,
"reward_std": 0.7595269531011581,
"rewards/correctness_reward_func": 0.19642857648432255,
"rewards/int_reward_func": 0.243303582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15886607952415943,
"step": 217
},
{
"completion_length": 107.45982551574707,
"epoch": 0.8164794007490637,
"grad_norm": 0.4463358521461487,
"kl": 1.247802734375,
"learning_rate": 1.1110744174509952e-06,
"loss": 0.0499,
"reward": 0.27611831203103065,
"reward_std": 0.8640467375516891,
"rewards/correctness_reward_func": 0.2187500074505806,
"rewards/int_reward_func": 0.2444196492433548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.18705134466290474,
"step": 218
},
{
"completion_length": 96.86384582519531,
"epoch": 0.8202247191011236,
"grad_norm": 0.5301110148429871,
"kl": 1.199951171875,
"learning_rate": 1.0839844076879186e-06,
"loss": 0.048,
"reward": 0.31224555149674416,
"reward_std": 0.7878952473402023,
"rewards/correctness_reward_func": 0.2053571529686451,
"rewards/int_reward_func": 0.258928582072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15204017609357834,
"step": 219
},
{
"completion_length": 91.19643211364746,
"epoch": 0.8239700374531835,
"grad_norm": 0.4537685811519623,
"kl": 1.101806640625,
"learning_rate": 1.0571370249069163e-06,
"loss": 0.0441,
"reward": 0.3926495686173439,
"reward_std": 0.8612103760242462,
"rewards/correctness_reward_func": 0.2008928693830967,
"rewards/int_reward_func": 0.2611607313156128,
"rewards/soft_format_reward_func": 0.0011160714784637094,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07052009226754308,
"step": 220
},
{
"completion_length": 90.3125057220459,
"epoch": 0.8277153558052435,
"grad_norm": 0.4537685811519623,
"kl": NaN,
"learning_rate": 1.0571370249069163e-06,
"loss": 0.0418,
"reward": 0.28333261236548424,
"reward_std": 0.7591045498847961,
"rewards/correctness_reward_func": 0.1562500074505806,
"rewards/int_reward_func": 0.2455357275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11845313012599945,
"step": 221
},
{
"completion_length": 90.84152030944824,
"epoch": 0.8314606741573034,
"grad_norm": 0.5334520936012268,
"kl": 1.140625,
"learning_rate": 1.0305368692688175e-06,
"loss": 0.0456,
"reward": 0.305205374956131,
"reward_std": 0.7375971227884293,
"rewards/correctness_reward_func": 0.1830357164144516,
"rewards/int_reward_func": 0.2310267984867096,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10885715018957853,
"step": 222
},
{
"completion_length": 93.97098541259766,
"epoch": 0.8352059925093633,
"grad_norm": 0.4208020269870758,
"kl": 1.108154296875,
"learning_rate": 1.0041884985733524e-06,
"loss": 0.0443,
"reward": 0.339029036462307,
"reward_std": 0.7930542379617691,
"rewards/correctness_reward_func": 0.2053571492433548,
"rewards/int_reward_func": 0.2287946529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09512277226895094,
"step": 223
},
{
"completion_length": 102.49107551574707,
"epoch": 0.8389513108614233,
"grad_norm": 0.41976016759872437,
"kl": 1.1943359375,
"learning_rate": 9.780964274781984e-07,
"loss": 0.0478,
"reward": 0.23499107360839844,
"reward_std": 0.7919187396764755,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.219866082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15005357982590795,
"step": 224
},
{
"completion_length": 88.04464912414551,
"epoch": 0.8426966292134831,
"grad_norm": 0.41976016759872437,
"kl": NaN,
"learning_rate": 9.780964274781984e-07,
"loss": 0.0464,
"reward": 0.20726785995066166,
"reward_std": 0.7513840273022652,
"rewards/correctness_reward_func": 0.10714286379516125,
"rewards/int_reward_func": 0.219866082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11974107846617699,
"step": 225
},
{
"completion_length": 99.45536422729492,
"epoch": 0.846441947565543,
"grad_norm": 0.6243520975112915,
"kl": 1.137939453125,
"learning_rate": 9.522651267254149e-07,
"loss": 0.0455,
"reward": 0.3012098353356123,
"reward_std": 0.7535159438848495,
"rewards/correctness_reward_func": 0.1696428656578064,
"rewards/int_reward_func": 0.2287946566939354,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09722768981009722,
"step": 226
},
{
"completion_length": 88.08928871154785,
"epoch": 0.850187265917603,
"grad_norm": 0.5370275974273682,
"kl": 1.02734375,
"learning_rate": 9.266990223754069e-07,
"loss": 0.0411,
"reward": 0.3909241184592247,
"reward_std": 0.7717972099781036,
"rewards/correctness_reward_func": 0.1964285783469677,
"rewards/int_reward_func": 0.2511160857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.05662053730338812,
"step": 227
},
{
"completion_length": 92.60937881469727,
"epoch": 0.8539325842696629,
"grad_norm": 0.4681946337223053,
"kl": 1.1201171875,
"learning_rate": 9.014024950485384e-07,
"loss": 0.0448,
"reward": 0.3636852651834488,
"reward_std": 0.8009557723999023,
"rewards/correctness_reward_func": 0.2053571566939354,
"rewards/int_reward_func": 0.251116082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09278795216232538,
"step": 228
},
{
"completion_length": 105.85491371154785,
"epoch": 0.8576779026217228,
"grad_norm": 0.4945945143699646,
"kl": 1.174072265625,
"learning_rate": 8.763798791745413e-07,
"loss": 0.047,
"reward": 0.3037031330168247,
"reward_std": 0.8670637309551239,
"rewards/correctness_reward_func": 0.2008928656578064,
"rewards/int_reward_func": 0.2745535857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.17174331843852997,
"step": 229
},
{
"completion_length": 94.59152221679688,
"epoch": 0.8614232209737828,
"grad_norm": 0.5289459228515625,
"kl": 1.0704345703125,
"learning_rate": 8.516354622498279e-07,
"loss": 0.0428,
"reward": 0.3705156408250332,
"reward_std": 0.8742925226688385,
"rewards/correctness_reward_func": 0.223214291036129,
"rewards/int_reward_func": 0.2488839365541935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10158259607851505,
"step": 230
},
{
"completion_length": 86.8683090209961,
"epoch": 0.8651685393258427,
"grad_norm": 0.4612804353237152,
"kl": 1.0635986328125,
"learning_rate": 8.271734841028553e-07,
"loss": 0.0425,
"reward": 0.32227009534835815,
"reward_std": 0.7606519907712936,
"rewards/correctness_reward_func": 0.14285715110599995,
"rewards/int_reward_func": 0.2600446529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08063170197419822,
"step": 231
},
{
"completion_length": 109.55357551574707,
"epoch": 0.8689138576779026,
"grad_norm": 0.5144924521446228,
"kl": 1.314208984375,
"learning_rate": 8.029981361676456e-07,
"loss": 0.0526,
"reward": 0.28595758974552155,
"reward_std": 0.8525267541408539,
"rewards/correctness_reward_func": 0.2098214440047741,
"rewards/int_reward_func": 0.2611607238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.185024568811059,
"step": 232
},
{
"completion_length": 98.18303871154785,
"epoch": 0.8726591760299626,
"grad_norm": 0.5144924521446228,
"kl": NaN,
"learning_rate": 8.029981361676456e-07,
"loss": 0.0476,
"reward": 0.2972254566848278,
"reward_std": 0.7380311787128448,
"rewards/correctness_reward_func": 0.1473214365541935,
"rewards/int_reward_func": 0.2433035857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09339955315226689,
"step": 233
},
{
"completion_length": 98.16741371154785,
"epoch": 0.8764044943820225,
"grad_norm": 0.5009276270866394,
"kl": 1.244384765625,
"learning_rate": 7.791135607656147e-07,
"loss": 0.0498,
"reward": 0.3269129544496536,
"reward_std": 0.7167427837848663,
"rewards/correctness_reward_func": 0.16071429662406445,
"rewards/int_reward_func": 0.2500000111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08380134031176567,
"step": 234
},
{
"completion_length": 97.88616371154785,
"epoch": 0.8801498127340824,
"grad_norm": 0.5268076062202454,
"kl": 1.235107421875,
"learning_rate": 7.555238503958001e-07,
"loss": 0.0494,
"reward": 0.3055223375558853,
"reward_std": 0.8659389615058899,
"rewards/correctness_reward_func": 0.1919642947614193,
"rewards/int_reward_func": 0.2500000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13644197303801775,
"step": 235
},
{
"completion_length": 88.23661041259766,
"epoch": 0.8838951310861424,
"grad_norm": 0.5268076062202454,
"kl": NaN,
"learning_rate": 7.555238503958001e-07,
"loss": 0.0497,
"reward": 0.3386116325855255,
"reward_std": 0.7405965030193329,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.2500000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07656696811318398,
"step": 236
},
{
"completion_length": 92.50000381469727,
"epoch": 0.8876404494382022,
"grad_norm": 0.49278637766838074,
"kl": 1.129150390625,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0452,
"reward": 0.265178584959358,
"reward_std": 0.866941437125206,
"rewards/correctness_reward_func": 0.16517857648432255,
"rewards/int_reward_func": 0.2477678693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14776786230504513,
"step": 237
},
{
"completion_length": 95.01339912414551,
"epoch": 0.8913857677902621,
"grad_norm": 0.46395623683929443,
"kl": 1.181640625,
"learning_rate": 7.092451414383644e-07,
"loss": 0.0473,
"reward": 0.2845067009329796,
"reward_std": 0.8498467355966568,
"rewards/correctness_reward_func": 0.1696428693830967,
"rewards/int_reward_func": 0.2555803656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.14183259941637516,
"step": 238
},
{
"completion_length": 99.3593807220459,
"epoch": 0.8951310861423221,
"grad_norm": 0.6616207957267761,
"kl": 1.17919921875,
"learning_rate": 6.865640724692815e-07,
"loss": 0.0472,
"reward": 0.3268973380327225,
"reward_std": 0.8011666536331177,
"rewards/correctness_reward_func": 0.2008928656578064,
"rewards/int_reward_func": 0.2500000149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12399554438889027,
"step": 239
},
{
"completion_length": 104.7745590209961,
"epoch": 0.898876404494382,
"grad_norm": 0.5387265086174011,
"kl": 1.183349609375,
"learning_rate": 6.641937264107868e-07,
"loss": 0.0473,
"reward": 0.4128861799836159,
"reward_std": 0.850861206650734,
"rewards/correctness_reward_func": 0.2232142947614193,
"rewards/int_reward_func": 0.2734375074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08376562781631947,
"step": 240
},
{
"completion_length": 92.34152221679688,
"epoch": 0.9026217228464419,
"grad_norm": 0.5197759866714478,
"kl": 1.0531005859375,
"learning_rate": 6.421379363065142e-07,
"loss": 0.0421,
"reward": 0.35036832839250565,
"reward_std": 0.8599109500646591,
"rewards/correctness_reward_func": 0.2187500149011612,
"rewards/int_reward_func": 0.2645089440047741,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13289063051342964,
"step": 241
},
{
"completion_length": 111.65402030944824,
"epoch": 0.9063670411985019,
"grad_norm": 0.46235111355781555,
"kl": 1.236572265625,
"learning_rate": 6.204004813025569e-07,
"loss": 0.0495,
"reward": 0.3619754686951637,
"reward_std": 0.7729392051696777,
"rewards/correctness_reward_func": 0.2187500037252903,
"rewards/int_reward_func": 0.2589285857975483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11570313014090061,
"step": 242
},
{
"completion_length": 103.99107551574707,
"epoch": 0.9101123595505618,
"grad_norm": 0.5195502638816833,
"kl": 1.19384765625,
"learning_rate": 5.989850859999227e-07,
"loss": 0.0477,
"reward": 0.2594174239784479,
"reward_std": 0.7237197905778885,
"rewards/correctness_reward_func": 0.1473214328289032,
"rewards/int_reward_func": 0.2522321529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14013616926968098,
"step": 243
},
{
"completion_length": 93.28571891784668,
"epoch": 0.9138576779026217,
"grad_norm": 0.4491368234157562,
"kl": 1.1376953125,
"learning_rate": 5.778954198163514e-07,
"loss": 0.0455,
"reward": 0.2785803731530905,
"reward_std": 0.6952795684337616,
"rewards/correctness_reward_func": 0.160714291036129,
"rewards/int_reward_func": 0.2533482275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13548214174807072,
"step": 244
},
{
"completion_length": 95.37500381469727,
"epoch": 0.9176029962546817,
"grad_norm": 0.4656703770160675,
"kl": 1.113037109375,
"learning_rate": 5.571350963575728e-07,
"loss": 0.0445,
"reward": 0.3266986757516861,
"reward_std": 0.7986108660697937,
"rewards/correctness_reward_func": 0.1830357238650322,
"rewards/int_reward_func": 0.2421875111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.09964063111692667,
"step": 245
},
{
"completion_length": 100.66964912414551,
"epoch": 0.9213483146067416,
"grad_norm": 0.48842382431030273,
"kl": 1.119873046875,
"learning_rate": 5.367076727981383e-07,
"loss": 0.0448,
"reward": 0.21152456477284431,
"reward_std": 0.7304975092411041,
"rewards/correctness_reward_func": 0.12053571827709675,
"rewards/int_reward_func": 0.2064732275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11548437923192978,
"step": 246
},
{
"completion_length": 95.72991561889648,
"epoch": 0.9250936329588015,
"grad_norm": 0.48145803809165955,
"kl": 1.124755859375,
"learning_rate": 5.166166492719124e-07,
"loss": 0.045,
"reward": 0.2994174249470234,
"reward_std": 0.7895640283823013,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.2377232275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10348437912762165,
"step": 247
},
{
"completion_length": 101.8192024230957,
"epoch": 0.9288389513108615,
"grad_norm": 0.4706651568412781,
"kl": 1.165283203125,
"learning_rate": 4.968654682723487e-07,
"loss": 0.0466,
"reward": 0.28485044091939926,
"reward_std": 0.9133375287055969,
"rewards/correctness_reward_func": 0.2098214365541935,
"rewards/int_reward_func": 0.2321428656578064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15711384266614914,
"step": 248
},
{
"completion_length": 91.77902030944824,
"epoch": 0.9325842696629213,
"grad_norm": 0.9746862053871155,
"kl": 1.063232421875,
"learning_rate": 4.774575140626317e-07,
"loss": 0.0425,
"reward": 0.3371250182390213,
"reward_std": 0.775251716375351,
"rewards/correctness_reward_func": 0.1696428656578064,
"rewards/int_reward_func": 0.2433035783469677,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07582143507897854,
"step": 249
},
{
"completion_length": 93.1718807220459,
"epoch": 0.9363295880149812,
"grad_norm": 0.4796634316444397,
"kl": 1.09228515625,
"learning_rate": 4.5839611209580277e-07,
"loss": 0.0437,
"reward": 0.42918528616428375,
"reward_std": 0.7719597369432449,
"rewards/correctness_reward_func": 0.2008928693830967,
"rewards/int_reward_func": 0.2767857201397419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.04849330266006291,
"step": 250
},
{
"completion_length": 86.16964721679688,
"epoch": 0.9400749063670412,
"grad_norm": 0.4254949986934662,
"kl": 1.08740234375,
"learning_rate": 4.396845284449608e-07,
"loss": 0.0435,
"reward": 0.27672769874334335,
"reward_std": 0.7562145739793777,
"rewards/correctness_reward_func": 0.1383928619325161,
"rewards/int_reward_func": 0.251116082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11278124991804361,
"step": 251
},
{
"completion_length": 101.90402221679688,
"epoch": 0.9438202247191011,
"grad_norm": 0.4744266867637634,
"kl": 1.144775390625,
"learning_rate": 4.2132596924363666e-07,
"loss": 0.0458,
"reward": 0.2469821460545063,
"reward_std": 0.840282753109932,
"rewards/correctness_reward_func": 0.1785714365541935,
"rewards/int_reward_func": 0.2410714402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.17266072891652584,
"step": 252
},
{
"completion_length": 118.7723274230957,
"epoch": 0.947565543071161,
"grad_norm": 0.4241223633289337,
"kl": 1.21875,
"learning_rate": 4.033235801364402e-07,
"loss": 0.0488,
"reward": 0.15625000838190317,
"reward_std": 0.7448219954967499,
"rewards/correctness_reward_func": 0.1205357238650322,
"rewards/int_reward_func": 0.2209821529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.1863839291036129,
"step": 253
},
{
"completion_length": 101.55580711364746,
"epoch": 0.951310861423221,
"grad_norm": 0.5570555925369263,
"kl": 1.070068359375,
"learning_rate": 3.85680445740067e-07,
"loss": 0.0428,
"reward": 0.23607589676976204,
"reward_std": 0.7750666290521622,
"rewards/correctness_reward_func": 0.1116071492433548,
"rewards/int_reward_func": 0.2287946529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10432589706033468,
"step": 254
},
{
"completion_length": 95.45089721679688,
"epoch": 0.9550561797752809,
"grad_norm": 0.511661946773529,
"kl": 1.1142578125,
"learning_rate": 3.683995891147696e-07,
"loss": 0.0446,
"reward": 0.36682143807411194,
"reward_std": 0.7813057452440262,
"rewards/correctness_reward_func": 0.2053571529686451,
"rewards/int_reward_func": 0.2600446529686451,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0985803622752428,
"step": 255
},
{
"completion_length": 101.62277030944824,
"epoch": 0.9588014981273408,
"grad_norm": 0.4439486563205719,
"kl": 1.134765625,
"learning_rate": 3.514839712463683e-07,
"loss": 0.0454,
"reward": 0.30028797313570976,
"reward_std": 0.8320818990468979,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.2455357238650322,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11042634584009647,
"step": 256
},
{
"completion_length": 93.14062881469727,
"epoch": 0.9625468164794008,
"grad_norm": 0.5691079497337341,
"kl": 1.0438232421875,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.0418,
"reward": 0.31014733761548996,
"reward_std": 0.7862526774406433,
"rewards/correctness_reward_func": 0.1741071529686451,
"rewards/int_reward_func": 0.2399553693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10391518194228411,
"step": 257
},
{
"completion_length": 102.33705711364746,
"epoch": 0.9662921348314607,
"grad_norm": 0.5333299040794373,
"kl": 1.126220703125,
"learning_rate": 3.187599823180071e-07,
"loss": 0.045,
"reward": 0.32864734157919884,
"reward_std": 0.7325298935174942,
"rewards/correctness_reward_func": 0.1473214328289032,
"rewards/int_reward_func": 0.2578125074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0764866080135107,
"step": 258
},
{
"completion_length": 93.04911231994629,
"epoch": 0.9700374531835206,
"grad_norm": 0.5367720127105713,
"kl": 1.045654296875,
"learning_rate": 3.0295721834508686e-07,
"loss": 0.0418,
"reward": 0.39304019510746,
"reward_std": 0.9003488570451736,
"rewards/correctness_reward_func": 0.254464291036129,
"rewards/int_reward_func": 0.2678571492433548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12928125727921724,
"step": 259
},
{
"completion_length": 91.97545051574707,
"epoch": 0.9737827715355806,
"grad_norm": 0.4951762557029724,
"kl": 1.07470703125,
"learning_rate": 2.875309063423956e-07,
"loss": 0.043,
"reward": 0.21667636185884476,
"reward_std": 0.8131074160337448,
"rewards/correctness_reward_func": 0.1294642947614193,
"rewards/int_reward_func": 0.2220982275903225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.13600223883986473,
"step": 260
},
{
"completion_length": 92.67634391784668,
"epoch": 0.9775280898876404,
"grad_norm": 0.5502758622169495,
"kl": 1.0794677734375,
"learning_rate": 2.7248368952908055e-07,
"loss": 0.0432,
"reward": 0.32996875420212746,
"reward_std": 0.8287549465894699,
"rewards/correctness_reward_func": 0.1785714328289032,
"rewards/int_reward_func": 0.2723214402794838,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0011160714784637094,
"rewards/xmlcount_reward_func": -0.12204018794000149,
"step": 261
},
{
"completion_length": 100.24777221679688,
"epoch": 0.9812734082397003,
"grad_norm": 0.5502758622169495,
"kl": null,
"learning_rate": 2.7248368952908055e-07,
"loss": 0.0487,
"reward": 0.3751874938607216,
"reward_std": 0.7956403493881226,
"rewards/correctness_reward_func": 0.2276785857975483,
"rewards/int_reward_func": 0.266741082072258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11923214513808489,
"step": 262
},
{
"completion_length": 94.25000381469727,
"epoch": 0.9850187265917603,
"grad_norm": 0.5013113617897034,
"kl": 1.0750732421875,
"learning_rate": 2.5781814616827936e-07,
"loss": 0.043,
"reward": 0.30204688012599945,
"reward_std": 0.8205768465995789,
"rewards/correctness_reward_func": 0.1875000074505806,
"rewards/int_reward_func": 0.2466517947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13210491463541985,
"step": 263
},
{
"completion_length": 88.36830711364746,
"epoch": 0.9887640449438202,
"grad_norm": 0.55992192029953,
"kl": 1.045166015625,
"learning_rate": 2.43536789125349e-07,
"loss": 0.0418,
"reward": 0.27569420635700226,
"reward_std": 0.8564379215240479,
"rewards/correctness_reward_func": 0.1651785783469677,
"rewards/int_reward_func": 0.2466517947614193,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13613616861402988,
"step": 264
},
{
"completion_length": 89.41071701049805,
"epoch": 0.9925093632958801,
"grad_norm": 0.55992192029953,
"kl": null,
"learning_rate": 2.43536789125349e-07,
"loss": 0.0415,
"reward": 0.3237812668085098,
"reward_std": 0.8230260014533997,
"rewards/correctness_reward_func": 0.1785714402794838,
"rewards/int_reward_func": 0.2656250074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12041518278419971,
"step": 265
},
{
"completion_length": 105.87500381469727,
"epoch": 0.9962546816479401,
"grad_norm": 0.6678707003593445,
"kl": 1.279541015625,
"learning_rate": 2.2964206543729662e-07,
"loss": 0.0512,
"reward": 0.34079688787460327,
"reward_std": 0.805058628320694,
"rewards/correctness_reward_func": 0.196428582072258,
"rewards/int_reward_func": 0.2399553693830967,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09558706358075142,
"step": 266
},
{
"completion_length": 102.1875,
"epoch": 1.0,
"grad_norm": 0.5808861255645752,
"kl": 1.051025390625,
"learning_rate": 2.1613635589349756e-07,
"loss": 0.0441,
"reward": 0.27787497639656067,
"reward_std": 0.8605497926473618,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.28125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2533750110305846,
"step": 267
},
{
"epoch": 1.0,
"step": 267,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 1.6097,
"train_samples_per_second": 4642.518,
"train_steps_per_second": 165.871
}
],
"logging_steps": 1,
"max_steps": 267,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 54,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}