|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 27, |
|
"global_step": 267, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 92.16071701049805, |
|
"epoch": 0.003745318352059925, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.3455178663134575, |
|
"reward_std": 0.7725450992584229, |
|
"rewards/correctness_reward_func": 0.191964291036129, |
|
"rewards/int_reward_func": 0.2812500149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1276964358985424, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 99.06696891784668, |
|
"epoch": 0.00749063670411985, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.38210269808769226, |
|
"reward_std": 0.8393888622522354, |
|
"rewards/correctness_reward_func": 0.1964285783469677, |
|
"rewards/int_reward_func": 0.2600446492433548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0743705378845334, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 95.87054061889648, |
|
"epoch": 0.011235955056179775, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.24831698834896088, |
|
"reward_std": 0.7660860866308212, |
|
"rewards/correctness_reward_func": 0.12053571734577417, |
|
"rewards/int_reward_func": 0.2421875149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11440625134855509, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 100.84152221679688, |
|
"epoch": 0.0149812734082397, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.3874709792435169, |
|
"reward_std": 0.8373937755823135, |
|
"rewards/correctness_reward_func": 0.2187500074505806, |
|
"rewards/int_reward_func": 0.251116082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08239509297709446, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 107.81696701049805, |
|
"epoch": 0.018726591760299626, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.3461473397910595, |
|
"reward_std": 0.8639847934246063, |
|
"rewards/correctness_reward_func": 0.2187500111758709, |
|
"rewards/int_reward_func": 0.2834821566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15608482621610165, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 88.96428871154785, |
|
"epoch": 0.02247191011235955, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.23790179379284382, |
|
"reward_std": 0.8017762005329132, |
|
"rewards/correctness_reward_func": 0.14285715389996767, |
|
"rewards/int_reward_func": 0.2310267947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13598215393722057, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 86.72768211364746, |
|
"epoch": 0.026217228464419477, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.33704913780093193, |
|
"reward_std": 0.786924734711647, |
|
"rewards/correctness_reward_func": 0.2053571492433548, |
|
"rewards/int_reward_func": 0.2667410895228386, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1350491177290678, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 87.54687881469727, |
|
"epoch": 0.0299625468164794, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.390026792883873, |
|
"reward_std": 0.7708619683980942, |
|
"rewards/correctness_reward_func": 0.2053571492433548, |
|
"rewards/int_reward_func": 0.2354910857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.050821430049836636, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 87.82366561889648, |
|
"epoch": 0.033707865168539325, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.30632367357611656, |
|
"reward_std": 0.8439056426286697, |
|
"rewards/correctness_reward_func": 0.1741071492433548, |
|
"rewards/int_reward_func": 0.2477678656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11555134132504463, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 95.44196891784668, |
|
"epoch": 0.03745318352059925, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.36654020100831985, |
|
"reward_std": 0.7821808308362961, |
|
"rewards/correctness_reward_func": 0.1830357238650322, |
|
"rewards/int_reward_func": 0.263392873108387, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07988839386962354, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 88.40178871154785, |
|
"epoch": 0.04119850187265917, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.3682924136519432, |
|
"reward_std": 0.8412070125341415, |
|
"rewards/correctness_reward_func": 0.2098214440047741, |
|
"rewards/int_reward_func": 0.2600446604192257, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10157366236671805, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 96.40178871154785, |
|
"epoch": 0.0449438202247191, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.3570691980421543, |
|
"reward_std": 0.831629067659378, |
|
"rewards/correctness_reward_func": 0.20535715110599995, |
|
"rewards/int_reward_func": 0.2633928693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1116808035876602, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 98.67634582519531, |
|
"epoch": 0.04868913857677903, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.31250446662306786, |
|
"reward_std": 0.7651553750038147, |
|
"rewards/correctness_reward_func": 0.1696428656578064, |
|
"rewards/int_reward_func": 0.2343750074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09151339251548052, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 94.42634201049805, |
|
"epoch": 0.052434456928838954, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": 0.35333259031176567, |
|
"reward_std": 0.8554573208093643, |
|
"rewards/correctness_reward_func": 0.2232142984867096, |
|
"rewards/int_reward_func": 0.2645089477300644, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13439063727855682, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 95.30580711364746, |
|
"epoch": 0.056179775280898875, |
|
"grad_norm": 0.7303056716918945, |
|
"kl": 0.0, |
|
"learning_rate": 1.8518518518518518e-07, |
|
"loss": 0.0, |
|
"reward": 0.3904196694493294, |
|
"reward_std": 0.8479138016700745, |
|
"rewards/correctness_reward_func": 0.2098214440047741, |
|
"rewards/int_reward_func": 0.2544642984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07386607304215431, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 98.70089721679688, |
|
"epoch": 0.0599250936329588, |
|
"grad_norm": 0.6679372191429138, |
|
"kl": 0.0, |
|
"learning_rate": 3.7037037037037036e-07, |
|
"loss": -0.0, |
|
"reward": 0.26223884522914886, |
|
"reward_std": 0.8470287472009659, |
|
"rewards/correctness_reward_func": 0.1830357201397419, |
|
"rewards/int_reward_func": 0.243303582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.16410045325756073, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 111.6004524230957, |
|
"epoch": 0.06367041198501873, |
|
"grad_norm": 0.8377946019172668, |
|
"kl": 8.493661880493164e-07, |
|
"learning_rate": 5.555555555555555e-07, |
|
"loss": 0.0, |
|
"reward": 0.27681921795010567, |
|
"reward_std": 0.8451116383075714, |
|
"rewards/correctness_reward_func": 0.1875000037252903, |
|
"rewards/int_reward_func": 0.2488839365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1595647381618619, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 99.96875381469727, |
|
"epoch": 0.06741573033707865, |
|
"grad_norm": 0.9236070513725281, |
|
"kl": 0.00010453164577484131, |
|
"learning_rate": 7.407407407407407e-07, |
|
"loss": 0.0, |
|
"reward": 0.31320536509156227, |
|
"reward_std": 0.8322850167751312, |
|
"rewards/correctness_reward_func": 0.191964291036129, |
|
"rewards/int_reward_func": 0.2455357275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12429465167224407, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 98.84598731994629, |
|
"epoch": 0.07116104868913857, |
|
"grad_norm": 0.9007355570793152, |
|
"kl": 0.0017764568328857422, |
|
"learning_rate": 9.259259259259259e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2539866119623184, |
|
"reward_std": 0.8283544480800629, |
|
"rewards/correctness_reward_func": 0.1741071492433548, |
|
"rewards/int_reward_func": 0.227678582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1477991035208106, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 89.91964721679688, |
|
"epoch": 0.0749063670411985, |
|
"grad_norm": 0.8131362199783325, |
|
"kl": 0.009290695190429688, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 0.0004, |
|
"reward": 0.4152901992201805, |
|
"reward_std": 0.7349574714899063, |
|
"rewards/correctness_reward_func": 0.2232142947614193, |
|
"rewards/int_reward_func": 0.258928582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0668526804074645, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 92.15402221679688, |
|
"epoch": 0.07865168539325842, |
|
"grad_norm": 0.8349559307098389, |
|
"kl": 0.055389404296875, |
|
"learning_rate": 1.2962962962962962e-06, |
|
"loss": 0.0022, |
|
"reward": 0.33069421350955963, |
|
"reward_std": 0.7918245047330856, |
|
"rewards/correctness_reward_func": 0.19196429289877415, |
|
"rewards/int_reward_func": 0.2656250074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12689509708434343, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 90.09821891784668, |
|
"epoch": 0.08239700374531835, |
|
"grad_norm": 1.059292197227478, |
|
"kl": 0.13580322265625, |
|
"learning_rate": 1.4814814814814815e-06, |
|
"loss": 0.0054, |
|
"reward": 0.3621741235256195, |
|
"reward_std": 0.8592714816331863, |
|
"rewards/correctness_reward_func": 0.2187500149011612, |
|
"rewards/int_reward_func": 0.2533482164144516, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10992411337792873, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 90.31473541259766, |
|
"epoch": 0.08614232209737828, |
|
"grad_norm": 0.9306014776229858, |
|
"kl": 0.22802734375, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0091, |
|
"reward": 0.27021654695272446, |
|
"reward_std": 0.7604184001684189, |
|
"rewards/correctness_reward_func": 0.1428571492433548, |
|
"rewards/int_reward_func": 0.251116082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12375670112669468, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 85.87277221679688, |
|
"epoch": 0.0898876404494382, |
|
"grad_norm": 1.0507615804672241, |
|
"kl": 0.26690673828125, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 0.0107, |
|
"reward": 0.2973214313387871, |
|
"reward_std": 0.72261543571949, |
|
"rewards/correctness_reward_func": 0.13839286379516125, |
|
"rewards/int_reward_func": 0.2455357275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08660715073347092, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 93.05580711364746, |
|
"epoch": 0.09363295880149813, |
|
"grad_norm": 1.3314857482910156, |
|
"kl": 0.27276611328125, |
|
"learning_rate": 2.037037037037037e-06, |
|
"loss": 0.0109, |
|
"reward": 0.27803125604987144, |
|
"reward_std": 0.80119389295578, |
|
"rewards/correctness_reward_func": 0.1741071529686451, |
|
"rewards/int_reward_func": 0.2310267947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1271026823669672, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 94.82589721679688, |
|
"epoch": 0.09737827715355805, |
|
"grad_norm": 1.0931949615478516, |
|
"kl": 0.29876708984375, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.012, |
|
"reward": 0.31898215785622597, |
|
"reward_std": 0.861026868224144, |
|
"rewards/correctness_reward_func": 0.1785714365541935, |
|
"rewards/int_reward_func": 0.2611607238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.12186607345938683, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 102.84821891784668, |
|
"epoch": 0.10112359550561797, |
|
"grad_norm": 0.9510552883148193, |
|
"kl": 0.35101318359375, |
|
"learning_rate": 2.4074074074074075e-06, |
|
"loss": 0.014, |
|
"reward": 0.2714241296052933, |
|
"reward_std": 0.7749656587839127, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.2444196566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13817412219941616, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 112.87277412414551, |
|
"epoch": 0.10486891385767791, |
|
"grad_norm": 0.8143340945243835, |
|
"kl": 0.4764404296875, |
|
"learning_rate": 2.5925925925925925e-06, |
|
"loss": 0.0191, |
|
"reward": 0.23841295577585697, |
|
"reward_std": 0.7267381250858307, |
|
"rewards/correctness_reward_func": 0.12500000186264515, |
|
"rewards/int_reward_func": 0.2544642947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14105134829878807, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 91.91071891784668, |
|
"epoch": 0.10861423220973783, |
|
"grad_norm": 1.2296696901321411, |
|
"kl": 0.5228271484375, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 0.0209, |
|
"reward": 0.33742189407348633, |
|
"reward_std": 0.8348551988601685, |
|
"rewards/correctness_reward_func": 0.1919642984867096, |
|
"rewards/int_reward_func": 0.2533482238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10789063014090061, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 88.34152030944824, |
|
"epoch": 0.11235955056179775, |
|
"grad_norm": 0.7870422601699829, |
|
"kl": 0.50323486328125, |
|
"learning_rate": 2.962962962962963e-06, |
|
"loss": 0.0201, |
|
"reward": 0.29978572577238083, |
|
"reward_std": 0.8016993254423141, |
|
"rewards/correctness_reward_func": 0.160714291036129, |
|
"rewards/int_reward_func": 0.2645089402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1254375111311674, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 103.2433090209961, |
|
"epoch": 0.11610486891385768, |
|
"grad_norm": 1.320225715637207, |
|
"kl": 0.71630859375, |
|
"learning_rate": 3.1481481481481483e-06, |
|
"loss": 0.0286, |
|
"reward": 0.31782814115285873, |
|
"reward_std": 0.8109631538391113, |
|
"rewards/correctness_reward_func": 0.191964291036129, |
|
"rewards/int_reward_func": 0.2477678693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12190402112901211, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 102.47321891784668, |
|
"epoch": 0.1198501872659176, |
|
"grad_norm": 1.1958893537521362, |
|
"kl": 0.6319580078125, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0253, |
|
"reward": 0.3368035778403282, |
|
"reward_std": 0.8891346454620361, |
|
"rewards/correctness_reward_func": 0.2187500149011612, |
|
"rewards/int_reward_func": 0.2767857313156128, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15873214416205883, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 110.43080711364746, |
|
"epoch": 0.12359550561797752, |
|
"grad_norm": 0.900262176990509, |
|
"kl": 0.5997314453125, |
|
"learning_rate": 3.5185185185185187e-06, |
|
"loss": 0.024, |
|
"reward": 0.2625982239842415, |
|
"reward_std": 0.814198911190033, |
|
"rewards/correctness_reward_func": 0.16071429196745157, |
|
"rewards/int_reward_func": 0.2321428656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1302589364349842, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 96.2433090209961, |
|
"epoch": 0.12734082397003746, |
|
"grad_norm": 0.8053016662597656, |
|
"kl": 0.5283203125, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.0211, |
|
"reward": 0.2832053676247597, |
|
"reward_std": 0.773906797170639, |
|
"rewards/correctness_reward_func": 0.1830357238650322, |
|
"rewards/int_reward_func": 0.2421875111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14201787114143372, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 99.74107360839844, |
|
"epoch": 0.13108614232209737, |
|
"grad_norm": 0.7698966860771179, |
|
"kl": 0.5833740234375, |
|
"learning_rate": 3.88888888888889e-06, |
|
"loss": 0.0233, |
|
"reward": 0.3971473351120949, |
|
"reward_std": 0.8169043958187103, |
|
"rewards/correctness_reward_func": 0.2366071529686451, |
|
"rewards/int_reward_func": 0.2667410857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10620089713484049, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 91.65402030944824, |
|
"epoch": 0.1348314606741573, |
|
"grad_norm": 0.6963524222373962, |
|
"kl": 0.6768798828125, |
|
"learning_rate": 4.074074074074074e-06, |
|
"loss": 0.0271, |
|
"reward": 0.2977009005844593, |
|
"reward_std": 0.9012245386838913, |
|
"rewards/correctness_reward_func": 0.2232142947614193, |
|
"rewards/int_reward_func": 0.2455357238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1710491143167019, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 86.2120590209961, |
|
"epoch": 0.13857677902621723, |
|
"grad_norm": 0.6502078771591187, |
|
"kl": 0.7315673828125, |
|
"learning_rate": 4.2592592592592596e-06, |
|
"loss": 0.0293, |
|
"reward": 0.4257053807377815, |
|
"reward_std": 0.7627889215946198, |
|
"rewards/correctness_reward_func": 0.2053571492433548, |
|
"rewards/int_reward_func": 0.2611607238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.04081250121816993, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 88.02232551574707, |
|
"epoch": 0.14232209737827714, |
|
"grad_norm": 0.7598965764045715, |
|
"kl": 0.7764892578125, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.0311, |
|
"reward": 0.26222768798470497, |
|
"reward_std": 0.7311272174119949, |
|
"rewards/correctness_reward_func": 0.1383928656578064, |
|
"rewards/int_reward_func": 0.2466517947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12281697243452072, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 88.76116561889648, |
|
"epoch": 0.14606741573033707, |
|
"grad_norm": 0.9369046688079834, |
|
"kl": 0.779052734375, |
|
"learning_rate": 4.62962962962963e-06, |
|
"loss": 0.0312, |
|
"reward": 0.2584107182919979, |
|
"reward_std": 0.8221316933631897, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.2466518022119999, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15453572571277618, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 88.61607551574707, |
|
"epoch": 0.149812734082397, |
|
"grad_norm": 0.6541325449943542, |
|
"kl": 0.7415771484375, |
|
"learning_rate": 4.814814814814815e-06, |
|
"loss": 0.0297, |
|
"reward": 0.338582631200552, |
|
"reward_std": 0.773887574672699, |
|
"rewards/correctness_reward_func": 0.191964291036129, |
|
"rewards/int_reward_func": 0.2522321529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10561384446918964, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 93.6473274230957, |
|
"epoch": 0.15355805243445692, |
|
"grad_norm": 0.7286244630813599, |
|
"kl": 0.772216796875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0309, |
|
"reward": 0.2527187615633011, |
|
"reward_std": 0.7823167890310287, |
|
"rewards/correctness_reward_func": 0.1473214328289032, |
|
"rewards/int_reward_func": 0.2533482275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14795089792460203, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 102.1004524230957, |
|
"epoch": 0.15730337078651685, |
|
"grad_norm": 0.6125639081001282, |
|
"kl": 0.7431640625, |
|
"learning_rate": 4.999785818935018e-06, |
|
"loss": 0.0297, |
|
"reward": 0.372944213449955, |
|
"reward_std": 0.8073680251836777, |
|
"rewards/correctness_reward_func": 0.2098214365541935, |
|
"rewards/int_reward_func": 0.279017873108387, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11589509434998035, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 95.78794860839844, |
|
"epoch": 0.16104868913857678, |
|
"grad_norm": 0.7778175473213196, |
|
"kl": 0.887451171875, |
|
"learning_rate": 4.999143312438893e-06, |
|
"loss": 0.0355, |
|
"reward": 0.3458884060382843, |
|
"reward_std": 0.8312461376190186, |
|
"rewards/correctness_reward_func": 0.2008928656578064, |
|
"rewards/int_reward_func": 0.2421875149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09719196986407042, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 104.68080711364746, |
|
"epoch": 0.1647940074906367, |
|
"grad_norm": 0.9053827524185181, |
|
"kl": 0.83203125, |
|
"learning_rate": 4.998072590601808e-06, |
|
"loss": 0.0333, |
|
"reward": 0.28239064663648605, |
|
"reward_std": 0.7673767507076263, |
|
"rewards/correctness_reward_func": 0.16517857648432255, |
|
"rewards/int_reward_func": 0.2354910783469677, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11827902123332024, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 91.47768211364746, |
|
"epoch": 0.16853932584269662, |
|
"grad_norm": 0.7150729894638062, |
|
"kl": 0.8095703125, |
|
"learning_rate": 4.9965738368864345e-06, |
|
"loss": 0.0324, |
|
"reward": 0.4368147626519203, |
|
"reward_std": 0.8043892681598663, |
|
"rewards/correctness_reward_func": 0.2455357238650322, |
|
"rewards/int_reward_func": 0.2879464365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09666741825640202, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 106.02009391784668, |
|
"epoch": 0.17228464419475656, |
|
"grad_norm": 0.7475388050079346, |
|
"kl": 0.9344482421875, |
|
"learning_rate": 4.994647308096509e-06, |
|
"loss": 0.0374, |
|
"reward": 0.2440937664359808, |
|
"reward_std": 0.7551652044057846, |
|
"rewards/correctness_reward_func": 0.1339285783469677, |
|
"rewards/int_reward_func": 0.2388392947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1286741215735674, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 94.04018211364746, |
|
"epoch": 0.1760299625468165, |
|
"grad_norm": 0.6236558556556702, |
|
"kl": 0.8037109375, |
|
"learning_rate": 4.992293334332821e-06, |
|
"loss": 0.0322, |
|
"reward": 0.37872322648763657, |
|
"reward_std": 0.8807232677936554, |
|
"rewards/correctness_reward_func": 0.2678571492433548, |
|
"rewards/int_reward_func": 0.2589285857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14806250110268593, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 92.28571701049805, |
|
"epoch": 0.1797752808988764, |
|
"grad_norm": 0.6654737591743469, |
|
"kl": 0.818115234375, |
|
"learning_rate": 4.989512318936654e-06, |
|
"loss": 0.0327, |
|
"reward": 0.39247100055217743, |
|
"reward_std": 0.743692010641098, |
|
"rewards/correctness_reward_func": 0.1919642947614193, |
|
"rewards/int_reward_func": 0.2578125149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.057305806782096624, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 93.03571891784668, |
|
"epoch": 0.18352059925093633, |
|
"grad_norm": 0.7210425734519958, |
|
"kl": 0.825927734375, |
|
"learning_rate": 4.986304738420684e-06, |
|
"loss": 0.033, |
|
"reward": 0.35212278366088867, |
|
"reward_std": 0.8072675913572311, |
|
"rewards/correctness_reward_func": 0.2187500149011612, |
|
"rewards/int_reward_func": 0.2488839402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11551116220653057, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 94.02902221679688, |
|
"epoch": 0.18726591760299627, |
|
"grad_norm": 0.6812981963157654, |
|
"kl": 0.92724609375, |
|
"learning_rate": 4.982671142387316e-06, |
|
"loss": 0.0371, |
|
"reward": 0.2549062632024288, |
|
"reward_std": 0.91233891248703, |
|
"rewards/correctness_reward_func": 0.1830357238650322, |
|
"rewards/int_reward_func": 0.2466517984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.17478126287460327, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 98.83259391784668, |
|
"epoch": 0.19101123595505617, |
|
"grad_norm": 0.7230132818222046, |
|
"kl": 0.9453125, |
|
"learning_rate": 4.978612153434527e-06, |
|
"loss": 0.0378, |
|
"reward": 0.3085335083305836, |
|
"reward_std": 0.7170540690422058, |
|
"rewards/correctness_reward_func": 0.13392858020961285, |
|
"rewards/int_reward_func": 0.2600446529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08543973276391625, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 92.35491371154785, |
|
"epoch": 0.1947565543071161, |
|
"grad_norm": 0.7230132818222046, |
|
"kl": 1.035888671875, |
|
"learning_rate": 4.978612153434527e-06, |
|
"loss": 0.0414, |
|
"reward": 0.39346206933259964, |
|
"reward_std": 0.7441791445016861, |
|
"rewards/correctness_reward_func": 0.1741071529686451, |
|
"rewards/int_reward_func": 0.2522321566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03287723264656961, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 89.34821701049805, |
|
"epoch": 0.19850187265917604, |
|
"grad_norm": 0.620324969291687, |
|
"kl": 0.957275390625, |
|
"learning_rate": 4.974128467049177e-06, |
|
"loss": 0.0383, |
|
"reward": 0.3490491136908531, |
|
"reward_std": 0.747399315237999, |
|
"rewards/correctness_reward_func": 0.1830357164144516, |
|
"rewards/int_reward_func": 0.2522321604192257, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.08733483403921127, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 98.08928871154785, |
|
"epoch": 0.20224719101123595, |
|
"grad_norm": 0.617904782295227, |
|
"kl": 1.1640625, |
|
"learning_rate": 4.9692208514878445e-06, |
|
"loss": 0.0466, |
|
"reward": 0.21967187896370888, |
|
"reward_std": 0.7784698009490967, |
|
"rewards/correctness_reward_func": 0.14285714738070965, |
|
"rewards/int_reward_func": 0.2232142984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14639955759048462, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 90.87946891784668, |
|
"epoch": 0.20599250936329588, |
|
"grad_norm": 0.6099480390548706, |
|
"kl": 1.119140625, |
|
"learning_rate": 4.963890147645195e-06, |
|
"loss": 0.0448, |
|
"reward": 0.3465201109647751, |
|
"reward_std": 0.8060361593961716, |
|
"rewards/correctness_reward_func": 0.1919642947614193, |
|
"rewards/int_reward_func": 0.266741082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11218527238816023, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 90.78795051574707, |
|
"epoch": 0.20973782771535582, |
|
"grad_norm": 0.6998101472854614, |
|
"kl": 1.171875, |
|
"learning_rate": 4.958137268909887e-06, |
|
"loss": 0.0469, |
|
"reward": 0.3585915267467499, |
|
"reward_std": 0.7672727555036545, |
|
"rewards/correctness_reward_func": 0.191964291036129, |
|
"rewards/int_reward_func": 0.2488839365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08225669572129846, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 91.67857551574707, |
|
"epoch": 0.21348314606741572, |
|
"grad_norm": 0.8839861154556274, |
|
"kl": 1.13525390625, |
|
"learning_rate": 4.9519632010080765e-06, |
|
"loss": 0.0454, |
|
"reward": 0.3334464356303215, |
|
"reward_std": 0.7685143500566483, |
|
"rewards/correctness_reward_func": 0.1741071529686451, |
|
"rewards/int_reward_func": 0.2578125074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09847322292625904, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 81.05804061889648, |
|
"epoch": 0.21722846441947566, |
|
"grad_norm": 0.5016757249832153, |
|
"kl": 1.03076171875, |
|
"learning_rate": 4.9453690018345144e-06, |
|
"loss": 0.0412, |
|
"reward": 0.4211518168449402, |
|
"reward_std": 0.8437229245901108, |
|
"rewards/correctness_reward_func": 0.2187500149011612, |
|
"rewards/int_reward_func": 0.2779017947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0755000039935112, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 101.41518211364746, |
|
"epoch": 0.2209737827715356, |
|
"grad_norm": 0.6329123377799988, |
|
"kl": 1.089111328125, |
|
"learning_rate": 4.938355801271282e-06, |
|
"loss": 0.0436, |
|
"reward": 0.35140402615070343, |
|
"reward_std": 0.7762987017631531, |
|
"rewards/correctness_reward_func": 0.1785714402794838, |
|
"rewards/int_reward_func": 0.2656250149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09279241226613522, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 105.19420051574707, |
|
"epoch": 0.2247191011235955, |
|
"grad_norm": 0.6004884839057922, |
|
"kl": 1.02001953125, |
|
"learning_rate": 4.930924800994192e-06, |
|
"loss": 0.0408, |
|
"reward": 0.2473437450826168, |
|
"reward_std": 0.7411400526762009, |
|
"rewards/correctness_reward_func": 0.1428571492433548, |
|
"rewards/int_reward_func": 0.2466517984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14216518588364124, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 98.32589530944824, |
|
"epoch": 0.22846441947565543, |
|
"grad_norm": 0.8213242292404175, |
|
"kl": 1.04638671875, |
|
"learning_rate": 4.923077274266886e-06, |
|
"loss": 0.0419, |
|
"reward": 0.29397991858422756, |
|
"reward_std": 0.7984266579151154, |
|
"rewards/correctness_reward_func": 0.17857143841683865, |
|
"rewards/int_reward_func": 0.2500000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13459152355790138, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 95.77009201049805, |
|
"epoch": 0.23220973782771537, |
|
"grad_norm": 0.7614482641220093, |
|
"kl": 0.9754638671875, |
|
"learning_rate": 4.914814565722671e-06, |
|
"loss": 0.039, |
|
"reward": 0.25892411917448044, |
|
"reward_std": 0.6874004900455475, |
|
"rewards/correctness_reward_func": 0.1383928582072258, |
|
"rewards/int_reward_func": 0.251116082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13058483134955168, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 88.84375381469727, |
|
"epoch": 0.23595505617977527, |
|
"grad_norm": 0.6162322759628296, |
|
"kl": 0.9012451171875, |
|
"learning_rate": 4.906138091134118e-06, |
|
"loss": 0.0361, |
|
"reward": 0.4282499924302101, |
|
"reward_std": 0.867719978094101, |
|
"rewards/correctness_reward_func": 0.2321428656578064, |
|
"rewards/int_reward_func": 0.2801339402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08402678836137056, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 98.63393211364746, |
|
"epoch": 0.2397003745318352, |
|
"grad_norm": 0.7493047118186951, |
|
"kl": 0.9737548828125, |
|
"learning_rate": 4.897049337170483e-06, |
|
"loss": 0.0389, |
|
"reward": 0.31915403716266155, |
|
"reward_std": 0.7978127002716064, |
|
"rewards/correctness_reward_func": 0.2098214365541935, |
|
"rewards/int_reward_func": 0.2488839402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1395513443276286, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 99.77232551574707, |
|
"epoch": 0.24344569288389514, |
|
"grad_norm": 0.55640709400177, |
|
"kl": 0.95849609375, |
|
"learning_rate": 4.887549861142967e-06, |
|
"loss": 0.0383, |
|
"reward": 0.25655804201960564, |
|
"reward_std": 0.746478259563446, |
|
"rewards/correctness_reward_func": 0.1339285783469677, |
|
"rewards/int_reward_func": 0.2767857313156128, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1541562583297491, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 90.46205711364746, |
|
"epoch": 0.24719101123595505, |
|
"grad_norm": 0.5036749243736267, |
|
"kl": 0.875732421875, |
|
"learning_rate": 4.8776412907378845e-06, |
|
"loss": 0.035, |
|
"reward": 0.3458192050457001, |
|
"reward_std": 0.8146399855613708, |
|
"rewards/correctness_reward_func": 0.2187500074505806, |
|
"rewards/int_reward_func": 0.2578125111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13074330985546112, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 109.09821891784668, |
|
"epoch": 0.250936329588015, |
|
"grad_norm": 0.5405407547950745, |
|
"kl": 0.9388427734375, |
|
"learning_rate": 4.867325323737765e-06, |
|
"loss": 0.0376, |
|
"reward": 0.2163794655352831, |
|
"reward_std": 0.695435032248497, |
|
"rewards/correctness_reward_func": 0.1250000074505806, |
|
"rewards/int_reward_func": 0.2299107238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13853124901652336, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 96.1495590209961, |
|
"epoch": 0.2546816479400749, |
|
"grad_norm": 0.7321078777313232, |
|
"kl": 0.9552001953125, |
|
"learning_rate": 4.856603727730446e-06, |
|
"loss": 0.0382, |
|
"reward": 0.3762388601899147, |
|
"reward_std": 0.8388219773769379, |
|
"rewards/correctness_reward_func": 0.2366071492433548, |
|
"rewards/int_reward_func": 0.2600446566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12041295692324638, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 94.25893211364746, |
|
"epoch": 0.25842696629213485, |
|
"grad_norm": 0.5617873668670654, |
|
"kl": 0.9840087890625, |
|
"learning_rate": 4.845478339806211e-06, |
|
"loss": 0.0394, |
|
"reward": 0.3388616181910038, |
|
"reward_std": 0.8814976066350937, |
|
"rewards/correctness_reward_func": 0.2276785857975483, |
|
"rewards/int_reward_func": 0.2689732275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15779018122702837, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 100.38839721679688, |
|
"epoch": 0.26217228464419473, |
|
"grad_norm": 0.7381689548492432, |
|
"kl": 1.1729736328125, |
|
"learning_rate": 4.833951066243004e-06, |
|
"loss": 0.0469, |
|
"reward": 0.3259017989039421, |
|
"reward_std": 0.7590171247720718, |
|
"rewards/correctness_reward_func": 0.165178582072258, |
|
"rewards/int_reward_func": 0.2566964328289032, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09597321972250938, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 108.24777412414551, |
|
"epoch": 0.26591760299625467, |
|
"grad_norm": 0.7399603724479675, |
|
"kl": 1.2216796875, |
|
"learning_rate": 4.822023882179811e-06, |
|
"loss": 0.0489, |
|
"reward": 0.138060272205621, |
|
"reward_std": 0.8277581036090851, |
|
"rewards/correctness_reward_func": 0.098214291036129, |
|
"rewards/int_reward_func": 0.2321428656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.19229689799249172, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 91.00893211364746, |
|
"epoch": 0.2696629213483146, |
|
"grad_norm": 0.49646005034446716, |
|
"kl": 0.997802734375, |
|
"learning_rate": 4.809698831278217e-06, |
|
"loss": 0.0399, |
|
"reward": 0.31739287078380585, |
|
"reward_std": 0.821430504322052, |
|
"rewards/correctness_reward_func": 0.1964285783469677, |
|
"rewards/int_reward_func": 0.258928582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.137964291498065, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 89.22545051574707, |
|
"epoch": 0.27340823970037453, |
|
"grad_norm": 0.5307531952857971, |
|
"kl": 0.9476318359375, |
|
"learning_rate": 4.796978025372247e-06, |
|
"loss": 0.0379, |
|
"reward": 0.3167254589498043, |
|
"reward_std": 0.8111777305603027, |
|
"rewards/correctness_reward_func": 0.16071429662406445, |
|
"rewards/int_reward_func": 0.2678571566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11184598784893751, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 108.57813262939453, |
|
"epoch": 0.27715355805243447, |
|
"grad_norm": 0.9819021224975586, |
|
"kl": 1.1944580078125, |
|
"learning_rate": 4.783863644106502e-06, |
|
"loss": 0.0478, |
|
"reward": 0.43339288234710693, |
|
"reward_std": 0.805885374546051, |
|
"rewards/correctness_reward_func": 0.2410714402794838, |
|
"rewards/int_reward_func": 0.2633928656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07107143104076385, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 97.66071891784668, |
|
"epoch": 0.2808988764044944, |
|
"grad_norm": 0.5349671244621277, |
|
"kl": 0.9276123046875, |
|
"learning_rate": 4.770357934562704e-06, |
|
"loss": 0.0371, |
|
"reward": 0.25291070714592934, |
|
"reward_std": 0.7776944190263748, |
|
"rewards/correctness_reward_func": 0.1517857201397419, |
|
"rewards/int_reward_func": 0.2321428656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13101786747574806, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 99.77455711364746, |
|
"epoch": 0.2846441947565543, |
|
"grad_norm": 0.4924392104148865, |
|
"kl": 0.9676513671875, |
|
"learning_rate": 4.7564632108746524e-06, |
|
"loss": 0.0387, |
|
"reward": 0.29688840731978416, |
|
"reward_std": 0.7467798590660095, |
|
"rewards/correctness_reward_func": 0.1651785746216774, |
|
"rewards/int_reward_func": 0.2377232313156128, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10601340420544147, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 108.47545051574707, |
|
"epoch": 0.2883895131086142, |
|
"grad_norm": 0.45954495668411255, |
|
"kl": 0.9312744140625, |
|
"learning_rate": 4.742181853831721e-06, |
|
"loss": 0.0372, |
|
"reward": 0.2200825996696949, |
|
"reward_std": 0.7668928056955338, |
|
"rewards/correctness_reward_func": 0.1428571492433548, |
|
"rewards/int_reward_func": 0.2377232201397419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1604977697134018, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 108.40848731994629, |
|
"epoch": 0.29213483146067415, |
|
"grad_norm": 0.5183126330375671, |
|
"kl": 0.931640625, |
|
"learning_rate": 4.72751631047092e-06, |
|
"loss": 0.0373, |
|
"reward": 0.26953795552253723, |
|
"reward_std": 0.7980407774448395, |
|
"rewards/correctness_reward_func": 0.1830357238650322, |
|
"rewards/int_reward_func": 0.2544642984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.16796205937862396, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 92.30357551574707, |
|
"epoch": 0.2958801498127341, |
|
"grad_norm": 0.6169615983963013, |
|
"kl": 0.814453125, |
|
"learning_rate": 4.712469093657605e-06, |
|
"loss": 0.0326, |
|
"reward": 0.3473794758319855, |
|
"reward_std": 0.7407716810703278, |
|
"rewards/correctness_reward_func": 0.1875000074505806, |
|
"rewards/int_reward_func": 0.2455357313156128, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08565625501796603, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 96.31920051574707, |
|
"epoch": 0.299625468164794, |
|
"grad_norm": 0.5736718773841858, |
|
"kl": 0.82373046875, |
|
"learning_rate": 4.697042781654913e-06, |
|
"loss": 0.0329, |
|
"reward": 0.3164888694882393, |
|
"reward_std": 0.8256205767393112, |
|
"rewards/correctness_reward_func": 0.160714291036129, |
|
"rewards/int_reward_func": 0.2734375074505806, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11877902504056692, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 95.03348731994629, |
|
"epoch": 0.30337078651685395, |
|
"grad_norm": 0.6198201179504395, |
|
"kl": 0.7996826171875, |
|
"learning_rate": 4.681240017681994e-06, |
|
"loss": 0.032, |
|
"reward": 0.31822992861270905, |
|
"reward_std": 0.7278983741998672, |
|
"rewards/correctness_reward_func": 0.1562500074505806, |
|
"rewards/int_reward_func": 0.2500000111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08802009373903275, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 94.79911041259766, |
|
"epoch": 0.30711610486891383, |
|
"grad_norm": 0.4967269003391266, |
|
"kl": 0.791748046875, |
|
"learning_rate": 4.665063509461098e-06, |
|
"loss": 0.0317, |
|
"reward": 0.37110715731978416, |
|
"reward_std": 0.8040148764848709, |
|
"rewards/correctness_reward_func": 0.2633928693830967, |
|
"rewards/int_reward_func": 0.2477678693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1400535786524415, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 93.06696891784668, |
|
"epoch": 0.31086142322097376, |
|
"grad_norm": 0.5414807796478271, |
|
"kl": 0.799072265625, |
|
"learning_rate": 4.648516028753632e-06, |
|
"loss": 0.032, |
|
"reward": 0.3070870563387871, |
|
"reward_std": 0.9167025238275528, |
|
"rewards/correctness_reward_func": 0.2187500074505806, |
|
"rewards/int_reward_func": 0.2500000149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.16166296042501926, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 85.48661231994629, |
|
"epoch": 0.3146067415730337, |
|
"grad_norm": 0.6058946251869202, |
|
"kl": 0.8023681640625, |
|
"learning_rate": 4.631600410885231e-06, |
|
"loss": 0.0321, |
|
"reward": 0.31676117703318596, |
|
"reward_std": 0.8016230016946793, |
|
"rewards/correctness_reward_func": 0.1785714365541935, |
|
"rewards/int_reward_func": 0.2232142947614193, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08614062378183007, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 98.87947082519531, |
|
"epoch": 0.31835205992509363, |
|
"grad_norm": 0.5493951439857483, |
|
"kl": 0.810302734375, |
|
"learning_rate": 4.614319554259934e-06, |
|
"loss": 0.0324, |
|
"reward": 0.26373885199427605, |
|
"reward_std": 0.7828188389539719, |
|
"rewards/correctness_reward_func": 0.1428571492433548, |
|
"rewards/int_reward_func": 0.2421875149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12130581215023994, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 98.88170051574707, |
|
"epoch": 0.32209737827715357, |
|
"grad_norm": 0.5060502290725708, |
|
"kl": 0.8116455078125, |
|
"learning_rate": 4.596676419863561e-06, |
|
"loss": 0.0325, |
|
"reward": 0.37905358523130417, |
|
"reward_std": 0.7987204343080521, |
|
"rewards/correctness_reward_func": 0.1964285783469677, |
|
"rewards/int_reward_func": 0.266741082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08411608170717955, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 95.37723731994629, |
|
"epoch": 0.3258426966292135, |
|
"grad_norm": 0.45160311460494995, |
|
"kl": 0.814697265625, |
|
"learning_rate": 4.578674030756364e-06, |
|
"loss": 0.0326, |
|
"reward": 0.3752902075648308, |
|
"reward_std": 0.7861279100179672, |
|
"rewards/correctness_reward_func": 0.2142857238650322, |
|
"rewards/int_reward_func": 0.2566964402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09569196694064885, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 88.16964530944824, |
|
"epoch": 0.3295880149812734, |
|
"grad_norm": 0.4520312249660492, |
|
"kl": 0.8048095703125, |
|
"learning_rate": 4.560315471555039e-06, |
|
"loss": 0.0322, |
|
"reward": 0.40060270577669144, |
|
"reward_std": 0.827767089009285, |
|
"rewards/correctness_reward_func": 0.2410714402794838, |
|
"rewards/int_reward_func": 0.2399553656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0804241057485342, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 90.47098541259766, |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.4482274651527405, |
|
"kl": 0.802490234375, |
|
"learning_rate": 4.541603887904198e-06, |
|
"loss": 0.0321, |
|
"reward": 0.46391965448856354, |
|
"reward_std": 0.8666775524616241, |
|
"rewards/correctness_reward_func": 0.2812500149011612, |
|
"rewards/int_reward_func": 0.2845982313156128, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1019285786896944, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 99.05357360839844, |
|
"epoch": 0.33707865168539325, |
|
"grad_norm": 0.48688769340515137, |
|
"kl": 0.8892822265625, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.0356, |
|
"reward": 0.32227010279893875, |
|
"reward_std": 0.7231635600328445, |
|
"rewards/correctness_reward_func": 0.1741071492433548, |
|
"rewards/int_reward_func": 0.2455357238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09737277776002884, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 96.9062557220459, |
|
"epoch": 0.3408239700374532, |
|
"grad_norm": 0.713897168636322, |
|
"kl": 0.83984375, |
|
"learning_rate": 4.503134531727652e-06, |
|
"loss": 0.0336, |
|
"reward": 0.3822232261300087, |
|
"reward_std": 0.8144369274377823, |
|
"rewards/correctness_reward_func": 0.2455357275903225, |
|
"rewards/int_reward_func": 0.258928582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1222410760819912, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 92.32589530944824, |
|
"epoch": 0.3445692883895131, |
|
"grad_norm": 0.48127949237823486, |
|
"kl": 0.8839111328125, |
|
"learning_rate": 4.4833833507280884e-06, |
|
"loss": 0.0354, |
|
"reward": 0.2896517887711525, |
|
"reward_std": 0.8091708421707153, |
|
"rewards/correctness_reward_func": 0.160714291036129, |
|
"rewards/int_reward_func": 0.2455357238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11659822333604097, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 113.4062557220459, |
|
"epoch": 0.34831460674157305, |
|
"grad_norm": 0.5383365154266357, |
|
"kl": 1.0166015625, |
|
"learning_rate": 4.463292327201862e-06, |
|
"loss": 0.0407, |
|
"reward": 0.2778482399880886, |
|
"reward_std": 0.7528630048036575, |
|
"rewards/correctness_reward_func": 0.1607142947614193, |
|
"rewards/int_reward_func": 0.2388393022119999, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12170535884797573, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 90.40179061889648, |
|
"epoch": 0.352059925093633, |
|
"grad_norm": 0.47072696685791016, |
|
"kl": 0.8814697265625, |
|
"learning_rate": 4.442864903642428e-06, |
|
"loss": 0.0353, |
|
"reward": 0.3879285827279091, |
|
"reward_std": 0.7762220501899719, |
|
"rewards/correctness_reward_func": 0.2321428693830967, |
|
"rewards/int_reward_func": 0.2377232275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08193750027567148, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 100.53795051574707, |
|
"epoch": 0.35580524344569286, |
|
"grad_norm": 0.5055387020111084, |
|
"kl": 0.9866943359375, |
|
"learning_rate": 4.422104580183649e-06, |
|
"loss": 0.0395, |
|
"reward": 0.27919645234942436, |
|
"reward_std": 0.8693763017654419, |
|
"rewards/correctness_reward_func": 0.1875000074505806, |
|
"rewards/int_reward_func": 0.2455357238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15383929014205933, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 98.15402221679688, |
|
"epoch": 0.3595505617977528, |
|
"grad_norm": 0.5205227732658386, |
|
"kl": 1.0927734375, |
|
"learning_rate": 4.401014914000078e-06, |
|
"loss": 0.0437, |
|
"reward": 0.30646876618266106, |
|
"reward_std": 0.8004807382822037, |
|
"rewards/correctness_reward_func": 0.1830357201397419, |
|
"rewards/int_reward_func": 0.238839291036129, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11540625896304846, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 92.7410774230957, |
|
"epoch": 0.36329588014981273, |
|
"grad_norm": 0.5007465481758118, |
|
"kl": 1.0205078125, |
|
"learning_rate": 4.379599518697444e-06, |
|
"loss": 0.0408, |
|
"reward": 0.4242701083421707, |
|
"reward_std": 0.900767520070076, |
|
"rewards/correctness_reward_func": 0.2321428693830967, |
|
"rewards/int_reward_func": 0.279017873108387, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08689062856137753, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 95.44196891784668, |
|
"epoch": 0.36704119850187267, |
|
"grad_norm": 0.7685222029685974, |
|
"kl": 1.154541015625, |
|
"learning_rate": 4.357862063693486e-06, |
|
"loss": 0.0462, |
|
"reward": 0.3419933207333088, |
|
"reward_std": 0.7976544201374054, |
|
"rewards/correctness_reward_func": 0.1919642947614193, |
|
"rewards/int_reward_func": 0.2522321529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10220312792807817, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 97.05803871154785, |
|
"epoch": 0.3707865168539326, |
|
"grad_norm": 0.48678070306777954, |
|
"kl": 1.0458984375, |
|
"learning_rate": 4.335806273589214e-06, |
|
"loss": 0.0418, |
|
"reward": 0.32839956879615784, |
|
"reward_std": 0.7194608449935913, |
|
"rewards/correctness_reward_func": 0.1562500074505806, |
|
"rewards/int_reward_func": 0.2488839402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07673437846824527, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 89.83259391784668, |
|
"epoch": 0.37453183520599254, |
|
"grad_norm": 0.4802840054035187, |
|
"kl": 0.9891357421875, |
|
"learning_rate": 4.313435927530719e-06, |
|
"loss": 0.0396, |
|
"reward": 0.323910728096962, |
|
"reward_std": 0.8017723858356476, |
|
"rewards/correctness_reward_func": 0.1741071492433548, |
|
"rewards/int_reward_func": 0.2377232238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08791964408010244, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 92.61607551574707, |
|
"epoch": 0.3782771535580524, |
|
"grad_norm": 0.6329229474067688, |
|
"kl": 0.9765625, |
|
"learning_rate": 4.290754858561636e-06, |
|
"loss": 0.0391, |
|
"reward": 0.3065357282757759, |
|
"reward_std": 0.7879298776388168, |
|
"rewards/correctness_reward_func": 0.165178582072258, |
|
"rewards/int_reward_func": 0.2678571566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12650001049041748, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 92.51116371154785, |
|
"epoch": 0.38202247191011235, |
|
"grad_norm": 0.5261896848678589, |
|
"kl": 0.99658203125, |
|
"learning_rate": 4.267766952966369e-06, |
|
"loss": 0.0399, |
|
"reward": 0.3286317139863968, |
|
"reward_std": 0.7215069979429245, |
|
"rewards/correctness_reward_func": 0.1473214365541935, |
|
"rewards/int_reward_func": 0.2611607275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07985044876113534, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 104.85937881469727, |
|
"epoch": 0.3857677902621723, |
|
"grad_norm": 0.4765637516975403, |
|
"kl": 1.0252685546875, |
|
"learning_rate": 4.244476149604201e-06, |
|
"loss": 0.041, |
|
"reward": 0.3370089456439018, |
|
"reward_std": 0.7856772691011429, |
|
"rewards/correctness_reward_func": 0.1785714402794838, |
|
"rewards/int_reward_func": 0.2533482313156128, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09602678846567869, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 92.55804061889648, |
|
"epoch": 0.3895131086142322, |
|
"grad_norm": 0.5377345681190491, |
|
"kl": 0.9796142578125, |
|
"learning_rate": 4.220886439234385e-06, |
|
"loss": 0.0392, |
|
"reward": 0.3739665374159813, |
|
"reward_std": 0.8817652761936188, |
|
"rewards/correctness_reward_func": 0.2232142947614193, |
|
"rewards/int_reward_func": 0.2354910857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0847388431429863, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 102.57366561889648, |
|
"epoch": 0.39325842696629215, |
|
"grad_norm": 0.5319781303405762, |
|
"kl": 1.1024169921875, |
|
"learning_rate": 4.197001863832355e-06, |
|
"loss": 0.0441, |
|
"reward": 0.33513617515563965, |
|
"reward_std": 0.7641059011220932, |
|
"rewards/correctness_reward_func": 0.1964285783469677, |
|
"rewards/int_reward_func": 0.2544642984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11575670912861824, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 95.90625381469727, |
|
"epoch": 0.3970037453183521, |
|
"grad_norm": 0.5436156392097473, |
|
"kl": 1.011474609375, |
|
"learning_rate": 4.172826515897146e-06, |
|
"loss": 0.0405, |
|
"reward": 0.3867567144334316, |
|
"reward_std": 0.7985697090625763, |
|
"rewards/correctness_reward_func": 0.2142857275903225, |
|
"rewards/int_reward_func": 0.2700893059372902, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.09873438253998756, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 89.44866561889648, |
|
"epoch": 0.40074906367041196, |
|
"grad_norm": 0.47593066096305847, |
|
"kl": 1.074462890625, |
|
"learning_rate": 4.1483645377501726e-06, |
|
"loss": 0.043, |
|
"reward": 0.36116072721779346, |
|
"reward_std": 0.8109498172998428, |
|
"rewards/correctness_reward_func": 0.19642857648432255, |
|
"rewards/int_reward_func": 0.2332589365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.06852679001167417, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 98.08482551574707, |
|
"epoch": 0.4044943820224719, |
|
"grad_norm": 0.47593066096305847, |
|
"kl": null, |
|
"learning_rate": 4.1483645377501726e-06, |
|
"loss": 0.042, |
|
"reward": 0.3247567042708397, |
|
"reward_std": 0.756167471408844, |
|
"rewards/correctness_reward_func": 0.165178582072258, |
|
"rewards/int_reward_func": 0.2488839402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08930580969899893, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 96.78571701049805, |
|
"epoch": 0.40823970037453183, |
|
"grad_norm": 0.46640312671661377, |
|
"kl": 1.106689453125, |
|
"learning_rate": 4.123620120825459e-06, |
|
"loss": 0.0443, |
|
"reward": 0.3182366043329239, |
|
"reward_std": 0.8166698515415192, |
|
"rewards/correctness_reward_func": 0.20982143469154835, |
|
"rewards/int_reward_func": 0.2500000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.141584824770689, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 103.3504524230957, |
|
"epoch": 0.41198501872659177, |
|
"grad_norm": 0.4819093346595764, |
|
"kl": 1.209228515625, |
|
"learning_rate": 4.098597504951462e-06, |
|
"loss": 0.0484, |
|
"reward": 0.45162054151296616, |
|
"reward_std": 0.9206108599901199, |
|
"rewards/correctness_reward_func": 0.3080357313156128, |
|
"rewards/int_reward_func": 0.2801339402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1365491133183241, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 95.90402412414551, |
|
"epoch": 0.4157303370786517, |
|
"grad_norm": 0.48896247148513794, |
|
"kl": 1.0751953125, |
|
"learning_rate": 4.073300977624594e-06, |
|
"loss": 0.043, |
|
"reward": 0.2652589473873377, |
|
"reward_std": 0.7796717882156372, |
|
"rewards/correctness_reward_func": 0.160714291036129, |
|
"rewards/int_reward_func": 0.2533482201397419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14880357310175896, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 90.68527221679688, |
|
"epoch": 0.41947565543071164, |
|
"grad_norm": 0.4631924033164978, |
|
"kl": 1.094970703125, |
|
"learning_rate": 4.047734873274586e-06, |
|
"loss": 0.0438, |
|
"reward": 0.35960714891552925, |
|
"reward_std": 0.7207369059324265, |
|
"rewards/correctness_reward_func": 0.17410715529695153, |
|
"rewards/int_reward_func": 0.2845982238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09909821674227715, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 97.99330711364746, |
|
"epoch": 0.4232209737827715, |
|
"grad_norm": 0.5063531398773193, |
|
"kl": 1.1484375, |
|
"learning_rate": 4.021903572521802e-06, |
|
"loss": 0.0459, |
|
"reward": 0.41493305563926697, |
|
"reward_std": 0.8120662122964859, |
|
"rewards/correctness_reward_func": 0.2410714402794838, |
|
"rewards/int_reward_func": 0.2566964365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.08395089209079742, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 85.47991561889648, |
|
"epoch": 0.42696629213483145, |
|
"grad_norm": 0.4622070789337158, |
|
"kl": 1.0574951171875, |
|
"learning_rate": 3.995811501426648e-06, |
|
"loss": 0.0423, |
|
"reward": 0.3168504536151886, |
|
"reward_std": 0.7642460912466049, |
|
"rewards/correctness_reward_func": 0.1875000149011612, |
|
"rewards/int_reward_func": 0.2633928656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1340424194931984, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 94.50000381469727, |
|
"epoch": 0.4307116104868914, |
|
"grad_norm": 0.4713800251483917, |
|
"kl": 1.068359375, |
|
"learning_rate": 3.969463130731183e-06, |
|
"loss": 0.0427, |
|
"reward": 0.3659776858985424, |
|
"reward_std": 0.891373872756958, |
|
"rewards/correctness_reward_func": 0.2678571492433548, |
|
"rewards/int_reward_func": 0.2444196566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14629911817610264, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 96.42187881469727, |
|
"epoch": 0.4344569288389513, |
|
"grad_norm": 0.527915894985199, |
|
"kl": 1.056884765625, |
|
"learning_rate": 3.942862975093085e-06, |
|
"loss": 0.0423, |
|
"reward": 0.36224332079291344, |
|
"reward_std": 0.8091815561056137, |
|
"rewards/correctness_reward_func": 0.2232142984867096, |
|
"rewards/int_reward_func": 0.2477678693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1087388452142477, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 95.6093807220459, |
|
"epoch": 0.43820224719101125, |
|
"grad_norm": 0.5208942294120789, |
|
"kl": 1.090576171875, |
|
"learning_rate": 3.916015592312083e-06, |
|
"loss": 0.0436, |
|
"reward": 0.27370089665055275, |
|
"reward_std": 0.8227901756763458, |
|
"rewards/correctness_reward_func": 0.1741071529686451, |
|
"rewards/int_reward_func": 0.254464291036129, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15487053990364075, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 105.62054061889648, |
|
"epoch": 0.4419475655430712, |
|
"grad_norm": 0.47664541006088257, |
|
"kl": 1.18017578125, |
|
"learning_rate": 3.888925582549006e-06, |
|
"loss": 0.0472, |
|
"reward": 0.28062277287244797, |
|
"reward_std": 0.8103707134723663, |
|
"rewards/correctness_reward_func": 0.1830357201397419, |
|
"rewards/int_reward_func": 0.2455357201397419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14794866368174553, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 94.9598274230957, |
|
"epoch": 0.44569288389513106, |
|
"grad_norm": 0.5299546122550964, |
|
"kl": 1.0885009765625, |
|
"learning_rate": 3.861597587537568e-06, |
|
"loss": 0.0435, |
|
"reward": 0.2977410815656185, |
|
"reward_std": 0.7406027764081955, |
|
"rewards/correctness_reward_func": 0.1517857201397419, |
|
"rewards/int_reward_func": 0.2254464402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07949107605963945, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 98.04911422729492, |
|
"epoch": 0.449438202247191, |
|
"grad_norm": 0.4211517870426178, |
|
"kl": 1.2236328125, |
|
"learning_rate": 3.83403628978903e-06, |
|
"loss": 0.0489, |
|
"reward": 0.2801852785050869, |
|
"reward_std": 0.8073955476284027, |
|
"rewards/correctness_reward_func": 0.1607142947614193, |
|
"rewards/int_reward_func": 0.247767873108387, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12829688470810652, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 96.47098731994629, |
|
"epoch": 0.45318352059925093, |
|
"grad_norm": 0.4674068093299866, |
|
"kl": 1.210693359375, |
|
"learning_rate": 3.806246411789872e-06, |
|
"loss": 0.0484, |
|
"reward": 0.35595760494470596, |
|
"reward_std": 0.8347803801298141, |
|
"rewards/correctness_reward_func": 0.2098214402794838, |
|
"rewards/int_reward_func": 0.2544642947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10832812543958426, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 97.76116561889648, |
|
"epoch": 0.45692883895131087, |
|
"grad_norm": 0.7582751512527466, |
|
"kl": 1.22021484375, |
|
"learning_rate": 3.77823271519263e-06, |
|
"loss": 0.0488, |
|
"reward": 0.35320091247558594, |
|
"reward_std": 0.7386835068464279, |
|
"rewards/correctness_reward_func": 0.1964285746216774, |
|
"rewards/int_reward_func": 0.2600446529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10327232570853084, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 95.66071891784668, |
|
"epoch": 0.4606741573033708, |
|
"grad_norm": 0.7582751512527466, |
|
"kl": NaN, |
|
"learning_rate": 3.77823271519263e-06, |
|
"loss": 0.046, |
|
"reward": 0.31566742807626724, |
|
"reward_std": 0.8195231109857559, |
|
"rewards/correctness_reward_func": 0.1696428656578064, |
|
"rewards/int_reward_func": 0.2366071492433548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09058259081211872, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 93.24330711364746, |
|
"epoch": 0.46441947565543074, |
|
"grad_norm": 0.49510565400123596, |
|
"kl": 1.078125, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.0431, |
|
"reward": 0.3665379509329796, |
|
"reward_std": 0.9554053395986557, |
|
"rewards/correctness_reward_func": 0.2366071566939354, |
|
"rewards/int_reward_func": 0.2622767947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1323459828272462, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 97.34152221679688, |
|
"epoch": 0.4681647940074906, |
|
"grad_norm": 0.6952568888664246, |
|
"kl": 1.18701171875, |
|
"learning_rate": 3.721553103742388e-06, |
|
"loss": 0.0475, |
|
"reward": 0.34241294860839844, |
|
"reward_std": 0.8731215000152588, |
|
"rewards/correctness_reward_func": 0.2053571529686451, |
|
"rewards/int_reward_func": 0.2678571566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13080134615302086, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 99.09821891784668, |
|
"epoch": 0.47191011235955055, |
|
"grad_norm": 0.46382632851600647, |
|
"kl": 1.170654296875, |
|
"learning_rate": 3.6928969006490212e-06, |
|
"loss": 0.0468, |
|
"reward": 0.31666965037584305, |
|
"reward_std": 0.7566796094179153, |
|
"rewards/correctness_reward_func": 0.1517857201397419, |
|
"rewards/int_reward_func": 0.2723214440047741, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10743750259280205, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 91.62277412414551, |
|
"epoch": 0.4756554307116105, |
|
"grad_norm": 0.47625091671943665, |
|
"kl": 0.992431640625, |
|
"learning_rate": 3.664036300812779e-06, |
|
"loss": 0.0397, |
|
"reward": 0.3642299249768257, |
|
"reward_std": 0.951588049530983, |
|
"rewards/correctness_reward_func": 0.2410714402794838, |
|
"rewards/int_reward_func": 0.2712053656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14804688468575478, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 100.61607360839844, |
|
"epoch": 0.4794007490636704, |
|
"grad_norm": 0.5847791433334351, |
|
"kl": 1.225830078125, |
|
"learning_rate": 3.634976249348867e-06, |
|
"loss": 0.049, |
|
"reward": 0.42196429520845413, |
|
"reward_std": 0.8636786490678787, |
|
"rewards/correctness_reward_func": 0.2455357238650322, |
|
"rewards/int_reward_func": 0.2533482201397419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07691965182311833, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 98.10714530944824, |
|
"epoch": 0.48314606741573035, |
|
"grad_norm": 0.44887715578079224, |
|
"kl": 1.186767578125, |
|
"learning_rate": 3.6057217255475034e-06, |
|
"loss": 0.0475, |
|
"reward": 0.2306763455271721, |
|
"reward_std": 0.745768278837204, |
|
"rewards/correctness_reward_func": 0.1339285783469677, |
|
"rewards/int_reward_func": 0.2399553619325161, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14320759288966656, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 92.93750381469727, |
|
"epoch": 0.4868913857677903, |
|
"grad_norm": 0.44887715578079224, |
|
"kl": NaN, |
|
"learning_rate": 3.6057217255475034e-06, |
|
"loss": 0.0481, |
|
"reward": 0.2954799123108387, |
|
"reward_std": 0.7976376265287399, |
|
"rewards/correctness_reward_func": 0.1517857238650322, |
|
"rewards/int_reward_func": 0.2566964328289032, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11300223972648382, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 102.15178871154785, |
|
"epoch": 0.49063670411985016, |
|
"grad_norm": 0.417877733707428, |
|
"kl": 1.196044921875, |
|
"learning_rate": 3.5762777420207382e-06, |
|
"loss": 0.0478, |
|
"reward": 0.29901787638664246, |
|
"reward_std": 0.7817905694246292, |
|
"rewards/correctness_reward_func": 0.1785714328289032, |
|
"rewards/int_reward_func": 0.2656250149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14517857134342194, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 99.67187881469727, |
|
"epoch": 0.4943820224719101, |
|
"grad_norm": 0.492396742105484, |
|
"kl": 1.19775390625, |
|
"learning_rate": 3.5466493438435707e-06, |
|
"loss": 0.0479, |
|
"reward": 0.2629486694931984, |
|
"reward_std": 0.8080793470144272, |
|
"rewards/correctness_reward_func": 0.1562500074505806, |
|
"rewards/int_reward_func": 0.2645089402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1578102707862854, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 97.48214721679688, |
|
"epoch": 0.49812734082397003, |
|
"grad_norm": 0.5177117586135864, |
|
"kl": 1.177490234375, |
|
"learning_rate": 3.516841607689501e-06, |
|
"loss": 0.0471, |
|
"reward": 0.2645267955958843, |
|
"reward_std": 0.7447800785303116, |
|
"rewards/correctness_reward_func": 0.1562500074505806, |
|
"rewards/int_reward_func": 0.2455357275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13725892454385757, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 95.82366561889648, |
|
"epoch": 0.50187265917603, |
|
"grad_norm": 0.5745740532875061, |
|
"kl": 1.138916015625, |
|
"learning_rate": 3.486859640960668e-06, |
|
"loss": 0.0456, |
|
"reward": 0.2982388660311699, |
|
"reward_std": 0.8590549826622009, |
|
"rewards/correctness_reward_func": 0.1785714328289032, |
|
"rewards/int_reward_func": 0.2522321492433548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1325647421181202, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 91.2567024230957, |
|
"epoch": 0.5056179775280899, |
|
"grad_norm": 0.6395649313926697, |
|
"kl": 1.1143798828125, |
|
"learning_rate": 3.4567085809127247e-06, |
|
"loss": 0.0446, |
|
"reward": 0.3479754514992237, |
|
"reward_std": 0.8721490353345871, |
|
"rewards/correctness_reward_func": 0.2008928693830967, |
|
"rewards/int_reward_func": 0.2466517947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09956920135300606, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 92.41295051574707, |
|
"epoch": 0.5093632958801498, |
|
"grad_norm": 0.6169349551200867, |
|
"kl": 1.121826171875, |
|
"learning_rate": 3.426393593774591e-06, |
|
"loss": 0.0449, |
|
"reward": 0.26565179601311684, |
|
"reward_std": 0.7934366017580032, |
|
"rewards/correctness_reward_func": 0.1339285783469677, |
|
"rewards/int_reward_func": 0.2433035857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11158036068081856, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 96.50223541259766, |
|
"epoch": 0.5131086142322098, |
|
"grad_norm": 0.5668321251869202, |
|
"kl": 1.08935546875, |
|
"learning_rate": 3.39591987386325e-06, |
|
"loss": 0.0436, |
|
"reward": 0.32920314325019717, |
|
"reward_std": 0.7672218978404999, |
|
"rewards/correctness_reward_func": 0.20982143515720963, |
|
"rewards/int_reward_func": 0.2421875074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12280581146478653, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 94.98661231994629, |
|
"epoch": 0.5168539325842697, |
|
"grad_norm": 0.5555436611175537, |
|
"kl": 1.1796875, |
|
"learning_rate": 3.3652926426937327e-06, |
|
"loss": 0.0472, |
|
"reward": 0.3373348340392113, |
|
"reward_std": 0.8585019558668137, |
|
"rewards/correctness_reward_func": 0.1964285783469677, |
|
"rewards/int_reward_func": 0.2566964440047741, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1157901817932725, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 100.43527221679688, |
|
"epoch": 0.5205992509363296, |
|
"grad_norm": 0.5975056290626526, |
|
"kl": 1.116455078125, |
|
"learning_rate": 3.3345171480844275e-06, |
|
"loss": 0.0447, |
|
"reward": 0.33340851217508316, |
|
"reward_std": 0.8432840257883072, |
|
"rewards/correctness_reward_func": 0.196428582072258, |
|
"rewards/int_reward_func": 0.2488839402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11190402135252953, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 110.37946701049805, |
|
"epoch": 0.5243445692883895, |
|
"grad_norm": 0.5592173337936401, |
|
"kl": 1.2734375, |
|
"learning_rate": 3.303598663257904e-06, |
|
"loss": 0.0509, |
|
"reward": 0.3411696571856737, |
|
"reward_std": 0.8827303797006607, |
|
"rewards/correctness_reward_func": 0.2232142947614193, |
|
"rewards/int_reward_func": 0.2600446529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14208929613232613, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 104.59375381469727, |
|
"epoch": 0.5280898876404494, |
|
"grad_norm": 0.5417644381523132, |
|
"kl": 1.274169921875, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 0.051, |
|
"reward": 0.367312528192997, |
|
"reward_std": 0.7450851798057556, |
|
"rewards/correctness_reward_func": 0.20089286752045155, |
|
"rewards/int_reward_func": 0.2578125149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09139286354184151, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 86.5714340209961, |
|
"epoch": 0.5318352059925093, |
|
"grad_norm": 0.5099084973335266, |
|
"kl": 1.106689453125, |
|
"learning_rate": 3.2413539374389275e-06, |
|
"loss": 0.0443, |
|
"reward": 0.41947099566459656, |
|
"reward_std": 0.7897130697965622, |
|
"rewards/correctness_reward_func": 0.2187500111758709, |
|
"rewards/int_reward_func": 0.258928582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.05820759106427431, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 97.21875381469727, |
|
"epoch": 0.5355805243445693, |
|
"grad_norm": 0.41567039489746094, |
|
"kl": 1.190185546875, |
|
"learning_rate": 3.2100383617598075e-06, |
|
"loss": 0.0476, |
|
"reward": 0.2790111724752933, |
|
"reward_std": 0.7919286489486694, |
|
"rewards/correctness_reward_func": 0.16964286752045155, |
|
"rewards/int_reward_func": 0.2444196566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13505134359002113, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 92.24553871154785, |
|
"epoch": 0.5393258426966292, |
|
"grad_norm": 0.5126021504402161, |
|
"kl": 1.135009765625, |
|
"learning_rate": 3.1786011246626858e-06, |
|
"loss": 0.0454, |
|
"reward": 0.33713172376155853, |
|
"reward_std": 0.759757861495018, |
|
"rewards/correctness_reward_func": 0.1741071492433548, |
|
"rewards/int_reward_func": 0.2578125074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.09590401872992516, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 92.36384391784668, |
|
"epoch": 0.5430711610486891, |
|
"grad_norm": 0.4743898808956146, |
|
"kl": 1.125732421875, |
|
"learning_rate": 3.147047612756302e-06, |
|
"loss": 0.045, |
|
"reward": 0.3338058143854141, |
|
"reward_std": 0.7495080679655075, |
|
"rewards/correctness_reward_func": 0.1741071492433548, |
|
"rewards/int_reward_func": 0.2399553656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.0813727667555213, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 96.77902030944824, |
|
"epoch": 0.5468164794007491, |
|
"grad_norm": 0.5349624156951904, |
|
"kl": 1.19140625, |
|
"learning_rate": 3.115383232572483e-06, |
|
"loss": 0.0476, |
|
"reward": 0.3870870769023895, |
|
"reward_std": 0.8124971240758896, |
|
"rewards/correctness_reward_func": 0.2232142984867096, |
|
"rewards/int_reward_func": 0.2533482275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08947544917464256, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 100.25670051574707, |
|
"epoch": 0.550561797752809, |
|
"grad_norm": 0.3959732949733734, |
|
"kl": 1.1826171875, |
|
"learning_rate": 3.0836134096397642e-06, |
|
"loss": 0.0473, |
|
"reward": 0.3632299154996872, |
|
"reward_std": 0.8887846767902374, |
|
"rewards/correctness_reward_func": 0.2053571529686451, |
|
"rewards/int_reward_func": 0.266741082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10886830929666758, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 95.57366561889648, |
|
"epoch": 0.5543071161048689, |
|
"grad_norm": 0.5033745765686035, |
|
"kl": 1.179931640625, |
|
"learning_rate": 3.051743587553754e-06, |
|
"loss": 0.0472, |
|
"reward": 0.33835939317941666, |
|
"reward_std": 0.7949195951223373, |
|
"rewards/correctness_reward_func": 0.1607142947614193, |
|
"rewards/int_reward_func": 0.2466517947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.07012277067406103, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 88.57143211364746, |
|
"epoch": 0.5580524344569289, |
|
"grad_norm": 0.4301639795303345, |
|
"kl": 1.153564453125, |
|
"learning_rate": 3.019779227044398e-06, |
|
"loss": 0.0462, |
|
"reward": 0.26906250417232513, |
|
"reward_std": 0.7477044612169266, |
|
"rewards/correctness_reward_func": 0.1607142947614193, |
|
"rewards/int_reward_func": 0.2578125037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14946428686380386, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 104.3437557220459, |
|
"epoch": 0.5617977528089888, |
|
"grad_norm": 0.49258914589881897, |
|
"kl": 1.2353515625, |
|
"learning_rate": 2.9877258050403214e-06, |
|
"loss": 0.0494, |
|
"reward": 0.3209241144359112, |
|
"reward_std": 0.821040615439415, |
|
"rewards/correctness_reward_func": 0.1919642947614193, |
|
"rewards/int_reward_func": 0.2444196529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11545982770621777, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 97.84821891784668, |
|
"epoch": 0.5655430711610487, |
|
"grad_norm": 0.6680523753166199, |
|
"kl": 1.210205078125, |
|
"learning_rate": 2.9555888137303695e-06, |
|
"loss": 0.0484, |
|
"reward": 0.4514397457242012, |
|
"reward_std": 0.8471043556928635, |
|
"rewards/correctness_reward_func": 0.2455357238650322, |
|
"rewards/int_reward_func": 0.2767857238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07088169828057289, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 95.99777221679688, |
|
"epoch": 0.5692883895131086, |
|
"grad_norm": 0.44427061080932617, |
|
"kl": 1.162353515625, |
|
"learning_rate": 2.9233737596225616e-06, |
|
"loss": 0.0465, |
|
"reward": 0.3017522394657135, |
|
"reward_std": 0.7615446895360947, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.2600446566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12347099208272994, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 89.79687881469727, |
|
"epoch": 0.5730337078651685, |
|
"grad_norm": 0.3936914801597595, |
|
"kl": 1.1171875, |
|
"learning_rate": 2.8910861626005774e-06, |
|
"loss": 0.0447, |
|
"reward": 0.4248616322875023, |
|
"reward_std": 0.8152914345264435, |
|
"rewards/correctness_reward_func": 0.2410714328289032, |
|
"rewards/int_reward_func": 0.286830373108387, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10304017923772335, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 90.57589530944824, |
|
"epoch": 0.5767790262172284, |
|
"grad_norm": 0.43302392959594727, |
|
"kl": 1.1453857421875, |
|
"learning_rate": 2.858731554977948e-06, |
|
"loss": 0.0458, |
|
"reward": 0.3075290396809578, |
|
"reward_std": 0.7411210238933563, |
|
"rewards/correctness_reward_func": 0.16071429289877415, |
|
"rewards/int_reward_func": 0.2410714402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09425670048221946, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 88.81920051574707, |
|
"epoch": 0.5805243445692884, |
|
"grad_norm": 0.4523748457431793, |
|
"kl": 1.1572265625, |
|
"learning_rate": 2.82631548055013e-06, |
|
"loss": 0.0463, |
|
"reward": 0.3362343907356262, |
|
"reward_std": 0.8344388753175735, |
|
"rewards/correctness_reward_func": 0.2232142947614193, |
|
"rewards/int_reward_func": 0.2220982238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10907812882214785, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 92.31920051574707, |
|
"epoch": 0.5842696629213483, |
|
"grad_norm": 0.7492924928665161, |
|
"kl": 1.13818359375, |
|
"learning_rate": 2.7938434936445946e-06, |
|
"loss": 0.0455, |
|
"reward": 0.2696942128241062, |
|
"reward_std": 0.7281434237957001, |
|
"rewards/correctness_reward_func": 0.1785714365541935, |
|
"rewards/int_reward_func": 0.2421875074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1510647376999259, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 91.05580711364746, |
|
"epoch": 0.5880149812734082, |
|
"grad_norm": 0.5227993130683899, |
|
"kl": 1.126220703125, |
|
"learning_rate": 2.761321158169134e-06, |
|
"loss": 0.045, |
|
"reward": 0.2759977802634239, |
|
"reward_std": 0.9335441738367081, |
|
"rewards/correctness_reward_func": 0.2098214402794838, |
|
"rewards/int_reward_func": 0.2388392984867096, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.17377902194857597, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 95.73437881469727, |
|
"epoch": 0.5917602996254682, |
|
"grad_norm": 0.462187260389328, |
|
"kl": 1.1458740234375, |
|
"learning_rate": 2.7287540466585067e-06, |
|
"loss": 0.0458, |
|
"reward": 0.3727143071591854, |
|
"reward_std": 0.8857483267784119, |
|
"rewards/correctness_reward_func": 0.2544642947614193, |
|
"rewards/int_reward_func": 0.2656250111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14737500809133053, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 93.68750381469727, |
|
"epoch": 0.5955056179775281, |
|
"grad_norm": 0.5321673154830933, |
|
"kl": 1.202392578125, |
|
"learning_rate": 2.696147739319613e-06, |
|
"loss": 0.0481, |
|
"reward": 0.41518306732177734, |
|
"reward_std": 0.7417058497667313, |
|
"rewards/correctness_reward_func": 0.2276785783469677, |
|
"rewards/int_reward_func": 0.2678571492433548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08035267796367407, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 93.1093807220459, |
|
"epoch": 0.599250936329588, |
|
"grad_norm": 0.4620046615600586, |
|
"kl": 1.119873046875, |
|
"learning_rate": 2.663507823075358e-06, |
|
"loss": 0.0448, |
|
"reward": 0.32153796777129173, |
|
"reward_std": 0.8575054854154587, |
|
"rewards/correctness_reward_func": 0.2053571529686451, |
|
"rewards/int_reward_func": 0.2500000149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13381919264793396, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 100.61161041259766, |
|
"epoch": 0.602996254681648, |
|
"grad_norm": 0.4500925838947296, |
|
"kl": 1.206298828125, |
|
"learning_rate": 2.6308398906073603e-06, |
|
"loss": 0.0483, |
|
"reward": 0.3319799229502678, |
|
"reward_std": 0.7600451558828354, |
|
"rewards/correctness_reward_func": 0.1830357201397419, |
|
"rewards/int_reward_func": 0.2544642947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10552009008824825, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 97.59598731994629, |
|
"epoch": 0.6067415730337079, |
|
"grad_norm": 0.4500925838947296, |
|
"kl": NaN, |
|
"learning_rate": 2.6308398906073603e-06, |
|
"loss": 0.0447, |
|
"reward": 0.2094486728310585, |
|
"reward_std": 0.7504701465368271, |
|
"rewards/correctness_reward_func": 0.12500000558793545, |
|
"rewards/int_reward_func": 0.2321428693830967, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14881027303636074, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 94.28125381469727, |
|
"epoch": 0.6104868913857678, |
|
"grad_norm": 0.4558067321777344, |
|
"kl": 1.154296875, |
|
"learning_rate": 2.5981495393976718e-06, |
|
"loss": 0.0462, |
|
"reward": 0.26968081295490265, |
|
"reward_std": 0.8480332493782043, |
|
"rewards/correctness_reward_func": 0.1562500074505806, |
|
"rewards/int_reward_func": 0.2410714440047741, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1287567038089037, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 98.24107551574707, |
|
"epoch": 0.6142322097378277, |
|
"grad_norm": 0.4498756229877472, |
|
"kl": 1.177734375, |
|
"learning_rate": 2.5654423707696834e-06, |
|
"loss": 0.0471, |
|
"reward": 0.42699556052684784, |
|
"reward_std": 0.8207688927650452, |
|
"rewards/correctness_reward_func": 0.2544642984867096, |
|
"rewards/int_reward_func": 0.2667410783469677, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09420982468873262, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 93.59152221679688, |
|
"epoch": 0.6179775280898876, |
|
"grad_norm": 0.40786251425743103, |
|
"kl": 1.1708984375, |
|
"learning_rate": 2.5327239889283613e-06, |
|
"loss": 0.0468, |
|
"reward": 0.28490403294563293, |
|
"reward_std": 0.7075008153915405, |
|
"rewards/correctness_reward_func": 0.1562500037252903, |
|
"rewards/int_reward_func": 0.2399553693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11130134668201208, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 97.06920051574707, |
|
"epoch": 0.6217228464419475, |
|
"grad_norm": 0.4559285640716553, |
|
"kl": 1.146484375, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0459, |
|
"reward": 0.36037053912878036, |
|
"reward_std": 0.7710148096084595, |
|
"rewards/correctness_reward_func": 0.2098214365541935, |
|
"rewards/int_reward_func": 0.2488839328289032, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09833482303656638, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 102.25223731994629, |
|
"epoch": 0.6254681647940075, |
|
"grad_norm": 0.4517359137535095, |
|
"kl": 1.250732421875, |
|
"learning_rate": 2.4672760110716395e-06, |
|
"loss": 0.05, |
|
"reward": 0.3416629731655121, |
|
"reward_std": 0.7625188678503036, |
|
"rewards/correctness_reward_func": 0.2098214402794838, |
|
"rewards/int_reward_func": 0.2600446529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12820313312113285, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 106.65402412414551, |
|
"epoch": 0.6292134831460674, |
|
"grad_norm": 0.5974560379981995, |
|
"kl": 1.312255859375, |
|
"learning_rate": 2.434557629230318e-06, |
|
"loss": 0.0525, |
|
"reward": 0.3028549253940582, |
|
"reward_std": 0.6672599911689758, |
|
"rewards/correctness_reward_func": 0.12946429336443543, |
|
"rewards/int_reward_func": 0.2622767947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08888616785407066, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 101.56696701049805, |
|
"epoch": 0.6329588014981273, |
|
"grad_norm": 0.5514973402023315, |
|
"kl": 1.394287109375, |
|
"learning_rate": 2.4018504606023295e-06, |
|
"loss": 0.0558, |
|
"reward": 0.3376808315515518, |
|
"reward_std": 0.7231378108263016, |
|
"rewards/correctness_reward_func": 0.1696428693830967, |
|
"rewards/int_reward_func": 0.2555803619325161, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08754241955466568, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 96.87054061889648, |
|
"epoch": 0.6367041198501873, |
|
"grad_norm": 0.47777560353279114, |
|
"kl": 1.227783203125, |
|
"learning_rate": 2.3691601093926406e-06, |
|
"loss": 0.0491, |
|
"reward": 0.33633705973625183, |
|
"reward_std": 0.7033251821994781, |
|
"rewards/correctness_reward_func": 0.1696428656578064, |
|
"rewards/int_reward_func": 0.2500000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.08442188054323196, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 98.07366371154785, |
|
"epoch": 0.6404494382022472, |
|
"grad_norm": 0.47777560353279114, |
|
"kl": NaN, |
|
"learning_rate": 2.3691601093926406e-06, |
|
"loss": 0.0531, |
|
"reward": 0.35397323966026306, |
|
"reward_std": 0.8121795952320099, |
|
"rewards/correctness_reward_func": 0.196428582072258, |
|
"rewards/int_reward_func": 0.2600446492433548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10250000539235771, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 95.59821891784668, |
|
"epoch": 0.6441947565543071, |
|
"grad_norm": 0.47777560353279114, |
|
"kl": NaN, |
|
"learning_rate": 2.3691601093926406e-06, |
|
"loss": 0.053, |
|
"reward": 0.43456026911735535, |
|
"reward_std": 0.9222677648067474, |
|
"rewards/correctness_reward_func": 0.2857142984867096, |
|
"rewards/int_reward_func": 0.2633928768336773, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1145468857139349, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 90.60714721679688, |
|
"epoch": 0.6479400749063671, |
|
"grad_norm": 0.4840092957019806, |
|
"kl": 1.194580078125, |
|
"learning_rate": 2.3364921769246423e-06, |
|
"loss": 0.0478, |
|
"reward": 0.27813393622636795, |
|
"reward_std": 0.899843841791153, |
|
"rewards/correctness_reward_func": 0.2053571529686451, |
|
"rewards/int_reward_func": 0.2455357238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.17275893315672874, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 97.23214912414551, |
|
"epoch": 0.651685393258427, |
|
"grad_norm": 0.5476865172386169, |
|
"kl": 1.3369140625, |
|
"learning_rate": 2.3038522606803882e-06, |
|
"loss": 0.0535, |
|
"reward": 0.3434709906578064, |
|
"reward_std": 0.7968785911798477, |
|
"rewards/correctness_reward_func": 0.1919642947614193, |
|
"rewards/int_reward_func": 0.2566964402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10518973506987095, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 100.55357551574707, |
|
"epoch": 0.6554307116104869, |
|
"grad_norm": 0.44576528668403625, |
|
"kl": 1.319091796875, |
|
"learning_rate": 2.271245953341494e-06, |
|
"loss": 0.0528, |
|
"reward": 0.3407433070242405, |
|
"reward_std": 0.7602152675390244, |
|
"rewards/correctness_reward_func": 0.160714291036129, |
|
"rewards/int_reward_func": 0.2488839440047741, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0688549093902111, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 94.70089721679688, |
|
"epoch": 0.6591760299625468, |
|
"grad_norm": 0.4624294340610504, |
|
"kl": 1.206298828125, |
|
"learning_rate": 2.238678841830867e-06, |
|
"loss": 0.0483, |
|
"reward": 0.3417031392455101, |
|
"reward_std": 0.8133516311645508, |
|
"rewards/correctness_reward_func": 0.1919642984867096, |
|
"rewards/int_reward_func": 0.258928582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10918973386287689, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 90.06473541259766, |
|
"epoch": 0.6629213483146067, |
|
"grad_norm": 0.4517357349395752, |
|
"kl": 1.264404296875, |
|
"learning_rate": 2.2061565063554063e-06, |
|
"loss": 0.0506, |
|
"reward": 0.2323437575250864, |
|
"reward_std": 0.7963760495185852, |
|
"rewards/correctness_reward_func": 0.1428571492433548, |
|
"rewards/int_reward_func": 0.2388392984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14935269264969975, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 95.54464721679688, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.4517357349395752, |
|
"kl": NaN, |
|
"learning_rate": 2.2061565063554063e-06, |
|
"loss": 0.0465, |
|
"reward": 0.33185046166181564, |
|
"reward_std": 0.7673598080873489, |
|
"rewards/correctness_reward_func": 0.15625000558793545, |
|
"rewards/int_reward_func": 0.2633928693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08779241424053907, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 95.11830711364746, |
|
"epoch": 0.6704119850187266, |
|
"grad_norm": 0.4517357349395752, |
|
"kl": NaN, |
|
"learning_rate": 2.2061565063554063e-06, |
|
"loss": 0.0485, |
|
"reward": 0.3244776912033558, |
|
"reward_std": 0.7535363733768463, |
|
"rewards/correctness_reward_func": 0.15178572200238705, |
|
"rewards/int_reward_func": 0.2533482201397419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08065625140443444, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 92.10937881469727, |
|
"epoch": 0.6741573033707865, |
|
"grad_norm": 0.43252718448638916, |
|
"kl": 1.205078125, |
|
"learning_rate": 2.173684519449872e-06, |
|
"loss": 0.0482, |
|
"reward": 0.3435089588165283, |
|
"reward_std": 0.6813161820173264, |
|
"rewards/correctness_reward_func": 0.1919642984867096, |
|
"rewards/int_reward_func": 0.2354910895228386, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08394643478095531, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 92.46652221679688, |
|
"epoch": 0.6779026217228464, |
|
"grad_norm": 0.4764062166213989, |
|
"kl": 1.170166015625, |
|
"learning_rate": 2.1412684450220524e-06, |
|
"loss": 0.0468, |
|
"reward": 0.40853575617074966, |
|
"reward_std": 0.8349853605031967, |
|
"rewards/correctness_reward_func": 0.2500000111758709, |
|
"rewards/int_reward_func": 0.2600446566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10150893498212099, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 97.13616371154785, |
|
"epoch": 0.6816479400749064, |
|
"grad_norm": 0.4394303262233734, |
|
"kl": 1.2177734375, |
|
"learning_rate": 2.1089138373994226e-06, |
|
"loss": 0.0487, |
|
"reward": 0.3534955531358719, |
|
"reward_std": 0.7782185822725296, |
|
"rewards/correctness_reward_func": 0.2008928693830967, |
|
"rewards/int_reward_func": 0.271205373108387, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11860269121825695, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 93.8035774230957, |
|
"epoch": 0.6853932584269663, |
|
"grad_norm": 0.4350711405277252, |
|
"kl": 1.158935546875, |
|
"learning_rate": 2.0766262403774388e-06, |
|
"loss": 0.0464, |
|
"reward": 0.29088394716382027, |
|
"reward_std": 0.8285562247037888, |
|
"rewards/correctness_reward_func": 0.1741071455180645, |
|
"rewards/int_reward_func": 0.2321428656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11536607332527637, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 102.04687881469727, |
|
"epoch": 0.6891385767790262, |
|
"grad_norm": 0.5000995993614197, |
|
"kl": 1.3017578125, |
|
"learning_rate": 2.0444111862696313e-06, |
|
"loss": 0.0521, |
|
"reward": 0.33771875873208046, |
|
"reward_std": 0.786424919962883, |
|
"rewards/correctness_reward_func": 0.1785714328289032, |
|
"rewards/int_reward_func": 0.2645089328289032, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10536161065101624, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 84.60714530944824, |
|
"epoch": 0.6928838951310862, |
|
"grad_norm": 0.4194977283477783, |
|
"kl": 1.094482421875, |
|
"learning_rate": 2.01227419495968e-06, |
|
"loss": 0.0438, |
|
"reward": 0.2823236584663391, |
|
"reward_std": 0.8382576406002045, |
|
"rewards/correctness_reward_func": 0.1741071529686451, |
|
"rewards/int_reward_func": 0.2343750149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1261584870517254, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 101.93527221679688, |
|
"epoch": 0.6966292134831461, |
|
"grad_norm": 0.5226723551750183, |
|
"kl": 1.428466796875, |
|
"learning_rate": 1.9802207729556023e-06, |
|
"loss": 0.0571, |
|
"reward": 0.3646428808569908, |
|
"reward_std": 0.7940028458833694, |
|
"rewards/correctness_reward_func": 0.20089286752045155, |
|
"rewards/int_reward_func": 0.2645089402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1007589353248477, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 89.75893211364746, |
|
"epoch": 0.700374531835206, |
|
"grad_norm": 0.5314286351203918, |
|
"kl": 1.265625, |
|
"learning_rate": 1.9482564124462478e-06, |
|
"loss": 0.0506, |
|
"reward": 0.34281474351882935, |
|
"reward_std": 0.7245265394449234, |
|
"rewards/correctness_reward_func": 0.1875000111758709, |
|
"rewards/int_reward_func": 0.2500000111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09468527138233185, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 89.30357360839844, |
|
"epoch": 0.704119850187266, |
|
"grad_norm": 0.4597249925136566, |
|
"kl": 1.21875, |
|
"learning_rate": 1.9163865903602374e-06, |
|
"loss": 0.0488, |
|
"reward": 0.3371138572692871, |
|
"reward_std": 0.7971315979957581, |
|
"rewards/correctness_reward_func": 0.1964285783469677, |
|
"rewards/int_reward_func": 0.2410714328289032, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10038616601377726, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 98.16071891784668, |
|
"epoch": 0.7078651685393258, |
|
"grad_norm": 0.4473523497581482, |
|
"kl": 1.35888671875, |
|
"learning_rate": 1.8846167674275175e-06, |
|
"loss": 0.0544, |
|
"reward": 0.2626696489751339, |
|
"reward_std": 0.6907573491334915, |
|
"rewards/correctness_reward_func": 0.1383928619325161, |
|
"rewards/int_reward_func": 0.2555803619325161, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13130357582122087, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 102.61830711364746, |
|
"epoch": 0.7116104868913857, |
|
"grad_norm": 0.4473523497581482, |
|
"kl": NaN, |
|
"learning_rate": 1.8846167674275175e-06, |
|
"loss": 0.054, |
|
"reward": 0.3266384117305279, |
|
"reward_std": 0.8520393073558807, |
|
"rewards/correctness_reward_func": 0.1919642947614193, |
|
"rewards/int_reward_func": 0.2600446566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12537054996937513, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 98.22098731994629, |
|
"epoch": 0.7153558052434457, |
|
"grad_norm": 0.6242156624794006, |
|
"kl": 1.2861328125, |
|
"learning_rate": 1.852952387243698e-06, |
|
"loss": 0.0514, |
|
"reward": 0.2729821652173996, |
|
"reward_std": 0.7455658465623856, |
|
"rewards/correctness_reward_func": 0.1785714365541935, |
|
"rewards/int_reward_func": 0.2343750074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1399642862379551, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 95.50446701049805, |
|
"epoch": 0.7191011235955056, |
|
"grad_norm": 0.5410141348838806, |
|
"kl": 1.269287109375, |
|
"learning_rate": 1.8213988753373147e-06, |
|
"loss": 0.0508, |
|
"reward": 0.3193794898688793, |
|
"reward_std": 0.8921961933374405, |
|
"rewards/correctness_reward_func": 0.2187500074505806, |
|
"rewards/int_reward_func": 0.2488839402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14825446717441082, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 95.50893211364746, |
|
"epoch": 0.7228464419475655, |
|
"grad_norm": 0.5410141348838806, |
|
"kl": NaN, |
|
"learning_rate": 1.8213988753373147e-06, |
|
"loss": 0.0494, |
|
"reward": 0.4274576008319855, |
|
"reward_std": 0.8424459546804428, |
|
"rewards/correctness_reward_func": 0.2500000074505806, |
|
"rewards/int_reward_func": 0.2555803656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07812276761978865, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 100.1406307220459, |
|
"epoch": 0.7265917602996255, |
|
"grad_norm": 0.5630261301994324, |
|
"kl": 1.331298828125, |
|
"learning_rate": 1.7899616382401935e-06, |
|
"loss": 0.0532, |
|
"reward": 0.24447321146726608, |
|
"reward_std": 0.8442487269639969, |
|
"rewards/correctness_reward_func": 0.1517857201397419, |
|
"rewards/int_reward_func": 0.2455357275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15284822601825, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 98.96429061889648, |
|
"epoch": 0.7303370786516854, |
|
"grad_norm": 0.5630261301994324, |
|
"kl": NaN, |
|
"learning_rate": 1.7899616382401935e-06, |
|
"loss": 0.0485, |
|
"reward": 0.3635468855500221, |
|
"reward_std": 0.8013804405927658, |
|
"rewards/correctness_reward_func": 0.1875000074505806, |
|
"rewards/int_reward_func": 0.2544642984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07841741375159472, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 96.05804252624512, |
|
"epoch": 0.7340823970037453, |
|
"grad_norm": 0.4605935513973236, |
|
"kl": 1.296875, |
|
"learning_rate": 1.758646062561073e-06, |
|
"loss": 0.0519, |
|
"reward": 0.30311162024736404, |
|
"reward_std": 0.7245951294898987, |
|
"rewards/correctness_reward_func": 0.13839286379516125, |
|
"rewards/int_reward_func": 0.2433035783469677, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07858482515439391, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 99.58036041259766, |
|
"epoch": 0.7378277153558053, |
|
"grad_norm": 0.5969142913818359, |
|
"kl": 1.305419921875, |
|
"learning_rate": 1.7274575140626318e-06, |
|
"loss": 0.0522, |
|
"reward": 0.26581921428442, |
|
"reward_std": 0.8090188354253769, |
|
"rewards/correctness_reward_func": 0.15625000931322575, |
|
"rewards/int_reward_func": 0.2310267984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12145759037230164, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 91.8995590209961, |
|
"epoch": 0.7415730337078652, |
|
"grad_norm": 0.527574360370636, |
|
"kl": 1.245849609375, |
|
"learning_rate": 1.6964013367420967e-06, |
|
"loss": 0.0498, |
|
"reward": 0.38672323897480965, |
|
"reward_std": 0.7834379523992538, |
|
"rewards/correctness_reward_func": 0.2232142947614193, |
|
"rewards/int_reward_func": 0.2689732238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10546428337693214, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 98.9375057220459, |
|
"epoch": 0.7453183520599251, |
|
"grad_norm": 0.5717042684555054, |
|
"kl": 1.277587890625, |
|
"learning_rate": 1.665482851915573e-06, |
|
"loss": 0.0511, |
|
"reward": 0.24759376049041748, |
|
"reward_std": 0.6963834911584854, |
|
"rewards/correctness_reward_func": 0.1116071492433548, |
|
"rewards/int_reward_func": 0.2343750111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09838839736767113, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 96.61830711364746, |
|
"epoch": 0.7490636704119851, |
|
"grad_norm": 0.5991680026054382, |
|
"kl": 1.228271484375, |
|
"learning_rate": 1.634707357306267e-06, |
|
"loss": 0.0491, |
|
"reward": 0.3660937622189522, |
|
"reward_std": 0.8573340475559235, |
|
"rewards/correctness_reward_func": 0.2366071492433548, |
|
"rewards/int_reward_func": 0.2488839365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11939733009785414, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 106.84152221679688, |
|
"epoch": 0.7528089887640449, |
|
"grad_norm": 0.5178794860839844, |
|
"kl": 1.34326171875, |
|
"learning_rate": 1.6040801261367494e-06, |
|
"loss": 0.0537, |
|
"reward": 0.3160022422671318, |
|
"reward_std": 0.8164055794477463, |
|
"rewards/correctness_reward_func": 0.196428582072258, |
|
"rewards/int_reward_func": 0.2622767984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14270313642919064, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 90.1875057220459, |
|
"epoch": 0.7565543071161048, |
|
"grad_norm": 0.5120040774345398, |
|
"kl": 1.171142578125, |
|
"learning_rate": 1.5736064062254094e-06, |
|
"loss": 0.0468, |
|
"reward": 0.2789843790233135, |
|
"reward_std": 0.8016841560602188, |
|
"rewards/correctness_reward_func": 0.1562500074505806, |
|
"rewards/int_reward_func": 0.251116082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12838170863687992, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 90.62277030944824, |
|
"epoch": 0.7602996254681648, |
|
"grad_norm": 0.4355852007865906, |
|
"kl": 1.2255859375, |
|
"learning_rate": 1.5432914190872757e-06, |
|
"loss": 0.049, |
|
"reward": 0.34145762026309967, |
|
"reward_std": 0.745452344417572, |
|
"rewards/correctness_reward_func": 0.1696428656578064, |
|
"rewards/int_reward_func": 0.2645089402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09269420150667429, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 96.64955711364746, |
|
"epoch": 0.7640449438202247, |
|
"grad_norm": 0.49031803011894226, |
|
"kl": 1.14404296875, |
|
"learning_rate": 1.5131403590393323e-06, |
|
"loss": 0.0458, |
|
"reward": 0.3124375157058239, |
|
"reward_std": 0.7880858033895493, |
|
"rewards/correctness_reward_func": 0.1875000074505806, |
|
"rewards/int_reward_func": 0.2343750074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10943750524893403, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 90.63616561889648, |
|
"epoch": 0.7677902621722846, |
|
"grad_norm": 0.48222535848617554, |
|
"kl": 1.188720703125, |
|
"learning_rate": 1.4831583923105e-06, |
|
"loss": 0.0475, |
|
"reward": 0.372580386698246, |
|
"reward_std": 0.8582592159509659, |
|
"rewards/correctness_reward_func": 0.2187500074505806, |
|
"rewards/int_reward_func": 0.2522321566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0984017881564796, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 99.03348922729492, |
|
"epoch": 0.7715355805243446, |
|
"grad_norm": 0.4410792589187622, |
|
"kl": 1.247314453125, |
|
"learning_rate": 1.4533506561564305e-06, |
|
"loss": 0.0499, |
|
"reward": 0.33057814463973045, |
|
"reward_std": 0.7788708359003067, |
|
"rewards/correctness_reward_func": 0.2142857238650322, |
|
"rewards/int_reward_func": 0.2433035857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1270111622288823, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 103.51786041259766, |
|
"epoch": 0.7752808988764045, |
|
"grad_norm": 0.5613678693771362, |
|
"kl": 1.181640625, |
|
"learning_rate": 1.4237222579792618e-06, |
|
"loss": 0.0473, |
|
"reward": 0.26221875846385956, |
|
"reward_std": 0.7947122156620026, |
|
"rewards/correctness_reward_func": 0.14732143469154835, |
|
"rewards/int_reward_func": 0.2421875074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12729018926620483, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 92.02232551574707, |
|
"epoch": 0.7790262172284644, |
|
"grad_norm": 0.6175336241722107, |
|
"kl": 1.016357421875, |
|
"learning_rate": 1.3942782744524974e-06, |
|
"loss": 0.0406, |
|
"reward": 0.2860134020447731, |
|
"reward_std": 0.7233386188745499, |
|
"rewards/correctness_reward_func": 0.1339285783469677, |
|
"rewards/int_reward_func": 0.2276785783469677, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.075593750923872, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 103.3125057220459, |
|
"epoch": 0.7827715355805244, |
|
"grad_norm": 0.4679234027862549, |
|
"kl": 1.18408203125, |
|
"learning_rate": 1.3650237506511333e-06, |
|
"loss": 0.0474, |
|
"reward": 0.28701116889715195, |
|
"reward_std": 0.8643300235271454, |
|
"rewards/correctness_reward_func": 0.2098214365541935, |
|
"rewards/int_reward_func": 0.2578125111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.18062277510762215, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 102.34598541259766, |
|
"epoch": 0.7865168539325843, |
|
"grad_norm": 0.5165115594863892, |
|
"kl": 1.170654296875, |
|
"learning_rate": 1.3359636991872215e-06, |
|
"loss": 0.0468, |
|
"reward": 0.30597545951604843, |
|
"reward_std": 0.6952601373195648, |
|
"rewards/correctness_reward_func": 0.1428571492433548, |
|
"rewards/int_reward_func": 0.2500000111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08688170462846756, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 114.28572082519531, |
|
"epoch": 0.7902621722846442, |
|
"grad_norm": 0.4335125982761383, |
|
"kl": 1.242919921875, |
|
"learning_rate": 1.307103099350979e-06, |
|
"loss": 0.0497, |
|
"reward": 0.2512656319886446, |
|
"reward_std": 0.8131757378578186, |
|
"rewards/correctness_reward_func": 0.1517857201397419, |
|
"rewards/int_reward_func": 0.2377232238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1382433008402586, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 101.02009391784668, |
|
"epoch": 0.7940074906367042, |
|
"grad_norm": 0.5146499276161194, |
|
"kl": 1.20947265625, |
|
"learning_rate": 1.2784468962576136e-06, |
|
"loss": 0.0484, |
|
"reward": 0.27424776926636696, |
|
"reward_std": 0.76705102622509, |
|
"rewards/correctness_reward_func": 0.13392858020961285, |
|
"rewards/int_reward_func": 0.2377232201397419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09740401990711689, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 108.00000381469727, |
|
"epoch": 0.797752808988764, |
|
"grad_norm": 0.46153557300567627, |
|
"kl": 1.201416015625, |
|
"learning_rate": 1.2500000000000007e-06, |
|
"loss": 0.048, |
|
"reward": 0.2503928691148758, |
|
"reward_std": 0.8307089358568192, |
|
"rewards/correctness_reward_func": 0.1875000074505806, |
|
"rewards/int_reward_func": 0.2410714365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1781785748898983, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 93.59598731994629, |
|
"epoch": 0.8014981273408239, |
|
"grad_norm": 0.49990415573120117, |
|
"kl": 1.1107177734375, |
|
"learning_rate": 1.2217672848073702e-06, |
|
"loss": 0.0444, |
|
"reward": 0.3425290137529373, |
|
"reward_std": 0.8552941530942917, |
|
"rewards/correctness_reward_func": 0.2187500074505806, |
|
"rewards/int_reward_func": 0.2645089477300644, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1418459787964821, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 100.52009201049805, |
|
"epoch": 0.8052434456928839, |
|
"grad_norm": 0.46211493015289307, |
|
"kl": 1.141845703125, |
|
"learning_rate": 1.193753588210128e-06, |
|
"loss": 0.0457, |
|
"reward": 0.2656339444220066, |
|
"reward_std": 0.7493345886468887, |
|
"rewards/correctness_reward_func": 0.15625000931322575, |
|
"rewards/int_reward_func": 0.2767857275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1674017831683159, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 90.06473541259766, |
|
"epoch": 0.8089887640449438, |
|
"grad_norm": 0.4925229549407959, |
|
"kl": 1.171630859375, |
|
"learning_rate": 1.1659637102109713e-06, |
|
"loss": 0.0469, |
|
"reward": 0.31793973594903946, |
|
"reward_std": 0.8032208532094955, |
|
"rewards/correctness_reward_func": 0.1741071529686451, |
|
"rewards/int_reward_func": 0.2433035857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09947098419070244, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 93.21428871154785, |
|
"epoch": 0.8127340823970037, |
|
"grad_norm": 0.4107387363910675, |
|
"kl": 1.096923828125, |
|
"learning_rate": 1.1384024124624324e-06, |
|
"loss": 0.0439, |
|
"reward": 0.2808660827577114, |
|
"reward_std": 0.7595269531011581, |
|
"rewards/correctness_reward_func": 0.19642857648432255, |
|
"rewards/int_reward_func": 0.243303582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15886607952415943, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 107.45982551574707, |
|
"epoch": 0.8164794007490637, |
|
"grad_norm": 0.4463358521461487, |
|
"kl": 1.247802734375, |
|
"learning_rate": 1.1110744174509952e-06, |
|
"loss": 0.0499, |
|
"reward": 0.27611831203103065, |
|
"reward_std": 0.8640467375516891, |
|
"rewards/correctness_reward_func": 0.2187500074505806, |
|
"rewards/int_reward_func": 0.2444196492433548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.18705134466290474, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 96.86384582519531, |
|
"epoch": 0.8202247191011236, |
|
"grad_norm": 0.5301110148429871, |
|
"kl": 1.199951171875, |
|
"learning_rate": 1.0839844076879186e-06, |
|
"loss": 0.048, |
|
"reward": 0.31224555149674416, |
|
"reward_std": 0.7878952473402023, |
|
"rewards/correctness_reward_func": 0.2053571529686451, |
|
"rewards/int_reward_func": 0.258928582072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15204017609357834, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 91.19643211364746, |
|
"epoch": 0.8239700374531835, |
|
"grad_norm": 0.4537685811519623, |
|
"kl": 1.101806640625, |
|
"learning_rate": 1.0571370249069163e-06, |
|
"loss": 0.0441, |
|
"reward": 0.3926495686173439, |
|
"reward_std": 0.8612103760242462, |
|
"rewards/correctness_reward_func": 0.2008928693830967, |
|
"rewards/int_reward_func": 0.2611607313156128, |
|
"rewards/soft_format_reward_func": 0.0011160714784637094, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07052009226754308, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 90.3125057220459, |
|
"epoch": 0.8277153558052435, |
|
"grad_norm": 0.4537685811519623, |
|
"kl": NaN, |
|
"learning_rate": 1.0571370249069163e-06, |
|
"loss": 0.0418, |
|
"reward": 0.28333261236548424, |
|
"reward_std": 0.7591045498847961, |
|
"rewards/correctness_reward_func": 0.1562500074505806, |
|
"rewards/int_reward_func": 0.2455357275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11845313012599945, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 90.84152030944824, |
|
"epoch": 0.8314606741573034, |
|
"grad_norm": 0.5334520936012268, |
|
"kl": 1.140625, |
|
"learning_rate": 1.0305368692688175e-06, |
|
"loss": 0.0456, |
|
"reward": 0.305205374956131, |
|
"reward_std": 0.7375971227884293, |
|
"rewards/correctness_reward_func": 0.1830357164144516, |
|
"rewards/int_reward_func": 0.2310267984867096, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10885715018957853, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 93.97098541259766, |
|
"epoch": 0.8352059925093633, |
|
"grad_norm": 0.4208020269870758, |
|
"kl": 1.108154296875, |
|
"learning_rate": 1.0041884985733524e-06, |
|
"loss": 0.0443, |
|
"reward": 0.339029036462307, |
|
"reward_std": 0.7930542379617691, |
|
"rewards/correctness_reward_func": 0.2053571492433548, |
|
"rewards/int_reward_func": 0.2287946529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09512277226895094, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 102.49107551574707, |
|
"epoch": 0.8389513108614233, |
|
"grad_norm": 0.41976016759872437, |
|
"kl": 1.1943359375, |
|
"learning_rate": 9.780964274781984e-07, |
|
"loss": 0.0478, |
|
"reward": 0.23499107360839844, |
|
"reward_std": 0.7919187396764755, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.219866082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15005357982590795, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 88.04464912414551, |
|
"epoch": 0.8426966292134831, |
|
"grad_norm": 0.41976016759872437, |
|
"kl": NaN, |
|
"learning_rate": 9.780964274781984e-07, |
|
"loss": 0.0464, |
|
"reward": 0.20726785995066166, |
|
"reward_std": 0.7513840273022652, |
|
"rewards/correctness_reward_func": 0.10714286379516125, |
|
"rewards/int_reward_func": 0.219866082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11974107846617699, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 99.45536422729492, |
|
"epoch": 0.846441947565543, |
|
"grad_norm": 0.6243520975112915, |
|
"kl": 1.137939453125, |
|
"learning_rate": 9.522651267254149e-07, |
|
"loss": 0.0455, |
|
"reward": 0.3012098353356123, |
|
"reward_std": 0.7535159438848495, |
|
"rewards/correctness_reward_func": 0.1696428656578064, |
|
"rewards/int_reward_func": 0.2287946566939354, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09722768981009722, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 88.08928871154785, |
|
"epoch": 0.850187265917603, |
|
"grad_norm": 0.5370275974273682, |
|
"kl": 1.02734375, |
|
"learning_rate": 9.266990223754069e-07, |
|
"loss": 0.0411, |
|
"reward": 0.3909241184592247, |
|
"reward_std": 0.7717972099781036, |
|
"rewards/correctness_reward_func": 0.1964285783469677, |
|
"rewards/int_reward_func": 0.2511160857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.05662053730338812, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 92.60937881469727, |
|
"epoch": 0.8539325842696629, |
|
"grad_norm": 0.4681946337223053, |
|
"kl": 1.1201171875, |
|
"learning_rate": 9.014024950485384e-07, |
|
"loss": 0.0448, |
|
"reward": 0.3636852651834488, |
|
"reward_std": 0.8009557723999023, |
|
"rewards/correctness_reward_func": 0.2053571566939354, |
|
"rewards/int_reward_func": 0.251116082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09278795216232538, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 105.85491371154785, |
|
"epoch": 0.8576779026217228, |
|
"grad_norm": 0.4945945143699646, |
|
"kl": 1.174072265625, |
|
"learning_rate": 8.763798791745413e-07, |
|
"loss": 0.047, |
|
"reward": 0.3037031330168247, |
|
"reward_std": 0.8670637309551239, |
|
"rewards/correctness_reward_func": 0.2008928656578064, |
|
"rewards/int_reward_func": 0.2745535857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.17174331843852997, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 94.59152221679688, |
|
"epoch": 0.8614232209737828, |
|
"grad_norm": 0.5289459228515625, |
|
"kl": 1.0704345703125, |
|
"learning_rate": 8.516354622498279e-07, |
|
"loss": 0.0428, |
|
"reward": 0.3705156408250332, |
|
"reward_std": 0.8742925226688385, |
|
"rewards/correctness_reward_func": 0.223214291036129, |
|
"rewards/int_reward_func": 0.2488839365541935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10158259607851505, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 86.8683090209961, |
|
"epoch": 0.8651685393258427, |
|
"grad_norm": 0.4612804353237152, |
|
"kl": 1.0635986328125, |
|
"learning_rate": 8.271734841028553e-07, |
|
"loss": 0.0425, |
|
"reward": 0.32227009534835815, |
|
"reward_std": 0.7606519907712936, |
|
"rewards/correctness_reward_func": 0.14285715110599995, |
|
"rewards/int_reward_func": 0.2600446529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08063170197419822, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 109.55357551574707, |
|
"epoch": 0.8689138576779026, |
|
"grad_norm": 0.5144924521446228, |
|
"kl": 1.314208984375, |
|
"learning_rate": 8.029981361676456e-07, |
|
"loss": 0.0526, |
|
"reward": 0.28595758974552155, |
|
"reward_std": 0.8525267541408539, |
|
"rewards/correctness_reward_func": 0.2098214440047741, |
|
"rewards/int_reward_func": 0.2611607238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.185024568811059, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 98.18303871154785, |
|
"epoch": 0.8726591760299626, |
|
"grad_norm": 0.5144924521446228, |
|
"kl": NaN, |
|
"learning_rate": 8.029981361676456e-07, |
|
"loss": 0.0476, |
|
"reward": 0.2972254566848278, |
|
"reward_std": 0.7380311787128448, |
|
"rewards/correctness_reward_func": 0.1473214365541935, |
|
"rewards/int_reward_func": 0.2433035857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09339955315226689, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 98.16741371154785, |
|
"epoch": 0.8764044943820225, |
|
"grad_norm": 0.5009276270866394, |
|
"kl": 1.244384765625, |
|
"learning_rate": 7.791135607656147e-07, |
|
"loss": 0.0498, |
|
"reward": 0.3269129544496536, |
|
"reward_std": 0.7167427837848663, |
|
"rewards/correctness_reward_func": 0.16071429662406445, |
|
"rewards/int_reward_func": 0.2500000111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08380134031176567, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 97.88616371154785, |
|
"epoch": 0.8801498127340824, |
|
"grad_norm": 0.5268076062202454, |
|
"kl": 1.235107421875, |
|
"learning_rate": 7.555238503958001e-07, |
|
"loss": 0.0494, |
|
"reward": 0.3055223375558853, |
|
"reward_std": 0.8659389615058899, |
|
"rewards/correctness_reward_func": 0.1919642947614193, |
|
"rewards/int_reward_func": 0.2500000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13644197303801775, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 88.23661041259766, |
|
"epoch": 0.8838951310861424, |
|
"grad_norm": 0.5268076062202454, |
|
"kl": NaN, |
|
"learning_rate": 7.555238503958001e-07, |
|
"loss": 0.0497, |
|
"reward": 0.3386116325855255, |
|
"reward_std": 0.7405965030193329, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.2500000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07656696811318398, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 92.50000381469727, |
|
"epoch": 0.8876404494382022, |
|
"grad_norm": 0.49278637766838074, |
|
"kl": 1.129150390625, |
|
"learning_rate": 7.322330470336314e-07, |
|
"loss": 0.0452, |
|
"reward": 0.265178584959358, |
|
"reward_std": 0.866941437125206, |
|
"rewards/correctness_reward_func": 0.16517857648432255, |
|
"rewards/int_reward_func": 0.2477678693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14776786230504513, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 95.01339912414551, |
|
"epoch": 0.8913857677902621, |
|
"grad_norm": 0.46395623683929443, |
|
"kl": 1.181640625, |
|
"learning_rate": 7.092451414383644e-07, |
|
"loss": 0.0473, |
|
"reward": 0.2845067009329796, |
|
"reward_std": 0.8498467355966568, |
|
"rewards/correctness_reward_func": 0.1696428693830967, |
|
"rewards/int_reward_func": 0.2555803656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.14183259941637516, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 99.3593807220459, |
|
"epoch": 0.8951310861423221, |
|
"grad_norm": 0.6616207957267761, |
|
"kl": 1.17919921875, |
|
"learning_rate": 6.865640724692815e-07, |
|
"loss": 0.0472, |
|
"reward": 0.3268973380327225, |
|
"reward_std": 0.8011666536331177, |
|
"rewards/correctness_reward_func": 0.2008928656578064, |
|
"rewards/int_reward_func": 0.2500000149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12399554438889027, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 104.7745590209961, |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 0.5387265086174011, |
|
"kl": 1.183349609375, |
|
"learning_rate": 6.641937264107868e-07, |
|
"loss": 0.0473, |
|
"reward": 0.4128861799836159, |
|
"reward_std": 0.850861206650734, |
|
"rewards/correctness_reward_func": 0.2232142947614193, |
|
"rewards/int_reward_func": 0.2734375074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08376562781631947, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 92.34152221679688, |
|
"epoch": 0.9026217228464419, |
|
"grad_norm": 0.5197759866714478, |
|
"kl": 1.0531005859375, |
|
"learning_rate": 6.421379363065142e-07, |
|
"loss": 0.0421, |
|
"reward": 0.35036832839250565, |
|
"reward_std": 0.8599109500646591, |
|
"rewards/correctness_reward_func": 0.2187500149011612, |
|
"rewards/int_reward_func": 0.2645089440047741, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13289063051342964, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 111.65402030944824, |
|
"epoch": 0.9063670411985019, |
|
"grad_norm": 0.46235111355781555, |
|
"kl": 1.236572265625, |
|
"learning_rate": 6.204004813025569e-07, |
|
"loss": 0.0495, |
|
"reward": 0.3619754686951637, |
|
"reward_std": 0.7729392051696777, |
|
"rewards/correctness_reward_func": 0.2187500037252903, |
|
"rewards/int_reward_func": 0.2589285857975483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11570313014090061, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 103.99107551574707, |
|
"epoch": 0.9101123595505618, |
|
"grad_norm": 0.5195502638816833, |
|
"kl": 1.19384765625, |
|
"learning_rate": 5.989850859999227e-07, |
|
"loss": 0.0477, |
|
"reward": 0.2594174239784479, |
|
"reward_std": 0.7237197905778885, |
|
"rewards/correctness_reward_func": 0.1473214328289032, |
|
"rewards/int_reward_func": 0.2522321529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.14013616926968098, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 93.28571891784668, |
|
"epoch": 0.9138576779026217, |
|
"grad_norm": 0.4491368234157562, |
|
"kl": 1.1376953125, |
|
"learning_rate": 5.778954198163514e-07, |
|
"loss": 0.0455, |
|
"reward": 0.2785803731530905, |
|
"reward_std": 0.6952795684337616, |
|
"rewards/correctness_reward_func": 0.160714291036129, |
|
"rewards/int_reward_func": 0.2533482275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13548214174807072, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 95.37500381469727, |
|
"epoch": 0.9176029962546817, |
|
"grad_norm": 0.4656703770160675, |
|
"kl": 1.113037109375, |
|
"learning_rate": 5.571350963575728e-07, |
|
"loss": 0.0445, |
|
"reward": 0.3266986757516861, |
|
"reward_std": 0.7986108660697937, |
|
"rewards/correctness_reward_func": 0.1830357238650322, |
|
"rewards/int_reward_func": 0.2421875111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.09964063111692667, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 100.66964912414551, |
|
"epoch": 0.9213483146067416, |
|
"grad_norm": 0.48842382431030273, |
|
"kl": 1.119873046875, |
|
"learning_rate": 5.367076727981383e-07, |
|
"loss": 0.0448, |
|
"reward": 0.21152456477284431, |
|
"reward_std": 0.7304975092411041, |
|
"rewards/correctness_reward_func": 0.12053571827709675, |
|
"rewards/int_reward_func": 0.2064732275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11548437923192978, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 95.72991561889648, |
|
"epoch": 0.9250936329588015, |
|
"grad_norm": 0.48145803809165955, |
|
"kl": 1.124755859375, |
|
"learning_rate": 5.166166492719124e-07, |
|
"loss": 0.045, |
|
"reward": 0.2994174249470234, |
|
"reward_std": 0.7895640283823013, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.2377232275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10348437912762165, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 101.8192024230957, |
|
"epoch": 0.9288389513108615, |
|
"grad_norm": 0.4706651568412781, |
|
"kl": 1.165283203125, |
|
"learning_rate": 4.968654682723487e-07, |
|
"loss": 0.0466, |
|
"reward": 0.28485044091939926, |
|
"reward_std": 0.9133375287055969, |
|
"rewards/correctness_reward_func": 0.2098214365541935, |
|
"rewards/int_reward_func": 0.2321428656578064, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.15711384266614914, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 91.77902030944824, |
|
"epoch": 0.9325842696629213, |
|
"grad_norm": 0.9746862053871155, |
|
"kl": 1.063232421875, |
|
"learning_rate": 4.774575140626317e-07, |
|
"loss": 0.0425, |
|
"reward": 0.3371250182390213, |
|
"reward_std": 0.775251716375351, |
|
"rewards/correctness_reward_func": 0.1696428656578064, |
|
"rewards/int_reward_func": 0.2433035783469677, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07582143507897854, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 93.1718807220459, |
|
"epoch": 0.9363295880149812, |
|
"grad_norm": 0.4796634316444397, |
|
"kl": 1.09228515625, |
|
"learning_rate": 4.5839611209580277e-07, |
|
"loss": 0.0437, |
|
"reward": 0.42918528616428375, |
|
"reward_std": 0.7719597369432449, |
|
"rewards/correctness_reward_func": 0.2008928693830967, |
|
"rewards/int_reward_func": 0.2767857201397419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.04849330266006291, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 86.16964721679688, |
|
"epoch": 0.9400749063670412, |
|
"grad_norm": 0.4254949986934662, |
|
"kl": 1.08740234375, |
|
"learning_rate": 4.396845284449608e-07, |
|
"loss": 0.0435, |
|
"reward": 0.27672769874334335, |
|
"reward_std": 0.7562145739793777, |
|
"rewards/correctness_reward_func": 0.1383928619325161, |
|
"rewards/int_reward_func": 0.251116082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11278124991804361, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 101.90402221679688, |
|
"epoch": 0.9438202247191011, |
|
"grad_norm": 0.4744266867637634, |
|
"kl": 1.144775390625, |
|
"learning_rate": 4.2132596924363666e-07, |
|
"loss": 0.0458, |
|
"reward": 0.2469821460545063, |
|
"reward_std": 0.840282753109932, |
|
"rewards/correctness_reward_func": 0.1785714365541935, |
|
"rewards/int_reward_func": 0.2410714402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.17266072891652584, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 118.7723274230957, |
|
"epoch": 0.947565543071161, |
|
"grad_norm": 0.4241223633289337, |
|
"kl": 1.21875, |
|
"learning_rate": 4.033235801364402e-07, |
|
"loss": 0.0488, |
|
"reward": 0.15625000838190317, |
|
"reward_std": 0.7448219954967499, |
|
"rewards/correctness_reward_func": 0.1205357238650322, |
|
"rewards/int_reward_func": 0.2209821529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.1863839291036129, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 101.55580711364746, |
|
"epoch": 0.951310861423221, |
|
"grad_norm": 0.5570555925369263, |
|
"kl": 1.070068359375, |
|
"learning_rate": 3.85680445740067e-07, |
|
"loss": 0.0428, |
|
"reward": 0.23607589676976204, |
|
"reward_std": 0.7750666290521622, |
|
"rewards/correctness_reward_func": 0.1116071492433548, |
|
"rewards/int_reward_func": 0.2287946529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10432589706033468, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 95.45089721679688, |
|
"epoch": 0.9550561797752809, |
|
"grad_norm": 0.511661946773529, |
|
"kl": 1.1142578125, |
|
"learning_rate": 3.683995891147696e-07, |
|
"loss": 0.0446, |
|
"reward": 0.36682143807411194, |
|
"reward_std": 0.7813057452440262, |
|
"rewards/correctness_reward_func": 0.2053571529686451, |
|
"rewards/int_reward_func": 0.2600446529686451, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0985803622752428, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 101.62277030944824, |
|
"epoch": 0.9588014981273408, |
|
"grad_norm": 0.4439486563205719, |
|
"kl": 1.134765625, |
|
"learning_rate": 3.514839712463683e-07, |
|
"loss": 0.0454, |
|
"reward": 0.30028797313570976, |
|
"reward_std": 0.8320818990468979, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.2455357238650322, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11042634584009647, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 93.14062881469727, |
|
"epoch": 0.9625468164794008, |
|
"grad_norm": 0.5691079497337341, |
|
"kl": 1.0438232421875, |
|
"learning_rate": 3.3493649053890325e-07, |
|
"loss": 0.0418, |
|
"reward": 0.31014733761548996, |
|
"reward_std": 0.7862526774406433, |
|
"rewards/correctness_reward_func": 0.1741071529686451, |
|
"rewards/int_reward_func": 0.2399553693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10391518194228411, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 102.33705711364746, |
|
"epoch": 0.9662921348314607, |
|
"grad_norm": 0.5333299040794373, |
|
"kl": 1.126220703125, |
|
"learning_rate": 3.187599823180071e-07, |
|
"loss": 0.045, |
|
"reward": 0.32864734157919884, |
|
"reward_std": 0.7325298935174942, |
|
"rewards/correctness_reward_func": 0.1473214328289032, |
|
"rewards/int_reward_func": 0.2578125074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0764866080135107, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 93.04911231994629, |
|
"epoch": 0.9700374531835206, |
|
"grad_norm": 0.5367720127105713, |
|
"kl": 1.045654296875, |
|
"learning_rate": 3.0295721834508686e-07, |
|
"loss": 0.0418, |
|
"reward": 0.39304019510746, |
|
"reward_std": 0.9003488570451736, |
|
"rewards/correctness_reward_func": 0.254464291036129, |
|
"rewards/int_reward_func": 0.2678571492433548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12928125727921724, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 91.97545051574707, |
|
"epoch": 0.9737827715355806, |
|
"grad_norm": 0.4951762557029724, |
|
"kl": 1.07470703125, |
|
"learning_rate": 2.875309063423956e-07, |
|
"loss": 0.043, |
|
"reward": 0.21667636185884476, |
|
"reward_std": 0.8131074160337448, |
|
"rewards/correctness_reward_func": 0.1294642947614193, |
|
"rewards/int_reward_func": 0.2220982275903225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.13600223883986473, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 92.67634391784668, |
|
"epoch": 0.9775280898876404, |
|
"grad_norm": 0.5502758622169495, |
|
"kl": 1.0794677734375, |
|
"learning_rate": 2.7248368952908055e-07, |
|
"loss": 0.0432, |
|
"reward": 0.32996875420212746, |
|
"reward_std": 0.8287549465894699, |
|
"rewards/correctness_reward_func": 0.1785714328289032, |
|
"rewards/int_reward_func": 0.2723214402794838, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0011160714784637094, |
|
"rewards/xmlcount_reward_func": -0.12204018794000149, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 100.24777221679688, |
|
"epoch": 0.9812734082397003, |
|
"grad_norm": 0.5502758622169495, |
|
"kl": NaN, |
|
"learning_rate": 2.7248368952908055e-07, |
|
"loss": 0.0487, |
|
"reward": 0.3751874938607216, |
|
"reward_std": 0.7956403493881226, |
|
"rewards/correctness_reward_func": 0.2276785857975483, |
|
"rewards/int_reward_func": 0.266741082072258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11923214513808489, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 94.25000381469727, |
|
"epoch": 0.9850187265917603, |
|
"grad_norm": 0.5013113617897034, |
|
"kl": 1.0750732421875, |
|
"learning_rate": 2.5781814616827936e-07, |
|
"loss": 0.043, |
|
"reward": 0.30204688012599945, |
|
"reward_std": 0.8205768465995789, |
|
"rewards/correctness_reward_func": 0.1875000074505806, |
|
"rewards/int_reward_func": 0.2466517947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13210491463541985, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 88.36830711364746, |
|
"epoch": 0.9887640449438202, |
|
"grad_norm": 0.55992192029953, |
|
"kl": 1.045166015625, |
|
"learning_rate": 2.43536789125349e-07, |
|
"loss": 0.0418, |
|
"reward": 0.27569420635700226, |
|
"reward_std": 0.8564379215240479, |
|
"rewards/correctness_reward_func": 0.1651785783469677, |
|
"rewards/int_reward_func": 0.2466517947614193, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13613616861402988, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 89.41071701049805, |
|
"epoch": 0.9925093632958801, |
|
"grad_norm": 0.55992192029953, |
|
"kl": NaN, |
|
"learning_rate": 2.43536789125349e-07, |
|
"loss": 0.0415, |
|
"reward": 0.3237812668085098, |
|
"reward_std": 0.8230260014533997, |
|
"rewards/correctness_reward_func": 0.1785714402794838, |
|
"rewards/int_reward_func": 0.2656250074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.12041518278419971, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 105.87500381469727, |
|
"epoch": 0.9962546816479401, |
|
"grad_norm": 0.6678707003593445, |
|
"kl": 1.279541015625, |
|
"learning_rate": 2.2964206543729662e-07, |
|
"loss": 0.0512, |
|
"reward": 0.34079688787460327, |
|
"reward_std": 0.805058628320694, |
|
"rewards/correctness_reward_func": 0.196428582072258, |
|
"rewards/int_reward_func": 0.2399553693830967, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09558706358075142, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 102.1875, |
|
"epoch": 1.0, |
|
"grad_norm": 0.5808861255645752, |
|
"kl": 1.051025390625, |
|
"learning_rate": 2.1613635589349756e-07, |
|
"loss": 0.0441, |
|
"reward": 0.27787497639656067, |
|
"reward_std": 0.8605497926473618, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.28125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.2533750110305846, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 267, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.6097, |
|
"train_samples_per_second": 4642.518, |
|
"train_steps_per_second": 165.871 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 267, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 54, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|