|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9974847287100251, |
|
"eval_steps": 100, |
|
"global_step": 347, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 763.18623046875, |
|
"epoch": 0.01437297879985627, |
|
"grad_norm": 0.0765276625752449, |
|
"kl": -6.394833326339721e-06, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.0157, |
|
"reward": 0.17431640625, |
|
"reward_std": 0.23442449774593116, |
|
"rewards/accuracy_reward": 0.08994140625, |
|
"rewards/format_reward": 0.084375, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 698.62314453125, |
|
"epoch": 0.02874595759971254, |
|
"grad_norm": 0.11506624519824982, |
|
"kl": 0.00981593132019043, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.0584, |
|
"reward": 0.6732421875, |
|
"reward_std": 0.3674958860501647, |
|
"rewards/accuracy_reward": 0.07646484375, |
|
"rewards/format_reward": 0.59677734375, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 541.0048828125, |
|
"epoch": 0.04311893639956881, |
|
"grad_norm": 0.050542764365673065, |
|
"kl": 0.02561187744140625, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.0354, |
|
"reward": 1.036328125, |
|
"reward_std": 0.2127559134736657, |
|
"rewards/accuracy_reward": 0.102734375, |
|
"rewards/format_reward": 0.93359375, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 632.689453125, |
|
"epoch": 0.05749191519942508, |
|
"grad_norm": 0.03859843313694, |
|
"kl": 0.0311004638671875, |
|
"learning_rate": 1.1428571428571429e-05, |
|
"loss": 0.0249, |
|
"reward": 1.1552734375, |
|
"reward_std": 0.23164508808404208, |
|
"rewards/accuracy_reward": 0.20224609375, |
|
"rewards/format_reward": 0.95302734375, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.31357421875, |
|
"epoch": 0.07186489399928135, |
|
"grad_norm": 0.046529632061719894, |
|
"kl": 0.0368988037109375, |
|
"learning_rate": 1.4285714285714287e-05, |
|
"loss": 0.0151, |
|
"reward": 1.157421875, |
|
"reward_std": 0.20364541225135327, |
|
"rewards/accuracy_reward": 0.188671875, |
|
"rewards/format_reward": 0.96875, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 671.5869140625, |
|
"epoch": 0.08623787279913762, |
|
"grad_norm": 0.037584338337183, |
|
"kl": 0.03684234619140625, |
|
"learning_rate": 1.7142857142857142e-05, |
|
"loss": 0.0213, |
|
"reward": 1.165234375, |
|
"reward_std": 0.24268896747380495, |
|
"rewards/accuracy_reward": 0.2177734375, |
|
"rewards/format_reward": 0.9474609375, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 646.013671875, |
|
"epoch": 0.1006108515989939, |
|
"grad_norm": 0.34336549043655396, |
|
"kl": 0.151519775390625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0359, |
|
"reward": 1.14091796875, |
|
"reward_std": 0.2653762998059392, |
|
"rewards/accuracy_reward": 0.2052734375, |
|
"rewards/format_reward": 0.93564453125, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 624.95908203125, |
|
"epoch": 0.11498383039885016, |
|
"grad_norm": 0.052160657942295074, |
|
"kl": 0.595263671875, |
|
"learning_rate": 1.9987329060020616e-05, |
|
"loss": 0.0668, |
|
"reward": 1.081640625, |
|
"reward_std": 0.3258050443604589, |
|
"rewards/accuracy_reward": 0.20439453125, |
|
"rewards/format_reward": 0.87724609375, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 691.025390625, |
|
"epoch": 0.12935680919870643, |
|
"grad_norm": 0.27032357454299927, |
|
"kl": 0.1677734375, |
|
"learning_rate": 1.9949348350626456e-05, |
|
"loss": 0.034, |
|
"reward": 0.9642578125, |
|
"reward_std": 0.4391048148274422, |
|
"rewards/accuracy_reward": 0.18740234375, |
|
"rewards/format_reward": 0.77685546875, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 440.34853515625, |
|
"epoch": 0.1437297879985627, |
|
"grad_norm": 0.6052369475364685, |
|
"kl": 0.800189208984375, |
|
"learning_rate": 1.9886154122075344e-05, |
|
"loss": 0.0919, |
|
"reward": 0.89814453125, |
|
"reward_std": 0.38281605690717696, |
|
"rewards/accuracy_reward": 0.11865234375, |
|
"rewards/format_reward": 0.7794921875, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 487.60771484375, |
|
"epoch": 0.15810276679841898, |
|
"grad_norm": 0.28784340620040894, |
|
"kl": 2.12225341796875, |
|
"learning_rate": 1.979790652042268e-05, |
|
"loss": 0.1039, |
|
"reward": 0.85263671875, |
|
"reward_std": 0.4635654494166374, |
|
"rewards/accuracy_reward": 0.13447265625, |
|
"rewards/format_reward": 0.7181640625, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 718.88359375, |
|
"epoch": 0.17247574559827525, |
|
"grad_norm": 0.38119208812713623, |
|
"kl": 0.38172607421875, |
|
"learning_rate": 1.9684829181681236e-05, |
|
"loss": 0.0502, |
|
"reward": 1.06494140625, |
|
"reward_std": 0.3414448471739888, |
|
"rewards/accuracy_reward": 0.21650390625, |
|
"rewards/format_reward": 0.8484375, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 621.63818359375, |
|
"epoch": 0.18684872439813152, |
|
"grad_norm": 0.3849119246006012, |
|
"kl": 1.819970703125, |
|
"learning_rate": 1.954720866508546e-05, |
|
"loss": 0.1892, |
|
"reward": 0.9689453125, |
|
"reward_std": 0.4041255243122578, |
|
"rewards/accuracy_reward": 0.16826171875, |
|
"rewards/format_reward": 0.80068359375, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.92744140625, |
|
"epoch": 0.2012217031979878, |
|
"grad_norm": 0.16367273032665253, |
|
"kl": 0.688922119140625, |
|
"learning_rate": 1.9385393726896492e-05, |
|
"loss": 0.0581, |
|
"reward": 1.1560546875, |
|
"reward_std": 0.22550129257142543, |
|
"rewards/accuracy_reward": 0.19248046875, |
|
"rewards/format_reward": 0.96357421875, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 563.1029296875, |
|
"epoch": 0.21559468199784404, |
|
"grad_norm": 0.1713869571685791, |
|
"kl": 0.0900238037109375, |
|
"learning_rate": 1.9199794436588244e-05, |
|
"loss": 0.0071, |
|
"reward": 1.1892578125, |
|
"reward_std": 0.2032089052721858, |
|
"rewards/accuracy_reward": 0.21513671875, |
|
"rewards/format_reward": 0.97412109375, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 622.5634765625, |
|
"epoch": 0.2299676607977003, |
|
"grad_norm": 0.2464917004108429, |
|
"kl": 0.144158935546875, |
|
"learning_rate": 1.899088113765426e-05, |
|
"loss": 0.0189, |
|
"reward": 1.1546875, |
|
"reward_std": 0.2610320156440139, |
|
"rewards/accuracy_reward": 0.21083984375, |
|
"rewards/format_reward": 0.94384765625, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 652.05927734375, |
|
"epoch": 0.24434063959755659, |
|
"grad_norm": 0.2248377948999405, |
|
"kl": 0.716436767578125, |
|
"learning_rate": 1.875918325566888e-05, |
|
"loss": 0.0578, |
|
"reward": 1.06005859375, |
|
"reward_std": 0.33321408815681935, |
|
"rewards/accuracy_reward": 0.171484375, |
|
"rewards/format_reward": 0.88857421875, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 669.3861328125, |
|
"epoch": 0.25871361839741286, |
|
"grad_norm": 0.27829509973526, |
|
"kl": 0.617529296875, |
|
"learning_rate": 1.8505287956623298e-05, |
|
"loss": 0.0585, |
|
"reward": 1.14755859375, |
|
"reward_std": 0.2751380069181323, |
|
"rewards/accuracy_reward": 0.20859375, |
|
"rewards/format_reward": 0.93896484375, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 662.1236328125, |
|
"epoch": 0.27308659719726913, |
|
"grad_norm": 0.2939702868461609, |
|
"kl": 0.5397705078125, |
|
"learning_rate": 1.8229838658936566e-05, |
|
"loss": 0.0555, |
|
"reward": 1.137890625, |
|
"reward_std": 0.2469838338904083, |
|
"rewards/accuracy_reward": 0.1900390625, |
|
"rewards/format_reward": 0.9478515625, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2874595759971254, |
|
"grad_norm": 0.1728806495666504, |
|
"learning_rate": 1.7933533402912354e-05, |
|
"loss": 0.103, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2874595759971254, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 611.2384828951579, |
|
"eval_kl": 0.50033329778157, |
|
"eval_loss": 0.06100574508309364, |
|
"eval_reward": 1.1420381825938566, |
|
"eval_reward_std": 0.27033696519433437, |
|
"eval_rewards/accuracy_reward": 0.2020051194539249, |
|
"eval_rewards/format_reward": 0.9400330631399317, |
|
"eval_runtime": 16336.0108, |
|
"eval_samples_per_second": 0.287, |
|
"eval_steps_per_second": 0.002, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 629.018017578125, |
|
"epoch": 0.3018325547969817, |
|
"grad_norm": 0.1207083985209465, |
|
"kl": 1.06016845703125, |
|
"learning_rate": 1.761712308177359e-05, |
|
"loss": 0.1074, |
|
"reward": 1.059326171875, |
|
"reward_std": 0.35213989242911337, |
|
"rewards/accuracy_reward": 0.18974609375, |
|
"rewards/format_reward": 0.869580078125, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 622.68330078125, |
|
"epoch": 0.31620553359683795, |
|
"grad_norm": 0.12369602918624878, |
|
"kl": 2.13466796875, |
|
"learning_rate": 1.7281409538757886e-05, |
|
"loss": 0.1546, |
|
"reward": 1.06484375, |
|
"reward_std": 0.3502559883520007, |
|
"rewards/accuracy_reward": 0.1806640625, |
|
"rewards/format_reward": 0.8841796875, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 619.0666015625, |
|
"epoch": 0.3305785123966942, |
|
"grad_norm": 0.13101035356521606, |
|
"kl": 0.932763671875, |
|
"learning_rate": 1.6927243535095995e-05, |
|
"loss": 0.0856, |
|
"reward": 1.14521484375, |
|
"reward_std": 0.2656426582485437, |
|
"rewards/accuracy_reward": 0.20322265625, |
|
"rewards/format_reward": 0.9419921875, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 633.12802734375, |
|
"epoch": 0.3449514911965505, |
|
"grad_norm": 0.13193248212337494, |
|
"kl": 0.9656982421875, |
|
"learning_rate": 1.655552259402295e-05, |
|
"loss": 0.0881, |
|
"reward": 1.14560546875, |
|
"reward_std": 0.27462361557409165, |
|
"rewards/accuracy_reward": 0.21337890625, |
|
"rewards/format_reward": 0.9322265625, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 653.04599609375, |
|
"epoch": 0.35932446999640677, |
|
"grad_norm": 0.3534374535083771, |
|
"kl": 1.867626953125, |
|
"learning_rate": 1.6167188726285433e-05, |
|
"loss": 0.1558, |
|
"reward": 1.05126953125, |
|
"reward_std": 0.36074890177696944, |
|
"rewards/accuracy_reward": 0.18544921875, |
|
"rewards/format_reward": 0.8658203125, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.14736328125, |
|
"epoch": 0.37369744879626304, |
|
"grad_norm": 2.0081052780151367, |
|
"kl": 1.8935546875, |
|
"learning_rate": 1.5763226042909455e-05, |
|
"loss": 0.1105, |
|
"reward": 1.0998046875, |
|
"reward_std": 0.3096121703274548, |
|
"rewards/accuracy_reward": 0.18486328125, |
|
"rewards/format_reward": 0.91494140625, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 614.62197265625, |
|
"epoch": 0.3880704275961193, |
|
"grad_norm": 0.1118120476603508, |
|
"kl": 0.59337158203125, |
|
"learning_rate": 1.5344658261278013e-05, |
|
"loss": 0.031, |
|
"reward": 1.16611328125, |
|
"reward_std": 0.24496497269719839, |
|
"rewards/accuracy_reward": 0.21005859375, |
|
"rewards/format_reward": 0.9560546875, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 610.4009765625, |
|
"epoch": 0.4024434063959756, |
|
"grad_norm": 0.18786092102527618, |
|
"kl": 0.7201416015625, |
|
"learning_rate": 1.4912546110838775e-05, |
|
"loss": 0.0608, |
|
"reward": 1.1451171875, |
|
"reward_std": 0.2563774929381907, |
|
"rewards/accuracy_reward": 0.2021484375, |
|
"rewards/format_reward": 0.94296875, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 604.14072265625, |
|
"epoch": 0.41681638519583186, |
|
"grad_norm": 0.12442336976528168, |
|
"kl": 0.96689453125, |
|
"learning_rate": 1.4467984645016259e-05, |
|
"loss": 0.0834, |
|
"reward": 1.13984375, |
|
"reward_std": 0.2728093104436994, |
|
"rewards/accuracy_reward": 0.2001953125, |
|
"rewards/format_reward": 0.9396484375, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.0369140625, |
|
"epoch": 0.4311893639956881, |
|
"grad_norm": 0.17537765204906464, |
|
"kl": 0.687255859375, |
|
"learning_rate": 1.4012100466140579e-05, |
|
"loss": 0.0628, |
|
"reward": 1.12919921875, |
|
"reward_std": 0.24853361072018743, |
|
"rewards/accuracy_reward": 0.17646484375, |
|
"rewards/format_reward": 0.952734375, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 609.915234375, |
|
"epoch": 0.44556234279554435, |
|
"grad_norm": 0.11783521622419357, |
|
"kl": 0.83641357421875, |
|
"learning_rate": 1.3546048870425356e-05, |
|
"loss": 0.0734, |
|
"reward": 1.12666015625, |
|
"reward_std": 0.264958731085062, |
|
"rewards/accuracy_reward": 0.18427734375, |
|
"rewards/format_reward": 0.9423828125, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 614.70517578125, |
|
"epoch": 0.4599353215954006, |
|
"grad_norm": 0.13742466270923615, |
|
"kl": 0.7468505859375, |
|
"learning_rate": 1.3071010920229909e-05, |
|
"loss": 0.0682, |
|
"reward": 1.122265625, |
|
"reward_std": 0.2766525615006685, |
|
"rewards/accuracy_reward": 0.18798828125, |
|
"rewards/format_reward": 0.93427734375, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 625.36552734375, |
|
"epoch": 0.4743083003952569, |
|
"grad_norm": 0.4238876700401306, |
|
"kl": 1.381640625, |
|
"learning_rate": 1.2588190451025209e-05, |
|
"loss": 0.1039, |
|
"reward": 1.13544921875, |
|
"reward_std": 0.31343956142663953, |
|
"rewards/accuracy_reward": 0.2201171875, |
|
"rewards/format_reward": 0.91533203125, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 672.3779296875, |
|
"epoch": 0.48868127919511317, |
|
"grad_norm": 0.13015827536582947, |
|
"kl": 1.4199462890625, |
|
"learning_rate": 1.2098811020648475e-05, |
|
"loss": 0.0989, |
|
"reward": 1.11416015625, |
|
"reward_std": 0.3195471292361617, |
|
"rewards/accuracy_reward": 0.208203125, |
|
"rewards/format_reward": 0.90595703125, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 631.84326171875, |
|
"epoch": 0.5030542579949695, |
|
"grad_norm": 0.2257327437400818, |
|
"kl": 1.1652099609375, |
|
"learning_rate": 1.1604112808577603e-05, |
|
"loss": 0.101, |
|
"reward": 1.1236328125, |
|
"reward_std": 0.30357036273926497, |
|
"rewards/accuracy_reward": 0.211328125, |
|
"rewards/format_reward": 0.9123046875, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 622.3265625, |
|
"epoch": 0.5174272367948257, |
|
"grad_norm": 0.11806362867355347, |
|
"kl": 0.7406005859375, |
|
"learning_rate": 1.11053494730832e-05, |
|
"loss": 0.0699, |
|
"reward": 1.1373046875, |
|
"reward_std": 0.25564199751242994, |
|
"rewards/accuracy_reward": 0.19658203125, |
|
"rewards/format_reward": 0.94072265625, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 652.4876953125, |
|
"epoch": 0.531800215594682, |
|
"grad_norm": 0.12807710468769073, |
|
"kl": 0.58621826171875, |
|
"learning_rate": 1.0603784974222862e-05, |
|
"loss": 0.0587, |
|
"reward": 1.173046875, |
|
"reward_std": 0.26026681158691645, |
|
"rewards/accuracy_reward": 0.2248046875, |
|
"rewards/format_reward": 0.9482421875, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.433203125, |
|
"epoch": 0.5461731943945383, |
|
"grad_norm": 0.10217402130365372, |
|
"kl": 0.9344970703125, |
|
"learning_rate": 1.0100690370728756e-05, |
|
"loss": 0.0809, |
|
"reward": 1.1609375, |
|
"reward_std": 0.2667428271844983, |
|
"rewards/accuracy_reward": 0.2150390625, |
|
"rewards/format_reward": 0.9458984375, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 617.68701171875, |
|
"epoch": 0.5605461731943946, |
|
"grad_norm": 0.13498954474925995, |
|
"kl": 0.67510986328125, |
|
"learning_rate": 9.597340598905851e-06, |
|
"loss": 0.0603, |
|
"reward": 1.1654296875, |
|
"reward_std": 0.25683426298201084, |
|
"rewards/accuracy_reward": 0.21796875, |
|
"rewards/format_reward": 0.9474609375, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5749191519942508, |
|
"grad_norm": 0.1882268339395523, |
|
"learning_rate": 9.095011241703623e-06, |
|
"loss": 0.0719, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5749191519942508, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 658.4643835907503, |
|
"eval_kl": 0.7806433980375427, |
|
"eval_loss": 0.06092459335923195, |
|
"eval_reward": 1.149637372013652, |
|
"eval_reward_std": 0.27955490747409467, |
|
"eval_rewards/accuracy_reward": 0.2150170648464164, |
|
"eval_rewards/format_reward": 0.9346203071672355, |
|
"eval_runtime": 16414.3395, |
|
"eval_samples_per_second": 0.286, |
|
"eval_steps_per_second": 0.002, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 652.56220703125, |
|
"epoch": 0.589292130794107, |
|
"grad_norm": 0.1547040194272995, |
|
"kl": 0.93699951171875, |
|
"learning_rate": 8.594975296149076e-06, |
|
"loss": 0.0647, |
|
"reward": 1.1623046875, |
|
"reward_std": 0.28741056518629193, |
|
"rewards/accuracy_reward": 0.23125, |
|
"rewards/format_reward": 0.9310546875, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 674.5626953125, |
|
"epoch": 0.6036651095939634, |
|
"grad_norm": 0.25151509046554565, |
|
"kl": 0.9999267578125, |
|
"learning_rate": 8.098499947332935e-06, |
|
"loss": 0.0775, |
|
"reward": 1.1466796875, |
|
"reward_std": 0.30369703844189644, |
|
"rewards/accuracy_reward": 0.22509765625, |
|
"rewards/format_reward": 0.92158203125, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 676.977734375, |
|
"epoch": 0.6180380883938196, |
|
"grad_norm": 0.20043928921222687, |
|
"kl": 0.7748779296875, |
|
"learning_rate": 7.606843357124426e-06, |
|
"loss": 0.0573, |
|
"reward": 1.15302734375, |
|
"reward_std": 0.28829708844423296, |
|
"rewards/accuracy_reward": 0.2244140625, |
|
"rewards/format_reward": 0.92861328125, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 655.955078125, |
|
"epoch": 0.6324110671936759, |
|
"grad_norm": 0.12682239711284637, |
|
"kl": 0.7095947265625, |
|
"learning_rate": 7.12125147575254e-06, |
|
"loss": 0.0548, |
|
"reward": 1.1763671875, |
|
"reward_std": 0.25821941047906877, |
|
"rewards/accuracy_reward": 0.23046875, |
|
"rewards/format_reward": 0.9458984375, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 647.839453125, |
|
"epoch": 0.6467840459935321, |
|
"grad_norm": 0.13890360295772552, |
|
"kl": 0.63245849609375, |
|
"learning_rate": 6.6429548843339554e-06, |
|
"loss": 0.0502, |
|
"reward": 1.1654296875, |
|
"reward_std": 0.2512395134195685, |
|
"rewards/accuracy_reward": 0.21337890625, |
|
"rewards/format_reward": 0.95205078125, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 632.53623046875, |
|
"epoch": 0.6611570247933884, |
|
"grad_norm": 0.15598197281360626, |
|
"kl": 0.87559814453125, |
|
"learning_rate": 6.173165676349103e-06, |
|
"loss": 0.0703, |
|
"reward": 1.155078125, |
|
"reward_std": 0.2729664742946625, |
|
"rewards/accuracy_reward": 0.213671875, |
|
"rewards/format_reward": 0.94140625, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 641.5361328125, |
|
"epoch": 0.6755300035932447, |
|
"grad_norm": 0.15446113049983978, |
|
"kl": 0.77437744140625, |
|
"learning_rate": 5.713074385969457e-06, |
|
"loss": 0.0688, |
|
"reward": 1.16953125, |
|
"reward_std": 0.28331395238637924, |
|
"rewards/accuracy_reward": 0.2296875, |
|
"rewards/format_reward": 0.93984375, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 647.3654296875, |
|
"epoch": 0.689902982393101, |
|
"grad_norm": 0.2089157998561859, |
|
"kl": 1.21328125, |
|
"learning_rate": 5.263846971020108e-06, |
|
"loss": 0.1016, |
|
"reward": 1.116796875, |
|
"reward_std": 0.31174491699784995, |
|
"rewards/accuracy_reward": 0.2029296875, |
|
"rewards/format_reward": 0.9138671875, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 645.523828125, |
|
"epoch": 0.7042759611929572, |
|
"grad_norm": 0.16784484684467316, |
|
"kl": 0.791552734375, |
|
"learning_rate": 4.826621858223431e-06, |
|
"loss": 0.0734, |
|
"reward": 1.143359375, |
|
"reward_std": 0.28859285488724706, |
|
"rewards/accuracy_reward": 0.2154296875, |
|
"rewards/format_reward": 0.9279296875, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 619.52958984375, |
|
"epoch": 0.7186489399928135, |
|
"grad_norm": 0.1753949671983719, |
|
"kl": 0.98125, |
|
"learning_rate": 4.40250705821178e-06, |
|
"loss": 0.0812, |
|
"reward": 1.1546875, |
|
"reward_std": 0.2736880548298359, |
|
"rewards/accuracy_reward": 0.2154296875, |
|
"rewards/format_reward": 0.9392578125, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 635.1673828125, |
|
"epoch": 0.7330219187926698, |
|
"grad_norm": 0.20336733758449554, |
|
"kl": 0.55863037109375, |
|
"learning_rate": 3.99257735762021e-06, |
|
"loss": 0.0458, |
|
"reward": 1.17392578125, |
|
"reward_std": 0.23981231823563576, |
|
"rewards/accuracy_reward": 0.21728515625, |
|
"rewards/format_reward": 0.956640625, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.9087890625, |
|
"epoch": 0.7473948975925261, |
|
"grad_norm": 0.16080701351165771, |
|
"kl": 0.696923828125, |
|
"learning_rate": 3.5978715953751207e-06, |
|
"loss": 0.0567, |
|
"reward": 1.1685546875, |
|
"reward_std": 0.24907034020870925, |
|
"rewards/accuracy_reward": 0.21376953125, |
|
"rewards/format_reward": 0.95478515625, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 623.18583984375, |
|
"epoch": 0.7617678763923823, |
|
"grad_norm": 0.18338614702224731, |
|
"kl": 1.0648681640625, |
|
"learning_rate": 3.2193900300810908e-06, |
|
"loss": 0.0778, |
|
"reward": 1.151953125, |
|
"reward_std": 0.26931764371693134, |
|
"rewards/accuracy_reward": 0.210546875, |
|
"rewards/format_reward": 0.94140625, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 634.9572265625, |
|
"epoch": 0.7761408551922386, |
|
"grad_norm": 0.13022945821285248, |
|
"kl": 0.7796142578125, |
|
"learning_rate": 2.8580918051775542e-06, |
|
"loss": 0.065, |
|
"reward": 1.165625, |
|
"reward_std": 0.27459610607475043, |
|
"rewards/accuracy_reward": 0.2244140625, |
|
"rewards/format_reward": 0.9412109375, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 633.88232421875, |
|
"epoch": 0.7905138339920948, |
|
"grad_norm": 0.1719103306531906, |
|
"kl": 0.8088623046875, |
|
"learning_rate": 2.514892518288988e-06, |
|
"loss": 0.0696, |
|
"reward": 1.15087890625, |
|
"reward_std": 0.2822716049849987, |
|
"rewards/accuracy_reward": 0.21640625, |
|
"rewards/format_reward": 0.93447265625, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 636.853125, |
|
"epoch": 0.8048868127919512, |
|
"grad_norm": 0.21337589621543884, |
|
"kl": 0.9040283203125, |
|
"learning_rate": 2.190661900928426e-06, |
|
"loss": 0.0753, |
|
"reward": 1.1412109375, |
|
"reward_std": 0.2784146698191762, |
|
"rewards/accuracy_reward": 0.2029296875, |
|
"rewards/format_reward": 0.93828125, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 639.5638671875, |
|
"epoch": 0.8192597915918074, |
|
"grad_norm": 0.1362425535917282, |
|
"kl": 0.95645751953125, |
|
"learning_rate": 1.8862216144342692e-06, |
|
"loss": 0.0749, |
|
"reward": 1.14130859375, |
|
"reward_std": 0.2679125562310219, |
|
"rewards/accuracy_reward": 0.20546875, |
|
"rewards/format_reward": 0.93583984375, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 639.32958984375, |
|
"epoch": 0.8336327703916637, |
|
"grad_norm": 0.13494881987571716, |
|
"kl": 0.8051513671875, |
|
"learning_rate": 1.6023431677260215e-06, |
|
"loss": 0.0684, |
|
"reward": 1.16240234375, |
|
"reward_std": 0.26225354727357625, |
|
"rewards/accuracy_reward": 0.21396484375, |
|
"rewards/format_reward": 0.9484375, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 636.16083984375, |
|
"epoch": 0.8480057491915199, |
|
"grad_norm": 0.16026277840137482, |
|
"kl": 0.8697265625, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 0.0712, |
|
"reward": 1.15966796875, |
|
"reward_std": 0.2733839010819793, |
|
"rewards/accuracy_reward": 0.21552734375, |
|
"rewards/format_reward": 0.944140625, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8623787279913762, |
|
"grad_norm": 0.155064195394516, |
|
"learning_rate": 1.099095468409156e-06, |
|
"loss": 0.0785, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8623787279913762, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 628.6954391531569, |
|
"eval_kl": 0.8880319432593856, |
|
"eval_loss": 0.07323075085878372, |
|
"eval_reward": 1.1617160836177474, |
|
"eval_reward_std": 0.2670084892838888, |
|
"eval_rewards/accuracy_reward": 0.21819005972696245, |
|
"eval_rewards/format_reward": 0.943526023890785, |
|
"eval_runtime": 16336.911, |
|
"eval_samples_per_second": 0.287, |
|
"eval_steps_per_second": 0.002, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 636.594287109375, |
|
"epoch": 0.8767517067912325, |
|
"grad_norm": 0.1458193063735962, |
|
"kl": 0.95950927734375, |
|
"learning_rate": 8.810015400790994e-07, |
|
"loss": 0.0809, |
|
"reward": 1.16162109375, |
|
"reward_std": 0.26864673662930727, |
|
"rewards/accuracy_reward": 0.2203125, |
|
"rewards/format_reward": 0.94130859375, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 642.09326171875, |
|
"epoch": 0.8911246855910887, |
|
"grad_norm": 0.14581456780433655, |
|
"kl": 0.82933349609375, |
|
"learning_rate": 6.860168681805946e-07, |
|
"loss": 0.0661, |
|
"reward": 1.16982421875, |
|
"reward_std": 0.26240854635834693, |
|
"rewards/accuracy_reward": 0.2216796875, |
|
"rewards/format_reward": 0.94814453125, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 642.76943359375, |
|
"epoch": 0.905497664390945, |
|
"grad_norm": 0.16072359681129456, |
|
"kl": 0.80216064453125, |
|
"learning_rate": 5.146355805285452e-07, |
|
"loss": 0.0637, |
|
"reward": 1.17431640625, |
|
"reward_std": 0.2672739554196596, |
|
"rewards/accuracy_reward": 0.22734375, |
|
"rewards/format_reward": 0.94697265625, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 644.221484375, |
|
"epoch": 0.9198706431908013, |
|
"grad_norm": 0.1719951331615448, |
|
"kl": 0.84737548828125, |
|
"learning_rate": 3.6729198952483725e-07, |
|
"loss": 0.0748, |
|
"reward": 1.158203125, |
|
"reward_std": 0.2642348381690681, |
|
"rewards/accuracy_reward": 0.2169921875, |
|
"rewards/format_reward": 0.9412109375, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 649.33896484375, |
|
"epoch": 0.9342436219906576, |
|
"grad_norm": 0.20107921957969666, |
|
"kl": 0.87275390625, |
|
"learning_rate": 2.4435949152906144e-07, |
|
"loss": 0.0757, |
|
"reward": 1.15966796875, |
|
"reward_std": 0.27580115627497437, |
|
"rewards/accuracy_reward": 0.22021484375, |
|
"rewards/format_reward": 0.939453125, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 643.75712890625, |
|
"epoch": 0.9486166007905138, |
|
"grad_norm": 0.14510348439216614, |
|
"kl": 0.821826171875, |
|
"learning_rate": 1.4614962060194303e-07, |
|
"loss": 0.0658, |
|
"reward": 1.140625, |
|
"reward_std": 0.2549537133425474, |
|
"rewards/accuracy_reward": 0.1978515625, |
|
"rewards/format_reward": 0.9427734375, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 643.530078125, |
|
"epoch": 0.9629895795903701, |
|
"grad_norm": 0.14030759036540985, |
|
"kl": 0.77998046875, |
|
"learning_rate": 7.291125901946027e-08, |
|
"loss": 0.0701, |
|
"reward": 1.1693359375, |
|
"reward_std": 0.2593334957957268, |
|
"rewards/accuracy_reward": 0.22529296875, |
|
"rewards/format_reward": 0.94404296875, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 646.19736328125, |
|
"epoch": 0.9773625583902263, |
|
"grad_norm": 0.16929689049720764, |
|
"kl": 0.835546875, |
|
"learning_rate": 2.4830006558373975e-08, |
|
"loss": 0.0697, |
|
"reward": 1.162109375, |
|
"reward_std": 0.26842295806854966, |
|
"rewards/accuracy_reward": 0.2173828125, |
|
"rewards/format_reward": 0.9447265625, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 642.66806640625, |
|
"epoch": 0.9917355371900827, |
|
"grad_norm": 0.17319317162036896, |
|
"kl": 0.85125732421875, |
|
"learning_rate": 2.0277101514987184e-09, |
|
"loss": 0.0724, |
|
"reward": 1.15966796875, |
|
"reward_std": 0.2777851399034262, |
|
"rewards/accuracy_reward": 0.21982421875, |
|
"rewards/format_reward": 0.93984375, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 643.1118812561035, |
|
"epoch": 0.9974847287100251, |
|
"kl": 0.8223876953125, |
|
"reward": 1.182861328125, |
|
"reward_std": 0.2758036791346967, |
|
"rewards/accuracy_reward": 0.25390625, |
|
"rewards/format_reward": 0.928955078125, |
|
"step": 347, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0704507840104852, |
|
"train_runtime": 435678.5475, |
|
"train_samples_per_second": 0.204, |
|
"train_steps_per_second": 0.001 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 347, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|