Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
AaronHuangWei's picture
Model save
0ab1069 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9991717740599636,
"eval_steps": 100,
"global_step": 754,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1843.4614156087239,
"epoch": 0.003975484512174921,
"grad_norm": 0.08699692785739899,
"kl": 5.9882799784342446e-05,
"learning_rate": 7.894736842105263e-07,
"loss": 0.0,
"reward": 0.20442708857202282,
"reward_std": 0.18642982677556574,
"rewards/accuracy_reward": 0.1879340319816644,
"rewards/format_reward": 0.0164930559694767,
"step": 3
},
{
"completion_length": 1872.7357126871746,
"epoch": 0.007950969024349842,
"grad_norm": 0.08034715801477432,
"kl": 0.00017563501993815103,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.0,
"reward": 0.16623264349376163,
"reward_std": 0.166806096288686,
"rewards/accuracy_reward": 0.15190972640023878,
"rewards/format_reward": 0.014322916977107525,
"step": 6
},
{
"completion_length": 1840.263064066569,
"epoch": 0.011926453536524764,
"grad_norm": 0.07688009738922119,
"kl": 0.00018552939097086588,
"learning_rate": 2.368421052631579e-06,
"loss": 0.0,
"reward": 0.22352431349766752,
"reward_std": 0.17657933492834368,
"rewards/accuracy_reward": 0.20355903388311467,
"rewards/format_reward": 0.019965278489204746,
"step": 9
},
{
"completion_length": 1760.6285069783528,
"epoch": 0.015901938048699684,
"grad_norm": 0.1042766273021698,
"kl": 0.00028959910074869793,
"learning_rate": 3.157894736842105e-06,
"loss": 0.0,
"reward": 0.2647569526452571,
"reward_std": 0.219600356494387,
"rewards/accuracy_reward": 0.2326388960548987,
"rewards/format_reward": 0.032118056512748204,
"step": 12
},
{
"completion_length": 1722.3082021077473,
"epoch": 0.019877422560874606,
"grad_norm": 0.3040783703327179,
"kl": 0.0018845796585083008,
"learning_rate": 3.947368421052632e-06,
"loss": 0.0001,
"reward": 0.3059895912495752,
"reward_std": 0.23623824515379965,
"rewards/accuracy_reward": 0.23784722954345247,
"rewards/format_reward": 0.06814236252103001,
"step": 15
},
{
"completion_length": 845.952714920044,
"epoch": 0.023852907073049528,
"grad_norm": 1.3506791591644287,
"kl": 0.3431205749511719,
"learning_rate": 4.736842105263158e-06,
"loss": 0.0137,
"reward": 0.6740451576188207,
"reward_std": 0.2776922438448916,
"rewards/accuracy_reward": 0.1336805592291057,
"rewards/format_reward": 0.5403645950524757,
"step": 18
},
{
"completion_length": 66.15755401055019,
"epoch": 0.02782839158522445,
"grad_norm": 1.3607665300369263,
"kl": 0.6993815104166666,
"learning_rate": 5.526315789473685e-06,
"loss": 0.028,
"reward": 0.9774305758376917,
"reward_std": 0.09281354808869462,
"rewards/accuracy_reward": 0.018663194845430553,
"rewards/format_reward": 0.9587673805654049,
"step": 21
},
{
"completion_length": 118.68446455399196,
"epoch": 0.03180387609739937,
"grad_norm": 0.8041670322418213,
"kl": 0.53955078125,
"learning_rate": 6.31578947368421e-06,
"loss": 0.0216,
"reward": 0.9665798830489317,
"reward_std": 0.1714695317981144,
"rewards/accuracy_reward": 0.04600694558272759,
"rewards/format_reward": 0.9205729328095913,
"step": 24
},
{
"completion_length": 88.47830098867416,
"epoch": 0.03577936060957429,
"grad_norm": 0.8617585301399231,
"kl": 0.59326171875,
"learning_rate": 7.1052631578947375e-06,
"loss": 0.0237,
"reward": 1.0386284987131755,
"reward_std": 0.2134858975186944,
"rewards/accuracy_reward": 0.09592014209677775,
"rewards/format_reward": 0.9427083507180214,
"step": 27
},
{
"completion_length": 30.138455788294475,
"epoch": 0.03975484512174921,
"grad_norm": 2.2023513317108154,
"kl": 0.8590494791666666,
"learning_rate": 7.894736842105265e-06,
"loss": 0.0344,
"reward": 1.1623264302810032,
"reward_std": 0.16990292662133774,
"rewards/accuracy_reward": 0.17361111589707434,
"rewards/format_reward": 0.9887152872979641,
"step": 30
},
{
"completion_length": 35.736980040868126,
"epoch": 0.043730329633924134,
"grad_norm": 1.1060832738876343,
"kl": 0.8148600260416666,
"learning_rate": 8.68421052631579e-06,
"loss": 0.0326,
"reward": 1.1657986442248027,
"reward_std": 0.17117769015021622,
"rewards/accuracy_reward": 0.18793403407714018,
"rewards/format_reward": 0.9778645994762579,
"step": 33
},
{
"completion_length": 35.928820510705314,
"epoch": 0.047705814146099056,
"grad_norm": 1.2442351579666138,
"kl": 0.847412109375,
"learning_rate": 9.473684210526315e-06,
"loss": 0.0339,
"reward": 1.2052951740721862,
"reward_std": 0.16379862558096647,
"rewards/accuracy_reward": 0.22265625485063842,
"rewards/format_reward": 0.9826389029622078,
"step": 36
},
{
"completion_length": 70.93489801883698,
"epoch": 0.05168129865827398,
"grad_norm": 1.4330227375030518,
"kl": 0.8037923177083334,
"learning_rate": 1.0263157894736844e-05,
"loss": 0.0321,
"reward": 1.1744792064030964,
"reward_std": 0.22849255722636977,
"rewards/accuracy_reward": 0.22222222892257074,
"rewards/format_reward": 0.9522569614152113,
"step": 39
},
{
"completion_length": 81.15885670979817,
"epoch": 0.0556567831704489,
"grad_norm": 1.0737708806991577,
"kl": 0.7923177083333334,
"learning_rate": 1.105263157894737e-05,
"loss": 0.0317,
"reward": 1.14930559694767,
"reward_std": 0.22925202331195274,
"rewards/accuracy_reward": 0.19184028345625848,
"rewards/format_reward": 0.9574652947485447,
"step": 42
},
{
"completion_length": 69.16927303870519,
"epoch": 0.05963226768262382,
"grad_norm": 0.8058044910430908,
"kl": 0.8806966145833334,
"learning_rate": 1.1842105263157895e-05,
"loss": 0.0352,
"reward": 1.1675347524384658,
"reward_std": 0.1868902291947355,
"rewards/accuracy_reward": 0.19618056225590408,
"rewards/format_reward": 0.9713541852931181,
"step": 45
},
{
"completion_length": 57.069880266984306,
"epoch": 0.06360775219479874,
"grad_norm": 2.353023052215576,
"kl": 0.91162109375,
"learning_rate": 1.263157894736842e-05,
"loss": 0.0365,
"reward": 1.189236156642437,
"reward_std": 0.18126761401072145,
"rewards/accuracy_reward": 0.20095486647915095,
"rewards/format_reward": 0.9882812586923441,
"step": 48
},
{
"completion_length": 21.48567771911621,
"epoch": 0.06758323670697367,
"grad_norm": 0.964798629283905,
"kl": 0.8650716145833334,
"learning_rate": 1.3421052631578948e-05,
"loss": 0.0346,
"reward": 1.2161458743115265,
"reward_std": 0.1899353281284372,
"rewards/accuracy_reward": 0.21961806206187853,
"rewards/format_reward": 0.996527781089147,
"step": 51
},
{
"completion_length": 53.81597367922465,
"epoch": 0.07155872121914858,
"grad_norm": 1.0664324760437012,
"kl": 0.8111572265625,
"learning_rate": 1.4210526315789475e-05,
"loss": 0.0324,
"reward": 1.162326426555713,
"reward_std": 0.23168744108018777,
"rewards/accuracy_reward": 0.20008681147980192,
"rewards/format_reward": 0.9622395982344946,
"step": 54
},
{
"completion_length": 208.98611625035605,
"epoch": 0.07553420573132351,
"grad_norm": 0.8128153085708618,
"kl": 0.6840006510416666,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0274,
"reward": 1.1306424004336197,
"reward_std": 0.35048759169876575,
"rewards/accuracy_reward": 0.22743056244992962,
"rewards/format_reward": 0.9032118245959282,
"step": 57
},
{
"completion_length": 260.03820244471234,
"epoch": 0.07950969024349842,
"grad_norm": 0.5030148029327393,
"kl": 0.599609375,
"learning_rate": 1.578947368421053e-05,
"loss": 0.024,
"reward": 1.168402807166179,
"reward_std": 0.28692516156782705,
"rewards/accuracy_reward": 0.21223958927051476,
"rewards/format_reward": 0.9561632163822651,
"step": 60
},
{
"completion_length": 191.35243598620096,
"epoch": 0.08348517475567335,
"grad_norm": 0.5893499851226807,
"kl": 0.6812337239583334,
"learning_rate": 1.6578947368421053e-05,
"loss": 0.0273,
"reward": 1.1432291989525158,
"reward_std": 0.21883391573404273,
"rewards/accuracy_reward": 0.17057292070239782,
"rewards/format_reward": 0.9726562661429247,
"step": 63
},
{
"completion_length": 304.99697029590607,
"epoch": 0.08746065926784827,
"grad_norm": 0.7647993564605713,
"kl": 0.6839599609375,
"learning_rate": 1.736842105263158e-05,
"loss": 0.0274,
"reward": 1.1657986467083294,
"reward_std": 0.2664798041029523,
"rewards/accuracy_reward": 0.210069450433366,
"rewards/format_reward": 0.9557291840513548,
"step": 66
},
{
"completion_length": 84.33941195408504,
"epoch": 0.0914361437800232,
"grad_norm": 1.6230992078781128,
"kl": 0.8506673177083334,
"learning_rate": 1.8157894736842107e-05,
"loss": 0.034,
"reward": 1.1362847598890464,
"reward_std": 0.21719592344015837,
"rewards/accuracy_reward": 0.17838542186655104,
"rewards/format_reward": 0.9578993221124014,
"step": 69
},
{
"completion_length": 47.16927206516266,
"epoch": 0.09541162829219811,
"grad_norm": 0.5694164633750916,
"kl": 0.8841145833333334,
"learning_rate": 1.894736842105263e-05,
"loss": 0.0354,
"reward": 1.1827257337669532,
"reward_std": 0.18139228488629064,
"rewards/accuracy_reward": 0.19487847795244306,
"rewards/format_reward": 0.9878472325702509,
"step": 72
},
{
"completion_length": 65.3068592151006,
"epoch": 0.09938711280437303,
"grad_norm": 0.8147668838500977,
"kl": 0.8631184895833334,
"learning_rate": 1.9736842105263158e-05,
"loss": 0.0345,
"reward": 1.2265625409781933,
"reward_std": 0.22970331188601753,
"rewards/accuracy_reward": 0.24826389558923742,
"rewards/format_reward": 0.9782986268401146,
"step": 75
},
{
"completion_length": 101.73307637373607,
"epoch": 0.10336259731654796,
"grad_norm": 3.9387757778167725,
"kl": 0.9171549479166666,
"learning_rate": 1.9999570594853575e-05,
"loss": 0.0367,
"reward": 1.1497396255532901,
"reward_std": 0.26099368068389595,
"rewards/accuracy_reward": 0.20920139628772935,
"rewards/format_reward": 0.9405382163822651,
"step": 78
},
{
"completion_length": 29.979601462682087,
"epoch": 0.10733808182872287,
"grad_norm": 5.135621547698975,
"kl": 1.5565592447916667,
"learning_rate": 1.9997316318671806e-05,
"loss": 0.0622,
"reward": 1.2044271193444729,
"reward_std": 0.1869426581542939,
"rewards/accuracy_reward": 0.23090278574575981,
"rewards/format_reward": 0.9735243221124014,
"step": 81
},
{
"completion_length": 22.930122137069702,
"epoch": 0.1113135663408978,
"grad_norm": 28.183279037475586,
"kl": 1.3277994791666667,
"learning_rate": 1.999313025518698e-05,
"loss": 0.0531,
"reward": 1.1892361504336197,
"reward_std": 0.20981760757664839,
"rewards/accuracy_reward": 0.20876736678959182,
"rewards/format_reward": 0.9804687661429247,
"step": 84
},
{
"completion_length": 46.38932470480601,
"epoch": 0.11528905085307271,
"grad_norm": 1.771388053894043,
"kl": 1.13525390625,
"learning_rate": 1.9987013213274594e-05,
"loss": 0.0454,
"reward": 1.2356771156191826,
"reward_std": 0.18450136513759693,
"rewards/accuracy_reward": 0.25737847938823205,
"rewards/format_reward": 0.9782986268401146,
"step": 87
},
{
"completion_length": 133.1475731531779,
"epoch": 0.11926453536524764,
"grad_norm": 0.963828444480896,
"kl": 1.10107421875,
"learning_rate": 1.9978966374934255e-05,
"loss": 0.0441,
"reward": 1.1979166989525158,
"reward_std": 0.22738417129342756,
"rewards/accuracy_reward": 0.2348090335726738,
"rewards/format_reward": 0.9631076604127884,
"step": 90
},
{
"completion_length": 95.22396143277486,
"epoch": 0.12324001987742256,
"grad_norm": 2.4775350093841553,
"kl": 1.29541015625,
"learning_rate": 1.996899129506126e-05,
"loss": 0.0519,
"reward": 1.1801215683420498,
"reward_std": 0.22066468729948005,
"rewards/accuracy_reward": 0.21831597779722264,
"rewards/format_reward": 0.9618055733541647,
"step": 93
},
{
"completion_length": 141.00217274824777,
"epoch": 0.12721550438959747,
"grad_norm": 113.83024597167969,
"kl": 6.85986328125,
"learning_rate": 1.995708990114615e-05,
"loss": 0.2747,
"reward": 1.1006944725910823,
"reward_std": 0.2825309601612389,
"rewards/accuracy_reward": 0.16493056050967425,
"rewards/format_reward": 0.9357639079292616,
"step": 96
},
{
"completion_length": 166.101132551829,
"epoch": 0.13119098890177242,
"grad_norm": 10.326292037963867,
"kl": 4.069661458333333,
"learning_rate": 1.994326449290226e-05,
"loss": 0.1628,
"reward": 1.0438368394970894,
"reward_std": 0.32250430978213745,
"rewards/accuracy_reward": 0.1401909765166541,
"rewards/format_reward": 0.9036458494762579,
"step": 99
},
{
"epoch": 0.1325161504058307,
"eval_completion_length": 159.95536130788375,
"eval_kl": 3.7168367346938775,
"eval_loss": 0.16389134526252747,
"eval_reward": 1.0437925482282833,
"eval_reward_std": 0.3066673065174599,
"eval_rewards/accuracy_reward": 0.14200680786553693,
"eval_rewards/format_reward": 0.9017857349648768,
"eval_runtime": 416.6475,
"eval_samples_per_second": 0.238,
"eval_steps_per_second": 0.012,
"step": 100
},
{
"completion_length": 146.04948329925537,
"epoch": 0.13516647341394733,
"grad_norm": 29.008094787597656,
"kl": 4.4609375,
"learning_rate": 1.9927517741821343e-05,
"loss": 0.1592,
"reward": 1.0123698189854622,
"reward_std": 0.279809627099894,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/format_reward": 0.9082031473517418,
"step": 102
},
{
"completion_length": 83.48133925596873,
"epoch": 0.13914195792612225,
"grad_norm": 2.938828468322754,
"kl": 2.2904459635416665,
"learning_rate": 1.990985269065736e-05,
"loss": 0.0916,
"reward": 1.0742187835276127,
"reward_std": 0.23118331842124462,
"rewards/accuracy_reward": 0.12543403171002865,
"rewards/format_reward": 0.9487847375373045,
"step": 105
},
{
"completion_length": 54.888022780418396,
"epoch": 0.14311744243829716,
"grad_norm": 2.3945536613464355,
"kl": 2.4551595052083335,
"learning_rate": 1.989027275283852e-05,
"loss": 0.0982,
"reward": 1.1523437860111396,
"reward_std": 0.20264656166546047,
"rewards/accuracy_reward": 0.17621528167122355,
"rewards/format_reward": 0.976128488779068,
"step": 108
},
{
"completion_length": 42.01388998826345,
"epoch": 0.1470929269504721,
"grad_norm": 2.8216843605041504,
"kl": 1.4291178385416667,
"learning_rate": 1.9868781711807705e-05,
"loss": 0.0572,
"reward": 1.1814236467083294,
"reward_std": 0.18004686074952284,
"rewards/accuracy_reward": 0.1927083395033454,
"rewards/format_reward": 0.9887152885397276,
"step": 111
},
{
"completion_length": 37.57769203186035,
"epoch": 0.15106841146264702,
"grad_norm": 2.9078152179718018,
"kl": 1.4965006510416667,
"learning_rate": 1.9845383720291392e-05,
"loss": 0.0598,
"reward": 1.19531253973643,
"reward_std": 0.16706574785833558,
"rewards/accuracy_reward": 0.20486111663437137,
"rewards/format_reward": 0.990451397995154,
"step": 114
},
{
"completion_length": 46.923178335030876,
"epoch": 0.15504389597482193,
"grad_norm": 2.5346879959106445,
"kl": 1.30078125,
"learning_rate": 1.9820083299497227e-05,
"loss": 0.0521,
"reward": 1.1796875434617202,
"reward_std": 0.17389631201513112,
"rewards/accuracy_reward": 0.1901041710516438,
"rewards/format_reward": 0.9895833420256773,
"step": 117
},
{
"completion_length": 84.15060981114705,
"epoch": 0.15901938048699685,
"grad_norm": 20.683246612548828,
"kl": 2.1199544270833335,
"learning_rate": 1.9792885338240375e-05,
"loss": 0.0848,
"reward": 1.1553819701075554,
"reward_std": 0.22444627589235702,
"rewards/accuracy_reward": 0.1844618112857764,
"rewards/format_reward": 0.9709201554457346,
"step": 120
},
{
"completion_length": 107.51779842376709,
"epoch": 0.16299486499917176,
"grad_norm": 5.012475967407227,
"kl": 2.2217610677083335,
"learning_rate": 1.976379509199886e-05,
"loss": 0.0888,
"reward": 1.1514757337669532,
"reward_std": 0.262029462105905,
"rewards/accuracy_reward": 0.1970486156642437,
"rewards/format_reward": 0.9544271069268385,
"step": 123
},
{
"completion_length": 148.11632299423218,
"epoch": 0.1669703495113467,
"grad_norm": 14.528366088867188,
"kl": 2.2367350260416665,
"learning_rate": 1.9732818181898046e-05,
"loss": 0.0895,
"reward": 1.1228298942248027,
"reward_std": 0.2808268330991268,
"rewards/accuracy_reward": 0.1896701450071608,
"rewards/format_reward": 0.9331597425043583,
"step": 126
},
{
"completion_length": 84.5121552546819,
"epoch": 0.17094583402352162,
"grad_norm": 0.969579815864563,
"kl": 1.4344889322916667,
"learning_rate": 1.9699960593624462e-05,
"loss": 0.0574,
"reward": 1.15538198625048,
"reward_std": 0.21577061604087552,
"rewards/accuracy_reward": 0.18359375500585884,
"rewards/format_reward": 0.9717882089316845,
"step": 129
},
{
"completion_length": 72.3042555252711,
"epoch": 0.17492131853569653,
"grad_norm": 1.7120805978775024,
"kl": 1.3595377604166667,
"learning_rate": 1.966522867626919e-05,
"loss": 0.0544,
"reward": 1.1766493457059066,
"reward_std": 0.20347999944351614,
"rewards/accuracy_reward": 0.1983507004721711,
"rewards/format_reward": 0.9782986243565878,
"step": 132
},
{
"completion_length": 96.82161716620128,
"epoch": 0.17889680304787145,
"grad_norm": 7.904327869415283,
"kl": 1.818359375,
"learning_rate": 1.962862914110101e-05,
"loss": 0.0727,
"reward": 1.2313368394970894,
"reward_std": 0.2199981181571881,
"rewards/accuracy_reward": 0.25781250578196097,
"rewards/format_reward": 0.9735243196288744,
"step": 135
},
{
"completion_length": 177.80512762069702,
"epoch": 0.1828722875600464,
"grad_norm": 2.0114357471466064,
"kl": 1.666259765625,
"learning_rate": 1.9590169060269602e-05,
"loss": 0.0666,
"reward": 1.1423611491918564,
"reward_std": 0.26254904045102495,
"rewards/accuracy_reward": 0.2005208401630322,
"rewards/format_reward": 0.941840298473835,
"step": 138
},
{
"completion_length": 90.11675635973613,
"epoch": 0.1868477720722213,
"grad_norm": 0.599406898021698,
"kl": 0.9375813802083334,
"learning_rate": 1.9549855865438967e-05,
"loss": 0.0375,
"reward": 1.205729205161333,
"reward_std": 0.19089689617976546,
"rewards/accuracy_reward": 0.22092014454149952,
"rewards/format_reward": 0.9848090397814909,
"step": 141
},
{
"completion_length": 56.38889070351919,
"epoch": 0.19082325658439622,
"grad_norm": 17.37237548828125,
"kl": 1.100830078125,
"learning_rate": 1.9507697346351414e-05,
"loss": 0.0441,
"reward": 1.2404514253139496,
"reward_std": 0.15965971552456418,
"rewards/accuracy_reward": 0.2469618124111245,
"rewards/format_reward": 0.9934895895421505,
"step": 144
},
{
"completion_length": 117.53472594420116,
"epoch": 0.19479874109657114,
"grad_norm": 1.0283232927322388,
"kl": 0.917236328125,
"learning_rate": 1.9463701649322343e-05,
"loss": 0.0367,
"reward": 1.1766493432223797,
"reward_std": 0.22516770443568626,
"rewards/accuracy_reward": 0.2135416737291962,
"rewards/format_reward": 0.9631076554457346,
"step": 147
},
{
"completion_length": 154.88368586699167,
"epoch": 0.19877422560874605,
"grad_norm": 0.9131763577461243,
"kl": 1.0406901041666667,
"learning_rate": 1.941787727566613e-05,
"loss": 0.0416,
"reward": 1.1358507387340069,
"reward_std": 0.24575756738583246,
"rewards/accuracy_reward": 0.18706597783602774,
"rewards/format_reward": 0.9487847412625948,
"step": 150
},
{
"completion_length": 137.50000397364298,
"epoch": 0.202749710120921,
"grad_norm": 1.9522716999053955,
"kl": 3.1927897135416665,
"learning_rate": 1.9370233080053406e-05,
"loss": 0.1279,
"reward": 1.1432291989525158,
"reward_std": 0.25552801430846256,
"rewards/accuracy_reward": 0.18836806000520787,
"rewards/format_reward": 0.9548611318071684,
"step": 153
},
{
"completion_length": 340.70487225055695,
"epoch": 0.2067251946330959,
"grad_norm": 338.98333740234375,
"kl": 10.551839192708334,
"learning_rate": 1.9320778268800068e-05,
"loss": 0.4221,
"reward": 1.0894097586472828,
"reward_std": 0.32969770890971023,
"rewards/accuracy_reward": 0.2165798662075152,
"rewards/format_reward": 0.8728298805654049,
"step": 156
},
{
"completion_length": 1684.335110982259,
"epoch": 0.21070067914527082,
"grad_norm": 25.311864852905273,
"kl": 25.832682291666668,
"learning_rate": 1.926952239808833e-05,
"loss": 1.0325,
"reward": 0.6332465472320715,
"reward_std": 0.556972432260712,
"rewards/accuracy_reward": 0.2395833390376841,
"rewards/format_reward": 0.39366320706903934,
"step": 159
},
{
"completion_length": 1924.016092936198,
"epoch": 0.21467616365744574,
"grad_norm": 0.9118285179138184,
"kl": 2.5398763020833335,
"learning_rate": 1.9216475372120198e-05,
"loss": 0.1016,
"reward": 0.5694444632778565,
"reward_std": 0.5513101244966189,
"rewards/accuracy_reward": 0.17751736612990499,
"rewards/format_reward": 0.3919270985449354,
"step": 162
},
{
"completion_length": 536.2187668085098,
"epoch": 0.21865164816962068,
"grad_norm": 0.5543506741523743,
"kl": 0.8994954427083334,
"learning_rate": 1.9161647441203648e-05,
"loss": 0.036,
"reward": 1.0902778059244156,
"reward_std": 0.2743187023637195,
"rewards/accuracy_reward": 0.179253476128603,
"rewards/format_reward": 0.9110243183871111,
"step": 165
},
{
"completion_length": 28.863716046015423,
"epoch": 0.2226271326817956,
"grad_norm": 0.5757032632827759,
"kl": 0.918701171875,
"learning_rate": 1.9105049199771963e-05,
"loss": 0.0367,
"reward": 1.2426215708255768,
"reward_std": 0.16670533292926848,
"rewards/accuracy_reward": 0.24696181248873472,
"rewards/format_reward": 0.9956597263614336,
"step": 168
},
{
"completion_length": 36.32769219080607,
"epoch": 0.2266026171939705,
"grad_norm": 0.9013729691505432,
"kl": 0.907470703125,
"learning_rate": 1.904669158433658e-05,
"loss": 0.0363,
"reward": 1.2196180919806163,
"reward_std": 0.15937398614672324,
"rewards/accuracy_reward": 0.2235243107036998,
"rewards/format_reward": 0.9960937537252903,
"step": 171
},
{
"completion_length": 118.9145000775655,
"epoch": 0.23057810170614543,
"grad_norm": 0.603873610496521,
"kl": 0.875244140625,
"learning_rate": 1.8986585871373792e-05,
"loss": 0.035,
"reward": 1.19921878973643,
"reward_std": 0.21174315828830004,
"rewards/accuracy_reward": 0.22526042260384807,
"rewards/format_reward": 0.9739583494762579,
"step": 174
},
{
"completion_length": 154.3828158378601,
"epoch": 0.23455358621832037,
"grad_norm": 0.5029460191726685,
"kl": 0.8998209635416666,
"learning_rate": 1.8924743675145815e-05,
"loss": 0.036,
"reward": 1.1315104526778061,
"reward_std": 0.291058028737704,
"rewards/accuracy_reward": 0.2100694509766375,
"rewards/format_reward": 0.9214409937461218,
"step": 177
},
{
"completion_length": 27.94401141007741,
"epoch": 0.23852907073049529,
"grad_norm": 0.7333383560180664,
"kl": 1.0400390625,
"learning_rate": 1.8861176945456542e-05,
"loss": 0.0416,
"reward": 1.1788194874922435,
"reward_std": 0.21482299477793276,
"rewards/accuracy_reward": 0.20920139516238123,
"rewards/format_reward": 0.9696180733541647,
"step": 180
},
{
"completion_length": 24.035590926806133,
"epoch": 0.2425045552426702,
"grad_norm": 0.9987295866012573,
"kl": 1.773681640625,
"learning_rate": 1.8795897965342473e-05,
"loss": 0.071,
"reward": 1.2339410148561,
"reward_std": 0.18799259358396134,
"rewards/accuracy_reward": 0.25564236876865226,
"rewards/format_reward": 0.9782986280818781,
"step": 183
},
{
"completion_length": 22.082900047302246,
"epoch": 0.24648003975484511,
"grad_norm": 1.0223162174224854,
"kl": 1.2223307291666667,
"learning_rate": 1.8728919348699285e-05,
"loss": 0.0489,
"reward": 1.1970486516753833,
"reward_std": 0.1605207941805323,
"rewards/accuracy_reward": 0.21267361818657568,
"rewards/format_reward": 0.9843750111758709,
"step": 186
},
{
"completion_length": 22.559462388356526,
"epoch": 0.25045552426702006,
"grad_norm": 0.9617392420768738,
"kl": 1.2871907552083333,
"learning_rate": 1.866025403784439e-05,
"loss": 0.0515,
"reward": 1.253038230041663,
"reward_std": 0.16818702151067555,
"rewards/accuracy_reward": 0.26866320287808776,
"rewards/format_reward": 0.9843750111758709,
"step": 189
},
{
"completion_length": 74.73828355471294,
"epoch": 0.25443100877919494,
"grad_norm": 438.25408935546875,
"kl": 6.625325520833333,
"learning_rate": 1.858991530101613e-05,
"loss": 0.2647,
"reward": 0.8719618345300356,
"reward_std": 0.4378834879025817,
"rewards/accuracy_reward": 0.20963542287548384,
"rewards/format_reward": 0.6623264097919067,
"step": 192
},
{
"completion_length": 72.35807486375172,
"epoch": 0.2584064932913699,
"grad_norm": 2.040531873703003,
"kl": 1.7041829427083333,
"learning_rate": 1.851791672980993e-05,
"loss": 0.0681,
"reward": 0.6892361293236414,
"reward_std": 0.5002113915979862,
"rewards/accuracy_reward": 0.17578125411334136,
"rewards/format_reward": 0.5134548768401146,
"step": 195
},
{
"completion_length": 25.8216153383255,
"epoch": 0.26238197780354483,
"grad_norm": 0.6412864923477173,
"kl": 1.1061197916666667,
"learning_rate": 1.844427223655199e-05,
"loss": 0.0443,
"reward": 1.1362847524384658,
"reward_std": 0.2772039214614779,
"rewards/accuracy_reward": 0.2052951459384834,
"rewards/format_reward": 0.9309896019597849,
"step": 198
},
{
"epoch": 0.2650323008116614,
"eval_completion_length": 106.06218176621657,
"eval_kl": 1.0171274038461537,
"eval_loss": 0.0414416678249836,
"eval_reward": 1.2000000339287977,
"eval_reward_std": 0.21003777883373775,
"eval_rewards/accuracy_reward": 0.24038462171760888,
"eval_rewards/format_reward": 0.9596154038722698,
"eval_runtime": 392.0553,
"eval_samples_per_second": 0.253,
"eval_steps_per_second": 0.013,
"step": 200
},
{
"completion_length": 153.56380558013916,
"epoch": 0.2663574623157197,
"grad_norm": 0.6983628869056702,
"kl": 1.030517578125,
"learning_rate": 1.8368996051610987e-05,
"loss": 0.0402,
"reward": 1.1731771118938923,
"reward_std": 0.2824738877825439,
"rewards/accuracy_reward": 0.2369791748933494,
"rewards/format_reward": 0.9361979365348816,
"step": 201
},
{
"completion_length": 88.2196215391159,
"epoch": 0.27033294682789466,
"grad_norm": 0.5030940771102905,
"kl": 1.013427734375,
"learning_rate": 1.8292102720648333e-05,
"loss": 0.0406,
"reward": 1.1948785136143367,
"reward_std": 0.18432624839867154,
"rewards/accuracy_reward": 0.2248263950459659,
"rewards/format_reward": 0.9700521007180214,
"step": 204
},
{
"completion_length": 59.0355920791626,
"epoch": 0.27430843134006955,
"grad_norm": 0.6660766005516052,
"kl": 1.0997721354166667,
"learning_rate": 1.821360710180753e-05,
"loss": 0.044,
"reward": 1.2152778208255768,
"reward_std": 0.16302509784388045,
"rewards/accuracy_reward": 0.2326388961325089,
"rewards/format_reward": 0.9826389054457346,
"step": 207
},
{
"completion_length": 136.2734409570694,
"epoch": 0.2782839158522445,
"grad_norm": 12.398130416870117,
"kl": 1.268310546875,
"learning_rate": 1.8133524362843105e-05,
"loss": 0.0507,
"reward": 1.1844618419806163,
"reward_std": 0.2869204127540191,
"rewards/accuracy_reward": 0.24218750729536018,
"rewards/format_reward": 0.9422743258376917,
"step": 210
},
{
"completion_length": 70.42100850741069,
"epoch": 0.28225940036441943,
"grad_norm": 0.6272424459457397,
"kl": 1.75341796875,
"learning_rate": 1.8051869978189732e-05,
"loss": 0.0702,
"reward": 1.1562500409781933,
"reward_std": 0.2056693274838229,
"rewards/accuracy_reward": 0.18315972775841752,
"rewards/format_reward": 0.9730902922650179,
"step": 213
},
{
"completion_length": 73.97873449325562,
"epoch": 0.2862348848765943,
"grad_norm": 0.8462525010108948,
"kl": 1.53515625,
"learning_rate": 1.7968659725972113e-05,
"loss": 0.0614,
"reward": 1.2317708693444729,
"reward_std": 0.20991144888103008,
"rewards/accuracy_reward": 0.25130209074510884,
"rewards/format_reward": 0.9804687649011612,
"step": 216
},
{
"completion_length": 253.28212424119315,
"epoch": 0.29021036938876926,
"grad_norm": 1.3421990871429443,
"kl": 1.69873046875,
"learning_rate": 1.7883909684956142e-05,
"loss": 0.0679,
"reward": 1.194010455161333,
"reward_std": 0.23462056911860904,
"rewards/accuracy_reward": 0.22092014512357613,
"rewards/format_reward": 0.9730902959903082,
"step": 219
},
{
"completion_length": 470.5165026982625,
"epoch": 0.2941858539009442,
"grad_norm": 1.0831785202026367,
"kl": 1.51318359375,
"learning_rate": 1.7797636231442018e-05,
"loss": 0.0605,
"reward": 1.1184896218279998,
"reward_std": 0.3129944964312017,
"rewards/accuracy_reward": 0.18750000543271503,
"rewards/format_reward": 0.9309896032015482,
"step": 222
},
{
"completion_length": 236.49306138356528,
"epoch": 0.2981613384131191,
"grad_norm": 0.5017532706260681,
"kl": 1.0579427083333333,
"learning_rate": 1.770985603609982e-05,
"loss": 0.0423,
"reward": 1.1736111516753833,
"reward_std": 0.22954328202952942,
"rewards/accuracy_reward": 0.2083333401630322,
"rewards/format_reward": 0.9652777972320715,
"step": 225
},
{
"completion_length": 370.95573965708417,
"epoch": 0.30213682292529404,
"grad_norm": 1.8773243427276611,
"kl": 0.7275390625,
"learning_rate": 1.762058606074825e-05,
"loss": 0.0291,
"reward": 1.1892361417412758,
"reward_std": 0.2853658755775541,
"rewards/accuracy_reward": 0.2526041748545443,
"rewards/format_reward": 0.9366319626569748,
"step": 228
},
{
"completion_length": 749.3316179911295,
"epoch": 0.3061123074374689,
"grad_norm": 31.57447624206543,
"kl": 1613.3247884114583,
"learning_rate": 1.7529843555077066e-05,
"loss": 64.5574,
"reward": 0.9032118283212185,
"reward_std": 0.5264206398278475,
"rewards/accuracy_reward": 0.1909722271375358,
"rewards/format_reward": 0.7122396056850752,
"step": 231
},
{
"completion_length": 675.3758859634399,
"epoch": 0.31008779194964387,
"grad_norm": 5.697227954864502,
"kl": 1.6119791666666667,
"learning_rate": 1.743764605331392e-05,
"loss": 0.0645,
"reward": 0.9205729439854622,
"reward_std": 0.511370474472642,
"rewards/accuracy_reward": 0.21093750609240183,
"rewards/format_reward": 0.7096354346722364,
"step": 234
},
{
"completion_length": 132.1358541647593,
"epoch": 0.3140632764618188,
"grad_norm": 11.240227699279785,
"kl": 1.1758626302083333,
"learning_rate": 1.734401137083623e-05,
"loss": 0.047,
"reward": 1.1697048942248027,
"reward_std": 0.234877454660212,
"rewards/accuracy_reward": 0.20920139295049012,
"rewards/format_reward": 0.9605034875373045,
"step": 237
},
{
"completion_length": 65.88151196638744,
"epoch": 0.3180387609739937,
"grad_norm": 1.4037131071090698,
"kl": 1.52587890625,
"learning_rate": 1.7248957600728664e-05,
"loss": 0.0611,
"reward": 1.2100694850087166,
"reward_std": 0.17585339567934474,
"rewards/accuracy_reward": 0.2248263950071608,
"rewards/format_reward": 0.9852430634200573,
"step": 240
},
{
"completion_length": 62.16363008817037,
"epoch": 0.32201424548616864,
"grad_norm": 1.2037297487258911,
"kl": 2.00048828125,
"learning_rate": 1.7152503110287048e-05,
"loss": 0.0802,
"reward": 1.21484378973643,
"reward_std": 0.14739138268244764,
"rewards/accuracy_reward": 0.22265625558793545,
"rewards/format_reward": 0.9921875024835268,
"step": 243
},
{
"completion_length": 125.61762539545695,
"epoch": 0.3259897299983435,
"grad_norm": 101.92506408691406,
"kl": 0.934326171875,
"learning_rate": 1.7054666537469213e-05,
"loss": 0.0374,
"reward": 1.1098090633749962,
"reward_std": 0.30111823774253327,
"rewards/accuracy_reward": 0.2152777845816066,
"rewards/format_reward": 0.8945312686264515,
"step": 246
},
{
"completion_length": 121.29817994435628,
"epoch": 0.32996521451051847,
"grad_norm": 34.78390884399414,
"kl": 1.1844889322916667,
"learning_rate": 1.6955466787293574e-05,
"loss": 0.0473,
"reward": 1.1397569874922435,
"reward_std": 0.25541831855662167,
"rewards/accuracy_reward": 0.19444444961845875,
"rewards/format_reward": 0.9453125186264515,
"step": 249
},
{
"completion_length": 187.08290481567383,
"epoch": 0.3339406990226934,
"grad_norm": 9.161247253417969,
"kl": 1.1195475260416667,
"learning_rate": 1.6854923028186112e-05,
"loss": 0.0448,
"reward": 1.174479205161333,
"reward_std": 0.2551775785783927,
"rewards/accuracy_reward": 0.226128477564392,
"rewards/format_reward": 0.9483507138987383,
"step": 252
},
{
"completion_length": 427.2378609975179,
"epoch": 0.3379161835348683,
"grad_norm": 4.801308631896973,
"kl": 0.938720703125,
"learning_rate": 1.6753054688276443e-05,
"loss": 0.0376,
"reward": 1.0308160049219925,
"reward_std": 0.4219017767657836,
"rewards/accuracy_reward": 0.18880208985259136,
"rewards/format_reward": 0.8420139091710249,
"step": 255
},
{
"completion_length": 529.5642477671305,
"epoch": 0.34189166804704324,
"grad_norm": 24.645631790161133,
"kl": 0.8575032552083334,
"learning_rate": 1.6649881451643706e-05,
"loss": 0.0343,
"reward": 0.9887153046826521,
"reward_std": 0.4318722311096887,
"rewards/accuracy_reward": 0.16449653268015632,
"rewards/format_reward": 0.8242187686264515,
"step": 258
},
{
"completion_length": 635.6575686136881,
"epoch": 0.3458671525592181,
"grad_norm": 16.547225952148438,
"kl": 0.8201497395833334,
"learning_rate": 1.6545423254513003e-05,
"loss": 0.0328,
"reward": 0.9114583544433117,
"reward_std": 0.49249805447955924,
"rewards/accuracy_reward": 0.15451389361017695,
"rewards/format_reward": 0.7569444626569748,
"step": 261
},
{
"completion_length": 670.922758102417,
"epoch": 0.34984263707139307,
"grad_norm": 3.734528064727783,
"kl": 28.136962890625,
"learning_rate": 1.6439700281403113e-05,
"loss": 1.13,
"reward": 0.8875868320465088,
"reward_std": 0.5033875486503044,
"rewards/accuracy_reward": 0.15581597620621324,
"rewards/format_reward": 0.7317708556850752,
"step": 264
},
{
"completion_length": 493.13022168477374,
"epoch": 0.353818121583568,
"grad_norm": 1.4124517440795898,
"kl": 0.744140625,
"learning_rate": 1.6332732961226214e-05,
"loss": 0.0298,
"reward": 1.003472238779068,
"reward_std": 0.37580153982465464,
"rewards/accuracy_reward": 0.16710069950204343,
"rewards/format_reward": 0.8363715509573618,
"step": 267
},
{
"completion_length": 216.89670626322427,
"epoch": 0.3577936060957429,
"grad_norm": 16.25065040588379,
"kl": 0.7809244791666666,
"learning_rate": 1.6224541963340392e-05,
"loss": 0.0312,
"reward": 1.1371528195838134,
"reward_std": 0.24574858765117824,
"rewards/accuracy_reward": 0.18706597686590007,
"rewards/format_reward": 0.950086829562982,
"step": 270
},
{
"completion_length": 177.61805963516235,
"epoch": 0.36176909060791784,
"grad_norm": 0.2947433888912201,
"kl": 0.6514485677083334,
"learning_rate": 1.6115148193555708e-05,
"loss": 0.0261,
"reward": 1.1371528146167595,
"reward_std": 0.216966389445588,
"rewards/accuracy_reward": 0.1727430597335721,
"rewards/format_reward": 0.9644097400208315,
"step": 273
},
{
"completion_length": 166.2851603825887,
"epoch": 0.3657445751200928,
"grad_norm": 0.2939068377017975,
"kl": 0.6795247395833334,
"learning_rate": 1.6004572790094535e-05,
"loss": 0.0272,
"reward": 1.1618923991918564,
"reward_std": 0.20391751010902226,
"rewards/accuracy_reward": 0.1848958384944126,
"rewards/format_reward": 0.9769965422650179,
"step": 276
},
{
"completion_length": 261.6618987719218,
"epoch": 0.3697200596322677,
"grad_norm": 0.29132047295570374,
"kl": 0.6346842447916666,
"learning_rate": 1.5892837119507017e-05,
"loss": 0.0254,
"reward": 1.1731771218279998,
"reward_std": 0.2886992564114432,
"rewards/accuracy_reward": 0.22005208965856582,
"rewards/format_reward": 0.953125017384688,
"step": 279
},
{
"completion_length": 336.81771659851074,
"epoch": 0.3736955441444426,
"grad_norm": 0.3138696551322937,
"kl": 0.5843098958333334,
"learning_rate": 1.5779962772542404e-05,
"loss": 0.0234,
"reward": 1.1588542101283867,
"reward_std": 0.3115967277747889,
"rewards/accuracy_reward": 0.22482639298929522,
"rewards/format_reward": 0.9340277997155985,
"step": 282
},
{
"completion_length": 331.4518330891927,
"epoch": 0.3776710286566175,
"grad_norm": 0.2659892141819,
"kl": 0.5912679036458334,
"learning_rate": 1.5665971559977035e-05,
"loss": 0.0237,
"reward": 1.1488715608914692,
"reward_std": 0.29840323934331536,
"rewards/accuracy_reward": 0.216145838998879,
"rewards/format_reward": 0.9327257126569748,
"step": 285
},
{
"completion_length": 239.2026980717977,
"epoch": 0.38164651316879245,
"grad_norm": 0.25000807642936707,
"kl": 0.65576171875,
"learning_rate": 1.5550885508399857e-05,
"loss": 0.0262,
"reward": 1.2243924078842003,
"reward_std": 0.23120340146124363,
"rewards/accuracy_reward": 0.26302084152121097,
"rewards/format_reward": 0.9613715447485447,
"step": 288
},
{
"completion_length": 261.2309099833171,
"epoch": 0.3856219976809674,
"grad_norm": 0.3258998692035675,
"kl": 0.6355794270833334,
"learning_rate": 1.5434726855956207e-05,
"loss": 0.0254,
"reward": 1.1762153195838134,
"reward_std": 0.2527556049947937,
"rewards/accuracy_reward": 0.22656250465661287,
"rewards/format_reward": 0.949652798473835,
"step": 291
},
{
"completion_length": 307.1632038752238,
"epoch": 0.3895974821931423,
"grad_norm": 0.27683761715888977,
"kl": 0.6504720052083334,
"learning_rate": 1.5317518048050698e-05,
"loss": 0.026,
"reward": 1.1840278108914692,
"reward_std": 0.3266296978108585,
"rewards/accuracy_reward": 0.25868056357527774,
"rewards/format_reward": 0.9253472412625948,
"step": 294
},
{
"completion_length": 227.35286966959634,
"epoch": 0.3935729667053172,
"grad_norm": 0.30159127712249756,
"kl": 0.688720703125,
"learning_rate": 1.5199281733010115e-05,
"loss": 0.0275,
"reward": 1.1831597660978634,
"reward_std": 0.2784773572348058,
"rewards/accuracy_reward": 0.23350695171393454,
"rewards/format_reward": 0.9496528009573618,
"step": 297
},
{
"completion_length": 143.06510861714682,
"epoch": 0.3975484512174921,
"grad_norm": 0.2892165780067444,
"kl": 0.68359375,
"learning_rate": 1.5080040757707045e-05,
"loss": 0.0274,
"reward": 1.2187500310440857,
"reward_std": 0.2116301084558169,
"rewards/accuracy_reward": 0.24522570221840093,
"rewards/format_reward": 0.9735243245959282,
"step": 300
},
{
"epoch": 0.3975484512174921,
"eval_completion_length": 144.66667048136392,
"eval_kl": 0.6736505681818182,
"eval_loss": 0.026671981438994408,
"eval_reward": 1.2436869072191643,
"eval_reward_std": 0.1965812866886457,
"eval_rewards/accuracy_reward": 0.2651515253113978,
"eval_rewards/format_reward": 0.9785353685870315,
"eval_runtime": 306.7676,
"eval_samples_per_second": 0.323,
"eval_steps_per_second": 0.016,
"step": 300
},
{
"completion_length": 174.81424236297607,
"epoch": 0.40152393572966705,
"grad_norm": 0.2668885588645935,
"kl": 0.658447265625,
"learning_rate": 1.4959818163145174e-05,
"loss": 0.0263,
"reward": 1.1870660111308098,
"reward_std": 0.2152603679957489,
"rewards/accuracy_reward": 0.22265625613120696,
"rewards/format_reward": 0.9644097412625948,
"step": 303
},
{
"completion_length": 243.1388953526815,
"epoch": 0.405499420241842,
"grad_norm": 0.29364725947380066,
"kl": 0.6541341145833334,
"learning_rate": 1.4838637180007048e-05,
"loss": 0.0262,
"reward": 1.19227434694767,
"reward_std": 0.3106319972624381,
"rewards/accuracy_reward": 0.25911459024064243,
"rewards/format_reward": 0.9331597425043583,
"step": 306
},
{
"completion_length": 272.5533922513326,
"epoch": 0.4094749047540169,
"grad_norm": 0.8436369299888611,
"kl": 0.6695149739583334,
"learning_rate": 1.4716521224165192e-05,
"loss": 0.0268,
"reward": 1.1753472536802292,
"reward_std": 0.3257710024093588,
"rewards/accuracy_reward": 0.25520834090032923,
"rewards/format_reward": 0.9201389091710249,
"step": 309
},
{
"completion_length": 177.79601097106934,
"epoch": 0.4134503892661918,
"grad_norm": 0.4324951469898224,
"kl": 0.7711588541666666,
"learning_rate": 1.4593493892157473e-05,
"loss": 0.0308,
"reward": 1.2126736467083294,
"reward_std": 0.2525833969314893,
"rewards/accuracy_reward": 0.25607639644294977,
"rewards/format_reward": 0.9565972425043583,
"step": 312
},
{
"completion_length": 131.17882307370505,
"epoch": 0.41742587377836676,
"grad_norm": 0.4101894497871399,
"kl": 0.8329264322916666,
"learning_rate": 1.4469578956627497e-05,
"loss": 0.0333,
"reward": 1.2222222561637561,
"reward_std": 0.2416619355790317,
"rewards/accuracy_reward": 0.2569444514811039,
"rewards/format_reward": 0.9652777935067812,
"step": 315
},
{
"completion_length": 166.76866793632507,
"epoch": 0.42140135829054165,
"grad_norm": 0.4412620961666107,
"kl": 0.802490234375,
"learning_rate": 1.4344800361731028e-05,
"loss": 0.0321,
"reward": 1.2000868308047454,
"reward_std": 0.25887442535410327,
"rewards/accuracy_reward": 0.2456597302419444,
"rewards/format_reward": 0.9544271069268385,
"step": 318
},
{
"completion_length": 291.4305648803711,
"epoch": 0.4253768428027166,
"grad_norm": 1.2379664182662964,
"kl": 1.076171875,
"learning_rate": 1.4219182218509228e-05,
"loss": 0.043,
"reward": 1.188368085771799,
"reward_std": 0.3411911290604621,
"rewards/accuracy_reward": 0.2738715353965138,
"rewards/format_reward": 0.9144965472320715,
"step": 321
},
{
"completion_length": 224.42535320917764,
"epoch": 0.4293523273148915,
"grad_norm": 0.3620770275592804,
"kl": 0.82763671875,
"learning_rate": 1.4092748800229684e-05,
"loss": 0.0331,
"reward": 1.1501736504336197,
"reward_std": 0.277087006252259,
"rewards/accuracy_reward": 0.21137153512487808,
"rewards/format_reward": 0.9388021032015482,
"step": 324
},
{
"completion_length": 125.78515982627869,
"epoch": 0.4333278118270664,
"grad_norm": 0.44564080238342285,
"kl": 0.8855794270833334,
"learning_rate": 1.3965524537696048e-05,
"loss": 0.0354,
"reward": 1.2070312934617202,
"reward_std": 0.2261218437924981,
"rewards/accuracy_reward": 0.2408854247769341,
"rewards/format_reward": 0.9661458519597849,
"step": 327
},
{
"completion_length": 120.45356305440266,
"epoch": 0.43730329633924137,
"grad_norm": 0.5684562921524048,
"kl": 0.8846842447916666,
"learning_rate": 1.3837534014527292e-05,
"loss": 0.0354,
"reward": 1.1996528121332328,
"reward_std": 0.21545591143270335,
"rewards/accuracy_reward": 0.22482639430866888,
"rewards/format_reward": 0.9748264066874981,
"step": 330
},
{
"completion_length": 163.12413569291434,
"epoch": 0.44127878085141625,
"grad_norm": 0.4309135973453522,
"kl": 0.853515625,
"learning_rate": 1.370880196240736e-05,
"loss": 0.0341,
"reward": 1.1966146243115265,
"reward_std": 0.20711354352533817,
"rewards/accuracy_reward": 0.22829861768210927,
"rewards/format_reward": 0.9683159912625948,
"step": 333
},
{
"completion_length": 237.89236768086752,
"epoch": 0.4452542653635912,
"grad_norm": 0.4566245377063751,
"kl": 0.8661295572916666,
"learning_rate": 1.3579353256306287e-05,
"loss": 0.0347,
"reward": 1.2013889253139496,
"reward_std": 0.3096516130802532,
"rewards/accuracy_reward": 0.26953125911920023,
"rewards/format_reward": 0.9318576566874981,
"step": 336
},
{
"completion_length": 165.7669305006663,
"epoch": 0.4492297498757661,
"grad_norm": 0.354465126991272,
"kl": 0.865966796875,
"learning_rate": 1.3449212909673564e-05,
"loss": 0.0346,
"reward": 1.2018229539195697,
"reward_std": 0.25834672797160846,
"rewards/accuracy_reward": 0.25217014582206804,
"rewards/format_reward": 0.9496528009573618,
"step": 339
},
{
"completion_length": 142.55990060170492,
"epoch": 0.453205234387941,
"grad_norm": 0.3962474763393402,
"kl": 0.8601888020833334,
"learning_rate": 1.3318406069604794e-05,
"loss": 0.0344,
"reward": 1.2521701666216056,
"reward_std": 0.23758238561761877,
"rewards/accuracy_reward": 0.2947048688074574,
"rewards/format_reward": 0.9574652972320715,
"step": 342
},
{
"completion_length": 115.50304126739502,
"epoch": 0.45718071890011597,
"grad_norm": 0.5837423205375671,
"kl": 0.8826497395833334,
"learning_rate": 1.3186958011982502e-05,
"loss": 0.0353,
"reward": 1.2539062947034836,
"reward_std": 0.22960447745087245,
"rewards/accuracy_reward": 0.28862847849571455,
"rewards/format_reward": 0.9652777935067812,
"step": 345
},
{
"completion_length": 131.01606249809265,
"epoch": 0.46115620341229085,
"grad_norm": 0.34627339243888855,
"kl": 0.8831380208333334,
"learning_rate": 1.3054894136592052e-05,
"loss": 0.0353,
"reward": 1.2656250384946663,
"reward_std": 0.2167885024100542,
"rewards/accuracy_reward": 0.303385425824672,
"rewards/format_reward": 0.9622396032015482,
"step": 348
},
{
"completion_length": 161.92491857210794,
"epoch": 0.4651316879244658,
"grad_norm": 0.5599522590637207,
"kl": 0.927490234375,
"learning_rate": 1.2922239962213639e-05,
"loss": 0.0371,
"reward": 1.2330729564030964,
"reward_std": 0.25398758659139276,
"rewards/accuracy_reward": 0.28038195144229877,
"rewards/format_reward": 0.9526909900208315,
"step": 351
},
{
"completion_length": 177.92231432596842,
"epoch": 0.46910717243664074,
"grad_norm": 0.40600305795669556,
"kl": 0.9139811197916666,
"learning_rate": 1.2789021121691273e-05,
"loss": 0.0366,
"reward": 1.2495660111308098,
"reward_std": 0.2690324760042131,
"rewards/accuracy_reward": 0.3042534824150304,
"rewards/format_reward": 0.9453125211099783,
"step": 354
},
{
"completion_length": 172.3125053246816,
"epoch": 0.4730826569488156,
"grad_norm": 0.3264493942260742,
"kl": 0.9281412760416666,
"learning_rate": 1.2655263356979748e-05,
"loss": 0.0371,
"reward": 1.2217882325251896,
"reward_std": 0.23972468955131868,
"rewards/accuracy_reward": 0.2669270914436008,
"rewards/format_reward": 0.9548611293236414,
"step": 357
},
{
"completion_length": 184.01345992088318,
"epoch": 0.47705814146099057,
"grad_norm": 0.45698466897010803,
"kl": 0.9051920572916666,
"learning_rate": 1.252099251417048e-05,
"loss": 0.0362,
"reward": 1.2009548917412758,
"reward_std": 0.2333919748198241,
"rewards/accuracy_reward": 0.2500000069461142,
"rewards/format_reward": 0.9509548830489317,
"step": 360
},
{
"completion_length": 144.6562541325887,
"epoch": 0.48103362597316546,
"grad_norm": 0.4349970519542694,
"kl": 0.9092610677083334,
"learning_rate": 1.2386234538497281e-05,
"loss": 0.0364,
"reward": 1.2122396243115265,
"reward_std": 0.20104571796643236,
"rewards/accuracy_reward": 0.24696181206187853,
"rewards/format_reward": 0.965277798473835,
"step": 363
},
{
"completion_length": 191.15668980280557,
"epoch": 0.4850091104853404,
"grad_norm": 0.30329596996307373,
"kl": 0.88134765625,
"learning_rate": 1.2251015469322915e-05,
"loss": 0.0352,
"reward": 1.1736111516753833,
"reward_std": 0.22162295792562267,
"rewards/accuracy_reward": 0.21918403388311467,
"rewards/format_reward": 0.9544271032015482,
"step": 366
},
{
"completion_length": 264.59766403834027,
"epoch": 0.48898459499751534,
"grad_norm": 0.29752317070961,
"kl": 0.850341796875,
"learning_rate": 1.2115361435107531e-05,
"loss": 0.034,
"reward": 1.1987847660978634,
"reward_std": 0.23697279867095253,
"rewards/accuracy_reward": 0.25217014578326297,
"rewards/format_reward": 0.9466146007180214,
"step": 369
},
{
"completion_length": 265.8745719591777,
"epoch": 0.49296007950969023,
"grad_norm": 0.32228928804397583,
"kl": 0.832763671875,
"learning_rate": 1.1979298648359823e-05,
"loss": 0.0333,
"reward": 1.1922743419806163,
"reward_std": 0.23731949056188265,
"rewards/accuracy_reward": 0.24652778469802192,
"rewards/format_reward": 0.9457465497155985,
"step": 372
},
{
"completion_length": 225.2148496309916,
"epoch": 0.4969355640218652,
"grad_norm": 0.3521800637245178,
"kl": 0.93212890625,
"learning_rate": 1.1842853400571972e-05,
"loss": 0.0373,
"reward": 1.17578128973643,
"reward_std": 0.23853578185662627,
"rewards/accuracy_reward": 0.22482639566684762,
"rewards/format_reward": 0.9509548818071684,
"step": 375
},
{
"completion_length": 214.73481448491415,
"epoch": 0.5009110485340401,
"grad_norm": 0.30461886525154114,
"kl": 0.8719075520833334,
"learning_rate": 1.1706052057139335e-05,
"loss": 0.0349,
"reward": 1.2374132238328457,
"reward_std": 0.26819697495860356,
"rewards/accuracy_reward": 0.2986111195059493,
"rewards/format_reward": 0.9388021019597849,
"step": 378
},
{
"completion_length": 166.19488294919333,
"epoch": 0.504886533046215,
"grad_norm": 0.3792967200279236,
"kl": 0.9239908854166666,
"learning_rate": 1.1568921052265835e-05,
"loss": 0.037,
"reward": 1.2187500335276127,
"reward_std": 0.23409798694774508,
"rewards/accuracy_reward": 0.2708333415600161,
"rewards/format_reward": 0.9479166840513548,
"step": 381
},
{
"completion_length": 150.8567752043406,
"epoch": 0.5088620175583899,
"grad_norm": 0.592704176902771,
"kl": 0.9227701822916666,
"learning_rate": 1.1431486883856082e-05,
"loss": 0.0369,
"reward": 1.2452257374922435,
"reward_std": 0.22722656147864959,
"rewards/accuracy_reward": 0.2934027862502262,
"rewards/format_reward": 0.9518229340513548,
"step": 384
},
{
"completion_length": 158.66797320048013,
"epoch": 0.5128375020705649,
"grad_norm": 0.3592934310436249,
"kl": 0.9186197916666666,
"learning_rate": 1.1293776108395136e-05,
"loss": 0.0367,
"reward": 1.2261285136143367,
"reward_std": 0.22368450198943415,
"rewards/accuracy_reward": 0.27039931307081133,
"rewards/format_reward": 0.9557291840513548,
"step": 387
},
{
"completion_length": 185.46788756052652,
"epoch": 0.5168129865827398,
"grad_norm": 0.40210244059562683,
"kl": 0.9088541666666666,
"learning_rate": 1.115581533581701e-05,
"loss": 0.0363,
"reward": 1.215711849431197,
"reward_std": 0.24651615732970336,
"rewards/accuracy_reward": 0.26779514624892425,
"rewards/format_reward": 0.9479166890184084,
"step": 390
},
{
"completion_length": 170.30729579925537,
"epoch": 0.5207884710949147,
"grad_norm": 0.3423998951911926,
"kl": 0.9156901041666666,
"learning_rate": 1.1017631224362803e-05,
"loss": 0.0366,
"reward": 1.2191840646167595,
"reward_std": 0.23356711654923856,
"rewards/accuracy_reward": 0.26562500702372444,
"rewards/format_reward": 0.9535590472320715,
"step": 393
},
{
"completion_length": 163.50651590029398,
"epoch": 0.5247639556070897,
"grad_norm": 0.4365287721157074,
"kl": 0.9022623697916666,
"learning_rate": 1.0879250475429523e-05,
"loss": 0.0361,
"reward": 1.2296007374922435,
"reward_std": 0.21208147254462043,
"rewards/accuracy_reward": 0.27083334194806713,
"rewards/format_reward": 0.9587673805654049,
"step": 396
},
{
"completion_length": 152.78342461585999,
"epoch": 0.5287394401192645,
"grad_norm": 0.38972899317741394,
"kl": 0.9129231770833334,
"learning_rate": 1.0740699828410546e-05,
"loss": 0.0365,
"reward": 1.2404514389733474,
"reward_std": 0.21568205665486553,
"rewards/accuracy_reward": 0.2734375073729704,
"rewards/format_reward": 0.9670139054457346,
"step": 399
},
{
"epoch": 0.5300646016233228,
"eval_completion_length": 209.7181176944655,
"eval_kl": 0.892936862244898,
"eval_loss": 0.035242632031440735,
"eval_reward": 1.215136090103461,
"eval_reward_std": 0.25851106210326663,
"eval_rewards/accuracy_reward": 0.27763606341821806,
"eval_rewards/format_reward": 0.9375000194627412,
"eval_runtime": 446.558,
"eval_samples_per_second": 0.222,
"eval_steps_per_second": 0.011,
"step": 400
},
{
"completion_length": 211.7161521911621,
"epoch": 0.5327149246314394,
"grad_norm": 0.42194342613220215,
"kl": 0.86279296875,
"learning_rate": 1.060200605552876e-05,
"loss": 0.0351,
"reward": 1.2447917014360428,
"reward_std": 0.2783205214655027,
"rewards/accuracy_reward": 0.30598959198687226,
"rewards/format_reward": 0.9388021044433117,
"step": 402
},
{
"completion_length": 209.05816570917764,
"epoch": 0.5366904091436144,
"grad_norm": 0.3273554742336273,
"kl": 0.8595377604166666,
"learning_rate": 1.0463195956663339e-05,
"loss": 0.0344,
"reward": 1.2074653158585231,
"reward_std": 0.26641134327898425,
"rewards/accuracy_reward": 0.2643229237680013,
"rewards/format_reward": 0.9431423793236414,
"step": 405
},
{
"completion_length": 198.80252281824747,
"epoch": 0.5406658936557893,
"grad_norm": 0.6081684827804565,
"kl": 0.8997395833333334,
"learning_rate": 1.0324296354171209e-05,
"loss": 0.036,
"reward": 1.2209201728304226,
"reward_std": 0.26526342386690277,
"rewards/accuracy_reward": 0.2712673705148821,
"rewards/format_reward": 0.949652798473835,
"step": 408
},
{
"completion_length": 189.4709266026815,
"epoch": 0.5446413781679642,
"grad_norm": 0.3419695794582367,
"kl": 0.9253743489583334,
"learning_rate": 1.0185334087704124e-05,
"loss": 0.037,
"reward": 1.252604205161333,
"reward_std": 0.27628890207658213,
"rewards/accuracy_reward": 0.305121536909913,
"rewards/format_reward": 0.9474826554457346,
"step": 411
},
{
"completion_length": 208.34549283981323,
"epoch": 0.5486168626801391,
"grad_norm": 0.2845761775970459,
"kl": 0.9347330729166666,
"learning_rate": 1.0046336009022435e-05,
"loss": 0.0374,
"reward": 1.2300347636143367,
"reward_std": 0.2701789590064436,
"rewards/accuracy_reward": 0.2834201465981702,
"rewards/format_reward": 0.9466146069268385,
"step": 414
},
{
"completion_length": 176.7278701464335,
"epoch": 0.5525923471923141,
"grad_norm": 0.30186229944229126,
"kl": 0.9585774739583334,
"learning_rate": 9.907328976806512e-06,
"loss": 0.0384,
"reward": 1.267361145466566,
"reward_std": 0.25099668038698536,
"rewards/accuracy_reward": 0.3120659809404363,
"rewards/format_reward": 0.9552951554457346,
"step": 417
},
{
"completion_length": 181.4904566605886,
"epoch": 0.556567831704489,
"grad_norm": 0.31573575735092163,
"kl": 0.9427083333333334,
"learning_rate": 9.768339851466818e-06,
"loss": 0.0377,
"reward": 1.2165799054006736,
"reward_std": 0.23861535429023206,
"rewards/accuracy_reward": 0.2586805630320062,
"rewards/format_reward": 0.9578993208706379,
"step": 420
},
{
"completion_length": 172.80729587872824,
"epoch": 0.5605433162166639,
"grad_norm": 0.3387187421321869,
"kl": 0.9481608072916666,
"learning_rate": 9.62939548995367e-06,
"loss": 0.0379,
"reward": 1.2608507374922435,
"reward_std": 0.2416861488794287,
"rewards/accuracy_reward": 0.30729167473812896,
"rewards/format_reward": 0.9535590472320715,
"step": 423
},
{
"completion_length": 202.40842517217,
"epoch": 0.5645188007288389,
"grad_norm": 0.3616231381893158,
"kl": 0.920166015625,
"learning_rate": 9.490522740567633e-06,
"loss": 0.0368,
"reward": 1.1983507287998993,
"reward_std": 0.2321951068782558,
"rewards/accuracy_reward": 0.24826389597728848,
"rewards/format_reward": 0.950086827079455,
"step": 426
},
{
"completion_length": 234.31207275390625,
"epoch": 0.5684942852410138,
"grad_norm": 0.3933253288269043,
"kl": 0.89599609375,
"learning_rate": 9.351748437771615e-06,
"loss": 0.0358,
"reward": 1.2092014277974765,
"reward_std": 0.26475840294733644,
"rewards/accuracy_reward": 0.2638888942698638,
"rewards/format_reward": 0.9453125235935053,
"step": 429
},
{
"completion_length": 219.85721063613892,
"epoch": 0.5724697697531886,
"grad_norm": 0.3269123435020447,
"kl": 0.8291015625,
"learning_rate": 9.213099397005647e-06,
"loss": 0.0332,
"reward": 1.2604166964689891,
"reward_std": 0.24623461983477077,
"rewards/accuracy_reward": 0.3151041733411451,
"rewards/format_reward": 0.9453125211099783,
"step": 432
},
{
"completion_length": 187.03993590672812,
"epoch": 0.5764452542653636,
"grad_norm": 0.3639557361602783,
"kl": 0.8619791666666666,
"learning_rate": 9.074602409505293e-06,
"loss": 0.0345,
"reward": 1.2573785136143367,
"reward_std": 0.2650001817382872,
"rewards/accuracy_reward": 0.30512153601739556,
"rewards/format_reward": 0.9522569638987383,
"step": 435
},
{
"completion_length": 164.95964018503824,
"epoch": 0.5804207387775385,
"grad_norm": 0.47998958826065063,
"kl": 0.9236653645833334,
"learning_rate": 8.936284237124779e-06,
"loss": 0.0369,
"reward": 1.20616323625048,
"reward_std": 0.20998603710904717,
"rewards/accuracy_reward": 0.24479167334114513,
"rewards/format_reward": 0.9613715435067812,
"step": 438
},
{
"completion_length": 165.91406766573587,
"epoch": 0.5843962232897134,
"grad_norm": 0.2889668345451355,
"kl": 0.93359375,
"learning_rate": 8.798171607165779e-06,
"loss": 0.0374,
"reward": 1.1775174016753833,
"reward_std": 0.18780716601759195,
"rewards/accuracy_reward": 0.21831597752558687,
"rewards/format_reward": 0.9592014066874981,
"step": 441
},
{
"completion_length": 154.68273003896078,
"epoch": 0.5883717078018884,
"grad_norm": 0.3764539361000061,
"kl": 0.9204915364583334,
"learning_rate": 8.660291207212883e-06,
"loss": 0.0368,
"reward": 1.2274305820465088,
"reward_std": 0.2107705035402129,
"rewards/accuracy_reward": 0.25954861807016033,
"rewards/format_reward": 0.967881960173448,
"step": 444
},
{
"completion_length": 242.51129245758057,
"epoch": 0.5923471923140633,
"grad_norm": 0.4235329031944275,
"kl": 0.8951009114583334,
"learning_rate": 8.52266967997675e-06,
"loss": 0.0358,
"reward": 1.1992187909781933,
"reward_std": 0.2926396271989991,
"rewards/accuracy_reward": 0.27473959047347307,
"rewards/format_reward": 0.9244791890184084,
"step": 447
},
{
"completion_length": 216.438809633255,
"epoch": 0.5963226768262382,
"grad_norm": 0.5363680124282837,
"kl": 0.9195963541666666,
"learning_rate": 8.385333618145896e-06,
"loss": 0.0368,
"reward": 1.1462673942248027,
"reward_std": 0.27518284460529685,
"rewards/accuracy_reward": 0.21397570016173026,
"rewards/format_reward": 0.9322916840513548,
"step": 450
},
{
"completion_length": 119.86154862244923,
"epoch": 0.6002981613384131,
"grad_norm": 0.5320878028869629,
"kl": 0.965087890625,
"learning_rate": 8.248309559248203e-06,
"loss": 0.0386,
"reward": 1.18619795764486,
"reward_std": 0.20566960889846087,
"rewards/accuracy_reward": 0.22135417337995023,
"rewards/format_reward": 0.9648437711099783,
"step": 453
},
{
"completion_length": 98.42144385973613,
"epoch": 0.6042736458505881,
"grad_norm": 0.4586585760116577,
"kl": 0.9525553385416666,
"learning_rate": 8.111623980523036e-06,
"loss": 0.0381,
"reward": 1.2638889191051323,
"reward_std": 0.20143946547371647,
"rewards/accuracy_reward": 0.2899305631484215,
"rewards/format_reward": 0.9739583469927311,
"step": 456
},
{
"completion_length": 153.68880653381348,
"epoch": 0.608249130362763,
"grad_norm": 5.601478576660156,
"kl": 1.3423665364583333,
"learning_rate": 7.975303293805036e-06,
"loss": 0.0537,
"reward": 1.2421875384946663,
"reward_std": 0.24943431583233178,
"rewards/accuracy_reward": 0.29513889698622126,
"rewards/format_reward": 0.9470486293236414,
"step": 459
},
{
"completion_length": 172.61806122461954,
"epoch": 0.6122246148749378,
"grad_norm": 0.6199188828468323,
"kl": 0.9340006510416666,
"learning_rate": 7.839373840420555e-06,
"loss": 0.0374,
"reward": 1.1848958780368168,
"reward_std": 0.270951366595303,
"rewards/accuracy_reward": 0.24435764636533955,
"rewards/format_reward": 0.9405382138987383,
"step": 462
},
{
"completion_length": 216.99045578638712,
"epoch": 0.6162000993871128,
"grad_norm": 22.460529327392578,
"kl": 0.9293619791666666,
"learning_rate": 7.70386188609769e-06,
"loss": 0.0372,
"reward": 1.2044271193444729,
"reward_std": 0.2865686761215329,
"rewards/accuracy_reward": 0.2812500084207083,
"rewards/format_reward": 0.9231771019597849,
"step": 465
},
{
"completion_length": 200.38672375679016,
"epoch": 0.6201755838992877,
"grad_norm": 4.971067428588867,
"kl": 0.9654947916666666,
"learning_rate": 7.568793615890955e-06,
"loss": 0.0386,
"reward": 1.170138926555713,
"reward_std": 0.28953606037733454,
"rewards/accuracy_reward": 0.24869792345756045,
"rewards/format_reward": 0.9214409912625948,
"step": 468
},
{
"completion_length": 263.26172574361163,
"epoch": 0.6241510684114626,
"grad_norm": 19461.30859375,
"kl": 6020.59716796875,
"learning_rate": 7.434195129121517e-06,
"loss": 241.5018,
"reward": 1.1197916927436988,
"reward_std": 0.35640866014485556,
"rewards/accuracy_reward": 0.2530382012870784,
"rewards/format_reward": 0.8667534949878851,
"step": 471
},
{
"completion_length": 157.41016141573587,
"epoch": 0.6281265529236376,
"grad_norm": 15.363752365112305,
"kl": 54.584309895833336,
"learning_rate": 7.300092434334021e-06,
"loss": 2.1851,
"reward": 1.2139757387340069,
"reward_std": 0.2608258535619825,
"rewards/accuracy_reward": 0.27473959086152416,
"rewards/format_reward": 0.9392361330489317,
"step": 474
},
{
"completion_length": 188.84332064787546,
"epoch": 0.6321020374358125,
"grad_norm": 3.5809130668640137,
"kl": 1.1976725260416667,
"learning_rate": 7.166511444270924e-06,
"loss": 0.0479,
"reward": 1.2309028158585231,
"reward_std": 0.26117177587002516,
"rewards/accuracy_reward": 0.2916666743112728,
"rewards/format_reward": 0.9392361268401146,
"step": 477
},
{
"completion_length": 160.72222622235617,
"epoch": 0.6360775219479874,
"grad_norm": 3.082725763320923,
"kl": 1.359375,
"learning_rate": 7.033477970865381e-06,
"loss": 0.0544,
"reward": 1.2621528171002865,
"reward_std": 0.24424838298000395,
"rewards/accuracy_reward": 0.30772570373180014,
"rewards/format_reward": 0.9544271032015482,
"step": 480
},
{
"completion_length": 172.35720992088318,
"epoch": 0.6400530064601623,
"grad_norm": 6.727673053741455,
"kl": 3.7775065104166665,
"learning_rate": 6.901017720253583e-06,
"loss": 0.151,
"reward": 1.1987847561637561,
"reward_std": 0.24734753215064606,
"rewards/accuracy_reward": 0.25824653392191976,
"rewards/format_reward": 0.940538210173448,
"step": 483
},
{
"completion_length": 138.64713939030966,
"epoch": 0.6440284909723373,
"grad_norm": 28.416213989257812,
"kl": 1.9168294270833333,
"learning_rate": 6.769156287807539e-06,
"loss": 0.0767,
"reward": 1.2269965621332328,
"reward_std": 0.21713009189503887,
"rewards/accuracy_reward": 0.27170139527879655,
"rewards/format_reward": 0.9552951566874981,
"step": 486
},
{
"completion_length": 111.01085392634074,
"epoch": 0.6480039754845122,
"grad_norm": 28.046361923217773,
"kl": 1.4402669270833333,
"learning_rate": 6.637919153189279e-06,
"loss": 0.0576,
"reward": 1.2903646230697632,
"reward_std": 0.2030498057914277,
"rewards/accuracy_reward": 0.33203126047737896,
"rewards/format_reward": 0.9583333519597849,
"step": 489
},
{
"completion_length": 93.8329017162323,
"epoch": 0.651979459996687,
"grad_norm": 5.583730697631836,
"kl": 1.2568359375,
"learning_rate": 6.507331675427388e-06,
"loss": 0.0503,
"reward": 1.2226562922199566,
"reward_std": 0.2042010520429661,
"rewards/accuracy_reward": 0.2604166743500779,
"rewards/format_reward": 0.9622396032015482,
"step": 492
},
{
"completion_length": 88.26736386617024,
"epoch": 0.655954944508862,
"grad_norm": 12.801457405090332,
"kl": 1.261962890625,
"learning_rate": 6.3774190880168804e-06,
"loss": 0.0505,
"reward": 1.2973090683420498,
"reward_std": 0.23440878558903933,
"rewards/accuracy_reward": 0.33289931528270245,
"rewards/format_reward": 0.9644097437461218,
"step": 495
},
{
"completion_length": 90.45833583672841,
"epoch": 0.6599304290210369,
"grad_norm": 1334.54296875,
"kl": 12.92529296875,
"learning_rate": 6.248206494043313e-06,
"loss": 0.5176,
"reward": 1.269097267339627,
"reward_std": 0.1979171479276071,
"rewards/accuracy_reward": 0.30164931431257475,
"rewards/format_reward": 0.9674479365348816,
"step": 498
},
{
"epoch": 0.6625807520291536,
"eval_completion_length": 106.54038769648625,
"eval_kl": 8.832752403846154,
"eval_loss": 0.2197878211736679,
"eval_reward": 1.2631410598754882,
"eval_reward_std": 0.20897178661364776,
"eval_rewards/accuracy_reward": 0.3028846269903275,
"eval_rewards/format_reward": 0.9602564261509822,
"eval_runtime": 284.6155,
"eval_samples_per_second": 0.348,
"eval_steps_per_second": 0.018,
"step": 500
},
{
"completion_length": 137.89062881469727,
"epoch": 0.6639059135332118,
"grad_norm": 196.66842651367188,
"kl": 5.895263671875,
"learning_rate": 6.119718861332098e-06,
"loss": 0.4084,
"reward": 1.3242188021540642,
"reward_std": 0.24511273042298853,
"rewards/accuracy_reward": 0.3710937574505806,
"rewards/format_reward": 0.9531250298023224,
"step": 501
},
{
"completion_length": 101.93186076482137,
"epoch": 0.6678813980453868,
"grad_norm": 4.593560218811035,
"kl": 2.0084635416666665,
"learning_rate": 5.9919810176239554e-06,
"loss": 0.0804,
"reward": 1.2803819825251896,
"reward_std": 0.22951093905915818,
"rewards/accuracy_reward": 0.3185763976459081,
"rewards/format_reward": 0.9618055758376917,
"step": 504
},
{
"completion_length": 98.08246823151906,
"epoch": 0.6718568825575617,
"grad_norm": 36.8542594909668,
"kl": 1.6246744791666667,
"learning_rate": 5.86501764577744e-06,
"loss": 0.065,
"reward": 1.2582465658585231,
"reward_std": 0.20490265979121128,
"rewards/accuracy_reward": 0.2903645906674986,
"rewards/format_reward": 0.9678819614152113,
"step": 507
},
{
"completion_length": 125.86849367618561,
"epoch": 0.6758323670697366,
"grad_norm": 31.712203979492188,
"kl": 2.9173990885416665,
"learning_rate": 5.7388532789994476e-06,
"loss": 0.1167,
"reward": 1.2304687934617202,
"reward_std": 0.23319136871335408,
"rewards/accuracy_reward": 0.27473959159882116,
"rewards/format_reward": 0.9557291840513548,
"step": 510
},
{
"completion_length": 108.07986442248027,
"epoch": 0.6798078515819116,
"grad_norm": 19.656137466430664,
"kl": 2.4375,
"learning_rate": 5.613512296104663e-06,
"loss": 0.0974,
"reward": 1.2282986529171467,
"reward_std": 0.1991276788370063,
"rewards/accuracy_reward": 0.26388889578326297,
"rewards/format_reward": 0.964409738779068,
"step": 513
},
{
"completion_length": 120.73264233271281,
"epoch": 0.6837833360940865,
"grad_norm": 9.04715633392334,
"kl": 1.5470377604166667,
"learning_rate": 5.489018916804813e-06,
"loss": 0.0619,
"reward": 1.281250045945247,
"reward_std": 0.22288222153050205,
"rewards/accuracy_reward": 0.3198784813284874,
"rewards/format_reward": 0.9613715497155985,
"step": 516
},
{
"completion_length": 113.39974268277486,
"epoch": 0.6877588206062614,
"grad_norm": 2.3152172565460205,
"kl": 1.3323567708333333,
"learning_rate": 5.365397197028686e-06,
"loss": 0.0533,
"reward": 1.2721354613701503,
"reward_std": 0.19468989650097987,
"rewards/accuracy_reward": 0.30338542551423114,
"rewards/format_reward": 0.968750017384688,
"step": 519
},
{
"completion_length": 143.59983134269714,
"epoch": 0.6917343051184363,
"grad_norm": 12.121291160583496,
"kl": 1.47412109375,
"learning_rate": 5.242671024273798e-06,
"loss": 0.059,
"reward": 1.2456597636143367,
"reward_std": 0.25431135304582614,
"rewards/accuracy_reward": 0.29296875729536015,
"rewards/format_reward": 0.9526909900208315,
"step": 522
},
{
"completion_length": 121.45529794692993,
"epoch": 0.6957097896306113,
"grad_norm": 3.243786334991455,
"kl": 1.41357421875,
"learning_rate": 5.120864112990569e-06,
"loss": 0.0566,
"reward": 1.2443576753139496,
"reward_std": 0.20600120699964464,
"rewards/accuracy_reward": 0.28125000714013976,
"rewards/format_reward": 0.9631076554457346,
"step": 525
},
{
"completion_length": 141.91970892747244,
"epoch": 0.6996852741427861,
"grad_norm": 6.4455437660217285,
"kl": 1.6841634114583333,
"learning_rate": 5.000000000000003e-06,
"loss": 0.0674,
"reward": 1.2152778096497059,
"reward_std": 0.23748167790472507,
"rewards/accuracy_reward": 0.2604166748933494,
"rewards/format_reward": 0.9548611342906952,
"step": 528
},
{
"completion_length": 145.10503919919333,
"epoch": 0.703660758654961,
"grad_norm": 4.077893257141113,
"kl": 1.8653971354166667,
"learning_rate": 4.880102039945625e-06,
"loss": 0.0746,
"reward": 1.2673611491918564,
"reward_std": 0.22801773723525307,
"rewards/accuracy_reward": 0.3116319513646886,
"rewards/format_reward": 0.9557291865348816,
"step": 531
},
{
"completion_length": 153.23568006356558,
"epoch": 0.707636243167136,
"grad_norm": 2.3837387561798096,
"kl": 1.5,
"learning_rate": 4.761193400780667e-06,
"loss": 0.06,
"reward": 1.25694448625048,
"reward_std": 0.24314528051763773,
"rewards/accuracy_reward": 0.2999132027228673,
"rewards/format_reward": 0.9570312711099783,
"step": 534
},
{
"completion_length": 167.8263931274414,
"epoch": 0.7116117276793109,
"grad_norm": 4.202811241149902,
"kl": 2.5416666666666665,
"learning_rate": 4.643297059291303e-06,
"loss": 0.1017,
"reward": 1.2248264277974765,
"reward_std": 0.27370192063972354,
"rewards/accuracy_reward": 0.27604167559184134,
"rewards/format_reward": 0.9487847425043583,
"step": 537
},
{
"completion_length": 182.7339456876119,
"epoch": 0.7155872121914858,
"grad_norm": 14.95860481262207,
"kl": 2.8761393229166665,
"learning_rate": 4.52643579665683e-06,
"loss": 0.1151,
"reward": 1.2421875409781933,
"reward_std": 0.25901925152478117,
"rewards/accuracy_reward": 0.2981770924137284,
"rewards/format_reward": 0.9440104328095913,
"step": 540
},
{
"completion_length": 154.5321224530538,
"epoch": 0.7195626967036608,
"grad_norm": 3.4772720336914062,
"kl": 2.1795247395833335,
"learning_rate": 4.410632194047652e-06,
"loss": 0.0872,
"reward": 1.2413194812834263,
"reward_std": 0.2310507068565736,
"rewards/accuracy_reward": 0.2834201504010707,
"rewards/format_reward": 0.9578993258376917,
"step": 543
},
{
"completion_length": 183.41537022590637,
"epoch": 0.7235381812158357,
"grad_norm": 5.352535247802734,
"kl": 2.7540690104166665,
"learning_rate": 4.29590862826191e-06,
"loss": 0.1102,
"reward": 1.2369792026778061,
"reward_std": 0.24117931607179344,
"rewards/accuracy_reward": 0.2934027863666415,
"rewards/format_reward": 0.9435764091710249,
"step": 546
},
{
"completion_length": 169.5039111773173,
"epoch": 0.7275136657280106,
"grad_norm": 2.4602179527282715,
"kl": 1.9309895833333333,
"learning_rate": 4.182287267401587e-06,
"loss": 0.0772,
"reward": 1.2404514315227668,
"reward_std": 0.22196716310766837,
"rewards/accuracy_reward": 0.2877604255530362,
"rewards/format_reward": 0.9526909925043583,
"step": 549
},
{
"completion_length": 147.87847610314688,
"epoch": 0.7314891502401856,
"grad_norm": 3.658947229385376,
"kl": 2.431640625,
"learning_rate": 4.069790066588966e-06,
"loss": 0.0972,
"reward": 1.3168403158585231,
"reward_std": 0.23325985188906392,
"rewards/accuracy_reward": 0.3589409807464108,
"rewards/format_reward": 0.9578993233541647,
"step": 552
},
{
"completion_length": 148.14410118261972,
"epoch": 0.7354646347523605,
"grad_norm": 2.160740613937378,
"kl": 1.542236328125,
"learning_rate": 3.9584387637242235e-06,
"loss": 0.0617,
"reward": 1.2235243457059066,
"reward_std": 0.23132954825026295,
"rewards/accuracy_reward": 0.2647569504721711,
"rewards/format_reward": 0.9587673780818781,
"step": 555
},
{
"completion_length": 169.01172391573587,
"epoch": 0.7394401192645353,
"grad_norm": 13.349943161010742,
"kl": 2.2926432291666665,
"learning_rate": 3.848254875285e-06,
"loss": 0.0917,
"reward": 1.197048647950093,
"reward_std": 0.2103662300699701,
"rewards/accuracy_reward": 0.2460937569849193,
"rewards/format_reward": 0.9509548842906952,
"step": 558
},
{
"completion_length": 157.60981353123984,
"epoch": 0.7434156037767102,
"grad_norm": 2.4742820262908936,
"kl": 1.6537272135416667,
"learning_rate": 3.739259692168764e-06,
"loss": 0.0662,
"reward": 1.229166705161333,
"reward_std": 0.2528001538012177,
"rewards/accuracy_reward": 0.27560764621011913,
"rewards/format_reward": 0.9535590459903082,
"step": 561
},
{
"completion_length": 154.8311678568522,
"epoch": 0.7473910882888852,
"grad_norm": 1.9129363298416138,
"kl": 1.6079915364583333,
"learning_rate": 3.6314742755787537e-06,
"loss": 0.0643,
"reward": 1.2261285024384658,
"reward_std": 0.22889205797885856,
"rewards/accuracy_reward": 0.27300348059119034,
"rewards/format_reward": 0.9531250235935053,
"step": 564
},
{
"completion_length": 151.18403148651123,
"epoch": 0.7513665728010601,
"grad_norm": 1.2694976329803467,
"kl": 1.6017252604166667,
"learning_rate": 3.524919452954314e-06,
"loss": 0.064,
"reward": 1.2248264377315838,
"reward_std": 0.23556000289196768,
"rewards/accuracy_reward": 0.26692709055108327,
"rewards/format_reward": 0.9578993320465088,
"step": 567
},
{
"completion_length": 154.80859859784445,
"epoch": 0.755342057313235,
"grad_norm": 1.4956895112991333,
"kl": 1.427734375,
"learning_rate": 3.419615813946392e-06,
"loss": 0.0571,
"reward": 1.1974826753139496,
"reward_std": 0.23270095341528454,
"rewards/accuracy_reward": 0.23784722872854522,
"rewards/format_reward": 0.9596354352931181,
"step": 570
},
{
"completion_length": 169.7625904083252,
"epoch": 0.75931754182541,
"grad_norm": 0.8634160161018372,
"kl": 1.3221028645833333,
"learning_rate": 3.315583706438994e-06,
"loss": 0.0529,
"reward": 1.2278646193444729,
"reward_std": 0.24653864566547176,
"rewards/accuracy_reward": 0.27734375892517465,
"rewards/format_reward": 0.9505208519597849,
"step": 573
},
{
"completion_length": 157.0377644697825,
"epoch": 0.7632930263375849,
"grad_norm": 5.6404571533203125,
"kl": 1.3050944010416667,
"learning_rate": 3.212843232617343e-06,
"loss": 0.0522,
"reward": 1.2322048942248027,
"reward_std": 0.21434197838728627,
"rewards/accuracy_reward": 0.2738715362114211,
"rewards/format_reward": 0.9583333519597849,
"step": 576
},
{
"completion_length": 149.35286871592203,
"epoch": 0.7672685108497598,
"grad_norm": 1.4161450862884521,
"kl": 1.1617838541666667,
"learning_rate": 3.1114142450835296e-06,
"loss": 0.0465,
"reward": 1.2365451728304226,
"reward_std": 0.2225903740618378,
"rewards/accuracy_reward": 0.2730034806688006,
"rewards/format_reward": 0.9635416815678278,
"step": 579
},
{
"completion_length": 164.2899361451467,
"epoch": 0.7712439953619348,
"grad_norm": 2.1099228858947754,
"kl": 1.1534016927083333,
"learning_rate": 3.0113163430203775e-06,
"loss": 0.0461,
"reward": 1.2417535074055195,
"reward_std": 0.2334075498705109,
"rewards/accuracy_reward": 0.2873263977235183,
"rewards/format_reward": 0.9544271044433117,
"step": 582
},
{
"completion_length": 166.99479564030966,
"epoch": 0.7752194798741097,
"grad_norm": 0.6481562256813049,
"kl": 1.1204427083333333,
"learning_rate": 2.912568868404284e-06,
"loss": 0.0448,
"reward": 1.2539062947034836,
"reward_std": 0.2460917371014754,
"rewards/accuracy_reward": 0.2977430645842105,
"rewards/format_reward": 0.9561632151405016,
"step": 585
},
{
"completion_length": 142.4097265402476,
"epoch": 0.7791949643862845,
"grad_norm": 0.6822313070297241,
"kl": 1.0983072916666667,
"learning_rate": 2.815190902267757e-06,
"loss": 0.0439,
"reward": 1.2465278084079425,
"reward_std": 0.21192065292658904,
"rewards/accuracy_reward": 0.27864584055108327,
"rewards/format_reward": 0.9678819651405016,
"step": 588
},
{
"completion_length": 172.7829921245575,
"epoch": 0.7831704488984595,
"grad_norm": 1.076019525527954,
"kl": 1.0126953125,
"learning_rate": 2.7192012610123777e-06,
"loss": 0.0405,
"reward": 1.2855903171002865,
"reward_std": 0.22700861329212785,
"rewards/accuracy_reward": 0.3285590385397275,
"rewards/format_reward": 0.957031267384688,
"step": 591
},
{
"completion_length": 179.41103037198386,
"epoch": 0.7871459334106344,
"grad_norm": 0.7347291707992554,
"kl": 1.072265625,
"learning_rate": 2.6246184927728913e-06,
"loss": 0.0429,
"reward": 1.2395833755532901,
"reward_std": 0.22892415950385234,
"rewards/accuracy_reward": 0.2808159793494269,
"rewards/format_reward": 0.9587673818071684,
"step": 594
},
{
"completion_length": 190.89887682596842,
"epoch": 0.7911214179228093,
"grad_norm": 1.5825515985488892,
"kl": 1.0482584635416667,
"learning_rate": 2.5314608738331535e-06,
"loss": 0.0419,
"reward": 1.2456597586472828,
"reward_std": 0.24149028413618603,
"rewards/accuracy_reward": 0.29427084256894886,
"rewards/format_reward": 0.9513889116545519,
"step": 597
},
{
"completion_length": 175.54731305440268,
"epoch": 0.7950969024349842,
"grad_norm": 0.6331008672714233,
"kl": 1.0225423177083333,
"learning_rate": 2.4397464050945753e-06,
"loss": 0.0409,
"reward": 1.2434896143774192,
"reward_std": 0.2312415634126713,
"rewards/accuracy_reward": 0.2873263991593073,
"rewards/format_reward": 0.9561632089316845,
"step": 600
},
{
"epoch": 0.7950969024349842,
"eval_completion_length": 166.41919604214755,
"eval_kl": 1.0108901515151516,
"eval_loss": 0.040223389863967896,
"eval_reward": 1.2853535666610256,
"eval_reward_std": 0.22720548510551453,
"eval_rewards/accuracy_reward": 0.32449495679501333,
"eval_rewards/format_reward": 0.9608586051247336,
"eval_runtime": 432.7167,
"eval_samples_per_second": 0.229,
"eval_steps_per_second": 0.012,
"step": 600
},
{
"completion_length": 197.4709267616272,
"epoch": 0.7990723869471592,
"grad_norm": 3.598181962966919,
"kl": 1.1375325520833333,
"learning_rate": 2.3494928085978073e-06,
"loss": 0.0455,
"reward": 1.241319480041663,
"reward_std": 0.2442009438915799,
"rewards/accuracy_reward": 0.2921006998512894,
"rewards/format_reward": 0.949218769868215,
"step": 603
},
{
"completion_length": 171.99479659398398,
"epoch": 0.8030478714593341,
"grad_norm": 1.0907797813415527,
"kl": 0.995849609375,
"learning_rate": 2.2607175240983027e-06,
"loss": 0.0399,
"reward": 1.2322048917412758,
"reward_std": 0.2378006634923319,
"rewards/accuracy_reward": 0.2786458421420927,
"rewards/format_reward": 0.9535590509573618,
"step": 606
},
{
"completion_length": 151.64627146720886,
"epoch": 0.807023355971509,
"grad_norm": 15.526721954345703,
"kl": 1.0755208333333333,
"learning_rate": 2.1734377056964175e-06,
"loss": 0.043,
"reward": 1.2387153183420498,
"reward_std": 0.21919091992701092,
"rewards/accuracy_reward": 0.2721354246993239,
"rewards/format_reward": 0.9665798768401146,
"step": 609
},
{
"completion_length": 180.01128919919333,
"epoch": 0.810998840483684,
"grad_norm": 0.6363082528114319,
"kl": 1.0391438802083333,
"learning_rate": 2.087670218522714e-06,
"loss": 0.0416,
"reward": 1.210069477558136,
"reward_std": 0.2583714901314427,
"rewards/accuracy_reward": 0.25824653725915897,
"rewards/format_reward": 0.9518229365348816,
"step": 612
},
{
"completion_length": 134.2313413619995,
"epoch": 0.8149743249958589,
"grad_norm": 0.5771492123603821,
"kl": 0.9965006510416666,
"learning_rate": 2.0034316354791062e-06,
"loss": 0.0398,
"reward": 1.2651910136143367,
"reward_std": 0.1995284124277532,
"rewards/accuracy_reward": 0.2968750091968104,
"rewards/format_reward": 0.9683159900208315,
"step": 615
},
{
"completion_length": 145.0091195901235,
"epoch": 0.8189498095080338,
"grad_norm": 2.458407163619995,
"kl": 1.0079752604166667,
"learning_rate": 1.920738234036463e-06,
"loss": 0.0403,
"reward": 1.28298615415891,
"reward_std": 0.21225994320896766,
"rewards/accuracy_reward": 0.3133680646618207,
"rewards/format_reward": 0.9696180721124014,
"step": 618
},
{
"completion_length": 157.43186203638712,
"epoch": 0.8229252940202088,
"grad_norm": 0.5727205276489258,
"kl": 1.0079752604166667,
"learning_rate": 1.8396059930893073e-06,
"loss": 0.0403,
"reward": 1.2170139253139496,
"reward_std": 0.21727207908406854,
"rewards/accuracy_reward": 0.25130209086152416,
"rewards/format_reward": 0.9657118258376917,
"step": 621
},
{
"completion_length": 156.4114625453949,
"epoch": 0.8269007785323836,
"grad_norm": 0.5636825561523438,
"kl": 0.9955240885416666,
"learning_rate": 1.7600505898681996e-06,
"loss": 0.0398,
"reward": 1.2300347586472828,
"reward_std": 0.23207383013019958,
"rewards/accuracy_reward": 0.26649306435137987,
"rewards/format_reward": 0.9635416877766451,
"step": 624
},
{
"completion_length": 154.10373576482138,
"epoch": 0.8308762630445585,
"grad_norm": 3.9658546447753906,
"kl": 1.0211588541666667,
"learning_rate": 1.6820873969104223e-06,
"loss": 0.0409,
"reward": 1.2500000434617202,
"reward_std": 0.22843335390401384,
"rewards/accuracy_reward": 0.2808159825702508,
"rewards/format_reward": 0.9691840459903082,
"step": 627
},
{
"completion_length": 179.8810822168986,
"epoch": 0.8348517475567335,
"grad_norm": 0.8975684642791748,
"kl": 0.9754231770833334,
"learning_rate": 1.605731479089534e-06,
"loss": 0.039,
"reward": 1.270833384245634,
"reward_std": 0.2560514376188318,
"rewards/accuracy_reward": 0.3168402878024305,
"rewards/format_reward": 0.9539930745959282,
"step": 630
},
{
"completion_length": 191.86068240801492,
"epoch": 0.8388272320689084,
"grad_norm": 1.0552254915237427,
"kl": 1.05908203125,
"learning_rate": 1.530997590704375e-06,
"loss": 0.0424,
"reward": 1.223524338255326,
"reward_std": 0.24293402349576354,
"rewards/accuracy_reward": 0.2669270930734153,
"rewards/format_reward": 0.9565972437461218,
"step": 633
},
{
"completion_length": 201.91189877192178,
"epoch": 0.8428027165810833,
"grad_norm": 1.9689509868621826,
"kl": 1.109130859375,
"learning_rate": 1.4579001726280828e-06,
"loss": 0.0444,
"reward": 1.2560764352480571,
"reward_std": 0.24747123545967042,
"rewards/accuracy_reward": 0.299479175475426,
"rewards/format_reward": 0.9565972400208315,
"step": 636
},
{
"completion_length": 201.5238777001699,
"epoch": 0.8467782010932582,
"grad_norm": 0.8104230761528015,
"kl": 1.052734375,
"learning_rate": 1.386453349517679e-06,
"loss": 0.0421,
"reward": 1.2391493332882721,
"reward_std": 0.24252263192708293,
"rewards/accuracy_reward": 0.281684036909913,
"rewards/format_reward": 0.9574652922650179,
"step": 639
},
{
"completion_length": 170.00868566830954,
"epoch": 0.8507536856054332,
"grad_norm": 0.7141380310058594,
"kl": 0.9737955729166666,
"learning_rate": 1.316670927084751e-06,
"loss": 0.039,
"reward": 1.2630208705862362,
"reward_std": 0.23810221177215377,
"rewards/accuracy_reward": 0.29817709152121097,
"rewards/format_reward": 0.9648437735935053,
"step": 642
},
{
"completion_length": 187.43533500035605,
"epoch": 0.8547291701176081,
"grad_norm": 0.9845206141471863,
"kl": 1.1171061197916667,
"learning_rate": 1.2485663894277611e-06,
"loss": 0.0447,
"reward": 1.2730035160978634,
"reward_std": 0.216334043458725,
"rewards/accuracy_reward": 0.3094618124887347,
"rewards/format_reward": 0.9635416865348816,
"step": 645
},
{
"completion_length": 169.31076955795288,
"epoch": 0.858704654629783,
"grad_norm": 0.9538066387176514,
"kl": 0.978271484375,
"learning_rate": 1.182152896426515e-06,
"loss": 0.0391,
"reward": 1.281250037252903,
"reward_std": 0.24271480288977423,
"rewards/accuracy_reward": 0.31597222907779116,
"rewards/format_reward": 0.9652777972320715,
"step": 648
},
{
"completion_length": 184.18012634913126,
"epoch": 0.862680139141958,
"grad_norm": 0.5063804388046265,
"kl": 0.9745279947916666,
"learning_rate": 1.1174432811992686e-06,
"loss": 0.039,
"reward": 1.24609378973643,
"reward_std": 0.21818942956936857,
"rewards/accuracy_reward": 0.2821180628379807,
"rewards/format_reward": 0.9639757126569748,
"step": 651
},
{
"completion_length": 165.3055603504181,
"epoch": 0.8666556236541328,
"grad_norm": 0.6727854013442993,
"kl": 0.9583333333333334,
"learning_rate": 1.0544500476229713e-06,
"loss": 0.0383,
"reward": 1.2573785086472828,
"reward_std": 0.22620403526040414,
"rewards/accuracy_reward": 0.29427084035705775,
"rewards/format_reward": 0.9631076554457346,
"step": 654
},
{
"completion_length": 186.19965728123984,
"epoch": 0.8706311081663077,
"grad_norm": 0.6164532899856567,
"kl": 1.0279134114583333,
"learning_rate": 9.931853679171377e-07,
"loss": 0.0411,
"reward": 1.2439236516753833,
"reward_std": 0.24075799800145128,
"rewards/accuracy_reward": 0.28559028551292914,
"rewards/format_reward": 0.9583333507180214,
"step": 657
},
{
"completion_length": 177.0638066927592,
"epoch": 0.8746065926784827,
"grad_norm": 0.6313008666038513,
"kl": 1.0465494791666667,
"learning_rate": 9.336610802918044e-07,
"loss": 0.0419,
"reward": 1.2708333631356556,
"reward_std": 0.20328321517445147,
"rewards/accuracy_reward": 0.3051215368323028,
"rewards/format_reward": 0.9657118245959282,
"step": 660
},
{
"completion_length": 178.04774816830954,
"epoch": 0.8785820771906576,
"grad_norm": 0.5517924427986145,
"kl": 1.0804036458333333,
"learning_rate": 8.758886866600258e-07,
"loss": 0.0433,
"reward": 1.3003472636143367,
"reward_std": 0.20480242053357264,
"rewards/accuracy_reward": 0.33203125970127684,
"rewards/format_reward": 0.9683159875373045,
"step": 663
},
{
"completion_length": 184.22309557596842,
"epoch": 0.8825575617028325,
"grad_norm": 1.6948155164718628,
"kl": 0.9346516927083334,
"learning_rate": 8.198793504153491e-07,
"loss": 0.0374,
"reward": 1.2834201827645302,
"reward_std": 0.22442288471696278,
"rewards/accuracy_reward": 0.31770834152121097,
"rewards/format_reward": 0.9657118221124014,
"step": 666
},
{
"completion_length": 163.08811235427856,
"epoch": 0.8865330462150075,
"grad_norm": 0.5778855085372925,
"kl": 0.9193522135416666,
"learning_rate": 7.656438942747057e-07,
"loss": 0.0368,
"reward": 1.27039934694767,
"reward_std": 0.1949684239613513,
"rewards/accuracy_reward": 0.2973090352024883,
"rewards/format_reward": 0.9730902947485447,
"step": 669
},
{
"completion_length": 190.82596119244894,
"epoch": 0.8905085307271824,
"grad_norm": 0.6843112111091614,
"kl": 1.0071614583333333,
"learning_rate": 7.131927981871345e-07,
"loss": 0.0403,
"reward": 1.2348090757926304,
"reward_std": 0.22979943679335216,
"rewards/accuracy_reward": 0.27213542349636555,
"rewards/format_reward": 0.9626736293236414,
"step": 672
},
{
"completion_length": 180.4761331876119,
"epoch": 0.8944840152393573,
"grad_norm": 1.2002581357955933,
"kl": 0.9956868489583334,
"learning_rate": 6.625361973087363e-07,
"loss": 0.0398,
"reward": 1.267361156642437,
"reward_std": 0.20884954005790254,
"rewards/accuracy_reward": 0.2999132029945031,
"rewards/format_reward": 0.9674479303260645,
"step": 675
},
{
"completion_length": 171.35373767217,
"epoch": 0.8984594997515322,
"grad_norm": 0.5270951390266418,
"kl": 0.9773763020833334,
"learning_rate": 6.136838800442457e-07,
"loss": 0.0391,
"reward": 1.2855903183420498,
"reward_std": 0.19845290334584811,
"rewards/accuracy_reward": 0.3168402863666415,
"rewards/format_reward": 0.9687500186264515,
"step": 678
},
{
"completion_length": 190.48004015286764,
"epoch": 0.9024349842637072,
"grad_norm": 0.8527917861938477,
"kl": 0.9973958333333334,
"learning_rate": 5.66645286155616e-07,
"loss": 0.0399,
"reward": 1.2916667014360428,
"reward_std": 0.2291031815111637,
"rewards/accuracy_reward": 0.32986112144620466,
"rewards/format_reward": 0.9618055721124014,
"step": 681
},
{
"completion_length": 189.20226113001505,
"epoch": 0.906410468775882,
"grad_norm": 9.596158981323242,
"kl": 1.0517578125,
"learning_rate": 5.214295049379658e-07,
"loss": 0.0421,
"reward": 1.2582465782761574,
"reward_std": 0.22187859937548637,
"rewards/accuracy_reward": 0.2938368134200573,
"rewards/format_reward": 0.9644097400208315,
"step": 684
},
{
"completion_length": 187.68099466959634,
"epoch": 0.9103859532880569,
"grad_norm": 0.6961022615432739,
"kl": 0.9669596354166666,
"learning_rate": 4.780452734632524e-07,
"loss": 0.0387,
"reward": 1.2760417039195697,
"reward_std": 0.22566887092155716,
"rewards/accuracy_reward": 0.31163195543922484,
"rewards/format_reward": 0.9644097425043583,
"step": 687
},
{
"completion_length": 181.89453570048013,
"epoch": 0.9143614378002319,
"grad_norm": 0.5587486028671265,
"kl": 0.9386393229166666,
"learning_rate": 4.3650097489200125e-07,
"loss": 0.0376,
"reward": 1.2834201777974765,
"reward_std": 0.21305101970210671,
"rewards/accuracy_reward": 0.3146701470638315,
"rewards/format_reward": 0.9687500161429247,
"step": 690
},
{
"completion_length": 186.9974012374878,
"epoch": 0.9183369223124068,
"grad_norm": 0.603073000907898,
"kl": 0.977783203125,
"learning_rate": 3.9680463685342173e-07,
"loss": 0.0391,
"reward": 1.3268229588866234,
"reward_std": 0.22527993516996503,
"rewards/accuracy_reward": 0.36154514946974814,
"rewards/format_reward": 0.9652777935067812,
"step": 693
},
{
"completion_length": 199.9023496309916,
"epoch": 0.9223124068245817,
"grad_norm": 0.49448880553245544,
"kl": 0.979736328125,
"learning_rate": 3.589639298942238e-07,
"loss": 0.0392,
"reward": 1.276475730041663,
"reward_std": 0.2337690940281997,
"rewards/accuracy_reward": 0.3138020931510255,
"rewards/format_reward": 0.9626736293236414,
"step": 696
},
{
"completion_length": 179.73394536972046,
"epoch": 0.9262878913367567,
"grad_norm": 3.9789516925811768,
"kl": 0.976318359375,
"learning_rate": 3.2298616599643285e-07,
"loss": 0.0391,
"reward": 1.278211849431197,
"reward_std": 0.1972268489189446,
"rewards/accuracy_reward": 0.3103298700880259,
"rewards/format_reward": 0.9678819626569748,
"step": 699
},
{
"epoch": 0.927613052840815,
"eval_completion_length": 198.22194461433256,
"eval_kl": 1.0224011479591837,
"eval_loss": 0.039721183478832245,
"eval_reward": 1.2755102442235362,
"eval_reward_std": 0.2398411301629884,
"eval_rewards/accuracy_reward": 0.31972789886046427,
"eval_rewards/format_reward": 0.9557823356317015,
"eval_runtime": 434.6419,
"eval_samples_per_second": 0.228,
"eval_steps_per_second": 0.012,
"step": 700
},
{
"completion_length": 181.44922268390656,
"epoch": 0.9302633758489316,
"grad_norm": 0.6522932648658752,
"kl": 0.9718017578125,
"learning_rate": 2.8887829716449877e-07,
"loss": 0.0401,
"reward": 1.2643229570239782,
"reward_std": 0.22105206700507551,
"rewards/accuracy_reward": 0.30013021564809605,
"rewards/format_reward": 0.9641927294433117,
"step": 702
},
{
"completion_length": 193.3624184926351,
"epoch": 0.9342388603611065,
"grad_norm": 0.6112500429153442,
"kl": 0.9839680989583334,
"learning_rate": 2.5664691408194164e-07,
"loss": 0.0394,
"reward": 1.2582465621332328,
"reward_std": 0.23564991471357644,
"rewards/accuracy_reward": 0.2955729262127231,
"rewards/format_reward": 0.9626736280818781,
"step": 705
},
{
"completion_length": 220.39193407694498,
"epoch": 0.9382143448732815,
"grad_norm": 0.631359875202179,
"kl": 1.0447591145833333,
"learning_rate": 2.262982448378437e-07,
"loss": 0.0418,
"reward": 1.2782118432223797,
"reward_std": 0.2571307167721291,
"rewards/accuracy_reward": 0.32291667396202683,
"rewards/format_reward": 0.9552951566874981,
"step": 708
},
{
"completion_length": 208.5000058809916,
"epoch": 0.9421898293854564,
"grad_norm": 0.6516295075416565,
"kl": 1.075927734375,
"learning_rate": 1.9783815372338422e-07,
"loss": 0.043,
"reward": 1.2669271230697632,
"reward_std": 0.26777021974946064,
"rewards/accuracy_reward": 0.31597223101804656,
"rewards/format_reward": 0.9509548805654049,
"step": 711
},
{
"completion_length": 190.33203514417013,
"epoch": 0.9461653138976313,
"grad_norm": 0.8081660866737366,
"kl": 0.9834798177083334,
"learning_rate": 1.7127214009868387e-07,
"loss": 0.0393,
"reward": 1.3146701827645302,
"reward_std": 0.22434815554879606,
"rewards/accuracy_reward": 0.350694455128784,
"rewards/format_reward": 0.9639757138987383,
"step": 714
},
{
"completion_length": 191.1718815167745,
"epoch": 0.9501407984098061,
"grad_norm": 0.6136611700057983,
"kl": 0.9658203125,
"learning_rate": 1.4660533733015236e-07,
"loss": 0.0386,
"reward": 1.2361111529171467,
"reward_std": 0.2575679953054835,
"rewards/accuracy_reward": 0.2760416753590107,
"rewards/format_reward": 0.9600694651405016,
"step": 717
},
{
"completion_length": 199.0850751399994,
"epoch": 0.9541162829219811,
"grad_norm": 2.2742247581481934,
"kl": 1.0128580729166667,
"learning_rate": 1.2384251179857642e-07,
"loss": 0.0405,
"reward": 1.2573785086472828,
"reward_std": 0.23815507961747548,
"rewards/accuracy_reward": 0.2964409807464108,
"rewards/format_reward": 0.9609375149011612,
"step": 720
},
{
"completion_length": 180.6779546737671,
"epoch": 0.958091767434156,
"grad_norm": 0.546220064163208,
"kl": 0.9303385416666666,
"learning_rate": 1.0298806197809985e-07,
"loss": 0.0372,
"reward": 1.2834201827645302,
"reward_std": 0.23479281645268202,
"rewards/accuracy_reward": 0.32031251102065045,
"rewards/format_reward": 0.9631076616545519,
"step": 723
},
{
"completion_length": 189.55859804153442,
"epoch": 0.9620672519463309,
"grad_norm": 0.5562774538993835,
"kl": 0.964111328125,
"learning_rate": 8.404601758630892e-08,
"loss": 0.0386,
"reward": 1.2556424004336197,
"reward_std": 0.23037906123014787,
"rewards/accuracy_reward": 0.2955729252814005,
"rewards/format_reward": 0.9600694626569748,
"step": 726
},
{
"completion_length": 202.31163819630942,
"epoch": 0.9660427364585059,
"grad_norm": 0.6967044472694397,
"kl": 0.984130859375,
"learning_rate": 6.702003880556418e-08,
"loss": 0.0394,
"reward": 1.2834201728304226,
"reward_std": 0.23515649721957743,
"rewards/accuracy_reward": 0.32508681404093903,
"rewards/format_reward": 0.9583333507180214,
"step": 729
},
{
"completion_length": 184.46702075004578,
"epoch": 0.9700182209706808,
"grad_norm": 1.1955063343048096,
"kl": 1.0083821614583333,
"learning_rate": 5.191341557574392e-08,
"loss": 0.0403,
"reward": 1.2309028195838134,
"reward_std": 0.22288473897303143,
"rewards/accuracy_reward": 0.2695312558983763,
"rewards/format_reward": 0.961371548473835,
"step": 732
},
{
"completion_length": 171.00391014417013,
"epoch": 0.9739937054828557,
"grad_norm": 3.6898951530456543,
"kl": 0.980224609375,
"learning_rate": 3.872906695852607e-08,
"loss": 0.0392,
"reward": 1.2894965633749962,
"reward_std": 0.22550825821235776,
"rewards/accuracy_reward": 0.3198784821821998,
"rewards/format_reward": 0.969618077079455,
"step": 735
},
{
"completion_length": 181.51215736071268,
"epoch": 0.9779691899950307,
"grad_norm": 0.8761662244796753,
"kl": 0.9816080729166666,
"learning_rate": 2.746954057333606e-08,
"loss": 0.0393,
"reward": 1.2582465695838134,
"reward_std": 0.23283367223727205,
"rewards/accuracy_reward": 0.2925347340060398,
"rewards/format_reward": 0.9657118221124014,
"step": 738
},
{
"completion_length": 166.68663636843362,
"epoch": 0.9819446745072056,
"grad_norm": 0.48325055837631226,
"kl": 0.996337890625,
"learning_rate": 1.8137012105069464e-08,
"loss": 0.0398,
"reward": 1.263454897950093,
"reward_std": 0.20216705913965902,
"rewards/accuracy_reward": 0.29383681442899007,
"rewards/format_reward": 0.9696180733541647,
"step": 741
},
{
"completion_length": 182.96528228123984,
"epoch": 0.9859201590193805,
"grad_norm": 0.55852872133255,
"kl": 0.9632161458333334,
"learning_rate": 1.0733284883682748e-08,
"loss": 0.0385,
"reward": 1.2773437909781933,
"reward_std": 0.2304229981576403,
"rewards/accuracy_reward": 0.31032986772091437,
"rewards/format_reward": 0.9670139066874981,
"step": 744
},
{
"completion_length": 200.19835631052652,
"epoch": 0.9898956435315555,
"grad_norm": 4.7552056312561035,
"kl": 1.1190592447916667,
"learning_rate": 5.25978953573536e-09,
"loss": 0.0447,
"reward": 1.2855903195838134,
"reward_std": 0.2617647792988767,
"rewards/accuracy_reward": 0.3268229243112728,
"rewards/format_reward": 0.9587673805654049,
"step": 747
},
{
"completion_length": 192.70313183466592,
"epoch": 0.9938711280437303,
"grad_norm": 0.49542155861854553,
"kl": 0.9990234375,
"learning_rate": 1.7175837079452806e-09,
"loss": 0.04,
"reward": 1.2573785160978634,
"reward_std": 0.21775838693914315,
"rewards/accuracy_reward": 0.2960069530721133,
"rewards/format_reward": 0.9613715497155985,
"step": 750
},
{
"completion_length": 200.46571826934814,
"epoch": 0.9978466125559052,
"grad_norm": 0.8950777053833008,
"kl": 1.0421549479166667,
"learning_rate": 1.0735186282695431e-10,
"loss": 0.0417,
"reward": 1.2717014340062935,
"reward_std": 0.2556659254866342,
"rewards/accuracy_reward": 0.31206598148370784,
"rewards/format_reward": 0.9596354328095913,
"step": 753
},
{
"completion_length": 211.18359994888306,
"epoch": 0.9991717740599636,
"kl": 0.985107421875,
"reward": 1.2708333656191826,
"reward_std": 0.28282210882753134,
"rewards/accuracy_reward": 0.31250000838190317,
"rewards/format_reward": 0.9583333544433117,
"step": 754,
"total_flos": 0.0,
"train_loss": 1.286716509427883,
"train_runtime": 229250.8975,
"train_samples_per_second": 0.316,
"train_steps_per_second": 0.003
}
],
"logging_steps": 3,
"max_steps": 754,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}