{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991717740599636, "eval_steps": 100, "global_step": 754, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1843.4614156087239, "epoch": 0.003975484512174921, "grad_norm": 0.08699692785739899, "kl": 5.9882799784342446e-05, "learning_rate": 7.894736842105263e-07, "loss": 0.0, "reward": 0.20442708857202282, "reward_std": 0.18642982677556574, "rewards/accuracy_reward": 0.1879340319816644, "rewards/format_reward": 0.0164930559694767, "step": 3 }, { "completion_length": 1872.7357126871746, "epoch": 0.007950969024349842, "grad_norm": 0.08034715801477432, "kl": 0.00017563501993815103, "learning_rate": 1.5789473684210526e-06, "loss": 0.0, "reward": 0.16623264349376163, "reward_std": 0.166806096288686, "rewards/accuracy_reward": 0.15190972640023878, "rewards/format_reward": 0.014322916977107525, "step": 6 }, { "completion_length": 1840.263064066569, "epoch": 0.011926453536524764, "grad_norm": 0.07688009738922119, "kl": 0.00018552939097086588, "learning_rate": 2.368421052631579e-06, "loss": 0.0, "reward": 0.22352431349766752, "reward_std": 0.17657933492834368, "rewards/accuracy_reward": 0.20355903388311467, "rewards/format_reward": 0.019965278489204746, "step": 9 }, { "completion_length": 1760.6285069783528, "epoch": 0.015901938048699684, "grad_norm": 0.1042766273021698, "kl": 0.00028959910074869793, "learning_rate": 3.157894736842105e-06, "loss": 0.0, "reward": 0.2647569526452571, "reward_std": 0.219600356494387, "rewards/accuracy_reward": 0.2326388960548987, "rewards/format_reward": 0.032118056512748204, "step": 12 }, { "completion_length": 1722.3082021077473, "epoch": 0.019877422560874606, "grad_norm": 0.3040783703327179, "kl": 0.0018845796585083008, "learning_rate": 3.947368421052632e-06, "loss": 0.0001, "reward": 0.3059895912495752, "reward_std": 0.23623824515379965, "rewards/accuracy_reward": 0.23784722954345247, "rewards/format_reward": 0.06814236252103001, "step": 15 }, { "completion_length": 845.952714920044, "epoch": 0.023852907073049528, "grad_norm": 1.3506791591644287, "kl": 0.3431205749511719, "learning_rate": 4.736842105263158e-06, "loss": 0.0137, "reward": 0.6740451576188207, "reward_std": 0.2776922438448916, "rewards/accuracy_reward": 0.1336805592291057, "rewards/format_reward": 0.5403645950524757, "step": 18 }, { "completion_length": 66.15755401055019, "epoch": 0.02782839158522445, "grad_norm": 1.3607665300369263, "kl": 0.6993815104166666, "learning_rate": 5.526315789473685e-06, "loss": 0.028, "reward": 0.9774305758376917, "reward_std": 0.09281354808869462, "rewards/accuracy_reward": 0.018663194845430553, "rewards/format_reward": 0.9587673805654049, "step": 21 }, { "completion_length": 118.68446455399196, "epoch": 0.03180387609739937, "grad_norm": 0.8041670322418213, "kl": 0.53955078125, "learning_rate": 6.31578947368421e-06, "loss": 0.0216, "reward": 0.9665798830489317, "reward_std": 0.1714695317981144, "rewards/accuracy_reward": 0.04600694558272759, "rewards/format_reward": 0.9205729328095913, "step": 24 }, { "completion_length": 88.47830098867416, "epoch": 0.03577936060957429, "grad_norm": 0.8617585301399231, "kl": 0.59326171875, "learning_rate": 7.1052631578947375e-06, "loss": 0.0237, "reward": 1.0386284987131755, "reward_std": 0.2134858975186944, "rewards/accuracy_reward": 0.09592014209677775, "rewards/format_reward": 0.9427083507180214, "step": 27 }, { "completion_length": 30.138455788294475, "epoch": 0.03975484512174921, "grad_norm": 2.2023513317108154, "kl": 0.8590494791666666, "learning_rate": 7.894736842105265e-06, "loss": 0.0344, "reward": 1.1623264302810032, "reward_std": 0.16990292662133774, "rewards/accuracy_reward": 0.17361111589707434, "rewards/format_reward": 0.9887152872979641, "step": 30 }, { "completion_length": 35.736980040868126, "epoch": 0.043730329633924134, "grad_norm": 1.1060832738876343, "kl": 0.8148600260416666, "learning_rate": 8.68421052631579e-06, "loss": 0.0326, "reward": 1.1657986442248027, "reward_std": 0.17117769015021622, "rewards/accuracy_reward": 0.18793403407714018, "rewards/format_reward": 0.9778645994762579, "step": 33 }, { "completion_length": 35.928820510705314, "epoch": 0.047705814146099056, "grad_norm": 1.2442351579666138, "kl": 0.847412109375, "learning_rate": 9.473684210526315e-06, "loss": 0.0339, "reward": 1.2052951740721862, "reward_std": 0.16379862558096647, "rewards/accuracy_reward": 0.22265625485063842, "rewards/format_reward": 0.9826389029622078, "step": 36 }, { "completion_length": 70.93489801883698, "epoch": 0.05168129865827398, "grad_norm": 1.4330227375030518, "kl": 0.8037923177083334, "learning_rate": 1.0263157894736844e-05, "loss": 0.0321, "reward": 1.1744792064030964, "reward_std": 0.22849255722636977, "rewards/accuracy_reward": 0.22222222892257074, "rewards/format_reward": 0.9522569614152113, "step": 39 }, { "completion_length": 81.15885670979817, "epoch": 0.0556567831704489, "grad_norm": 1.0737708806991577, "kl": 0.7923177083333334, "learning_rate": 1.105263157894737e-05, "loss": 0.0317, "reward": 1.14930559694767, "reward_std": 0.22925202331195274, "rewards/accuracy_reward": 0.19184028345625848, "rewards/format_reward": 0.9574652947485447, "step": 42 }, { "completion_length": 69.16927303870519, "epoch": 0.05963226768262382, "grad_norm": 0.8058044910430908, "kl": 0.8806966145833334, "learning_rate": 1.1842105263157895e-05, "loss": 0.0352, "reward": 1.1675347524384658, "reward_std": 0.1868902291947355, "rewards/accuracy_reward": 0.19618056225590408, "rewards/format_reward": 0.9713541852931181, "step": 45 }, { "completion_length": 57.069880266984306, "epoch": 0.06360775219479874, "grad_norm": 2.353023052215576, "kl": 0.91162109375, "learning_rate": 1.263157894736842e-05, "loss": 0.0365, "reward": 1.189236156642437, "reward_std": 0.18126761401072145, "rewards/accuracy_reward": 0.20095486647915095, "rewards/format_reward": 0.9882812586923441, "step": 48 }, { "completion_length": 21.48567771911621, "epoch": 0.06758323670697367, "grad_norm": 0.964798629283905, "kl": 0.8650716145833334, "learning_rate": 1.3421052631578948e-05, "loss": 0.0346, "reward": 1.2161458743115265, "reward_std": 0.1899353281284372, "rewards/accuracy_reward": 0.21961806206187853, "rewards/format_reward": 0.996527781089147, "step": 51 }, { "completion_length": 53.81597367922465, "epoch": 0.07155872121914858, "grad_norm": 1.0664324760437012, "kl": 0.8111572265625, "learning_rate": 1.4210526315789475e-05, "loss": 0.0324, "reward": 1.162326426555713, "reward_std": 0.23168744108018777, "rewards/accuracy_reward": 0.20008681147980192, "rewards/format_reward": 0.9622395982344946, "step": 54 }, { "completion_length": 208.98611625035605, "epoch": 0.07553420573132351, "grad_norm": 0.8128153085708618, "kl": 0.6840006510416666, "learning_rate": 1.5000000000000002e-05, "loss": 0.0274, "reward": 1.1306424004336197, "reward_std": 0.35048759169876575, "rewards/accuracy_reward": 0.22743056244992962, "rewards/format_reward": 0.9032118245959282, "step": 57 }, { "completion_length": 260.03820244471234, "epoch": 0.07950969024349842, "grad_norm": 0.5030148029327393, "kl": 0.599609375, "learning_rate": 1.578947368421053e-05, "loss": 0.024, "reward": 1.168402807166179, "reward_std": 0.28692516156782705, "rewards/accuracy_reward": 0.21223958927051476, "rewards/format_reward": 0.9561632163822651, "step": 60 }, { "completion_length": 191.35243598620096, "epoch": 0.08348517475567335, "grad_norm": 0.5893499851226807, "kl": 0.6812337239583334, "learning_rate": 1.6578947368421053e-05, "loss": 0.0273, "reward": 1.1432291989525158, "reward_std": 0.21883391573404273, "rewards/accuracy_reward": 0.17057292070239782, "rewards/format_reward": 0.9726562661429247, "step": 63 }, { "completion_length": 304.99697029590607, "epoch": 0.08746065926784827, "grad_norm": 0.7647993564605713, "kl": 0.6839599609375, "learning_rate": 1.736842105263158e-05, "loss": 0.0274, "reward": 1.1657986467083294, "reward_std": 0.2664798041029523, "rewards/accuracy_reward": 0.210069450433366, "rewards/format_reward": 0.9557291840513548, "step": 66 }, { "completion_length": 84.33941195408504, "epoch": 0.0914361437800232, "grad_norm": 1.6230992078781128, "kl": 0.8506673177083334, "learning_rate": 1.8157894736842107e-05, "loss": 0.034, "reward": 1.1362847598890464, "reward_std": 0.21719592344015837, "rewards/accuracy_reward": 0.17838542186655104, "rewards/format_reward": 0.9578993221124014, "step": 69 }, { "completion_length": 47.16927206516266, "epoch": 0.09541162829219811, "grad_norm": 0.5694164633750916, "kl": 0.8841145833333334, "learning_rate": 1.894736842105263e-05, "loss": 0.0354, "reward": 1.1827257337669532, "reward_std": 0.18139228488629064, "rewards/accuracy_reward": 0.19487847795244306, "rewards/format_reward": 0.9878472325702509, "step": 72 }, { "completion_length": 65.3068592151006, "epoch": 0.09938711280437303, "grad_norm": 0.8147668838500977, "kl": 0.8631184895833334, "learning_rate": 1.9736842105263158e-05, "loss": 0.0345, "reward": 1.2265625409781933, "reward_std": 0.22970331188601753, "rewards/accuracy_reward": 0.24826389558923742, "rewards/format_reward": 0.9782986268401146, "step": 75 }, { "completion_length": 101.73307637373607, "epoch": 0.10336259731654796, "grad_norm": 3.9387757778167725, "kl": 0.9171549479166666, "learning_rate": 1.9999570594853575e-05, "loss": 0.0367, "reward": 1.1497396255532901, "reward_std": 0.26099368068389595, "rewards/accuracy_reward": 0.20920139628772935, "rewards/format_reward": 0.9405382163822651, "step": 78 }, { "completion_length": 29.979601462682087, "epoch": 0.10733808182872287, "grad_norm": 5.135621547698975, "kl": 1.5565592447916667, "learning_rate": 1.9997316318671806e-05, "loss": 0.0622, "reward": 1.2044271193444729, "reward_std": 0.1869426581542939, "rewards/accuracy_reward": 0.23090278574575981, "rewards/format_reward": 0.9735243221124014, "step": 81 }, { "completion_length": 22.930122137069702, "epoch": 0.1113135663408978, "grad_norm": 28.183279037475586, "kl": 1.3277994791666667, "learning_rate": 1.999313025518698e-05, "loss": 0.0531, "reward": 1.1892361504336197, "reward_std": 0.20981760757664839, "rewards/accuracy_reward": 0.20876736678959182, "rewards/format_reward": 0.9804687661429247, "step": 84 }, { "completion_length": 46.38932470480601, "epoch": 0.11528905085307271, "grad_norm": 1.771388053894043, "kl": 1.13525390625, "learning_rate": 1.9987013213274594e-05, "loss": 0.0454, "reward": 1.2356771156191826, "reward_std": 0.18450136513759693, "rewards/accuracy_reward": 0.25737847938823205, "rewards/format_reward": 0.9782986268401146, "step": 87 }, { "completion_length": 133.1475731531779, "epoch": 0.11926453536524764, "grad_norm": 0.963828444480896, "kl": 1.10107421875, "learning_rate": 1.9978966374934255e-05, "loss": 0.0441, "reward": 1.1979166989525158, "reward_std": 0.22738417129342756, "rewards/accuracy_reward": 0.2348090335726738, "rewards/format_reward": 0.9631076604127884, "step": 90 }, { "completion_length": 95.22396143277486, "epoch": 0.12324001987742256, "grad_norm": 2.4775350093841553, "kl": 1.29541015625, "learning_rate": 1.996899129506126e-05, "loss": 0.0519, "reward": 1.1801215683420498, "reward_std": 0.22066468729948005, "rewards/accuracy_reward": 0.21831597779722264, "rewards/format_reward": 0.9618055733541647, "step": 93 }, { "completion_length": 141.00217274824777, "epoch": 0.12721550438959747, "grad_norm": 113.83024597167969, "kl": 6.85986328125, "learning_rate": 1.995708990114615e-05, "loss": 0.2747, "reward": 1.1006944725910823, "reward_std": 0.2825309601612389, "rewards/accuracy_reward": 0.16493056050967425, "rewards/format_reward": 0.9357639079292616, "step": 96 }, { "completion_length": 166.101132551829, "epoch": 0.13119098890177242, "grad_norm": 10.326292037963867, "kl": 4.069661458333333, "learning_rate": 1.994326449290226e-05, "loss": 0.1628, "reward": 1.0438368394970894, "reward_std": 0.32250430978213745, "rewards/accuracy_reward": 0.1401909765166541, "rewards/format_reward": 0.9036458494762579, "step": 99 }, { "epoch": 0.1325161504058307, "eval_completion_length": 159.95536130788375, "eval_kl": 3.7168367346938775, "eval_loss": 0.16389134526252747, "eval_reward": 1.0437925482282833, "eval_reward_std": 0.3066673065174599, "eval_rewards/accuracy_reward": 0.14200680786553693, "eval_rewards/format_reward": 0.9017857349648768, "eval_runtime": 416.6475, "eval_samples_per_second": 0.238, "eval_steps_per_second": 0.012, "step": 100 }, { "completion_length": 146.04948329925537, "epoch": 0.13516647341394733, "grad_norm": 29.008094787597656, "kl": 4.4609375, "learning_rate": 1.9927517741821343e-05, "loss": 0.1592, "reward": 1.0123698189854622, "reward_std": 0.279809627099894, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.9082031473517418, "step": 102 }, { "completion_length": 83.48133925596873, "epoch": 0.13914195792612225, "grad_norm": 2.938828468322754, "kl": 2.2904459635416665, "learning_rate": 1.990985269065736e-05, "loss": 0.0916, "reward": 1.0742187835276127, "reward_std": 0.23118331842124462, "rewards/accuracy_reward": 0.12543403171002865, "rewards/format_reward": 0.9487847375373045, "step": 105 }, { "completion_length": 54.888022780418396, "epoch": 0.14311744243829716, "grad_norm": 2.3945536613464355, "kl": 2.4551595052083335, "learning_rate": 1.989027275283852e-05, "loss": 0.0982, "reward": 1.1523437860111396, "reward_std": 0.20264656166546047, "rewards/accuracy_reward": 0.17621528167122355, "rewards/format_reward": 0.976128488779068, "step": 108 }, { "completion_length": 42.01388998826345, "epoch": 0.1470929269504721, "grad_norm": 2.8216843605041504, "kl": 1.4291178385416667, "learning_rate": 1.9868781711807705e-05, "loss": 0.0572, "reward": 1.1814236467083294, "reward_std": 0.18004686074952284, "rewards/accuracy_reward": 0.1927083395033454, "rewards/format_reward": 0.9887152885397276, "step": 111 }, { "completion_length": 37.57769203186035, "epoch": 0.15106841146264702, "grad_norm": 2.9078152179718018, "kl": 1.4965006510416667, "learning_rate": 1.9845383720291392e-05, "loss": 0.0598, "reward": 1.19531253973643, "reward_std": 0.16706574785833558, "rewards/accuracy_reward": 0.20486111663437137, "rewards/format_reward": 0.990451397995154, "step": 114 }, { "completion_length": 46.923178335030876, "epoch": 0.15504389597482193, "grad_norm": 2.5346879959106445, "kl": 1.30078125, "learning_rate": 1.9820083299497227e-05, "loss": 0.0521, "reward": 1.1796875434617202, "reward_std": 0.17389631201513112, "rewards/accuracy_reward": 0.1901041710516438, "rewards/format_reward": 0.9895833420256773, "step": 117 }, { "completion_length": 84.15060981114705, "epoch": 0.15901938048699685, "grad_norm": 20.683246612548828, "kl": 2.1199544270833335, "learning_rate": 1.9792885338240375e-05, "loss": 0.0848, "reward": 1.1553819701075554, "reward_std": 0.22444627589235702, "rewards/accuracy_reward": 0.1844618112857764, "rewards/format_reward": 0.9709201554457346, "step": 120 }, { "completion_length": 107.51779842376709, "epoch": 0.16299486499917176, "grad_norm": 5.012475967407227, "kl": 2.2217610677083335, "learning_rate": 1.976379509199886e-05, "loss": 0.0888, "reward": 1.1514757337669532, "reward_std": 0.262029462105905, "rewards/accuracy_reward": 0.1970486156642437, "rewards/format_reward": 0.9544271069268385, "step": 123 }, { "completion_length": 148.11632299423218, "epoch": 0.1669703495113467, "grad_norm": 14.528366088867188, "kl": 2.2367350260416665, "learning_rate": 1.9732818181898046e-05, "loss": 0.0895, "reward": 1.1228298942248027, "reward_std": 0.2808268330991268, "rewards/accuracy_reward": 0.1896701450071608, "rewards/format_reward": 0.9331597425043583, "step": 126 }, { "completion_length": 84.5121552546819, "epoch": 0.17094583402352162, "grad_norm": 0.969579815864563, "kl": 1.4344889322916667, "learning_rate": 1.9699960593624462e-05, "loss": 0.0574, "reward": 1.15538198625048, "reward_std": 0.21577061604087552, "rewards/accuracy_reward": 0.18359375500585884, "rewards/format_reward": 0.9717882089316845, "step": 129 }, { "completion_length": 72.3042555252711, "epoch": 0.17492131853569653, "grad_norm": 1.7120805978775024, "kl": 1.3595377604166667, "learning_rate": 1.966522867626919e-05, "loss": 0.0544, "reward": 1.1766493457059066, "reward_std": 0.20347999944351614, "rewards/accuracy_reward": 0.1983507004721711, "rewards/format_reward": 0.9782986243565878, "step": 132 }, { "completion_length": 96.82161716620128, "epoch": 0.17889680304787145, "grad_norm": 7.904327869415283, "kl": 1.818359375, "learning_rate": 1.962862914110101e-05, "loss": 0.0727, "reward": 1.2313368394970894, "reward_std": 0.2199981181571881, "rewards/accuracy_reward": 0.25781250578196097, "rewards/format_reward": 0.9735243196288744, "step": 135 }, { "completion_length": 177.80512762069702, "epoch": 0.1828722875600464, "grad_norm": 2.0114357471466064, "kl": 1.666259765625, "learning_rate": 1.9590169060269602e-05, "loss": 0.0666, "reward": 1.1423611491918564, "reward_std": 0.26254904045102495, "rewards/accuracy_reward": 0.2005208401630322, "rewards/format_reward": 0.941840298473835, "step": 138 }, { "completion_length": 90.11675635973613, "epoch": 0.1868477720722213, "grad_norm": 0.599406898021698, "kl": 0.9375813802083334, "learning_rate": 1.9549855865438967e-05, "loss": 0.0375, "reward": 1.205729205161333, "reward_std": 0.19089689617976546, "rewards/accuracy_reward": 0.22092014454149952, "rewards/format_reward": 0.9848090397814909, "step": 141 }, { "completion_length": 56.38889070351919, "epoch": 0.19082325658439622, "grad_norm": 17.37237548828125, "kl": 1.100830078125, "learning_rate": 1.9507697346351414e-05, "loss": 0.0441, "reward": 1.2404514253139496, "reward_std": 0.15965971552456418, "rewards/accuracy_reward": 0.2469618124111245, "rewards/format_reward": 0.9934895895421505, "step": 144 }, { "completion_length": 117.53472594420116, "epoch": 0.19479874109657114, "grad_norm": 1.0283232927322388, "kl": 0.917236328125, "learning_rate": 1.9463701649322343e-05, "loss": 0.0367, "reward": 1.1766493432223797, "reward_std": 0.22516770443568626, "rewards/accuracy_reward": 0.2135416737291962, "rewards/format_reward": 0.9631076554457346, "step": 147 }, { "completion_length": 154.88368586699167, "epoch": 0.19877422560874605, "grad_norm": 0.9131763577461243, "kl": 1.0406901041666667, "learning_rate": 1.941787727566613e-05, "loss": 0.0416, "reward": 1.1358507387340069, "reward_std": 0.24575756738583246, "rewards/accuracy_reward": 0.18706597783602774, "rewards/format_reward": 0.9487847412625948, "step": 150 }, { "completion_length": 137.50000397364298, "epoch": 0.202749710120921, "grad_norm": 1.9522716999053955, "kl": 3.1927897135416665, "learning_rate": 1.9370233080053406e-05, "loss": 0.1279, "reward": 1.1432291989525158, "reward_std": 0.25552801430846256, "rewards/accuracy_reward": 0.18836806000520787, "rewards/format_reward": 0.9548611318071684, "step": 153 }, { "completion_length": 340.70487225055695, "epoch": 0.2067251946330959, "grad_norm": 338.98333740234375, "kl": 10.551839192708334, "learning_rate": 1.9320778268800068e-05, "loss": 0.4221, "reward": 1.0894097586472828, "reward_std": 0.32969770890971023, "rewards/accuracy_reward": 0.2165798662075152, "rewards/format_reward": 0.8728298805654049, "step": 156 }, { "completion_length": 1684.335110982259, "epoch": 0.21070067914527082, "grad_norm": 25.311864852905273, "kl": 25.832682291666668, "learning_rate": 1.926952239808833e-05, "loss": 1.0325, "reward": 0.6332465472320715, "reward_std": 0.556972432260712, "rewards/accuracy_reward": 0.2395833390376841, "rewards/format_reward": 0.39366320706903934, "step": 159 }, { "completion_length": 1924.016092936198, "epoch": 0.21467616365744574, "grad_norm": 0.9118285179138184, "kl": 2.5398763020833335, "learning_rate": 1.9216475372120198e-05, "loss": 0.1016, "reward": 0.5694444632778565, "reward_std": 0.5513101244966189, "rewards/accuracy_reward": 0.17751736612990499, "rewards/format_reward": 0.3919270985449354, "step": 162 }, { "completion_length": 536.2187668085098, "epoch": 0.21865164816962068, "grad_norm": 0.5543506741523743, "kl": 0.8994954427083334, "learning_rate": 1.9161647441203648e-05, "loss": 0.036, "reward": 1.0902778059244156, "reward_std": 0.2743187023637195, "rewards/accuracy_reward": 0.179253476128603, "rewards/format_reward": 0.9110243183871111, "step": 165 }, { "completion_length": 28.863716046015423, "epoch": 0.2226271326817956, "grad_norm": 0.5757032632827759, "kl": 0.918701171875, "learning_rate": 1.9105049199771963e-05, "loss": 0.0367, "reward": 1.2426215708255768, "reward_std": 0.16670533292926848, "rewards/accuracy_reward": 0.24696181248873472, "rewards/format_reward": 0.9956597263614336, "step": 168 }, { "completion_length": 36.32769219080607, "epoch": 0.2266026171939705, "grad_norm": 0.9013729691505432, "kl": 0.907470703125, "learning_rate": 1.904669158433658e-05, "loss": 0.0363, "reward": 1.2196180919806163, "reward_std": 0.15937398614672324, "rewards/accuracy_reward": 0.2235243107036998, "rewards/format_reward": 0.9960937537252903, "step": 171 }, { "completion_length": 118.9145000775655, "epoch": 0.23057810170614543, "grad_norm": 0.603873610496521, "kl": 0.875244140625, "learning_rate": 1.8986585871373792e-05, "loss": 0.035, "reward": 1.19921878973643, "reward_std": 0.21174315828830004, "rewards/accuracy_reward": 0.22526042260384807, "rewards/format_reward": 0.9739583494762579, "step": 174 }, { "completion_length": 154.3828158378601, "epoch": 0.23455358621832037, "grad_norm": 0.5029460191726685, "kl": 0.8998209635416666, "learning_rate": 1.8924743675145815e-05, "loss": 0.036, "reward": 1.1315104526778061, "reward_std": 0.291058028737704, "rewards/accuracy_reward": 0.2100694509766375, "rewards/format_reward": 0.9214409937461218, "step": 177 }, { "completion_length": 27.94401141007741, "epoch": 0.23852907073049529, "grad_norm": 0.7333383560180664, "kl": 1.0400390625, "learning_rate": 1.8861176945456542e-05, "loss": 0.0416, "reward": 1.1788194874922435, "reward_std": 0.21482299477793276, "rewards/accuracy_reward": 0.20920139516238123, "rewards/format_reward": 0.9696180733541647, "step": 180 }, { "completion_length": 24.035590926806133, "epoch": 0.2425045552426702, "grad_norm": 0.9987295866012573, "kl": 1.773681640625, "learning_rate": 1.8795897965342473e-05, "loss": 0.071, "reward": 1.2339410148561, "reward_std": 0.18799259358396134, "rewards/accuracy_reward": 0.25564236876865226, "rewards/format_reward": 0.9782986280818781, "step": 183 }, { "completion_length": 22.082900047302246, "epoch": 0.24648003975484511, "grad_norm": 1.0223162174224854, "kl": 1.2223307291666667, "learning_rate": 1.8728919348699285e-05, "loss": 0.0489, "reward": 1.1970486516753833, "reward_std": 0.1605207941805323, "rewards/accuracy_reward": 0.21267361818657568, "rewards/format_reward": 0.9843750111758709, "step": 186 }, { "completion_length": 22.559462388356526, "epoch": 0.25045552426702006, "grad_norm": 0.9617392420768738, "kl": 1.2871907552083333, "learning_rate": 1.866025403784439e-05, "loss": 0.0515, "reward": 1.253038230041663, "reward_std": 0.16818702151067555, "rewards/accuracy_reward": 0.26866320287808776, "rewards/format_reward": 0.9843750111758709, "step": 189 }, { "completion_length": 74.73828355471294, "epoch": 0.25443100877919494, "grad_norm": 438.25408935546875, "kl": 6.625325520833333, "learning_rate": 1.858991530101613e-05, "loss": 0.2647, "reward": 0.8719618345300356, "reward_std": 0.4378834879025817, "rewards/accuracy_reward": 0.20963542287548384, "rewards/format_reward": 0.6623264097919067, "step": 192 }, { "completion_length": 72.35807486375172, "epoch": 0.2584064932913699, "grad_norm": 2.040531873703003, "kl": 1.7041829427083333, "learning_rate": 1.851791672980993e-05, "loss": 0.0681, "reward": 0.6892361293236414, "reward_std": 0.5002113915979862, "rewards/accuracy_reward": 0.17578125411334136, "rewards/format_reward": 0.5134548768401146, "step": 195 }, { "completion_length": 25.8216153383255, "epoch": 0.26238197780354483, "grad_norm": 0.6412864923477173, "kl": 1.1061197916666667, "learning_rate": 1.844427223655199e-05, "loss": 0.0443, "reward": 1.1362847524384658, "reward_std": 0.2772039214614779, "rewards/accuracy_reward": 0.2052951459384834, "rewards/format_reward": 0.9309896019597849, "step": 198 }, { "epoch": 0.2650323008116614, "eval_completion_length": 106.06218176621657, "eval_kl": 1.0171274038461537, "eval_loss": 0.0414416678249836, "eval_reward": 1.2000000339287977, "eval_reward_std": 0.21003777883373775, "eval_rewards/accuracy_reward": 0.24038462171760888, "eval_rewards/format_reward": 0.9596154038722698, "eval_runtime": 392.0553, "eval_samples_per_second": 0.253, "eval_steps_per_second": 0.013, "step": 200 }, { "completion_length": 153.56380558013916, "epoch": 0.2663574623157197, "grad_norm": 0.6983628869056702, "kl": 1.030517578125, "learning_rate": 1.8368996051610987e-05, "loss": 0.0402, "reward": 1.1731771118938923, "reward_std": 0.2824738877825439, "rewards/accuracy_reward": 0.2369791748933494, "rewards/format_reward": 0.9361979365348816, "step": 201 }, { "completion_length": 88.2196215391159, "epoch": 0.27033294682789466, "grad_norm": 0.5030940771102905, "kl": 1.013427734375, "learning_rate": 1.8292102720648333e-05, "loss": 0.0406, "reward": 1.1948785136143367, "reward_std": 0.18432624839867154, "rewards/accuracy_reward": 0.2248263950459659, "rewards/format_reward": 0.9700521007180214, "step": 204 }, { "completion_length": 59.0355920791626, "epoch": 0.27430843134006955, "grad_norm": 0.6660766005516052, "kl": 1.0997721354166667, "learning_rate": 1.821360710180753e-05, "loss": 0.044, "reward": 1.2152778208255768, "reward_std": 0.16302509784388045, "rewards/accuracy_reward": 0.2326388961325089, "rewards/format_reward": 0.9826389054457346, "step": 207 }, { "completion_length": 136.2734409570694, "epoch": 0.2782839158522445, "grad_norm": 12.398130416870117, "kl": 1.268310546875, "learning_rate": 1.8133524362843105e-05, "loss": 0.0507, "reward": 1.1844618419806163, "reward_std": 0.2869204127540191, "rewards/accuracy_reward": 0.24218750729536018, "rewards/format_reward": 0.9422743258376917, "step": 210 }, { "completion_length": 70.42100850741069, "epoch": 0.28225940036441943, "grad_norm": 0.6272424459457397, "kl": 1.75341796875, "learning_rate": 1.8051869978189732e-05, "loss": 0.0702, "reward": 1.1562500409781933, "reward_std": 0.2056693274838229, "rewards/accuracy_reward": 0.18315972775841752, "rewards/format_reward": 0.9730902922650179, "step": 213 }, { "completion_length": 73.97873449325562, "epoch": 0.2862348848765943, "grad_norm": 0.8462525010108948, "kl": 1.53515625, "learning_rate": 1.7968659725972113e-05, "loss": 0.0614, "reward": 1.2317708693444729, "reward_std": 0.20991144888103008, "rewards/accuracy_reward": 0.25130209074510884, "rewards/format_reward": 0.9804687649011612, "step": 216 }, { "completion_length": 253.28212424119315, "epoch": 0.29021036938876926, "grad_norm": 1.3421990871429443, "kl": 1.69873046875, "learning_rate": 1.7883909684956142e-05, "loss": 0.0679, "reward": 1.194010455161333, "reward_std": 0.23462056911860904, "rewards/accuracy_reward": 0.22092014512357613, "rewards/format_reward": 0.9730902959903082, "step": 219 }, { "completion_length": 470.5165026982625, "epoch": 0.2941858539009442, "grad_norm": 1.0831785202026367, "kl": 1.51318359375, "learning_rate": 1.7797636231442018e-05, "loss": 0.0605, "reward": 1.1184896218279998, "reward_std": 0.3129944964312017, "rewards/accuracy_reward": 0.18750000543271503, "rewards/format_reward": 0.9309896032015482, "step": 222 }, { "completion_length": 236.49306138356528, "epoch": 0.2981613384131191, "grad_norm": 0.5017532706260681, "kl": 1.0579427083333333, "learning_rate": 1.770985603609982e-05, "loss": 0.0423, "reward": 1.1736111516753833, "reward_std": 0.22954328202952942, "rewards/accuracy_reward": 0.2083333401630322, "rewards/format_reward": 0.9652777972320715, "step": 225 }, { "completion_length": 370.95573965708417, "epoch": 0.30213682292529404, "grad_norm": 1.8773243427276611, "kl": 0.7275390625, "learning_rate": 1.762058606074825e-05, "loss": 0.0291, "reward": 1.1892361417412758, "reward_std": 0.2853658755775541, "rewards/accuracy_reward": 0.2526041748545443, "rewards/format_reward": 0.9366319626569748, "step": 228 }, { "completion_length": 749.3316179911295, "epoch": 0.3061123074374689, "grad_norm": 31.57447624206543, "kl": 1613.3247884114583, "learning_rate": 1.7529843555077066e-05, "loss": 64.5574, "reward": 0.9032118283212185, "reward_std": 0.5264206398278475, "rewards/accuracy_reward": 0.1909722271375358, "rewards/format_reward": 0.7122396056850752, "step": 231 }, { "completion_length": 675.3758859634399, "epoch": 0.31008779194964387, "grad_norm": 5.697227954864502, "kl": 1.6119791666666667, "learning_rate": 1.743764605331392e-05, "loss": 0.0645, "reward": 0.9205729439854622, "reward_std": 0.511370474472642, "rewards/accuracy_reward": 0.21093750609240183, "rewards/format_reward": 0.7096354346722364, "step": 234 }, { "completion_length": 132.1358541647593, "epoch": 0.3140632764618188, "grad_norm": 11.240227699279785, "kl": 1.1758626302083333, "learning_rate": 1.734401137083623e-05, "loss": 0.047, "reward": 1.1697048942248027, "reward_std": 0.234877454660212, "rewards/accuracy_reward": 0.20920139295049012, "rewards/format_reward": 0.9605034875373045, "step": 237 }, { "completion_length": 65.88151196638744, "epoch": 0.3180387609739937, "grad_norm": 1.4037131071090698, "kl": 1.52587890625, "learning_rate": 1.7248957600728664e-05, "loss": 0.0611, "reward": 1.2100694850087166, "reward_std": 0.17585339567934474, "rewards/accuracy_reward": 0.2248263950071608, "rewards/format_reward": 0.9852430634200573, "step": 240 }, { "completion_length": 62.16363008817037, "epoch": 0.32201424548616864, "grad_norm": 1.2037297487258911, "kl": 2.00048828125, "learning_rate": 1.7152503110287048e-05, "loss": 0.0802, "reward": 1.21484378973643, "reward_std": 0.14739138268244764, "rewards/accuracy_reward": 0.22265625558793545, "rewards/format_reward": 0.9921875024835268, "step": 243 }, { "completion_length": 125.61762539545695, "epoch": 0.3259897299983435, "grad_norm": 101.92506408691406, "kl": 0.934326171875, "learning_rate": 1.7054666537469213e-05, "loss": 0.0374, "reward": 1.1098090633749962, "reward_std": 0.30111823774253327, "rewards/accuracy_reward": 0.2152777845816066, "rewards/format_reward": 0.8945312686264515, "step": 246 }, { "completion_length": 121.29817994435628, "epoch": 0.32996521451051847, "grad_norm": 34.78390884399414, "kl": 1.1844889322916667, "learning_rate": 1.6955466787293574e-05, "loss": 0.0473, "reward": 1.1397569874922435, "reward_std": 0.25541831855662167, "rewards/accuracy_reward": 0.19444444961845875, "rewards/format_reward": 0.9453125186264515, "step": 249 }, { "completion_length": 187.08290481567383, "epoch": 0.3339406990226934, "grad_norm": 9.161247253417969, "kl": 1.1195475260416667, "learning_rate": 1.6854923028186112e-05, "loss": 0.0448, "reward": 1.174479205161333, "reward_std": 0.2551775785783927, "rewards/accuracy_reward": 0.226128477564392, "rewards/format_reward": 0.9483507138987383, "step": 252 }, { "completion_length": 427.2378609975179, "epoch": 0.3379161835348683, "grad_norm": 4.801308631896973, "kl": 0.938720703125, "learning_rate": 1.6753054688276443e-05, "loss": 0.0376, "reward": 1.0308160049219925, "reward_std": 0.4219017767657836, "rewards/accuracy_reward": 0.18880208985259136, "rewards/format_reward": 0.8420139091710249, "step": 255 }, { "completion_length": 529.5642477671305, "epoch": 0.34189166804704324, "grad_norm": 24.645631790161133, "kl": 0.8575032552083334, "learning_rate": 1.6649881451643706e-05, "loss": 0.0343, "reward": 0.9887153046826521, "reward_std": 0.4318722311096887, "rewards/accuracy_reward": 0.16449653268015632, "rewards/format_reward": 0.8242187686264515, "step": 258 }, { "completion_length": 635.6575686136881, "epoch": 0.3458671525592181, "grad_norm": 16.547225952148438, "kl": 0.8201497395833334, "learning_rate": 1.6545423254513003e-05, "loss": 0.0328, "reward": 0.9114583544433117, "reward_std": 0.49249805447955924, "rewards/accuracy_reward": 0.15451389361017695, "rewards/format_reward": 0.7569444626569748, "step": 261 }, { "completion_length": 670.922758102417, "epoch": 0.34984263707139307, "grad_norm": 3.734528064727783, "kl": 28.136962890625, "learning_rate": 1.6439700281403113e-05, "loss": 1.13, "reward": 0.8875868320465088, "reward_std": 0.5033875486503044, "rewards/accuracy_reward": 0.15581597620621324, "rewards/format_reward": 0.7317708556850752, "step": 264 }, { "completion_length": 493.13022168477374, "epoch": 0.353818121583568, "grad_norm": 1.4124517440795898, "kl": 0.744140625, "learning_rate": 1.6332732961226214e-05, "loss": 0.0298, "reward": 1.003472238779068, "reward_std": 0.37580153982465464, "rewards/accuracy_reward": 0.16710069950204343, "rewards/format_reward": 0.8363715509573618, "step": 267 }, { "completion_length": 216.89670626322427, "epoch": 0.3577936060957429, "grad_norm": 16.25065040588379, "kl": 0.7809244791666666, "learning_rate": 1.6224541963340392e-05, "loss": 0.0312, "reward": 1.1371528195838134, "reward_std": 0.24574858765117824, "rewards/accuracy_reward": 0.18706597686590007, "rewards/format_reward": 0.950086829562982, "step": 270 }, { "completion_length": 177.61805963516235, "epoch": 0.36176909060791784, "grad_norm": 0.2947433888912201, "kl": 0.6514485677083334, "learning_rate": 1.6115148193555708e-05, "loss": 0.0261, "reward": 1.1371528146167595, "reward_std": 0.216966389445588, "rewards/accuracy_reward": 0.1727430597335721, "rewards/format_reward": 0.9644097400208315, "step": 273 }, { "completion_length": 166.2851603825887, "epoch": 0.3657445751200928, "grad_norm": 0.2939068377017975, "kl": 0.6795247395833334, "learning_rate": 1.6004572790094535e-05, "loss": 0.0272, "reward": 1.1618923991918564, "reward_std": 0.20391751010902226, "rewards/accuracy_reward": 0.1848958384944126, "rewards/format_reward": 0.9769965422650179, "step": 276 }, { "completion_length": 261.6618987719218, "epoch": 0.3697200596322677, "grad_norm": 0.29132047295570374, "kl": 0.6346842447916666, "learning_rate": 1.5892837119507017e-05, "loss": 0.0254, "reward": 1.1731771218279998, "reward_std": 0.2886992564114432, "rewards/accuracy_reward": 0.22005208965856582, "rewards/format_reward": 0.953125017384688, "step": 279 }, { "completion_length": 336.81771659851074, "epoch": 0.3736955441444426, "grad_norm": 0.3138696551322937, "kl": 0.5843098958333334, "learning_rate": 1.5779962772542404e-05, "loss": 0.0234, "reward": 1.1588542101283867, "reward_std": 0.3115967277747889, "rewards/accuracy_reward": 0.22482639298929522, "rewards/format_reward": 0.9340277997155985, "step": 282 }, { "completion_length": 331.4518330891927, "epoch": 0.3776710286566175, "grad_norm": 0.2659892141819, "kl": 0.5912679036458334, "learning_rate": 1.5665971559977035e-05, "loss": 0.0237, "reward": 1.1488715608914692, "reward_std": 0.29840323934331536, "rewards/accuracy_reward": 0.216145838998879, "rewards/format_reward": 0.9327257126569748, "step": 285 }, { "completion_length": 239.2026980717977, "epoch": 0.38164651316879245, "grad_norm": 0.25000807642936707, "kl": 0.65576171875, "learning_rate": 1.5550885508399857e-05, "loss": 0.0262, "reward": 1.2243924078842003, "reward_std": 0.23120340146124363, "rewards/accuracy_reward": 0.26302084152121097, "rewards/format_reward": 0.9613715447485447, "step": 288 }, { "completion_length": 261.2309099833171, "epoch": 0.3856219976809674, "grad_norm": 0.3258998692035675, "kl": 0.6355794270833334, "learning_rate": 1.5434726855956207e-05, "loss": 0.0254, "reward": 1.1762153195838134, "reward_std": 0.2527556049947937, "rewards/accuracy_reward": 0.22656250465661287, "rewards/format_reward": 0.949652798473835, "step": 291 }, { "completion_length": 307.1632038752238, "epoch": 0.3895974821931423, "grad_norm": 0.27683761715888977, "kl": 0.6504720052083334, "learning_rate": 1.5317518048050698e-05, "loss": 0.026, "reward": 1.1840278108914692, "reward_std": 0.3266296978108585, "rewards/accuracy_reward": 0.25868056357527774, "rewards/format_reward": 0.9253472412625948, "step": 294 }, { "completion_length": 227.35286966959634, "epoch": 0.3935729667053172, "grad_norm": 0.30159127712249756, "kl": 0.688720703125, "learning_rate": 1.5199281733010115e-05, "loss": 0.0275, "reward": 1.1831597660978634, "reward_std": 0.2784773572348058, "rewards/accuracy_reward": 0.23350695171393454, "rewards/format_reward": 0.9496528009573618, "step": 297 }, { "completion_length": 143.06510861714682, "epoch": 0.3975484512174921, "grad_norm": 0.2892165780067444, "kl": 0.68359375, "learning_rate": 1.5080040757707045e-05, "loss": 0.0274, "reward": 1.2187500310440857, "reward_std": 0.2116301084558169, "rewards/accuracy_reward": 0.24522570221840093, "rewards/format_reward": 0.9735243245959282, "step": 300 }, { "epoch": 0.3975484512174921, "eval_completion_length": 144.66667048136392, "eval_kl": 0.6736505681818182, "eval_loss": 0.026671981438994408, "eval_reward": 1.2436869072191643, "eval_reward_std": 0.1965812866886457, "eval_rewards/accuracy_reward": 0.2651515253113978, "eval_rewards/format_reward": 0.9785353685870315, "eval_runtime": 306.7676, "eval_samples_per_second": 0.323, "eval_steps_per_second": 0.016, "step": 300 }, { "completion_length": 174.81424236297607, "epoch": 0.40152393572966705, "grad_norm": 0.2668885588645935, "kl": 0.658447265625, "learning_rate": 1.4959818163145174e-05, "loss": 0.0263, "reward": 1.1870660111308098, "reward_std": 0.2152603679957489, "rewards/accuracy_reward": 0.22265625613120696, "rewards/format_reward": 0.9644097412625948, "step": 303 }, { "completion_length": 243.1388953526815, "epoch": 0.405499420241842, "grad_norm": 0.29364725947380066, "kl": 0.6541341145833334, "learning_rate": 1.4838637180007048e-05, "loss": 0.0262, "reward": 1.19227434694767, "reward_std": 0.3106319972624381, "rewards/accuracy_reward": 0.25911459024064243, "rewards/format_reward": 0.9331597425043583, "step": 306 }, { "completion_length": 272.5533922513326, "epoch": 0.4094749047540169, "grad_norm": 0.8436369299888611, "kl": 0.6695149739583334, "learning_rate": 1.4716521224165192e-05, "loss": 0.0268, "reward": 1.1753472536802292, "reward_std": 0.3257710024093588, "rewards/accuracy_reward": 0.25520834090032923, "rewards/format_reward": 0.9201389091710249, "step": 309 }, { "completion_length": 177.79601097106934, "epoch": 0.4134503892661918, "grad_norm": 0.4324951469898224, "kl": 0.7711588541666666, "learning_rate": 1.4593493892157473e-05, "loss": 0.0308, "reward": 1.2126736467083294, "reward_std": 0.2525833969314893, "rewards/accuracy_reward": 0.25607639644294977, "rewards/format_reward": 0.9565972425043583, "step": 312 }, { "completion_length": 131.17882307370505, "epoch": 0.41742587377836676, "grad_norm": 0.4101894497871399, "kl": 0.8329264322916666, "learning_rate": 1.4469578956627497e-05, "loss": 0.0333, "reward": 1.2222222561637561, "reward_std": 0.2416619355790317, "rewards/accuracy_reward": 0.2569444514811039, "rewards/format_reward": 0.9652777935067812, "step": 315 }, { "completion_length": 166.76866793632507, "epoch": 0.42140135829054165, "grad_norm": 0.4412620961666107, "kl": 0.802490234375, "learning_rate": 1.4344800361731028e-05, "loss": 0.0321, "reward": 1.2000868308047454, "reward_std": 0.25887442535410327, "rewards/accuracy_reward": 0.2456597302419444, "rewards/format_reward": 0.9544271069268385, "step": 318 }, { "completion_length": 291.4305648803711, "epoch": 0.4253768428027166, "grad_norm": 1.2379664182662964, "kl": 1.076171875, "learning_rate": 1.4219182218509228e-05, "loss": 0.043, "reward": 1.188368085771799, "reward_std": 0.3411911290604621, "rewards/accuracy_reward": 0.2738715353965138, "rewards/format_reward": 0.9144965472320715, "step": 321 }, { "completion_length": 224.42535320917764, "epoch": 0.4293523273148915, "grad_norm": 0.3620770275592804, "kl": 0.82763671875, "learning_rate": 1.4092748800229684e-05, "loss": 0.0331, "reward": 1.1501736504336197, "reward_std": 0.277087006252259, "rewards/accuracy_reward": 0.21137153512487808, "rewards/format_reward": 0.9388021032015482, "step": 324 }, { "completion_length": 125.78515982627869, "epoch": 0.4333278118270664, "grad_norm": 0.44564080238342285, "kl": 0.8855794270833334, "learning_rate": 1.3965524537696048e-05, "loss": 0.0354, "reward": 1.2070312934617202, "reward_std": 0.2261218437924981, "rewards/accuracy_reward": 0.2408854247769341, "rewards/format_reward": 0.9661458519597849, "step": 327 }, { "completion_length": 120.45356305440266, "epoch": 0.43730329633924137, "grad_norm": 0.5684562921524048, "kl": 0.8846842447916666, "learning_rate": 1.3837534014527292e-05, "loss": 0.0354, "reward": 1.1996528121332328, "reward_std": 0.21545591143270335, "rewards/accuracy_reward": 0.22482639430866888, "rewards/format_reward": 0.9748264066874981, "step": 330 }, { "completion_length": 163.12413569291434, "epoch": 0.44127878085141625, "grad_norm": 0.4309135973453522, "kl": 0.853515625, "learning_rate": 1.370880196240736e-05, "loss": 0.0341, "reward": 1.1966146243115265, "reward_std": 0.20711354352533817, "rewards/accuracy_reward": 0.22829861768210927, "rewards/format_reward": 0.9683159912625948, "step": 333 }, { "completion_length": 237.89236768086752, "epoch": 0.4452542653635912, "grad_norm": 0.4566245377063751, "kl": 0.8661295572916666, "learning_rate": 1.3579353256306287e-05, "loss": 0.0347, "reward": 1.2013889253139496, "reward_std": 0.3096516130802532, "rewards/accuracy_reward": 0.26953125911920023, "rewards/format_reward": 0.9318576566874981, "step": 336 }, { "completion_length": 165.7669305006663, "epoch": 0.4492297498757661, "grad_norm": 0.354465126991272, "kl": 0.865966796875, "learning_rate": 1.3449212909673564e-05, "loss": 0.0346, "reward": 1.2018229539195697, "reward_std": 0.25834672797160846, "rewards/accuracy_reward": 0.25217014582206804, "rewards/format_reward": 0.9496528009573618, "step": 339 }, { "completion_length": 142.55990060170492, "epoch": 0.453205234387941, "grad_norm": 0.3962474763393402, "kl": 0.8601888020833334, "learning_rate": 1.3318406069604794e-05, "loss": 0.0344, "reward": 1.2521701666216056, "reward_std": 0.23758238561761877, "rewards/accuracy_reward": 0.2947048688074574, "rewards/format_reward": 0.9574652972320715, "step": 342 }, { "completion_length": 115.50304126739502, "epoch": 0.45718071890011597, "grad_norm": 0.5837423205375671, "kl": 0.8826497395833334, "learning_rate": 1.3186958011982502e-05, "loss": 0.0353, "reward": 1.2539062947034836, "reward_std": 0.22960447745087245, "rewards/accuracy_reward": 0.28862847849571455, "rewards/format_reward": 0.9652777935067812, "step": 345 }, { "completion_length": 131.01606249809265, "epoch": 0.46115620341229085, "grad_norm": 0.34627339243888855, "kl": 0.8831380208333334, "learning_rate": 1.3054894136592052e-05, "loss": 0.0353, "reward": 1.2656250384946663, "reward_std": 0.2167885024100542, "rewards/accuracy_reward": 0.303385425824672, "rewards/format_reward": 0.9622396032015482, "step": 348 }, { "completion_length": 161.92491857210794, "epoch": 0.4651316879244658, "grad_norm": 0.5599522590637207, "kl": 0.927490234375, "learning_rate": 1.2922239962213639e-05, "loss": 0.0371, "reward": 1.2330729564030964, "reward_std": 0.25398758659139276, "rewards/accuracy_reward": 0.28038195144229877, "rewards/format_reward": 0.9526909900208315, "step": 351 }, { "completion_length": 177.92231432596842, "epoch": 0.46910717243664074, "grad_norm": 0.40600305795669556, "kl": 0.9139811197916666, "learning_rate": 1.2789021121691273e-05, "loss": 0.0366, "reward": 1.2495660111308098, "reward_std": 0.2690324760042131, "rewards/accuracy_reward": 0.3042534824150304, "rewards/format_reward": 0.9453125211099783, "step": 354 }, { "completion_length": 172.3125053246816, "epoch": 0.4730826569488156, "grad_norm": 0.3264493942260742, "kl": 0.9281412760416666, "learning_rate": 1.2655263356979748e-05, "loss": 0.0371, "reward": 1.2217882325251896, "reward_std": 0.23972468955131868, "rewards/accuracy_reward": 0.2669270914436008, "rewards/format_reward": 0.9548611293236414, "step": 357 }, { "completion_length": 184.01345992088318, "epoch": 0.47705814146099057, "grad_norm": 0.45698466897010803, "kl": 0.9051920572916666, "learning_rate": 1.252099251417048e-05, "loss": 0.0362, "reward": 1.2009548917412758, "reward_std": 0.2333919748198241, "rewards/accuracy_reward": 0.2500000069461142, "rewards/format_reward": 0.9509548830489317, "step": 360 }, { "completion_length": 144.6562541325887, "epoch": 0.48103362597316546, "grad_norm": 0.4349970519542694, "kl": 0.9092610677083334, "learning_rate": 1.2386234538497281e-05, "loss": 0.0364, "reward": 1.2122396243115265, "reward_std": 0.20104571796643236, "rewards/accuracy_reward": 0.24696181206187853, "rewards/format_reward": 0.965277798473835, "step": 363 }, { "completion_length": 191.15668980280557, "epoch": 0.4850091104853404, "grad_norm": 0.30329596996307373, "kl": 0.88134765625, "learning_rate": 1.2251015469322915e-05, "loss": 0.0352, "reward": 1.1736111516753833, "reward_std": 0.22162295792562267, "rewards/accuracy_reward": 0.21918403388311467, "rewards/format_reward": 0.9544271032015482, "step": 366 }, { "completion_length": 264.59766403834027, "epoch": 0.48898459499751534, "grad_norm": 0.29752317070961, "kl": 0.850341796875, "learning_rate": 1.2115361435107531e-05, "loss": 0.034, "reward": 1.1987847660978634, "reward_std": 0.23697279867095253, "rewards/accuracy_reward": 0.25217014578326297, "rewards/format_reward": 0.9466146007180214, "step": 369 }, { "completion_length": 265.8745719591777, "epoch": 0.49296007950969023, "grad_norm": 0.32228928804397583, "kl": 0.832763671875, "learning_rate": 1.1979298648359823e-05, "loss": 0.0333, "reward": 1.1922743419806163, "reward_std": 0.23731949056188265, "rewards/accuracy_reward": 0.24652778469802192, "rewards/format_reward": 0.9457465497155985, "step": 372 }, { "completion_length": 225.2148496309916, "epoch": 0.4969355640218652, "grad_norm": 0.3521800637245178, "kl": 0.93212890625, "learning_rate": 1.1842853400571972e-05, "loss": 0.0373, "reward": 1.17578128973643, "reward_std": 0.23853578185662627, "rewards/accuracy_reward": 0.22482639566684762, "rewards/format_reward": 0.9509548818071684, "step": 375 }, { "completion_length": 214.73481448491415, "epoch": 0.5009110485340401, "grad_norm": 0.30461886525154114, "kl": 0.8719075520833334, "learning_rate": 1.1706052057139335e-05, "loss": 0.0349, "reward": 1.2374132238328457, "reward_std": 0.26819697495860356, "rewards/accuracy_reward": 0.2986111195059493, "rewards/format_reward": 0.9388021019597849, "step": 378 }, { "completion_length": 166.19488294919333, "epoch": 0.504886533046215, "grad_norm": 0.3792967200279236, "kl": 0.9239908854166666, "learning_rate": 1.1568921052265835e-05, "loss": 0.037, "reward": 1.2187500335276127, "reward_std": 0.23409798694774508, "rewards/accuracy_reward": 0.2708333415600161, "rewards/format_reward": 0.9479166840513548, "step": 381 }, { "completion_length": 150.8567752043406, "epoch": 0.5088620175583899, "grad_norm": 0.592704176902771, "kl": 0.9227701822916666, "learning_rate": 1.1431486883856082e-05, "loss": 0.0369, "reward": 1.2452257374922435, "reward_std": 0.22722656147864959, "rewards/accuracy_reward": 0.2934027862502262, "rewards/format_reward": 0.9518229340513548, "step": 384 }, { "completion_length": 158.66797320048013, "epoch": 0.5128375020705649, "grad_norm": 0.3592934310436249, "kl": 0.9186197916666666, "learning_rate": 1.1293776108395136e-05, "loss": 0.0367, "reward": 1.2261285136143367, "reward_std": 0.22368450198943415, "rewards/accuracy_reward": 0.27039931307081133, "rewards/format_reward": 0.9557291840513548, "step": 387 }, { "completion_length": 185.46788756052652, "epoch": 0.5168129865827398, "grad_norm": 0.40210244059562683, "kl": 0.9088541666666666, "learning_rate": 1.115581533581701e-05, "loss": 0.0363, "reward": 1.215711849431197, "reward_std": 0.24651615732970336, "rewards/accuracy_reward": 0.26779514624892425, "rewards/format_reward": 0.9479166890184084, "step": 390 }, { "completion_length": 170.30729579925537, "epoch": 0.5207884710949147, "grad_norm": 0.3423998951911926, "kl": 0.9156901041666666, "learning_rate": 1.1017631224362803e-05, "loss": 0.0366, "reward": 1.2191840646167595, "reward_std": 0.23356711654923856, "rewards/accuracy_reward": 0.26562500702372444, "rewards/format_reward": 0.9535590472320715, "step": 393 }, { "completion_length": 163.50651590029398, "epoch": 0.5247639556070897, "grad_norm": 0.4365287721157074, "kl": 0.9022623697916666, "learning_rate": 1.0879250475429523e-05, "loss": 0.0361, "reward": 1.2296007374922435, "reward_std": 0.21208147254462043, "rewards/accuracy_reward": 0.27083334194806713, "rewards/format_reward": 0.9587673805654049, "step": 396 }, { "completion_length": 152.78342461585999, "epoch": 0.5287394401192645, "grad_norm": 0.38972899317741394, "kl": 0.9129231770833334, "learning_rate": 1.0740699828410546e-05, "loss": 0.0365, "reward": 1.2404514389733474, "reward_std": 0.21568205665486553, "rewards/accuracy_reward": 0.2734375073729704, "rewards/format_reward": 0.9670139054457346, "step": 399 }, { "epoch": 0.5300646016233228, "eval_completion_length": 209.7181176944655, "eval_kl": 0.892936862244898, "eval_loss": 0.035242632031440735, "eval_reward": 1.215136090103461, "eval_reward_std": 0.25851106210326663, "eval_rewards/accuracy_reward": 0.27763606341821806, "eval_rewards/format_reward": 0.9375000194627412, "eval_runtime": 446.558, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.011, "step": 400 }, { "completion_length": 211.7161521911621, "epoch": 0.5327149246314394, "grad_norm": 0.42194342613220215, "kl": 0.86279296875, "learning_rate": 1.060200605552876e-05, "loss": 0.0351, "reward": 1.2447917014360428, "reward_std": 0.2783205214655027, "rewards/accuracy_reward": 0.30598959198687226, "rewards/format_reward": 0.9388021044433117, "step": 402 }, { "completion_length": 209.05816570917764, "epoch": 0.5366904091436144, "grad_norm": 0.3273554742336273, "kl": 0.8595377604166666, "learning_rate": 1.0463195956663339e-05, "loss": 0.0344, "reward": 1.2074653158585231, "reward_std": 0.26641134327898425, "rewards/accuracy_reward": 0.2643229237680013, "rewards/format_reward": 0.9431423793236414, "step": 405 }, { "completion_length": 198.80252281824747, "epoch": 0.5406658936557893, "grad_norm": 0.6081684827804565, "kl": 0.8997395833333334, "learning_rate": 1.0324296354171209e-05, "loss": 0.036, "reward": 1.2209201728304226, "reward_std": 0.26526342386690277, "rewards/accuracy_reward": 0.2712673705148821, "rewards/format_reward": 0.949652798473835, "step": 408 }, { "completion_length": 189.4709266026815, "epoch": 0.5446413781679642, "grad_norm": 0.3419695794582367, "kl": 0.9253743489583334, "learning_rate": 1.0185334087704124e-05, "loss": 0.037, "reward": 1.252604205161333, "reward_std": 0.27628890207658213, "rewards/accuracy_reward": 0.305121536909913, "rewards/format_reward": 0.9474826554457346, "step": 411 }, { "completion_length": 208.34549283981323, "epoch": 0.5486168626801391, "grad_norm": 0.2845761775970459, "kl": 0.9347330729166666, "learning_rate": 1.0046336009022435e-05, "loss": 0.0374, "reward": 1.2300347636143367, "reward_std": 0.2701789590064436, "rewards/accuracy_reward": 0.2834201465981702, "rewards/format_reward": 0.9466146069268385, "step": 414 }, { "completion_length": 176.7278701464335, "epoch": 0.5525923471923141, "grad_norm": 0.30186229944229126, "kl": 0.9585774739583334, "learning_rate": 9.907328976806512e-06, "loss": 0.0384, "reward": 1.267361145466566, "reward_std": 0.25099668038698536, "rewards/accuracy_reward": 0.3120659809404363, "rewards/format_reward": 0.9552951554457346, "step": 417 }, { "completion_length": 181.4904566605886, "epoch": 0.556567831704489, "grad_norm": 0.31573575735092163, "kl": 0.9427083333333334, "learning_rate": 9.768339851466818e-06, "loss": 0.0377, "reward": 1.2165799054006736, "reward_std": 0.23861535429023206, "rewards/accuracy_reward": 0.2586805630320062, "rewards/format_reward": 0.9578993208706379, "step": 420 }, { "completion_length": 172.80729587872824, "epoch": 0.5605433162166639, "grad_norm": 0.3387187421321869, "kl": 0.9481608072916666, "learning_rate": 9.62939548995367e-06, "loss": 0.0379, "reward": 1.2608507374922435, "reward_std": 0.2416861488794287, "rewards/accuracy_reward": 0.30729167473812896, "rewards/format_reward": 0.9535590472320715, "step": 423 }, { "completion_length": 202.40842517217, "epoch": 0.5645188007288389, "grad_norm": 0.3616231381893158, "kl": 0.920166015625, "learning_rate": 9.490522740567633e-06, "loss": 0.0368, "reward": 1.1983507287998993, "reward_std": 0.2321951068782558, "rewards/accuracy_reward": 0.24826389597728848, "rewards/format_reward": 0.950086827079455, "step": 426 }, { "completion_length": 234.31207275390625, "epoch": 0.5684942852410138, "grad_norm": 0.3933253288269043, "kl": 0.89599609375, "learning_rate": 9.351748437771615e-06, "loss": 0.0358, "reward": 1.2092014277974765, "reward_std": 0.26475840294733644, "rewards/accuracy_reward": 0.2638888942698638, "rewards/format_reward": 0.9453125235935053, "step": 429 }, { "completion_length": 219.85721063613892, "epoch": 0.5724697697531886, "grad_norm": 0.3269123435020447, "kl": 0.8291015625, "learning_rate": 9.213099397005647e-06, "loss": 0.0332, "reward": 1.2604166964689891, "reward_std": 0.24623461983477077, "rewards/accuracy_reward": 0.3151041733411451, "rewards/format_reward": 0.9453125211099783, "step": 432 }, { "completion_length": 187.03993590672812, "epoch": 0.5764452542653636, "grad_norm": 0.3639557361602783, "kl": 0.8619791666666666, "learning_rate": 9.074602409505293e-06, "loss": 0.0345, "reward": 1.2573785136143367, "reward_std": 0.2650001817382872, "rewards/accuracy_reward": 0.30512153601739556, "rewards/format_reward": 0.9522569638987383, "step": 435 }, { "completion_length": 164.95964018503824, "epoch": 0.5804207387775385, "grad_norm": 0.47998958826065063, "kl": 0.9236653645833334, "learning_rate": 8.936284237124779e-06, "loss": 0.0369, "reward": 1.20616323625048, "reward_std": 0.20998603710904717, "rewards/accuracy_reward": 0.24479167334114513, "rewards/format_reward": 0.9613715435067812, "step": 438 }, { "completion_length": 165.91406766573587, "epoch": 0.5843962232897134, "grad_norm": 0.2889668345451355, "kl": 0.93359375, "learning_rate": 8.798171607165779e-06, "loss": 0.0374, "reward": 1.1775174016753833, "reward_std": 0.18780716601759195, "rewards/accuracy_reward": 0.21831597752558687, "rewards/format_reward": 0.9592014066874981, "step": 441 }, { "completion_length": 154.68273003896078, "epoch": 0.5883717078018884, "grad_norm": 0.3764539361000061, "kl": 0.9204915364583334, "learning_rate": 8.660291207212883e-06, "loss": 0.0368, "reward": 1.2274305820465088, "reward_std": 0.2107705035402129, "rewards/accuracy_reward": 0.25954861807016033, "rewards/format_reward": 0.967881960173448, "step": 444 }, { "completion_length": 242.51129245758057, "epoch": 0.5923471923140633, "grad_norm": 0.4235329031944275, "kl": 0.8951009114583334, "learning_rate": 8.52266967997675e-06, "loss": 0.0358, "reward": 1.1992187909781933, "reward_std": 0.2926396271989991, "rewards/accuracy_reward": 0.27473959047347307, "rewards/format_reward": 0.9244791890184084, "step": 447 }, { "completion_length": 216.438809633255, "epoch": 0.5963226768262382, "grad_norm": 0.5363680124282837, "kl": 0.9195963541666666, "learning_rate": 8.385333618145896e-06, "loss": 0.0368, "reward": 1.1462673942248027, "reward_std": 0.27518284460529685, "rewards/accuracy_reward": 0.21397570016173026, "rewards/format_reward": 0.9322916840513548, "step": 450 }, { "completion_length": 119.86154862244923, "epoch": 0.6002981613384131, "grad_norm": 0.5320878028869629, "kl": 0.965087890625, "learning_rate": 8.248309559248203e-06, "loss": 0.0386, "reward": 1.18619795764486, "reward_std": 0.20566960889846087, "rewards/accuracy_reward": 0.22135417337995023, "rewards/format_reward": 0.9648437711099783, "step": 453 }, { "completion_length": 98.42144385973613, "epoch": 0.6042736458505881, "grad_norm": 0.4586585760116577, "kl": 0.9525553385416666, "learning_rate": 8.111623980523036e-06, "loss": 0.0381, "reward": 1.2638889191051323, "reward_std": 0.20143946547371647, "rewards/accuracy_reward": 0.2899305631484215, "rewards/format_reward": 0.9739583469927311, "step": 456 }, { "completion_length": 153.68880653381348, "epoch": 0.608249130362763, "grad_norm": 5.601478576660156, "kl": 1.3423665364583333, "learning_rate": 7.975303293805036e-06, "loss": 0.0537, "reward": 1.2421875384946663, "reward_std": 0.24943431583233178, "rewards/accuracy_reward": 0.29513889698622126, "rewards/format_reward": 0.9470486293236414, "step": 459 }, { "completion_length": 172.61806122461954, "epoch": 0.6122246148749378, "grad_norm": 0.6199188828468323, "kl": 0.9340006510416666, "learning_rate": 7.839373840420555e-06, "loss": 0.0374, "reward": 1.1848958780368168, "reward_std": 0.270951366595303, "rewards/accuracy_reward": 0.24435764636533955, "rewards/format_reward": 0.9405382138987383, "step": 462 }, { "completion_length": 216.99045578638712, "epoch": 0.6162000993871128, "grad_norm": 22.460529327392578, "kl": 0.9293619791666666, "learning_rate": 7.70386188609769e-06, "loss": 0.0372, "reward": 1.2044271193444729, "reward_std": 0.2865686761215329, "rewards/accuracy_reward": 0.2812500084207083, "rewards/format_reward": 0.9231771019597849, "step": 465 }, { "completion_length": 200.38672375679016, "epoch": 0.6201755838992877, "grad_norm": 4.971067428588867, "kl": 0.9654947916666666, "learning_rate": 7.568793615890955e-06, "loss": 0.0386, "reward": 1.170138926555713, "reward_std": 0.28953606037733454, "rewards/accuracy_reward": 0.24869792345756045, "rewards/format_reward": 0.9214409912625948, "step": 468 }, { "completion_length": 263.26172574361163, "epoch": 0.6241510684114626, "grad_norm": 19461.30859375, "kl": 6020.59716796875, "learning_rate": 7.434195129121517e-06, "loss": 241.5018, "reward": 1.1197916927436988, "reward_std": 0.35640866014485556, "rewards/accuracy_reward": 0.2530382012870784, "rewards/format_reward": 0.8667534949878851, "step": 471 }, { "completion_length": 157.41016141573587, "epoch": 0.6281265529236376, "grad_norm": 15.363752365112305, "kl": 54.584309895833336, "learning_rate": 7.300092434334021e-06, "loss": 2.1851, "reward": 1.2139757387340069, "reward_std": 0.2608258535619825, "rewards/accuracy_reward": 0.27473959086152416, "rewards/format_reward": 0.9392361330489317, "step": 474 }, { "completion_length": 188.84332064787546, "epoch": 0.6321020374358125, "grad_norm": 3.5809130668640137, "kl": 1.1976725260416667, "learning_rate": 7.166511444270924e-06, "loss": 0.0479, "reward": 1.2309028158585231, "reward_std": 0.26117177587002516, "rewards/accuracy_reward": 0.2916666743112728, "rewards/format_reward": 0.9392361268401146, "step": 477 }, { "completion_length": 160.72222622235617, "epoch": 0.6360775219479874, "grad_norm": 3.082725763320923, "kl": 1.359375, "learning_rate": 7.033477970865381e-06, "loss": 0.0544, "reward": 1.2621528171002865, "reward_std": 0.24424838298000395, "rewards/accuracy_reward": 0.30772570373180014, "rewards/format_reward": 0.9544271032015482, "step": 480 }, { "completion_length": 172.35720992088318, "epoch": 0.6400530064601623, "grad_norm": 6.727673053741455, "kl": 3.7775065104166665, "learning_rate": 6.901017720253583e-06, "loss": 0.151, "reward": 1.1987847561637561, "reward_std": 0.24734753215064606, "rewards/accuracy_reward": 0.25824653392191976, "rewards/format_reward": 0.940538210173448, "step": 483 }, { "completion_length": 138.64713939030966, "epoch": 0.6440284909723373, "grad_norm": 28.416213989257812, "kl": 1.9168294270833333, "learning_rate": 6.769156287807539e-06, "loss": 0.0767, "reward": 1.2269965621332328, "reward_std": 0.21713009189503887, "rewards/accuracy_reward": 0.27170139527879655, "rewards/format_reward": 0.9552951566874981, "step": 486 }, { "completion_length": 111.01085392634074, "epoch": 0.6480039754845122, "grad_norm": 28.046361923217773, "kl": 1.4402669270833333, "learning_rate": 6.637919153189279e-06, "loss": 0.0576, "reward": 1.2903646230697632, "reward_std": 0.2030498057914277, "rewards/accuracy_reward": 0.33203126047737896, "rewards/format_reward": 0.9583333519597849, "step": 489 }, { "completion_length": 93.8329017162323, "epoch": 0.651979459996687, "grad_norm": 5.583730697631836, "kl": 1.2568359375, "learning_rate": 6.507331675427388e-06, "loss": 0.0503, "reward": 1.2226562922199566, "reward_std": 0.2042010520429661, "rewards/accuracy_reward": 0.2604166743500779, "rewards/format_reward": 0.9622396032015482, "step": 492 }, { "completion_length": 88.26736386617024, "epoch": 0.655954944508862, "grad_norm": 12.801457405090332, "kl": 1.261962890625, "learning_rate": 6.3774190880168804e-06, "loss": 0.0505, "reward": 1.2973090683420498, "reward_std": 0.23440878558903933, "rewards/accuracy_reward": 0.33289931528270245, "rewards/format_reward": 0.9644097437461218, "step": 495 }, { "completion_length": 90.45833583672841, "epoch": 0.6599304290210369, "grad_norm": 1334.54296875, "kl": 12.92529296875, "learning_rate": 6.248206494043313e-06, "loss": 0.5176, "reward": 1.269097267339627, "reward_std": 0.1979171479276071, "rewards/accuracy_reward": 0.30164931431257475, "rewards/format_reward": 0.9674479365348816, "step": 498 }, { "epoch": 0.6625807520291536, "eval_completion_length": 106.54038769648625, "eval_kl": 8.832752403846154, "eval_loss": 0.2197878211736679, "eval_reward": 1.2631410598754882, "eval_reward_std": 0.20897178661364776, "eval_rewards/accuracy_reward": 0.3028846269903275, "eval_rewards/format_reward": 0.9602564261509822, "eval_runtime": 284.6155, "eval_samples_per_second": 0.348, "eval_steps_per_second": 0.018, "step": 500 }, { "completion_length": 137.89062881469727, "epoch": 0.6639059135332118, "grad_norm": 196.66842651367188, "kl": 5.895263671875, "learning_rate": 6.119718861332098e-06, "loss": 0.4084, "reward": 1.3242188021540642, "reward_std": 0.24511273042298853, "rewards/accuracy_reward": 0.3710937574505806, "rewards/format_reward": 0.9531250298023224, "step": 501 }, { "completion_length": 101.93186076482137, "epoch": 0.6678813980453868, "grad_norm": 4.593560218811035, "kl": 2.0084635416666665, "learning_rate": 5.9919810176239554e-06, "loss": 0.0804, "reward": 1.2803819825251896, "reward_std": 0.22951093905915818, "rewards/accuracy_reward": 0.3185763976459081, "rewards/format_reward": 0.9618055758376917, "step": 504 }, { "completion_length": 98.08246823151906, "epoch": 0.6718568825575617, "grad_norm": 36.8542594909668, "kl": 1.6246744791666667, "learning_rate": 5.86501764577744e-06, "loss": 0.065, "reward": 1.2582465658585231, "reward_std": 0.20490265979121128, "rewards/accuracy_reward": 0.2903645906674986, "rewards/format_reward": 0.9678819614152113, "step": 507 }, { "completion_length": 125.86849367618561, "epoch": 0.6758323670697366, "grad_norm": 31.712203979492188, "kl": 2.9173990885416665, "learning_rate": 5.7388532789994476e-06, "loss": 0.1167, "reward": 1.2304687934617202, "reward_std": 0.23319136871335408, "rewards/accuracy_reward": 0.27473959159882116, "rewards/format_reward": 0.9557291840513548, "step": 510 }, { "completion_length": 108.07986442248027, "epoch": 0.6798078515819116, "grad_norm": 19.656137466430664, "kl": 2.4375, "learning_rate": 5.613512296104663e-06, "loss": 0.0974, "reward": 1.2282986529171467, "reward_std": 0.1991276788370063, "rewards/accuracy_reward": 0.26388889578326297, "rewards/format_reward": 0.964409738779068, "step": 513 }, { "completion_length": 120.73264233271281, "epoch": 0.6837833360940865, "grad_norm": 9.04715633392334, "kl": 1.5470377604166667, "learning_rate": 5.489018916804813e-06, "loss": 0.0619, "reward": 1.281250045945247, "reward_std": 0.22288222153050205, "rewards/accuracy_reward": 0.3198784813284874, "rewards/format_reward": 0.9613715497155985, "step": 516 }, { "completion_length": 113.39974268277486, "epoch": 0.6877588206062614, "grad_norm": 2.3152172565460205, "kl": 1.3323567708333333, "learning_rate": 5.365397197028686e-06, "loss": 0.0533, "reward": 1.2721354613701503, "reward_std": 0.19468989650097987, "rewards/accuracy_reward": 0.30338542551423114, "rewards/format_reward": 0.968750017384688, "step": 519 }, { "completion_length": 143.59983134269714, "epoch": 0.6917343051184363, "grad_norm": 12.121291160583496, "kl": 1.47412109375, "learning_rate": 5.242671024273798e-06, "loss": 0.059, "reward": 1.2456597636143367, "reward_std": 0.25431135304582614, "rewards/accuracy_reward": 0.29296875729536015, "rewards/format_reward": 0.9526909900208315, "step": 522 }, { "completion_length": 121.45529794692993, "epoch": 0.6957097896306113, "grad_norm": 3.243786334991455, "kl": 1.41357421875, "learning_rate": 5.120864112990569e-06, "loss": 0.0566, "reward": 1.2443576753139496, "reward_std": 0.20600120699964464, "rewards/accuracy_reward": 0.28125000714013976, "rewards/format_reward": 0.9631076554457346, "step": 525 }, { "completion_length": 141.91970892747244, "epoch": 0.6996852741427861, "grad_norm": 6.4455437660217285, "kl": 1.6841634114583333, "learning_rate": 5.000000000000003e-06, "loss": 0.0674, "reward": 1.2152778096497059, "reward_std": 0.23748167790472507, "rewards/accuracy_reward": 0.2604166748933494, "rewards/format_reward": 0.9548611342906952, "step": 528 }, { "completion_length": 145.10503919919333, "epoch": 0.703660758654961, "grad_norm": 4.077893257141113, "kl": 1.8653971354166667, "learning_rate": 4.880102039945625e-06, "loss": 0.0746, "reward": 1.2673611491918564, "reward_std": 0.22801773723525307, "rewards/accuracy_reward": 0.3116319513646886, "rewards/format_reward": 0.9557291865348816, "step": 531 }, { "completion_length": 153.23568006356558, "epoch": 0.707636243167136, "grad_norm": 2.3837387561798096, "kl": 1.5, "learning_rate": 4.761193400780667e-06, "loss": 0.06, "reward": 1.25694448625048, "reward_std": 0.24314528051763773, "rewards/accuracy_reward": 0.2999132027228673, "rewards/format_reward": 0.9570312711099783, "step": 534 }, { "completion_length": 167.8263931274414, "epoch": 0.7116117276793109, "grad_norm": 4.202811241149902, "kl": 2.5416666666666665, "learning_rate": 4.643297059291303e-06, "loss": 0.1017, "reward": 1.2248264277974765, "reward_std": 0.27370192063972354, "rewards/accuracy_reward": 0.27604167559184134, "rewards/format_reward": 0.9487847425043583, "step": 537 }, { "completion_length": 182.7339456876119, "epoch": 0.7155872121914858, "grad_norm": 14.95860481262207, "kl": 2.8761393229166665, "learning_rate": 4.52643579665683e-06, "loss": 0.1151, "reward": 1.2421875409781933, "reward_std": 0.25901925152478117, "rewards/accuracy_reward": 0.2981770924137284, "rewards/format_reward": 0.9440104328095913, "step": 540 }, { "completion_length": 154.5321224530538, "epoch": 0.7195626967036608, "grad_norm": 3.4772720336914062, "kl": 2.1795247395833335, "learning_rate": 4.410632194047652e-06, "loss": 0.0872, "reward": 1.2413194812834263, "reward_std": 0.2310507068565736, "rewards/accuracy_reward": 0.2834201504010707, "rewards/format_reward": 0.9578993258376917, "step": 543 }, { "completion_length": 183.41537022590637, "epoch": 0.7235381812158357, "grad_norm": 5.352535247802734, "kl": 2.7540690104166665, "learning_rate": 4.29590862826191e-06, "loss": 0.1102, "reward": 1.2369792026778061, "reward_std": 0.24117931607179344, "rewards/accuracy_reward": 0.2934027863666415, "rewards/format_reward": 0.9435764091710249, "step": 546 }, { "completion_length": 169.5039111773173, "epoch": 0.7275136657280106, "grad_norm": 2.4602179527282715, "kl": 1.9309895833333333, "learning_rate": 4.182287267401587e-06, "loss": 0.0772, "reward": 1.2404514315227668, "reward_std": 0.22196716310766837, "rewards/accuracy_reward": 0.2877604255530362, "rewards/format_reward": 0.9526909925043583, "step": 549 }, { "completion_length": 147.87847610314688, "epoch": 0.7314891502401856, "grad_norm": 3.658947229385376, "kl": 2.431640625, "learning_rate": 4.069790066588966e-06, "loss": 0.0972, "reward": 1.3168403158585231, "reward_std": 0.23325985188906392, "rewards/accuracy_reward": 0.3589409807464108, "rewards/format_reward": 0.9578993233541647, "step": 552 }, { "completion_length": 148.14410118261972, "epoch": 0.7354646347523605, "grad_norm": 2.160740613937378, "kl": 1.542236328125, "learning_rate": 3.9584387637242235e-06, "loss": 0.0617, "reward": 1.2235243457059066, "reward_std": 0.23132954825026295, "rewards/accuracy_reward": 0.2647569504721711, "rewards/format_reward": 0.9587673780818781, "step": 555 }, { "completion_length": 169.01172391573587, "epoch": 0.7394401192645353, "grad_norm": 13.349943161010742, "kl": 2.2926432291666665, "learning_rate": 3.848254875285e-06, "loss": 0.0917, "reward": 1.197048647950093, "reward_std": 0.2103662300699701, "rewards/accuracy_reward": 0.2460937569849193, "rewards/format_reward": 0.9509548842906952, "step": 558 }, { "completion_length": 157.60981353123984, "epoch": 0.7434156037767102, "grad_norm": 2.4742820262908936, "kl": 1.6537272135416667, "learning_rate": 3.739259692168764e-06, "loss": 0.0662, "reward": 1.229166705161333, "reward_std": 0.2528001538012177, "rewards/accuracy_reward": 0.27560764621011913, "rewards/format_reward": 0.9535590459903082, "step": 561 }, { "completion_length": 154.8311678568522, "epoch": 0.7473910882888852, "grad_norm": 1.9129363298416138, "kl": 1.6079915364583333, "learning_rate": 3.6314742755787537e-06, "loss": 0.0643, "reward": 1.2261285024384658, "reward_std": 0.22889205797885856, "rewards/accuracy_reward": 0.27300348059119034, "rewards/format_reward": 0.9531250235935053, "step": 564 }, { "completion_length": 151.18403148651123, "epoch": 0.7513665728010601, "grad_norm": 1.2694976329803467, "kl": 1.6017252604166667, "learning_rate": 3.524919452954314e-06, "loss": 0.064, "reward": 1.2248264377315838, "reward_std": 0.23556000289196768, "rewards/accuracy_reward": 0.26692709055108327, "rewards/format_reward": 0.9578993320465088, "step": 567 }, { "completion_length": 154.80859859784445, "epoch": 0.755342057313235, "grad_norm": 1.4956895112991333, "kl": 1.427734375, "learning_rate": 3.419615813946392e-06, "loss": 0.0571, "reward": 1.1974826753139496, "reward_std": 0.23270095341528454, "rewards/accuracy_reward": 0.23784722872854522, "rewards/format_reward": 0.9596354352931181, "step": 570 }, { "completion_length": 169.7625904083252, "epoch": 0.75931754182541, "grad_norm": 0.8634160161018372, "kl": 1.3221028645833333, "learning_rate": 3.315583706438994e-06, "loss": 0.0529, "reward": 1.2278646193444729, "reward_std": 0.24653864566547176, "rewards/accuracy_reward": 0.27734375892517465, "rewards/format_reward": 0.9505208519597849, "step": 573 }, { "completion_length": 157.0377644697825, "epoch": 0.7632930263375849, "grad_norm": 5.6404571533203125, "kl": 1.3050944010416667, "learning_rate": 3.212843232617343e-06, "loss": 0.0522, "reward": 1.2322048942248027, "reward_std": 0.21434197838728627, "rewards/accuracy_reward": 0.2738715362114211, "rewards/format_reward": 0.9583333519597849, "step": 576 }, { "completion_length": 149.35286871592203, "epoch": 0.7672685108497598, "grad_norm": 1.4161450862884521, "kl": 1.1617838541666667, "learning_rate": 3.1114142450835296e-06, "loss": 0.0465, "reward": 1.2365451728304226, "reward_std": 0.2225903740618378, "rewards/accuracy_reward": 0.2730034806688006, "rewards/format_reward": 0.9635416815678278, "step": 579 }, { "completion_length": 164.2899361451467, "epoch": 0.7712439953619348, "grad_norm": 2.1099228858947754, "kl": 1.1534016927083333, "learning_rate": 3.0113163430203775e-06, "loss": 0.0461, "reward": 1.2417535074055195, "reward_std": 0.2334075498705109, "rewards/accuracy_reward": 0.2873263977235183, "rewards/format_reward": 0.9544271044433117, "step": 582 }, { "completion_length": 166.99479564030966, "epoch": 0.7752194798741097, "grad_norm": 0.6481562256813049, "kl": 1.1204427083333333, "learning_rate": 2.912568868404284e-06, "loss": 0.0448, "reward": 1.2539062947034836, "reward_std": 0.2460917371014754, "rewards/accuracy_reward": 0.2977430645842105, "rewards/format_reward": 0.9561632151405016, "step": 585 }, { "completion_length": 142.4097265402476, "epoch": 0.7791949643862845, "grad_norm": 0.6822313070297241, "kl": 1.0983072916666667, "learning_rate": 2.815190902267757e-06, "loss": 0.0439, "reward": 1.2465278084079425, "reward_std": 0.21192065292658904, "rewards/accuracy_reward": 0.27864584055108327, "rewards/format_reward": 0.9678819651405016, "step": 588 }, { "completion_length": 172.7829921245575, "epoch": 0.7831704488984595, "grad_norm": 1.076019525527954, "kl": 1.0126953125, "learning_rate": 2.7192012610123777e-06, "loss": 0.0405, "reward": 1.2855903171002865, "reward_std": 0.22700861329212785, "rewards/accuracy_reward": 0.3285590385397275, "rewards/format_reward": 0.957031267384688, "step": 591 }, { "completion_length": 179.41103037198386, "epoch": 0.7871459334106344, "grad_norm": 0.7347291707992554, "kl": 1.072265625, "learning_rate": 2.6246184927728913e-06, "loss": 0.0429, "reward": 1.2395833755532901, "reward_std": 0.22892415950385234, "rewards/accuracy_reward": 0.2808159793494269, "rewards/format_reward": 0.9587673818071684, "step": 594 }, { "completion_length": 190.89887682596842, "epoch": 0.7911214179228093, "grad_norm": 1.5825515985488892, "kl": 1.0482584635416667, "learning_rate": 2.5314608738331535e-06, "loss": 0.0419, "reward": 1.2456597586472828, "reward_std": 0.24149028413618603, "rewards/accuracy_reward": 0.29427084256894886, "rewards/format_reward": 0.9513889116545519, "step": 597 }, { "completion_length": 175.54731305440268, "epoch": 0.7950969024349842, "grad_norm": 0.6331008672714233, "kl": 1.0225423177083333, "learning_rate": 2.4397464050945753e-06, "loss": 0.0409, "reward": 1.2434896143774192, "reward_std": 0.2312415634126713, "rewards/accuracy_reward": 0.2873263991593073, "rewards/format_reward": 0.9561632089316845, "step": 600 }, { "epoch": 0.7950969024349842, "eval_completion_length": 166.41919604214755, "eval_kl": 1.0108901515151516, "eval_loss": 0.040223389863967896, "eval_reward": 1.2853535666610256, "eval_reward_std": 0.22720548510551453, "eval_rewards/accuracy_reward": 0.32449495679501333, "eval_rewards/format_reward": 0.9608586051247336, "eval_runtime": 432.7167, "eval_samples_per_second": 0.229, "eval_steps_per_second": 0.012, "step": 600 }, { "completion_length": 197.4709267616272, "epoch": 0.7990723869471592, "grad_norm": 3.598181962966919, "kl": 1.1375325520833333, "learning_rate": 2.3494928085978073e-06, "loss": 0.0455, "reward": 1.241319480041663, "reward_std": 0.2442009438915799, "rewards/accuracy_reward": 0.2921006998512894, "rewards/format_reward": 0.949218769868215, "step": 603 }, { "completion_length": 171.99479659398398, "epoch": 0.8030478714593341, "grad_norm": 1.0907797813415527, "kl": 0.995849609375, "learning_rate": 2.2607175240983027e-06, "loss": 0.0399, "reward": 1.2322048917412758, "reward_std": 0.2378006634923319, "rewards/accuracy_reward": 0.2786458421420927, "rewards/format_reward": 0.9535590509573618, "step": 606 }, { "completion_length": 151.64627146720886, "epoch": 0.807023355971509, "grad_norm": 15.526721954345703, "kl": 1.0755208333333333, "learning_rate": 2.1734377056964175e-06, "loss": 0.043, "reward": 1.2387153183420498, "reward_std": 0.21919091992701092, "rewards/accuracy_reward": 0.2721354246993239, "rewards/format_reward": 0.9665798768401146, "step": 609 }, { "completion_length": 180.01128919919333, "epoch": 0.810998840483684, "grad_norm": 0.6363082528114319, "kl": 1.0391438802083333, "learning_rate": 2.087670218522714e-06, "loss": 0.0416, "reward": 1.210069477558136, "reward_std": 0.2583714901314427, "rewards/accuracy_reward": 0.25824653725915897, "rewards/format_reward": 0.9518229365348816, "step": 612 }, { "completion_length": 134.2313413619995, "epoch": 0.8149743249958589, "grad_norm": 0.5771492123603821, "kl": 0.9965006510416666, "learning_rate": 2.0034316354791062e-06, "loss": 0.0398, "reward": 1.2651910136143367, "reward_std": 0.1995284124277532, "rewards/accuracy_reward": 0.2968750091968104, "rewards/format_reward": 0.9683159900208315, "step": 615 }, { "completion_length": 145.0091195901235, "epoch": 0.8189498095080338, "grad_norm": 2.458407163619995, "kl": 1.0079752604166667, "learning_rate": 1.920738234036463e-06, "loss": 0.0403, "reward": 1.28298615415891, "reward_std": 0.21225994320896766, "rewards/accuracy_reward": 0.3133680646618207, "rewards/format_reward": 0.9696180721124014, "step": 618 }, { "completion_length": 157.43186203638712, "epoch": 0.8229252940202088, "grad_norm": 0.5727205276489258, "kl": 1.0079752604166667, "learning_rate": 1.8396059930893073e-06, "loss": 0.0403, "reward": 1.2170139253139496, "reward_std": 0.21727207908406854, "rewards/accuracy_reward": 0.25130209086152416, "rewards/format_reward": 0.9657118258376917, "step": 621 }, { "completion_length": 156.4114625453949, "epoch": 0.8269007785323836, "grad_norm": 0.5636825561523438, "kl": 0.9955240885416666, "learning_rate": 1.7600505898681996e-06, "loss": 0.0398, "reward": 1.2300347586472828, "reward_std": 0.23207383013019958, "rewards/accuracy_reward": 0.26649306435137987, "rewards/format_reward": 0.9635416877766451, "step": 624 }, { "completion_length": 154.10373576482138, "epoch": 0.8308762630445585, "grad_norm": 3.9658546447753906, "kl": 1.0211588541666667, "learning_rate": 1.6820873969104223e-06, "loss": 0.0409, "reward": 1.2500000434617202, "reward_std": 0.22843335390401384, "rewards/accuracy_reward": 0.2808159825702508, "rewards/format_reward": 0.9691840459903082, "step": 627 }, { "completion_length": 179.8810822168986, "epoch": 0.8348517475567335, "grad_norm": 0.8975684642791748, "kl": 0.9754231770833334, "learning_rate": 1.605731479089534e-06, "loss": 0.039, "reward": 1.270833384245634, "reward_std": 0.2560514376188318, "rewards/accuracy_reward": 0.3168402878024305, "rewards/format_reward": 0.9539930745959282, "step": 630 }, { "completion_length": 191.86068240801492, "epoch": 0.8388272320689084, "grad_norm": 1.0552254915237427, "kl": 1.05908203125, "learning_rate": 1.530997590704375e-06, "loss": 0.0424, "reward": 1.223524338255326, "reward_std": 0.24293402349576354, "rewards/accuracy_reward": 0.2669270930734153, "rewards/format_reward": 0.9565972437461218, "step": 633 }, { "completion_length": 201.91189877192178, "epoch": 0.8428027165810833, "grad_norm": 1.9689509868621826, "kl": 1.109130859375, "learning_rate": 1.4579001726280828e-06, "loss": 0.0444, "reward": 1.2560764352480571, "reward_std": 0.24747123545967042, "rewards/accuracy_reward": 0.299479175475426, "rewards/format_reward": 0.9565972400208315, "step": 636 }, { "completion_length": 201.5238777001699, "epoch": 0.8467782010932582, "grad_norm": 0.8104230761528015, "kl": 1.052734375, "learning_rate": 1.386453349517679e-06, "loss": 0.0421, "reward": 1.2391493332882721, "reward_std": 0.24252263192708293, "rewards/accuracy_reward": 0.281684036909913, "rewards/format_reward": 0.9574652922650179, "step": 639 }, { "completion_length": 170.00868566830954, "epoch": 0.8507536856054332, "grad_norm": 0.7141380310058594, "kl": 0.9737955729166666, "learning_rate": 1.316670927084751e-06, "loss": 0.039, "reward": 1.2630208705862362, "reward_std": 0.23810221177215377, "rewards/accuracy_reward": 0.29817709152121097, "rewards/format_reward": 0.9648437735935053, "step": 642 }, { "completion_length": 187.43533500035605, "epoch": 0.8547291701176081, "grad_norm": 0.9845206141471863, "kl": 1.1171061197916667, "learning_rate": 1.2485663894277611e-06, "loss": 0.0447, "reward": 1.2730035160978634, "reward_std": 0.216334043458725, "rewards/accuracy_reward": 0.3094618124887347, "rewards/format_reward": 0.9635416865348816, "step": 645 }, { "completion_length": 169.31076955795288, "epoch": 0.858704654629783, "grad_norm": 0.9538066387176514, "kl": 0.978271484375, "learning_rate": 1.182152896426515e-06, "loss": 0.0391, "reward": 1.281250037252903, "reward_std": 0.24271480288977423, "rewards/accuracy_reward": 0.31597222907779116, "rewards/format_reward": 0.9652777972320715, "step": 648 }, { "completion_length": 184.18012634913126, "epoch": 0.862680139141958, "grad_norm": 0.5063804388046265, "kl": 0.9745279947916666, "learning_rate": 1.1174432811992686e-06, "loss": 0.039, "reward": 1.24609378973643, "reward_std": 0.21818942956936857, "rewards/accuracy_reward": 0.2821180628379807, "rewards/format_reward": 0.9639757126569748, "step": 651 }, { "completion_length": 165.3055603504181, "epoch": 0.8666556236541328, "grad_norm": 0.6727854013442993, "kl": 0.9583333333333334, "learning_rate": 1.0544500476229713e-06, "loss": 0.0383, "reward": 1.2573785086472828, "reward_std": 0.22620403526040414, "rewards/accuracy_reward": 0.29427084035705775, "rewards/format_reward": 0.9631076554457346, "step": 654 }, { "completion_length": 186.19965728123984, "epoch": 0.8706311081663077, "grad_norm": 0.6164532899856567, "kl": 1.0279134114583333, "learning_rate": 9.931853679171377e-07, "loss": 0.0411, "reward": 1.2439236516753833, "reward_std": 0.24075799800145128, "rewards/accuracy_reward": 0.28559028551292914, "rewards/format_reward": 0.9583333507180214, "step": 657 }, { "completion_length": 177.0638066927592, "epoch": 0.8746065926784827, "grad_norm": 0.6313008666038513, "kl": 1.0465494791666667, "learning_rate": 9.336610802918044e-07, "loss": 0.0419, "reward": 1.2708333631356556, "reward_std": 0.20328321517445147, "rewards/accuracy_reward": 0.3051215368323028, "rewards/format_reward": 0.9657118245959282, "step": 660 }, { "completion_length": 178.04774816830954, "epoch": 0.8785820771906576, "grad_norm": 0.5517924427986145, "kl": 1.0804036458333333, "learning_rate": 8.758886866600258e-07, "loss": 0.0433, "reward": 1.3003472636143367, "reward_std": 0.20480242053357264, "rewards/accuracy_reward": 0.33203125970127684, "rewards/format_reward": 0.9683159875373045, "step": 663 }, { "completion_length": 184.22309557596842, "epoch": 0.8825575617028325, "grad_norm": 1.6948155164718628, "kl": 0.9346516927083334, "learning_rate": 8.198793504153491e-07, "loss": 0.0374, "reward": 1.2834201827645302, "reward_std": 0.22442288471696278, "rewards/accuracy_reward": 0.31770834152121097, "rewards/format_reward": 0.9657118221124014, "step": 666 }, { "completion_length": 163.08811235427856, "epoch": 0.8865330462150075, "grad_norm": 0.5778855085372925, "kl": 0.9193522135416666, "learning_rate": 7.656438942747057e-07, "loss": 0.0368, "reward": 1.27039934694767, "reward_std": 0.1949684239613513, "rewards/accuracy_reward": 0.2973090352024883, "rewards/format_reward": 0.9730902947485447, "step": 669 }, { "completion_length": 190.82596119244894, "epoch": 0.8905085307271824, "grad_norm": 0.6843112111091614, "kl": 1.0071614583333333, "learning_rate": 7.131927981871345e-07, "loss": 0.0403, "reward": 1.2348090757926304, "reward_std": 0.22979943679335216, "rewards/accuracy_reward": 0.27213542349636555, "rewards/format_reward": 0.9626736293236414, "step": 672 }, { "completion_length": 180.4761331876119, "epoch": 0.8944840152393573, "grad_norm": 1.2002581357955933, "kl": 0.9956868489583334, "learning_rate": 6.625361973087363e-07, "loss": 0.0398, "reward": 1.267361156642437, "reward_std": 0.20884954005790254, "rewards/accuracy_reward": 0.2999132029945031, "rewards/format_reward": 0.9674479303260645, "step": 675 }, { "completion_length": 171.35373767217, "epoch": 0.8984594997515322, "grad_norm": 0.5270951390266418, "kl": 0.9773763020833334, "learning_rate": 6.136838800442457e-07, "loss": 0.0391, "reward": 1.2855903183420498, "reward_std": 0.19845290334584811, "rewards/accuracy_reward": 0.3168402863666415, "rewards/format_reward": 0.9687500186264515, "step": 678 }, { "completion_length": 190.48004015286764, "epoch": 0.9024349842637072, "grad_norm": 0.8527917861938477, "kl": 0.9973958333333334, "learning_rate": 5.66645286155616e-07, "loss": 0.0399, "reward": 1.2916667014360428, "reward_std": 0.2291031815111637, "rewards/accuracy_reward": 0.32986112144620466, "rewards/format_reward": 0.9618055721124014, "step": 681 }, { "completion_length": 189.20226113001505, "epoch": 0.906410468775882, "grad_norm": 9.596158981323242, "kl": 1.0517578125, "learning_rate": 5.214295049379658e-07, "loss": 0.0421, "reward": 1.2582465782761574, "reward_std": 0.22187859937548637, "rewards/accuracy_reward": 0.2938368134200573, "rewards/format_reward": 0.9644097400208315, "step": 684 }, { "completion_length": 187.68099466959634, "epoch": 0.9103859532880569, "grad_norm": 0.6961022615432739, "kl": 0.9669596354166666, "learning_rate": 4.780452734632524e-07, "loss": 0.0387, "reward": 1.2760417039195697, "reward_std": 0.22566887092155716, "rewards/accuracy_reward": 0.31163195543922484, "rewards/format_reward": 0.9644097425043583, "step": 687 }, { "completion_length": 181.89453570048013, "epoch": 0.9143614378002319, "grad_norm": 0.5587486028671265, "kl": 0.9386393229166666, "learning_rate": 4.3650097489200125e-07, "loss": 0.0376, "reward": 1.2834201777974765, "reward_std": 0.21305101970210671, "rewards/accuracy_reward": 0.3146701470638315, "rewards/format_reward": 0.9687500161429247, "step": 690 }, { "completion_length": 186.9974012374878, "epoch": 0.9183369223124068, "grad_norm": 0.603073000907898, "kl": 0.977783203125, "learning_rate": 3.9680463685342173e-07, "loss": 0.0391, "reward": 1.3268229588866234, "reward_std": 0.22527993516996503, "rewards/accuracy_reward": 0.36154514946974814, "rewards/format_reward": 0.9652777935067812, "step": 693 }, { "completion_length": 199.9023496309916, "epoch": 0.9223124068245817, "grad_norm": 0.49448880553245544, "kl": 0.979736328125, "learning_rate": 3.589639298942238e-07, "loss": 0.0392, "reward": 1.276475730041663, "reward_std": 0.2337690940281997, "rewards/accuracy_reward": 0.3138020931510255, "rewards/format_reward": 0.9626736293236414, "step": 696 }, { "completion_length": 179.73394536972046, "epoch": 0.9262878913367567, "grad_norm": 3.9789516925811768, "kl": 0.976318359375, "learning_rate": 3.2298616599643285e-07, "loss": 0.0391, "reward": 1.278211849431197, "reward_std": 0.1972268489189446, "rewards/accuracy_reward": 0.3103298700880259, "rewards/format_reward": 0.9678819626569748, "step": 699 }, { "epoch": 0.927613052840815, "eval_completion_length": 198.22194461433256, "eval_kl": 1.0224011479591837, "eval_loss": 0.039721183478832245, "eval_reward": 1.2755102442235362, "eval_reward_std": 0.2398411301629884, "eval_rewards/accuracy_reward": 0.31972789886046427, "eval_rewards/format_reward": 0.9557823356317015, "eval_runtime": 434.6419, "eval_samples_per_second": 0.228, "eval_steps_per_second": 0.012, "step": 700 }, { "completion_length": 181.44922268390656, "epoch": 0.9302633758489316, "grad_norm": 0.6522932648658752, "kl": 0.9718017578125, "learning_rate": 2.8887829716449877e-07, "loss": 0.0401, "reward": 1.2643229570239782, "reward_std": 0.22105206700507551, "rewards/accuracy_reward": 0.30013021564809605, "rewards/format_reward": 0.9641927294433117, "step": 702 }, { "completion_length": 193.3624184926351, "epoch": 0.9342388603611065, "grad_norm": 0.6112500429153442, "kl": 0.9839680989583334, "learning_rate": 2.5664691408194164e-07, "loss": 0.0394, "reward": 1.2582465621332328, "reward_std": 0.23564991471357644, "rewards/accuracy_reward": 0.2955729262127231, "rewards/format_reward": 0.9626736280818781, "step": 705 }, { "completion_length": 220.39193407694498, "epoch": 0.9382143448732815, "grad_norm": 0.631359875202179, "kl": 1.0447591145833333, "learning_rate": 2.262982448378437e-07, "loss": 0.0418, "reward": 1.2782118432223797, "reward_std": 0.2571307167721291, "rewards/accuracy_reward": 0.32291667396202683, "rewards/format_reward": 0.9552951566874981, "step": 708 }, { "completion_length": 208.5000058809916, "epoch": 0.9421898293854564, "grad_norm": 0.6516295075416565, "kl": 1.075927734375, "learning_rate": 1.9783815372338422e-07, "loss": 0.043, "reward": 1.2669271230697632, "reward_std": 0.26777021974946064, "rewards/accuracy_reward": 0.31597223101804656, "rewards/format_reward": 0.9509548805654049, "step": 711 }, { "completion_length": 190.33203514417013, "epoch": 0.9461653138976313, "grad_norm": 0.8081660866737366, "kl": 0.9834798177083334, "learning_rate": 1.7127214009868387e-07, "loss": 0.0393, "reward": 1.3146701827645302, "reward_std": 0.22434815554879606, "rewards/accuracy_reward": 0.350694455128784, "rewards/format_reward": 0.9639757138987383, "step": 714 }, { "completion_length": 191.1718815167745, "epoch": 0.9501407984098061, "grad_norm": 0.6136611700057983, "kl": 0.9658203125, "learning_rate": 1.4660533733015236e-07, "loss": 0.0386, "reward": 1.2361111529171467, "reward_std": 0.2575679953054835, "rewards/accuracy_reward": 0.2760416753590107, "rewards/format_reward": 0.9600694651405016, "step": 717 }, { "completion_length": 199.0850751399994, "epoch": 0.9541162829219811, "grad_norm": 2.2742247581481934, "kl": 1.0128580729166667, "learning_rate": 1.2384251179857642e-07, "loss": 0.0405, "reward": 1.2573785086472828, "reward_std": 0.23815507961747548, "rewards/accuracy_reward": 0.2964409807464108, "rewards/format_reward": 0.9609375149011612, "step": 720 }, { "completion_length": 180.6779546737671, "epoch": 0.958091767434156, "grad_norm": 0.546220064163208, "kl": 0.9303385416666666, "learning_rate": 1.0298806197809985e-07, "loss": 0.0372, "reward": 1.2834201827645302, "reward_std": 0.23479281645268202, "rewards/accuracy_reward": 0.32031251102065045, "rewards/format_reward": 0.9631076616545519, "step": 723 }, { "completion_length": 189.55859804153442, "epoch": 0.9620672519463309, "grad_norm": 0.5562774538993835, "kl": 0.964111328125, "learning_rate": 8.404601758630892e-08, "loss": 0.0386, "reward": 1.2556424004336197, "reward_std": 0.23037906123014787, "rewards/accuracy_reward": 0.2955729252814005, "rewards/format_reward": 0.9600694626569748, "step": 726 }, { "completion_length": 202.31163819630942, "epoch": 0.9660427364585059, "grad_norm": 0.6967044472694397, "kl": 0.984130859375, "learning_rate": 6.702003880556418e-08, "loss": 0.0394, "reward": 1.2834201728304226, "reward_std": 0.23515649721957743, "rewards/accuracy_reward": 0.32508681404093903, "rewards/format_reward": 0.9583333507180214, "step": 729 }, { "completion_length": 184.46702075004578, "epoch": 0.9700182209706808, "grad_norm": 1.1955063343048096, "kl": 1.0083821614583333, "learning_rate": 5.191341557574392e-08, "loss": 0.0403, "reward": 1.2309028195838134, "reward_std": 0.22288473897303143, "rewards/accuracy_reward": 0.2695312558983763, "rewards/format_reward": 0.961371548473835, "step": 732 }, { "completion_length": 171.00391014417013, "epoch": 0.9739937054828557, "grad_norm": 3.6898951530456543, "kl": 0.980224609375, "learning_rate": 3.872906695852607e-08, "loss": 0.0392, "reward": 1.2894965633749962, "reward_std": 0.22550825821235776, "rewards/accuracy_reward": 0.3198784821821998, "rewards/format_reward": 0.969618077079455, "step": 735 }, { "completion_length": 181.51215736071268, "epoch": 0.9779691899950307, "grad_norm": 0.8761662244796753, "kl": 0.9816080729166666, "learning_rate": 2.746954057333606e-08, "loss": 0.0393, "reward": 1.2582465695838134, "reward_std": 0.23283367223727205, "rewards/accuracy_reward": 0.2925347340060398, "rewards/format_reward": 0.9657118221124014, "step": 738 }, { "completion_length": 166.68663636843362, "epoch": 0.9819446745072056, "grad_norm": 0.48325055837631226, "kl": 0.996337890625, "learning_rate": 1.8137012105069464e-08, "loss": 0.0398, "reward": 1.263454897950093, "reward_std": 0.20216705913965902, "rewards/accuracy_reward": 0.29383681442899007, "rewards/format_reward": 0.9696180733541647, "step": 741 }, { "completion_length": 182.96528228123984, "epoch": 0.9859201590193805, "grad_norm": 0.55852872133255, "kl": 0.9632161458333334, "learning_rate": 1.0733284883682748e-08, "loss": 0.0385, "reward": 1.2773437909781933, "reward_std": 0.2304229981576403, "rewards/accuracy_reward": 0.31032986772091437, "rewards/format_reward": 0.9670139066874981, "step": 744 }, { "completion_length": 200.19835631052652, "epoch": 0.9898956435315555, "grad_norm": 4.7552056312561035, "kl": 1.1190592447916667, "learning_rate": 5.25978953573536e-09, "loss": 0.0447, "reward": 1.2855903195838134, "reward_std": 0.2617647792988767, "rewards/accuracy_reward": 0.3268229243112728, "rewards/format_reward": 0.9587673805654049, "step": 747 }, { "completion_length": 192.70313183466592, "epoch": 0.9938711280437303, "grad_norm": 0.49542155861854553, "kl": 0.9990234375, "learning_rate": 1.7175837079452806e-09, "loss": 0.04, "reward": 1.2573785160978634, "reward_std": 0.21775838693914315, "rewards/accuracy_reward": 0.2960069530721133, "rewards/format_reward": 0.9613715497155985, "step": 750 }, { "completion_length": 200.46571826934814, "epoch": 0.9978466125559052, "grad_norm": 0.8950777053833008, "kl": 1.0421549479166667, "learning_rate": 1.0735186282695431e-10, "loss": 0.0417, "reward": 1.2717014340062935, "reward_std": 0.2556659254866342, "rewards/accuracy_reward": 0.31206598148370784, "rewards/format_reward": 0.9596354328095913, "step": 753 }, { "completion_length": 211.18359994888306, "epoch": 0.9991717740599636, "kl": 0.985107421875, "reward": 1.2708333656191826, "reward_std": 0.28282210882753134, "rewards/accuracy_reward": 0.31250000838190317, "rewards/format_reward": 0.9583333544433117, "step": 754, "total_flos": 0.0, "train_loss": 1.286716509427883, "train_runtime": 229250.8975, "train_samples_per_second": 0.316, "train_steps_per_second": 0.003 } ], "logging_steps": 3, "max_steps": 754, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }