|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9991717740599636, |
|
"eval_steps": 100, |
|
"global_step": 754, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 1843.4614156087239, |
|
"epoch": 0.003975484512174921, |
|
"grad_norm": 0.08699692785739899, |
|
"kl": 5.9882799784342446e-05, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.0, |
|
"reward": 0.20442708857202282, |
|
"reward_std": 0.18642982677556574, |
|
"rewards/accuracy_reward": 0.1879340319816644, |
|
"rewards/format_reward": 0.0164930559694767, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 1872.7357126871746, |
|
"epoch": 0.007950969024349842, |
|
"grad_norm": 0.08034715801477432, |
|
"kl": 0.00017563501993815103, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.0, |
|
"reward": 0.16623264349376163, |
|
"reward_std": 0.166806096288686, |
|
"rewards/accuracy_reward": 0.15190972640023878, |
|
"rewards/format_reward": 0.014322916977107525, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 1840.263064066569, |
|
"epoch": 0.011926453536524764, |
|
"grad_norm": 0.07688009738922119, |
|
"kl": 0.00018552939097086588, |
|
"learning_rate": 2.368421052631579e-06, |
|
"loss": 0.0, |
|
"reward": 0.22352431349766752, |
|
"reward_std": 0.17657933492834368, |
|
"rewards/accuracy_reward": 0.20355903388311467, |
|
"rewards/format_reward": 0.019965278489204746, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 1760.6285069783528, |
|
"epoch": 0.015901938048699684, |
|
"grad_norm": 0.1042766273021698, |
|
"kl": 0.00028959910074869793, |
|
"learning_rate": 3.157894736842105e-06, |
|
"loss": 0.0, |
|
"reward": 0.2647569526452571, |
|
"reward_std": 0.219600356494387, |
|
"rewards/accuracy_reward": 0.2326388960548987, |
|
"rewards/format_reward": 0.032118056512748204, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 1722.3082021077473, |
|
"epoch": 0.019877422560874606, |
|
"grad_norm": 0.3040783703327179, |
|
"kl": 0.0018845796585083008, |
|
"learning_rate": 3.947368421052632e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3059895912495752, |
|
"reward_std": 0.23623824515379965, |
|
"rewards/accuracy_reward": 0.23784722954345247, |
|
"rewards/format_reward": 0.06814236252103001, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 845.952714920044, |
|
"epoch": 0.023852907073049528, |
|
"grad_norm": 1.3506791591644287, |
|
"kl": 0.3431205749511719, |
|
"learning_rate": 4.736842105263158e-06, |
|
"loss": 0.0137, |
|
"reward": 0.6740451576188207, |
|
"reward_std": 0.2776922438448916, |
|
"rewards/accuracy_reward": 0.1336805592291057, |
|
"rewards/format_reward": 0.5403645950524757, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 66.15755401055019, |
|
"epoch": 0.02782839158522445, |
|
"grad_norm": 1.3607665300369263, |
|
"kl": 0.6993815104166666, |
|
"learning_rate": 5.526315789473685e-06, |
|
"loss": 0.028, |
|
"reward": 0.9774305758376917, |
|
"reward_std": 0.09281354808869462, |
|
"rewards/accuracy_reward": 0.018663194845430553, |
|
"rewards/format_reward": 0.9587673805654049, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 118.68446455399196, |
|
"epoch": 0.03180387609739937, |
|
"grad_norm": 0.8041670322418213, |
|
"kl": 0.53955078125, |
|
"learning_rate": 6.31578947368421e-06, |
|
"loss": 0.0216, |
|
"reward": 0.9665798830489317, |
|
"reward_std": 0.1714695317981144, |
|
"rewards/accuracy_reward": 0.04600694558272759, |
|
"rewards/format_reward": 0.9205729328095913, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 88.47830098867416, |
|
"epoch": 0.03577936060957429, |
|
"grad_norm": 0.8617585301399231, |
|
"kl": 0.59326171875, |
|
"learning_rate": 7.1052631578947375e-06, |
|
"loss": 0.0237, |
|
"reward": 1.0386284987131755, |
|
"reward_std": 0.2134858975186944, |
|
"rewards/accuracy_reward": 0.09592014209677775, |
|
"rewards/format_reward": 0.9427083507180214, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 30.138455788294475, |
|
"epoch": 0.03975484512174921, |
|
"grad_norm": 2.2023513317108154, |
|
"kl": 0.8590494791666666, |
|
"learning_rate": 7.894736842105265e-06, |
|
"loss": 0.0344, |
|
"reward": 1.1623264302810032, |
|
"reward_std": 0.16990292662133774, |
|
"rewards/accuracy_reward": 0.17361111589707434, |
|
"rewards/format_reward": 0.9887152872979641, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 35.736980040868126, |
|
"epoch": 0.043730329633924134, |
|
"grad_norm": 1.1060832738876343, |
|
"kl": 0.8148600260416666, |
|
"learning_rate": 8.68421052631579e-06, |
|
"loss": 0.0326, |
|
"reward": 1.1657986442248027, |
|
"reward_std": 0.17117769015021622, |
|
"rewards/accuracy_reward": 0.18793403407714018, |
|
"rewards/format_reward": 0.9778645994762579, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 35.928820510705314, |
|
"epoch": 0.047705814146099056, |
|
"grad_norm": 1.2442351579666138, |
|
"kl": 0.847412109375, |
|
"learning_rate": 9.473684210526315e-06, |
|
"loss": 0.0339, |
|
"reward": 1.2052951740721862, |
|
"reward_std": 0.16379862558096647, |
|
"rewards/accuracy_reward": 0.22265625485063842, |
|
"rewards/format_reward": 0.9826389029622078, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 70.93489801883698, |
|
"epoch": 0.05168129865827398, |
|
"grad_norm": 1.4330227375030518, |
|
"kl": 0.8037923177083334, |
|
"learning_rate": 1.0263157894736844e-05, |
|
"loss": 0.0321, |
|
"reward": 1.1744792064030964, |
|
"reward_std": 0.22849255722636977, |
|
"rewards/accuracy_reward": 0.22222222892257074, |
|
"rewards/format_reward": 0.9522569614152113, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 81.15885670979817, |
|
"epoch": 0.0556567831704489, |
|
"grad_norm": 1.0737708806991577, |
|
"kl": 0.7923177083333334, |
|
"learning_rate": 1.105263157894737e-05, |
|
"loss": 0.0317, |
|
"reward": 1.14930559694767, |
|
"reward_std": 0.22925202331195274, |
|
"rewards/accuracy_reward": 0.19184028345625848, |
|
"rewards/format_reward": 0.9574652947485447, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 69.16927303870519, |
|
"epoch": 0.05963226768262382, |
|
"grad_norm": 0.8058044910430908, |
|
"kl": 0.8806966145833334, |
|
"learning_rate": 1.1842105263157895e-05, |
|
"loss": 0.0352, |
|
"reward": 1.1675347524384658, |
|
"reward_std": 0.1868902291947355, |
|
"rewards/accuracy_reward": 0.19618056225590408, |
|
"rewards/format_reward": 0.9713541852931181, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 57.069880266984306, |
|
"epoch": 0.06360775219479874, |
|
"grad_norm": 2.353023052215576, |
|
"kl": 0.91162109375, |
|
"learning_rate": 1.263157894736842e-05, |
|
"loss": 0.0365, |
|
"reward": 1.189236156642437, |
|
"reward_std": 0.18126761401072145, |
|
"rewards/accuracy_reward": 0.20095486647915095, |
|
"rewards/format_reward": 0.9882812586923441, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 21.48567771911621, |
|
"epoch": 0.06758323670697367, |
|
"grad_norm": 0.964798629283905, |
|
"kl": 0.8650716145833334, |
|
"learning_rate": 1.3421052631578948e-05, |
|
"loss": 0.0346, |
|
"reward": 1.2161458743115265, |
|
"reward_std": 0.1899353281284372, |
|
"rewards/accuracy_reward": 0.21961806206187853, |
|
"rewards/format_reward": 0.996527781089147, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 53.81597367922465, |
|
"epoch": 0.07155872121914858, |
|
"grad_norm": 1.0664324760437012, |
|
"kl": 0.8111572265625, |
|
"learning_rate": 1.4210526315789475e-05, |
|
"loss": 0.0324, |
|
"reward": 1.162326426555713, |
|
"reward_std": 0.23168744108018777, |
|
"rewards/accuracy_reward": 0.20008681147980192, |
|
"rewards/format_reward": 0.9622395982344946, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 208.98611625035605, |
|
"epoch": 0.07553420573132351, |
|
"grad_norm": 0.8128153085708618, |
|
"kl": 0.6840006510416666, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.0274, |
|
"reward": 1.1306424004336197, |
|
"reward_std": 0.35048759169876575, |
|
"rewards/accuracy_reward": 0.22743056244992962, |
|
"rewards/format_reward": 0.9032118245959282, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 260.03820244471234, |
|
"epoch": 0.07950969024349842, |
|
"grad_norm": 0.5030148029327393, |
|
"kl": 0.599609375, |
|
"learning_rate": 1.578947368421053e-05, |
|
"loss": 0.024, |
|
"reward": 1.168402807166179, |
|
"reward_std": 0.28692516156782705, |
|
"rewards/accuracy_reward": 0.21223958927051476, |
|
"rewards/format_reward": 0.9561632163822651, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 191.35243598620096, |
|
"epoch": 0.08348517475567335, |
|
"grad_norm": 0.5893499851226807, |
|
"kl": 0.6812337239583334, |
|
"learning_rate": 1.6578947368421053e-05, |
|
"loss": 0.0273, |
|
"reward": 1.1432291989525158, |
|
"reward_std": 0.21883391573404273, |
|
"rewards/accuracy_reward": 0.17057292070239782, |
|
"rewards/format_reward": 0.9726562661429247, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 304.99697029590607, |
|
"epoch": 0.08746065926784827, |
|
"grad_norm": 0.7647993564605713, |
|
"kl": 0.6839599609375, |
|
"learning_rate": 1.736842105263158e-05, |
|
"loss": 0.0274, |
|
"reward": 1.1657986467083294, |
|
"reward_std": 0.2664798041029523, |
|
"rewards/accuracy_reward": 0.210069450433366, |
|
"rewards/format_reward": 0.9557291840513548, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 84.33941195408504, |
|
"epoch": 0.0914361437800232, |
|
"grad_norm": 1.6230992078781128, |
|
"kl": 0.8506673177083334, |
|
"learning_rate": 1.8157894736842107e-05, |
|
"loss": 0.034, |
|
"reward": 1.1362847598890464, |
|
"reward_std": 0.21719592344015837, |
|
"rewards/accuracy_reward": 0.17838542186655104, |
|
"rewards/format_reward": 0.9578993221124014, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 47.16927206516266, |
|
"epoch": 0.09541162829219811, |
|
"grad_norm": 0.5694164633750916, |
|
"kl": 0.8841145833333334, |
|
"learning_rate": 1.894736842105263e-05, |
|
"loss": 0.0354, |
|
"reward": 1.1827257337669532, |
|
"reward_std": 0.18139228488629064, |
|
"rewards/accuracy_reward": 0.19487847795244306, |
|
"rewards/format_reward": 0.9878472325702509, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 65.3068592151006, |
|
"epoch": 0.09938711280437303, |
|
"grad_norm": 0.8147668838500977, |
|
"kl": 0.8631184895833334, |
|
"learning_rate": 1.9736842105263158e-05, |
|
"loss": 0.0345, |
|
"reward": 1.2265625409781933, |
|
"reward_std": 0.22970331188601753, |
|
"rewards/accuracy_reward": 0.24826389558923742, |
|
"rewards/format_reward": 0.9782986268401146, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 101.73307637373607, |
|
"epoch": 0.10336259731654796, |
|
"grad_norm": 3.9387757778167725, |
|
"kl": 0.9171549479166666, |
|
"learning_rate": 1.9999570594853575e-05, |
|
"loss": 0.0367, |
|
"reward": 1.1497396255532901, |
|
"reward_std": 0.26099368068389595, |
|
"rewards/accuracy_reward": 0.20920139628772935, |
|
"rewards/format_reward": 0.9405382163822651, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 29.979601462682087, |
|
"epoch": 0.10733808182872287, |
|
"grad_norm": 5.135621547698975, |
|
"kl": 1.5565592447916667, |
|
"learning_rate": 1.9997316318671806e-05, |
|
"loss": 0.0622, |
|
"reward": 1.2044271193444729, |
|
"reward_std": 0.1869426581542939, |
|
"rewards/accuracy_reward": 0.23090278574575981, |
|
"rewards/format_reward": 0.9735243221124014, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 22.930122137069702, |
|
"epoch": 0.1113135663408978, |
|
"grad_norm": 28.183279037475586, |
|
"kl": 1.3277994791666667, |
|
"learning_rate": 1.999313025518698e-05, |
|
"loss": 0.0531, |
|
"reward": 1.1892361504336197, |
|
"reward_std": 0.20981760757664839, |
|
"rewards/accuracy_reward": 0.20876736678959182, |
|
"rewards/format_reward": 0.9804687661429247, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 46.38932470480601, |
|
"epoch": 0.11528905085307271, |
|
"grad_norm": 1.771388053894043, |
|
"kl": 1.13525390625, |
|
"learning_rate": 1.9987013213274594e-05, |
|
"loss": 0.0454, |
|
"reward": 1.2356771156191826, |
|
"reward_std": 0.18450136513759693, |
|
"rewards/accuracy_reward": 0.25737847938823205, |
|
"rewards/format_reward": 0.9782986268401146, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 133.1475731531779, |
|
"epoch": 0.11926453536524764, |
|
"grad_norm": 0.963828444480896, |
|
"kl": 1.10107421875, |
|
"learning_rate": 1.9978966374934255e-05, |
|
"loss": 0.0441, |
|
"reward": 1.1979166989525158, |
|
"reward_std": 0.22738417129342756, |
|
"rewards/accuracy_reward": 0.2348090335726738, |
|
"rewards/format_reward": 0.9631076604127884, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 95.22396143277486, |
|
"epoch": 0.12324001987742256, |
|
"grad_norm": 2.4775350093841553, |
|
"kl": 1.29541015625, |
|
"learning_rate": 1.996899129506126e-05, |
|
"loss": 0.0519, |
|
"reward": 1.1801215683420498, |
|
"reward_std": 0.22066468729948005, |
|
"rewards/accuracy_reward": 0.21831597779722264, |
|
"rewards/format_reward": 0.9618055733541647, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 141.00217274824777, |
|
"epoch": 0.12721550438959747, |
|
"grad_norm": 113.83024597167969, |
|
"kl": 6.85986328125, |
|
"learning_rate": 1.995708990114615e-05, |
|
"loss": 0.2747, |
|
"reward": 1.1006944725910823, |
|
"reward_std": 0.2825309601612389, |
|
"rewards/accuracy_reward": 0.16493056050967425, |
|
"rewards/format_reward": 0.9357639079292616, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 166.101132551829, |
|
"epoch": 0.13119098890177242, |
|
"grad_norm": 10.326292037963867, |
|
"kl": 4.069661458333333, |
|
"learning_rate": 1.994326449290226e-05, |
|
"loss": 0.1628, |
|
"reward": 1.0438368394970894, |
|
"reward_std": 0.32250430978213745, |
|
"rewards/accuracy_reward": 0.1401909765166541, |
|
"rewards/format_reward": 0.9036458494762579, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.1325161504058307, |
|
"eval_completion_length": 159.95536130788375, |
|
"eval_kl": 3.7168367346938775, |
|
"eval_loss": 0.16389134526252747, |
|
"eval_reward": 1.0437925482282833, |
|
"eval_reward_std": 0.3066673065174599, |
|
"eval_rewards/accuracy_reward": 0.14200680786553693, |
|
"eval_rewards/format_reward": 0.9017857349648768, |
|
"eval_runtime": 416.6475, |
|
"eval_samples_per_second": 0.238, |
|
"eval_steps_per_second": 0.012, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 146.04948329925537, |
|
"epoch": 0.13516647341394733, |
|
"grad_norm": 29.008094787597656, |
|
"kl": 4.4609375, |
|
"learning_rate": 1.9927517741821343e-05, |
|
"loss": 0.1592, |
|
"reward": 1.0123698189854622, |
|
"reward_std": 0.279809627099894, |
|
"rewards/accuracy_reward": 0.10416666977107525, |
|
"rewards/format_reward": 0.9082031473517418, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 83.48133925596873, |
|
"epoch": 0.13914195792612225, |
|
"grad_norm": 2.938828468322754, |
|
"kl": 2.2904459635416665, |
|
"learning_rate": 1.990985269065736e-05, |
|
"loss": 0.0916, |
|
"reward": 1.0742187835276127, |
|
"reward_std": 0.23118331842124462, |
|
"rewards/accuracy_reward": 0.12543403171002865, |
|
"rewards/format_reward": 0.9487847375373045, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 54.888022780418396, |
|
"epoch": 0.14311744243829716, |
|
"grad_norm": 2.3945536613464355, |
|
"kl": 2.4551595052083335, |
|
"learning_rate": 1.989027275283852e-05, |
|
"loss": 0.0982, |
|
"reward": 1.1523437860111396, |
|
"reward_std": 0.20264656166546047, |
|
"rewards/accuracy_reward": 0.17621528167122355, |
|
"rewards/format_reward": 0.976128488779068, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 42.01388998826345, |
|
"epoch": 0.1470929269504721, |
|
"grad_norm": 2.8216843605041504, |
|
"kl": 1.4291178385416667, |
|
"learning_rate": 1.9868781711807705e-05, |
|
"loss": 0.0572, |
|
"reward": 1.1814236467083294, |
|
"reward_std": 0.18004686074952284, |
|
"rewards/accuracy_reward": 0.1927083395033454, |
|
"rewards/format_reward": 0.9887152885397276, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 37.57769203186035, |
|
"epoch": 0.15106841146264702, |
|
"grad_norm": 2.9078152179718018, |
|
"kl": 1.4965006510416667, |
|
"learning_rate": 1.9845383720291392e-05, |
|
"loss": 0.0598, |
|
"reward": 1.19531253973643, |
|
"reward_std": 0.16706574785833558, |
|
"rewards/accuracy_reward": 0.20486111663437137, |
|
"rewards/format_reward": 0.990451397995154, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 46.923178335030876, |
|
"epoch": 0.15504389597482193, |
|
"grad_norm": 2.5346879959106445, |
|
"kl": 1.30078125, |
|
"learning_rate": 1.9820083299497227e-05, |
|
"loss": 0.0521, |
|
"reward": 1.1796875434617202, |
|
"reward_std": 0.17389631201513112, |
|
"rewards/accuracy_reward": 0.1901041710516438, |
|
"rewards/format_reward": 0.9895833420256773, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 84.15060981114705, |
|
"epoch": 0.15901938048699685, |
|
"grad_norm": 20.683246612548828, |
|
"kl": 2.1199544270833335, |
|
"learning_rate": 1.9792885338240375e-05, |
|
"loss": 0.0848, |
|
"reward": 1.1553819701075554, |
|
"reward_std": 0.22444627589235702, |
|
"rewards/accuracy_reward": 0.1844618112857764, |
|
"rewards/format_reward": 0.9709201554457346, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 107.51779842376709, |
|
"epoch": 0.16299486499917176, |
|
"grad_norm": 5.012475967407227, |
|
"kl": 2.2217610677083335, |
|
"learning_rate": 1.976379509199886e-05, |
|
"loss": 0.0888, |
|
"reward": 1.1514757337669532, |
|
"reward_std": 0.262029462105905, |
|
"rewards/accuracy_reward": 0.1970486156642437, |
|
"rewards/format_reward": 0.9544271069268385, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 148.11632299423218, |
|
"epoch": 0.1669703495113467, |
|
"grad_norm": 14.528366088867188, |
|
"kl": 2.2367350260416665, |
|
"learning_rate": 1.9732818181898046e-05, |
|
"loss": 0.0895, |
|
"reward": 1.1228298942248027, |
|
"reward_std": 0.2808268330991268, |
|
"rewards/accuracy_reward": 0.1896701450071608, |
|
"rewards/format_reward": 0.9331597425043583, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 84.5121552546819, |
|
"epoch": 0.17094583402352162, |
|
"grad_norm": 0.969579815864563, |
|
"kl": 1.4344889322916667, |
|
"learning_rate": 1.9699960593624462e-05, |
|
"loss": 0.0574, |
|
"reward": 1.15538198625048, |
|
"reward_std": 0.21577061604087552, |
|
"rewards/accuracy_reward": 0.18359375500585884, |
|
"rewards/format_reward": 0.9717882089316845, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 72.3042555252711, |
|
"epoch": 0.17492131853569653, |
|
"grad_norm": 1.7120805978775024, |
|
"kl": 1.3595377604166667, |
|
"learning_rate": 1.966522867626919e-05, |
|
"loss": 0.0544, |
|
"reward": 1.1766493457059066, |
|
"reward_std": 0.20347999944351614, |
|
"rewards/accuracy_reward": 0.1983507004721711, |
|
"rewards/format_reward": 0.9782986243565878, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 96.82161716620128, |
|
"epoch": 0.17889680304787145, |
|
"grad_norm": 7.904327869415283, |
|
"kl": 1.818359375, |
|
"learning_rate": 1.962862914110101e-05, |
|
"loss": 0.0727, |
|
"reward": 1.2313368394970894, |
|
"reward_std": 0.2199981181571881, |
|
"rewards/accuracy_reward": 0.25781250578196097, |
|
"rewards/format_reward": 0.9735243196288744, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 177.80512762069702, |
|
"epoch": 0.1828722875600464, |
|
"grad_norm": 2.0114357471466064, |
|
"kl": 1.666259765625, |
|
"learning_rate": 1.9590169060269602e-05, |
|
"loss": 0.0666, |
|
"reward": 1.1423611491918564, |
|
"reward_std": 0.26254904045102495, |
|
"rewards/accuracy_reward": 0.2005208401630322, |
|
"rewards/format_reward": 0.941840298473835, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 90.11675635973613, |
|
"epoch": 0.1868477720722213, |
|
"grad_norm": 0.599406898021698, |
|
"kl": 0.9375813802083334, |
|
"learning_rate": 1.9549855865438967e-05, |
|
"loss": 0.0375, |
|
"reward": 1.205729205161333, |
|
"reward_std": 0.19089689617976546, |
|
"rewards/accuracy_reward": 0.22092014454149952, |
|
"rewards/format_reward": 0.9848090397814909, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 56.38889070351919, |
|
"epoch": 0.19082325658439622, |
|
"grad_norm": 17.37237548828125, |
|
"kl": 1.100830078125, |
|
"learning_rate": 1.9507697346351414e-05, |
|
"loss": 0.0441, |
|
"reward": 1.2404514253139496, |
|
"reward_std": 0.15965971552456418, |
|
"rewards/accuracy_reward": 0.2469618124111245, |
|
"rewards/format_reward": 0.9934895895421505, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 117.53472594420116, |
|
"epoch": 0.19479874109657114, |
|
"grad_norm": 1.0283232927322388, |
|
"kl": 0.917236328125, |
|
"learning_rate": 1.9463701649322343e-05, |
|
"loss": 0.0367, |
|
"reward": 1.1766493432223797, |
|
"reward_std": 0.22516770443568626, |
|
"rewards/accuracy_reward": 0.2135416737291962, |
|
"rewards/format_reward": 0.9631076554457346, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 154.88368586699167, |
|
"epoch": 0.19877422560874605, |
|
"grad_norm": 0.9131763577461243, |
|
"kl": 1.0406901041666667, |
|
"learning_rate": 1.941787727566613e-05, |
|
"loss": 0.0416, |
|
"reward": 1.1358507387340069, |
|
"reward_std": 0.24575756738583246, |
|
"rewards/accuracy_reward": 0.18706597783602774, |
|
"rewards/format_reward": 0.9487847412625948, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 137.50000397364298, |
|
"epoch": 0.202749710120921, |
|
"grad_norm": 1.9522716999053955, |
|
"kl": 3.1927897135416665, |
|
"learning_rate": 1.9370233080053406e-05, |
|
"loss": 0.1279, |
|
"reward": 1.1432291989525158, |
|
"reward_std": 0.25552801430846256, |
|
"rewards/accuracy_reward": 0.18836806000520787, |
|
"rewards/format_reward": 0.9548611318071684, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 340.70487225055695, |
|
"epoch": 0.2067251946330959, |
|
"grad_norm": 338.98333740234375, |
|
"kl": 10.551839192708334, |
|
"learning_rate": 1.9320778268800068e-05, |
|
"loss": 0.4221, |
|
"reward": 1.0894097586472828, |
|
"reward_std": 0.32969770890971023, |
|
"rewards/accuracy_reward": 0.2165798662075152, |
|
"rewards/format_reward": 0.8728298805654049, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 1684.335110982259, |
|
"epoch": 0.21070067914527082, |
|
"grad_norm": 25.311864852905273, |
|
"kl": 25.832682291666668, |
|
"learning_rate": 1.926952239808833e-05, |
|
"loss": 1.0325, |
|
"reward": 0.6332465472320715, |
|
"reward_std": 0.556972432260712, |
|
"rewards/accuracy_reward": 0.2395833390376841, |
|
"rewards/format_reward": 0.39366320706903934, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 1924.016092936198, |
|
"epoch": 0.21467616365744574, |
|
"grad_norm": 0.9118285179138184, |
|
"kl": 2.5398763020833335, |
|
"learning_rate": 1.9216475372120198e-05, |
|
"loss": 0.1016, |
|
"reward": 0.5694444632778565, |
|
"reward_std": 0.5513101244966189, |
|
"rewards/accuracy_reward": 0.17751736612990499, |
|
"rewards/format_reward": 0.3919270985449354, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 536.2187668085098, |
|
"epoch": 0.21865164816962068, |
|
"grad_norm": 0.5543506741523743, |
|
"kl": 0.8994954427083334, |
|
"learning_rate": 1.9161647441203648e-05, |
|
"loss": 0.036, |
|
"reward": 1.0902778059244156, |
|
"reward_std": 0.2743187023637195, |
|
"rewards/accuracy_reward": 0.179253476128603, |
|
"rewards/format_reward": 0.9110243183871111, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 28.863716046015423, |
|
"epoch": 0.2226271326817956, |
|
"grad_norm": 0.5757032632827759, |
|
"kl": 0.918701171875, |
|
"learning_rate": 1.9105049199771963e-05, |
|
"loss": 0.0367, |
|
"reward": 1.2426215708255768, |
|
"reward_std": 0.16670533292926848, |
|
"rewards/accuracy_reward": 0.24696181248873472, |
|
"rewards/format_reward": 0.9956597263614336, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 36.32769219080607, |
|
"epoch": 0.2266026171939705, |
|
"grad_norm": 0.9013729691505432, |
|
"kl": 0.907470703125, |
|
"learning_rate": 1.904669158433658e-05, |
|
"loss": 0.0363, |
|
"reward": 1.2196180919806163, |
|
"reward_std": 0.15937398614672324, |
|
"rewards/accuracy_reward": 0.2235243107036998, |
|
"rewards/format_reward": 0.9960937537252903, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 118.9145000775655, |
|
"epoch": 0.23057810170614543, |
|
"grad_norm": 0.603873610496521, |
|
"kl": 0.875244140625, |
|
"learning_rate": 1.8986585871373792e-05, |
|
"loss": 0.035, |
|
"reward": 1.19921878973643, |
|
"reward_std": 0.21174315828830004, |
|
"rewards/accuracy_reward": 0.22526042260384807, |
|
"rewards/format_reward": 0.9739583494762579, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 154.3828158378601, |
|
"epoch": 0.23455358621832037, |
|
"grad_norm": 0.5029460191726685, |
|
"kl": 0.8998209635416666, |
|
"learning_rate": 1.8924743675145815e-05, |
|
"loss": 0.036, |
|
"reward": 1.1315104526778061, |
|
"reward_std": 0.291058028737704, |
|
"rewards/accuracy_reward": 0.2100694509766375, |
|
"rewards/format_reward": 0.9214409937461218, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 27.94401141007741, |
|
"epoch": 0.23852907073049529, |
|
"grad_norm": 0.7333383560180664, |
|
"kl": 1.0400390625, |
|
"learning_rate": 1.8861176945456542e-05, |
|
"loss": 0.0416, |
|
"reward": 1.1788194874922435, |
|
"reward_std": 0.21482299477793276, |
|
"rewards/accuracy_reward": 0.20920139516238123, |
|
"rewards/format_reward": 0.9696180733541647, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 24.035590926806133, |
|
"epoch": 0.2425045552426702, |
|
"grad_norm": 0.9987295866012573, |
|
"kl": 1.773681640625, |
|
"learning_rate": 1.8795897965342473e-05, |
|
"loss": 0.071, |
|
"reward": 1.2339410148561, |
|
"reward_std": 0.18799259358396134, |
|
"rewards/accuracy_reward": 0.25564236876865226, |
|
"rewards/format_reward": 0.9782986280818781, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 22.082900047302246, |
|
"epoch": 0.24648003975484511, |
|
"grad_norm": 1.0223162174224854, |
|
"kl": 1.2223307291666667, |
|
"learning_rate": 1.8728919348699285e-05, |
|
"loss": 0.0489, |
|
"reward": 1.1970486516753833, |
|
"reward_std": 0.1605207941805323, |
|
"rewards/accuracy_reward": 0.21267361818657568, |
|
"rewards/format_reward": 0.9843750111758709, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 22.559462388356526, |
|
"epoch": 0.25045552426702006, |
|
"grad_norm": 0.9617392420768738, |
|
"kl": 1.2871907552083333, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 0.0515, |
|
"reward": 1.253038230041663, |
|
"reward_std": 0.16818702151067555, |
|
"rewards/accuracy_reward": 0.26866320287808776, |
|
"rewards/format_reward": 0.9843750111758709, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 74.73828355471294, |
|
"epoch": 0.25443100877919494, |
|
"grad_norm": 438.25408935546875, |
|
"kl": 6.625325520833333, |
|
"learning_rate": 1.858991530101613e-05, |
|
"loss": 0.2647, |
|
"reward": 0.8719618345300356, |
|
"reward_std": 0.4378834879025817, |
|
"rewards/accuracy_reward": 0.20963542287548384, |
|
"rewards/format_reward": 0.6623264097919067, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 72.35807486375172, |
|
"epoch": 0.2584064932913699, |
|
"grad_norm": 2.040531873703003, |
|
"kl": 1.7041829427083333, |
|
"learning_rate": 1.851791672980993e-05, |
|
"loss": 0.0681, |
|
"reward": 0.6892361293236414, |
|
"reward_std": 0.5002113915979862, |
|
"rewards/accuracy_reward": 0.17578125411334136, |
|
"rewards/format_reward": 0.5134548768401146, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 25.8216153383255, |
|
"epoch": 0.26238197780354483, |
|
"grad_norm": 0.6412864923477173, |
|
"kl": 1.1061197916666667, |
|
"learning_rate": 1.844427223655199e-05, |
|
"loss": 0.0443, |
|
"reward": 1.1362847524384658, |
|
"reward_std": 0.2772039214614779, |
|
"rewards/accuracy_reward": 0.2052951459384834, |
|
"rewards/format_reward": 0.9309896019597849, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.2650323008116614, |
|
"eval_completion_length": 106.06218176621657, |
|
"eval_kl": 1.0171274038461537, |
|
"eval_loss": 0.0414416678249836, |
|
"eval_reward": 1.2000000339287977, |
|
"eval_reward_std": 0.21003777883373775, |
|
"eval_rewards/accuracy_reward": 0.24038462171760888, |
|
"eval_rewards/format_reward": 0.9596154038722698, |
|
"eval_runtime": 392.0553, |
|
"eval_samples_per_second": 0.253, |
|
"eval_steps_per_second": 0.013, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 153.56380558013916, |
|
"epoch": 0.2663574623157197, |
|
"grad_norm": 0.6983628869056702, |
|
"kl": 1.030517578125, |
|
"learning_rate": 1.8368996051610987e-05, |
|
"loss": 0.0402, |
|
"reward": 1.1731771118938923, |
|
"reward_std": 0.2824738877825439, |
|
"rewards/accuracy_reward": 0.2369791748933494, |
|
"rewards/format_reward": 0.9361979365348816, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 88.2196215391159, |
|
"epoch": 0.27033294682789466, |
|
"grad_norm": 0.5030940771102905, |
|
"kl": 1.013427734375, |
|
"learning_rate": 1.8292102720648333e-05, |
|
"loss": 0.0406, |
|
"reward": 1.1948785136143367, |
|
"reward_std": 0.18432624839867154, |
|
"rewards/accuracy_reward": 0.2248263950459659, |
|
"rewards/format_reward": 0.9700521007180214, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 59.0355920791626, |
|
"epoch": 0.27430843134006955, |
|
"grad_norm": 0.6660766005516052, |
|
"kl": 1.0997721354166667, |
|
"learning_rate": 1.821360710180753e-05, |
|
"loss": 0.044, |
|
"reward": 1.2152778208255768, |
|
"reward_std": 0.16302509784388045, |
|
"rewards/accuracy_reward": 0.2326388961325089, |
|
"rewards/format_reward": 0.9826389054457346, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 136.2734409570694, |
|
"epoch": 0.2782839158522445, |
|
"grad_norm": 12.398130416870117, |
|
"kl": 1.268310546875, |
|
"learning_rate": 1.8133524362843105e-05, |
|
"loss": 0.0507, |
|
"reward": 1.1844618419806163, |
|
"reward_std": 0.2869204127540191, |
|
"rewards/accuracy_reward": 0.24218750729536018, |
|
"rewards/format_reward": 0.9422743258376917, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 70.42100850741069, |
|
"epoch": 0.28225940036441943, |
|
"grad_norm": 0.6272424459457397, |
|
"kl": 1.75341796875, |
|
"learning_rate": 1.8051869978189732e-05, |
|
"loss": 0.0702, |
|
"reward": 1.1562500409781933, |
|
"reward_std": 0.2056693274838229, |
|
"rewards/accuracy_reward": 0.18315972775841752, |
|
"rewards/format_reward": 0.9730902922650179, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 73.97873449325562, |
|
"epoch": 0.2862348848765943, |
|
"grad_norm": 0.8462525010108948, |
|
"kl": 1.53515625, |
|
"learning_rate": 1.7968659725972113e-05, |
|
"loss": 0.0614, |
|
"reward": 1.2317708693444729, |
|
"reward_std": 0.20991144888103008, |
|
"rewards/accuracy_reward": 0.25130209074510884, |
|
"rewards/format_reward": 0.9804687649011612, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 253.28212424119315, |
|
"epoch": 0.29021036938876926, |
|
"grad_norm": 1.3421990871429443, |
|
"kl": 1.69873046875, |
|
"learning_rate": 1.7883909684956142e-05, |
|
"loss": 0.0679, |
|
"reward": 1.194010455161333, |
|
"reward_std": 0.23462056911860904, |
|
"rewards/accuracy_reward": 0.22092014512357613, |
|
"rewards/format_reward": 0.9730902959903082, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 470.5165026982625, |
|
"epoch": 0.2941858539009442, |
|
"grad_norm": 1.0831785202026367, |
|
"kl": 1.51318359375, |
|
"learning_rate": 1.7797636231442018e-05, |
|
"loss": 0.0605, |
|
"reward": 1.1184896218279998, |
|
"reward_std": 0.3129944964312017, |
|
"rewards/accuracy_reward": 0.18750000543271503, |
|
"rewards/format_reward": 0.9309896032015482, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 236.49306138356528, |
|
"epoch": 0.2981613384131191, |
|
"grad_norm": 0.5017532706260681, |
|
"kl": 1.0579427083333333, |
|
"learning_rate": 1.770985603609982e-05, |
|
"loss": 0.0423, |
|
"reward": 1.1736111516753833, |
|
"reward_std": 0.22954328202952942, |
|
"rewards/accuracy_reward": 0.2083333401630322, |
|
"rewards/format_reward": 0.9652777972320715, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 370.95573965708417, |
|
"epoch": 0.30213682292529404, |
|
"grad_norm": 1.8773243427276611, |
|
"kl": 0.7275390625, |
|
"learning_rate": 1.762058606074825e-05, |
|
"loss": 0.0291, |
|
"reward": 1.1892361417412758, |
|
"reward_std": 0.2853658755775541, |
|
"rewards/accuracy_reward": 0.2526041748545443, |
|
"rewards/format_reward": 0.9366319626569748, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 749.3316179911295, |
|
"epoch": 0.3061123074374689, |
|
"grad_norm": 31.57447624206543, |
|
"kl": 1613.3247884114583, |
|
"learning_rate": 1.7529843555077066e-05, |
|
"loss": 64.5574, |
|
"reward": 0.9032118283212185, |
|
"reward_std": 0.5264206398278475, |
|
"rewards/accuracy_reward": 0.1909722271375358, |
|
"rewards/format_reward": 0.7122396056850752, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 675.3758859634399, |
|
"epoch": 0.31008779194964387, |
|
"grad_norm": 5.697227954864502, |
|
"kl": 1.6119791666666667, |
|
"learning_rate": 1.743764605331392e-05, |
|
"loss": 0.0645, |
|
"reward": 0.9205729439854622, |
|
"reward_std": 0.511370474472642, |
|
"rewards/accuracy_reward": 0.21093750609240183, |
|
"rewards/format_reward": 0.7096354346722364, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 132.1358541647593, |
|
"epoch": 0.3140632764618188, |
|
"grad_norm": 11.240227699279785, |
|
"kl": 1.1758626302083333, |
|
"learning_rate": 1.734401137083623e-05, |
|
"loss": 0.047, |
|
"reward": 1.1697048942248027, |
|
"reward_std": 0.234877454660212, |
|
"rewards/accuracy_reward": 0.20920139295049012, |
|
"rewards/format_reward": 0.9605034875373045, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 65.88151196638744, |
|
"epoch": 0.3180387609739937, |
|
"grad_norm": 1.4037131071090698, |
|
"kl": 1.52587890625, |
|
"learning_rate": 1.7248957600728664e-05, |
|
"loss": 0.0611, |
|
"reward": 1.2100694850087166, |
|
"reward_std": 0.17585339567934474, |
|
"rewards/accuracy_reward": 0.2248263950071608, |
|
"rewards/format_reward": 0.9852430634200573, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 62.16363008817037, |
|
"epoch": 0.32201424548616864, |
|
"grad_norm": 1.2037297487258911, |
|
"kl": 2.00048828125, |
|
"learning_rate": 1.7152503110287048e-05, |
|
"loss": 0.0802, |
|
"reward": 1.21484378973643, |
|
"reward_std": 0.14739138268244764, |
|
"rewards/accuracy_reward": 0.22265625558793545, |
|
"rewards/format_reward": 0.9921875024835268, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 125.61762539545695, |
|
"epoch": 0.3259897299983435, |
|
"grad_norm": 101.92506408691406, |
|
"kl": 0.934326171875, |
|
"learning_rate": 1.7054666537469213e-05, |
|
"loss": 0.0374, |
|
"reward": 1.1098090633749962, |
|
"reward_std": 0.30111823774253327, |
|
"rewards/accuracy_reward": 0.2152777845816066, |
|
"rewards/format_reward": 0.8945312686264515, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 121.29817994435628, |
|
"epoch": 0.32996521451051847, |
|
"grad_norm": 34.78390884399414, |
|
"kl": 1.1844889322916667, |
|
"learning_rate": 1.6955466787293574e-05, |
|
"loss": 0.0473, |
|
"reward": 1.1397569874922435, |
|
"reward_std": 0.25541831855662167, |
|
"rewards/accuracy_reward": 0.19444444961845875, |
|
"rewards/format_reward": 0.9453125186264515, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 187.08290481567383, |
|
"epoch": 0.3339406990226934, |
|
"grad_norm": 9.161247253417969, |
|
"kl": 1.1195475260416667, |
|
"learning_rate": 1.6854923028186112e-05, |
|
"loss": 0.0448, |
|
"reward": 1.174479205161333, |
|
"reward_std": 0.2551775785783927, |
|
"rewards/accuracy_reward": 0.226128477564392, |
|
"rewards/format_reward": 0.9483507138987383, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 427.2378609975179, |
|
"epoch": 0.3379161835348683, |
|
"grad_norm": 4.801308631896973, |
|
"kl": 0.938720703125, |
|
"learning_rate": 1.6753054688276443e-05, |
|
"loss": 0.0376, |
|
"reward": 1.0308160049219925, |
|
"reward_std": 0.4219017767657836, |
|
"rewards/accuracy_reward": 0.18880208985259136, |
|
"rewards/format_reward": 0.8420139091710249, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 529.5642477671305, |
|
"epoch": 0.34189166804704324, |
|
"grad_norm": 24.645631790161133, |
|
"kl": 0.8575032552083334, |
|
"learning_rate": 1.6649881451643706e-05, |
|
"loss": 0.0343, |
|
"reward": 0.9887153046826521, |
|
"reward_std": 0.4318722311096887, |
|
"rewards/accuracy_reward": 0.16449653268015632, |
|
"rewards/format_reward": 0.8242187686264515, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 635.6575686136881, |
|
"epoch": 0.3458671525592181, |
|
"grad_norm": 16.547225952148438, |
|
"kl": 0.8201497395833334, |
|
"learning_rate": 1.6545423254513003e-05, |
|
"loss": 0.0328, |
|
"reward": 0.9114583544433117, |
|
"reward_std": 0.49249805447955924, |
|
"rewards/accuracy_reward": 0.15451389361017695, |
|
"rewards/format_reward": 0.7569444626569748, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 670.922758102417, |
|
"epoch": 0.34984263707139307, |
|
"grad_norm": 3.734528064727783, |
|
"kl": 28.136962890625, |
|
"learning_rate": 1.6439700281403113e-05, |
|
"loss": 1.13, |
|
"reward": 0.8875868320465088, |
|
"reward_std": 0.5033875486503044, |
|
"rewards/accuracy_reward": 0.15581597620621324, |
|
"rewards/format_reward": 0.7317708556850752, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 493.13022168477374, |
|
"epoch": 0.353818121583568, |
|
"grad_norm": 1.4124517440795898, |
|
"kl": 0.744140625, |
|
"learning_rate": 1.6332732961226214e-05, |
|
"loss": 0.0298, |
|
"reward": 1.003472238779068, |
|
"reward_std": 0.37580153982465464, |
|
"rewards/accuracy_reward": 0.16710069950204343, |
|
"rewards/format_reward": 0.8363715509573618, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 216.89670626322427, |
|
"epoch": 0.3577936060957429, |
|
"grad_norm": 16.25065040588379, |
|
"kl": 0.7809244791666666, |
|
"learning_rate": 1.6224541963340392e-05, |
|
"loss": 0.0312, |
|
"reward": 1.1371528195838134, |
|
"reward_std": 0.24574858765117824, |
|
"rewards/accuracy_reward": 0.18706597686590007, |
|
"rewards/format_reward": 0.950086829562982, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 177.61805963516235, |
|
"epoch": 0.36176909060791784, |
|
"grad_norm": 0.2947433888912201, |
|
"kl": 0.6514485677083334, |
|
"learning_rate": 1.6115148193555708e-05, |
|
"loss": 0.0261, |
|
"reward": 1.1371528146167595, |
|
"reward_std": 0.216966389445588, |
|
"rewards/accuracy_reward": 0.1727430597335721, |
|
"rewards/format_reward": 0.9644097400208315, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 166.2851603825887, |
|
"epoch": 0.3657445751200928, |
|
"grad_norm": 0.2939068377017975, |
|
"kl": 0.6795247395833334, |
|
"learning_rate": 1.6004572790094535e-05, |
|
"loss": 0.0272, |
|
"reward": 1.1618923991918564, |
|
"reward_std": 0.20391751010902226, |
|
"rewards/accuracy_reward": 0.1848958384944126, |
|
"rewards/format_reward": 0.9769965422650179, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 261.6618987719218, |
|
"epoch": 0.3697200596322677, |
|
"grad_norm": 0.29132047295570374, |
|
"kl": 0.6346842447916666, |
|
"learning_rate": 1.5892837119507017e-05, |
|
"loss": 0.0254, |
|
"reward": 1.1731771218279998, |
|
"reward_std": 0.2886992564114432, |
|
"rewards/accuracy_reward": 0.22005208965856582, |
|
"rewards/format_reward": 0.953125017384688, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 336.81771659851074, |
|
"epoch": 0.3736955441444426, |
|
"grad_norm": 0.3138696551322937, |
|
"kl": 0.5843098958333334, |
|
"learning_rate": 1.5779962772542404e-05, |
|
"loss": 0.0234, |
|
"reward": 1.1588542101283867, |
|
"reward_std": 0.3115967277747889, |
|
"rewards/accuracy_reward": 0.22482639298929522, |
|
"rewards/format_reward": 0.9340277997155985, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 331.4518330891927, |
|
"epoch": 0.3776710286566175, |
|
"grad_norm": 0.2659892141819, |
|
"kl": 0.5912679036458334, |
|
"learning_rate": 1.5665971559977035e-05, |
|
"loss": 0.0237, |
|
"reward": 1.1488715608914692, |
|
"reward_std": 0.29840323934331536, |
|
"rewards/accuracy_reward": 0.216145838998879, |
|
"rewards/format_reward": 0.9327257126569748, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 239.2026980717977, |
|
"epoch": 0.38164651316879245, |
|
"grad_norm": 0.25000807642936707, |
|
"kl": 0.65576171875, |
|
"learning_rate": 1.5550885508399857e-05, |
|
"loss": 0.0262, |
|
"reward": 1.2243924078842003, |
|
"reward_std": 0.23120340146124363, |
|
"rewards/accuracy_reward": 0.26302084152121097, |
|
"rewards/format_reward": 0.9613715447485447, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 261.2309099833171, |
|
"epoch": 0.3856219976809674, |
|
"grad_norm": 0.3258998692035675, |
|
"kl": 0.6355794270833334, |
|
"learning_rate": 1.5434726855956207e-05, |
|
"loss": 0.0254, |
|
"reward": 1.1762153195838134, |
|
"reward_std": 0.2527556049947937, |
|
"rewards/accuracy_reward": 0.22656250465661287, |
|
"rewards/format_reward": 0.949652798473835, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 307.1632038752238, |
|
"epoch": 0.3895974821931423, |
|
"grad_norm": 0.27683761715888977, |
|
"kl": 0.6504720052083334, |
|
"learning_rate": 1.5317518048050698e-05, |
|
"loss": 0.026, |
|
"reward": 1.1840278108914692, |
|
"reward_std": 0.3266296978108585, |
|
"rewards/accuracy_reward": 0.25868056357527774, |
|
"rewards/format_reward": 0.9253472412625948, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 227.35286966959634, |
|
"epoch": 0.3935729667053172, |
|
"grad_norm": 0.30159127712249756, |
|
"kl": 0.688720703125, |
|
"learning_rate": 1.5199281733010115e-05, |
|
"loss": 0.0275, |
|
"reward": 1.1831597660978634, |
|
"reward_std": 0.2784773572348058, |
|
"rewards/accuracy_reward": 0.23350695171393454, |
|
"rewards/format_reward": 0.9496528009573618, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 143.06510861714682, |
|
"epoch": 0.3975484512174921, |
|
"grad_norm": 0.2892165780067444, |
|
"kl": 0.68359375, |
|
"learning_rate": 1.5080040757707045e-05, |
|
"loss": 0.0274, |
|
"reward": 1.2187500310440857, |
|
"reward_std": 0.2116301084558169, |
|
"rewards/accuracy_reward": 0.24522570221840093, |
|
"rewards/format_reward": 0.9735243245959282, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3975484512174921, |
|
"eval_completion_length": 144.66667048136392, |
|
"eval_kl": 0.6736505681818182, |
|
"eval_loss": 0.026671981438994408, |
|
"eval_reward": 1.2436869072191643, |
|
"eval_reward_std": 0.1965812866886457, |
|
"eval_rewards/accuracy_reward": 0.2651515253113978, |
|
"eval_rewards/format_reward": 0.9785353685870315, |
|
"eval_runtime": 306.7676, |
|
"eval_samples_per_second": 0.323, |
|
"eval_steps_per_second": 0.016, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 174.81424236297607, |
|
"epoch": 0.40152393572966705, |
|
"grad_norm": 0.2668885588645935, |
|
"kl": 0.658447265625, |
|
"learning_rate": 1.4959818163145174e-05, |
|
"loss": 0.0263, |
|
"reward": 1.1870660111308098, |
|
"reward_std": 0.2152603679957489, |
|
"rewards/accuracy_reward": 0.22265625613120696, |
|
"rewards/format_reward": 0.9644097412625948, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 243.1388953526815, |
|
"epoch": 0.405499420241842, |
|
"grad_norm": 0.29364725947380066, |
|
"kl": 0.6541341145833334, |
|
"learning_rate": 1.4838637180007048e-05, |
|
"loss": 0.0262, |
|
"reward": 1.19227434694767, |
|
"reward_std": 0.3106319972624381, |
|
"rewards/accuracy_reward": 0.25911459024064243, |
|
"rewards/format_reward": 0.9331597425043583, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 272.5533922513326, |
|
"epoch": 0.4094749047540169, |
|
"grad_norm": 0.8436369299888611, |
|
"kl": 0.6695149739583334, |
|
"learning_rate": 1.4716521224165192e-05, |
|
"loss": 0.0268, |
|
"reward": 1.1753472536802292, |
|
"reward_std": 0.3257710024093588, |
|
"rewards/accuracy_reward": 0.25520834090032923, |
|
"rewards/format_reward": 0.9201389091710249, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 177.79601097106934, |
|
"epoch": 0.4134503892661918, |
|
"grad_norm": 0.4324951469898224, |
|
"kl": 0.7711588541666666, |
|
"learning_rate": 1.4593493892157473e-05, |
|
"loss": 0.0308, |
|
"reward": 1.2126736467083294, |
|
"reward_std": 0.2525833969314893, |
|
"rewards/accuracy_reward": 0.25607639644294977, |
|
"rewards/format_reward": 0.9565972425043583, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 131.17882307370505, |
|
"epoch": 0.41742587377836676, |
|
"grad_norm": 0.4101894497871399, |
|
"kl": 0.8329264322916666, |
|
"learning_rate": 1.4469578956627497e-05, |
|
"loss": 0.0333, |
|
"reward": 1.2222222561637561, |
|
"reward_std": 0.2416619355790317, |
|
"rewards/accuracy_reward": 0.2569444514811039, |
|
"rewards/format_reward": 0.9652777935067812, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 166.76866793632507, |
|
"epoch": 0.42140135829054165, |
|
"grad_norm": 0.4412620961666107, |
|
"kl": 0.802490234375, |
|
"learning_rate": 1.4344800361731028e-05, |
|
"loss": 0.0321, |
|
"reward": 1.2000868308047454, |
|
"reward_std": 0.25887442535410327, |
|
"rewards/accuracy_reward": 0.2456597302419444, |
|
"rewards/format_reward": 0.9544271069268385, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 291.4305648803711, |
|
"epoch": 0.4253768428027166, |
|
"grad_norm": 1.2379664182662964, |
|
"kl": 1.076171875, |
|
"learning_rate": 1.4219182218509228e-05, |
|
"loss": 0.043, |
|
"reward": 1.188368085771799, |
|
"reward_std": 0.3411911290604621, |
|
"rewards/accuracy_reward": 0.2738715353965138, |
|
"rewards/format_reward": 0.9144965472320715, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 224.42535320917764, |
|
"epoch": 0.4293523273148915, |
|
"grad_norm": 0.3620770275592804, |
|
"kl": 0.82763671875, |
|
"learning_rate": 1.4092748800229684e-05, |
|
"loss": 0.0331, |
|
"reward": 1.1501736504336197, |
|
"reward_std": 0.277087006252259, |
|
"rewards/accuracy_reward": 0.21137153512487808, |
|
"rewards/format_reward": 0.9388021032015482, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 125.78515982627869, |
|
"epoch": 0.4333278118270664, |
|
"grad_norm": 0.44564080238342285, |
|
"kl": 0.8855794270833334, |
|
"learning_rate": 1.3965524537696048e-05, |
|
"loss": 0.0354, |
|
"reward": 1.2070312934617202, |
|
"reward_std": 0.2261218437924981, |
|
"rewards/accuracy_reward": 0.2408854247769341, |
|
"rewards/format_reward": 0.9661458519597849, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 120.45356305440266, |
|
"epoch": 0.43730329633924137, |
|
"grad_norm": 0.5684562921524048, |
|
"kl": 0.8846842447916666, |
|
"learning_rate": 1.3837534014527292e-05, |
|
"loss": 0.0354, |
|
"reward": 1.1996528121332328, |
|
"reward_std": 0.21545591143270335, |
|
"rewards/accuracy_reward": 0.22482639430866888, |
|
"rewards/format_reward": 0.9748264066874981, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 163.12413569291434, |
|
"epoch": 0.44127878085141625, |
|
"grad_norm": 0.4309135973453522, |
|
"kl": 0.853515625, |
|
"learning_rate": 1.370880196240736e-05, |
|
"loss": 0.0341, |
|
"reward": 1.1966146243115265, |
|
"reward_std": 0.20711354352533817, |
|
"rewards/accuracy_reward": 0.22829861768210927, |
|
"rewards/format_reward": 0.9683159912625948, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 237.89236768086752, |
|
"epoch": 0.4452542653635912, |
|
"grad_norm": 0.4566245377063751, |
|
"kl": 0.8661295572916666, |
|
"learning_rate": 1.3579353256306287e-05, |
|
"loss": 0.0347, |
|
"reward": 1.2013889253139496, |
|
"reward_std": 0.3096516130802532, |
|
"rewards/accuracy_reward": 0.26953125911920023, |
|
"rewards/format_reward": 0.9318576566874981, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 165.7669305006663, |
|
"epoch": 0.4492297498757661, |
|
"grad_norm": 0.354465126991272, |
|
"kl": 0.865966796875, |
|
"learning_rate": 1.3449212909673564e-05, |
|
"loss": 0.0346, |
|
"reward": 1.2018229539195697, |
|
"reward_std": 0.25834672797160846, |
|
"rewards/accuracy_reward": 0.25217014582206804, |
|
"rewards/format_reward": 0.9496528009573618, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 142.55990060170492, |
|
"epoch": 0.453205234387941, |
|
"grad_norm": 0.3962474763393402, |
|
"kl": 0.8601888020833334, |
|
"learning_rate": 1.3318406069604794e-05, |
|
"loss": 0.0344, |
|
"reward": 1.2521701666216056, |
|
"reward_std": 0.23758238561761877, |
|
"rewards/accuracy_reward": 0.2947048688074574, |
|
"rewards/format_reward": 0.9574652972320715, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 115.50304126739502, |
|
"epoch": 0.45718071890011597, |
|
"grad_norm": 0.5837423205375671, |
|
"kl": 0.8826497395833334, |
|
"learning_rate": 1.3186958011982502e-05, |
|
"loss": 0.0353, |
|
"reward": 1.2539062947034836, |
|
"reward_std": 0.22960447745087245, |
|
"rewards/accuracy_reward": 0.28862847849571455, |
|
"rewards/format_reward": 0.9652777935067812, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 131.01606249809265, |
|
"epoch": 0.46115620341229085, |
|
"grad_norm": 0.34627339243888855, |
|
"kl": 0.8831380208333334, |
|
"learning_rate": 1.3054894136592052e-05, |
|
"loss": 0.0353, |
|
"reward": 1.2656250384946663, |
|
"reward_std": 0.2167885024100542, |
|
"rewards/accuracy_reward": 0.303385425824672, |
|
"rewards/format_reward": 0.9622396032015482, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 161.92491857210794, |
|
"epoch": 0.4651316879244658, |
|
"grad_norm": 0.5599522590637207, |
|
"kl": 0.927490234375, |
|
"learning_rate": 1.2922239962213639e-05, |
|
"loss": 0.0371, |
|
"reward": 1.2330729564030964, |
|
"reward_std": 0.25398758659139276, |
|
"rewards/accuracy_reward": 0.28038195144229877, |
|
"rewards/format_reward": 0.9526909900208315, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 177.92231432596842, |
|
"epoch": 0.46910717243664074, |
|
"grad_norm": 0.40600305795669556, |
|
"kl": 0.9139811197916666, |
|
"learning_rate": 1.2789021121691273e-05, |
|
"loss": 0.0366, |
|
"reward": 1.2495660111308098, |
|
"reward_std": 0.2690324760042131, |
|
"rewards/accuracy_reward": 0.3042534824150304, |
|
"rewards/format_reward": 0.9453125211099783, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 172.3125053246816, |
|
"epoch": 0.4730826569488156, |
|
"grad_norm": 0.3264493942260742, |
|
"kl": 0.9281412760416666, |
|
"learning_rate": 1.2655263356979748e-05, |
|
"loss": 0.0371, |
|
"reward": 1.2217882325251896, |
|
"reward_std": 0.23972468955131868, |
|
"rewards/accuracy_reward": 0.2669270914436008, |
|
"rewards/format_reward": 0.9548611293236414, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 184.01345992088318, |
|
"epoch": 0.47705814146099057, |
|
"grad_norm": 0.45698466897010803, |
|
"kl": 0.9051920572916666, |
|
"learning_rate": 1.252099251417048e-05, |
|
"loss": 0.0362, |
|
"reward": 1.2009548917412758, |
|
"reward_std": 0.2333919748198241, |
|
"rewards/accuracy_reward": 0.2500000069461142, |
|
"rewards/format_reward": 0.9509548830489317, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 144.6562541325887, |
|
"epoch": 0.48103362597316546, |
|
"grad_norm": 0.4349970519542694, |
|
"kl": 0.9092610677083334, |
|
"learning_rate": 1.2386234538497281e-05, |
|
"loss": 0.0364, |
|
"reward": 1.2122396243115265, |
|
"reward_std": 0.20104571796643236, |
|
"rewards/accuracy_reward": 0.24696181206187853, |
|
"rewards/format_reward": 0.965277798473835, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 191.15668980280557, |
|
"epoch": 0.4850091104853404, |
|
"grad_norm": 0.30329596996307373, |
|
"kl": 0.88134765625, |
|
"learning_rate": 1.2251015469322915e-05, |
|
"loss": 0.0352, |
|
"reward": 1.1736111516753833, |
|
"reward_std": 0.22162295792562267, |
|
"rewards/accuracy_reward": 0.21918403388311467, |
|
"rewards/format_reward": 0.9544271032015482, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 264.59766403834027, |
|
"epoch": 0.48898459499751534, |
|
"grad_norm": 0.29752317070961, |
|
"kl": 0.850341796875, |
|
"learning_rate": 1.2115361435107531e-05, |
|
"loss": 0.034, |
|
"reward": 1.1987847660978634, |
|
"reward_std": 0.23697279867095253, |
|
"rewards/accuracy_reward": 0.25217014578326297, |
|
"rewards/format_reward": 0.9466146007180214, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 265.8745719591777, |
|
"epoch": 0.49296007950969023, |
|
"grad_norm": 0.32228928804397583, |
|
"kl": 0.832763671875, |
|
"learning_rate": 1.1979298648359823e-05, |
|
"loss": 0.0333, |
|
"reward": 1.1922743419806163, |
|
"reward_std": 0.23731949056188265, |
|
"rewards/accuracy_reward": 0.24652778469802192, |
|
"rewards/format_reward": 0.9457465497155985, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 225.2148496309916, |
|
"epoch": 0.4969355640218652, |
|
"grad_norm": 0.3521800637245178, |
|
"kl": 0.93212890625, |
|
"learning_rate": 1.1842853400571972e-05, |
|
"loss": 0.0373, |
|
"reward": 1.17578128973643, |
|
"reward_std": 0.23853578185662627, |
|
"rewards/accuracy_reward": 0.22482639566684762, |
|
"rewards/format_reward": 0.9509548818071684, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 214.73481448491415, |
|
"epoch": 0.5009110485340401, |
|
"grad_norm": 0.30461886525154114, |
|
"kl": 0.8719075520833334, |
|
"learning_rate": 1.1706052057139335e-05, |
|
"loss": 0.0349, |
|
"reward": 1.2374132238328457, |
|
"reward_std": 0.26819697495860356, |
|
"rewards/accuracy_reward": 0.2986111195059493, |
|
"rewards/format_reward": 0.9388021019597849, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 166.19488294919333, |
|
"epoch": 0.504886533046215, |
|
"grad_norm": 0.3792967200279236, |
|
"kl": 0.9239908854166666, |
|
"learning_rate": 1.1568921052265835e-05, |
|
"loss": 0.037, |
|
"reward": 1.2187500335276127, |
|
"reward_std": 0.23409798694774508, |
|
"rewards/accuracy_reward": 0.2708333415600161, |
|
"rewards/format_reward": 0.9479166840513548, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 150.8567752043406, |
|
"epoch": 0.5088620175583899, |
|
"grad_norm": 0.592704176902771, |
|
"kl": 0.9227701822916666, |
|
"learning_rate": 1.1431486883856082e-05, |
|
"loss": 0.0369, |
|
"reward": 1.2452257374922435, |
|
"reward_std": 0.22722656147864959, |
|
"rewards/accuracy_reward": 0.2934027862502262, |
|
"rewards/format_reward": 0.9518229340513548, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 158.66797320048013, |
|
"epoch": 0.5128375020705649, |
|
"grad_norm": 0.3592934310436249, |
|
"kl": 0.9186197916666666, |
|
"learning_rate": 1.1293776108395136e-05, |
|
"loss": 0.0367, |
|
"reward": 1.2261285136143367, |
|
"reward_std": 0.22368450198943415, |
|
"rewards/accuracy_reward": 0.27039931307081133, |
|
"rewards/format_reward": 0.9557291840513548, |
|
"step": 387 |
|
}, |
|
{ |
|
"completion_length": 185.46788756052652, |
|
"epoch": 0.5168129865827398, |
|
"grad_norm": 0.40210244059562683, |
|
"kl": 0.9088541666666666, |
|
"learning_rate": 1.115581533581701e-05, |
|
"loss": 0.0363, |
|
"reward": 1.215711849431197, |
|
"reward_std": 0.24651615732970336, |
|
"rewards/accuracy_reward": 0.26779514624892425, |
|
"rewards/format_reward": 0.9479166890184084, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 170.30729579925537, |
|
"epoch": 0.5207884710949147, |
|
"grad_norm": 0.3423998951911926, |
|
"kl": 0.9156901041666666, |
|
"learning_rate": 1.1017631224362803e-05, |
|
"loss": 0.0366, |
|
"reward": 1.2191840646167595, |
|
"reward_std": 0.23356711654923856, |
|
"rewards/accuracy_reward": 0.26562500702372444, |
|
"rewards/format_reward": 0.9535590472320715, |
|
"step": 393 |
|
}, |
|
{ |
|
"completion_length": 163.50651590029398, |
|
"epoch": 0.5247639556070897, |
|
"grad_norm": 0.4365287721157074, |
|
"kl": 0.9022623697916666, |
|
"learning_rate": 1.0879250475429523e-05, |
|
"loss": 0.0361, |
|
"reward": 1.2296007374922435, |
|
"reward_std": 0.21208147254462043, |
|
"rewards/accuracy_reward": 0.27083334194806713, |
|
"rewards/format_reward": 0.9587673805654049, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 152.78342461585999, |
|
"epoch": 0.5287394401192645, |
|
"grad_norm": 0.38972899317741394, |
|
"kl": 0.9129231770833334, |
|
"learning_rate": 1.0740699828410546e-05, |
|
"loss": 0.0365, |
|
"reward": 1.2404514389733474, |
|
"reward_std": 0.21568205665486553, |
|
"rewards/accuracy_reward": 0.2734375073729704, |
|
"rewards/format_reward": 0.9670139054457346, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.5300646016233228, |
|
"eval_completion_length": 209.7181176944655, |
|
"eval_kl": 0.892936862244898, |
|
"eval_loss": 0.035242632031440735, |
|
"eval_reward": 1.215136090103461, |
|
"eval_reward_std": 0.25851106210326663, |
|
"eval_rewards/accuracy_reward": 0.27763606341821806, |
|
"eval_rewards/format_reward": 0.9375000194627412, |
|
"eval_runtime": 446.558, |
|
"eval_samples_per_second": 0.222, |
|
"eval_steps_per_second": 0.011, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 211.7161521911621, |
|
"epoch": 0.5327149246314394, |
|
"grad_norm": 0.42194342613220215, |
|
"kl": 0.86279296875, |
|
"learning_rate": 1.060200605552876e-05, |
|
"loss": 0.0351, |
|
"reward": 1.2447917014360428, |
|
"reward_std": 0.2783205214655027, |
|
"rewards/accuracy_reward": 0.30598959198687226, |
|
"rewards/format_reward": 0.9388021044433117, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 209.05816570917764, |
|
"epoch": 0.5366904091436144, |
|
"grad_norm": 0.3273554742336273, |
|
"kl": 0.8595377604166666, |
|
"learning_rate": 1.0463195956663339e-05, |
|
"loss": 0.0344, |
|
"reward": 1.2074653158585231, |
|
"reward_std": 0.26641134327898425, |
|
"rewards/accuracy_reward": 0.2643229237680013, |
|
"rewards/format_reward": 0.9431423793236414, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 198.80252281824747, |
|
"epoch": 0.5406658936557893, |
|
"grad_norm": 0.6081684827804565, |
|
"kl": 0.8997395833333334, |
|
"learning_rate": 1.0324296354171209e-05, |
|
"loss": 0.036, |
|
"reward": 1.2209201728304226, |
|
"reward_std": 0.26526342386690277, |
|
"rewards/accuracy_reward": 0.2712673705148821, |
|
"rewards/format_reward": 0.949652798473835, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 189.4709266026815, |
|
"epoch": 0.5446413781679642, |
|
"grad_norm": 0.3419695794582367, |
|
"kl": 0.9253743489583334, |
|
"learning_rate": 1.0185334087704124e-05, |
|
"loss": 0.037, |
|
"reward": 1.252604205161333, |
|
"reward_std": 0.27628890207658213, |
|
"rewards/accuracy_reward": 0.305121536909913, |
|
"rewards/format_reward": 0.9474826554457346, |
|
"step": 411 |
|
}, |
|
{ |
|
"completion_length": 208.34549283981323, |
|
"epoch": 0.5486168626801391, |
|
"grad_norm": 0.2845761775970459, |
|
"kl": 0.9347330729166666, |
|
"learning_rate": 1.0046336009022435e-05, |
|
"loss": 0.0374, |
|
"reward": 1.2300347636143367, |
|
"reward_std": 0.2701789590064436, |
|
"rewards/accuracy_reward": 0.2834201465981702, |
|
"rewards/format_reward": 0.9466146069268385, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 176.7278701464335, |
|
"epoch": 0.5525923471923141, |
|
"grad_norm": 0.30186229944229126, |
|
"kl": 0.9585774739583334, |
|
"learning_rate": 9.907328976806512e-06, |
|
"loss": 0.0384, |
|
"reward": 1.267361145466566, |
|
"reward_std": 0.25099668038698536, |
|
"rewards/accuracy_reward": 0.3120659809404363, |
|
"rewards/format_reward": 0.9552951554457346, |
|
"step": 417 |
|
}, |
|
{ |
|
"completion_length": 181.4904566605886, |
|
"epoch": 0.556567831704489, |
|
"grad_norm": 0.31573575735092163, |
|
"kl": 0.9427083333333334, |
|
"learning_rate": 9.768339851466818e-06, |
|
"loss": 0.0377, |
|
"reward": 1.2165799054006736, |
|
"reward_std": 0.23861535429023206, |
|
"rewards/accuracy_reward": 0.2586805630320062, |
|
"rewards/format_reward": 0.9578993208706379, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 172.80729587872824, |
|
"epoch": 0.5605433162166639, |
|
"grad_norm": 0.3387187421321869, |
|
"kl": 0.9481608072916666, |
|
"learning_rate": 9.62939548995367e-06, |
|
"loss": 0.0379, |
|
"reward": 1.2608507374922435, |
|
"reward_std": 0.2416861488794287, |
|
"rewards/accuracy_reward": 0.30729167473812896, |
|
"rewards/format_reward": 0.9535590472320715, |
|
"step": 423 |
|
}, |
|
{ |
|
"completion_length": 202.40842517217, |
|
"epoch": 0.5645188007288389, |
|
"grad_norm": 0.3616231381893158, |
|
"kl": 0.920166015625, |
|
"learning_rate": 9.490522740567633e-06, |
|
"loss": 0.0368, |
|
"reward": 1.1983507287998993, |
|
"reward_std": 0.2321951068782558, |
|
"rewards/accuracy_reward": 0.24826389597728848, |
|
"rewards/format_reward": 0.950086827079455, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 234.31207275390625, |
|
"epoch": 0.5684942852410138, |
|
"grad_norm": 0.3933253288269043, |
|
"kl": 0.89599609375, |
|
"learning_rate": 9.351748437771615e-06, |
|
"loss": 0.0358, |
|
"reward": 1.2092014277974765, |
|
"reward_std": 0.26475840294733644, |
|
"rewards/accuracy_reward": 0.2638888942698638, |
|
"rewards/format_reward": 0.9453125235935053, |
|
"step": 429 |
|
}, |
|
{ |
|
"completion_length": 219.85721063613892, |
|
"epoch": 0.5724697697531886, |
|
"grad_norm": 0.3269123435020447, |
|
"kl": 0.8291015625, |
|
"learning_rate": 9.213099397005647e-06, |
|
"loss": 0.0332, |
|
"reward": 1.2604166964689891, |
|
"reward_std": 0.24623461983477077, |
|
"rewards/accuracy_reward": 0.3151041733411451, |
|
"rewards/format_reward": 0.9453125211099783, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 187.03993590672812, |
|
"epoch": 0.5764452542653636, |
|
"grad_norm": 0.3639557361602783, |
|
"kl": 0.8619791666666666, |
|
"learning_rate": 9.074602409505293e-06, |
|
"loss": 0.0345, |
|
"reward": 1.2573785136143367, |
|
"reward_std": 0.2650001817382872, |
|
"rewards/accuracy_reward": 0.30512153601739556, |
|
"rewards/format_reward": 0.9522569638987383, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 164.95964018503824, |
|
"epoch": 0.5804207387775385, |
|
"grad_norm": 0.47998958826065063, |
|
"kl": 0.9236653645833334, |
|
"learning_rate": 8.936284237124779e-06, |
|
"loss": 0.0369, |
|
"reward": 1.20616323625048, |
|
"reward_std": 0.20998603710904717, |
|
"rewards/accuracy_reward": 0.24479167334114513, |
|
"rewards/format_reward": 0.9613715435067812, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 165.91406766573587, |
|
"epoch": 0.5843962232897134, |
|
"grad_norm": 0.2889668345451355, |
|
"kl": 0.93359375, |
|
"learning_rate": 8.798171607165779e-06, |
|
"loss": 0.0374, |
|
"reward": 1.1775174016753833, |
|
"reward_std": 0.18780716601759195, |
|
"rewards/accuracy_reward": 0.21831597752558687, |
|
"rewards/format_reward": 0.9592014066874981, |
|
"step": 441 |
|
}, |
|
{ |
|
"completion_length": 154.68273003896078, |
|
"epoch": 0.5883717078018884, |
|
"grad_norm": 0.3764539361000061, |
|
"kl": 0.9204915364583334, |
|
"learning_rate": 8.660291207212883e-06, |
|
"loss": 0.0368, |
|
"reward": 1.2274305820465088, |
|
"reward_std": 0.2107705035402129, |
|
"rewards/accuracy_reward": 0.25954861807016033, |
|
"rewards/format_reward": 0.967881960173448, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 242.51129245758057, |
|
"epoch": 0.5923471923140633, |
|
"grad_norm": 0.4235329031944275, |
|
"kl": 0.8951009114583334, |
|
"learning_rate": 8.52266967997675e-06, |
|
"loss": 0.0358, |
|
"reward": 1.1992187909781933, |
|
"reward_std": 0.2926396271989991, |
|
"rewards/accuracy_reward": 0.27473959047347307, |
|
"rewards/format_reward": 0.9244791890184084, |
|
"step": 447 |
|
}, |
|
{ |
|
"completion_length": 216.438809633255, |
|
"epoch": 0.5963226768262382, |
|
"grad_norm": 0.5363680124282837, |
|
"kl": 0.9195963541666666, |
|
"learning_rate": 8.385333618145896e-06, |
|
"loss": 0.0368, |
|
"reward": 1.1462673942248027, |
|
"reward_std": 0.27518284460529685, |
|
"rewards/accuracy_reward": 0.21397570016173026, |
|
"rewards/format_reward": 0.9322916840513548, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 119.86154862244923, |
|
"epoch": 0.6002981613384131, |
|
"grad_norm": 0.5320878028869629, |
|
"kl": 0.965087890625, |
|
"learning_rate": 8.248309559248203e-06, |
|
"loss": 0.0386, |
|
"reward": 1.18619795764486, |
|
"reward_std": 0.20566960889846087, |
|
"rewards/accuracy_reward": 0.22135417337995023, |
|
"rewards/format_reward": 0.9648437711099783, |
|
"step": 453 |
|
}, |
|
{ |
|
"completion_length": 98.42144385973613, |
|
"epoch": 0.6042736458505881, |
|
"grad_norm": 0.4586585760116577, |
|
"kl": 0.9525553385416666, |
|
"learning_rate": 8.111623980523036e-06, |
|
"loss": 0.0381, |
|
"reward": 1.2638889191051323, |
|
"reward_std": 0.20143946547371647, |
|
"rewards/accuracy_reward": 0.2899305631484215, |
|
"rewards/format_reward": 0.9739583469927311, |
|
"step": 456 |
|
}, |
|
{ |
|
"completion_length": 153.68880653381348, |
|
"epoch": 0.608249130362763, |
|
"grad_norm": 5.601478576660156, |
|
"kl": 1.3423665364583333, |
|
"learning_rate": 7.975303293805036e-06, |
|
"loss": 0.0537, |
|
"reward": 1.2421875384946663, |
|
"reward_std": 0.24943431583233178, |
|
"rewards/accuracy_reward": 0.29513889698622126, |
|
"rewards/format_reward": 0.9470486293236414, |
|
"step": 459 |
|
}, |
|
{ |
|
"completion_length": 172.61806122461954, |
|
"epoch": 0.6122246148749378, |
|
"grad_norm": 0.6199188828468323, |
|
"kl": 0.9340006510416666, |
|
"learning_rate": 7.839373840420555e-06, |
|
"loss": 0.0374, |
|
"reward": 1.1848958780368168, |
|
"reward_std": 0.270951366595303, |
|
"rewards/accuracy_reward": 0.24435764636533955, |
|
"rewards/format_reward": 0.9405382138987383, |
|
"step": 462 |
|
}, |
|
{ |
|
"completion_length": 216.99045578638712, |
|
"epoch": 0.6162000993871128, |
|
"grad_norm": 22.460529327392578, |
|
"kl": 0.9293619791666666, |
|
"learning_rate": 7.70386188609769e-06, |
|
"loss": 0.0372, |
|
"reward": 1.2044271193444729, |
|
"reward_std": 0.2865686761215329, |
|
"rewards/accuracy_reward": 0.2812500084207083, |
|
"rewards/format_reward": 0.9231771019597849, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 200.38672375679016, |
|
"epoch": 0.6201755838992877, |
|
"grad_norm": 4.971067428588867, |
|
"kl": 0.9654947916666666, |
|
"learning_rate": 7.568793615890955e-06, |
|
"loss": 0.0386, |
|
"reward": 1.170138926555713, |
|
"reward_std": 0.28953606037733454, |
|
"rewards/accuracy_reward": 0.24869792345756045, |
|
"rewards/format_reward": 0.9214409912625948, |
|
"step": 468 |
|
}, |
|
{ |
|
"completion_length": 263.26172574361163, |
|
"epoch": 0.6241510684114626, |
|
"grad_norm": 19461.30859375, |
|
"kl": 6020.59716796875, |
|
"learning_rate": 7.434195129121517e-06, |
|
"loss": 241.5018, |
|
"reward": 1.1197916927436988, |
|
"reward_std": 0.35640866014485556, |
|
"rewards/accuracy_reward": 0.2530382012870784, |
|
"rewards/format_reward": 0.8667534949878851, |
|
"step": 471 |
|
}, |
|
{ |
|
"completion_length": 157.41016141573587, |
|
"epoch": 0.6281265529236376, |
|
"grad_norm": 15.363752365112305, |
|
"kl": 54.584309895833336, |
|
"learning_rate": 7.300092434334021e-06, |
|
"loss": 2.1851, |
|
"reward": 1.2139757387340069, |
|
"reward_std": 0.2608258535619825, |
|
"rewards/accuracy_reward": 0.27473959086152416, |
|
"rewards/format_reward": 0.9392361330489317, |
|
"step": 474 |
|
}, |
|
{ |
|
"completion_length": 188.84332064787546, |
|
"epoch": 0.6321020374358125, |
|
"grad_norm": 3.5809130668640137, |
|
"kl": 1.1976725260416667, |
|
"learning_rate": 7.166511444270924e-06, |
|
"loss": 0.0479, |
|
"reward": 1.2309028158585231, |
|
"reward_std": 0.26117177587002516, |
|
"rewards/accuracy_reward": 0.2916666743112728, |
|
"rewards/format_reward": 0.9392361268401146, |
|
"step": 477 |
|
}, |
|
{ |
|
"completion_length": 160.72222622235617, |
|
"epoch": 0.6360775219479874, |
|
"grad_norm": 3.082725763320923, |
|
"kl": 1.359375, |
|
"learning_rate": 7.033477970865381e-06, |
|
"loss": 0.0544, |
|
"reward": 1.2621528171002865, |
|
"reward_std": 0.24424838298000395, |
|
"rewards/accuracy_reward": 0.30772570373180014, |
|
"rewards/format_reward": 0.9544271032015482, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 172.35720992088318, |
|
"epoch": 0.6400530064601623, |
|
"grad_norm": 6.727673053741455, |
|
"kl": 3.7775065104166665, |
|
"learning_rate": 6.901017720253583e-06, |
|
"loss": 0.151, |
|
"reward": 1.1987847561637561, |
|
"reward_std": 0.24734753215064606, |
|
"rewards/accuracy_reward": 0.25824653392191976, |
|
"rewards/format_reward": 0.940538210173448, |
|
"step": 483 |
|
}, |
|
{ |
|
"completion_length": 138.64713939030966, |
|
"epoch": 0.6440284909723373, |
|
"grad_norm": 28.416213989257812, |
|
"kl": 1.9168294270833333, |
|
"learning_rate": 6.769156287807539e-06, |
|
"loss": 0.0767, |
|
"reward": 1.2269965621332328, |
|
"reward_std": 0.21713009189503887, |
|
"rewards/accuracy_reward": 0.27170139527879655, |
|
"rewards/format_reward": 0.9552951566874981, |
|
"step": 486 |
|
}, |
|
{ |
|
"completion_length": 111.01085392634074, |
|
"epoch": 0.6480039754845122, |
|
"grad_norm": 28.046361923217773, |
|
"kl": 1.4402669270833333, |
|
"learning_rate": 6.637919153189279e-06, |
|
"loss": 0.0576, |
|
"reward": 1.2903646230697632, |
|
"reward_std": 0.2030498057914277, |
|
"rewards/accuracy_reward": 0.33203126047737896, |
|
"rewards/format_reward": 0.9583333519597849, |
|
"step": 489 |
|
}, |
|
{ |
|
"completion_length": 93.8329017162323, |
|
"epoch": 0.651979459996687, |
|
"grad_norm": 5.583730697631836, |
|
"kl": 1.2568359375, |
|
"learning_rate": 6.507331675427388e-06, |
|
"loss": 0.0503, |
|
"reward": 1.2226562922199566, |
|
"reward_std": 0.2042010520429661, |
|
"rewards/accuracy_reward": 0.2604166743500779, |
|
"rewards/format_reward": 0.9622396032015482, |
|
"step": 492 |
|
}, |
|
{ |
|
"completion_length": 88.26736386617024, |
|
"epoch": 0.655954944508862, |
|
"grad_norm": 12.801457405090332, |
|
"kl": 1.261962890625, |
|
"learning_rate": 6.3774190880168804e-06, |
|
"loss": 0.0505, |
|
"reward": 1.2973090683420498, |
|
"reward_std": 0.23440878558903933, |
|
"rewards/accuracy_reward": 0.33289931528270245, |
|
"rewards/format_reward": 0.9644097437461218, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 90.45833583672841, |
|
"epoch": 0.6599304290210369, |
|
"grad_norm": 1334.54296875, |
|
"kl": 12.92529296875, |
|
"learning_rate": 6.248206494043313e-06, |
|
"loss": 0.5176, |
|
"reward": 1.269097267339627, |
|
"reward_std": 0.1979171479276071, |
|
"rewards/accuracy_reward": 0.30164931431257475, |
|
"rewards/format_reward": 0.9674479365348816, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.6625807520291536, |
|
"eval_completion_length": 106.54038769648625, |
|
"eval_kl": 8.832752403846154, |
|
"eval_loss": 0.2197878211736679, |
|
"eval_reward": 1.2631410598754882, |
|
"eval_reward_std": 0.20897178661364776, |
|
"eval_rewards/accuracy_reward": 0.3028846269903275, |
|
"eval_rewards/format_reward": 0.9602564261509822, |
|
"eval_runtime": 284.6155, |
|
"eval_samples_per_second": 0.348, |
|
"eval_steps_per_second": 0.018, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 137.89062881469727, |
|
"epoch": 0.6639059135332118, |
|
"grad_norm": 196.66842651367188, |
|
"kl": 5.895263671875, |
|
"learning_rate": 6.119718861332098e-06, |
|
"loss": 0.4084, |
|
"reward": 1.3242188021540642, |
|
"reward_std": 0.24511273042298853, |
|
"rewards/accuracy_reward": 0.3710937574505806, |
|
"rewards/format_reward": 0.9531250298023224, |
|
"step": 501 |
|
}, |
|
{ |
|
"completion_length": 101.93186076482137, |
|
"epoch": 0.6678813980453868, |
|
"grad_norm": 4.593560218811035, |
|
"kl": 2.0084635416666665, |
|
"learning_rate": 5.9919810176239554e-06, |
|
"loss": 0.0804, |
|
"reward": 1.2803819825251896, |
|
"reward_std": 0.22951093905915818, |
|
"rewards/accuracy_reward": 0.3185763976459081, |
|
"rewards/format_reward": 0.9618055758376917, |
|
"step": 504 |
|
}, |
|
{ |
|
"completion_length": 98.08246823151906, |
|
"epoch": 0.6718568825575617, |
|
"grad_norm": 36.8542594909668, |
|
"kl": 1.6246744791666667, |
|
"learning_rate": 5.86501764577744e-06, |
|
"loss": 0.065, |
|
"reward": 1.2582465658585231, |
|
"reward_std": 0.20490265979121128, |
|
"rewards/accuracy_reward": 0.2903645906674986, |
|
"rewards/format_reward": 0.9678819614152113, |
|
"step": 507 |
|
}, |
|
{ |
|
"completion_length": 125.86849367618561, |
|
"epoch": 0.6758323670697366, |
|
"grad_norm": 31.712203979492188, |
|
"kl": 2.9173990885416665, |
|
"learning_rate": 5.7388532789994476e-06, |
|
"loss": 0.1167, |
|
"reward": 1.2304687934617202, |
|
"reward_std": 0.23319136871335408, |
|
"rewards/accuracy_reward": 0.27473959159882116, |
|
"rewards/format_reward": 0.9557291840513548, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 108.07986442248027, |
|
"epoch": 0.6798078515819116, |
|
"grad_norm": 19.656137466430664, |
|
"kl": 2.4375, |
|
"learning_rate": 5.613512296104663e-06, |
|
"loss": 0.0974, |
|
"reward": 1.2282986529171467, |
|
"reward_std": 0.1991276788370063, |
|
"rewards/accuracy_reward": 0.26388889578326297, |
|
"rewards/format_reward": 0.964409738779068, |
|
"step": 513 |
|
}, |
|
{ |
|
"completion_length": 120.73264233271281, |
|
"epoch": 0.6837833360940865, |
|
"grad_norm": 9.04715633392334, |
|
"kl": 1.5470377604166667, |
|
"learning_rate": 5.489018916804813e-06, |
|
"loss": 0.0619, |
|
"reward": 1.281250045945247, |
|
"reward_std": 0.22288222153050205, |
|
"rewards/accuracy_reward": 0.3198784813284874, |
|
"rewards/format_reward": 0.9613715497155985, |
|
"step": 516 |
|
}, |
|
{ |
|
"completion_length": 113.39974268277486, |
|
"epoch": 0.6877588206062614, |
|
"grad_norm": 2.3152172565460205, |
|
"kl": 1.3323567708333333, |
|
"learning_rate": 5.365397197028686e-06, |
|
"loss": 0.0533, |
|
"reward": 1.2721354613701503, |
|
"reward_std": 0.19468989650097987, |
|
"rewards/accuracy_reward": 0.30338542551423114, |
|
"rewards/format_reward": 0.968750017384688, |
|
"step": 519 |
|
}, |
|
{ |
|
"completion_length": 143.59983134269714, |
|
"epoch": 0.6917343051184363, |
|
"grad_norm": 12.121291160583496, |
|
"kl": 1.47412109375, |
|
"learning_rate": 5.242671024273798e-06, |
|
"loss": 0.059, |
|
"reward": 1.2456597636143367, |
|
"reward_std": 0.25431135304582614, |
|
"rewards/accuracy_reward": 0.29296875729536015, |
|
"rewards/format_reward": 0.9526909900208315, |
|
"step": 522 |
|
}, |
|
{ |
|
"completion_length": 121.45529794692993, |
|
"epoch": 0.6957097896306113, |
|
"grad_norm": 3.243786334991455, |
|
"kl": 1.41357421875, |
|
"learning_rate": 5.120864112990569e-06, |
|
"loss": 0.0566, |
|
"reward": 1.2443576753139496, |
|
"reward_std": 0.20600120699964464, |
|
"rewards/accuracy_reward": 0.28125000714013976, |
|
"rewards/format_reward": 0.9631076554457346, |
|
"step": 525 |
|
}, |
|
{ |
|
"completion_length": 141.91970892747244, |
|
"epoch": 0.6996852741427861, |
|
"grad_norm": 6.4455437660217285, |
|
"kl": 1.6841634114583333, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.0674, |
|
"reward": 1.2152778096497059, |
|
"reward_std": 0.23748167790472507, |
|
"rewards/accuracy_reward": 0.2604166748933494, |
|
"rewards/format_reward": 0.9548611342906952, |
|
"step": 528 |
|
}, |
|
{ |
|
"completion_length": 145.10503919919333, |
|
"epoch": 0.703660758654961, |
|
"grad_norm": 4.077893257141113, |
|
"kl": 1.8653971354166667, |
|
"learning_rate": 4.880102039945625e-06, |
|
"loss": 0.0746, |
|
"reward": 1.2673611491918564, |
|
"reward_std": 0.22801773723525307, |
|
"rewards/accuracy_reward": 0.3116319513646886, |
|
"rewards/format_reward": 0.9557291865348816, |
|
"step": 531 |
|
}, |
|
{ |
|
"completion_length": 153.23568006356558, |
|
"epoch": 0.707636243167136, |
|
"grad_norm": 2.3837387561798096, |
|
"kl": 1.5, |
|
"learning_rate": 4.761193400780667e-06, |
|
"loss": 0.06, |
|
"reward": 1.25694448625048, |
|
"reward_std": 0.24314528051763773, |
|
"rewards/accuracy_reward": 0.2999132027228673, |
|
"rewards/format_reward": 0.9570312711099783, |
|
"step": 534 |
|
}, |
|
{ |
|
"completion_length": 167.8263931274414, |
|
"epoch": 0.7116117276793109, |
|
"grad_norm": 4.202811241149902, |
|
"kl": 2.5416666666666665, |
|
"learning_rate": 4.643297059291303e-06, |
|
"loss": 0.1017, |
|
"reward": 1.2248264277974765, |
|
"reward_std": 0.27370192063972354, |
|
"rewards/accuracy_reward": 0.27604167559184134, |
|
"rewards/format_reward": 0.9487847425043583, |
|
"step": 537 |
|
}, |
|
{ |
|
"completion_length": 182.7339456876119, |
|
"epoch": 0.7155872121914858, |
|
"grad_norm": 14.95860481262207, |
|
"kl": 2.8761393229166665, |
|
"learning_rate": 4.52643579665683e-06, |
|
"loss": 0.1151, |
|
"reward": 1.2421875409781933, |
|
"reward_std": 0.25901925152478117, |
|
"rewards/accuracy_reward": 0.2981770924137284, |
|
"rewards/format_reward": 0.9440104328095913, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 154.5321224530538, |
|
"epoch": 0.7195626967036608, |
|
"grad_norm": 3.4772720336914062, |
|
"kl": 2.1795247395833335, |
|
"learning_rate": 4.410632194047652e-06, |
|
"loss": 0.0872, |
|
"reward": 1.2413194812834263, |
|
"reward_std": 0.2310507068565736, |
|
"rewards/accuracy_reward": 0.2834201504010707, |
|
"rewards/format_reward": 0.9578993258376917, |
|
"step": 543 |
|
}, |
|
{ |
|
"completion_length": 183.41537022590637, |
|
"epoch": 0.7235381812158357, |
|
"grad_norm": 5.352535247802734, |
|
"kl": 2.7540690104166665, |
|
"learning_rate": 4.29590862826191e-06, |
|
"loss": 0.1102, |
|
"reward": 1.2369792026778061, |
|
"reward_std": 0.24117931607179344, |
|
"rewards/accuracy_reward": 0.2934027863666415, |
|
"rewards/format_reward": 0.9435764091710249, |
|
"step": 546 |
|
}, |
|
{ |
|
"completion_length": 169.5039111773173, |
|
"epoch": 0.7275136657280106, |
|
"grad_norm": 2.4602179527282715, |
|
"kl": 1.9309895833333333, |
|
"learning_rate": 4.182287267401587e-06, |
|
"loss": 0.0772, |
|
"reward": 1.2404514315227668, |
|
"reward_std": 0.22196716310766837, |
|
"rewards/accuracy_reward": 0.2877604255530362, |
|
"rewards/format_reward": 0.9526909925043583, |
|
"step": 549 |
|
}, |
|
{ |
|
"completion_length": 147.87847610314688, |
|
"epoch": 0.7314891502401856, |
|
"grad_norm": 3.658947229385376, |
|
"kl": 2.431640625, |
|
"learning_rate": 4.069790066588966e-06, |
|
"loss": 0.0972, |
|
"reward": 1.3168403158585231, |
|
"reward_std": 0.23325985188906392, |
|
"rewards/accuracy_reward": 0.3589409807464108, |
|
"rewards/format_reward": 0.9578993233541647, |
|
"step": 552 |
|
}, |
|
{ |
|
"completion_length": 148.14410118261972, |
|
"epoch": 0.7354646347523605, |
|
"grad_norm": 2.160740613937378, |
|
"kl": 1.542236328125, |
|
"learning_rate": 3.9584387637242235e-06, |
|
"loss": 0.0617, |
|
"reward": 1.2235243457059066, |
|
"reward_std": 0.23132954825026295, |
|
"rewards/accuracy_reward": 0.2647569504721711, |
|
"rewards/format_reward": 0.9587673780818781, |
|
"step": 555 |
|
}, |
|
{ |
|
"completion_length": 169.01172391573587, |
|
"epoch": 0.7394401192645353, |
|
"grad_norm": 13.349943161010742, |
|
"kl": 2.2926432291666665, |
|
"learning_rate": 3.848254875285e-06, |
|
"loss": 0.0917, |
|
"reward": 1.197048647950093, |
|
"reward_std": 0.2103662300699701, |
|
"rewards/accuracy_reward": 0.2460937569849193, |
|
"rewards/format_reward": 0.9509548842906952, |
|
"step": 558 |
|
}, |
|
{ |
|
"completion_length": 157.60981353123984, |
|
"epoch": 0.7434156037767102, |
|
"grad_norm": 2.4742820262908936, |
|
"kl": 1.6537272135416667, |
|
"learning_rate": 3.739259692168764e-06, |
|
"loss": 0.0662, |
|
"reward": 1.229166705161333, |
|
"reward_std": 0.2528001538012177, |
|
"rewards/accuracy_reward": 0.27560764621011913, |
|
"rewards/format_reward": 0.9535590459903082, |
|
"step": 561 |
|
}, |
|
{ |
|
"completion_length": 154.8311678568522, |
|
"epoch": 0.7473910882888852, |
|
"grad_norm": 1.9129363298416138, |
|
"kl": 1.6079915364583333, |
|
"learning_rate": 3.6314742755787537e-06, |
|
"loss": 0.0643, |
|
"reward": 1.2261285024384658, |
|
"reward_std": 0.22889205797885856, |
|
"rewards/accuracy_reward": 0.27300348059119034, |
|
"rewards/format_reward": 0.9531250235935053, |
|
"step": 564 |
|
}, |
|
{ |
|
"completion_length": 151.18403148651123, |
|
"epoch": 0.7513665728010601, |
|
"grad_norm": 1.2694976329803467, |
|
"kl": 1.6017252604166667, |
|
"learning_rate": 3.524919452954314e-06, |
|
"loss": 0.064, |
|
"reward": 1.2248264377315838, |
|
"reward_std": 0.23556000289196768, |
|
"rewards/accuracy_reward": 0.26692709055108327, |
|
"rewards/format_reward": 0.9578993320465088, |
|
"step": 567 |
|
}, |
|
{ |
|
"completion_length": 154.80859859784445, |
|
"epoch": 0.755342057313235, |
|
"grad_norm": 1.4956895112991333, |
|
"kl": 1.427734375, |
|
"learning_rate": 3.419615813946392e-06, |
|
"loss": 0.0571, |
|
"reward": 1.1974826753139496, |
|
"reward_std": 0.23270095341528454, |
|
"rewards/accuracy_reward": 0.23784722872854522, |
|
"rewards/format_reward": 0.9596354352931181, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 169.7625904083252, |
|
"epoch": 0.75931754182541, |
|
"grad_norm": 0.8634160161018372, |
|
"kl": 1.3221028645833333, |
|
"learning_rate": 3.315583706438994e-06, |
|
"loss": 0.0529, |
|
"reward": 1.2278646193444729, |
|
"reward_std": 0.24653864566547176, |
|
"rewards/accuracy_reward": 0.27734375892517465, |
|
"rewards/format_reward": 0.9505208519597849, |
|
"step": 573 |
|
}, |
|
{ |
|
"completion_length": 157.0377644697825, |
|
"epoch": 0.7632930263375849, |
|
"grad_norm": 5.6404571533203125, |
|
"kl": 1.3050944010416667, |
|
"learning_rate": 3.212843232617343e-06, |
|
"loss": 0.0522, |
|
"reward": 1.2322048942248027, |
|
"reward_std": 0.21434197838728627, |
|
"rewards/accuracy_reward": 0.2738715362114211, |
|
"rewards/format_reward": 0.9583333519597849, |
|
"step": 576 |
|
}, |
|
{ |
|
"completion_length": 149.35286871592203, |
|
"epoch": 0.7672685108497598, |
|
"grad_norm": 1.4161450862884521, |
|
"kl": 1.1617838541666667, |
|
"learning_rate": 3.1114142450835296e-06, |
|
"loss": 0.0465, |
|
"reward": 1.2365451728304226, |
|
"reward_std": 0.2225903740618378, |
|
"rewards/accuracy_reward": 0.2730034806688006, |
|
"rewards/format_reward": 0.9635416815678278, |
|
"step": 579 |
|
}, |
|
{ |
|
"completion_length": 164.2899361451467, |
|
"epoch": 0.7712439953619348, |
|
"grad_norm": 2.1099228858947754, |
|
"kl": 1.1534016927083333, |
|
"learning_rate": 3.0113163430203775e-06, |
|
"loss": 0.0461, |
|
"reward": 1.2417535074055195, |
|
"reward_std": 0.2334075498705109, |
|
"rewards/accuracy_reward": 0.2873263977235183, |
|
"rewards/format_reward": 0.9544271044433117, |
|
"step": 582 |
|
}, |
|
{ |
|
"completion_length": 166.99479564030966, |
|
"epoch": 0.7752194798741097, |
|
"grad_norm": 0.6481562256813049, |
|
"kl": 1.1204427083333333, |
|
"learning_rate": 2.912568868404284e-06, |
|
"loss": 0.0448, |
|
"reward": 1.2539062947034836, |
|
"reward_std": 0.2460917371014754, |
|
"rewards/accuracy_reward": 0.2977430645842105, |
|
"rewards/format_reward": 0.9561632151405016, |
|
"step": 585 |
|
}, |
|
{ |
|
"completion_length": 142.4097265402476, |
|
"epoch": 0.7791949643862845, |
|
"grad_norm": 0.6822313070297241, |
|
"kl": 1.0983072916666667, |
|
"learning_rate": 2.815190902267757e-06, |
|
"loss": 0.0439, |
|
"reward": 1.2465278084079425, |
|
"reward_std": 0.21192065292658904, |
|
"rewards/accuracy_reward": 0.27864584055108327, |
|
"rewards/format_reward": 0.9678819651405016, |
|
"step": 588 |
|
}, |
|
{ |
|
"completion_length": 172.7829921245575, |
|
"epoch": 0.7831704488984595, |
|
"grad_norm": 1.076019525527954, |
|
"kl": 1.0126953125, |
|
"learning_rate": 2.7192012610123777e-06, |
|
"loss": 0.0405, |
|
"reward": 1.2855903171002865, |
|
"reward_std": 0.22700861329212785, |
|
"rewards/accuracy_reward": 0.3285590385397275, |
|
"rewards/format_reward": 0.957031267384688, |
|
"step": 591 |
|
}, |
|
{ |
|
"completion_length": 179.41103037198386, |
|
"epoch": 0.7871459334106344, |
|
"grad_norm": 0.7347291707992554, |
|
"kl": 1.072265625, |
|
"learning_rate": 2.6246184927728913e-06, |
|
"loss": 0.0429, |
|
"reward": 1.2395833755532901, |
|
"reward_std": 0.22892415950385234, |
|
"rewards/accuracy_reward": 0.2808159793494269, |
|
"rewards/format_reward": 0.9587673818071684, |
|
"step": 594 |
|
}, |
|
{ |
|
"completion_length": 190.89887682596842, |
|
"epoch": 0.7911214179228093, |
|
"grad_norm": 1.5825515985488892, |
|
"kl": 1.0482584635416667, |
|
"learning_rate": 2.5314608738331535e-06, |
|
"loss": 0.0419, |
|
"reward": 1.2456597586472828, |
|
"reward_std": 0.24149028413618603, |
|
"rewards/accuracy_reward": 0.29427084256894886, |
|
"rewards/format_reward": 0.9513889116545519, |
|
"step": 597 |
|
}, |
|
{ |
|
"completion_length": 175.54731305440268, |
|
"epoch": 0.7950969024349842, |
|
"grad_norm": 0.6331008672714233, |
|
"kl": 1.0225423177083333, |
|
"learning_rate": 2.4397464050945753e-06, |
|
"loss": 0.0409, |
|
"reward": 1.2434896143774192, |
|
"reward_std": 0.2312415634126713, |
|
"rewards/accuracy_reward": 0.2873263991593073, |
|
"rewards/format_reward": 0.9561632089316845, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7950969024349842, |
|
"eval_completion_length": 166.41919604214755, |
|
"eval_kl": 1.0108901515151516, |
|
"eval_loss": 0.040223389863967896, |
|
"eval_reward": 1.2853535666610256, |
|
"eval_reward_std": 0.22720548510551453, |
|
"eval_rewards/accuracy_reward": 0.32449495679501333, |
|
"eval_rewards/format_reward": 0.9608586051247336, |
|
"eval_runtime": 432.7167, |
|
"eval_samples_per_second": 0.229, |
|
"eval_steps_per_second": 0.012, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 197.4709267616272, |
|
"epoch": 0.7990723869471592, |
|
"grad_norm": 3.598181962966919, |
|
"kl": 1.1375325520833333, |
|
"learning_rate": 2.3494928085978073e-06, |
|
"loss": 0.0455, |
|
"reward": 1.241319480041663, |
|
"reward_std": 0.2442009438915799, |
|
"rewards/accuracy_reward": 0.2921006998512894, |
|
"rewards/format_reward": 0.949218769868215, |
|
"step": 603 |
|
}, |
|
{ |
|
"completion_length": 171.99479659398398, |
|
"epoch": 0.8030478714593341, |
|
"grad_norm": 1.0907797813415527, |
|
"kl": 0.995849609375, |
|
"learning_rate": 2.2607175240983027e-06, |
|
"loss": 0.0399, |
|
"reward": 1.2322048917412758, |
|
"reward_std": 0.2378006634923319, |
|
"rewards/accuracy_reward": 0.2786458421420927, |
|
"rewards/format_reward": 0.9535590509573618, |
|
"step": 606 |
|
}, |
|
{ |
|
"completion_length": 151.64627146720886, |
|
"epoch": 0.807023355971509, |
|
"grad_norm": 15.526721954345703, |
|
"kl": 1.0755208333333333, |
|
"learning_rate": 2.1734377056964175e-06, |
|
"loss": 0.043, |
|
"reward": 1.2387153183420498, |
|
"reward_std": 0.21919091992701092, |
|
"rewards/accuracy_reward": 0.2721354246993239, |
|
"rewards/format_reward": 0.9665798768401146, |
|
"step": 609 |
|
}, |
|
{ |
|
"completion_length": 180.01128919919333, |
|
"epoch": 0.810998840483684, |
|
"grad_norm": 0.6363082528114319, |
|
"kl": 1.0391438802083333, |
|
"learning_rate": 2.087670218522714e-06, |
|
"loss": 0.0416, |
|
"reward": 1.210069477558136, |
|
"reward_std": 0.2583714901314427, |
|
"rewards/accuracy_reward": 0.25824653725915897, |
|
"rewards/format_reward": 0.9518229365348816, |
|
"step": 612 |
|
}, |
|
{ |
|
"completion_length": 134.2313413619995, |
|
"epoch": 0.8149743249958589, |
|
"grad_norm": 0.5771492123603821, |
|
"kl": 0.9965006510416666, |
|
"learning_rate": 2.0034316354791062e-06, |
|
"loss": 0.0398, |
|
"reward": 1.2651910136143367, |
|
"reward_std": 0.1995284124277532, |
|
"rewards/accuracy_reward": 0.2968750091968104, |
|
"rewards/format_reward": 0.9683159900208315, |
|
"step": 615 |
|
}, |
|
{ |
|
"completion_length": 145.0091195901235, |
|
"epoch": 0.8189498095080338, |
|
"grad_norm": 2.458407163619995, |
|
"kl": 1.0079752604166667, |
|
"learning_rate": 1.920738234036463e-06, |
|
"loss": 0.0403, |
|
"reward": 1.28298615415891, |
|
"reward_std": 0.21225994320896766, |
|
"rewards/accuracy_reward": 0.3133680646618207, |
|
"rewards/format_reward": 0.9696180721124014, |
|
"step": 618 |
|
}, |
|
{ |
|
"completion_length": 157.43186203638712, |
|
"epoch": 0.8229252940202088, |
|
"grad_norm": 0.5727205276489258, |
|
"kl": 1.0079752604166667, |
|
"learning_rate": 1.8396059930893073e-06, |
|
"loss": 0.0403, |
|
"reward": 1.2170139253139496, |
|
"reward_std": 0.21727207908406854, |
|
"rewards/accuracy_reward": 0.25130209086152416, |
|
"rewards/format_reward": 0.9657118258376917, |
|
"step": 621 |
|
}, |
|
{ |
|
"completion_length": 156.4114625453949, |
|
"epoch": 0.8269007785323836, |
|
"grad_norm": 0.5636825561523438, |
|
"kl": 0.9955240885416666, |
|
"learning_rate": 1.7600505898681996e-06, |
|
"loss": 0.0398, |
|
"reward": 1.2300347586472828, |
|
"reward_std": 0.23207383013019958, |
|
"rewards/accuracy_reward": 0.26649306435137987, |
|
"rewards/format_reward": 0.9635416877766451, |
|
"step": 624 |
|
}, |
|
{ |
|
"completion_length": 154.10373576482138, |
|
"epoch": 0.8308762630445585, |
|
"grad_norm": 3.9658546447753906, |
|
"kl": 1.0211588541666667, |
|
"learning_rate": 1.6820873969104223e-06, |
|
"loss": 0.0409, |
|
"reward": 1.2500000434617202, |
|
"reward_std": 0.22843335390401384, |
|
"rewards/accuracy_reward": 0.2808159825702508, |
|
"rewards/format_reward": 0.9691840459903082, |
|
"step": 627 |
|
}, |
|
{ |
|
"completion_length": 179.8810822168986, |
|
"epoch": 0.8348517475567335, |
|
"grad_norm": 0.8975684642791748, |
|
"kl": 0.9754231770833334, |
|
"learning_rate": 1.605731479089534e-06, |
|
"loss": 0.039, |
|
"reward": 1.270833384245634, |
|
"reward_std": 0.2560514376188318, |
|
"rewards/accuracy_reward": 0.3168402878024305, |
|
"rewards/format_reward": 0.9539930745959282, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 191.86068240801492, |
|
"epoch": 0.8388272320689084, |
|
"grad_norm": 1.0552254915237427, |
|
"kl": 1.05908203125, |
|
"learning_rate": 1.530997590704375e-06, |
|
"loss": 0.0424, |
|
"reward": 1.223524338255326, |
|
"reward_std": 0.24293402349576354, |
|
"rewards/accuracy_reward": 0.2669270930734153, |
|
"rewards/format_reward": 0.9565972437461218, |
|
"step": 633 |
|
}, |
|
{ |
|
"completion_length": 201.91189877192178, |
|
"epoch": 0.8428027165810833, |
|
"grad_norm": 1.9689509868621826, |
|
"kl": 1.109130859375, |
|
"learning_rate": 1.4579001726280828e-06, |
|
"loss": 0.0444, |
|
"reward": 1.2560764352480571, |
|
"reward_std": 0.24747123545967042, |
|
"rewards/accuracy_reward": 0.299479175475426, |
|
"rewards/format_reward": 0.9565972400208315, |
|
"step": 636 |
|
}, |
|
{ |
|
"completion_length": 201.5238777001699, |
|
"epoch": 0.8467782010932582, |
|
"grad_norm": 0.8104230761528015, |
|
"kl": 1.052734375, |
|
"learning_rate": 1.386453349517679e-06, |
|
"loss": 0.0421, |
|
"reward": 1.2391493332882721, |
|
"reward_std": 0.24252263192708293, |
|
"rewards/accuracy_reward": 0.281684036909913, |
|
"rewards/format_reward": 0.9574652922650179, |
|
"step": 639 |
|
}, |
|
{ |
|
"completion_length": 170.00868566830954, |
|
"epoch": 0.8507536856054332, |
|
"grad_norm": 0.7141380310058594, |
|
"kl": 0.9737955729166666, |
|
"learning_rate": 1.316670927084751e-06, |
|
"loss": 0.039, |
|
"reward": 1.2630208705862362, |
|
"reward_std": 0.23810221177215377, |
|
"rewards/accuracy_reward": 0.29817709152121097, |
|
"rewards/format_reward": 0.9648437735935053, |
|
"step": 642 |
|
}, |
|
{ |
|
"completion_length": 187.43533500035605, |
|
"epoch": 0.8547291701176081, |
|
"grad_norm": 0.9845206141471863, |
|
"kl": 1.1171061197916667, |
|
"learning_rate": 1.2485663894277611e-06, |
|
"loss": 0.0447, |
|
"reward": 1.2730035160978634, |
|
"reward_std": 0.216334043458725, |
|
"rewards/accuracy_reward": 0.3094618124887347, |
|
"rewards/format_reward": 0.9635416865348816, |
|
"step": 645 |
|
}, |
|
{ |
|
"completion_length": 169.31076955795288, |
|
"epoch": 0.858704654629783, |
|
"grad_norm": 0.9538066387176514, |
|
"kl": 0.978271484375, |
|
"learning_rate": 1.182152896426515e-06, |
|
"loss": 0.0391, |
|
"reward": 1.281250037252903, |
|
"reward_std": 0.24271480288977423, |
|
"rewards/accuracy_reward": 0.31597222907779116, |
|
"rewards/format_reward": 0.9652777972320715, |
|
"step": 648 |
|
}, |
|
{ |
|
"completion_length": 184.18012634913126, |
|
"epoch": 0.862680139141958, |
|
"grad_norm": 0.5063804388046265, |
|
"kl": 0.9745279947916666, |
|
"learning_rate": 1.1174432811992686e-06, |
|
"loss": 0.039, |
|
"reward": 1.24609378973643, |
|
"reward_std": 0.21818942956936857, |
|
"rewards/accuracy_reward": 0.2821180628379807, |
|
"rewards/format_reward": 0.9639757126569748, |
|
"step": 651 |
|
}, |
|
{ |
|
"completion_length": 165.3055603504181, |
|
"epoch": 0.8666556236541328, |
|
"grad_norm": 0.6727854013442993, |
|
"kl": 0.9583333333333334, |
|
"learning_rate": 1.0544500476229713e-06, |
|
"loss": 0.0383, |
|
"reward": 1.2573785086472828, |
|
"reward_std": 0.22620403526040414, |
|
"rewards/accuracy_reward": 0.29427084035705775, |
|
"rewards/format_reward": 0.9631076554457346, |
|
"step": 654 |
|
}, |
|
{ |
|
"completion_length": 186.19965728123984, |
|
"epoch": 0.8706311081663077, |
|
"grad_norm": 0.6164532899856567, |
|
"kl": 1.0279134114583333, |
|
"learning_rate": 9.931853679171377e-07, |
|
"loss": 0.0411, |
|
"reward": 1.2439236516753833, |
|
"reward_std": 0.24075799800145128, |
|
"rewards/accuracy_reward": 0.28559028551292914, |
|
"rewards/format_reward": 0.9583333507180214, |
|
"step": 657 |
|
}, |
|
{ |
|
"completion_length": 177.0638066927592, |
|
"epoch": 0.8746065926784827, |
|
"grad_norm": 0.6313008666038513, |
|
"kl": 1.0465494791666667, |
|
"learning_rate": 9.336610802918044e-07, |
|
"loss": 0.0419, |
|
"reward": 1.2708333631356556, |
|
"reward_std": 0.20328321517445147, |
|
"rewards/accuracy_reward": 0.3051215368323028, |
|
"rewards/format_reward": 0.9657118245959282, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 178.04774816830954, |
|
"epoch": 0.8785820771906576, |
|
"grad_norm": 0.5517924427986145, |
|
"kl": 1.0804036458333333, |
|
"learning_rate": 8.758886866600258e-07, |
|
"loss": 0.0433, |
|
"reward": 1.3003472636143367, |
|
"reward_std": 0.20480242053357264, |
|
"rewards/accuracy_reward": 0.33203125970127684, |
|
"rewards/format_reward": 0.9683159875373045, |
|
"step": 663 |
|
}, |
|
{ |
|
"completion_length": 184.22309557596842, |
|
"epoch": 0.8825575617028325, |
|
"grad_norm": 1.6948155164718628, |
|
"kl": 0.9346516927083334, |
|
"learning_rate": 8.198793504153491e-07, |
|
"loss": 0.0374, |
|
"reward": 1.2834201827645302, |
|
"reward_std": 0.22442288471696278, |
|
"rewards/accuracy_reward": 0.31770834152121097, |
|
"rewards/format_reward": 0.9657118221124014, |
|
"step": 666 |
|
}, |
|
{ |
|
"completion_length": 163.08811235427856, |
|
"epoch": 0.8865330462150075, |
|
"grad_norm": 0.5778855085372925, |
|
"kl": 0.9193522135416666, |
|
"learning_rate": 7.656438942747057e-07, |
|
"loss": 0.0368, |
|
"reward": 1.27039934694767, |
|
"reward_std": 0.1949684239613513, |
|
"rewards/accuracy_reward": 0.2973090352024883, |
|
"rewards/format_reward": 0.9730902947485447, |
|
"step": 669 |
|
}, |
|
{ |
|
"completion_length": 190.82596119244894, |
|
"epoch": 0.8905085307271824, |
|
"grad_norm": 0.6843112111091614, |
|
"kl": 1.0071614583333333, |
|
"learning_rate": 7.131927981871345e-07, |
|
"loss": 0.0403, |
|
"reward": 1.2348090757926304, |
|
"reward_std": 0.22979943679335216, |
|
"rewards/accuracy_reward": 0.27213542349636555, |
|
"rewards/format_reward": 0.9626736293236414, |
|
"step": 672 |
|
}, |
|
{ |
|
"completion_length": 180.4761331876119, |
|
"epoch": 0.8944840152393573, |
|
"grad_norm": 1.2002581357955933, |
|
"kl": 0.9956868489583334, |
|
"learning_rate": 6.625361973087363e-07, |
|
"loss": 0.0398, |
|
"reward": 1.267361156642437, |
|
"reward_std": 0.20884954005790254, |
|
"rewards/accuracy_reward": 0.2999132029945031, |
|
"rewards/format_reward": 0.9674479303260645, |
|
"step": 675 |
|
}, |
|
{ |
|
"completion_length": 171.35373767217, |
|
"epoch": 0.8984594997515322, |
|
"grad_norm": 0.5270951390266418, |
|
"kl": 0.9773763020833334, |
|
"learning_rate": 6.136838800442457e-07, |
|
"loss": 0.0391, |
|
"reward": 1.2855903183420498, |
|
"reward_std": 0.19845290334584811, |
|
"rewards/accuracy_reward": 0.3168402863666415, |
|
"rewards/format_reward": 0.9687500186264515, |
|
"step": 678 |
|
}, |
|
{ |
|
"completion_length": 190.48004015286764, |
|
"epoch": 0.9024349842637072, |
|
"grad_norm": 0.8527917861938477, |
|
"kl": 0.9973958333333334, |
|
"learning_rate": 5.66645286155616e-07, |
|
"loss": 0.0399, |
|
"reward": 1.2916667014360428, |
|
"reward_std": 0.2291031815111637, |
|
"rewards/accuracy_reward": 0.32986112144620466, |
|
"rewards/format_reward": 0.9618055721124014, |
|
"step": 681 |
|
}, |
|
{ |
|
"completion_length": 189.20226113001505, |
|
"epoch": 0.906410468775882, |
|
"grad_norm": 9.596158981323242, |
|
"kl": 1.0517578125, |
|
"learning_rate": 5.214295049379658e-07, |
|
"loss": 0.0421, |
|
"reward": 1.2582465782761574, |
|
"reward_std": 0.22187859937548637, |
|
"rewards/accuracy_reward": 0.2938368134200573, |
|
"rewards/format_reward": 0.9644097400208315, |
|
"step": 684 |
|
}, |
|
{ |
|
"completion_length": 187.68099466959634, |
|
"epoch": 0.9103859532880569, |
|
"grad_norm": 0.6961022615432739, |
|
"kl": 0.9669596354166666, |
|
"learning_rate": 4.780452734632524e-07, |
|
"loss": 0.0387, |
|
"reward": 1.2760417039195697, |
|
"reward_std": 0.22566887092155716, |
|
"rewards/accuracy_reward": 0.31163195543922484, |
|
"rewards/format_reward": 0.9644097425043583, |
|
"step": 687 |
|
}, |
|
{ |
|
"completion_length": 181.89453570048013, |
|
"epoch": 0.9143614378002319, |
|
"grad_norm": 0.5587486028671265, |
|
"kl": 0.9386393229166666, |
|
"learning_rate": 4.3650097489200125e-07, |
|
"loss": 0.0376, |
|
"reward": 1.2834201777974765, |
|
"reward_std": 0.21305101970210671, |
|
"rewards/accuracy_reward": 0.3146701470638315, |
|
"rewards/format_reward": 0.9687500161429247, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 186.9974012374878, |
|
"epoch": 0.9183369223124068, |
|
"grad_norm": 0.603073000907898, |
|
"kl": 0.977783203125, |
|
"learning_rate": 3.9680463685342173e-07, |
|
"loss": 0.0391, |
|
"reward": 1.3268229588866234, |
|
"reward_std": 0.22527993516996503, |
|
"rewards/accuracy_reward": 0.36154514946974814, |
|
"rewards/format_reward": 0.9652777935067812, |
|
"step": 693 |
|
}, |
|
{ |
|
"completion_length": 199.9023496309916, |
|
"epoch": 0.9223124068245817, |
|
"grad_norm": 0.49448880553245544, |
|
"kl": 0.979736328125, |
|
"learning_rate": 3.589639298942238e-07, |
|
"loss": 0.0392, |
|
"reward": 1.276475730041663, |
|
"reward_std": 0.2337690940281997, |
|
"rewards/accuracy_reward": 0.3138020931510255, |
|
"rewards/format_reward": 0.9626736293236414, |
|
"step": 696 |
|
}, |
|
{ |
|
"completion_length": 179.73394536972046, |
|
"epoch": 0.9262878913367567, |
|
"grad_norm": 3.9789516925811768, |
|
"kl": 0.976318359375, |
|
"learning_rate": 3.2298616599643285e-07, |
|
"loss": 0.0391, |
|
"reward": 1.278211849431197, |
|
"reward_std": 0.1972268489189446, |
|
"rewards/accuracy_reward": 0.3103298700880259, |
|
"rewards/format_reward": 0.9678819626569748, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.927613052840815, |
|
"eval_completion_length": 198.22194461433256, |
|
"eval_kl": 1.0224011479591837, |
|
"eval_loss": 0.039721183478832245, |
|
"eval_reward": 1.2755102442235362, |
|
"eval_reward_std": 0.2398411301629884, |
|
"eval_rewards/accuracy_reward": 0.31972789886046427, |
|
"eval_rewards/format_reward": 0.9557823356317015, |
|
"eval_runtime": 434.6419, |
|
"eval_samples_per_second": 0.228, |
|
"eval_steps_per_second": 0.012, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 181.44922268390656, |
|
"epoch": 0.9302633758489316, |
|
"grad_norm": 0.6522932648658752, |
|
"kl": 0.9718017578125, |
|
"learning_rate": 2.8887829716449877e-07, |
|
"loss": 0.0401, |
|
"reward": 1.2643229570239782, |
|
"reward_std": 0.22105206700507551, |
|
"rewards/accuracy_reward": 0.30013021564809605, |
|
"rewards/format_reward": 0.9641927294433117, |
|
"step": 702 |
|
}, |
|
{ |
|
"completion_length": 193.3624184926351, |
|
"epoch": 0.9342388603611065, |
|
"grad_norm": 0.6112500429153442, |
|
"kl": 0.9839680989583334, |
|
"learning_rate": 2.5664691408194164e-07, |
|
"loss": 0.0394, |
|
"reward": 1.2582465621332328, |
|
"reward_std": 0.23564991471357644, |
|
"rewards/accuracy_reward": 0.2955729262127231, |
|
"rewards/format_reward": 0.9626736280818781, |
|
"step": 705 |
|
}, |
|
{ |
|
"completion_length": 220.39193407694498, |
|
"epoch": 0.9382143448732815, |
|
"grad_norm": 0.631359875202179, |
|
"kl": 1.0447591145833333, |
|
"learning_rate": 2.262982448378437e-07, |
|
"loss": 0.0418, |
|
"reward": 1.2782118432223797, |
|
"reward_std": 0.2571307167721291, |
|
"rewards/accuracy_reward": 0.32291667396202683, |
|
"rewards/format_reward": 0.9552951566874981, |
|
"step": 708 |
|
}, |
|
{ |
|
"completion_length": 208.5000058809916, |
|
"epoch": 0.9421898293854564, |
|
"grad_norm": 0.6516295075416565, |
|
"kl": 1.075927734375, |
|
"learning_rate": 1.9783815372338422e-07, |
|
"loss": 0.043, |
|
"reward": 1.2669271230697632, |
|
"reward_std": 0.26777021974946064, |
|
"rewards/accuracy_reward": 0.31597223101804656, |
|
"rewards/format_reward": 0.9509548805654049, |
|
"step": 711 |
|
}, |
|
{ |
|
"completion_length": 190.33203514417013, |
|
"epoch": 0.9461653138976313, |
|
"grad_norm": 0.8081660866737366, |
|
"kl": 0.9834798177083334, |
|
"learning_rate": 1.7127214009868387e-07, |
|
"loss": 0.0393, |
|
"reward": 1.3146701827645302, |
|
"reward_std": 0.22434815554879606, |
|
"rewards/accuracy_reward": 0.350694455128784, |
|
"rewards/format_reward": 0.9639757138987383, |
|
"step": 714 |
|
}, |
|
{ |
|
"completion_length": 191.1718815167745, |
|
"epoch": 0.9501407984098061, |
|
"grad_norm": 0.6136611700057983, |
|
"kl": 0.9658203125, |
|
"learning_rate": 1.4660533733015236e-07, |
|
"loss": 0.0386, |
|
"reward": 1.2361111529171467, |
|
"reward_std": 0.2575679953054835, |
|
"rewards/accuracy_reward": 0.2760416753590107, |
|
"rewards/format_reward": 0.9600694651405016, |
|
"step": 717 |
|
}, |
|
{ |
|
"completion_length": 199.0850751399994, |
|
"epoch": 0.9541162829219811, |
|
"grad_norm": 2.2742247581481934, |
|
"kl": 1.0128580729166667, |
|
"learning_rate": 1.2384251179857642e-07, |
|
"loss": 0.0405, |
|
"reward": 1.2573785086472828, |
|
"reward_std": 0.23815507961747548, |
|
"rewards/accuracy_reward": 0.2964409807464108, |
|
"rewards/format_reward": 0.9609375149011612, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 180.6779546737671, |
|
"epoch": 0.958091767434156, |
|
"grad_norm": 0.546220064163208, |
|
"kl": 0.9303385416666666, |
|
"learning_rate": 1.0298806197809985e-07, |
|
"loss": 0.0372, |
|
"reward": 1.2834201827645302, |
|
"reward_std": 0.23479281645268202, |
|
"rewards/accuracy_reward": 0.32031251102065045, |
|
"rewards/format_reward": 0.9631076616545519, |
|
"step": 723 |
|
}, |
|
{ |
|
"completion_length": 189.55859804153442, |
|
"epoch": 0.9620672519463309, |
|
"grad_norm": 0.5562774538993835, |
|
"kl": 0.964111328125, |
|
"learning_rate": 8.404601758630892e-08, |
|
"loss": 0.0386, |
|
"reward": 1.2556424004336197, |
|
"reward_std": 0.23037906123014787, |
|
"rewards/accuracy_reward": 0.2955729252814005, |
|
"rewards/format_reward": 0.9600694626569748, |
|
"step": 726 |
|
}, |
|
{ |
|
"completion_length": 202.31163819630942, |
|
"epoch": 0.9660427364585059, |
|
"grad_norm": 0.6967044472694397, |
|
"kl": 0.984130859375, |
|
"learning_rate": 6.702003880556418e-08, |
|
"loss": 0.0394, |
|
"reward": 1.2834201728304226, |
|
"reward_std": 0.23515649721957743, |
|
"rewards/accuracy_reward": 0.32508681404093903, |
|
"rewards/format_reward": 0.9583333507180214, |
|
"step": 729 |
|
}, |
|
{ |
|
"completion_length": 184.46702075004578, |
|
"epoch": 0.9700182209706808, |
|
"grad_norm": 1.1955063343048096, |
|
"kl": 1.0083821614583333, |
|
"learning_rate": 5.191341557574392e-08, |
|
"loss": 0.0403, |
|
"reward": 1.2309028195838134, |
|
"reward_std": 0.22288473897303143, |
|
"rewards/accuracy_reward": 0.2695312558983763, |
|
"rewards/format_reward": 0.961371548473835, |
|
"step": 732 |
|
}, |
|
{ |
|
"completion_length": 171.00391014417013, |
|
"epoch": 0.9739937054828557, |
|
"grad_norm": 3.6898951530456543, |
|
"kl": 0.980224609375, |
|
"learning_rate": 3.872906695852607e-08, |
|
"loss": 0.0392, |
|
"reward": 1.2894965633749962, |
|
"reward_std": 0.22550825821235776, |
|
"rewards/accuracy_reward": 0.3198784821821998, |
|
"rewards/format_reward": 0.969618077079455, |
|
"step": 735 |
|
}, |
|
{ |
|
"completion_length": 181.51215736071268, |
|
"epoch": 0.9779691899950307, |
|
"grad_norm": 0.8761662244796753, |
|
"kl": 0.9816080729166666, |
|
"learning_rate": 2.746954057333606e-08, |
|
"loss": 0.0393, |
|
"reward": 1.2582465695838134, |
|
"reward_std": 0.23283367223727205, |
|
"rewards/accuracy_reward": 0.2925347340060398, |
|
"rewards/format_reward": 0.9657118221124014, |
|
"step": 738 |
|
}, |
|
{ |
|
"completion_length": 166.68663636843362, |
|
"epoch": 0.9819446745072056, |
|
"grad_norm": 0.48325055837631226, |
|
"kl": 0.996337890625, |
|
"learning_rate": 1.8137012105069464e-08, |
|
"loss": 0.0398, |
|
"reward": 1.263454897950093, |
|
"reward_std": 0.20216705913965902, |
|
"rewards/accuracy_reward": 0.29383681442899007, |
|
"rewards/format_reward": 0.9696180733541647, |
|
"step": 741 |
|
}, |
|
{ |
|
"completion_length": 182.96528228123984, |
|
"epoch": 0.9859201590193805, |
|
"grad_norm": 0.55852872133255, |
|
"kl": 0.9632161458333334, |
|
"learning_rate": 1.0733284883682748e-08, |
|
"loss": 0.0385, |
|
"reward": 1.2773437909781933, |
|
"reward_std": 0.2304229981576403, |
|
"rewards/accuracy_reward": 0.31032986772091437, |
|
"rewards/format_reward": 0.9670139066874981, |
|
"step": 744 |
|
}, |
|
{ |
|
"completion_length": 200.19835631052652, |
|
"epoch": 0.9898956435315555, |
|
"grad_norm": 4.7552056312561035, |
|
"kl": 1.1190592447916667, |
|
"learning_rate": 5.25978953573536e-09, |
|
"loss": 0.0447, |
|
"reward": 1.2855903195838134, |
|
"reward_std": 0.2617647792988767, |
|
"rewards/accuracy_reward": 0.3268229243112728, |
|
"rewards/format_reward": 0.9587673805654049, |
|
"step": 747 |
|
}, |
|
{ |
|
"completion_length": 192.70313183466592, |
|
"epoch": 0.9938711280437303, |
|
"grad_norm": 0.49542155861854553, |
|
"kl": 0.9990234375, |
|
"learning_rate": 1.7175837079452806e-09, |
|
"loss": 0.04, |
|
"reward": 1.2573785160978634, |
|
"reward_std": 0.21775838693914315, |
|
"rewards/accuracy_reward": 0.2960069530721133, |
|
"rewards/format_reward": 0.9613715497155985, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 200.46571826934814, |
|
"epoch": 0.9978466125559052, |
|
"grad_norm": 0.8950777053833008, |
|
"kl": 1.0421549479166667, |
|
"learning_rate": 1.0735186282695431e-10, |
|
"loss": 0.0417, |
|
"reward": 1.2717014340062935, |
|
"reward_std": 0.2556659254866342, |
|
"rewards/accuracy_reward": 0.31206598148370784, |
|
"rewards/format_reward": 0.9596354328095913, |
|
"step": 753 |
|
}, |
|
{ |
|
"completion_length": 211.18359994888306, |
|
"epoch": 0.9991717740599636, |
|
"kl": 0.985107421875, |
|
"reward": 1.2708333656191826, |
|
"reward_std": 0.28282210882753134, |
|
"rewards/accuracy_reward": 0.31250000838190317, |
|
"rewards/format_reward": 0.9583333544433117, |
|
"step": 754, |
|
"total_flos": 0.0, |
|
"train_loss": 1.286716509427883, |
|
"train_runtime": 229250.8975, |
|
"train_samples_per_second": 0.316, |
|
"train_steps_per_second": 0.003 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 754, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|