{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995728321230244, "eval_steps": 500, "global_step": 585, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1680.4732666015625, "epoch": 0.0017086715079026058, "grad_norm": 0.11831104755401611, "kl": 0.0, "learning_rate": 1.6949152542372882e-08, "loss": 0.0501, "reward": 0.4218750149011612, "reward_std": 0.14868441969156265, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035895228386, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1731.7232666015625, "epoch": 0.0034173430158052115, "grad_norm": 0.16019196808338165, "kl": 0.0, "learning_rate": 3.3898305084745764e-08, "loss": 0.1068, "reward": 0.5379464477300644, "reward_std": 0.21157729253172874, "rewards/accuracy_reward": 0.15178572293370962, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1802.7232971191406, "epoch": 0.0051260145237078175, "grad_norm": 0.1373331993818283, "kl": 4.425644874572754e-05, "learning_rate": 5.0847457627118645e-08, "loss": 0.0667, "reward": 0.4620536044239998, "reward_std": 0.2063144575804472, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3459821566939354, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 1600.33935546875, "epoch": 0.006834686031610423, "grad_norm": 0.15266355872154236, "kl": 3.698468208312988e-05, "learning_rate": 6.779661016949153e-08, "loss": 0.0676, "reward": 0.6383928954601288, "reward_std": 0.22091378644108772, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424107164144516, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1716.4286499023438, "epoch": 0.00854335753951303, "grad_norm": 0.15427502989768982, "kl": 4.5359134674072266e-05, "learning_rate": 8.47457627118644e-08, "loss": 0.0795, "reward": 0.5357143059372902, "reward_std": 0.16237006522715092, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.383928582072258, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1744.1250915527344, "epoch": 0.010252029047415635, "grad_norm": 0.1304451823234558, "kl": 4.458427429199219e-05, "learning_rate": 1.0169491525423729e-07, "loss": 0.0534, "reward": 0.4486607313156128, "reward_std": 0.12905766069889069, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1744.7322387695312, "epoch": 0.01196070055531824, "grad_norm": 0.10000187903642654, "kl": 4.38690185546875e-05, "learning_rate": 1.1864406779661017e-07, "loss": 0.0376, "reward": 0.3750000223517418, "reward_std": 0.11052639409899712, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3392857238650322, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 1741.3215026855469, "epoch": 0.013669372063220846, "grad_norm": 0.13073387742042542, "kl": 4.6312808990478516e-05, "learning_rate": 1.3559322033898305e-07, "loss": 0.0676, "reward": 0.5200893059372902, "reward_std": 0.23169920407235622, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607387661934, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1673.40185546875, "epoch": 0.015378043571123452, "grad_norm": 0.1581849306821823, "kl": 4.00543212890625e-05, "learning_rate": 1.5254237288135593e-07, "loss": 0.0988, "reward": 0.4732143133878708, "reward_std": 0.17714741080999374, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392857164144516, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 1749.2232971191406, "epoch": 0.01708671507902606, "grad_norm": 0.1283339262008667, "kl": 5.453824996948242e-05, "learning_rate": 1.694915254237288e-07, "loss": 0.0748, "reward": 0.5111607313156128, "reward_std": 0.14422839507460594, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1747.3750915527344, "epoch": 0.018795386586928663, "grad_norm": 0.16728822886943817, "kl": 4.8726797103881836e-05, "learning_rate": 1.8644067796610168e-07, "loss": 0.0941, "reward": 0.4531250149011612, "reward_std": 0.20281513407826424, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3727678656578064, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1754.5893859863281, "epoch": 0.02050405809483127, "grad_norm": 0.16166667640209198, "kl": 3.555417060852051e-05, "learning_rate": 2.0338983050847458e-07, "loss": 0.0881, "reward": 0.4776785969734192, "reward_std": 0.19166521355509758, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794643059372902, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1640.3482971191406, "epoch": 0.022212729602733874, "grad_norm": 0.1343711018562317, "kl": 3.522634506225586e-05, "learning_rate": 2.2033898305084743e-07, "loss": 0.0477, "reward": 0.6049107313156128, "reward_std": 0.2026656735688448, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.399553582072258, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1763.8036804199219, "epoch": 0.02392140111063648, "grad_norm": 0.1508689671754837, "kl": 3.910064697265625e-05, "learning_rate": 2.3728813559322033e-07, "loss": 0.0751, "reward": 0.4308035895228386, "reward_std": 0.22263599559664726, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035895228386, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 1835.9375915527344, "epoch": 0.025630072618539085, "grad_norm": 0.13171635568141937, "kl": 3.8683414459228516e-05, "learning_rate": 2.542372881355932e-07, "loss": 0.0554, "reward": 0.4017857387661934, "reward_std": 0.1099984273314476, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3571428805589676, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1690.044677734375, "epoch": 0.027338744126441692, "grad_norm": 0.1276414841413498, "kl": 4.2051076889038086e-05, "learning_rate": 2.711864406779661e-07, "loss": 0.0796, "reward": 0.5334821715950966, "reward_std": 0.16428881138563156, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4174107238650322, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1769.6607971191406, "epoch": 0.029047415634344296, "grad_norm": 0.10887516289949417, "kl": 4.050135612487793e-05, "learning_rate": 2.88135593220339e-07, "loss": 0.0519, "reward": 0.5111607387661934, "reward_std": 0.16433580592274666, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035895228386, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1637.3215026855469, "epoch": 0.030756087142246903, "grad_norm": 0.12711912393569946, "kl": 4.0650367736816406e-05, "learning_rate": 3.0508474576271186e-07, "loss": 0.0489, "reward": 0.4665178805589676, "reward_std": 0.15946133434772491, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950892984867096, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 1723.1429748535156, "epoch": 0.03246475865014951, "grad_norm": 0.12875297665596008, "kl": 3.540515899658203e-05, "learning_rate": 3.220338983050847e-07, "loss": 0.0604, "reward": 0.4508928880095482, "reward_std": 0.14720085263252258, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794643059372902, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1680.1965026855469, "epoch": 0.03417343015805212, "grad_norm": 0.1310959905385971, "kl": 3.68952751159668e-05, "learning_rate": 3.389830508474576e-07, "loss": 0.0989, "reward": 0.5580357313156128, "reward_std": 0.24784812331199646, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3973214477300644, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1626.009033203125, "epoch": 0.03588210166595472, "grad_norm": 0.16018393635749817, "kl": 4.738569259643555e-05, "learning_rate": 3.559322033898305e-07, "loss": 0.066, "reward": 0.5803571492433548, "reward_std": 0.15897901356220245, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392857164144516, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 1593.5000915527344, "epoch": 0.037590773173857325, "grad_norm": 0.15119893848896027, "kl": 5.543231964111328e-05, "learning_rate": 3.7288135593220336e-07, "loss": 0.0765, "reward": 0.5223214477300644, "reward_std": 0.21863358840346336, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500149011612, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1646.8840026855469, "epoch": 0.03929944468175993, "grad_norm": 0.15664291381835938, "kl": 3.483891487121582e-05, "learning_rate": 3.898305084745763e-07, "loss": 0.0763, "reward": 0.5491071790456772, "reward_std": 0.18198650237172842, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.388392873108387, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 1667.8929138183594, "epoch": 0.04100811618966254, "grad_norm": 0.12416456639766693, "kl": 3.55839729309082e-05, "learning_rate": 4.0677966101694916e-07, "loss": 0.089, "reward": 0.5178571715950966, "reward_std": 0.16610532253980637, "rewards/accuracy_reward": 0.11607143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857313156128, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1605.5179748535156, "epoch": 0.042716787697565144, "grad_norm": 0.1491008698940277, "kl": 3.2514333724975586e-05, "learning_rate": 4.23728813559322e-07, "loss": 0.0498, "reward": 0.5892857164144516, "reward_std": 0.18751543015241623, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4107142984867096, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1533.8304138183594, "epoch": 0.04442545920546775, "grad_norm": 0.16634103655815125, "kl": 4.57763671875e-05, "learning_rate": 4.4067796610169486e-07, "loss": 0.1019, "reward": 0.5089285895228386, "reward_std": 0.18087475933134556, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4107143059372902, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 1811.8482666015625, "epoch": 0.04613413071337035, "grad_norm": 0.12147875875234604, "kl": 2.8073787689208984e-05, "learning_rate": 4.576271186440678e-07, "loss": 0.0685, "reward": 0.5558035969734192, "reward_std": 0.18344293721020222, "rewards/accuracy_reward": 0.19642857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3593750149011612, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1765.5983276367188, "epoch": 0.04784280222127296, "grad_norm": 0.1164206713438034, "kl": 2.5212764739990234e-05, "learning_rate": 4.7457627118644066e-07, "loss": 0.0733, "reward": 0.4308035895228386, "reward_std": 0.13239517901092768, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3593750149011612, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1597.2232666015625, "epoch": 0.049551473729175566, "grad_norm": 0.15887533128261566, "kl": 2.08243727684021e-05, "learning_rate": 4.915254237288136e-07, "loss": 0.0947, "reward": 0.5647321715950966, "reward_std": 0.1855197735130787, "rewards/accuracy_reward": 0.1517857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412946455180645, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 1631.8839721679688, "epoch": 0.05126014523707817, "grad_norm": 0.18144667148590088, "kl": 3.5434961318969727e-05, "learning_rate": 5.084745762711864e-07, "loss": 0.0949, "reward": 0.5089286118745804, "reward_std": 0.1441587945446372, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.428571455180645, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1645.02685546875, "epoch": 0.05296881674498078, "grad_norm": 0.13209061324596405, "kl": 1.7717480659484863e-05, "learning_rate": 5.254237288135593e-07, "loss": 0.0619, "reward": 0.553571455180645, "reward_std": 0.1325458474457264, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3928571566939354, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1683.5357666015625, "epoch": 0.054677488252883384, "grad_norm": 0.17384155094623566, "kl": 1.7598271369934082e-05, "learning_rate": 5.423728813559322e-07, "loss": 0.0696, "reward": 0.4553571566939354, "reward_std": 0.15251443721354008, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1811.9911499023438, "epoch": 0.05638615976078599, "grad_norm": 0.13401125371456146, "kl": 1.5020370483398438e-05, "learning_rate": 5.59322033898305e-07, "loss": 0.0639, "reward": 0.4375000223517418, "reward_std": 0.19593489915132523, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3482142984867096, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 1661.1697082519531, "epoch": 0.05809483126868859, "grad_norm": 0.14857830107212067, "kl": 1.0596588253974915e-05, "learning_rate": 5.76271186440678e-07, "loss": 0.0925, "reward": 0.611607164144516, "reward_std": 0.2090790867805481, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.397321455180645, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1557.77685546875, "epoch": 0.0598035027765912, "grad_norm": 0.17167609930038452, "kl": 1.4755874872207642e-05, "learning_rate": 5.932203389830508e-07, "loss": 0.0988, "reward": 0.6116071790456772, "reward_std": 0.19911614432930946, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424107164144516, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1823.0447387695312, "epoch": 0.061512174284493806, "grad_norm": 0.12531593441963196, "kl": 1.6145408153533936e-05, "learning_rate": 6.101694915254237e-07, "loss": 0.0516, "reward": 0.4888393059372902, "reward_std": 0.17774958908557892, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3638393059372902, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 1822.02685546875, "epoch": 0.06322084579239641, "grad_norm": 0.11463770270347595, "kl": 1.2965872883796692e-05, "learning_rate": 6.271186440677966e-07, "loss": 0.0502, "reward": 0.4084821492433548, "reward_std": 0.11839569173753262, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3459821492433548, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1570.7500305175781, "epoch": 0.06492951730029901, "grad_norm": 0.1291818618774414, "kl": 2.3663043975830078e-05, "learning_rate": 6.440677966101694e-07, "loss": 0.0396, "reward": 0.584821455180645, "reward_std": 0.20419742166996002, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4151785969734192, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1709.3482971191406, "epoch": 0.06663818880820162, "grad_norm": 0.1476876586675644, "kl": 3.0666589736938477e-05, "learning_rate": 6.610169491525423e-07, "loss": 0.0857, "reward": 0.5089285969734192, "reward_std": 0.14652280509471893, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3839285895228386, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 1489.6339721679688, "epoch": 0.06834686031610424, "grad_norm": 0.1492823213338852, "kl": 3.784894943237305e-05, "learning_rate": 6.779661016949152e-07, "loss": 0.0976, "reward": 0.6339286118745804, "reward_std": 0.2272382378578186, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285969734192, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1758.2947387695312, "epoch": 0.07005553182400684, "grad_norm": 0.10318886488676071, "kl": 4.7147274017333984e-05, "learning_rate": 6.949152542372881e-07, "loss": 0.0138, "reward": 0.3906250149011612, "reward_std": 0.1005205875262618, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3549107313156128, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1619.4911499023438, "epoch": 0.07176420333190944, "grad_norm": 0.15412960946559906, "kl": 8.857250213623047e-05, "learning_rate": 7.11864406779661e-07, "loss": 0.0702, "reward": 0.6183036118745804, "reward_std": 0.1132989153265953, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950892984867096, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 1645.1161499023438, "epoch": 0.07347287483981205, "grad_norm": 0.11579399555921555, "kl": 8.225440979003906e-05, "learning_rate": 7.288135593220338e-07, "loss": 0.0564, "reward": 0.5379464402794838, "reward_std": 0.1665024645626545, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404017873108387, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1692.5357666015625, "epoch": 0.07518154634771465, "grad_norm": 0.14809252321720123, "kl": 0.00012862682342529297, "learning_rate": 7.457627118644067e-07, "loss": 0.0834, "reward": 0.5513393133878708, "reward_std": 0.18062489107251167, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3816964402794838, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 1578.3036193847656, "epoch": 0.07689021785561725, "grad_norm": 0.18836835026741028, "kl": 0.00014793872833251953, "learning_rate": 7.627118644067796e-07, "loss": 0.1158, "reward": 0.508928582072258, "reward_std": 0.1665172502398491, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3928571566939354, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1631.4822082519531, "epoch": 0.07859888936351986, "grad_norm": 0.12439078092575073, "kl": 0.00015115737915039062, "learning_rate": 7.796610169491526e-07, "loss": 0.0581, "reward": 0.5089285969734192, "reward_std": 0.15179534256458282, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392857164144516, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 1684.5982971191406, "epoch": 0.08030756087142248, "grad_norm": 0.12262114882469177, "kl": 0.00019979476928710938, "learning_rate": 7.966101694915253e-07, "loss": 0.082, "reward": 0.5089285969734192, "reward_std": 0.22356541454792023, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1719.7054443359375, "epoch": 0.08201623237932508, "grad_norm": 0.17706398665905, "kl": 0.000370025634765625, "learning_rate": 8.135593220338983e-07, "loss": 0.0839, "reward": 0.5937500149011612, "reward_std": 0.15803945809602737, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794643059372902, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1723.65185546875, "epoch": 0.08372490388722768, "grad_norm": 0.12803256511688232, "kl": 0.00041294097900390625, "learning_rate": 8.305084745762712e-07, "loss": 0.0645, "reward": 0.5312500298023224, "reward_std": 0.10837463941425085, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.352678582072258, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 1821.4197082519531, "epoch": 0.08543357539513029, "grad_norm": 0.11561048775911331, "kl": 0.0006256103515625, "learning_rate": 8.47457627118644e-07, "loss": 0.0486, "reward": 0.4330357387661934, "reward_std": 0.13983718352392316, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3437500223517418, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1592.5804748535156, "epoch": 0.08714224690303289, "grad_norm": 0.13196177780628204, "kl": 0.0007009506225585938, "learning_rate": 8.64406779661017e-07, "loss": 0.0789, "reward": 0.5223214626312256, "reward_std": 0.15226908959448338, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4151785969734192, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1523.8572387695312, "epoch": 0.0888509184109355, "grad_norm": 0.13278700411319733, "kl": 0.0008640289306640625, "learning_rate": 8.813559322033897e-07, "loss": 0.0572, "reward": 0.6718750223517418, "reward_std": 0.18452133424580097, "rewards/accuracy_reward": 0.21428572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1793.71435546875, "epoch": 0.0905595899188381, "grad_norm": 0.1284395009279251, "kl": 0.0011205673217773438, "learning_rate": 8.983050847457627e-07, "loss": 0.0633, "reward": 0.4241071566939354, "reward_std": 0.13102841563522816, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.352678582072258, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1576.5268249511719, "epoch": 0.0922682614267407, "grad_norm": 0.12687018513679504, "kl": 0.0012102127075195312, "learning_rate": 9.152542372881356e-07, "loss": 0.0763, "reward": 0.5379464477300644, "reward_std": 0.22808022424578667, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.439732164144516, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1472.7679138183594, "epoch": 0.09397693293464332, "grad_norm": 0.16030772030353546, "kl": 0.001163482666015625, "learning_rate": 9.322033898305083e-07, "loss": 0.0774, "reward": 0.6741071790456772, "reward_std": 0.24146351031959057, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500298023224, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1625.5625915527344, "epoch": 0.09568560444254592, "grad_norm": 0.11340639740228653, "kl": 0.0014019012451171875, "learning_rate": 9.491525423728813e-07, "loss": 0.0334, "reward": 0.5089285969734192, "reward_std": 0.1404428742825985, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419642873108387, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1612.9464721679688, "epoch": 0.09739427595044853, "grad_norm": 0.13393424451351166, "kl": 0.00182342529296875, "learning_rate": 9.661016949152542e-07, "loss": 0.0419, "reward": 0.4732143059372902, "reward_std": 0.16740593686699867, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857387661934, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1583.96435546875, "epoch": 0.09910294745835113, "grad_norm": 0.14023074507713318, "kl": 0.00157928466796875, "learning_rate": 9.830508474576272e-07, "loss": 0.0726, "reward": 0.5379464477300644, "reward_std": 0.20214489102363586, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4308035895228386, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1512.9822082519531, "epoch": 0.10081161896625374, "grad_norm": 0.15368545055389404, "kl": 0.001575469970703125, "learning_rate": 1e-06, "loss": 0.0741, "reward": 0.7388393357396126, "reward_std": 0.21830238960683346, "rewards/accuracy_reward": 0.31250001955777407, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4263393059372902, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1681.6607971191406, "epoch": 0.10252029047415634, "grad_norm": 0.19045627117156982, "kl": 0.0029582977294921875, "learning_rate": 9.999919738091163e-07, "loss": 0.0848, "reward": 0.5022321566939354, "reward_std": 0.17235002480447292, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950892984867096, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1716.77685546875, "epoch": 0.10422896198205894, "grad_norm": 0.15595842897891998, "kl": 0.002857208251953125, "learning_rate": 9.999678955227748e-07, "loss": 0.0783, "reward": 0.3950893059372902, "reward_std": 0.11436966061592102, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035895228386, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1752.7411499023438, "epoch": 0.10593763348996156, "grad_norm": 0.17402169108390808, "kl": 0.0027828216552734375, "learning_rate": 9.999277659998957e-07, "loss": 0.0924, "reward": 0.430803582072258, "reward_std": 0.12815688271075487, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3593750149011612, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 1744.5715026855469, "epoch": 0.10764630499786416, "grad_norm": 0.154078409075737, "kl": 0.002315521240234375, "learning_rate": 9.99871586671977e-07, "loss": 0.0705, "reward": 0.4933035895228386, "reward_std": 0.17782744206488132, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.368303582072258, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1795.1250610351562, "epoch": 0.10935497650576677, "grad_norm": 0.10305806249380112, "kl": 0.002681732177734375, "learning_rate": 9.997993595430462e-07, "loss": 0.0696, "reward": 0.4196428805589676, "reward_std": 0.15877895802259445, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1732.9286499023438, "epoch": 0.11106364801366937, "grad_norm": 0.11790303140878677, "kl": 0.00273895263671875, "learning_rate": 9.99711087189586e-07, "loss": 0.0412, "reward": 0.558035746216774, "reward_std": 0.21614733710885048, "rewards/accuracy_reward": 0.17857143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794642984867096, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1705.5536499023438, "epoch": 0.11277231952157198, "grad_norm": 0.12586507201194763, "kl": 0.0033416748046875, "learning_rate": 9.996067727604443e-07, "loss": 0.0053, "reward": 0.4508928880095482, "reward_std": 0.1810305304825306, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3705357313156128, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1595.9107666015625, "epoch": 0.11448099102947458, "grad_norm": 0.1491793692111969, "kl": 0.004669189453125, "learning_rate": 9.994864199767212e-07, "loss": 0.0769, "reward": 0.4575893133878708, "reward_std": 0.13090776093304157, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750149011612, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1656.1607971191406, "epoch": 0.11618966253737718, "grad_norm": 0.16097894310951233, "kl": 0.005825042724609375, "learning_rate": 9.993500331316366e-07, "loss": 0.0568, "reward": 0.5803571715950966, "reward_std": 0.17094802390784025, "rewards/accuracy_reward": 0.17857143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857387661934, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1723.1607971191406, "epoch": 0.1178983340452798, "grad_norm": 0.15254752337932587, "kl": 0.0056915283203125, "learning_rate": 9.99197617090376e-07, "loss": 0.0756, "reward": 0.4575893059372902, "reward_std": 0.15283931978046894, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1645.6250610351562, "epoch": 0.1196070055531824, "grad_norm": 0.15007099509239197, "kl": 0.00524139404296875, "learning_rate": 9.990291772899186e-07, "loss": 0.063, "reward": 0.6607143133878708, "reward_std": 0.23921789415180683, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4107143059372902, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1801.5179138183594, "epoch": 0.12131567706108501, "grad_norm": 0.16796092689037323, "kl": 0.0065765380859375, "learning_rate": 9.98844719738842e-07, "loss": 0.0726, "reward": 0.4843750149011612, "reward_std": 0.15324074029922485, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035895228386, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1810.9822387695312, "epoch": 0.12302434856898761, "grad_norm": 0.14802227914333344, "kl": 0.0090789794921875, "learning_rate": 9.986442510171088e-07, "loss": 0.0818, "reward": 0.408482164144516, "reward_std": 0.12176126893609762, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3459821566939354, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1658.5714721679688, "epoch": 0.12473302007689022, "grad_norm": 0.16436883807182312, "kl": 0.00789642333984375, "learning_rate": 9.984277782758304e-07, "loss": 0.0863, "reward": 0.5625000223517418, "reward_std": 0.20358623191714287, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000223517418, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1783.1161499023438, "epoch": 0.12644169158479282, "grad_norm": 0.10990848392248154, "kl": 0.00762176513671875, "learning_rate": 9.981953092370145e-07, "loss": 0.0681, "reward": 0.3816964477300644, "reward_std": 0.10674701724201441, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3638393059372902, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1865.3840026855469, "epoch": 0.12815036309269542, "grad_norm": 0.15377569198608398, "kl": 0.009033203125, "learning_rate": 9.979468521932867e-07, "loss": 0.037, "reward": 0.3973214402794838, "reward_std": 0.18342850916087627, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3616071566939354, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1681.8215026855469, "epoch": 0.12985903460059803, "grad_norm": 0.18511129915714264, "kl": 0.009857177734375, "learning_rate": 9.976824160075968e-07, "loss": 0.1195, "reward": 0.4620535895228386, "reward_std": 0.22398092597723007, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3816964477300644, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 1770.83935546875, "epoch": 0.13156770610850063, "grad_norm": 0.14749263226985931, "kl": 0.009735107421875, "learning_rate": 9.974020101129015e-07, "loss": 0.0731, "reward": 0.4174107313156128, "reward_std": 0.15235848724842072, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3638392984867096, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1628.2232971191406, "epoch": 0.13327637761640324, "grad_norm": 0.1713767647743225, "kl": 0.00905609130859375, "learning_rate": 9.97105644511829e-07, "loss": 0.0669, "reward": 0.6718750447034836, "reward_std": 0.2134380042552948, "rewards/accuracy_reward": 0.25000001676380634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750223517418, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1754.8125915527344, "epoch": 0.13498504912430584, "grad_norm": 0.10342177748680115, "kl": 0.008514404296875, "learning_rate": 9.967933297763203e-07, "loss": 0.0224, "reward": 0.517857164144516, "reward_std": 0.19758236780762672, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857387661934, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 1616.5804138183594, "epoch": 0.13669372063220847, "grad_norm": 0.15395300090312958, "kl": 0.0104217529296875, "learning_rate": 9.964650770472549e-07, "loss": 0.0526, "reward": 0.5691964626312256, "reward_std": 0.2391242254525423, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4263393133878708, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1762.3840026855469, "epoch": 0.13840239214011107, "grad_norm": 0.11707251518964767, "kl": 0.010223388671875, "learning_rate": 9.961208980340497e-07, "loss": 0.0392, "reward": 0.5267857313156128, "reward_std": 0.21197225153446198, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1746.2411499023438, "epoch": 0.14011106364801368, "grad_norm": 0.13182953000068665, "kl": 0.01006317138671875, "learning_rate": 9.957608050142455e-07, "loss": 0.0472, "reward": 0.4330357387661934, "reward_std": 0.1161644896492362, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794642984867096, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1564.7054138183594, "epoch": 0.14181973515591628, "grad_norm": 0.15057554841041565, "kl": 0.0110015869140625, "learning_rate": 9.953848108330654e-07, "loss": 0.0717, "reward": 0.6227678954601288, "reward_std": 0.2399737611413002, "rewards/accuracy_reward": 0.17857143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.444196455180645, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1657.52685546875, "epoch": 0.14352840666381889, "grad_norm": 0.12103632837533951, "kl": 0.0106964111328125, "learning_rate": 9.94992928902959e-07, "loss": 0.045, "reward": 0.4977678880095482, "reward_std": 0.11338848993182182, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3995535969734192, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1600.8840026855469, "epoch": 0.1452370781717215, "grad_norm": 0.1864004135131836, "kl": 0.012237548828125, "learning_rate": 9.94585173203122e-07, "loss": 0.0986, "reward": 0.5312500223517418, "reward_std": 0.1860370710492134, "rewards/accuracy_reward": 0.08928571548312902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419643059372902, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1626.6875610351562, "epoch": 0.1469457496796241, "grad_norm": 0.19480495154857635, "kl": 0.0110931396484375, "learning_rate": 9.941615582789998e-07, "loss": 0.0685, "reward": 0.6093750223517418, "reward_std": 0.2709679566323757, "rewards/accuracy_reward": 0.20535715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4040178805589676, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1719.4107971191406, "epoch": 0.1486544211875267, "grad_norm": 0.15734519064426422, "kl": 0.0118255615234375, "learning_rate": 9.937220992417666e-07, "loss": 0.1003, "reward": 0.464285746216774, "reward_std": 0.19154664129018784, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000223517418, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1780.8215026855469, "epoch": 0.1503630926954293, "grad_norm": 0.1262011080980301, "kl": 0.0131378173828125, "learning_rate": 9.932668117677872e-07, "loss": 0.0552, "reward": 0.4375000149011612, "reward_std": 0.10049869306385517, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3839285895228386, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1675.4822082519531, "epoch": 0.1520717642033319, "grad_norm": 0.20487579703330994, "kl": 0.015106201171875, "learning_rate": 9.927957120980582e-07, "loss": 0.1049, "reward": 0.5133928805589676, "reward_std": 0.20257512107491493, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4151785895228386, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1726.7947387695312, "epoch": 0.1537804357112345, "grad_norm": 0.13307704031467438, "kl": 0.0146636962890625, "learning_rate": 9.923088170376279e-07, "loss": 0.0626, "reward": 0.5290178805589676, "reward_std": 0.15771132707595825, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4129464477300644, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1593.6429138183594, "epoch": 0.1554891072191371, "grad_norm": 0.13380111753940582, "kl": 0.0141448974609375, "learning_rate": 9.91806143954997e-07, "loss": 0.0429, "reward": 0.5312500298023224, "reward_std": 0.1236056424677372, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419642984867096, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1623.8483276367188, "epoch": 0.15719777872703972, "grad_norm": 0.16255508363246918, "kl": 0.015289306640625, "learning_rate": 9.912877107814993e-07, "loss": 0.0673, "reward": 0.5491071715950966, "reward_std": 0.16942394897341728, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424107164144516, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1776.7947387695312, "epoch": 0.15890645023494232, "grad_norm": 0.12829580903053284, "kl": 0.0136871337890625, "learning_rate": 9.907535360106624e-07, "loss": 0.0492, "reward": 0.4799107387661934, "reward_std": 0.14293671399354935, "rewards/accuracy_reward": 0.10714286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372767873108387, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 1731.5090026855469, "epoch": 0.16061512174284495, "grad_norm": 0.1763499230146408, "kl": 0.01629638671875, "learning_rate": 9.902036386975466e-07, "loss": 0.0568, "reward": 0.5156250223517418, "reward_std": 0.1870399061590433, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3995535969734192, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1673.6786499023438, "epoch": 0.16232379325074756, "grad_norm": 0.17149698734283447, "kl": 0.01824951171875, "learning_rate": 9.896380384580667e-07, "loss": 0.0682, "reward": 0.5446428880095482, "reward_std": 0.17154706455767155, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857387661934, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1757.8661499023438, "epoch": 0.16403246475865016, "grad_norm": 0.12477970868349075, "kl": 0.02117919921875, "learning_rate": 9.890567554682914e-07, "loss": 0.0545, "reward": 0.5000000223517418, "reward_std": 0.1711373906582594, "rewards/accuracy_reward": 0.12500000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000223517418, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1836.7500915527344, "epoch": 0.16574113626655276, "grad_norm": 0.09104233235120773, "kl": 0.019775390625, "learning_rate": 9.884598104637243e-07, "loss": 0.0194, "reward": 0.4732143133878708, "reward_std": 0.07368332706391811, "rewards/accuracy_reward": 0.14285715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.330357164144516, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1629.6964721679688, "epoch": 0.16744980777445537, "grad_norm": 0.21247686445713043, "kl": 0.023834228515625, "learning_rate": 9.878472247385635e-07, "loss": 0.0559, "reward": 0.5892857313156128, "reward_std": 0.15296770446002483, "rewards/accuracy_reward": 0.18750001210719347, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857238650322, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1776.2768859863281, "epoch": 0.16915847928235797, "grad_norm": 0.13919728994369507, "kl": 0.020294189453125, "learning_rate": 9.872190201449415e-07, "loss": 0.056, "reward": 0.5178571715950966, "reward_std": 0.16671263054013252, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857313156128, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1591.71435546875, "epoch": 0.17086715079026057, "grad_norm": 0.20742930471897125, "kl": 0.01904296875, "learning_rate": 9.865752190921479e-07, "loss": 0.0768, "reward": 0.683035746216774, "reward_std": 0.14633493684232235, "rewards/accuracy_reward": 0.2589285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4241071566939354, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1738.9732971191406, "epoch": 0.17257582229816318, "grad_norm": 0.16575846076011658, "kl": 0.020904541015625, "learning_rate": 9.85915844545827e-07, "loss": 0.058, "reward": 0.479910746216774, "reward_std": 0.17550636641681194, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372767873108387, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1688.3482666015625, "epoch": 0.17428449380606578, "grad_norm": 0.1455792486667633, "kl": 0.025543212890625, "learning_rate": 9.852409200271615e-07, "loss": 0.0544, "reward": 0.4397321715950966, "reward_std": 0.11569055169820786, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607387661934, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1706.3482971191406, "epoch": 0.17599316531396839, "grad_norm": 0.17037266492843628, "kl": 0.0244140625, "learning_rate": 9.845504696120314e-07, "loss": 0.0764, "reward": 0.4330357313156128, "reward_std": 0.10342254769057035, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3705357238650322, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1839.9286499023438, "epoch": 0.177701836821871, "grad_norm": 0.21385829150676727, "kl": 0.0284423828125, "learning_rate": 9.838445179301555e-07, "loss": 0.0786, "reward": 0.5803571790456772, "reward_std": 0.17464595660567284, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1591.2411499023438, "epoch": 0.1794105083297736, "grad_norm": 0.1647193282842636, "kl": 0.024505615234375, "learning_rate": 9.831230901642143e-07, "loss": 0.0835, "reward": 0.5758928805589676, "reward_std": 0.1880602464079857, "rewards/accuracy_reward": 0.14285715203732252, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357387661934, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1888.0000915527344, "epoch": 0.1811191798376762, "grad_norm": 0.16050036251544952, "kl": 0.030792236328125, "learning_rate": 9.82386212048949e-07, "loss": 0.058, "reward": 0.4665178805589676, "reward_std": 0.2006453424692154, "rewards/accuracy_reward": 0.11607143189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3504464402794838, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1792.9197082519531, "epoch": 0.1828278513455788, "grad_norm": 0.15584886074066162, "kl": 0.02740478515625, "learning_rate": 9.816339098702467e-07, "loss": 0.0577, "reward": 0.4799107387661934, "reward_std": 0.1447084043174982, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4084821566939354, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1734.1072387695312, "epoch": 0.1845365228534814, "grad_norm": 0.14531783759593964, "kl": 0.028900146484375, "learning_rate": 9.808662104641995e-07, "loss": 0.0262, "reward": 0.537946455180645, "reward_std": 0.17246798984706402, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4040178805589676, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 1776.7590026855469, "epoch": 0.18624519436138404, "grad_norm": 0.17478659749031067, "kl": 0.028472900390625, "learning_rate": 9.800831412161502e-07, "loss": 0.0647, "reward": 0.5156250298023224, "reward_std": 0.19426362216472626, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3995535895228386, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 1751.9286499023438, "epoch": 0.18795386586928664, "grad_norm": 0.1425745040178299, "kl": 0.035400390625, "learning_rate": 9.792847300597128e-07, "loss": 0.032, "reward": 0.439732164144516, "reward_std": 0.16179021634161472, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 1810.0179138183594, "epoch": 0.18966253737718924, "grad_norm": 0.20884282886981964, "kl": 0.0361328125, "learning_rate": 9.78471005475778e-07, "loss": 0.0837, "reward": 0.4263392984867096, "reward_std": 0.18841860443353653, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3816964402794838, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1715.9108276367188, "epoch": 0.19137120888509185, "grad_norm": 0.2955910861492157, "kl": 0.031768798828125, "learning_rate": 9.776419964914958e-07, "loss": 0.0902, "reward": 0.5401785969734192, "reward_std": 0.19905314967036247, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4151785969734192, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1791.8125915527344, "epoch": 0.19307988039299445, "grad_norm": 0.24942556023597717, "kl": 0.03802490234375, "learning_rate": 9.767977326792414e-07, "loss": 0.1074, "reward": 0.4464285895228386, "reward_std": 0.16373559087514877, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3660714477300644, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 1730.3036499023438, "epoch": 0.19478855190089706, "grad_norm": 0.16386914253234863, "kl": 0.0382080078125, "learning_rate": 9.759382441555594e-07, "loss": 0.0546, "reward": 0.5468750298023224, "reward_std": 0.1978953517973423, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404017873108387, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 1518.52685546875, "epoch": 0.19649722340879966, "grad_norm": 0.3123798966407776, "kl": 0.03887939453125, "learning_rate": 9.750635615800893e-07, "loss": 0.1049, "reward": 0.524553582072258, "reward_std": 0.1920180767774582, "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4263392984867096, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 1590.7411193847656, "epoch": 0.19820589491670226, "grad_norm": 0.21765924990177155, "kl": 0.04296875, "learning_rate": 9.741737161544728e-07, "loss": 0.046, "reward": 0.497767873108387, "reward_std": 0.1302202045917511, "rewards/accuracy_reward": 0.08035714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4174107313156128, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1684.509033203125, "epoch": 0.19991456642460487, "grad_norm": 0.2124960571527481, "kl": 0.04144287109375, "learning_rate": 9.732687396212399e-07, "loss": 0.0395, "reward": 0.5647321790456772, "reward_std": 0.20065089128911495, "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4129464402794838, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1673.8215026855469, "epoch": 0.20162323793250747, "grad_norm": 0.23278361558914185, "kl": 0.048828125, "learning_rate": 9.723486642626763e-07, "loss": 0.0717, "reward": 0.5111607313156128, "reward_std": 0.18970635905861855, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4308035895228386, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1723.7411499023438, "epoch": 0.20333190944041007, "grad_norm": 0.15765896439552307, "kl": 0.048583984375, "learning_rate": 9.714135228996734e-07, "loss": 0.0286, "reward": 0.4196428805589676, "reward_std": 0.12831192277371883, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000223517418, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 1681.8750610351562, "epoch": 0.20504058094831268, "grad_norm": 0.3843046724796295, "kl": 0.056884765625, "learning_rate": 9.704633488905557e-07, "loss": 0.0995, "reward": 0.5736607536673546, "reward_std": 0.21387874148786068, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4040178805589676, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 1741.6339721679688, "epoch": 0.20674925245621528, "grad_norm": 0.3731752634048462, "kl": 0.0584716796875, "learning_rate": 9.694981761298924e-07, "loss": 0.0912, "reward": 0.5401786118745804, "reward_std": 0.14469599723815918, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3883928805589676, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1557.3036499023438, "epoch": 0.20845792396411789, "grad_norm": 0.44584032893180847, "kl": 0.05181884765625, "learning_rate": 9.685180390472867e-07, "loss": 0.0957, "reward": 0.6651785969734192, "reward_std": 0.29333463311195374, "rewards/accuracy_reward": 0.20535715576261282, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4598214402794838, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 1632.9911804199219, "epoch": 0.21016659547202052, "grad_norm": 0.40921562910079956, "kl": 0.05938720703125, "learning_rate": 9.67522972606149e-07, "loss": 0.0547, "reward": 0.5825893133878708, "reward_std": 0.1475483775138855, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607387661934, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1560.4375610351562, "epoch": 0.21187526697992312, "grad_norm": 0.3308427631855011, "kl": 0.07098388671875, "learning_rate": 9.665130123024492e-07, "loss": 0.0296, "reward": 0.575892873108387, "reward_std": 0.21556157618761063, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4508928805589676, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1624.1697387695312, "epoch": 0.21358393848782573, "grad_norm": 0.5423309803009033, "kl": 0.0882568359375, "learning_rate": 9.654881941634501e-07, "loss": 0.1431, "reward": 0.6026785969734192, "reward_std": 0.26951388269662857, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424107164144516, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1681.5982666015625, "epoch": 0.21529260999572833, "grad_norm": 0.33591747283935547, "kl": 0.0965576171875, "learning_rate": 9.64448554746423e-07, "loss": 0.0655, "reward": 0.5558036118745804, "reward_std": 0.15189602691680193, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750223517418, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 1393.3125762939453, "epoch": 0.21700128150363093, "grad_norm": 0.6810412406921387, "kl": 0.07489013671875, "learning_rate": 9.633941311373432e-07, "loss": 0.1217, "reward": 0.7187500447034836, "reward_std": 0.2488148845732212, "rewards/accuracy_reward": 0.19642857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.522321455180645, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 1626.9643859863281, "epoch": 0.21870995301153354, "grad_norm": 0.5725372433662415, "kl": 0.1165771484375, "learning_rate": 9.623249609495667e-07, "loss": 0.0852, "reward": 0.595982164144516, "reward_std": 0.2693430446088314, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1487.9197387695312, "epoch": 0.22041862451943614, "grad_norm": 0.6992425918579102, "kl": 0.1109619140625, "learning_rate": 9.612410823224893e-07, "loss": 0.0924, "reward": 0.6428571790456772, "reward_std": 0.21655242145061493, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4910714477300644, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1470.919692993164, "epoch": 0.22212729602733874, "grad_norm": 1.288134217262268, "kl": 0.161376953125, "learning_rate": 9.601425339201852e-07, "loss": 0.1423, "reward": 0.5982143133878708, "reward_std": 0.24282339215278625, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143208384514, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1511.7947082519531, "epoch": 0.22383596753524135, "grad_norm": 0.9988163709640503, "kl": 0.1650390625, "learning_rate": 9.590293549300289e-07, "loss": 0.1548, "reward": 0.5736607611179352, "reward_std": 0.23862973973155022, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4397321715950966, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1582.3482971191406, "epoch": 0.22554463904314395, "grad_norm": 0.8792645335197449, "kl": 0.1982421875, "learning_rate": 9.579015850612959e-07, "loss": 0.0774, "reward": 0.5200893059372902, "reward_std": 0.23121628165245056, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607387661934, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 1595.5179443359375, "epoch": 0.22725331055104656, "grad_norm": 1.3966670036315918, "kl": 0.301025390625, "learning_rate": 9.567592645437474e-07, "loss": 0.1415, "reward": 0.6250000298023224, "reward_std": 0.2399446852505207, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 1760.3572387695312, "epoch": 0.22896198205894916, "grad_norm": 0.8892687559127808, "kl": 0.34619140625, "learning_rate": 9.556024341261947e-07, "loss": 0.1058, "reward": 0.5468750298023224, "reward_std": 0.17602956667542458, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4129464477300644, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 1574.1072082519531, "epoch": 0.23067065356685176, "grad_norm": 0.9226676821708679, "kl": 0.34716796875, "learning_rate": 9.544311350750453e-07, "loss": 0.0887, "reward": 0.7053571790456772, "reward_std": 0.2427867203950882, "rewards/accuracy_reward": 0.22321429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1716.46435546875, "epoch": 0.23237932507475437, "grad_norm": 0.8753883242607117, "kl": 0.41357421875, "learning_rate": 9.532454091728318e-07, "loss": 0.0963, "reward": 0.5223214626312256, "reward_std": 0.2653765007853508, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4241071566939354, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 1634.321533203125, "epoch": 0.23408799658265697, "grad_norm": 1.6251349449157715, "kl": 0.45361328125, "learning_rate": 9.520452987167204e-07, "loss": 0.1309, "reward": 0.6026785969734192, "reward_std": 0.24593664333224297, "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459821455180645, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 1684.7143859863281, "epoch": 0.2357966680905596, "grad_norm": 2.103081464767456, "kl": 0.44677734375, "learning_rate": 9.508308465170026e-07, "loss": 0.101, "reward": 0.631696455180645, "reward_std": 0.26234467700123787, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620536044239998, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 1768.4286499023438, "epoch": 0.2375053395984622, "grad_norm": 1.3002150058746338, "kl": 0.6005859375, "learning_rate": 9.496020958955682e-07, "loss": 0.1158, "reward": 0.4732143133878708, "reward_std": 0.23482651263475418, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857313156128, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 1539.919677734375, "epoch": 0.2392140111063648, "grad_norm": 1.6156166791915894, "kl": 0.6650390625, "learning_rate": 9.483590906843596e-07, "loss": 0.1659, "reward": 0.6294642984867096, "reward_std": 0.2907675765454769, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071715950966, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1635.4911804199219, "epoch": 0.2409226826142674, "grad_norm": 2.858745813369751, "kl": 0.7568359375, "learning_rate": 9.471018752238084e-07, "loss": 0.1121, "reward": 0.5535714700818062, "reward_std": 0.22640549764037132, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.446428582072258, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1442.9375610351562, "epoch": 0.24263135412217002, "grad_norm": 2.4326553344726562, "kl": 0.8125, "learning_rate": 9.458304943612532e-07, "loss": 0.1639, "reward": 0.5580357313156128, "reward_std": 0.23855087906122208, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5133928805589676, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 1653.7500305175781, "epoch": 0.24434002563007262, "grad_norm": 2.160229206085205, "kl": 0.94140625, "learning_rate": 9.445449934493409e-07, "loss": 0.1225, "reward": 0.5714286044239998, "reward_std": 0.25177959725260735, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1494.6072082519531, "epoch": 0.24604869713797523, "grad_norm": 1.9926283359527588, "kl": 0.9755859375, "learning_rate": 9.43245418344408e-07, "loss": 0.1748, "reward": 0.6406250447034836, "reward_std": 0.2861219272017479, "rewards/accuracy_reward": 0.14285715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678880095482, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1618.6072082519531, "epoch": 0.24775736864587783, "grad_norm": 2.532705068588257, "kl": 1.1259765625, "learning_rate": 9.419318154048447e-07, "loss": 0.161, "reward": 0.573660746216774, "reward_std": 0.23833118006587029, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035969734192, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1585.6072082519531, "epoch": 0.24946604015378043, "grad_norm": 2.1685805320739746, "kl": 1.1943359375, "learning_rate": 9.406042314894421e-07, "loss": 0.1058, "reward": 0.5803571790456772, "reward_std": 0.2993314899504185, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732142984867096, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 1514.1250610351562, "epoch": 0.25117471166168304, "grad_norm": 2.366929292678833, "kl": 1.228515625, "learning_rate": 9.392627139557199e-07, "loss": 0.157, "reward": 0.6138392984867096, "reward_std": 0.2633180692791939, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5245535969734192, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1513.6072082519531, "epoch": 0.25288338316958564, "grad_norm": 1.8784329891204834, "kl": 1.314453125, "learning_rate": 9.37907310658237e-07, "loss": 0.1318, "reward": 0.6629464477300644, "reward_std": 0.28271085768938065, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607313156128, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1461.5625610351562, "epoch": 0.25459205467748824, "grad_norm": 1.8616801500320435, "kl": 1.330078125, "learning_rate": 9.36538069946885e-07, "loss": 0.1138, "reward": 0.6450893133878708, "reward_std": 0.2977392449975014, "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5468750149011612, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 1429.0357666015625, "epoch": 0.25630072618539085, "grad_norm": 1.596628189086914, "kl": 1.0166015625, "learning_rate": 9.35155040665163e-07, "loss": 0.1159, "reward": 0.6852678805589676, "reward_std": 0.25977613776922226, "rewards/accuracy_reward": 0.15178572479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.533482164144516, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1564.2322082519531, "epoch": 0.25800939769329345, "grad_norm": 1.5286195278167725, "kl": 1.08984375, "learning_rate": 9.337582721484356e-07, "loss": 0.0953, "reward": 0.6294643133878708, "reward_std": 0.2726179510354996, "rewards/accuracy_reward": 0.14285714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071715950966, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 1508.21435546875, "epoch": 0.25971806920119606, "grad_norm": 1.4923723936080933, "kl": 1.16796875, "learning_rate": 9.323478142221728e-07, "loss": 0.1406, "reward": 0.6964286118745804, "reward_std": 0.34178658574819565, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5446428805589676, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 1583.3215026855469, "epoch": 0.26142674070909866, "grad_norm": 1.287610650062561, "kl": 1.091796875, "learning_rate": 9.309237172001724e-07, "loss": 0.1139, "reward": 0.660714328289032, "reward_std": 0.31564628705382347, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714626312256, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 1431.0625610351562, "epoch": 0.26313541221700126, "grad_norm": 1.469887375831604, "kl": 0.765625, "learning_rate": 9.294860318827659e-07, "loss": 0.1387, "reward": 0.6986607611179352, "reward_std": 0.34512969851493835, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750298023224, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 1355.5804138183594, "epoch": 0.26484408372490387, "grad_norm": 1.505653738975525, "kl": 0.67578125, "learning_rate": 9.280348095550053e-07, "loss": 0.1208, "reward": 0.6941964626312256, "reward_std": 0.24940932914614677, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1238.8839721679688, "epoch": 0.26655275523280647, "grad_norm": 1.7104984521865845, "kl": 0.841796875, "learning_rate": 9.265701019848353e-07, "loss": 0.1576, "reward": 0.8437500447034836, "reward_std": 0.31489480286836624, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6116071790456772, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 1434.401870727539, "epoch": 0.2682614267407091, "grad_norm": 1.196596622467041, "kl": 0.7568359375, "learning_rate": 9.250919614212443e-07, "loss": 0.0663, "reward": 0.7031250298023224, "reward_std": 0.3127906769514084, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6049107313156128, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1294.482177734375, "epoch": 0.2699700982486117, "grad_norm": 1.0969858169555664, "kl": 0.6787109375, "learning_rate": 9.236004405924031e-07, "loss": 0.1095, "reward": 0.7254464626312256, "reward_std": 0.3098319172859192, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.618303582072258, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 1446.8661499023438, "epoch": 0.27167876975651434, "grad_norm": 1.4693799018859863, "kl": 0.80126953125, "learning_rate": 9.220955927037822e-07, "loss": 0.1758, "reward": 0.7343750298023224, "reward_std": 0.3354809433221817, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464477300644, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 1232.5804138183594, "epoch": 0.27338744126441694, "grad_norm": 0.9941864013671875, "kl": 0.6474609375, "learning_rate": 9.205774714362543e-07, "loss": 0.0969, "reward": 0.8415178954601288, "reward_std": 0.28928979858756065, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178805589676, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1253.732177734375, "epoch": 0.27509611277231955, "grad_norm": 2.137742042541504, "kl": 0.5986328125, "learning_rate": 9.1904613094418e-07, "loss": 0.1661, "reward": 0.7767857313156128, "reward_std": 0.2339324727654457, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 1355.2857666015625, "epoch": 0.27680478428022215, "grad_norm": 1.8357288837432861, "kl": 0.8447265625, "learning_rate": 9.175016258534749e-07, "loss": 0.1178, "reward": 0.7455357313156128, "reward_std": 0.2638407461345196, "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 1296.5447082519531, "epoch": 0.27851345578812475, "grad_norm": 1.1587196588516235, "kl": 0.859375, "learning_rate": 9.159440112596625e-07, "loss": 0.1698, "reward": 0.7745535969734192, "reward_std": 0.3002671115100384, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6495535969734192, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 1304.2857971191406, "epoch": 0.28022212729602736, "grad_norm": 1.0852662324905396, "kl": 0.56982421875, "learning_rate": 9.143733427259071e-07, "loss": 0.1305, "reward": 0.7812500447034836, "reward_std": 0.33041248843073845, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500447034836, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 1493.8215026855469, "epoch": 0.28193079880392996, "grad_norm": 2.7339744567871094, "kl": 0.740234375, "learning_rate": 9.127896762810332e-07, "loss": 0.1317, "reward": 0.776785746216774, "reward_std": 0.2861787863075733, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.651785746216774, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1377.27685546875, "epoch": 0.28363947031183256, "grad_norm": 1.0761616230010986, "kl": 0.7197265625, "learning_rate": 9.111930684175262e-07, "loss": 0.1432, "reward": 0.7633928954601288, "reward_std": 0.2873268537223339, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214477300644, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1222.4107666015625, "epoch": 0.28534814181973517, "grad_norm": 1.4208155870437622, "kl": 0.633544921875, "learning_rate": 9.095835760895166e-07, "loss": 0.1316, "reward": 0.9598214775323868, "reward_std": 0.3381858095526695, "rewards/accuracy_reward": 0.2857142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6741071790456772, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 1253.3572082519531, "epoch": 0.28705681332763777, "grad_norm": 1.9054979085922241, "kl": 0.5556640625, "learning_rate": 9.079612567107501e-07, "loss": 0.1161, "reward": 0.8035714626312256, "reward_std": 0.2851314917206764, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6964286118745804, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1458.8840026855469, "epoch": 0.2887654848355404, "grad_norm": 1.9743632078170776, "kl": 0.77880859375, "learning_rate": 9.063261681525375e-07, "loss": 0.1987, "reward": 0.7343750298023224, "reward_std": 0.29755445942282677, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464775323868, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 1296.08935546875, "epoch": 0.290474156343443, "grad_norm": 1.2607367038726807, "kl": 0.63427734375, "learning_rate": 9.046783687416917e-07, "loss": 0.1333, "reward": 0.8080357611179352, "reward_std": 0.2957063056528568, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.683035746216774, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 1238.3482666015625, "epoch": 0.2921828278513456, "grad_norm": 1.343139886856079, "kl": 0.59130859375, "learning_rate": 9.030179172584464e-07, "loss": 0.1317, "reward": 0.8102678805589676, "reward_std": 0.26404349878430367, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7120535969734192, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1178.589370727539, "epoch": 0.2938914993592482, "grad_norm": 2.861290693283081, "kl": 1.140625, "learning_rate": 9.013448729343599e-07, "loss": 0.1667, "reward": 0.8281250298023224, "reward_std": 0.3496924415230751, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6763393133878708, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1184.6785888671875, "epoch": 0.2956001708671508, "grad_norm": 1.2848232984542847, "kl": 0.63720703125, "learning_rate": 8.996592954502014e-07, "loss": 0.1259, "reward": 0.9933036267757416, "reward_std": 0.32481422647833824, "rewards/accuracy_reward": 0.258928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750447034836, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1456.33935546875, "epoch": 0.2973088423750534, "grad_norm": 3.458548069000244, "kl": 1.3076171875, "learning_rate": 8.979612449338225e-07, "loss": 0.1153, "reward": 0.7321428805589676, "reward_std": 0.2682997025549412, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6160714477300644, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1272.3661193847656, "epoch": 0.299017513882956, "grad_norm": 3.7309749126434326, "kl": 1.0078125, "learning_rate": 8.962507819580127e-07, "loss": 0.2118, "reward": 0.8348214775323868, "reward_std": 0.37767282873392105, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214477300644, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 1378.2679138183594, "epoch": 0.3007261853908586, "grad_norm": 0.8642393350601196, "kl": 0.6875, "learning_rate": 8.945279675383377e-07, "loss": 0.1662, "reward": 0.9397321790456772, "reward_std": 0.33435316383838654, "rewards/accuracy_reward": 0.2678571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750149011612, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 1378.7411193847656, "epoch": 0.3024348568987612, "grad_norm": 1.0882568359375, "kl": 1.1181640625, "learning_rate": 8.927928631309638e-07, "loss": 0.2056, "reward": 0.85714291036129, "reward_std": 0.2683042958378792, "rewards/accuracy_reward": 0.22321429941803217, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.633928582072258, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1327.1607666015625, "epoch": 0.3041435284066638, "grad_norm": 1.6407157182693481, "kl": 1.0888671875, "learning_rate": 8.91045530630465e-07, "loss": 0.2361, "reward": 0.8950893431901932, "reward_std": 0.3372327871620655, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.645089328289032, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1280.0447387695312, "epoch": 0.3058521999145664, "grad_norm": 1.904410719871521, "kl": 0.9287109375, "learning_rate": 8.892860323676156e-07, "loss": 0.2361, "reward": 0.7812500298023224, "reward_std": 0.3141815960407257, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928805589676, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1397.5714721679688, "epoch": 0.307560871422469, "grad_norm": 1.41862952709198, "kl": 0.92578125, "learning_rate": 8.875144311071663e-07, "loss": 0.2092, "reward": 0.723214328289032, "reward_std": 0.2747877798974514, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000298023224, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1192.419677734375, "epoch": 0.3092695429303716, "grad_norm": 1.840302586555481, "kl": 1.0166015625, "learning_rate": 8.857307900456055e-07, "loss": 0.1052, "reward": 0.8102678805589676, "reward_std": 0.25420933216810226, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 1320.4107666015625, "epoch": 0.3109782144382742, "grad_norm": 1.2133535146713257, "kl": 1.48828125, "learning_rate": 8.839351728089048e-07, "loss": 0.2326, "reward": 0.7500000298023224, "reward_std": 0.3192509189248085, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580357164144516, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1250.3661193847656, "epoch": 0.31268688594617683, "grad_norm": 1.0826963186264038, "kl": 1.01025390625, "learning_rate": 8.821276434502498e-07, "loss": 0.1175, "reward": 0.8526786267757416, "reward_std": 0.36335889250040054, "rewards/accuracy_reward": 0.16964286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6830357313156128, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1330.2500610351562, "epoch": 0.31439555745407943, "grad_norm": 2.739576816558838, "kl": 1.234375, "learning_rate": 8.80308266447754e-07, "loss": 0.168, "reward": 0.7723214626312256, "reward_std": 0.35239071026444435, "rewards/accuracy_reward": 0.16964286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1311.7411499023438, "epoch": 0.31610422896198204, "grad_norm": 1.8622268438339233, "kl": 1.2431640625, "learning_rate": 8.784771067021608e-07, "loss": 0.1784, "reward": 0.8794643431901932, "reward_std": 0.24756912142038345, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214775323868, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1315.2054138183594, "epoch": 0.31781290046988464, "grad_norm": 1.588362216949463, "kl": 1.0576171875, "learning_rate": 8.766342295345259e-07, "loss": 0.135, "reward": 0.7745535969734192, "reward_std": 0.3398048058152199, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1288.33935546875, "epoch": 0.31952157197778724, "grad_norm": 1.7115285396575928, "kl": 1.0302734375, "learning_rate": 8.747797006838892e-07, "loss": 0.1558, "reward": 0.7812500298023224, "reward_std": 0.32417355477809906, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1306.5089721679688, "epoch": 0.3212302434856899, "grad_norm": 1.1582908630371094, "kl": 0.828125, "learning_rate": 8.729135863049288e-07, "loss": 0.1334, "reward": 0.7544643133878708, "reward_std": 0.26360753923654556, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6205357611179352, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1420.8929138183594, "epoch": 0.3229389149935925, "grad_norm": 1.1876899003982544, "kl": 0.6728515625, "learning_rate": 8.710359529656008e-07, "loss": 0.1661, "reward": 0.7834821790456772, "reward_std": 0.28750549629330635, "rewards/accuracy_reward": 0.16071429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6227678805589676, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 1453.7322082519531, "epoch": 0.3246475865014951, "grad_norm": 1.9170705080032349, "kl": 0.67041015625, "learning_rate": 8.691468676447661e-07, "loss": 0.1276, "reward": 0.6540178805589676, "reward_std": 0.32378406822681427, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1408.3125610351562, "epoch": 0.3263562580093977, "grad_norm": 1.4841084480285645, "kl": 0.62548828125, "learning_rate": 8.672463977297995e-07, "loss": 0.1254, "reward": 0.707589328289032, "reward_std": 0.3010448217391968, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6450893133878708, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1526.259033203125, "epoch": 0.3280649295173003, "grad_norm": 1.4510630369186401, "kl": 0.7001953125, "learning_rate": 8.653346110141868e-07, "loss": 0.151, "reward": 0.6540178805589676, "reward_std": 0.25370531901717186, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750149011612, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1426.9911193847656, "epoch": 0.3297736010252029, "grad_norm": 1.007460355758667, "kl": 0.689453125, "learning_rate": 8.634115756951063e-07, "loss": 0.1509, "reward": 0.7589285969734192, "reward_std": 0.3458084911108017, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339286118745804, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1321.3215026855469, "epoch": 0.3314822725331055, "grad_norm": 1.083178162574768, "kl": 0.603515625, "learning_rate": 8.614773603709959e-07, "loss": 0.123, "reward": 0.8772321790456772, "reward_std": 0.32047489657998085, "rewards/accuracy_reward": 0.232142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6450893133878708, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1430.6250610351562, "epoch": 0.33319094404100813, "grad_norm": 0.8163322806358337, "kl": 0.8642578125, "learning_rate": 8.595320340391066e-07, "loss": 0.1358, "reward": 0.6919643133878708, "reward_std": 0.2659658417105675, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1513.4732971191406, "epoch": 0.33489961554891073, "grad_norm": 0.9237807393074036, "kl": 0.96875, "learning_rate": 8.5757566609304e-07, "loss": 0.1455, "reward": 0.7656250447034836, "reward_std": 0.2854275666177273, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250149011612, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1217.0625305175781, "epoch": 0.33660828705681334, "grad_norm": 1.2529582977294922, "kl": 0.7890625, "learning_rate": 8.556083263202744e-07, "loss": 0.1919, "reward": 0.8214286118745804, "reward_std": 0.3162773437798023, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714626312256, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1207.482177734375, "epoch": 0.33831695856471594, "grad_norm": 0.8351501822471619, "kl": 0.8935546875, "learning_rate": 8.536300848996748e-07, "loss": 0.1228, "reward": 0.7343750447034836, "reward_std": 0.2935408167541027, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464626312256, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1398.8125610351562, "epoch": 0.34002563007261855, "grad_norm": 1.395736813545227, "kl": 0.9052734375, "learning_rate": 8.516410123989886e-07, "loss": 0.1435, "reward": 0.7656250447034836, "reward_std": 0.2357596606016159, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6495535969734192, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 1222.4732666015625, "epoch": 0.34173430158052115, "grad_norm": 1.8048124313354492, "kl": 0.830078125, "learning_rate": 8.496411797723295e-07, "loss": 0.0897, "reward": 0.799107164144516, "reward_std": 0.23418355733156204, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214626312256, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1183.5000610351562, "epoch": 0.34344297308842375, "grad_norm": 1.197396159172058, "kl": 0.7353515625, "learning_rate": 8.476306583576461e-07, "loss": 0.0775, "reward": 0.8281250447034836, "reward_std": 0.2444763220846653, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250298023224, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1294.8928833007812, "epoch": 0.34515164459632636, "grad_norm": 0.5945436954498291, "kl": 0.70703125, "learning_rate": 8.456095198741768e-07, "loss": 0.1117, "reward": 0.7723214626312256, "reward_std": 0.2531391903758049, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6741071790456772, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1331.6161193847656, "epoch": 0.34686031610422896, "grad_norm": 0.5211927890777588, "kl": 0.56787109375, "learning_rate": 8.435778364198914e-07, "loss": 0.1295, "reward": 0.8816964775323868, "reward_std": 0.27834611758589745, "rewards/accuracy_reward": 0.19642857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678805589676, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1237.15185546875, "epoch": 0.34856898761213156, "grad_norm": 1.4769175052642822, "kl": 0.4580078125, "learning_rate": 8.415356804689202e-07, "loss": 0.1059, "reward": 0.9598214626312256, "reward_std": 0.22813929989933968, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7276786118745804, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1270.3304138183594, "epoch": 0.35027765912003417, "grad_norm": 1.397891640663147, "kl": 0.564453125, "learning_rate": 8.394831248689673e-07, "loss": 0.1072, "reward": 0.81026791036129, "reward_std": 0.3266693353652954, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678954601288, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1353.9822082519531, "epoch": 0.35198633062793677, "grad_norm": 0.6328490972518921, "kl": 0.52734375, "learning_rate": 8.374202428387129e-07, "loss": 0.0949, "reward": 0.95089291036129, "reward_std": 0.3237159363925457, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7008928954601288, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 1257.232177734375, "epoch": 0.3536950021358394, "grad_norm": 2.0763943195343018, "kl": 0.47705078125, "learning_rate": 8.353471079652013e-07, "loss": 0.1507, "reward": 0.8660714775323868, "reward_std": 0.31462325155735016, "rewards/accuracy_reward": 0.16964286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6964286118745804, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1341.6875610351562, "epoch": 0.355403673643742, "grad_norm": 0.8509356379508972, "kl": 0.533203125, "learning_rate": 8.332637942012157e-07, "loss": 0.125, "reward": 0.7299107313156128, "reward_std": 0.26654981821775436, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964626312256, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 1181.794677734375, "epoch": 0.3571123451516446, "grad_norm": 0.8915902376174927, "kl": 0.564453125, "learning_rate": 8.311703758626403e-07, "loss": 0.1191, "reward": 0.8125000298023224, "reward_std": 0.30438848212361336, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7321428954601288, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 1158.7589416503906, "epoch": 0.3588210166595472, "grad_norm": 0.8463822603225708, "kl": 0.669921875, "learning_rate": 8.29066927625809e-07, "loss": 0.11, "reward": 0.8348214775323868, "reward_std": 0.2866206653416157, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214626312256, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1144.9286193847656, "epoch": 0.3605296881674498, "grad_norm": 1.168274998664856, "kl": 0.7470703125, "learning_rate": 8.269535245248425e-07, "loss": 0.1189, "reward": 0.8303571790456772, "reward_std": 0.28710902854800224, "rewards/accuracy_reward": 0.11607143189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7142857611179352, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1251.4286193847656, "epoch": 0.3622383596753524, "grad_norm": 0.8057065606117249, "kl": 0.87890625, "learning_rate": 8.248302419489703e-07, "loss": 0.1432, "reward": 0.8058036118745804, "reward_std": 0.22419050708413124, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750447034836, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1220.1875610351562, "epoch": 0.363947031183255, "grad_norm": 0.9034870862960815, "kl": 0.8095703125, "learning_rate": 8.226971556398425e-07, "loss": 0.1044, "reward": 0.8035714626312256, "reward_std": 0.2373671606183052, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6964285969734192, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 1245.1607666015625, "epoch": 0.3656557026911576, "grad_norm": 1.746027946472168, "kl": 0.9990234375, "learning_rate": 8.205543416888273e-07, "loss": 0.1084, "reward": 0.8616071790456772, "reward_std": 0.33074841648340225, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6741071790456772, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1156.4822082519531, "epoch": 0.3673643741990602, "grad_norm": 3.06565523147583, "kl": 1.0439453125, "learning_rate": 8.184018765342975e-07, "loss": 0.1303, "reward": 0.8683035969734192, "reward_std": 0.32450249791145325, "rewards/accuracy_reward": 0.20535715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464626312256, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1316.8482971191406, "epoch": 0.3690730457069628, "grad_norm": 2.2442991733551025, "kl": 1.220703125, "learning_rate": 8.162398369589026e-07, "loss": 0.1187, "reward": 0.8191964626312256, "reward_std": 0.30199623480439186, "rewards/accuracy_reward": 0.20535715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138392984867096, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1313.0357971191406, "epoch": 0.37078171721486547, "grad_norm": 1.187595009803772, "kl": 0.9765625, "learning_rate": 8.140683000868309e-07, "loss": 0.1646, "reward": 0.7901785969734192, "reward_std": 0.2929707393050194, "rewards/accuracy_reward": 0.1607142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294642984867096, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 1290.0893249511719, "epoch": 0.3724903887227681, "grad_norm": 1.1727043390274048, "kl": 1.0322265625, "learning_rate": 8.118873433810577e-07, "loss": 0.1181, "reward": 0.7879464775323868, "reward_std": 0.3262900374829769, "rewards/accuracy_reward": 0.13392857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178805589676, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1402.6072387695312, "epoch": 0.3741990602306707, "grad_norm": 1.195185899734497, "kl": 1.01171875, "learning_rate": 8.09697044640582e-07, "loss": 0.189, "reward": 0.689732164144516, "reward_std": 0.2741188146173954, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464477300644, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 1447.0536193847656, "epoch": 0.3759077317385733, "grad_norm": 1.0180710554122925, "kl": 0.9111328125, "learning_rate": 8.074974819976522e-07, "loss": 0.0723, "reward": 0.7433035969734192, "reward_std": 0.3359968736767769, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1363.4732666015625, "epoch": 0.3776164032464759, "grad_norm": 1.1887321472167969, "kl": 0.95703125, "learning_rate": 8.052887339149773e-07, "loss": 0.1389, "reward": 0.7901786118745804, "reward_std": 0.3207033947110176, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500149011612, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1489.5893249511719, "epoch": 0.3793250747543785, "grad_norm": 0.8815401196479797, "kl": 0.9560546875, "learning_rate": 8.030708791829297e-07, "loss": 0.1366, "reward": 0.636160746216774, "reward_std": 0.28052425384521484, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178954601288, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 1354.1339721679688, "epoch": 0.3810337462622811, "grad_norm": 0.8264697790145874, "kl": 0.7900390625, "learning_rate": 8.008439969167334e-07, "loss": 0.1055, "reward": 0.7901786118745804, "reward_std": 0.3359355330467224, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500298023224, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1370.8214721679688, "epoch": 0.3827424177701837, "grad_norm": 1.1989609003067017, "kl": 0.802734375, "learning_rate": 7.986081665536426e-07, "loss": 0.1054, "reward": 0.707589328289032, "reward_std": 0.30412501469254494, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750298023224, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1529.8393249511719, "epoch": 0.3844510892780863, "grad_norm": 0.7080864310264587, "kl": 0.8740234375, "learning_rate": 7.963634678501071e-07, "loss": 0.1308, "reward": 0.7589285969734192, "reward_std": 0.3299409933388233, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000298023224, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1349.6250610351562, "epoch": 0.3861597607859889, "grad_norm": 1.1407538652420044, "kl": 0.8134765625, "learning_rate": 7.941099808789283e-07, "loss": 0.1384, "reward": 0.8102678805589676, "reward_std": 0.34228233993053436, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138393133878708, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 1343.4554443359375, "epoch": 0.3878684322938915, "grad_norm": 1.7054117918014526, "kl": 0.814453125, "learning_rate": 7.918477860264021e-07, "loss": 0.1682, "reward": 0.8482143431901932, "reward_std": 0.34862805902957916, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000447034836, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1352.9375610351562, "epoch": 0.3895771038017941, "grad_norm": 0.9958900213241577, "kl": 0.7861328125, "learning_rate": 7.895769639894516e-07, "loss": 0.109, "reward": 0.7477678954601288, "reward_std": 0.2599702551960945, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250298023224, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 1429.7678833007812, "epoch": 0.3912857753096967, "grad_norm": 0.9736700057983398, "kl": 0.8759765625, "learning_rate": 7.872975957727486e-07, "loss": 0.097, "reward": 0.7254464626312256, "reward_std": 0.3137795031070709, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 1422.3304138183594, "epoch": 0.3929944468175993, "grad_norm": 1.1940455436706543, "kl": 0.841796875, "learning_rate": 7.850097626858236e-07, "loss": 0.1718, "reward": 0.7611607611179352, "reward_std": 0.3111877329647541, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1335.5715026855469, "epoch": 0.3947031183255019, "grad_norm": 1.2862074375152588, "kl": 0.8134765625, "learning_rate": 7.827135463401658e-07, "loss": 0.1724, "reward": 0.7812500447034836, "reward_std": 0.3847400024533272, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214477300644, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 1447.7232971191406, "epoch": 0.3964117898334045, "grad_norm": 0.9515857100486755, "kl": 0.8212890625, "learning_rate": 7.80409028646312e-07, "loss": 0.1347, "reward": 0.7031250447034836, "reward_std": 0.29914386570453644, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138392984867096, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1436.7500610351562, "epoch": 0.39812046134130713, "grad_norm": 1.15450119972229, "kl": 0.970703125, "learning_rate": 7.780962918109235e-07, "loss": 0.1547, "reward": 0.7433035969734192, "reward_std": 0.34430645406246185, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5468750149011612, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1455.4911499023438, "epoch": 0.39982913284920973, "grad_norm": 2.7830755710601807, "kl": 0.7861328125, "learning_rate": 7.757754183338552e-07, "loss": 0.2141, "reward": 0.6875000298023224, "reward_std": 0.3030503839254379, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580357164144516, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1389.9197082519531, "epoch": 0.40153780435711234, "grad_norm": 1.2082629203796387, "kl": 0.8837890625, "learning_rate": 7.734464910052119e-07, "loss": 0.1708, "reward": 0.6428571790456772, "reward_std": 0.3243940845131874, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571492433548, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1490.6250610351562, "epoch": 0.40324647586501494, "grad_norm": 1.1319290399551392, "kl": 0.9560546875, "learning_rate": 7.711095929023947e-07, "loss": 0.1471, "reward": 0.604910746216774, "reward_std": 0.2105468399822712, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964477300644, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1458.2322082519531, "epoch": 0.40495514737291755, "grad_norm": 1.3172343969345093, "kl": 1.0068359375, "learning_rate": 7.687648073871379e-07, "loss": 0.1623, "reward": 0.7187500298023224, "reward_std": 0.246288500726223, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.575892873108387, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1481.96435546875, "epoch": 0.40666381888082015, "grad_norm": 0.976262629032135, "kl": 0.8623046875, "learning_rate": 7.664122181025356e-07, "loss": 0.133, "reward": 0.6517857611179352, "reward_std": 0.3340938650071621, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1277.4107666015625, "epoch": 0.40837249038872275, "grad_norm": 2.829563856124878, "kl": 0.8349609375, "learning_rate": 7.640519089700578e-07, "loss": 0.1103, "reward": 0.8660714775323868, "reward_std": 0.3478277176618576, "rewards/accuracy_reward": 0.23214287031441927, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 1377.1697082519531, "epoch": 0.41008116189662536, "grad_norm": 0.9941349625587463, "kl": 0.822265625, "learning_rate": 7.616839641865556e-07, "loss": 0.1194, "reward": 0.6406250149011612, "reward_std": 0.2593759186565876, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138393133878708, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 1387.2054138183594, "epoch": 0.41178983340452796, "grad_norm": 4.587230682373047, "kl": 1.041015625, "learning_rate": 7.593084682212597e-07, "loss": 0.172, "reward": 0.7633928805589676, "reward_std": 0.297294307500124, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 1453.6786499023438, "epoch": 0.41349850491243056, "grad_norm": 0.7582125663757324, "kl": 0.8935546875, "learning_rate": 7.569255058127659e-07, "loss": 0.1822, "reward": 0.658482164144516, "reward_std": 0.3083745837211609, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 1419.0358276367188, "epoch": 0.41520717642033317, "grad_norm": 1.351284146308899, "kl": 0.984375, "learning_rate": 7.545351619660126e-07, "loss": 0.1411, "reward": 0.7120535969734192, "reward_std": 0.3146362043917179, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250149011612, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 1503.1875915527344, "epoch": 0.41691584792823577, "grad_norm": 1.0964982509613037, "kl": 1.0234375, "learning_rate": 7.521375219492489e-07, "loss": 0.2201, "reward": 0.683035746216774, "reward_std": 0.31017132103443146, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1421.1340026855469, "epoch": 0.4186245194361384, "grad_norm": 1.5779714584350586, "kl": 0.955078125, "learning_rate": 7.49732671290992e-07, "loss": 0.1256, "reward": 0.6852678805589676, "reward_std": 0.2986368052661419, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870535969734192, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 1375.6697082519531, "epoch": 0.42033319094404104, "grad_norm": 1.4974365234375, "kl": 1.185546875, "learning_rate": 7.473206957769773e-07, "loss": 0.2117, "reward": 0.7098214477300644, "reward_std": 0.33180297911167145, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 1223.8393249511719, "epoch": 0.42204186245194364, "grad_norm": 0.9476463198661804, "kl": 1.1005859375, "learning_rate": 7.449016814470976e-07, "loss": 0.1473, "reward": 0.7209821790456772, "reward_std": 0.2768738903105259, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.587053582072258, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1332.2054443359375, "epoch": 0.42375053395984624, "grad_norm": 2.410874128341675, "kl": 1.236328125, "learning_rate": 7.424757145923341e-07, "loss": 0.1367, "reward": 0.6428571715950966, "reward_std": 0.2907528653740883, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571715950966, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 1450.1429138183594, "epoch": 0.42545920546774885, "grad_norm": 1.5963125228881836, "kl": 1.169921875, "learning_rate": 7.400428817516782e-07, "loss": 0.1397, "reward": 0.6584821790456772, "reward_std": 0.30701371282339096, "rewards/accuracy_reward": 0.10714286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513393133878708, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 1197.4375610351562, "epoch": 0.42716787697565145, "grad_norm": 1.4690433740615845, "kl": 1.0693359375, "learning_rate": 7.376032697090443e-07, "loss": 0.2144, "reward": 0.7209821790456772, "reward_std": 0.3110496662557125, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1337.7322082519531, "epoch": 0.42887654848355405, "grad_norm": 1.146095633506775, "kl": 0.9951171875, "learning_rate": 7.351569654901744e-07, "loss": 0.2028, "reward": 0.7723214626312256, "reward_std": 0.31498534232378006, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214477300644, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 1401.2590026855469, "epoch": 0.43058521999145666, "grad_norm": 1.8497591018676758, "kl": 1.27734375, "learning_rate": 7.327040563595335e-07, "loss": 0.1589, "reward": 0.696428582072258, "reward_std": 0.3371073454618454, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000149011612, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 1247.6964721679688, "epoch": 0.43229389149935926, "grad_norm": 1.1931570768356323, "kl": 1.126953125, "learning_rate": 7.302446298171965e-07, "loss": 0.2135, "reward": 0.7723214626312256, "reward_std": 0.3378186672925949, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 1289.5625305175781, "epoch": 0.43400256300726187, "grad_norm": 1.3028424978256226, "kl": 1.1845703125, "learning_rate": 7.277787735957276e-07, "loss": 0.1594, "reward": 0.7723214626312256, "reward_std": 0.367490291595459, "rewards/accuracy_reward": 0.21428572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.558035746216774, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 1324.2054138183594, "epoch": 0.43571123451516447, "grad_norm": 0.960293173789978, "kl": 0.8359375, "learning_rate": 7.253065756570502e-07, "loss": 0.1264, "reward": 0.7321428954601288, "reward_std": 0.25103695690631866, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000447034836, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 1335.71435546875, "epoch": 0.4374199060230671, "grad_norm": 0.8919795751571655, "kl": 1.0546875, "learning_rate": 7.228281241893087e-07, "loss": 0.1042, "reward": 0.6696428954601288, "reward_std": 0.2767292708158493, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1245.7857666015625, "epoch": 0.4391285775309697, "grad_norm": 1.3694312572479248, "kl": 1.064453125, "learning_rate": 7.203435076037235e-07, "loss": 0.1801, "reward": 0.720982164144516, "reward_std": 0.28131017833948135, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6227678656578064, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1264.3303985595703, "epoch": 0.4408372490388723, "grad_norm": 2.955960273742676, "kl": 0.8662109375, "learning_rate": 7.178528145314367e-07, "loss": 0.2324, "reward": 0.8415178954601288, "reward_std": 0.27330131083726883, "rewards/accuracy_reward": 0.21428572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.627232164144516, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 1302.2768249511719, "epoch": 0.4425459205467749, "grad_norm": 1.0514429807662964, "kl": 1.19140625, "learning_rate": 7.15356133820351e-07, "loss": 0.2296, "reward": 0.7075892984867096, "reward_std": 0.3070714548230171, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 1239.9464721679688, "epoch": 0.4442545920546775, "grad_norm": 1.6768121719360352, "kl": 1.1826171875, "learning_rate": 7.128535545319592e-07, "loss": 0.1841, "reward": 0.7946428805589676, "reward_std": 0.3091660812497139, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6071428805589676, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 1171.0893249511719, "epoch": 0.4459632635625801, "grad_norm": 1.0441120862960815, "kl": 0.8798828125, "learning_rate": 7.103451659381684e-07, "loss": 0.102, "reward": 0.7031250298023224, "reward_std": 0.3036840856075287, "rewards/accuracy_reward": 0.08928572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138392984867096, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1254.08935546875, "epoch": 0.4476719350704827, "grad_norm": 1.6376787424087524, "kl": 1.248046875, "learning_rate": 7.078310575181149e-07, "loss": 0.1052, "reward": 0.6495535969734192, "reward_std": 0.3499393127858639, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 1319.9375915527344, "epoch": 0.4493806065783853, "grad_norm": 1.5446854829788208, "kl": 1.0537109375, "learning_rate": 7.053113189549724e-07, "loss": 0.1441, "reward": 0.6584821790456772, "reward_std": 0.29416774958372116, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1195.9465026855469, "epoch": 0.4510892780862879, "grad_norm": 1.9209314584732056, "kl": 0.8935546875, "learning_rate": 7.027860401327528e-07, "loss": 0.1347, "reward": 0.7812500298023224, "reward_std": 0.27084343135356903, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6205357313156128, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 1119.3750610351562, "epoch": 0.4527979495941905, "grad_norm": 1.508484959602356, "kl": 0.94921875, "learning_rate": 7.002553111331e-07, "loss": 0.1332, "reward": 0.6785714626312256, "reward_std": 0.323038961738348, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000149011612, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1030.5178833007812, "epoch": 0.4545066211020931, "grad_norm": 2.3404104709625244, "kl": 1.185546875, "learning_rate": 6.977192222320763e-07, "loss": 0.1584, "reward": 0.7991071939468384, "reward_std": 0.3482673019170761, "rewards/accuracy_reward": 0.16071428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928954601288, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1179.40185546875, "epoch": 0.4562152926099957, "grad_norm": 1.2713619470596313, "kl": 1.25390625, "learning_rate": 6.951778638969421e-07, "loss": 0.1538, "reward": 0.6785714626312256, "reward_std": 0.2634088769555092, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6160714626312256, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1215.8572082519531, "epoch": 0.4579239641178983, "grad_norm": 3.370774507522583, "kl": 1.513671875, "learning_rate": 6.926313267829293e-07, "loss": 0.1989, "reward": 0.6629464626312256, "reward_std": 0.2876349240541458, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 1123.6161193847656, "epoch": 0.4596326356258009, "grad_norm": 2.086580514907837, "kl": 1.234375, "learning_rate": 6.900797017300062e-07, "loss": 0.1519, "reward": 0.8549107611179352, "reward_std": 0.30461129546165466, "rewards/accuracy_reward": 0.21428572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250298023224, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1222.2679138183594, "epoch": 0.4613413071337035, "grad_norm": 1.3789944648742676, "kl": 1.44140625, "learning_rate": 6.875230797596388e-07, "loss": 0.2121, "reward": 0.7678571790456772, "reward_std": 0.36743323504924774, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5892857313156128, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1272.2947082519531, "epoch": 0.46304997864160613, "grad_norm": 2.355010509490967, "kl": 1.63671875, "learning_rate": 6.84961552071542e-07, "loss": 0.1476, "reward": 0.598214328289032, "reward_std": 0.30791985243558884, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000149011612, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1182.4554138183594, "epoch": 0.46475865014950873, "grad_norm": 1.8412355184555054, "kl": 1.30078125, "learning_rate": 6.823952100404278e-07, "loss": 0.1244, "reward": 0.7410714626312256, "reward_std": 0.2917444556951523, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982142984867096, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1119.2500610351562, "epoch": 0.46646732165741134, "grad_norm": 3.0378377437591553, "kl": 1.59375, "learning_rate": 6.798241452127453e-07, "loss": 0.1946, "reward": 0.9129464626312256, "reward_std": 0.3309457339346409, "rewards/accuracy_reward": 0.31250001676380634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1152.794662475586, "epoch": 0.46817599316531394, "grad_norm": 2.485898971557617, "kl": 1.51171875, "learning_rate": 6.77248449303415e-07, "loss": 0.166, "reward": 0.7477678954601288, "reward_std": 0.3498176485300064, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138393133878708, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1113.8125457763672, "epoch": 0.4698846646732166, "grad_norm": 1.3373255729675293, "kl": 1.296875, "learning_rate": 6.746682141925566e-07, "loss": 0.1665, "reward": 0.7276785969734192, "reward_std": 0.2847871445119381, "rewards/accuracy_reward": 0.12500000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1330.9911193847656, "epoch": 0.4715933361811192, "grad_norm": 2.1263697147369385, "kl": 1.578125, "learning_rate": 6.720835319222129e-07, "loss": 0.1566, "reward": 0.7120536118745804, "reward_std": 0.2614187151193619, "rewards/accuracy_reward": 0.12500000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.587053582072258, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1267.2590026855469, "epoch": 0.4733020076890218, "grad_norm": 1.6527024507522583, "kl": 1.38671875, "learning_rate": 6.694944946930646e-07, "loss": 0.1834, "reward": 0.7053571790456772, "reward_std": 0.30956385657191277, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000149011612, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1098.4197082519531, "epoch": 0.4750106791969244, "grad_norm": 2.0362517833709717, "kl": 1.384765625, "learning_rate": 6.669011948611433e-07, "loss": 0.1999, "reward": 0.714285746216774, "reward_std": 0.3250354677438736, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1202.9732666015625, "epoch": 0.476719350704827, "grad_norm": 1.8541934490203857, "kl": 1.2421875, "learning_rate": 6.643037249345353e-07, "loss": 0.1495, "reward": 0.7968750447034836, "reward_std": 0.3629959002137184, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6450893133878708, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1077.8839721679688, "epoch": 0.4784280222127296, "grad_norm": 4.3601975440979, "kl": 1.25, "learning_rate": 6.617021775700827e-07, "loss": 0.1589, "reward": 0.7745535969734192, "reward_std": 0.2124730534851551, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6495536118745804, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1257.8304138183594, "epoch": 0.4801366937206322, "grad_norm": 3.1542532444000244, "kl": 1.375, "learning_rate": 6.590966455700775e-07, "loss": 0.1211, "reward": 0.7343750298023224, "reward_std": 0.33456141501665115, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1170.6250305175781, "epoch": 0.4818453652285348, "grad_norm": 1.4870823621749878, "kl": 1.345703125, "learning_rate": 6.564872218789519e-07, "loss": 0.1459, "reward": 0.5915178656578064, "reward_std": 0.2642357088625431, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 1104.6786193847656, "epoch": 0.48355403673643743, "grad_norm": 2.3791069984436035, "kl": 1.51953125, "learning_rate": 6.538739995799619e-07, "loss": 0.1941, "reward": 0.6607143133878708, "reward_std": 0.2977270260453224, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1141.33935546875, "epoch": 0.48526270824434004, "grad_norm": 1.7712472677230835, "kl": 0.96875, "learning_rate": 6.512570718918674e-07, "loss": 0.0803, "reward": 0.7834821790456772, "reward_std": 0.3025682643055916, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6763393133878708, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1066.0000457763672, "epoch": 0.48697137975224264, "grad_norm": 1.81183922290802, "kl": 1.23828125, "learning_rate": 6.48636532165607e-07, "loss": 0.141, "reward": 0.689732164144516, "reward_std": 0.28278978168964386, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750298023224, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1204.2232666015625, "epoch": 0.48868005126014524, "grad_norm": 1.7218575477600098, "kl": 0.982421875, "learning_rate": 6.460124738809676e-07, "loss": 0.1329, "reward": 0.754464328289032, "reward_std": 0.33313046395778656, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1049.6072082519531, "epoch": 0.49038872276804785, "grad_norm": 1.7371902465820312, "kl": 1.0927734375, "learning_rate": 6.4338499064325e-07, "loss": 0.0885, "reward": 0.6696428954601288, "reward_std": 0.282687421888113, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000298023224, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1171.5357360839844, "epoch": 0.49209739427595045, "grad_norm": 1.699110507965088, "kl": 1.3359375, "learning_rate": 6.407541761799299e-07, "loss": 0.1286, "reward": 0.7700893133878708, "reward_std": 0.32603807747364044, "rewards/accuracy_reward": 0.14285714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.627232164144516, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1061.3572082519531, "epoch": 0.49380606578385305, "grad_norm": 1.2986787557601929, "kl": 1.0849609375, "learning_rate": 6.381201243373141e-07, "loss": 0.1445, "reward": 0.8191964626312256, "reward_std": 0.28703416883945465, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6316964626312256, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1180.7232971191406, "epoch": 0.49551473729175566, "grad_norm": 1.6688075065612793, "kl": 1.48828125, "learning_rate": 6.354829290771934e-07, "loss": 0.1462, "reward": 0.6741071790456772, "reward_std": 0.2714449465274811, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5491071790456772, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 1142.776840209961, "epoch": 0.49722340879965826, "grad_norm": 1.2279541492462158, "kl": 1.169921875, "learning_rate": 6.3284268447349e-07, "loss": 0.084, "reward": 0.707589328289032, "reward_std": 0.30590294674038887, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6272321790456772, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1074.3393249511719, "epoch": 0.49893208030756087, "grad_norm": 1.543359398841858, "kl": 0.962890625, "learning_rate": 6.301994847089026e-07, "loss": 0.1062, "reward": 0.823660746216774, "reward_std": 0.3202272988855839, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750447034836, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1084.5714721679688, "epoch": 0.5006407518154635, "grad_norm": 1.198907494544983, "kl": 1.0732421875, "learning_rate": 6.27553424071546e-07, "loss": 0.0944, "reward": 0.8102678954601288, "reward_std": 0.36363718286156654, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138392984867096, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1032.3036499023438, "epoch": 0.5023494233233661, "grad_norm": 2.4161536693573, "kl": 1.1162109375, "learning_rate": 6.249045969515882e-07, "loss": 0.061, "reward": 0.792410746216774, "reward_std": 0.32833296060562134, "rewards/accuracy_reward": 0.17857144214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138392984867096, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 1133.1786193847656, "epoch": 0.5040580948312687, "grad_norm": 1.3697497844696045, "kl": 1.068359375, "learning_rate": 6.222530978378829e-07, "loss": 0.0506, "reward": 0.7633928805589676, "reward_std": 0.31810464709997177, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 1129.0982360839844, "epoch": 0.5057667663391713, "grad_norm": 2.744436502456665, "kl": 1.0576171875, "learning_rate": 6.195990213145987e-07, "loss": 0.1275, "reward": 0.723214328289032, "reward_std": 0.322661180049181, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000447034836, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1034.7053985595703, "epoch": 0.5074754378470739, "grad_norm": 2.4617998600006104, "kl": 1.1220703125, "learning_rate": 6.169424620578464e-07, "loss": 0.1145, "reward": 0.6138393133878708, "reward_std": 0.2960447371006012, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1089.1250305175781, "epoch": 0.5091841093549765, "grad_norm": 1.269031047821045, "kl": 1.1015625, "learning_rate": 6.142835148322997e-07, "loss": 0.1128, "reward": 0.6674107611179352, "reward_std": 0.2783175855875015, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1040.7322082519531, "epoch": 0.5108927808628791, "grad_norm": 1.1665772199630737, "kl": 1.27734375, "learning_rate": 6.116222744878164e-07, "loss": 0.1047, "reward": 0.6651785969734192, "reward_std": 0.2582739628851414, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1006.8929138183594, "epoch": 0.5126014523707817, "grad_norm": 2.393529176712036, "kl": 1.4423828125, "learning_rate": 6.089588359560545e-07, "loss": 0.108, "reward": 0.6741071790456772, "reward_std": 0.3190380521118641, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.620535746216774, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1095.8929138183594, "epoch": 0.5143101238786844, "grad_norm": 1.7411783933639526, "kl": 1.1787109375, "learning_rate": 6.062932942470851e-07, "loss": 0.1055, "reward": 0.6540178805589676, "reward_std": 0.29942508041858673, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 968.6339721679688, "epoch": 0.5160187953865869, "grad_norm": 1.9820164442062378, "kl": 0.994140625, "learning_rate": 6.03625744446004e-07, "loss": 0.1085, "reward": 0.7812500447034836, "reward_std": 0.3846283331513405, "rewards/accuracy_reward": 0.13392858114093542, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214626312256, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1001.9643249511719, "epoch": 0.5177274668944896, "grad_norm": 2.251901865005493, "kl": 1.2119140625, "learning_rate": 6.009562817095399e-07, "loss": 0.0886, "reward": 0.7343750298023224, "reward_std": 0.2748066708445549, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178805589676, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 934.1250457763672, "epoch": 0.5194361384023921, "grad_norm": 2.02225399017334, "kl": 1.47265625, "learning_rate": 5.982850012626593e-07, "loss": 0.141, "reward": 0.7968750447034836, "reward_std": 0.2797054648399353, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6361607313156128, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 962.2589569091797, "epoch": 0.5211448099102948, "grad_norm": 1.4530941247940063, "kl": 1.283203125, "learning_rate": 5.956119983951697e-07, "loss": 0.0619, "reward": 0.7209821790456772, "reward_std": 0.25691112130880356, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6584821790456772, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1024.3482360839844, "epoch": 0.5228534814181973, "grad_norm": 1.2270197868347168, "kl": 1.255859375, "learning_rate": 5.929373684583217e-07, "loss": 0.0644, "reward": 0.7276785969734192, "reward_std": 0.27319925278425217, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 979.6339721679688, "epoch": 0.5245621529261, "grad_norm": 1.234403133392334, "kl": 1.0498046875, "learning_rate": 5.902612068614057e-07, "loss": 0.0763, "reward": 0.7120535969734192, "reward_std": 0.30254342406988144, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250149011612, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 949.3839721679688, "epoch": 0.5262708244340025, "grad_norm": 1.359805941581726, "kl": 1.30859375, "learning_rate": 5.875836090683497e-07, "loss": 0.1254, "reward": 0.7433035969734192, "reward_std": 0.3441048711538315, "rewards/accuracy_reward": 0.11607143189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6272321790456772, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 915.27685546875, "epoch": 0.5279794959419052, "grad_norm": 1.3475511074066162, "kl": 0.9833984375, "learning_rate": 5.849046705943136e-07, "loss": 0.1299, "reward": 0.8459821790456772, "reward_std": 0.33283481001853943, "rewards/accuracy_reward": 0.17857143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 990.4464569091797, "epoch": 0.5296881674498077, "grad_norm": 1.7766283750534058, "kl": 1.349609375, "learning_rate": 5.822244870022824e-07, "loss": -0.0125, "reward": 0.7656250447034836, "reward_std": 0.33516017347574234, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 990.8214721679688, "epoch": 0.5313968389577104, "grad_norm": 1.651833415031433, "kl": 1.130859375, "learning_rate": 5.795431538996567e-07, "loss": 0.0755, "reward": 0.7366071790456772, "reward_std": 0.25338204205036163, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214477300644, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 957.7589721679688, "epoch": 0.5331055104656129, "grad_norm": 2.9751198291778564, "kl": 1.1640625, "learning_rate": 5.768607669348419e-07, "loss": 0.0365, "reward": 0.7321428805589676, "reward_std": 0.31464507430791855, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6517857313156128, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 999.3929290771484, "epoch": 0.5348141819735156, "grad_norm": 1.149966835975647, "kl": 0.9921875, "learning_rate": 5.741774217938373e-07, "loss": 0.0694, "reward": 0.785714328289032, "reward_std": 0.2896200269460678, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6696428954601288, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 948.8661041259766, "epoch": 0.5365228534814181, "grad_norm": 1.456761121749878, "kl": 1.4287109375, "learning_rate": 5.714932141968221e-07, "loss": 0.0082, "reward": 0.7745536118745804, "reward_std": 0.3295694664120674, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6316964775323868, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 946.607177734375, "epoch": 0.5382315249893208, "grad_norm": 4.404793739318848, "kl": 1.830078125, "learning_rate": 5.688082398947409e-07, "loss": 0.0256, "reward": 0.7343750298023224, "reward_std": 0.3344566971063614, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.627232164144516, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1063.0982818603516, "epoch": 0.5399401964972234, "grad_norm": 4.570176601409912, "kl": 1.91015625, "learning_rate": 5.66122594665888e-07, "loss": 0.1439, "reward": 0.6540178954601288, "reward_std": 0.32976697385311127, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.600446455180645, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 1044.8482666015625, "epoch": 0.541648868005126, "grad_norm": 2.2109522819519043, "kl": 1.357421875, "learning_rate": 5.634363743124918e-07, "loss": 0.0218, "reward": 0.698660746216774, "reward_std": 0.30520205572247505, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183035969734192, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 832.3036193847656, "epoch": 0.5433575395130287, "grad_norm": 2.23356556892395, "kl": 1.32421875, "learning_rate": 5.607496746572953e-07, "loss": 0.0726, "reward": 0.7745536118745804, "reward_std": 0.3260001614689827, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 1013.9375457763672, "epoch": 0.5450662110209312, "grad_norm": 2.5800023078918457, "kl": 1.2919921875, "learning_rate": 5.5806259154014e-07, "loss": 0.0126, "reward": 0.7790178954601288, "reward_std": 0.282642625272274, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6361607313156128, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 986.5893249511719, "epoch": 0.5467748825288339, "grad_norm": 1.79945969581604, "kl": 1.18359375, "learning_rate": 5.553752208145459e-07, "loss": 0.1263, "reward": 0.7120536118745804, "reward_std": 0.26668355241417885, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6584821790456772, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 1183.4911193847656, "epoch": 0.5484835540367364, "grad_norm": 1.8276363611221313, "kl": 0.9853515625, "learning_rate": 5.526876583442928e-07, "loss": 0.0898, "reward": 0.6696428954601288, "reward_std": 0.2576278932392597, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6160714626312256, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 976.0625457763672, "epoch": 0.5501922255446391, "grad_norm": 1.678053855895996, "kl": 1.546875, "learning_rate": 5.5e-07, "loss": 0.0664, "reward": 0.7075893133878708, "reward_std": 0.2546408623456955, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464477300644, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 1040.2679138183594, "epoch": 0.5519008970525416, "grad_norm": 1.1143730878829956, "kl": 0.9609375, "learning_rate": 5.473123416557074e-07, "loss": 0.0016, "reward": 0.7410714477300644, "reward_std": 0.2841089479625225, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000298023224, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 1006.4107666015625, "epoch": 0.5536095685604443, "grad_norm": 1.3419076204299927, "kl": 1.158203125, "learning_rate": 5.446247791854541e-07, "loss": -0.01, "reward": 0.707589328289032, "reward_std": 0.30943915247917175, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750447034836, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1045.1607666015625, "epoch": 0.5553182400683468, "grad_norm": 1.9382789134979248, "kl": 0.9970703125, "learning_rate": 5.4193740845986e-07, "loss": 0.0435, "reward": 0.8325893133878708, "reward_std": 0.28164472058415413, "rewards/accuracy_reward": 0.17857143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178954601288, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 975.0804138183594, "epoch": 0.5570269115762495, "grad_norm": 1.8686273097991943, "kl": 1.36767578125, "learning_rate": 5.392503253427048e-07, "loss": 0.0421, "reward": 0.892857164144516, "reward_std": 0.3082231916487217, "rewards/accuracy_reward": 0.22321429755538702, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6696428954601288, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 871.5714721679688, "epoch": 0.558735583084152, "grad_norm": 2.199556350708008, "kl": 1.498046875, "learning_rate": 5.365636256875083e-07, "loss": 0.0957, "reward": 0.8616071939468384, "reward_std": 0.3545013815164566, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6741071790456772, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 932.5893402099609, "epoch": 0.5604442545920547, "grad_norm": 2.4717750549316406, "kl": 1.8876953125, "learning_rate": 5.338774053341119e-07, "loss": 0.1122, "reward": 0.7544643133878708, "reward_std": 0.30379757285118103, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.620535746216774, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 924.4375457763672, "epoch": 0.5621529260999573, "grad_norm": 4.240217208862305, "kl": 1.712890625, "learning_rate": 5.311917601052594e-07, "loss": 0.1299, "reward": 0.7008928954601288, "reward_std": 0.2621128298342228, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.620535746216774, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 989.5536041259766, "epoch": 0.5638615976078599, "grad_norm": 15.417141914367676, "kl": 2.421875, "learning_rate": 5.28506785803178e-07, "loss": 0.0729, "reward": 0.705357164144516, "reward_std": 0.3452422171831131, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 984.2232666015625, "epoch": 0.5655702691157625, "grad_norm": 5.937498092651367, "kl": 1.6884765625, "learning_rate": 5.258225782061628e-07, "loss": 0.106, "reward": 0.7924107611179352, "reward_std": 0.3410973884165287, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 932.5446929931641, "epoch": 0.5672789406236651, "grad_norm": 3.5565853118896484, "kl": 1.548828125, "learning_rate": 5.231392330651582e-07, "loss": 0.0294, "reward": 0.761160746216774, "reward_std": 0.2842172831296921, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178954601288, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 756.8482513427734, "epoch": 0.5689876121315677, "grad_norm": 2.91373348236084, "kl": 2.2265625, "learning_rate": 5.204568461003433e-07, "loss": 0.0533, "reward": 0.7633928954601288, "reward_std": 0.26106104999780655, "rewards/accuracy_reward": 0.14285715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6205357313156128, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 812.4553985595703, "epoch": 0.5706962836394703, "grad_norm": 4.170596122741699, "kl": 1.994140625, "learning_rate": 5.177755129977176e-07, "loss": 0.0687, "reward": 0.7276785969734192, "reward_std": 0.25997817888855934, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.620535746216774, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 806.9464721679688, "epoch": 0.5724049551473729, "grad_norm": 3.5833747386932373, "kl": 1.783203125, "learning_rate": 5.150953294056864e-07, "loss": 0.0631, "reward": 0.8236607611179352, "reward_std": 0.3711840733885765, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464626312256, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 888.3393249511719, "epoch": 0.5741136266552755, "grad_norm": 2.0271449089050293, "kl": 1.8828125, "learning_rate": 5.124163909316505e-07, "loss": 0.0649, "reward": 0.7098214775323868, "reward_std": 0.2663373500108719, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928954601288, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 919.9196929931641, "epoch": 0.5758222981631781, "grad_norm": 4.025437831878662, "kl": 1.900390625, "learning_rate": 5.097387931385945e-07, "loss": 0.1032, "reward": 0.645089328289032, "reward_std": 0.18225691094994545, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750447034836, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 891.7053833007812, "epoch": 0.5775309696710808, "grad_norm": 2.983868360519409, "kl": 1.8046875, "learning_rate": 5.070626315416783e-07, "loss": 0.092, "reward": 0.776785746216774, "reward_std": 0.3568408042192459, "rewards/accuracy_reward": 0.14285714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 902.4553985595703, "epoch": 0.5792396411789833, "grad_norm": 1.8878085613250732, "kl": 1.822265625, "learning_rate": 5.043880016048303e-07, "loss": 0.0432, "reward": 0.7276785969734192, "reward_std": 0.39400141686201096, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500298023224, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 836.6250457763672, "epoch": 0.580948312686886, "grad_norm": 2.3490586280822754, "kl": 1.66796875, "learning_rate": 5.017149987373409e-07, "loss": 0.0652, "reward": 0.7500000298023224, "reward_std": 0.2971554473042488, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000298023224, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 894.9643249511719, "epoch": 0.5826569841947885, "grad_norm": 1.9202009439468384, "kl": 1.736328125, "learning_rate": 4.990437182904601e-07, "loss": 0.0425, "reward": 0.676339328289032, "reward_std": 0.26142656803131104, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 878.6786041259766, "epoch": 0.5843656557026912, "grad_norm": 1.74933922290802, "kl": 1.623046875, "learning_rate": 4.96374255553996e-07, "loss": 0.0164, "reward": 0.6540178954601288, "reward_std": 0.29254597797989845, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183036118745804, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 772.8036041259766, "epoch": 0.5860743272105937, "grad_norm": 1.9784163236618042, "kl": 1.673828125, "learning_rate": 4.937067057529151e-07, "loss": 0.1045, "reward": 0.7008928805589676, "reward_std": 0.2629803456366062, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928656578064, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 911.1786193847656, "epoch": 0.5877829987184964, "grad_norm": 2.6527278423309326, "kl": 1.767578125, "learning_rate": 4.910411640439454e-07, "loss": 0.0647, "reward": 0.683035746216774, "reward_std": 0.33389192819595337, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.558035746216774, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 772.1875457763672, "epoch": 0.5894916702263989, "grad_norm": 3.1040942668914795, "kl": 1.8984375, "learning_rate": 4.883777255121835e-07, "loss": 0.1127, "reward": 0.6852678954601288, "reward_std": 0.31748366355895996, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5424107313156128, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 832.3839721679688, "epoch": 0.5912003417343016, "grad_norm": 2.6396405696868896, "kl": 1.70703125, "learning_rate": 4.857164851677004e-07, "loss": 0.1568, "reward": 0.676339328289032, "reward_std": 0.2795928381383419, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964626312256, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 787.6339721679688, "epoch": 0.5929090132422042, "grad_norm": 2.293712854385376, "kl": 1.533203125, "learning_rate": 4.830575379421537e-07, "loss": 0.0639, "reward": 0.7477678954601288, "reward_std": 0.33594826608896255, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 949.7321929931641, "epoch": 0.5946176847501068, "grad_norm": 2.1845521926879883, "kl": 1.474609375, "learning_rate": 4.804009786854012e-07, "loss": -0.0477, "reward": 0.705357164144516, "reward_std": 0.3584393262863159, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000149011612, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 944.0803985595703, "epoch": 0.5963263562580094, "grad_norm": 2.2127904891967773, "kl": 1.2568359375, "learning_rate": 4.777469021621171e-07, "loss": 0.0115, "reward": 0.6897321790456772, "reward_std": 0.3161856308579445, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178954601288, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 918.4464721679688, "epoch": 0.598035027765912, "grad_norm": 1.2418453693389893, "kl": 1.2275390625, "learning_rate": 4.7509540304841176e-07, "loss": -0.0218, "reward": 0.7455357611179352, "reward_std": 0.3125985115766525, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611607164144516, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 866.4553833007812, "epoch": 0.5997436992738147, "grad_norm": 2.8095757961273193, "kl": 1.185546875, "learning_rate": 4.7244657592845403e-07, "loss": 0.0508, "reward": 0.6517857313156128, "reward_std": 0.33358415961265564, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 898.3214721679688, "epoch": 0.6014523707817172, "grad_norm": 2.650524139404297, "kl": 1.25, "learning_rate": 4.6980051529109743e-07, "loss": 0.0255, "reward": 0.6428571790456772, "reward_std": 0.2761208079755306, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 888.3482513427734, "epoch": 0.6031610422896199, "grad_norm": 2.572216749191284, "kl": 1.3515625, "learning_rate": 4.671573155265101e-07, "loss": 0.0859, "reward": 0.7812500298023224, "reward_std": 0.32725009322166443, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928805589676, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 920.2500457763672, "epoch": 0.6048697137975224, "grad_norm": 1.893396019935608, "kl": 1.509765625, "learning_rate": 4.6451707092280676e-07, "loss": 0.0492, "reward": 0.683035746216774, "reward_std": 0.2604496330022812, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500298023224, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 852.6339721679688, "epoch": 0.6065783853054251, "grad_norm": 2.0251169204711914, "kl": 1.19140625, "learning_rate": 4.6187987566268584e-07, "loss": 0.1073, "reward": 0.7544643133878708, "reward_std": 0.3476915881037712, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214626312256, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1044.857177734375, "epoch": 0.6082870568133276, "grad_norm": 1.508630394935608, "kl": 1.2109375, "learning_rate": 4.5924582382007006e-07, "loss": 0.0244, "reward": 0.6718750298023224, "reward_std": 0.32279861345887184, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750298023224, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 930.2411193847656, "epoch": 0.6099957283212303, "grad_norm": 5.364377021789551, "kl": 1.201171875, "learning_rate": 4.5661500935675e-07, "loss": -0.0278, "reward": 0.7611607611179352, "reward_std": 0.3252589702606201, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178805589676, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 969.8125457763672, "epoch": 0.6117043998291328, "grad_norm": 2.1110141277313232, "kl": 1.615234375, "learning_rate": 4.5398752611903235e-07, "loss": 0.0592, "reward": 0.7053571790456772, "reward_std": 0.30860764533281326, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6071428805589676, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 972.4018096923828, "epoch": 0.6134130713370355, "grad_norm": 3.0337636470794678, "kl": 1.392578125, "learning_rate": 4.5136346783439317e-07, "loss": 0.082, "reward": 0.7433036118745804, "reward_std": 0.3119441531598568, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183035969734192, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 915.7678985595703, "epoch": 0.615121742844938, "grad_norm": 2.682349681854248, "kl": 1.236328125, "learning_rate": 4.4874292810813277e-07, "loss": 0.0056, "reward": 0.808035746216774, "reward_std": 0.31216536462306976, "rewards/accuracy_reward": 0.16964285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928954601288, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1003.0000457763672, "epoch": 0.6168304143528407, "grad_norm": 1.2690590620040894, "kl": 1.365234375, "learning_rate": 4.461260004200381e-07, "loss": 0.0303, "reward": 0.738839328289032, "reward_std": 0.27491897344589233, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250298023224, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 943.7857666015625, "epoch": 0.6185390858607432, "grad_norm": 2.0243499279022217, "kl": 1.0732421875, "learning_rate": 4.4351277812104826e-07, "loss": 0.0085, "reward": 0.8035714626312256, "reward_std": 0.3553484305739403, "rewards/accuracy_reward": 0.18750001210719347, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6160714626312256, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 917.3036041259766, "epoch": 0.6202477573686459, "grad_norm": 1.7993136644363403, "kl": 1.2578125, "learning_rate": 4.4090335442992256e-07, "loss": 0.0678, "reward": 0.8370536118745804, "reward_std": 0.308171346783638, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 917.5357666015625, "epoch": 0.6219564288765485, "grad_norm": 1.5550683736801147, "kl": 1.263671875, "learning_rate": 4.3829782242991733e-07, "loss": 0.0717, "reward": 0.7991071939468384, "reward_std": 0.35001590847969055, "rewards/accuracy_reward": 0.15178572479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214477300644, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1077.7054290771484, "epoch": 0.6236651003844511, "grad_norm": 1.3447710275650024, "kl": 1.107421875, "learning_rate": 4.356962750654647e-07, "loss": 0.039, "reward": 0.808035746216774, "reward_std": 0.32425977289676666, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 891.3393249511719, "epoch": 0.6253737718923537, "grad_norm": 1.6414378881454468, "kl": 1.404296875, "learning_rate": 4.3309880513885664e-07, "loss": -0.0229, "reward": 0.7410714626312256, "reward_std": 0.3331568166613579, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.651785746216774, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 908.9107513427734, "epoch": 0.6270824434002563, "grad_norm": 5.815966606140137, "kl": 1.3486328125, "learning_rate": 4.305055053069354e-07, "loss": 0.0751, "reward": 0.7812500298023224, "reward_std": 0.3126010484993458, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 904.6250305175781, "epoch": 0.6287911149081589, "grad_norm": 2.0202035903930664, "kl": 1.69921875, "learning_rate": 4.2791646807778715e-07, "loss": -0.0009, "reward": 0.8705357611179352, "reward_std": 0.3386400118470192, "rewards/accuracy_reward": 0.21428572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500447034836, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 941.5804138183594, "epoch": 0.6304997864160615, "grad_norm": 1.429031491279602, "kl": 1.1630859375, "learning_rate": 4.2533178580744324e-07, "loss": -0.0099, "reward": 0.8906250298023224, "reward_std": 0.302538238465786, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250298023224, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 984.9911041259766, "epoch": 0.6322084579239641, "grad_norm": 1.812880516052246, "kl": 1.1552734375, "learning_rate": 4.227515506965851e-07, "loss": 0.0059, "reward": 0.8683036118745804, "reward_std": 0.36844101548194885, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6897321790456772, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 851.3482513427734, "epoch": 0.6339171294318667, "grad_norm": 2.088967800140381, "kl": 1.638671875, "learning_rate": 4.201758547872546e-07, "loss": 0.0304, "reward": 0.7053571790456772, "reward_std": 0.26205795258283615, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000447034836, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 929.544677734375, "epoch": 0.6356258009397693, "grad_norm": 2.631113290786743, "kl": 1.478515625, "learning_rate": 4.1760478995957217e-07, "loss": 0.0981, "reward": 0.9218750298023224, "reward_std": 0.3007860407233238, "rewards/accuracy_reward": 0.2589285895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464626312256, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 995.4286193847656, "epoch": 0.6373344724476719, "grad_norm": 1.5658360719680786, "kl": 1.032958984375, "learning_rate": 4.150384479284581e-07, "loss": 0.0934, "reward": 0.854910746216774, "reward_std": 0.2803886868059635, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7299107611179352, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 1038.4107513427734, "epoch": 0.6390431439555745, "grad_norm": 1.1654471158981323, "kl": 1.1416015625, "learning_rate": 4.124769202403614e-07, "loss": 0.0633, "reward": 0.8102678805589676, "reward_std": 0.2920790947973728, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 993.3393402099609, "epoch": 0.6407518154634771, "grad_norm": 1.1324126720428467, "kl": 1.29296875, "learning_rate": 4.0992029826999374e-07, "loss": 0.0193, "reward": 0.7723214626312256, "reward_std": 0.2212589718401432, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 1052.0536041259766, "epoch": 0.6424604869713798, "grad_norm": 2.0446372032165527, "kl": 1.4208984375, "learning_rate": 4.073686732170708e-07, "loss": -0.0244, "reward": 0.7366071790456772, "reward_std": 0.27388640865683556, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 1072.9286193847656, "epoch": 0.6441691584792824, "grad_norm": 1.11142098903656, "kl": 0.87841796875, "learning_rate": 4.0482213610305793e-07, "loss": 0.0874, "reward": 0.8303571790456772, "reward_std": 0.2902870513498783, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 808.294677734375, "epoch": 0.645877829987185, "grad_norm": 1.58376944065094, "kl": 1.3818359375, "learning_rate": 4.022807777679238e-07, "loss": -0.0169, "reward": 0.832589328289032, "reward_std": 0.27708718925714493, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689732164144516, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1124.4822082519531, "epoch": 0.6475865014950876, "grad_norm": 1.6241235733032227, "kl": 1.01953125, "learning_rate": 3.9974468886690007e-07, "loss": -0.0186, "reward": 0.8214286118745804, "reward_std": 0.28429659456014633, "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.705357164144516, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 933.3214569091797, "epoch": 0.6492951730029902, "grad_norm": 1.753510594367981, "kl": 1.693359375, "learning_rate": 3.9721395986724713e-07, "loss": 0.012, "reward": 0.8750000447034836, "reward_std": 0.2899741195142269, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714626312256, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1034.7947082519531, "epoch": 0.6510038445108928, "grad_norm": 1.4004111289978027, "kl": 0.9990234375, "learning_rate": 3.946886810450277e-07, "loss": 0.0578, "reward": 0.745535746216774, "reward_std": 0.262836217880249, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6919643133878708, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 1010.2678833007812, "epoch": 0.6527125160187954, "grad_norm": 3.8880679607391357, "kl": 1.474609375, "learning_rate": 3.9216894248188527e-07, "loss": 0.0382, "reward": 0.926339328289032, "reward_std": 0.3153666779398918, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964626312256, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 982.0982666015625, "epoch": 0.654421187526698, "grad_norm": 1.1028307676315308, "kl": 1.4013671875, "learning_rate": 3.896548340618316e-07, "loss": 0.0026, "reward": 0.7790178954601288, "reward_std": 0.3125469870865345, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464626312256, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 1049.1607513427734, "epoch": 0.6561298590346006, "grad_norm": 1.5106911659240723, "kl": 0.9033203125, "learning_rate": 3.8714644546804075e-07, "loss": -0.0412, "reward": 0.9196428954601288, "reward_std": 0.2908468618988991, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6964285969734192, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1119.4553833007812, "epoch": 0.6578385305425032, "grad_norm": 2.4177772998809814, "kl": 1.248046875, "learning_rate": 3.8464386617964915e-07, "loss": -0.0001, "reward": 0.7633928954601288, "reward_std": 0.28351058438420296, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214626312256, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 1045.044692993164, "epoch": 0.6595472020504058, "grad_norm": 1.1626689434051514, "kl": 0.9814453125, "learning_rate": 3.821471854685632e-07, "loss": 0.0689, "reward": 0.7879464626312256, "reward_std": 0.274183202534914, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.707589328289032, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1027.4464721679688, "epoch": 0.6612558735583084, "grad_norm": 1.6755082607269287, "kl": 1.7998046875, "learning_rate": 3.796564923962765e-07, "loss": 0.0092, "reward": 0.7790178954601288, "reward_std": 0.3768128529191017, "rewards/accuracy_reward": 0.12500000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178954601288, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1048.3839874267578, "epoch": 0.662964545066211, "grad_norm": 1.5085257291793823, "kl": 1.1611328125, "learning_rate": 3.771718758106913e-07, "loss": 0.021, "reward": 0.7767857611179352, "reward_std": 0.3047008663415909, "rewards/accuracy_reward": 0.10714286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6696428805589676, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1051.9464569091797, "epoch": 0.6646732165741136, "grad_norm": 2.055103063583374, "kl": 1.4482421875, "learning_rate": 3.7469342434294983e-07, "loss": -0.0042, "reward": 0.785714328289032, "reward_std": 0.3091980591416359, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6607143133878708, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 947.5268402099609, "epoch": 0.6663818880820163, "grad_norm": 2.4679975509643555, "kl": 1.55078125, "learning_rate": 3.7222122640427234e-07, "loss": 0.0497, "reward": 0.761160746216774, "reward_std": 0.34849340468645096, "rewards/accuracy_reward": 0.10714286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178954601288, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1085.9911041259766, "epoch": 0.6680905595899188, "grad_norm": 1.3151092529296875, "kl": 1.1357421875, "learning_rate": 3.6975537018280356e-07, "loss": 0.0259, "reward": 0.8794643431901932, "reward_std": 0.2425183802843094, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7455357313156128, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1115.5625610351562, "epoch": 0.6697992310978215, "grad_norm": 1.7801506519317627, "kl": 0.890625, "learning_rate": 3.6729594364046656e-07, "loss": 0.0117, "reward": 0.9017857611179352, "reward_std": 0.35527393966913223, "rewards/accuracy_reward": 0.19642857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7053571790456772, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1022.5625610351562, "epoch": 0.671507902605724, "grad_norm": 1.3101731538772583, "kl": 1.26171875, "learning_rate": 3.6484303450982566e-07, "loss": 0.0064, "reward": 0.895089328289032, "reward_std": 0.2832319252192974, "rewards/accuracy_reward": 0.19642858393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.698660746216774, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1006.5446929931641, "epoch": 0.6732165741136267, "grad_norm": 2.5020875930786133, "kl": 1.513671875, "learning_rate": 3.6239673029095583e-07, "loss": 0.0591, "reward": 0.7633928954601288, "reward_std": 0.32094674557447433, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500298023224, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 1137.357177734375, "epoch": 0.6749252456215292, "grad_norm": 1.1993253231048584, "kl": 0.7530517578125, "learning_rate": 3.5995711824832177e-07, "loss": 0.0198, "reward": 0.988839328289032, "reward_std": 0.3272216320037842, "rewards/accuracy_reward": 0.2678571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7209821939468384, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1015.2500457763672, "epoch": 0.6766339171294319, "grad_norm": 1.6216177940368652, "kl": 1.748046875, "learning_rate": 3.57524285407666e-07, "loss": 0.0133, "reward": 0.683035746216774, "reward_std": 0.28871873766183853, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214626312256, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 994.4553985595703, "epoch": 0.6783425886373344, "grad_norm": 2.603776216506958, "kl": 1.52734375, "learning_rate": 3.550983185529023e-07, "loss": 0.0214, "reward": 0.8236607611179352, "reward_std": 0.32682809978723526, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6897321939468384, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 847.8750305175781, "epoch": 0.6800512601452371, "grad_norm": 2.379682779312134, "kl": 1.1435546875, "learning_rate": 3.5267930422302263e-07, "loss": 0.0461, "reward": 0.7901786267757416, "reward_std": 0.26421744748950005, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.754464328289032, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 979.1964721679688, "epoch": 0.6817599316531396, "grad_norm": 2.391604423522949, "kl": 1.4482421875, "learning_rate": 3.502673287090082e-07, "loss": 0.067, "reward": 0.9084821790456772, "reward_std": 0.3233964778482914, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720982164144516, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1082.401840209961, "epoch": 0.6834686031610423, "grad_norm": 1.6612086296081543, "kl": 1.2548828125, "learning_rate": 3.4786247805075107e-07, "loss": 0.0336, "reward": 0.7410714477300644, "reward_std": 0.2830534353852272, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000447034836, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 1009.3661193847656, "epoch": 0.6851772746689448, "grad_norm": 3.3944904804229736, "kl": 1.0830078125, "learning_rate": 3.454648380339873e-07, "loss": 0.0156, "reward": 0.8816964775323868, "reward_std": 0.32183992490172386, "rewards/accuracy_reward": 0.17857143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250447034836, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 1051.5893249511719, "epoch": 0.6868859461768475, "grad_norm": 2.288043260574341, "kl": 1.4921875, "learning_rate": 3.430744941872342e-07, "loss": 0.0984, "reward": 0.8258928954601288, "reward_std": 0.32092058658599854, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214775323868, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 925.3750457763672, "epoch": 0.68859461768475, "grad_norm": 2.3981616497039795, "kl": 1.1162109375, "learning_rate": 3.4069153177874024e-07, "loss": 0.0271, "reward": 0.9732143431901932, "reward_std": 0.30317365378141403, "rewards/accuracy_reward": 0.25000000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.723214328289032, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 1002.8661346435547, "epoch": 0.6903032891926527, "grad_norm": 2.4945766925811768, "kl": 1.53125, "learning_rate": 3.383160358134445e-07, "loss": 0.0235, "reward": 0.8191964626312256, "reward_std": 0.2600704915821552, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6763393133878708, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 953.8839721679688, "epoch": 0.6920119607005554, "grad_norm": 1.441089153289795, "kl": 1.3359375, "learning_rate": 3.3594809102994235e-07, "loss": 0.0685, "reward": 0.7589286118745804, "reward_std": 0.3026113845407963, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7053571790456772, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 863.3571929931641, "epoch": 0.6937206322084579, "grad_norm": 2.39959716796875, "kl": 1.818359375, "learning_rate": 3.335877818974642e-07, "loss": -0.0321, "reward": 0.8437500447034836, "reward_std": 0.2551427371799946, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 1018.8660888671875, "epoch": 0.6954293037163606, "grad_norm": 3.515746593475342, "kl": 1.396484375, "learning_rate": 3.3123519261286214e-07, "loss": -0.0051, "reward": 0.8370536118745804, "reward_std": 0.27879123389720917, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964477300644, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 885.2678833007812, "epoch": 0.6971379752242631, "grad_norm": 2.1625101566314697, "kl": 1.8046875, "learning_rate": 3.2889040709760554e-07, "loss": 0.0075, "reward": 0.7790178954601288, "reward_std": 0.3068099170923233, "rewards/accuracy_reward": 0.12500000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178954601288, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 968.7946929931641, "epoch": 0.6988466467321658, "grad_norm": 2.1072840690612793, "kl": 1.62890625, "learning_rate": 3.2655350899478804e-07, "loss": 0.0341, "reward": 0.7566964626312256, "reward_std": 0.30446139350533485, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6584821790456772, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 983.9464874267578, "epoch": 0.7005553182400683, "grad_norm": 1.7346993684768677, "kl": 1.42578125, "learning_rate": 3.2422458166614485e-07, "loss": 0.0244, "reward": 0.7678571939468384, "reward_std": 0.3458539545536041, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.642857164144516, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 1067.5357666015625, "epoch": 0.702263989747971, "grad_norm": 1.4967718124389648, "kl": 1.34765625, "learning_rate": 3.219037081890766e-07, "loss": 0.0106, "reward": 0.7410714626312256, "reward_std": 0.2904849871993065, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714775323868, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 942.4464721679688, "epoch": 0.7039726612558735, "grad_norm": 1.5984158515930176, "kl": 1.4130859375, "learning_rate": 3.1959097135368813e-07, "loss": 0.0991, "reward": 0.745535746216774, "reward_std": 0.2680112384259701, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214626312256, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 856.2054138183594, "epoch": 0.7056813327637762, "grad_norm": 2.6420440673828125, "kl": 1.8671875, "learning_rate": 3.1728645365983406e-07, "loss": 0.0415, "reward": 0.6808036118745804, "reward_std": 0.254717655479908, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178954601288, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 854.794677734375, "epoch": 0.7073900042716788, "grad_norm": 1.766916275024414, "kl": 1.677734375, "learning_rate": 3.149902373141763e-07, "loss": -0.0056, "reward": 0.7723214477300644, "reward_std": 0.3421461209654808, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500298023224, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 902.3036193847656, "epoch": 0.7090986757795814, "grad_norm": 3.2869479656219482, "kl": 1.814453125, "learning_rate": 3.1270240422725145e-07, "loss": -0.0009, "reward": 0.7165178954601288, "reward_std": 0.3962520733475685, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 849.0803985595703, "epoch": 0.710807347287484, "grad_norm": 1.7114737033843994, "kl": 1.4375, "learning_rate": 3.1042303601054836e-07, "loss": 0.0618, "reward": 0.8571428954601288, "reward_std": 0.3375391215085983, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6964285969734192, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 808.5000457763672, "epoch": 0.7125160187953866, "grad_norm": 2.0757381916046143, "kl": 1.6328125, "learning_rate": 3.081522139735977e-07, "loss": 0.056, "reward": 0.7098214626312256, "reward_std": 0.35865096002817154, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6205357313156128, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 846.6428833007812, "epoch": 0.7142246903032892, "grad_norm": 2.0216376781463623, "kl": 1.40234375, "learning_rate": 3.058900191210717e-07, "loss": 0.0423, "reward": 0.7991071790456772, "reward_std": 0.3462955616414547, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6919643133878708, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 945.3393249511719, "epoch": 0.7159333618111918, "grad_norm": 2.1384878158569336, "kl": 1.361328125, "learning_rate": 3.036365321498929e-07, "loss": 0.0029, "reward": 0.7589286118745804, "reward_std": 0.305264450609684, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714626312256, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 930.3036041259766, "epoch": 0.7176420333190944, "grad_norm": 2.572693347930908, "kl": 1.275390625, "learning_rate": 3.013918334463574e-07, "loss": 0.0292, "reward": 0.7500000298023224, "reward_std": 0.2966507151722908, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.651785746216774, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 829.857177734375, "epoch": 0.719350704826997, "grad_norm": 2.9174904823303223, "kl": 1.4306640625, "learning_rate": 2.9915600308326656e-07, "loss": 0.0435, "reward": 0.7678571790456772, "reward_std": 0.2890019714832306, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714626312256, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 851.1607513427734, "epoch": 0.7210593763348996, "grad_norm": 1.2860957384109497, "kl": 1.666015625, "learning_rate": 2.9692912081707036e-07, "loss": -0.041, "reward": 0.8236607611179352, "reward_std": 0.3163386881351471, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.680803582072258, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 895.2946929931641, "epoch": 0.7227680478428022, "grad_norm": 2.8128488063812256, "kl": 1.7109375, "learning_rate": 2.947112660850227e-07, "loss": 0.053, "reward": 0.7232143133878708, "reward_std": 0.2777799591422081, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000149011612, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 847.5982666015625, "epoch": 0.7244767193507048, "grad_norm": 2.750545024871826, "kl": 1.451171875, "learning_rate": 2.925025180023478e-07, "loss": 0.0866, "reward": 0.8035714775323868, "reward_std": 0.3833009973168373, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.642857164144516, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 930.9107513427734, "epoch": 0.7261853908586074, "grad_norm": 2.4542009830474854, "kl": 1.30859375, "learning_rate": 2.9030295535941797e-07, "loss": 0.0585, "reward": 0.8058035969734192, "reward_std": 0.28921060264110565, "rewards/accuracy_reward": 0.13392858114093542, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750298023224, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 971.1071929931641, "epoch": 0.72789406236651, "grad_norm": 1.2857714891433716, "kl": 1.0302734375, "learning_rate": 2.8811265661894233e-07, "loss": -0.0204, "reward": 0.8883928805589676, "reward_std": 0.28945399820804596, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6741071492433548, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 904.5000152587891, "epoch": 0.7296027338744127, "grad_norm": 1.5165693759918213, "kl": 1.3486328125, "learning_rate": 2.8593169991316906e-07, "loss": -0.0397, "reward": 0.8571428805589676, "reward_std": 0.34615374729037285, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7232143133878708, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1031.9464721679688, "epoch": 0.7313114053823152, "grad_norm": 1.913068413734436, "kl": 1.4609375, "learning_rate": 2.837601630410974e-07, "loss": 0.0449, "reward": 0.7455357611179352, "reward_std": 0.3147626481950283, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1025.6964874267578, "epoch": 0.7330200768902179, "grad_norm": 1.805964469909668, "kl": 1.201171875, "learning_rate": 2.815981234657025e-07, "loss": 0.034, "reward": 0.8950893431901932, "reward_std": 0.29243259876966476, "rewards/accuracy_reward": 0.19642858300358057, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.698660746216774, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 939.7411346435547, "epoch": 0.7347287483981204, "grad_norm": 2.118492603302002, "kl": 1.03125, "learning_rate": 2.794456583111726e-07, "loss": 0.0024, "reward": 0.7767857611179352, "reward_std": 0.2828138619661331, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 1012.2411346435547, "epoch": 0.7364374199060231, "grad_norm": 1.6793782711029053, "kl": 1.01953125, "learning_rate": 2.773028443601576e-07, "loss": -0.0035, "reward": 0.7633928954601288, "reward_std": 0.29614872857928276, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6919643431901932, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1062.6161346435547, "epoch": 0.7381460914139256, "grad_norm": 2.204909324645996, "kl": 1.04541015625, "learning_rate": 2.7516975805102973e-07, "loss": -0.0019, "reward": 0.738839328289032, "reward_std": 0.32136082276701927, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.658482164144516, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 919.9375305175781, "epoch": 0.7398547629218283, "grad_norm": 3.826732635498047, "kl": 1.01171875, "learning_rate": 2.7304647547515753e-07, "loss": -0.005, "reward": 0.9419643133878708, "reward_std": 0.3394418992102146, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7276786118745804, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 960.1161193847656, "epoch": 0.7415634344297309, "grad_norm": 1.189111590385437, "kl": 0.97265625, "learning_rate": 2.709330723741909e-07, "loss": 0.0019, "reward": 0.7968750298023224, "reward_std": 0.30054067447781563, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689732164144516, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 985.1786193847656, "epoch": 0.7432721059376335, "grad_norm": 2.1822919845581055, "kl": 1.505859375, "learning_rate": 2.688296241373597e-07, "loss": -0.0241, "reward": 0.7767857313156128, "reward_std": 0.2546679563820362, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6428571790456772, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1095.7589721679688, "epoch": 0.7449807774455361, "grad_norm": 1.7638018131256104, "kl": 1.2099609375, "learning_rate": 2.667362057987842e-07, "loss": 0.0246, "reward": 0.8750000298023224, "reward_std": 0.3443813659250736, "rewards/accuracy_reward": 0.1964285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714626312256, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 932.0357513427734, "epoch": 0.7466894489534387, "grad_norm": 1.3302981853485107, "kl": 1.16015625, "learning_rate": 2.646528920347986e-07, "loss": 0.0468, "reward": 0.808035746216774, "reward_std": 0.30279363691806793, "rewards/accuracy_reward": 0.13392857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6741071790456772, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 934.0179138183594, "epoch": 0.7483981204613414, "grad_norm": 1.8005025386810303, "kl": 1.0615234375, "learning_rate": 2.62579757161287e-07, "loss": 0.0811, "reward": 0.7968750298023224, "reward_std": 0.263308871537447, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6897321790456772, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 954.8572082519531, "epoch": 0.7501067919692439, "grad_norm": 1.743116021156311, "kl": 1.0673828125, "learning_rate": 2.605168751310328e-07, "loss": 0.0879, "reward": 0.745535746216774, "reward_std": 0.24390725791454315, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500447034836, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1017.7590026855469, "epoch": 0.7518154634771466, "grad_norm": 1.5193634033203125, "kl": 1.138671875, "learning_rate": 2.584643195310797e-07, "loss": 0.0431, "reward": 0.8415178805589676, "reward_std": 0.2562900371849537, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6897321790456772, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 1120.919692993164, "epoch": 0.7535241349850491, "grad_norm": 2.0082855224609375, "kl": 1.1064453125, "learning_rate": 2.5642216358010854e-07, "loss": 0.054, "reward": 0.7589285969734192, "reward_std": 0.23304230719804764, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.705357164144516, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 965.2857818603516, "epoch": 0.7552328064929518, "grad_norm": 3.7321677207946777, "kl": 1.60546875, "learning_rate": 2.5439048012582345e-07, "loss": 0.0399, "reward": 0.7343750447034836, "reward_std": 0.30342838913202286, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183035969734192, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 959.482177734375, "epoch": 0.7569414780008543, "grad_norm": 1.0299155712127686, "kl": 1.2509765625, "learning_rate": 2.5236934164235386e-07, "loss": 0.0465, "reward": 0.8504464626312256, "reward_std": 0.3116784542798996, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.698660746216774, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 1066.2589416503906, "epoch": 0.758650149508757, "grad_norm": 1.349860668182373, "kl": 1.2880859375, "learning_rate": 2.503588202276704e-07, "loss": 0.0296, "reward": 0.7723214626312256, "reward_std": 0.33823534846305847, "rewards/accuracy_reward": 0.11607143562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500149011612, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1016.4107666015625, "epoch": 0.7603588210166595, "grad_norm": 1.3173326253890991, "kl": 1.2109375, "learning_rate": 2.483589876010115e-07, "loss": 0.0597, "reward": 0.745535746216774, "reward_std": 0.3319416418671608, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.691964328289032, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 949.2232666015625, "epoch": 0.7620674925245622, "grad_norm": 1.0797067880630493, "kl": 0.9580078125, "learning_rate": 2.4636991510032513e-07, "loss": -0.0232, "reward": 0.8258928954601288, "reward_std": 0.2679731138050556, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214477300644, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 951.1696929931641, "epoch": 0.7637761640324647, "grad_norm": 2.0433828830718994, "kl": 1.587890625, "learning_rate": 2.4439167367972553e-07, "loss": 0.0263, "reward": 0.895089328289032, "reward_std": 0.31700175255537033, "rewards/accuracy_reward": 0.223214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750149011612, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 925.8214874267578, "epoch": 0.7654848355403674, "grad_norm": 1.4599140882492065, "kl": 1.0966796875, "learning_rate": 2.424243339069599e-07, "loss": 0.0075, "reward": 0.8750000298023224, "reward_std": 0.20956343784928322, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7232143133878708, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 1123.2232513427734, "epoch": 0.7671935070482699, "grad_norm": 2.753387212753296, "kl": 1.0458984375, "learning_rate": 2.404679659608934e-07, "loss": 0.0239, "reward": 0.8571428954601288, "reward_std": 0.268641822040081, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7142857313156128, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1087.6250762939453, "epoch": 0.7689021785561726, "grad_norm": 0.8384984135627747, "kl": 0.918701171875, "learning_rate": 2.3852263962900406e-07, "loss": -0.0212, "reward": 0.7500000596046448, "reward_std": 0.2530766874551773, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1081.5982818603516, "epoch": 0.7706108500640751, "grad_norm": 2.7289891242980957, "kl": 1.041015625, "learning_rate": 2.3658842430489364e-07, "loss": 0.0436, "reward": 0.738839328289032, "reward_std": 0.28885190561413765, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964626312256, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 917.0536193847656, "epoch": 0.7723195215719778, "grad_norm": 1.8390542268753052, "kl": 1.3369140625, "learning_rate": 2.3466538898581315e-07, "loss": -0.0266, "reward": 0.9620536118745804, "reward_std": 0.3268473781645298, "rewards/accuracy_reward": 0.2678571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964626312256, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1029.3214416503906, "epoch": 0.7740281930798804, "grad_norm": 1.6633975505828857, "kl": 1.2587890625, "learning_rate": 2.327536022702006e-07, "loss": 0.0059, "reward": 0.8727678954601288, "reward_std": 0.37463538348674774, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678954601288, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 988.3214721679688, "epoch": 0.775736864587783, "grad_norm": 2.297581434249878, "kl": 1.623046875, "learning_rate": 2.3085313235523385e-07, "loss": -0.0179, "reward": 0.729910746216774, "reward_std": 0.2748340219259262, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678954601288, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 974.2589874267578, "epoch": 0.7774455360956856, "grad_norm": 1.4085257053375244, "kl": 1.0791015625, "learning_rate": 2.2896404703439922e-07, "loss": -0.0055, "reward": 0.8571428954601288, "reward_std": 0.319912388920784, "rewards/accuracy_reward": 0.14285714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7142857611179352, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1103.776840209961, "epoch": 0.7791542076035882, "grad_norm": 1.576709508895874, "kl": 0.9296875, "learning_rate": 2.270864136950714e-07, "loss": 0.0088, "reward": 0.79464291036129, "reward_std": 0.27475224807858467, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7053571939468384, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 970.9732666015625, "epoch": 0.7808628791114908, "grad_norm": 1.926915168762207, "kl": 1.689453125, "learning_rate": 2.2522029931611068e-07, "loss": 0.0419, "reward": 0.8616071790456772, "reward_std": 0.34910599142313004, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651786118745804, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 969.6964874267578, "epoch": 0.7825715506193934, "grad_norm": 1.0638432502746582, "kl": 1.158203125, "learning_rate": 2.2336577046547417e-07, "loss": 0.0358, "reward": 0.8325893133878708, "reward_std": 0.2992747649550438, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.698660746216774, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 1011.6161041259766, "epoch": 0.784280222127296, "grad_norm": 1.5740846395492554, "kl": 1.408203125, "learning_rate": 2.2152289329783923e-07, "loss": -0.0082, "reward": 0.8482143431901932, "reward_std": 0.3178826794028282, "rewards/accuracy_reward": 0.1964285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.651785746216774, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 840.6964721679688, "epoch": 0.7859888936351986, "grad_norm": 2.8560545444488525, "kl": 1.1640625, "learning_rate": 2.196917335522458e-07, "loss": 0.0348, "reward": 0.8504464775323868, "reward_std": 0.24337276071310043, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7165178954601288, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 990.7589721679688, "epoch": 0.7876975651431012, "grad_norm": 1.907439112663269, "kl": 0.96484375, "learning_rate": 2.1787235654975027e-07, "loss": 0.0511, "reward": 0.8794643431901932, "reward_std": 0.26954543218016624, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7455357313156128, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 885.5625610351562, "epoch": 0.7894062366510038, "grad_norm": 1.7068558931350708, "kl": 1.4091796875, "learning_rate": 2.1606482719109498e-07, "loss": -0.0019, "reward": 0.8705357611179352, "reward_std": 0.3431403413414955, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7008928954601288, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1126.9375305175781, "epoch": 0.7911149081589065, "grad_norm": 1.9599171876907349, "kl": 1.29296875, "learning_rate": 2.1426920995439452e-07, "loss": -0.0244, "reward": 0.8504464775323868, "reward_std": 0.32897577434778214, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.707589328289032, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 1041.6875457763672, "epoch": 0.792823579666809, "grad_norm": 2.5348830223083496, "kl": 1.32421875, "learning_rate": 2.1248556889283375e-07, "loss": 0.0053, "reward": 0.8058036118745804, "reward_std": 0.33272355049848557, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178805589676, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 1047.169692993164, "epoch": 0.7945322511747117, "grad_norm": 1.787291169166565, "kl": 1.25634765625, "learning_rate": 2.107139676323843e-07, "loss": -0.0599, "reward": 0.8080357611179352, "reward_std": 0.28191737830638885, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214626312256, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 898.9732666015625, "epoch": 0.7962409226826143, "grad_norm": 2.8172249794006348, "kl": 1.93359375, "learning_rate": 2.0895446936953502e-07, "loss": 0.0496, "reward": 0.7477678805589676, "reward_std": 0.2592001259326935, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250298023224, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 857.1786041259766, "epoch": 0.7979495941905169, "grad_norm": 4.560422420501709, "kl": 1.6328125, "learning_rate": 2.072071368690363e-07, "loss": -0.0192, "reward": 0.8526786118745804, "reward_std": 0.3187345936894417, "rewards/accuracy_reward": 0.20535715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214328289032, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 894.2500457763672, "epoch": 0.7996582656984195, "grad_norm": 2.0428810119628906, "kl": 1.8642578125, "learning_rate": 2.0547203246166227e-07, "loss": 0.0248, "reward": 0.7834821790456772, "reward_std": 0.3083726763725281, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678805589676, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 908.6786193847656, "epoch": 0.8013669372063221, "grad_norm": 3.1206984519958496, "kl": 1.634765625, "learning_rate": 2.0374921804198737e-07, "loss": 0.0142, "reward": 0.801339328289032, "reward_std": 0.2381625920534134, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 951.7857666015625, "epoch": 0.8030756087142247, "grad_norm": 2.3621981143951416, "kl": 1.515625, "learning_rate": 2.0203875506617745e-07, "loss": 0.0304, "reward": 0.8191964626312256, "reward_std": 0.276196975260973, "rewards/accuracy_reward": 0.11607143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250298023224, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 932.669677734375, "epoch": 0.8047842802221273, "grad_norm": 3.6590757369995117, "kl": 1.802734375, "learning_rate": 2.0034070454979862e-07, "loss": 0.0193, "reward": 0.7700893133878708, "reward_std": 0.32008133828639984, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6808035969734192, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 834.0625457763672, "epoch": 0.8064929517300299, "grad_norm": 2.6816742420196533, "kl": 1.68359375, "learning_rate": 1.9865512706564004e-07, "loss": 0.0274, "reward": 0.8348214775323868, "reward_std": 0.33679404482245445, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 902.4554138183594, "epoch": 0.8082016232379325, "grad_norm": 1.82427179813385, "kl": 1.677734375, "learning_rate": 1.9698208274155354e-07, "loss": 0.0783, "reward": 0.886160746216774, "reward_std": 0.3110070414841175, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.645089328289032, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 734.2768096923828, "epoch": 0.8099102947458351, "grad_norm": 1.982773780822754, "kl": 1.66796875, "learning_rate": 1.953216312583083e-07, "loss": 0.039, "reward": 0.8125000447034836, "reward_std": 0.2679525762796402, "rewards/accuracy_reward": 0.12500000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 748.6071929931641, "epoch": 0.8116189662537378, "grad_norm": 3.696535587310791, "kl": 1.9921875, "learning_rate": 1.9367383184746254e-07, "loss": 0.0644, "reward": 0.9107143431901932, "reward_std": 0.3552417457103729, "rewards/accuracy_reward": 0.27678572479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 947.0268096923828, "epoch": 0.8133276377616403, "grad_norm": 2.6566147804260254, "kl": 1.6240234375, "learning_rate": 1.920387432892499e-07, "loss": 0.0173, "reward": 0.7410714626312256, "reward_std": 0.3272313252091408, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6696428805589676, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 907.2589721679688, "epoch": 0.815036309269543, "grad_norm": 1.7346994876861572, "kl": 1.9228515625, "learning_rate": 1.9041642391048331e-07, "loss": 0.0489, "reward": 0.8772321790456772, "reward_std": 0.28501565009355545, "rewards/accuracy_reward": 0.21428573038429022, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464626312256, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 962.2411193847656, "epoch": 0.8167449807774455, "grad_norm": 3.025005340576172, "kl": 1.7265625, "learning_rate": 1.888069315824739e-07, "loss": -0.0323, "reward": 0.745535746216774, "reward_std": 0.33267049863934517, "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651786118745804, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 916.6250457763672, "epoch": 0.8184536522853482, "grad_norm": 1.853507161140442, "kl": 1.7109375, "learning_rate": 1.8721032371896673e-07, "loss": 0.0252, "reward": 0.7254464626312256, "reward_std": 0.2655007541179657, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750298023224, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 932.4643249511719, "epoch": 0.8201623237932507, "grad_norm": 1.5919908285140991, "kl": 0.9580078125, "learning_rate": 1.8562665727409285e-07, "loss": 0.0591, "reward": 0.816964328289032, "reward_std": 0.2727978155016899, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6919643133878708, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 835.5000305175781, "epoch": 0.8218709953011534, "grad_norm": 2.9764211177825928, "kl": 1.83984375, "learning_rate": 1.8405598874033755e-07, "loss": 0.0438, "reward": 0.7678571790456772, "reward_std": 0.32687392085790634, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.642857164144516, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1000.8482666015625, "epoch": 0.8235796668090559, "grad_norm": 1.6623742580413818, "kl": 1.041015625, "learning_rate": 1.8249837414652506e-07, "loss": -0.0331, "reward": 0.8571428954601288, "reward_std": 0.3095138594508171, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 983.0000457763672, "epoch": 0.8252883383169586, "grad_norm": 2.847067356109619, "kl": 1.318359375, "learning_rate": 1.809538690558201e-07, "loss": 0.0023, "reward": 0.6517857611179352, "reward_std": 0.2827831096947193, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6428571790456772, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 931.8571929931641, "epoch": 0.8269970098248611, "grad_norm": 1.8843674659729004, "kl": 1.3876953125, "learning_rate": 1.7942252856374568e-07, "loss": 0.0694, "reward": 0.6852678805589676, "reward_std": 0.29281439259648323, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6227678805589676, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 963.8303985595703, "epoch": 0.8287056813327638, "grad_norm": 2.828702449798584, "kl": 1.287109375, "learning_rate": 1.779044072962178e-07, "loss": 0.0345, "reward": 0.7767857611179352, "reward_std": 0.2837047018110752, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.660714328289032, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 883.4107666015625, "epoch": 0.8304143528406663, "grad_norm": 1.732930302619934, "kl": 1.0849609375, "learning_rate": 1.7639955940759681e-07, "loss": -0.0507, "reward": 0.7968750298023224, "reward_std": 0.29691141843795776, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7165178954601288, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 897.1964721679688, "epoch": 0.832123024348569, "grad_norm": 2.4405362606048584, "kl": 1.236328125, "learning_rate": 1.7490803857875564e-07, "loss": -0.0096, "reward": 0.7924107611179352, "reward_std": 0.2722489945590496, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6495535969734192, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 920.3125457763672, "epoch": 0.8338316958564715, "grad_norm": 1.5337131023406982, "kl": 1.2841796875, "learning_rate": 1.734298980151648e-07, "loss": -0.0181, "reward": 0.752232164144516, "reward_std": 0.3724236339330673, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178805589676, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 828.1875305175781, "epoch": 0.8355403673643742, "grad_norm": 1.31434166431427, "kl": 1.177734375, "learning_rate": 1.7196519044499457e-07, "loss": 0.0271, "reward": 0.8526785969734192, "reward_std": 0.30362287908792496, "rewards/accuracy_reward": 0.17857143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.674107164144516, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1001.2232513427734, "epoch": 0.8372490388722768, "grad_norm": 1.8005735874176025, "kl": 0.9326171875, "learning_rate": 1.7051396811723423e-07, "loss": 0.0517, "reward": 0.8839286118745804, "reward_std": 0.3203303888440132, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000447034836, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 862.2053985595703, "epoch": 0.8389577103801794, "grad_norm": 2.2546513080596924, "kl": 1.3046875, "learning_rate": 1.6907628279982755e-07, "loss": -0.063, "reward": 0.7678571939468384, "reward_std": 0.3461604565382004, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 935.5714416503906, "epoch": 0.8406663818880821, "grad_norm": 1.7200562953948975, "kl": 1.4541015625, "learning_rate": 1.6765218577782719e-07, "loss": 0.0707, "reward": 0.7187500298023224, "reward_std": 0.28121884167194366, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500447034836, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 979.5982818603516, "epoch": 0.8423750533959846, "grad_norm": 2.6383872032165527, "kl": 1.1875, "learning_rate": 1.6624172785156436e-07, "loss": -0.0463, "reward": 0.714285746216774, "reward_std": 0.28426671773195267, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6607143133878708, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 964.4553985595703, "epoch": 0.8440837249038873, "grad_norm": 1.2469029426574707, "kl": 1.28125, "learning_rate": 1.6484495933483692e-07, "loss": 0.0116, "reward": 0.770089328289032, "reward_std": 0.2826877534389496, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464626312256, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 791.3928985595703, "epoch": 0.8457923964117898, "grad_norm": 2.6087076663970947, "kl": 1.4853515625, "learning_rate": 1.6346193005311494e-07, "loss": 0.0276, "reward": 0.9017857611179352, "reward_std": 0.32446930557489395, "rewards/accuracy_reward": 0.2589285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6428571790456772, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 897.1607513427734, "epoch": 0.8475010679196925, "grad_norm": 2.094269275665283, "kl": 1.65625, "learning_rate": 1.6209268934176292e-07, "loss": 0.0351, "reward": 0.7611607611179352, "reward_std": 0.31133370101451874, "rewards/accuracy_reward": 0.13392857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.627232164144516, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 1004.5268402099609, "epoch": 0.849209739427595, "grad_norm": 2.996782064437866, "kl": 1.1689453125, "learning_rate": 1.6073728604428002e-07, "loss": 0.0243, "reward": 0.8772321790456772, "reward_std": 0.40657078474760056, "rewards/accuracy_reward": 0.18750000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689732164144516, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 874.9911193847656, "epoch": 0.8509184109354977, "grad_norm": 2.921583890914917, "kl": 1.03759765625, "learning_rate": 1.5939576851055786e-07, "loss": -0.0119, "reward": 0.8816964626312256, "reward_std": 0.41966621577739716, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 972.5179138183594, "epoch": 0.8526270824434002, "grad_norm": 1.9489350318908691, "kl": 1.0966796875, "learning_rate": 1.580681845951552e-07, "loss": 0.018, "reward": 0.9017857611179352, "reward_std": 0.29346491023898125, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714477300644, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 911.9911193847656, "epoch": 0.8543357539513029, "grad_norm": 1.656348705291748, "kl": 1.1923828125, "learning_rate": 1.56754581655592e-07, "loss": 0.0167, "reward": 0.8348214775323868, "reward_std": 0.27620547637343407, "rewards/accuracy_reward": 0.16964286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651786118745804, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 873.7678833007812, "epoch": 0.8560444254592054, "grad_norm": 1.6158640384674072, "kl": 1.224609375, "learning_rate": 1.5545500655065917e-07, "loss": 0.0048, "reward": 0.8013393133878708, "reward_std": 0.3165704682469368, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964477300644, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 993.4018402099609, "epoch": 0.8577530969671081, "grad_norm": 1.6703130006790161, "kl": 1.111328125, "learning_rate": 1.541695056387468e-07, "loss": -0.0284, "reward": 0.770089328289032, "reward_std": 0.3497941829264164, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.698660746216774, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 936.8928985595703, "epoch": 0.8594617684750107, "grad_norm": 2.0887317657470703, "kl": 1.3994140625, "learning_rate": 1.5289812477619166e-07, "loss": 0.0659, "reward": 0.8191964626312256, "reward_std": 0.3564138673245907, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 826.9196929931641, "epoch": 0.8611704399829133, "grad_norm": 1.7875081300735474, "kl": 1.78515625, "learning_rate": 1.5164090931564036e-07, "loss": 0.0319, "reward": 0.7790178954601288, "reward_std": 0.29810645431280136, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178954601288, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 912.4375305175781, "epoch": 0.8628791114908159, "grad_norm": 1.9543348550796509, "kl": 1.2138671875, "learning_rate": 1.5039790410443166e-07, "loss": -0.0275, "reward": 0.801339328289032, "reward_std": 0.32371920347213745, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678954601288, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 1090.0625610351562, "epoch": 0.8645877829987185, "grad_norm": 2.1263389587402344, "kl": 0.8759765625, "learning_rate": 1.4916915348299738e-07, "loss": -0.0319, "reward": 0.839285746216774, "reward_std": 0.3556714132428169, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7142857313156128, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 960.9464721679688, "epoch": 0.8662964545066211, "grad_norm": 1.8417190313339233, "kl": 1.04443359375, "learning_rate": 1.4795470128327967e-07, "loss": 0.023, "reward": 0.9531250447034836, "reward_std": 0.3278018347918987, "rewards/accuracy_reward": 0.24107144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7120536118745804, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 909.2143096923828, "epoch": 0.8680051260145237, "grad_norm": 2.3360016345977783, "kl": 1.271484375, "learning_rate": 1.4675459082716819e-07, "loss": -0.0169, "reward": 0.9218750447034836, "reward_std": 0.3685170114040375, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689732164144516, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 953.8750610351562, "epoch": 0.8697137975224263, "grad_norm": 1.2348896265029907, "kl": 1.431640625, "learning_rate": 1.4556886492495475e-07, "loss": 0.0101, "reward": 0.7209821790456772, "reward_std": 0.313141155987978, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.667410746216774, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 1015.544677734375, "epoch": 0.8714224690303289, "grad_norm": 3.983487606048584, "kl": 0.8154296875, "learning_rate": 1.443975658738054e-07, "loss": 0.0029, "reward": 0.8281250447034836, "reward_std": 0.25288259238004684, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7209821790456772, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 963.8125457763672, "epoch": 0.8731311405382315, "grad_norm": 1.9935516119003296, "kl": 1.037109375, "learning_rate": 1.4324073545625252e-07, "loss": 0.057, "reward": 0.8281250447034836, "reward_std": 0.29823729023337364, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964477300644, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 989.1875610351562, "epoch": 0.8748398120461341, "grad_norm": 1.634122371673584, "kl": 0.943359375, "learning_rate": 1.4209841493870412e-07, "loss": 0.0378, "reward": 0.8705357760190964, "reward_std": 0.3377785459160805, "rewards/accuracy_reward": 0.17857143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.691964328289032, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 970.1518249511719, "epoch": 0.8765484835540367, "grad_norm": 1.646342158317566, "kl": 1.412109375, "learning_rate": 1.4097064506997104e-07, "loss": 0.0678, "reward": 0.6941964626312256, "reward_std": 0.26397955045104027, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.658482164144516, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 991.3839721679688, "epoch": 0.8782571550619394, "grad_norm": 2.085681915283203, "kl": 1.21484375, "learning_rate": 1.3985746607981473e-07, "loss": -0.0072, "reward": 0.7901786118745804, "reward_std": 0.2989831529557705, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7276785969734192, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 1102.6429290771484, "epoch": 0.8799658265698419, "grad_norm": 2.309361696243286, "kl": 0.7236328125, "learning_rate": 1.3875891767751073e-07, "loss": -0.0179, "reward": 0.879464328289032, "reward_std": 0.34267255663871765, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.745535746216774, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 1060.6964721679688, "epoch": 0.8816744980777446, "grad_norm": 1.1139980554580688, "kl": 0.89990234375, "learning_rate": 1.3767503905043314e-07, "loss": 0.0217, "reward": 0.926339328289032, "reward_std": 0.34493232518434525, "rewards/accuracy_reward": 0.17857143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7477678954601288, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 1006.7411041259766, "epoch": 0.8833831695856471, "grad_norm": 1.206600546836853, "kl": 1.185546875, "learning_rate": 1.3660586886265676e-07, "loss": -0.0379, "reward": 0.8839286267757416, "reward_std": 0.30364222079515457, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 987.8482818603516, "epoch": 0.8850918410935498, "grad_norm": 1.7174700498580933, "kl": 1.119140625, "learning_rate": 1.3555144525357683e-07, "loss": 0.0479, "reward": 0.7790178954601288, "reward_std": 0.2580721527338028, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6897321939468384, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 936.8661041259766, "epoch": 0.8868005126014523, "grad_norm": 2.275324583053589, "kl": 1.068359375, "learning_rate": 1.3451180583654976e-07, "loss": 0.056, "reward": 0.9285714775323868, "reward_std": 0.3547321520745754, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 973.5625305175781, "epoch": 0.888509184109355, "grad_norm": 1.4097974300384521, "kl": 1.2685546875, "learning_rate": 1.3348698769755084e-07, "loss": 0.0562, "reward": 0.8102678954601288, "reward_std": 0.24892665073275566, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7120536118745804, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 1020.5179138183594, "epoch": 0.8902178556172576, "grad_norm": 1.6948277950286865, "kl": 1.0625, "learning_rate": 1.3247702739385102e-07, "loss": -0.0028, "reward": 0.7723214626312256, "reward_std": 0.3934285342693329, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.674107164144516, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 982.9018249511719, "epoch": 0.8919265271251602, "grad_norm": 1.3726471662521362, "kl": 1.466796875, "learning_rate": 1.3148196095271336e-07, "loss": 0.0327, "reward": 0.7946428954601288, "reward_std": 0.30808496102690697, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714477300644, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 1001.1875610351562, "epoch": 0.8936351986330628, "grad_norm": 2.7190818786621094, "kl": 1.205078125, "learning_rate": 1.3050182387010767e-07, "loss": 0.072, "reward": 0.9575893431901932, "reward_std": 0.3048710450530052, "rewards/accuracy_reward": 0.22321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750298023224, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 1078.5357666015625, "epoch": 0.8953438701409654, "grad_norm": 1.0787479877471924, "kl": 0.83349609375, "learning_rate": 1.295366511094442e-07, "loss": 0.0518, "reward": 0.8638393133878708, "reward_std": 0.2481858693063259, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7120536118745804, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 1003.669677734375, "epoch": 0.897052541648868, "grad_norm": 3.256316661834717, "kl": 1.4130859375, "learning_rate": 1.2858647710032665e-07, "loss": 0.0226, "reward": 0.8750000447034836, "reward_std": 0.29159480705857277, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 1065.6607818603516, "epoch": 0.8987612131567706, "grad_norm": 1.3841853141784668, "kl": 1.2763671875, "learning_rate": 1.2765133573732374e-07, "loss": 0.0211, "reward": 0.7522321790456772, "reward_std": 0.326959066092968, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6450893133878708, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 948.1607666015625, "epoch": 0.9004698846646733, "grad_norm": 2.528103828430176, "kl": 1.423828125, "learning_rate": 1.2673126037876025e-07, "loss": 0.0114, "reward": 0.8526786118745804, "reward_std": 0.2680281512439251, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7008928954601288, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 939.7143249511719, "epoch": 0.9021785561725758, "grad_norm": 1.519159197807312, "kl": 0.689697265625, "learning_rate": 1.2582628384552716e-07, "loss": -0.0297, "reward": 0.8303571939468384, "reward_std": 0.21166226640343666, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7500000447034836, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 918.5446929931641, "epoch": 0.9038872276804785, "grad_norm": 2.0482962131500244, "kl": 1.2431640625, "learning_rate": 1.249364384199106e-07, "loss": -0.0233, "reward": 0.808035746216774, "reward_std": 0.3059060722589493, "rewards/accuracy_reward": 0.10714286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7008928954601288, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 982.2411346435547, "epoch": 0.905595899188381, "grad_norm": 22.664501190185547, "kl": 1.8095703125, "learning_rate": 1.240617558444406e-07, "loss": 0.0061, "reward": 0.8772321790456772, "reward_std": 0.3060721233487129, "rewards/accuracy_reward": 0.17857143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6986607611179352, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 1034.8393096923828, "epoch": 0.9073045706962837, "grad_norm": 2.4823694229125977, "kl": 1.248046875, "learning_rate": 1.2320226732075855e-07, "loss": 0.0467, "reward": 0.7834821790456772, "reward_std": 0.28578638657927513, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7120536118745804, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 883.9196929931641, "epoch": 0.9090132422041862, "grad_norm": 1.5375847816467285, "kl": 1.3251953125, "learning_rate": 1.223580035085042e-07, "loss": 0.1134, "reward": 0.7991071790456772, "reward_std": 0.27495670318603516, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7544643133878708, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 1113.7857666015625, "epoch": 0.9107219137120889, "grad_norm": 2.9891517162323, "kl": 1.294921875, "learning_rate": 1.2152899452422219e-07, "loss": -0.0468, "reward": 0.8839286118745804, "reward_std": 0.23452728614211082, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 974.9911193847656, "epoch": 0.9124305852199914, "grad_norm": 1.7981221675872803, "kl": 1.0703125, "learning_rate": 1.2071526994028725e-07, "loss": 0.0195, "reward": 0.90401791036129, "reward_std": 0.3086390309035778, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7254464775323868, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 935.1518402099609, "epoch": 0.9141392567278941, "grad_norm": 1.8704559803009033, "kl": 1.1923828125, "learning_rate": 1.199168587838498e-07, "loss": 0.0242, "reward": 0.8973214626312256, "reward_std": 0.30721230059862137, "rewards/accuracy_reward": 0.16964286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7276785969734192, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 1039.8125457763672, "epoch": 0.9158479282357966, "grad_norm": 1.1522799730300903, "kl": 1.08203125, "learning_rate": 1.1913378953580033e-07, "loss": 0.0304, "reward": 0.7968750298023224, "reward_std": 0.31773998588323593, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178805589676, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 868.2143249511719, "epoch": 0.9175565997436993, "grad_norm": 2.0809593200683594, "kl": 0.884765625, "learning_rate": 1.1836609012975332e-07, "loss": 0.0166, "reward": 0.9709821790456772, "reward_std": 0.30408982932567596, "rewards/accuracy_reward": 0.22321429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7477678805589676, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 753.6339645385742, "epoch": 0.9192652712516018, "grad_norm": 2.4306819438934326, "kl": 1.302734375, "learning_rate": 1.1761378795105089e-07, "loss": 0.0286, "reward": 0.7656250298023224, "reward_std": 0.24535901844501495, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.729910746216774, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 1027.8482666015625, "epoch": 0.9209739427595045, "grad_norm": 1.510364294052124, "kl": 1.33837890625, "learning_rate": 1.1687690983578581e-07, "loss": 0.0299, "reward": 0.7879464626312256, "reward_std": 0.31542395427823067, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7254464477300644, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 849.4821624755859, "epoch": 0.922682614267407, "grad_norm": 1.2751209735870361, "kl": 1.2802734375, "learning_rate": 1.1615548206984436e-07, "loss": 0.0135, "reward": 0.8080357611179352, "reward_std": 0.28669071942567825, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7366071790456772, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 1019.3393096923828, "epoch": 0.9243912857753097, "grad_norm": 1.5096882581710815, "kl": 0.919921875, "learning_rate": 1.1544953038796869e-07, "loss": 0.0454, "reward": 0.8013393133878708, "reward_std": 0.25788456946611404, "rewards/accuracy_reward": 0.08035714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720982164144516, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 975.6339721679688, "epoch": 0.9260999572832123, "grad_norm": 1.6345617771148682, "kl": 1.453125, "learning_rate": 1.1475907997283842e-07, "loss": 0.0493, "reward": 0.8995536267757416, "reward_std": 0.2816562093794346, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964626312256, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 839.7411041259766, "epoch": 0.9278086287911149, "grad_norm": 2.0646791458129883, "kl": 1.056640625, "learning_rate": 1.1408415545417288e-07, "loss": 0.0702, "reward": 0.90401791036129, "reward_std": 0.28080233186483383, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750298023224, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 953.8571929931641, "epoch": 0.9295173002990175, "grad_norm": 2.1020452976226807, "kl": 1.40234375, "learning_rate": 1.134247809078522e-07, "loss": -0.0424, "reward": 0.8549107611179352, "reward_std": 0.33114008605480194, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720982164144516, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 1056.7143096923828, "epoch": 0.9312259718069201, "grad_norm": 2.2826364040374756, "kl": 1.740234375, "learning_rate": 1.1278097985505839e-07, "loss": 0.0523, "reward": 0.7410714626312256, "reward_std": 0.36796700209379196, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6160714626312256, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 1035.2768096923828, "epoch": 0.9329346433148227, "grad_norm": 1.241382360458374, "kl": 0.955078125, "learning_rate": 1.1215277526143658e-07, "loss": -0.02, "reward": 0.9196428954601288, "reward_std": 0.3233882859349251, "rewards/accuracy_reward": 0.20535715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 1068.5089569091797, "epoch": 0.9346433148227253, "grad_norm": 3.344892740249634, "kl": 0.79833984375, "learning_rate": 1.1154018953627553e-07, "loss": 0.044, "reward": 0.8080357313156128, "reward_std": 0.30821555107831955, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6919643133878708, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 922.5268402099609, "epoch": 0.9363519863306279, "grad_norm": 1.8845711946487427, "kl": 1.521484375, "learning_rate": 1.1094324453170843e-07, "loss": -0.0383, "reward": 0.8950893431901932, "reward_std": 0.3563399314880371, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6808035969734192, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 892.4196929931641, "epoch": 0.9380606578385305, "grad_norm": 2.8238370418548584, "kl": 1.376953125, "learning_rate": 1.1036196154193336e-07, "loss": 0.0634, "reward": 0.816964328289032, "reward_std": 0.2748360186815262, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214626312256, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 945.1161041259766, "epoch": 0.9397693293464332, "grad_norm": 2.037820816040039, "kl": 1.685546875, "learning_rate": 1.0979636130245343e-07, "loss": 0.0907, "reward": 0.8058036118745804, "reward_std": 0.35410451143980026, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6986607313156128, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 1085.8303985595703, "epoch": 0.9414780008543358, "grad_norm": 1.1005703210830688, "kl": 1.08154296875, "learning_rate": 1.0924646398933763e-07, "loss": 0.0341, "reward": 0.8035714775323868, "reward_std": 0.32637201249599457, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 1064.232162475586, "epoch": 0.9431866723622384, "grad_norm": 2.3469197750091553, "kl": 1.34375, "learning_rate": 1.0871228921850062e-07, "loss": 0.0005, "reward": 0.8147321790456772, "reward_std": 0.22606049105525017, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7165178805589676, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 956.0982818603516, "epoch": 0.944895343870141, "grad_norm": 2.748126268386841, "kl": 1.3779296875, "learning_rate": 1.08193856045003e-07, "loss": -0.0057, "reward": 0.8348214775323868, "reward_std": 0.3591946139931679, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.683035746216774, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 1047.9732666015625, "epoch": 0.9466040153780436, "grad_norm": 1.58036470413208, "kl": 1.0576171875, "learning_rate": 1.0769118296237213e-07, "loss": 0.0218, "reward": 0.7857143133878708, "reward_std": 0.3132629096508026, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.642857164144516, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 940.1786193847656, "epoch": 0.9483126868859462, "grad_norm": 1.4453859329223633, "kl": 1.0390625, "learning_rate": 1.0720428790194177e-07, "loss": 0.0258, "reward": 0.8616071790456772, "reward_std": 0.337561659514904, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7366071790456772, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 782.5178833007812, "epoch": 0.9500213583938488, "grad_norm": 2.271420478820801, "kl": 1.314453125, "learning_rate": 1.0673318823221271e-07, "loss": -0.0196, "reward": 1.0000000447034836, "reward_std": 0.3643309995532036, "rewards/accuracy_reward": 0.2946428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7053571790456772, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 1002.857177734375, "epoch": 0.9517300299017514, "grad_norm": 1.6239064931869507, "kl": 1.0322265625, "learning_rate": 1.0627790075823346e-07, "loss": 0.0207, "reward": 0.8147321790456772, "reward_std": 0.22938312962651253, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750447034836, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 871.919677734375, "epoch": 0.953438701409654, "grad_norm": 2.5180678367614746, "kl": 1.66796875, "learning_rate": 1.0583844172100026e-07, "loss": 0.0181, "reward": 0.8571428954601288, "reward_std": 0.3611206114292145, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7053571790456772, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 1016.3036346435547, "epoch": 0.9551473729175566, "grad_norm": 1.8385826349258423, "kl": 1.064453125, "learning_rate": 1.0541482679687797e-07, "loss": 0.014, "reward": 0.839285746216774, "reward_std": 0.27544307708740234, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000447034836, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 836.6250457763672, "epoch": 0.9568560444254592, "grad_norm": 2.686523675918579, "kl": 1.509765625, "learning_rate": 1.0500707109704118e-07, "loss": -0.0049, "reward": 0.8303571790456772, "reward_std": 0.3624780811369419, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6785714626312256, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 869.3661193847656, "epoch": 0.9585647159333618, "grad_norm": 2.903303623199463, "kl": 1.029296875, "learning_rate": 1.0461518916693452e-07, "loss": 0.0106, "reward": 0.8549107611179352, "reward_std": 0.2641325071454048, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720982164144516, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 980.2679138183594, "epoch": 0.9602733874412644, "grad_norm": 1.8177927732467651, "kl": 1.09033203125, "learning_rate": 1.0423919498575448e-07, "loss": -0.0262, "reward": 0.9799107760190964, "reward_std": 0.32166746258735657, "rewards/accuracy_reward": 0.24107144214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7388393133878708, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 1077.9822082519531, "epoch": 0.961982058949167, "grad_norm": 1.9446163177490234, "kl": 1.0947265625, "learning_rate": 1.0387910196595021e-07, "loss": 0.0555, "reward": 0.7857143133878708, "reward_std": 0.27803416922688484, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.705357164144516, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 1101.3482666015625, "epoch": 0.9636907304570697, "grad_norm": 1.6303197145462036, "kl": 0.82763671875, "learning_rate": 1.0353492295274526e-07, "loss": 0.0202, "reward": 0.9174107611179352, "reward_std": 0.29665153101086617, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7566964626312256, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 834.4732666015625, "epoch": 0.9653994019649722, "grad_norm": 1.4084457159042358, "kl": 1.2744140625, "learning_rate": 1.0320667022367952e-07, "loss": -0.0458, "reward": 0.991071492433548, "reward_std": 0.24970977380871773, "rewards/accuracy_reward": 0.2678571604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7232143133878708, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 981.7678833007812, "epoch": 0.9671080734728749, "grad_norm": 2.613037109375, "kl": 1.0390625, "learning_rate": 1.0289435548817101e-07, "loss": 0.059, "reward": 0.8772321790456772, "reward_std": 0.30870165303349495, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7165178954601288, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 993.794677734375, "epoch": 0.9688167449807774, "grad_norm": 1.326412558555603, "kl": 1.2939453125, "learning_rate": 1.0259798988709827e-07, "loss": 0.046, "reward": 0.9017857760190964, "reward_std": 0.31238842755556107, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.705357164144516, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 949.4464721679688, "epoch": 0.9705254164886801, "grad_norm": 1.6106551885604858, "kl": 1.3291015625, "learning_rate": 1.0231758399240314e-07, "loss": -0.0215, "reward": 0.8058036118745804, "reward_std": 0.33262885734438896, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6986607313156128, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 864.3571929931641, "epoch": 0.9722340879965826, "grad_norm": 2.1797053813934326, "kl": 1.318359375, "learning_rate": 1.0205314780671322e-07, "loss": -0.0344, "reward": 0.9285714626312256, "reward_std": 0.275399599224329, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7500000298023224, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 990.9375457763672, "epoch": 0.9739427595044853, "grad_norm": 2.5850865840911865, "kl": 1.322265625, "learning_rate": 1.0180469076298543e-07, "loss": 0.0094, "reward": 0.8102678954601288, "reward_std": 0.29576943069696426, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720982164144516, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 897.7143249511719, "epoch": 0.9756514310123878, "grad_norm": 1.3948034048080444, "kl": 0.8525390625, "learning_rate": 1.0157222172416943e-07, "loss": -0.0064, "reward": 0.9620536118745804, "reward_std": 0.32447294145822525, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7388393133878708, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 877.1161041259766, "epoch": 0.9773601025202905, "grad_norm": 2.146336317062378, "kl": 1.4013671875, "learning_rate": 1.0135574898289128e-07, "loss": 0.0201, "reward": 0.8593750298023224, "reward_std": 0.24795975163578987, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7075893133878708, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 1004.4911193847656, "epoch": 0.979068774028193, "grad_norm": 1.590342402458191, "kl": 0.9765625, "learning_rate": 1.011552802611579e-07, "loss": 0.1135, "reward": 0.8839285969734192, "reward_std": 0.31369584053754807, "rewards/accuracy_reward": 0.16964286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 829.2678833007812, "epoch": 0.9807774455360957, "grad_norm": 1.7591408491134644, "kl": 0.97900390625, "learning_rate": 1.0097082271008133e-07, "loss": 0.0335, "reward": 0.8973214775323868, "reward_std": 0.3453449606895447, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214626312256, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 976.7946929931641, "epoch": 0.9824861170439982, "grad_norm": 1.268677830696106, "kl": 1.18359375, "learning_rate": 1.0080238290962396e-07, "loss": 0.0503, "reward": 0.8660714775323868, "reward_std": 0.28534968569874763, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7142857611179352, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 879.6161193847656, "epoch": 0.9841947885519009, "grad_norm": 1.5944905281066895, "kl": 0.9541015625, "learning_rate": 1.0064996686836342e-07, "loss": 0.1147, "reward": 0.973214328289032, "reward_std": 0.27744918689131737, "rewards/accuracy_reward": 0.22321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7500000447034836, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 977.0714721679688, "epoch": 0.9859034600598034, "grad_norm": 3.684220790863037, "kl": 1.1875, "learning_rate": 1.0051358002327867e-07, "loss": -0.0176, "reward": 0.8995536118745804, "reward_std": 0.32888074219226837, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7656250447034836, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 1005.0536346435547, "epoch": 0.9876121315677061, "grad_norm": 1.733279824256897, "kl": 1.0498046875, "learning_rate": 1.0039322723955559e-07, "loss": -0.0068, "reward": 0.9419643133878708, "reward_std": 0.3152732215821743, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7633928954601288, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 937.2232360839844, "epoch": 0.9893208030756088, "grad_norm": 2.405085802078247, "kl": 1.4248046875, "learning_rate": 1.0028891281041394e-07, "loss": 0.1354, "reward": 0.792410746216774, "reward_std": 0.27230104804039, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678805589676, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 1028.1339569091797, "epoch": 0.9910294745835113, "grad_norm": 1.677873134613037, "kl": 1.1181640625, "learning_rate": 1.0020064045695372e-07, "loss": 0.0153, "reward": 0.8616071939468384, "reward_std": 0.2928595803678036, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7633928805589676, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 1035.9375305175781, "epoch": 0.992738146091414, "grad_norm": 1.8089627027511597, "kl": 1.1318359375, "learning_rate": 1.001284133280228e-07, "loss": 0.0659, "reward": 0.8816964626312256, "reward_std": 0.3417782448232174, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7209821790456772, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 1020.3482666015625, "epoch": 0.9944468175993165, "grad_norm": 1.3826255798339844, "kl": 1.1875, "learning_rate": 1.0007223400010437e-07, "loss": 0.0518, "reward": 0.7723214775323868, "reward_std": 0.31216898560523987, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7008928805589676, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 1071.0625610351562, "epoch": 0.9961554891072192, "grad_norm": 1.98823881149292, "kl": 1.025390625, "learning_rate": 1.0003210447722508e-07, "loss": 0.0197, "reward": 0.8125000298023224, "reward_std": 0.27175144851207733, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 1078.2232666015625, "epoch": 0.9978641606151217, "grad_norm": 2.838872194290161, "kl": 1.240234375, "learning_rate": 1.0000802619088376e-07, "loss": 0.0659, "reward": 0.761160746216774, "reward_std": 0.2807781994342804, "rewards/accuracy_reward": 0.08928571548312902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750298023224, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 890.75, "epoch": 0.9995728321230244, "grad_norm": 2.2915525436401367, "kl": 1.0673828125, "learning_rate": 1e-07, "loss": 0.0786, "reward": 0.9687500447034836, "reward_std": 0.32982346042990685, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 585 }, { "epoch": 0.9995728321230244, "step": 585, "total_flos": 0.0, "train_loss": 0.07174540107587699, "train_runtime": 35314.338, "train_samples_per_second": 0.464, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 585, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }