{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.997867803837953, "eval_steps": 500, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 576.5390625, "epoch": 0.0042643923240938165, "grad_norm": 0.32528209686279297, "kl": 0.0, "learning_rate": 8.333333333333333e-07, "loss": 0.0286, "reward": 0.2578125, "reward_std": 0.40137775242328644, "rewards/accuracy_reward": 0.18359375, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.05859375, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 547.66796875, "epoch": 0.008528784648187633, "grad_norm": 0.43932273983955383, "kl": 0.0, "learning_rate": 1.6666666666666667e-06, "loss": 0.0028, "reward": 0.265625, "reward_std": 0.40303920209407806, "rewards/accuracy_reward": 0.17578125, "rewards/format_reward": 0.02734375, "rewards/tag_count_reward": 0.0625, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 484.96484375, "epoch": 0.01279317697228145, "grad_norm": 0.4568934440612793, "kl": 0.0001436471939086914, "learning_rate": 2.5e-06, "loss": 0.0148, "reward": 0.3037109375, "reward_std": 0.41709331423044205, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.01953125, "rewards/tag_count_reward": 0.0810546875, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 469.796875, "epoch": 0.017057569296375266, "grad_norm": 0.5138208866119385, "kl": 0.0001804828643798828, "learning_rate": 3.3333333333333333e-06, "loss": 0.0222, "reward": 0.3076171875, "reward_std": 0.4279475286602974, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.03515625, "rewards/tag_count_reward": 0.0771484375, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 498.140625, "epoch": 0.021321961620469083, "grad_norm": 0.4469239115715027, "kl": 0.0010051727294921875, "learning_rate": 4.166666666666667e-06, "loss": 0.0029, "reward": 0.318359375, "reward_std": 0.44843800365924835, "rewards/accuracy_reward": 0.18359375, "rewards/format_reward": 0.02734375, "rewards/tag_count_reward": 0.107421875, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 457.3125, "epoch": 0.0255863539445629, "grad_norm": 0.7152092456817627, "kl": 0.029693603515625, "learning_rate": 5e-06, "loss": 0.0403, "reward": 0.5029296875, "reward_std": 0.6177150011062622, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.10546875, "rewards/tag_count_reward": 0.2255859375, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 454.74609375, "epoch": 0.029850746268656716, "grad_norm": 2.3532168865203857, "kl": 0.09393310546875, "learning_rate": 5.833333333333334e-06, "loss": 0.048, "reward": 0.564453125, "reward_std": 0.6538278013467789, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.10546875, "rewards/tag_count_reward": 0.201171875, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 492.9765625, "epoch": 0.03411513859275053, "grad_norm": 0.46021807193756104, "kl": 0.016357421875, "learning_rate": 6.666666666666667e-06, "loss": 0.0541, "reward": 0.5888671875, "reward_std": 0.592596247792244, "rewards/accuracy_reward": 0.19921875, "rewards/format_reward": 0.15234375, "rewards/tag_count_reward": 0.2373046875, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 478.79296875, "epoch": 0.03837953091684435, "grad_norm": 2.5319058895111084, "kl": 0.0784912109375, "learning_rate": 7.500000000000001e-06, "loss": 0.0801, "reward": 0.71875, "reward_std": 0.6128444075584412, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.140625, "rewards/tag_count_reward": 0.2265625, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 563.734375, "epoch": 0.042643923240938165, "grad_norm": 1.262290596961975, "kl": 0.0523223876953125, "learning_rate": 8.333333333333334e-06, "loss": 0.0363, "reward": 0.5009765625, "reward_std": 0.5229385495185852, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.078125, "rewards/tag_count_reward": 0.1259765625, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 499.44140625, "epoch": 0.046908315565031986, "grad_norm": 1.3580890893936157, "kl": 0.0596923828125, "learning_rate": 9.166666666666666e-06, "loss": 0.11, "reward": 0.681640625, "reward_std": 0.6728685200214386, "rewards/accuracy_reward": 0.23828125, "rewards/format_reward": 0.18359375, "rewards/tag_count_reward": 0.259765625, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 567.75390625, "epoch": 0.0511727078891258, "grad_norm": 4.042412281036377, "kl": 0.047576904296875, "learning_rate": 1e-05, "loss": 0.0732, "reward": 0.521484375, "reward_std": 0.5327698737382889, "rewards/accuracy_reward": 0.27734375, "rewards/format_reward": 0.08203125, "rewards/tag_count_reward": 0.162109375, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 559.421875, "epoch": 0.05543710021321962, "grad_norm": 0.3716038763523102, "kl": 0.0142669677734375, "learning_rate": 1.0833333333333334e-05, "loss": 0.1328, "reward": 0.599609375, "reward_std": 0.6229686141014099, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.14453125, "rewards/tag_count_reward": 0.220703125, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 437.3828125, "epoch": 0.05970149253731343, "grad_norm": 2.6377227306365967, "kl": 0.119598388671875, "learning_rate": 1.1666666666666668e-05, "loss": 0.1456, "reward": 0.798828125, "reward_std": 0.7079743444919586, "rewards/accuracy_reward": 0.23046875, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.349609375, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 434.79296875, "epoch": 0.06396588486140725, "grad_norm": 0.4819924831390381, "kl": 0.035736083984375, "learning_rate": 1.25e-05, "loss": 0.0931, "reward": 0.8818359375, "reward_std": 0.6916099190711975, "rewards/accuracy_reward": 0.20703125, "rewards/format_reward": 0.234375, "rewards/tag_count_reward": 0.4404296875, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 324.69921875, "epoch": 0.06823027718550106, "grad_norm": 1.3913614749908447, "kl": 0.0682373046875, "learning_rate": 1.3333333333333333e-05, "loss": 0.2251, "reward": 1.1982421875, "reward_std": 0.7941954433917999, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.4375, "rewards/tag_count_reward": 0.6748046875, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 299.75, "epoch": 0.07249466950959488, "grad_norm": 0.9118645787239075, "kl": 0.0875244140625, "learning_rate": 1.416666666666667e-05, "loss": 0.2379, "reward": 1.59765625, "reward_std": 0.6924279183149338, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.65625, "rewards/tag_count_reward": 0.828125, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 220.4921875, "epoch": 0.0767590618336887, "grad_norm": 1.340067982673645, "kl": 0.133056640625, "learning_rate": 1.5000000000000002e-05, "loss": 0.1821, "reward": 1.8427734375, "reward_std": 0.4516802802681923, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.9287109375, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 182.53515625, "epoch": 0.08102345415778252, "grad_norm": 3.030550003051758, "kl": 0.4296875, "learning_rate": 1.5833333333333333e-05, "loss": 0.0898, "reward": 1.986328125, "reward_std": 0.22909418493509293, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.95703125, "rewards/tag_count_reward": 0.958984375, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 214.65234375, "epoch": 0.08528784648187633, "grad_norm": 1.1024236679077148, "kl": 0.1187744140625, "learning_rate": 1.6666666666666667e-05, "loss": 0.1787, "reward": 2.0078125, "reward_std": 0.170526759698987, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.98046875, "rewards/tag_count_reward": 0.9609375, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 208.2421875, "epoch": 0.08955223880597014, "grad_norm": 22.709352493286133, "kl": 1.359375, "learning_rate": 1.7500000000000002e-05, "loss": 0.1298, "reward": 1.9345703125, "reward_std": 0.32205624878406525, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.91015625, "rewards/tag_count_reward": 0.9619140625, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 253.96484375, "epoch": 0.09381663113006397, "grad_norm": 8.92030143737793, "kl": 1.1494140625, "learning_rate": 1.8333333333333333e-05, "loss": 0.1757, "reward": 1.4931640625, "reward_std": 0.6142828911542892, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.57421875, "rewards/tag_count_reward": 0.8720703125, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 217.625, "epoch": 0.09808102345415778, "grad_norm": 3.518479585647583, "kl": 0.38134765625, "learning_rate": 1.916666666666667e-05, "loss": 0.0203, "reward": 1.0087890625, "reward_std": 0.44057436287403107, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.171875, "rewards/tag_count_reward": 0.7861328125, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 237.28125, "epoch": 0.1023454157782516, "grad_norm": 1.0693765878677368, "kl": 0.19775390625, "learning_rate": 2e-05, "loss": 0.0202, "reward": 1.134765625, "reward_std": 0.5518650561571121, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.296875, "rewards/tag_count_reward": 0.814453125, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 295.78515625, "epoch": 0.10660980810234541, "grad_norm": 9.523970603942871, "kl": 0.23779296875, "learning_rate": 1.9998881018102735e-05, "loss": 0.1602, "reward": 1.501953125, "reward_std": 0.6455793529748917, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.50390625, "rewards/tag_count_reward": 0.896484375, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 345.72265625, "epoch": 0.11087420042643924, "grad_norm": 24690.228515625, "kl": 121.465576171875, "learning_rate": 1.9995524322835035e-05, "loss": 6.9741, "reward": 1.5634765625, "reward_std": 0.648356705904007, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.58984375, "rewards/tag_count_reward": 0.8955078125, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 452.1953125, "epoch": 0.11513859275053305, "grad_norm": 114.83018493652344, "kl": 1.1298828125, "learning_rate": 1.9989930665413148e-05, "loss": 0.3668, "reward": 1.2841796875, "reward_std": 0.7657907009124756, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.453125, "rewards/tag_count_reward": 0.7802734375, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 550.98828125, "epoch": 0.11940298507462686, "grad_norm": 32.12217330932617, "kl": 0.9326171875, "learning_rate": 1.998210129767735e-05, "loss": 0.3607, "reward": 1.0625, "reward_std": 0.7528532892465591, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.328125, "rewards/tag_count_reward": 0.68359375, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 534.66796875, "epoch": 0.12366737739872068, "grad_norm": 31.25490951538086, "kl": 0.796875, "learning_rate": 1.9972037971811802e-05, "loss": 0.1675, "reward": 0.6337890625, "reward_std": 0.5352872237563133, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.109375, "rewards/tag_count_reward": 0.4892578125, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 492.50390625, "epoch": 0.1279317697228145, "grad_norm": 7.399560928344727, "kl": 0.8779296875, "learning_rate": 1.9959742939952393e-05, "loss": 0.0607, "reward": 0.4462890625, "reward_std": 0.32183003425598145, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.03515625, "rewards/tag_count_reward": 0.3916015625, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 430.3125, "epoch": 0.13219616204690832, "grad_norm": 391.785888671875, "kl": 8.494140625, "learning_rate": 1.9945218953682736e-05, "loss": 0.5494, "reward": 0.4248046875, "reward_std": 0.2781025320291519, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.3857421875, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 404.62890625, "epoch": 0.13646055437100213, "grad_norm": 199.300048828125, "kl": 2.603515625, "learning_rate": 1.9928469263418376e-05, "loss": 0.235, "reward": 0.3564453125, "reward_std": 0.19321707263588905, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.3447265625, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 432.109375, "epoch": 0.14072494669509594, "grad_norm": 3538.403564453125, "kl": 31.28515625, "learning_rate": 1.990949761767935e-05, "loss": 2.1404, "reward": 0.3896484375, "reward_std": 0.24749910086393356, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.01953125, "rewards/tag_count_reward": 0.3583984375, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 464.28515625, "epoch": 0.14498933901918976, "grad_norm": 111.8864517211914, "kl": 1.9296875, "learning_rate": 1.9888308262251286e-05, "loss": 0.1906, "reward": 0.345703125, "reward_std": 0.1697397418320179, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.333984375, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 510.56640625, "epoch": 0.14925373134328357, "grad_norm": 14.984077453613281, "kl": 1.08984375, "learning_rate": 1.9864905939235215e-05, "loss": 0.0739, "reward": 0.3662109375, "reward_std": 0.19745982438325882, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.00390625, "rewards/tag_count_reward": 0.3427734375, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 547.35546875, "epoch": 0.1535181236673774, "grad_norm": 137.8433380126953, "kl": 1.69921875, "learning_rate": 1.98392958859863e-05, "loss": 0.0645, "reward": 0.3623046875, "reward_std": 0.21899614110589027, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.3427734375, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 647.1953125, "epoch": 0.15778251599147122, "grad_norm": 45.083709716796875, "kl": 2.125, "learning_rate": 1.9811483833941726e-05, "loss": 0.132, "reward": 0.3369140625, "reward_std": 0.17930956557393074, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.00390625, "rewards/tag_count_reward": 0.3251953125, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 651.08984375, "epoch": 0.16204690831556504, "grad_norm": 35.52851104736328, "kl": 1.908203125, "learning_rate": 1.9781476007338058e-05, "loss": 0.098, "reward": 0.345703125, "reward_std": 0.23194141685962677, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.00390625, "rewards/tag_count_reward": 0.330078125, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 648.5234375, "epoch": 0.16631130063965885, "grad_norm": 22.79519271850586, "kl": 2.15625, "learning_rate": 1.9749279121818235e-05, "loss": 0.058, "reward": 0.3427734375, "reward_std": 0.21174855902791023, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.00390625, "rewards/tag_count_reward": 0.3349609375, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 725.0078125, "epoch": 0.17057569296375266, "grad_norm": 1158.721923828125, "kl": 26.59375, "learning_rate": 1.9714900382928674e-05, "loss": 1.3132, "reward": 0.3447265625, "reward_std": 0.24672244489192963, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.00390625, "rewards/tag_count_reward": 0.3212890625, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 674.56640625, "epoch": 0.17484008528784648, "grad_norm": 10.722182273864746, "kl": 2.39453125, "learning_rate": 1.9678347484506667e-05, "loss": 0.0955, "reward": 0.3056640625, "reward_std": 0.2298230677843094, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2900390625, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 716.73046875, "epoch": 0.1791044776119403, "grad_norm": 22.29323387145996, "kl": 2.66015625, "learning_rate": 1.9639628606958535e-05, "loss": 0.1796, "reward": 0.2939453125, "reward_std": 0.2149362936615944, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2939453125, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 770.56640625, "epoch": 0.18336886993603413, "grad_norm": 5175.0302734375, "kl": 168.125, "learning_rate": 1.9598752415428893e-05, "loss": 8.1192, "reward": 0.3203125, "reward_std": 0.19998998567461967, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3203125, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 815.19140625, "epoch": 0.18763326226012794, "grad_norm": 59.13957977294922, "kl": 2.7734375, "learning_rate": 1.955572805786141e-05, "loss": 0.1392, "reward": 0.3330078125, "reward_std": 0.19636105746030807, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3330078125, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 823.13671875, "epoch": 0.19189765458422176, "grad_norm": 56.28653335571289, "kl": 2.48046875, "learning_rate": 1.9510565162951538e-05, "loss": 0.1406, "reward": 0.3310546875, "reward_std": 0.20274027064442635, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3310546875, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 809.56640625, "epoch": 0.19616204690831557, "grad_norm": 13.485371589660645, "kl": 2.076171875, "learning_rate": 1.9463273837991643e-05, "loss": 0.1466, "reward": 0.3271484375, "reward_std": 0.2578311152756214, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.3115234375, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 839.78125, "epoch": 0.20042643923240938, "grad_norm": 6.2021918296813965, "kl": 1.67578125, "learning_rate": 1.9413864666609036e-05, "loss": 0.1194, "reward": 0.3603515625, "reward_std": 0.21874134615063667, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.3525390625, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 935.88671875, "epoch": 0.2046908315565032, "grad_norm": 42.63424301147461, "kl": 1.73828125, "learning_rate": 1.9362348706397374e-05, "loss": 0.1158, "reward": 0.3427734375, "reward_std": 0.23805152624845505, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3349609375, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 923.13671875, "epoch": 0.208955223880597, "grad_norm": 20.886306762695312, "kl": 1.84765625, "learning_rate": 1.9308737486442045e-05, "loss": 0.0876, "reward": 0.365234375, "reward_std": 0.24175361543893814, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.349609375, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 913.77734375, "epoch": 0.21321961620469082, "grad_norm": 1.078397512435913, "kl": 0.9013671875, "learning_rate": 1.9253043004739967e-05, "loss": 0.0613, "reward": 0.3681640625, "reward_std": 0.2707056328654289, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.3447265625, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 924.3671875, "epoch": 0.21748400852878466, "grad_norm": 139370.5625, "kl": 772.5048828125, "learning_rate": 1.919527772551451e-05, "loss": 34.5749, "reward": 0.3916015625, "reward_std": 0.2611350491642952, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.01953125, "rewards/tag_count_reward": 0.3681640625, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 966.453125, "epoch": 0.22174840085287847, "grad_norm": 2.575350284576416, "kl": 0.6396484375, "learning_rate": 1.913545457642601e-05, "loss": 0.0048, "reward": 0.3994140625, "reward_std": 0.26176000386476517, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.3759765625, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 935.90625, "epoch": 0.2260127931769723, "grad_norm": 0.6655358672142029, "kl": 0.625, "learning_rate": 1.907358694567865e-05, "loss": 0.0332, "reward": 0.3818359375, "reward_std": 0.30670569837093353, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.3427734375, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 981.7109375, "epoch": 0.2302771855010661, "grad_norm": 0.49440306425094604, "kl": 1.32421875, "learning_rate": 1.900968867902419e-05, "loss": 0.05, "reward": 0.37890625, "reward_std": 0.30825207754969597, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.04296875, "rewards/tag_count_reward": 0.33203125, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1006.4765625, "epoch": 0.2345415778251599, "grad_norm": 0.6174039840698242, "kl": 3.51171875, "learning_rate": 1.8943774076663372e-05, "loss": 0.1368, "reward": 0.55078125, "reward_std": 0.45677174627780914, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.09375, "rewards/tag_count_reward": 0.421875, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 993.33984375, "epoch": 0.23880597014925373, "grad_norm": 0.6529553532600403, "kl": 4.8984375, "learning_rate": 1.8875857890045544e-05, "loss": 0.1925, "reward": 0.7568359375, "reward_std": 0.5871296375989914, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.171875, "rewards/tag_count_reward": 0.5654296875, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1012.4765625, "epoch": 0.24307036247334754, "grad_norm": 133.8551788330078, "kl": 4.53515625, "learning_rate": 1.880595531856738e-05, "loss": 0.1758, "reward": 0.806640625, "reward_std": 0.6075598150491714, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.22265625, "rewards/tag_count_reward": 0.548828125, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 996.9453125, "epoch": 0.24733475479744135, "grad_norm": 0.891619861125946, "kl": 2.80078125, "learning_rate": 1.87340820061713e-05, "loss": 0.0914, "reward": 0.732421875, "reward_std": 0.6083860993385315, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.498046875, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 967.28515625, "epoch": 0.2515991471215352, "grad_norm": 1.4940592050552368, "kl": 3.5234375, "learning_rate": 1.866025403784439e-05, "loss": 0.1145, "reward": 0.79296875, "reward_std": 0.6623349040746689, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.2890625, "rewards/tag_count_reward": 0.5, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 911.89453125, "epoch": 0.255863539445629, "grad_norm": 25.662094116210938, "kl": 4.6328125, "learning_rate": 1.8584487936018663e-05, "loss": 0.1482, "reward": 0.9228515625, "reward_std": 0.7258684784173965, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.33203125, "rewards/tag_count_reward": 0.5400390625, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 838.140625, "epoch": 0.2601279317697228, "grad_norm": 23.566726684570312, "kl": 6.2734375, "learning_rate": 1.8506800656873397e-05, "loss": 0.14, "reward": 0.90625, "reward_std": 0.6905761212110519, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.32421875, "rewards/tag_count_reward": 0.53515625, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 593.88671875, "epoch": 0.26439232409381663, "grad_norm": 9.581720352172852, "kl": 5.3984375, "learning_rate": 1.8427209586540392e-05, "loss": 0.0744, "reward": 0.966796875, "reward_std": 0.7168334871530533, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.37109375, "rewards/tag_count_reward": 0.560546875, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 403.3515625, "epoch": 0.26865671641791045, "grad_norm": 3.977918863296509, "kl": 5.6328125, "learning_rate": 1.834573253721303e-05, "loss": 0.0664, "reward": 0.9931640625, "reward_std": 0.7101524770259857, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.3671875, "rewards/tag_count_reward": 0.5908203125, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 285.90625, "epoch": 0.27292110874200426, "grad_norm": 3.9532861709594727, "kl": 4.4375, "learning_rate": 1.826238774315995e-05, "loss": -0.0383, "reward": 1.2724609375, "reward_std": 0.7493992298841476, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.5703125, "rewards/tag_count_reward": 0.6552734375, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 219.54296875, "epoch": 0.2771855010660981, "grad_norm": 9.081878662109375, "kl": 5.0, "learning_rate": 1.8177193856644315e-05, "loss": 0.029, "reward": 1.5458984375, "reward_std": 0.7314303368330002, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.69921875, "rewards/tag_count_reward": 0.8076171875, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 172.1484375, "epoch": 0.2814498933901919, "grad_norm": 123.24443817138672, "kl": 8.5546875, "learning_rate": 1.8090169943749477e-05, "loss": 0.1148, "reward": 1.3759765625, "reward_std": 0.7652620077133179, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.6015625, "rewards/tag_count_reward": 0.7705078125, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 155.49609375, "epoch": 0.2857142857142857, "grad_norm": 6.339594841003418, "kl": 5.8203125, "learning_rate": 1.8001335480112067e-05, "loss": 0.0783, "reward": 1.376953125, "reward_std": 0.6871647387742996, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.56640625, "rewards/tag_count_reward": 0.810546875, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 154.2734375, "epoch": 0.2899786780383795, "grad_norm": 181.54469299316406, "kl": 9.875, "learning_rate": 1.7910710346563417e-05, "loss": 0.2282, "reward": 1.689453125, "reward_std": 0.574043981730938, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.80078125, "rewards/tag_count_reward": 0.869140625, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 156.546875, "epoch": 0.2942430703624733, "grad_norm": 23.90792465209961, "kl": 4.12890625, "learning_rate": 1.78183148246803e-05, "loss": -0.0867, "reward": 1.49609375, "reward_std": 0.7586368173360825, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.78125, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 154.2109375, "epoch": 0.29850746268656714, "grad_norm": 6.664966106414795, "kl": 4.35546875, "learning_rate": 1.7724169592245996e-05, "loss": -0.1123, "reward": 1.3564453125, "reward_std": 0.7493429481983185, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.62890625, "rewards/tag_count_reward": 0.7275390625, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 135.6171875, "epoch": 0.302771855010661, "grad_norm": 520.9791259765625, "kl": 14.2890625, "learning_rate": 1.7628295718622666e-05, "loss": 0.2477, "reward": 1.4765625, "reward_std": 0.7755448371171951, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.67578125, "rewards/tag_count_reward": 0.765625, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 137.9296875, "epoch": 0.3070362473347548, "grad_norm": 9.297532081604004, "kl": 4.171875, "learning_rate": 1.7530714660036112e-05, "loss": -0.0591, "reward": 1.576171875, "reward_std": 0.702255368232727, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.810546875, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 129.53515625, "epoch": 0.31130063965884863, "grad_norm": 481.652099609375, "kl": 26.50390625, "learning_rate": 1.7431448254773943e-05, "loss": 0.4083, "reward": 1.6875, "reward_std": 0.5314841717481613, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.87109375, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 138.23046875, "epoch": 0.31556503198294245, "grad_norm": 7.763786315917969, "kl": 4.296875, "learning_rate": 1.7330518718298263e-05, "loss": 0.0646, "reward": 1.7587890625, "reward_std": 0.5207200050354004, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.83203125, "rewards/tag_count_reward": 0.8955078125, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 128.81640625, "epoch": 0.31982942430703626, "grad_norm": 17.239259719848633, "kl": 5.734375, "learning_rate": 1.7227948638273918e-05, "loss": 0.135, "reward": 1.919921875, "reward_std": 0.3152705281972885, "rewards/accuracy_reward": 0.04296875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.955078125, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 111.79296875, "epoch": 0.32409381663113007, "grad_norm": 8.510507583618164, "kl": 4.6171875, "learning_rate": 1.712376096951345e-05, "loss": 0.117, "reward": 1.900390625, "reward_std": 0.32303596287965775, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.91796875, "rewards/tag_count_reward": 0.943359375, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 124.20703125, "epoch": 0.3283582089552239, "grad_norm": 13.03882884979248, "kl": 5.109375, "learning_rate": 1.7017979028839918e-05, "loss": 0.1378, "reward": 1.8974609375, "reward_std": 0.3129582107067108, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.91796875, "rewards/tag_count_reward": 0.9482421875, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 114.91015625, "epoch": 0.3326226012793177, "grad_norm": 14.898639678955078, "kl": 5.84375, "learning_rate": 1.691062648986865e-05, "loss": 0.1679, "reward": 1.8828125, "reward_std": 0.28449319303035736, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.92578125, "rewards/tag_count_reward": 0.953125, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 116.80859375, "epoch": 0.3368869936034115, "grad_norm": 41.616432189941406, "kl": 6.03125, "learning_rate": 1.6801727377709195e-05, "loss": 0.1764, "reward": 1.939453125, "reward_std": 0.24111925438046455, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 178.0546875, "epoch": 0.3411513859275053, "grad_norm": 40.391815185546875, "kl": 7.078125, "learning_rate": 1.6691306063588583e-05, "loss": 0.1877, "reward": 1.896484375, "reward_std": 0.24799961294047534, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.94140625, "rewards/tag_count_reward": 0.955078125, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 207.1484375, "epoch": 0.34541577825159914, "grad_norm": 53.5943489074707, "kl": 7.96875, "learning_rate": 1.657938725939713e-05, "loss": 0.2265, "reward": 1.8984375, "reward_std": 0.25493185594677925, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.94140625, "rewards/tag_count_reward": 0.953125, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 244.5625, "epoch": 0.34968017057569295, "grad_norm": 9.949625015258789, "kl": 6.1015625, "learning_rate": 1.6465996012157996e-05, "loss": 0.1384, "reward": 1.8525390625, "reward_std": 0.3406095430254936, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9384765625, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 232.71875, "epoch": 0.35394456289978676, "grad_norm": 11.608429908752441, "kl": 5.5390625, "learning_rate": 1.635115769842179e-05, "loss": 0.1642, "reward": 1.8603515625, "reward_std": 0.319538950920105, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.90234375, "rewards/tag_count_reward": 0.9541015625, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 196.1328125, "epoch": 0.3582089552238806, "grad_norm": 7.774857044219971, "kl": 5.359375, "learning_rate": 1.6234898018587336e-05, "loss": 0.1484, "reward": 1.841796875, "reward_std": 0.35672812163829803, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.89453125, "rewards/tag_count_reward": 0.947265625, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 184.19921875, "epoch": 0.3624733475479744, "grad_norm": 5.315440654754639, "kl": 5.0234375, "learning_rate": 1.6117242991150064e-05, "loss": 0.1525, "reward": 1.9189453125, "reward_std": 0.21933256834745407, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.94921875, "rewards/tag_count_reward": 0.9619140625, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 193.97265625, "epoch": 0.36673773987206826, "grad_norm": 5.736013889312744, "kl": 5.515625, "learning_rate": 1.599821894687914e-05, "loss": 0.1875, "reward": 1.9306640625, "reward_std": 0.20439787581562996, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.94921875, "rewards/tag_count_reward": 0.9736328125, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 297.3984375, "epoch": 0.37100213219616207, "grad_norm": 8.90512752532959, "kl": 5.5703125, "learning_rate": 1.5877852522924733e-05, "loss": 0.1698, "reward": 1.91015625, "reward_std": 0.33383994549512863, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.91796875, "rewards/tag_count_reward": 0.953125, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 293.015625, "epoch": 0.3752665245202559, "grad_norm": 102.06912231445312, "kl": 10.2578125, "learning_rate": 1.575617065685674e-05, "loss": 0.3011, "reward": 1.890625, "reward_std": 0.31814195960760117, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.90234375, "rewards/tag_count_reward": 0.953125, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 336.6796875, "epoch": 0.3795309168443497, "grad_norm": 108.87593841552734, "kl": 11.6640625, "learning_rate": 1.563320058063622e-05, "loss": 0.2676, "reward": 1.8369140625, "reward_std": 0.38644537329673767, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.9345703125, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 409.29296875, "epoch": 0.3837953091684435, "grad_norm": 35.86373519897461, "kl": 8.4296875, "learning_rate": 1.5508969814521026e-05, "loss": 0.2346, "reward": 1.8154296875, "reward_std": 0.4089268818497658, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.87109375, "rewards/tag_count_reward": 0.9404296875, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 419.03125, "epoch": 0.3880597014925373, "grad_norm": 4.703104496002197, "kl": 5.8359375, "learning_rate": 1.5383506160906826e-05, "loss": 0.1736, "reward": 1.8583984375, "reward_std": 0.37071677297353745, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.88671875, "rewards/tag_count_reward": 0.9521484375, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 498.890625, "epoch": 0.39232409381663114, "grad_norm": 15.804770469665527, "kl": 6.359375, "learning_rate": 1.5256837698105047e-05, "loss": 0.2056, "reward": 1.896484375, "reward_std": 0.264212965965271, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.962890625, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 541.484375, "epoch": 0.39658848614072495, "grad_norm": 43.44738006591797, "kl": 7.046875, "learning_rate": 1.5128992774059063e-05, "loss": 0.1825, "reward": 1.84375, "reward_std": 0.37193765491247177, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.94140625, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 690.60546875, "epoch": 0.40085287846481876, "grad_norm": 3.9883878231048584, "kl": 5.6875, "learning_rate": 1.5000000000000002e-05, "loss": 0.1577, "reward": 1.796875, "reward_std": 0.449543721973896, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.9140625, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 710.62890625, "epoch": 0.4051172707889126, "grad_norm": 13.03452205657959, "kl": 5.1953125, "learning_rate": 1.4869888244043674e-05, "loss": 0.1824, "reward": 1.794921875, "reward_std": 0.44430477917194366, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.927734375, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 679.2578125, "epoch": 0.4093816631130064, "grad_norm": 4.490772724151611, "kl": 5.296875, "learning_rate": 1.4738686624729987e-05, "loss": 0.1653, "reward": 1.80859375, "reward_std": 0.35829880461096764, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.9296875, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 880.9765625, "epoch": 0.4136460554371002, "grad_norm": 150.7144317626953, "kl": 6.5390625, "learning_rate": 1.4606424504506325e-05, "loss": 0.2454, "reward": 1.5869140625, "reward_std": 0.5404268652200699, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.76953125, "rewards/tag_count_reward": 0.7822265625, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 945.01953125, "epoch": 0.417910447761194, "grad_norm": 216.19607543945312, "kl": 15.546875, "learning_rate": 1.4473131483156326e-05, "loss": 0.3319, "reward": 1.4111328125, "reward_std": 0.6219311505556107, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.7001953125, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 988.53515625, "epoch": 0.42217484008528783, "grad_norm": 137.95619201660156, "kl": 6.5, "learning_rate": 1.4338837391175582e-05, "loss": 0.2236, "reward": 1.45703125, "reward_std": 0.6083492934703827, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.70703125, "rewards/tag_count_reward": 0.7265625, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 985.140625, "epoch": 0.42643923240938164, "grad_norm": 7.844208240509033, "kl": 1.84375, "learning_rate": 1.4203572283095657e-05, "loss": 0.039, "reward": 1.4658203125, "reward_std": 0.6321621090173721, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.69140625, "rewards/tag_count_reward": 0.7431640625, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 992.00390625, "epoch": 0.43070362473347545, "grad_norm": 13.683513641357422, "kl": 1.30078125, "learning_rate": 1.4067366430758004e-05, "loss": 0.0205, "reward": 1.4462890625, "reward_std": 0.6124080866575241, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.6806640625, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 998.76171875, "epoch": 0.4349680170575693, "grad_norm": 30.369285583496094, "kl": 1.1240234375, "learning_rate": 1.3930250316539237e-05, "loss": 0.0296, "reward": 1.4365234375, "reward_std": 0.6077300161123276, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.6982421875, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1007.33984375, "epoch": 0.43923240938166314, "grad_norm": 22.23171615600586, "kl": 3.173828125, "learning_rate": 1.3792254626529286e-05, "loss": 0.1053, "reward": 1.39453125, "reward_std": 0.5931012779474258, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.68359375, "rewards/tag_count_reward": 0.6796875, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1018.3046875, "epoch": 0.44349680170575695, "grad_norm": 32.721920013427734, "kl": 2.7041015625, "learning_rate": 1.3653410243663953e-05, "loss": 0.1054, "reward": 1.4375, "reward_std": 0.5352352559566498, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.74609375, "rewards/tag_count_reward": 0.68359375, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 983.671875, "epoch": 0.44776119402985076, "grad_norm": 21.012828826904297, "kl": 2.029296875, "learning_rate": 1.3513748240813429e-05, "loss": 0.0617, "reward": 1.44140625, "reward_std": 0.5487575381994247, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.71484375, "rewards/tag_count_reward": 0.69140625, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1009.48828125, "epoch": 0.4520255863539446, "grad_norm": 8.625335693359375, "kl": 1.4072265625, "learning_rate": 1.3373299873828303e-05, "loss": 0.0492, "reward": 1.4296875, "reward_std": 0.5546326637268066, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.69140625, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4562899786780384, "grad_norm": 3.2243165969848633, "kl": 0.8115234375, "learning_rate": 1.3232096574544602e-05, "loss": 0.0324, "reward": 1.462890625, "reward_std": 0.5354997888207436, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.72265625, "rewards/tag_count_reward": 0.701171875, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1020.35546875, "epoch": 0.4605543710021322, "grad_norm": 1.6773790121078491, "kl": 0.9384765625, "learning_rate": 1.3090169943749475e-05, "loss": 0.0316, "reward": 1.3544921875, "reward_std": 0.6066916137933731, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.6328125, "rewards/tag_count_reward": 0.6943359375, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 934.0625, "epoch": 0.464818763326226, "grad_norm": 0.25028663873672485, "kl": 3.5703125, "learning_rate": 1.2947551744109044e-05, "loss": 0.1428, "reward": 1.7275390625, "reward_std": 0.30690931528806686, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.94140625, "rewards/tag_count_reward": 0.7158203125, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 894.3125, "epoch": 0.4690831556503198, "grad_norm": 0.25236231088638306, "kl": 4.31640625, "learning_rate": 1.2804273893060028e-05, "loss": 0.1724, "reward": 1.5966796875, "reward_std": 0.3756791800260544, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.86328125, "rewards/tag_count_reward": 0.7099609375, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 843.46875, "epoch": 0.47334754797441364, "grad_norm": 0.30303165316581726, "kl": 4.3515625, "learning_rate": 1.2660368455666752e-05, "loss": 0.174, "reward": 1.6923828125, "reward_std": 0.36458854377269745, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.7041015625, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 846.1875, "epoch": 0.47761194029850745, "grad_norm": 0.30785125494003296, "kl": 4.8515625, "learning_rate": 1.2515867637445088e-05, "loss": 0.1944, "reward": 1.65234375, "reward_std": 0.35947033017873764, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.91015625, "rewards/tag_count_reward": 0.7421875, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1001.59375, "epoch": 0.48187633262260127, "grad_norm": 0.5503849983215332, "kl": 3.1875, "learning_rate": 1.2370803777154976e-05, "loss": 0.1275, "reward": 0.92578125, "reward_std": 0.40457524359226227, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.171875, "rewards/tag_count_reward": 0.73828125, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 742.28125, "epoch": 0.4861407249466951, "grad_norm": 0.6809885501861572, "kl": 5.1484375, "learning_rate": 1.2225209339563144e-05, "loss": 0.2059, "reward": 1.7626953125, "reward_std": 0.38498707860708237, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.8095703125, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 575.97265625, "epoch": 0.4904051172707889, "grad_norm": 0.8168994784355164, "kl": 4.984375, "learning_rate": 1.2079116908177592e-05, "loss": 0.1925, "reward": 1.8603515625, "reward_std": 0.43856722861528397, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.84765625, "rewards/tag_count_reward": 0.9462890625, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 859.1875, "epoch": 0.4946695095948827, "grad_norm": 0.32246819138526917, "kl": 4.65625, "learning_rate": 1.1932559177955533e-05, "loss": 0.1858, "reward": 1.6337890625, "reward_std": 0.3074583485722542, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.7314453125, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1009.09375, "epoch": 0.4989339019189765, "grad_norm": 0.2883855402469635, "kl": 2.7890625, "learning_rate": 1.1785568947986368e-05, "loss": 0.1117, "reward": 1.8154296875, "reward_std": 0.2801100164651871, "rewards/accuracy_reward": 0.12109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.7333984375, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5031982942430704, "grad_norm": 0.5340821146965027, "kl": 2.125, "learning_rate": 1.1638179114151378e-05, "loss": 0.0849, "reward": 1.6708984375, "reward_std": 0.2677147090435028, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.92578125, "rewards/tag_count_reward": 0.7255859375, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 951.5625, "epoch": 0.5074626865671642, "grad_norm": 0.3258584141731262, "kl": 2.87109375, "learning_rate": 1.1490422661761744e-05, "loss": 0.1149, "reward": 1.71875, "reward_std": 0.1409970298409462, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.96484375, "rewards/tag_count_reward": 0.7421875, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 948.03125, "epoch": 0.511727078891258, "grad_norm": 0.20516642928123474, "kl": 2.56640625, "learning_rate": 1.1342332658176556e-05, "loss": 0.1026, "reward": 1.7783203125, "reward_std": 0.21998512372374535, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.96484375, "rewards/tag_count_reward": 0.7392578125, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 949.3125, "epoch": 0.5159914712153518, "grad_norm": 0.15999875962734222, "kl": 2.73828125, "learning_rate": 1.1193942245402443e-05, "loss": 0.1093, "reward": 1.7880859375, "reward_std": 0.1588208805769682, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96484375, "rewards/tag_count_reward": 0.7451171875, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 996.125, "epoch": 0.5202558635394456, "grad_norm": 0.2390127331018448, "kl": 2.75, "learning_rate": 1.1045284632676535e-05, "loss": 0.1098, "reward": 1.76171875, "reward_std": 0.22831767983734608, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.73046875, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 972.6796875, "epoch": 0.5245202558635395, "grad_norm": 4.8665361404418945, "kl": 3.66796875, "learning_rate": 1.0896393089034336e-05, "loss": 0.1313, "reward": 1.6845703125, "reward_std": 0.36246033012866974, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.88671875, "rewards/tag_count_reward": 0.7041015625, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 888.3046875, "epoch": 0.5287846481876333, "grad_norm": 3.6568827629089355, "kl": 2.06640625, "learning_rate": 1.0747300935864245e-05, "loss": 0.0925, "reward": 1.68359375, "reward_std": 0.4343060404062271, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.75, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 516.43359375, "epoch": 0.5330490405117271, "grad_norm": 0.5316474437713623, "kl": 0.61767578125, "learning_rate": 1.0598041539450344e-05, "loss": 0.2445, "reward": 1.7685546875, "reward_std": 0.4253704324364662, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.89453125, "rewards/tag_count_reward": 0.8349609375, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 473.5546875, "epoch": 0.5373134328358209, "grad_norm": 2.9844846725463867, "kl": 0.8173828125, "learning_rate": 1.044864830350515e-05, "loss": 0.2749, "reward": 1.833984375, "reward_std": 0.523324653506279, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.884765625, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 450.921875, "epoch": 0.5415778251599147, "grad_norm": 0.5743687748908997, "kl": 0.84765625, "learning_rate": 1.0299154661693987e-05, "loss": 0.2714, "reward": 1.771484375, "reward_std": 0.5503488332033157, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.83203125, "rewards/tag_count_reward": 0.880859375, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 488.37109375, "epoch": 0.5458422174840085, "grad_norm": 6.451872825622559, "kl": 1.234375, "learning_rate": 1.0149594070152638e-05, "loss": 0.3969, "reward": 1.7021484375, "reward_std": 0.6392623782157898, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.8544921875, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 466.671875, "epoch": 0.5501066098081023, "grad_norm": 2.481407403945923, "kl": 0.986328125, "learning_rate": 1e-05, "loss": 0.4003, "reward": 1.615234375, "reward_std": 0.6238291710615158, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.837890625, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 414.88671875, "epoch": 0.5543710021321961, "grad_norm": 4.6952948570251465, "kl": 0.55078125, "learning_rate": 9.850405929847367e-06, "loss": 0.413, "reward": 1.654296875, "reward_std": 0.6841937601566315, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.841796875, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 374.3828125, "epoch": 0.55863539445629, "grad_norm": 10.949110984802246, "kl": 0.5146484375, "learning_rate": 9.700845338306018e-06, "loss": 0.4342, "reward": 1.7568359375, "reward_std": 0.5640043765306473, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.81640625, "rewards/tag_count_reward": 0.8935546875, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 379.46875, "epoch": 0.5628997867803838, "grad_norm": 7.129451274871826, "kl": 0.41796875, "learning_rate": 9.551351696494854e-06, "loss": 0.462, "reward": 1.6328125, "reward_std": 0.7138571888208389, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.83203125, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 378.390625, "epoch": 0.5671641791044776, "grad_norm": 5.389857769012451, "kl": 0.6474609375, "learning_rate": 9.401958460549658e-06, "loss": 0.4062, "reward": 1.69921875, "reward_std": 0.5953380540013313, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.78515625, "rewards/tag_count_reward": 0.8828125, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 374.63671875, "epoch": 0.5714285714285714, "grad_norm": 8.098217964172363, "kl": 0.6748046875, "learning_rate": 9.252699064135759e-06, "loss": 0.5274, "reward": 1.68359375, "reward_std": 0.6231431663036346, "rewards/accuracy_reward": 0.04296875, "rewards/format_reward": 0.76953125, "rewards/tag_count_reward": 0.87109375, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 371.6953125, "epoch": 0.5756929637526652, "grad_norm": 27.047813415527344, "kl": 6.0146484375, "learning_rate": 9.103606910965666e-06, "loss": 0.4173, "reward": 1.7587890625, "reward_std": 0.477617509663105, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.8759765625, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 388.03125, "epoch": 0.579957356076759, "grad_norm": 72.6392822265625, "kl": 33.333984375, "learning_rate": 8.954715367323468e-06, "loss": 0.5359, "reward": 1.5771484375, "reward_std": 0.65767702460289, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.71484375, "rewards/tag_count_reward": 0.8388671875, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 520.26953125, "epoch": 0.5842217484008528, "grad_norm": 11.781960487365723, "kl": 3.19140625, "learning_rate": 8.806057754597559e-06, "loss": 0.3497, "reward": 1.1142578125, "reward_std": 0.6293385028839111, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.40625, "rewards/tag_count_reward": 0.6923828125, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 612.92578125, "epoch": 0.5884861407249466, "grad_norm": 10.908761978149414, "kl": 2.56640625, "learning_rate": 8.657667341823449e-06, "loss": 0.089, "reward": 0.552734375, "reward_std": 0.22833332046866417, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.05078125, "rewards/tag_count_reward": 0.501953125, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 164.50390625, "epoch": 0.5927505330490405, "grad_norm": 1.4357022047042847, "kl": 0.38916015625, "learning_rate": 8.509577338238255e-06, "loss": 0.3546, "reward": 0.4619140625, "reward_std": 0.0770116988569498, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4619140625, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 236.9140625, "epoch": 0.5970149253731343, "grad_norm": 1.1441797018051147, "kl": 0.32080078125, "learning_rate": 8.361820885848623e-06, "loss": 0.1043, "reward": 0.3369140625, "reward_std": 0.11966157145798206, "rewards/accuracy_reward": 0.00390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3330078125, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 304.41796875, "epoch": 0.6012793176972282, "grad_norm": 6.6605143547058105, "kl": 0.4736328125, "learning_rate": 8.214431052013636e-06, "loss": 0.0359, "reward": 0.6025390625, "reward_std": 0.21631848067045212, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5361328125, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 291.44921875, "epoch": 0.605543710021322, "grad_norm": 29.841733932495117, "kl": 0.37939453125, "learning_rate": 8.06744082204447e-06, "loss": 0.1329, "reward": 0.7060546875, "reward_std": 0.25769177079200745, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5888671875, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 259.703125, "epoch": 0.6098081023454158, "grad_norm": 22.494600296020508, "kl": 1.2724609375, "learning_rate": 7.92088309182241e-06, "loss": -0.003, "reward": 0.61328125, "reward_std": 0.2214067205786705, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 272.9921875, "epoch": 0.6140724946695096, "grad_norm": 3.0637097358703613, "kl": 0.34814453125, "learning_rate": 7.774790660436857e-06, "loss": -0.0925, "reward": 0.5869140625, "reward_std": 0.22308824211359024, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5478515625, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 303.21875, "epoch": 0.6183368869936035, "grad_norm": 3.038789987564087, "kl": 0.57373046875, "learning_rate": 7.629196222845027e-06, "loss": -0.0695, "reward": 0.6015625, "reward_std": 0.1990872472524643, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 244.98828125, "epoch": 0.6226012793176973, "grad_norm": 3.982813835144043, "kl": 1.84765625, "learning_rate": 7.484132362554915e-06, "loss": -0.1056, "reward": 0.61328125, "reward_std": 0.24527693167328835, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 304.8359375, "epoch": 0.6268656716417911, "grad_norm": 1.170094609260559, "kl": 1.3681640625, "learning_rate": 7.33963154433325e-06, "loss": -0.1068, "reward": 0.609375, "reward_std": 0.203267153352499, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 344.03125, "epoch": 0.6311300639658849, "grad_norm": 3.609171152114868, "kl": 1.33203125, "learning_rate": 7.1957261069399745e-06, "loss": -0.1631, "reward": 0.6796875, "reward_std": 0.21126757562160492, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.62109375, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 312.6640625, "epoch": 0.6353944562899787, "grad_norm": 1.0008127689361572, "kl": 1.52734375, "learning_rate": 7.052448255890958e-06, "loss": -0.2083, "reward": 0.7255859375, "reward_std": 0.29479434341192245, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6083984375, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 472.80859375, "epoch": 0.6396588486140725, "grad_norm": 0.9753682017326355, "kl": 0.650390625, "learning_rate": 6.909830056250527e-06, "loss": -0.196, "reward": 0.748046875, "reward_std": 0.2531566210091114, "rewards/accuracy_reward": 0.08984375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.658203125, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 808.75, "epoch": 0.6439232409381663, "grad_norm": 0.5874699950218201, "kl": 0.8330078125, "learning_rate": 6.767903425455402e-06, "loss": -0.122, "reward": 0.6337890625, "reward_std": 0.2824634090065956, "rewards/accuracy_reward": 0.12890625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5048828125, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 793.7578125, "epoch": 0.6481876332622601, "grad_norm": 0.6901421546936035, "kl": 1.171875, "learning_rate": 6.6267001261717015e-06, "loss": -0.0709, "reward": 0.70703125, "reward_std": 0.2805519849061966, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 625.5, "epoch": 0.652452025586354, "grad_norm": 1.2642836570739746, "kl": 6.0654296875, "learning_rate": 6.486251759186573e-06, "loss": -0.1338, "reward": 0.72265625, "reward_std": 0.28466814011335373, "rewards/accuracy_reward": 0.23046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 625.890625, "epoch": 0.6567164179104478, "grad_norm": 0.3763836622238159, "kl": 0.4189453125, "learning_rate": 6.34658975633605e-06, "loss": -0.0951, "reward": 0.697265625, "reward_std": 0.2551993578672409, "rewards/accuracy_reward": 0.19921875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 591.17578125, "epoch": 0.6609808102345416, "grad_norm": 0.6317035555839539, "kl": 0.52490234375, "learning_rate": 6.207745373470717e-06, "loss": -0.1346, "reward": 0.7265625, "reward_std": 0.32422181963920593, "rewards/accuracy_reward": 0.23046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 702.796875, "epoch": 0.6652452025586354, "grad_norm": 0.3873419165611267, "kl": 0.81591796875, "learning_rate": 6.069749683460765e-06, "loss": -0.1725, "reward": 0.6943359375, "reward_std": 0.2808499410748482, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5810546875, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 655.75390625, "epoch": 0.6695095948827292, "grad_norm": 0.37135419249534607, "kl": 0.52587890625, "learning_rate": 5.932633569242e-06, "loss": -0.041, "reward": 0.6240234375, "reward_std": 0.19439143873751163, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5732421875, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 676.12890625, "epoch": 0.673773987206823, "grad_norm": 0.3440045118331909, "kl": 0.49658203125, "learning_rate": 5.796427716904347e-06, "loss": -0.1016, "reward": 0.7001953125, "reward_std": 0.252426378428936, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5830078125, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 611.26953125, "epoch": 0.6780383795309168, "grad_norm": 0.4159948527812958, "kl": 0.88134765625, "learning_rate": 5.66116260882442e-06, "loss": -0.1264, "reward": 0.642578125, "reward_std": 0.18874739110469818, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.623046875, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 470.328125, "epoch": 0.6823027718550106, "grad_norm": 0.38798099756240845, "kl": 0.79736328125, "learning_rate": 5.526868516843673e-06, "loss": -0.0926, "reward": 0.708984375, "reward_std": 0.22118790447711945, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.658203125, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 447.8125, "epoch": 0.6865671641791045, "grad_norm": 0.4147135615348816, "kl": 0.64111328125, "learning_rate": 5.393575495493679e-06, "loss": -0.1075, "reward": 0.732421875, "reward_std": 0.18555288948118687, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.662109375, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 459.30078125, "epoch": 0.6908315565031983, "grad_norm": 0.5294929146766663, "kl": 0.58642578125, "learning_rate": 5.2613133752700145e-06, "loss": -0.1284, "reward": 0.66796875, "reward_std": 0.1776830367743969, "rewards/accuracy_reward": 0.01171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.65625, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 379.0625, "epoch": 0.6950959488272921, "grad_norm": 0.5291323065757751, "kl": 0.830078125, "learning_rate": 5.130111755956327e-06, "loss": -0.1563, "reward": 0.7099609375, "reward_std": 0.19628439471125603, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6630859375, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 338.12890625, "epoch": 0.6993603411513859, "grad_norm": 0.4588962197303772, "kl": 0.73583984375, "learning_rate": 5.000000000000003e-06, "loss": -0.1004, "reward": 0.7626953125, "reward_std": 0.17791462130844593, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6962890625, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 310.578125, "epoch": 0.7036247334754797, "grad_norm": 0.8297274708747864, "kl": 1.7900390625, "learning_rate": 4.87100722594094e-06, "loss": -0.0991, "reward": 0.7421875, "reward_std": 0.17338587157428265, "rewards/accuracy_reward": 0.04296875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 316.98828125, "epoch": 0.7078891257995735, "grad_norm": 0.9038926362991333, "kl": 1.57470703125, "learning_rate": 4.743162301894952e-06, "loss": -0.0521, "reward": 0.7744140625, "reward_std": 0.14399663731455803, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7119140625, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 307.609375, "epoch": 0.7121535181236673, "grad_norm": 3.5091426372528076, "kl": 1.41943359375, "learning_rate": 4.616493839093179e-06, "loss": -0.0394, "reward": 0.791015625, "reward_std": 0.1766387764364481, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.697265625, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 290.44921875, "epoch": 0.7164179104477612, "grad_norm": 2.227064847946167, "kl": 1.279296875, "learning_rate": 4.491030185478976e-06, "loss": -0.0156, "reward": 0.7197265625, "reward_std": 0.10716542787849903, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7041015625, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 296.54296875, "epoch": 0.720682302771855, "grad_norm": 5.702210426330566, "kl": 1.513671875, "learning_rate": 4.3667994193637794e-06, "loss": 0.0234, "reward": 0.7626953125, "reward_std": 0.12216670252382755, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7119140625, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 303.10546875, "epoch": 0.7249466950959488, "grad_norm": 0.5832945108413696, "kl": 0.66162109375, "learning_rate": 4.2438293431432665e-06, "loss": 0.0051, "reward": 0.806640625, "reward_std": 0.13442331552505493, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 269.23046875, "epoch": 0.7292110874200426, "grad_norm": 1.5207250118255615, "kl": 0.60546875, "learning_rate": 4.12214747707527e-06, "loss": 0.0104, "reward": 0.732421875, "reward_std": 0.09527772478759289, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716796875, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 221.65625, "epoch": 0.7334754797441365, "grad_norm": 2.143716335296631, "kl": 1.3779296875, "learning_rate": 4.001781053120863e-06, "loss": -0.0052, "reward": 0.7958984375, "reward_std": 0.13394116796553135, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7021484375, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 175.17578125, "epoch": 0.7377398720682303, "grad_norm": 5.434141635894775, "kl": 2.75, "learning_rate": 3.882757008849936e-06, "loss": 0.0388, "reward": 0.685546875, "reward_std": 0.16674507781863213, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.646484375, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 162.55859375, "epoch": 0.7420042643923241, "grad_norm": 27.080265045166016, "kl": 3.57421875, "learning_rate": 3.7651019814126656e-06, "loss": 0.0552, "reward": 0.6416015625, "reward_std": 0.133183553814888, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6337890625, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 161.41796875, "epoch": 0.746268656716418, "grad_norm": 4.6160430908203125, "kl": 3.09375, "learning_rate": 3.6488423015782128e-06, "loss": 0.074, "reward": 0.6455078125, "reward_std": 0.15037459693849087, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6220703125, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 153.59375, "epoch": 0.7505330490405118, "grad_norm": 3.9284942150115967, "kl": 2.150390625, "learning_rate": 3.534003987842005e-06, "loss": 0.0613, "reward": 0.69921875, "reward_std": 0.16477027162909508, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 194.90625, "epoch": 0.7547974413646056, "grad_norm": 1.458369493484497, "kl": 0.7900390625, "learning_rate": 3.4206127406028744e-06, "loss": 0.0115, "reward": 0.78125, "reward_std": 0.16565649397671223, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 185.91796875, "epoch": 0.7590618336886994, "grad_norm": 0.9649374485015869, "kl": 0.466796875, "learning_rate": 3.308693936411421e-06, "loss": -0.0284, "reward": 0.75, "reward_std": 0.09331535268574953, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 179.1171875, "epoch": 0.7633262260127932, "grad_norm": 1.3005759716033936, "kl": 0.43115234375, "learning_rate": 3.1982726222908046e-06, "loss": -0.0093, "reward": 0.87890625, "reward_std": 0.09297346090897918, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 198.36328125, "epoch": 0.767590618336887, "grad_norm": 0.8584280014038086, "kl": 0.2998046875, "learning_rate": 3.089373510131354e-06, "loss": -0.0111, "reward": 0.7822265625, "reward_std": 0.10853294795379043, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7353515625, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 187.99609375, "epoch": 0.7718550106609808, "grad_norm": 0.5993466377258301, "kl": 0.52099609375, "learning_rate": 2.9820209711600858e-06, "loss": -0.0401, "reward": 0.7890625, "reward_std": 0.1114540034905076, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 219.74609375, "epoch": 0.7761194029850746, "grad_norm": 3.0398595333099365, "kl": 0.75439453125, "learning_rate": 2.876239030486554e-06, "loss": 0.0153, "reward": 0.7724609375, "reward_std": 0.1012349147349596, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7333984375, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 192.01171875, "epoch": 0.7803837953091685, "grad_norm": 2.972774028778076, "kl": 1.375, "learning_rate": 2.7720513617260857e-06, "loss": 0.0664, "reward": 0.814453125, "reward_std": 0.11987380962818861, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.728515625, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 185.59765625, "epoch": 0.7846481876332623, "grad_norm": 6.028680801391602, "kl": 1.98046875, "learning_rate": 2.669481281701739e-06, "loss": 0.0526, "reward": 0.7646484375, "reward_std": 0.10079776309430599, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7333984375, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 206.2109375, "epoch": 0.7889125799573561, "grad_norm": 1.8360040187835693, "kl": 1.3046875, "learning_rate": 2.5685517452260566e-06, "loss": -0.0217, "reward": 0.7841796875, "reward_std": 0.08804275188595057, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7294921875, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 211.5234375, "epoch": 0.7931769722814499, "grad_norm": 1.8284348249435425, "kl": 1.18359375, "learning_rate": 2.469285339963892e-06, "loss": 0.0017, "reward": 0.7744140625, "reward_std": 0.09929579310119152, "rewards/accuracy_reward": 0.04296875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7314453125, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 206.52734375, "epoch": 0.7974413646055437, "grad_norm": 2.95171856880188, "kl": 1.2236328125, "learning_rate": 2.371704281377335e-06, "loss": 0.0348, "reward": 0.73828125, "reward_std": 0.09545402321964502, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.72265625, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 202.625, "epoch": 0.8017057569296375, "grad_norm": 0.737244725227356, "kl": 0.52197265625, "learning_rate": 2.275830407754006e-06, "loss": 0.0328, "reward": 0.8466796875, "reward_std": 0.15702996030449867, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7333984375, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 214.828125, "epoch": 0.8059701492537313, "grad_norm": 0.781270444393158, "kl": 0.302734375, "learning_rate": 2.1816851753197023e-06, "loss": 0.0188, "reward": 0.80078125, "reward_std": 0.13719853153452277, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 212.7890625, "epoch": 0.8102345415778252, "grad_norm": 1.513720989227295, "kl": 0.3876953125, "learning_rate": 2.08928965343659e-06, "loss": 0.0004, "reward": 0.861328125, "reward_std": 0.1351899290457368, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.736328125, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 220.57421875, "epoch": 0.814498933901919, "grad_norm": 4.133224964141846, "kl": 1.0576171875, "learning_rate": 1.9986645198879385e-06, "loss": -0.0196, "reward": 0.7626953125, "reward_std": 0.14676811546087265, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7236328125, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 221.96875, "epoch": 0.8187633262260128, "grad_norm": 0.4540961682796478, "kl": 0.3203125, "learning_rate": 1.9098300562505266e-06, "loss": -0.0199, "reward": 0.8544921875, "reward_std": 0.1271651964634657, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7373046875, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 219.1171875, "epoch": 0.8230277185501066, "grad_norm": 0.32064002752304077, "kl": 0.45361328125, "learning_rate": 1.8228061433556866e-06, "loss": -0.0265, "reward": 0.779296875, "reward_std": 0.0899216216057539, "rewards/accuracy_reward": 0.04296875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.736328125, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 214.43359375, "epoch": 0.8272921108742004, "grad_norm": 1.135198950767517, "kl": 0.42529296875, "learning_rate": 1.7376122568400533e-06, "loss": -0.0286, "reward": 0.8046875, "reward_std": 0.16580088809132576, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 218.921875, "epoch": 0.8315565031982942, "grad_norm": 0.5622548460960388, "kl": 0.316650390625, "learning_rate": 1.6542674627869738e-06, "loss": 0.017, "reward": 0.80078125, "reward_std": 0.13944148644804955, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 251.47265625, "epoch": 0.835820895522388, "grad_norm": 0.7856387495994568, "kl": 0.4052734375, "learning_rate": 1.5727904134596084e-06, "loss": 0.0162, "reward": 0.8193359375, "reward_std": 0.16033071093261242, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7060546875, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 243.625, "epoch": 0.8400852878464818, "grad_norm": 0.6806755065917969, "kl": 0.49072265625, "learning_rate": 1.4931993431266056e-06, "loss": 0.0095, "reward": 0.7890625, "reward_std": 0.25723421946167946, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.67578125, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 273.87109375, "epoch": 0.8443496801705757, "grad_norm": 1.0873993635177612, "kl": 0.375, "learning_rate": 1.4155120639813392e-06, "loss": 0.1037, "reward": 0.7626953125, "reward_std": 0.21218526735901833, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6689453125, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 239.97265625, "epoch": 0.8486140724946695, "grad_norm": 1.0088647603988647, "kl": 0.37353515625, "learning_rate": 1.339745962155613e-06, "loss": 0.0416, "reward": 0.7822265625, "reward_std": 0.24759295210242271, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6806640625, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 268.73046875, "epoch": 0.8528784648187633, "grad_norm": 0.6826640367507935, "kl": 0.43017578125, "learning_rate": 1.2659179938287035e-06, "loss": 0.0312, "reward": 0.7685546875, "reward_std": 0.18006664514541626, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6982421875, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 221.609375, "epoch": 0.8571428571428571, "grad_norm": 1.0566011667251587, "kl": 0.45751953125, "learning_rate": 1.19404468143262e-06, "loss": -0.0104, "reward": 0.796875, "reward_std": 0.15738755092024803, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 269.80859375, "epoch": 0.8614072494669509, "grad_norm": 0.7283450365066528, "kl": 0.5009765625, "learning_rate": 1.124142109954459e-06, "loss": -0.0242, "reward": 0.7705078125, "reward_std": 0.13039706647396088, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7314453125, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 255.265625, "epoch": 0.8656716417910447, "grad_norm": 1.5315821170806885, "kl": 1.380859375, "learning_rate": 1.0562259233366334e-06, "loss": -0.0731, "reward": 0.78515625, "reward_std": 0.2296939566731453, "rewards/accuracy_reward": 0.08984375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 218.21875, "epoch": 0.8699360341151386, "grad_norm": 1.1094874143600464, "kl": 0.73046875, "learning_rate": 9.903113209758098e-07, "loss": 0.0012, "reward": 0.8720703125, "reward_std": 0.2057046014815569, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7236328125, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 281.12109375, "epoch": 0.8742004264392325, "grad_norm": 1.407812237739563, "kl": 1.35546875, "learning_rate": 9.264130543213512e-07, "loss": -0.0625, "reward": 0.8251953125, "reward_std": 0.20766575261950493, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7080078125, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 252.62109375, "epoch": 0.8784648187633263, "grad_norm": 3.055626153945923, "kl": 2.255859375, "learning_rate": 8.645454235739903e-07, "loss": -0.0862, "reward": 0.80859375, "reward_std": 0.2070464938879013, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 238.58203125, "epoch": 0.8827292110874201, "grad_norm": 1.8872811794281006, "kl": 1.49609375, "learning_rate": 8.047222744854943e-07, "loss": 0.0217, "reward": 0.8857421875, "reward_std": 0.24977924302220345, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7138671875, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 258.48046875, "epoch": 0.8869936034115139, "grad_norm": 4.186584949493408, "kl": 2.2333984375, "learning_rate": 7.46956995260033e-07, "loss": -0.0711, "reward": 0.8271484375, "reward_std": 0.18339894711971283, "rewards/accuracy_reward": 0.12890625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6982421875, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 236.16796875, "epoch": 0.8912579957356077, "grad_norm": 2.354311466217041, "kl": 1.609375, "learning_rate": 6.912625135579587e-07, "loss": -0.0062, "reward": 0.791015625, "reward_std": 0.17353365197777748, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.712890625, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 227.03125, "epoch": 0.8955223880597015, "grad_norm": 2.211200714111328, "kl": 1.818359375, "learning_rate": 6.37651293602628e-07, "loss": -0.019, "reward": 0.7958984375, "reward_std": 0.19231459498405457, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7099609375, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 259.16015625, "epoch": 0.8997867803837953, "grad_norm": 3.354318141937256, "kl": 1.48828125, "learning_rate": 5.861353333909692e-07, "loss": -0.0305, "reward": 0.8115234375, "reward_std": 0.17966507747769356, "rewards/accuracy_reward": 0.09765625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7138671875, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 262.46484375, "epoch": 0.9040511727078892, "grad_norm": 3.2571589946746826, "kl": 2.1796875, "learning_rate": 5.367261620083575e-07, "loss": -0.0519, "reward": 0.83984375, "reward_std": 0.2149498090147972, "rewards/accuracy_reward": 0.12109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 237.96875, "epoch": 0.908315565031983, "grad_norm": 1.6243290901184082, "kl": 1.0390625, "learning_rate": 4.894348370484648e-07, "loss": 0.0014, "reward": 0.7900390625, "reward_std": 0.14244702830910683, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7314453125, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 238.0703125, "epoch": 0.9125799573560768, "grad_norm": 4.307506084442139, "kl": 1.15576171875, "learning_rate": 4.4427194213859216e-07, "loss": 0.0194, "reward": 0.833984375, "reward_std": 0.19881774485111237, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.724609375, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 240.42578125, "epoch": 0.9168443496801706, "grad_norm": 0.588789701461792, "kl": 0.70654296875, "learning_rate": 4.012475845711106e-07, "loss": -0.0109, "reward": 0.8740234375, "reward_std": 0.2335027940571308, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7255859375, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 245.46875, "epoch": 0.9211087420042644, "grad_norm": 2.772460460662842, "kl": 1.4560546875, "learning_rate": 3.603713930414676e-07, "loss": -0.0346, "reward": 0.7451171875, "reward_std": 0.1310195019468665, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7177734375, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 276.265625, "epoch": 0.9253731343283582, "grad_norm": 2.068373918533325, "kl": 1.5322265625, "learning_rate": 3.2165251549333585e-07, "loss": -0.0333, "reward": 0.765625, "reward_std": 0.15018462389707565, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 274.9609375, "epoch": 0.929637526652452, "grad_norm": 1.8721359968185425, "kl": 0.88037109375, "learning_rate": 2.8509961707132496e-07, "loss": -0.0266, "reward": 0.8369140625, "reward_std": 0.20388219691812992, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7197265625, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 268.0234375, "epoch": 0.9339019189765458, "grad_norm": 7.492040157318115, "kl": 1.865234375, "learning_rate": 2.507208781817638e-07, "loss": -0.0152, "reward": 0.859375, "reward_std": 0.2039647325873375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 247.60546875, "epoch": 0.9381663113006397, "grad_norm": 1.1414939165115356, "kl": 0.890625, "learning_rate": 2.1852399266194312e-07, "loss": -0.0075, "reward": 0.8203125, "reward_std": 0.19437766447663307, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 252.56640625, "epoch": 0.9424307036247335, "grad_norm": 1.324097990989685, "kl": 0.7802734375, "learning_rate": 1.885161660582746e-07, "loss": -0.0435, "reward": 0.7861328125, "reward_std": 0.1638173609972, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7275390625, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 234.42578125, "epoch": 0.9466950959488273, "grad_norm": 1.6171019077301025, "kl": 1.47265625, "learning_rate": 1.6070411401370335e-07, "loss": -0.0326, "reward": 0.771484375, "reward_std": 0.17419602535665035, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708984375, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 260.5859375, "epoch": 0.9509594882729211, "grad_norm": 4.242193698883057, "kl": 0.716552734375, "learning_rate": 1.350940607647866e-07, "loss": 0.0139, "reward": 0.822265625, "reward_std": 0.16951362788677216, "rewards/accuracy_reward": 0.08984375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 259.80078125, "epoch": 0.9552238805970149, "grad_norm": 5.544849395751953, "kl": 1.04345703125, "learning_rate": 1.1169173774871478e-07, "loss": 0.0055, "reward": 0.8037109375, "reward_std": 0.17775351367890835, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7216796875, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 272.6328125, "epoch": 0.9594882729211087, "grad_norm": 0.6265246868133545, "kl": 0.630859375, "learning_rate": 9.0502382320653e-08, "loss": -0.0349, "reward": 0.9287109375, "reward_std": 0.25279103592038155, "rewards/accuracy_reward": 0.19921875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7294921875, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 240.05078125, "epoch": 0.9637526652452025, "grad_norm": 1.7309554815292358, "kl": 1.01171875, "learning_rate": 7.153073658162646e-08, "loss": -0.0208, "reward": 0.7783203125, "reward_std": 0.1793037187308073, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7158203125, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 239.66796875, "epoch": 0.9680170575692963, "grad_norm": 1.2242202758789062, "kl": 0.91845703125, "learning_rate": 5.4781046317267103e-08, "loss": -0.0061, "reward": 0.8173828125, "reward_std": 0.20180584490299225, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7236328125, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 275.390625, "epoch": 0.9722814498933902, "grad_norm": 0.6100110411643982, "kl": 0.55224609375, "learning_rate": 4.025706004760932e-08, "loss": -0.0347, "reward": 0.8251953125, "reward_std": 0.15769800543785095, "rewards/accuracy_reward": 0.08984375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7353515625, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 251.50390625, "epoch": 0.976545842217484, "grad_norm": 1.7776597738265991, "kl": 0.861328125, "learning_rate": 2.796202818819871e-08, "loss": -0.0023, "reward": 0.85546875, "reward_std": 0.22671574354171753, "rewards/accuracy_reward": 0.12890625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 242.7734375, "epoch": 0.9808102345415778, "grad_norm": 1.0293753147125244, "kl": 0.8193359375, "learning_rate": 1.7898702322648453e-08, "loss": -0.0337, "reward": 0.828125, "reward_std": 0.16718050092458725, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.72265625, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 265.5546875, "epoch": 0.9850746268656716, "grad_norm": 1.5529704093933105, "kl": 1.0205078125, "learning_rate": 1.0069334586854106e-08, "loss": -0.0289, "reward": 0.83203125, "reward_std": 0.16101082926616073, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 254.5, "epoch": 0.9893390191897654, "grad_norm": 2.5354487895965576, "kl": 1.244140625, "learning_rate": 4.475677164966774e-09, "loss": -0.0043, "reward": 0.810546875, "reward_std": 0.18945523723959923, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.724609375, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 251.34765625, "epoch": 0.9936034115138592, "grad_norm": 1.5713036060333252, "kl": 1.1103515625, "learning_rate": 1.1189818972656697e-09, "loss": -0.0032, "reward": 0.8720703125, "reward_std": 0.26588882878422737, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7158203125, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 269.0000114440918, "epoch": 0.997867803837953, "grad_norm": 1.1278139352798462, "kl": 1.296875, "learning_rate": 0.0, "loss": -0.0335, "reward": 0.8466796875, "reward_std": 0.23463162407279015, "rewards/accuracy_reward": 0.12890625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7177734375, "step": 234 }, { "epoch": 0.997867803837953, "step": 234, "total_flos": 0.0, "train_loss": 0.3048181866761297, "train_runtime": 9320.4205, "train_samples_per_second": 0.805, "train_steps_per_second": 0.025 } ], "logging_steps": 1, "max_steps": 234, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }