Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
Evangelinejy's picture
Model save
4cef8a8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.997867803837953,
"eval_steps": 500,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 576.5390625,
"epoch": 0.0042643923240938165,
"grad_norm": 0.32528209686279297,
"kl": 0.0,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0286,
"reward": 0.2578125,
"reward_std": 0.40137775242328644,
"rewards/accuracy_reward": 0.18359375,
"rewards/format_reward": 0.015625,
"rewards/tag_count_reward": 0.05859375,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 547.66796875,
"epoch": 0.008528784648187633,
"grad_norm": 0.43932273983955383,
"kl": 0.0,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0028,
"reward": 0.265625,
"reward_std": 0.40303920209407806,
"rewards/accuracy_reward": 0.17578125,
"rewards/format_reward": 0.02734375,
"rewards/tag_count_reward": 0.0625,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 484.96484375,
"epoch": 0.01279317697228145,
"grad_norm": 0.4568934440612793,
"kl": 0.0001436471939086914,
"learning_rate": 2.5e-06,
"loss": 0.0148,
"reward": 0.3037109375,
"reward_std": 0.41709331423044205,
"rewards/accuracy_reward": 0.203125,
"rewards/format_reward": 0.01953125,
"rewards/tag_count_reward": 0.0810546875,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 469.796875,
"epoch": 0.017057569296375266,
"grad_norm": 0.5138208866119385,
"kl": 0.0001804828643798828,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0222,
"reward": 0.3076171875,
"reward_std": 0.4279475286602974,
"rewards/accuracy_reward": 0.1953125,
"rewards/format_reward": 0.03515625,
"rewards/tag_count_reward": 0.0771484375,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 498.140625,
"epoch": 0.021321961620469083,
"grad_norm": 0.4469239115715027,
"kl": 0.0010051727294921875,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0029,
"reward": 0.318359375,
"reward_std": 0.44843800365924835,
"rewards/accuracy_reward": 0.18359375,
"rewards/format_reward": 0.02734375,
"rewards/tag_count_reward": 0.107421875,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 457.3125,
"epoch": 0.0255863539445629,
"grad_norm": 0.7152092456817627,
"kl": 0.029693603515625,
"learning_rate": 5e-06,
"loss": 0.0403,
"reward": 0.5029296875,
"reward_std": 0.6177150011062622,
"rewards/accuracy_reward": 0.171875,
"rewards/format_reward": 0.10546875,
"rewards/tag_count_reward": 0.2255859375,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 454.74609375,
"epoch": 0.029850746268656716,
"grad_norm": 2.3532168865203857,
"kl": 0.09393310546875,
"learning_rate": 5.833333333333334e-06,
"loss": 0.048,
"reward": 0.564453125,
"reward_std": 0.6538278013467789,
"rewards/accuracy_reward": 0.2578125,
"rewards/format_reward": 0.10546875,
"rewards/tag_count_reward": 0.201171875,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 492.9765625,
"epoch": 0.03411513859275053,
"grad_norm": 0.46021807193756104,
"kl": 0.016357421875,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0541,
"reward": 0.5888671875,
"reward_std": 0.592596247792244,
"rewards/accuracy_reward": 0.19921875,
"rewards/format_reward": 0.15234375,
"rewards/tag_count_reward": 0.2373046875,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 478.79296875,
"epoch": 0.03837953091684435,
"grad_norm": 2.5319058895111084,
"kl": 0.0784912109375,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0801,
"reward": 0.71875,
"reward_std": 0.6128444075584412,
"rewards/accuracy_reward": 0.3515625,
"rewards/format_reward": 0.140625,
"rewards/tag_count_reward": 0.2265625,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 563.734375,
"epoch": 0.042643923240938165,
"grad_norm": 1.262290596961975,
"kl": 0.0523223876953125,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0363,
"reward": 0.5009765625,
"reward_std": 0.5229385495185852,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.078125,
"rewards/tag_count_reward": 0.1259765625,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 499.44140625,
"epoch": 0.046908315565031986,
"grad_norm": 1.3580890893936157,
"kl": 0.0596923828125,
"learning_rate": 9.166666666666666e-06,
"loss": 0.11,
"reward": 0.681640625,
"reward_std": 0.6728685200214386,
"rewards/accuracy_reward": 0.23828125,
"rewards/format_reward": 0.18359375,
"rewards/tag_count_reward": 0.259765625,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 567.75390625,
"epoch": 0.0511727078891258,
"grad_norm": 4.042412281036377,
"kl": 0.047576904296875,
"learning_rate": 1e-05,
"loss": 0.0732,
"reward": 0.521484375,
"reward_std": 0.5327698737382889,
"rewards/accuracy_reward": 0.27734375,
"rewards/format_reward": 0.08203125,
"rewards/tag_count_reward": 0.162109375,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 559.421875,
"epoch": 0.05543710021321962,
"grad_norm": 0.3716038763523102,
"kl": 0.0142669677734375,
"learning_rate": 1.0833333333333334e-05,
"loss": 0.1328,
"reward": 0.599609375,
"reward_std": 0.6229686141014099,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.14453125,
"rewards/tag_count_reward": 0.220703125,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 437.3828125,
"epoch": 0.05970149253731343,
"grad_norm": 2.6377227306365967,
"kl": 0.119598388671875,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.1456,
"reward": 0.798828125,
"reward_std": 0.7079743444919586,
"rewards/accuracy_reward": 0.23046875,
"rewards/format_reward": 0.21875,
"rewards/tag_count_reward": 0.349609375,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 434.79296875,
"epoch": 0.06396588486140725,
"grad_norm": 0.4819924831390381,
"kl": 0.035736083984375,
"learning_rate": 1.25e-05,
"loss": 0.0931,
"reward": 0.8818359375,
"reward_std": 0.6916099190711975,
"rewards/accuracy_reward": 0.20703125,
"rewards/format_reward": 0.234375,
"rewards/tag_count_reward": 0.4404296875,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 324.69921875,
"epoch": 0.06823027718550106,
"grad_norm": 1.3913614749908447,
"kl": 0.0682373046875,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.2251,
"reward": 1.1982421875,
"reward_std": 0.7941954433917999,
"rewards/accuracy_reward": 0.0859375,
"rewards/format_reward": 0.4375,
"rewards/tag_count_reward": 0.6748046875,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 299.75,
"epoch": 0.07249466950959488,
"grad_norm": 0.9118645787239075,
"kl": 0.0875244140625,
"learning_rate": 1.416666666666667e-05,
"loss": 0.2379,
"reward": 1.59765625,
"reward_std": 0.6924279183149338,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.65625,
"rewards/tag_count_reward": 0.828125,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 220.4921875,
"epoch": 0.0767590618336887,
"grad_norm": 1.340067982673645,
"kl": 0.133056640625,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.1821,
"reward": 1.8427734375,
"reward_std": 0.4516802802681923,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.84375,
"rewards/tag_count_reward": 0.9287109375,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 182.53515625,
"epoch": 0.08102345415778252,
"grad_norm": 3.030550003051758,
"kl": 0.4296875,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.0898,
"reward": 1.986328125,
"reward_std": 0.22909418493509293,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.95703125,
"rewards/tag_count_reward": 0.958984375,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 214.65234375,
"epoch": 0.08528784648187633,
"grad_norm": 1.1024236679077148,
"kl": 0.1187744140625,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.1787,
"reward": 2.0078125,
"reward_std": 0.170526759698987,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.98046875,
"rewards/tag_count_reward": 0.9609375,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 208.2421875,
"epoch": 0.08955223880597014,
"grad_norm": 22.709352493286133,
"kl": 1.359375,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.1298,
"reward": 1.9345703125,
"reward_std": 0.32205624878406525,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.91015625,
"rewards/tag_count_reward": 0.9619140625,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 253.96484375,
"epoch": 0.09381663113006397,
"grad_norm": 8.92030143737793,
"kl": 1.1494140625,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.1757,
"reward": 1.4931640625,
"reward_std": 0.6142828911542892,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.57421875,
"rewards/tag_count_reward": 0.8720703125,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 217.625,
"epoch": 0.09808102345415778,
"grad_norm": 3.518479585647583,
"kl": 0.38134765625,
"learning_rate": 1.916666666666667e-05,
"loss": 0.0203,
"reward": 1.0087890625,
"reward_std": 0.44057436287403107,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.171875,
"rewards/tag_count_reward": 0.7861328125,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 237.28125,
"epoch": 0.1023454157782516,
"grad_norm": 1.0693765878677368,
"kl": 0.19775390625,
"learning_rate": 2e-05,
"loss": 0.0202,
"reward": 1.134765625,
"reward_std": 0.5518650561571121,
"rewards/accuracy_reward": 0.0234375,
"rewards/format_reward": 0.296875,
"rewards/tag_count_reward": 0.814453125,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 295.78515625,
"epoch": 0.10660980810234541,
"grad_norm": 9.523970603942871,
"kl": 0.23779296875,
"learning_rate": 1.9998881018102735e-05,
"loss": 0.1602,
"reward": 1.501953125,
"reward_std": 0.6455793529748917,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.50390625,
"rewards/tag_count_reward": 0.896484375,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 345.72265625,
"epoch": 0.11087420042643924,
"grad_norm": 24690.228515625,
"kl": 121.465576171875,
"learning_rate": 1.9995524322835035e-05,
"loss": 6.9741,
"reward": 1.5634765625,
"reward_std": 0.648356705904007,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.58984375,
"rewards/tag_count_reward": 0.8955078125,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 452.1953125,
"epoch": 0.11513859275053305,
"grad_norm": 114.83018493652344,
"kl": 1.1298828125,
"learning_rate": 1.9989930665413148e-05,
"loss": 0.3668,
"reward": 1.2841796875,
"reward_std": 0.7657907009124756,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.453125,
"rewards/tag_count_reward": 0.7802734375,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 550.98828125,
"epoch": 0.11940298507462686,
"grad_norm": 32.12217330932617,
"kl": 0.9326171875,
"learning_rate": 1.998210129767735e-05,
"loss": 0.3607,
"reward": 1.0625,
"reward_std": 0.7528532892465591,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.328125,
"rewards/tag_count_reward": 0.68359375,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 534.66796875,
"epoch": 0.12366737739872068,
"grad_norm": 31.25490951538086,
"kl": 0.796875,
"learning_rate": 1.9972037971811802e-05,
"loss": 0.1675,
"reward": 0.6337890625,
"reward_std": 0.5352872237563133,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.109375,
"rewards/tag_count_reward": 0.4892578125,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 492.50390625,
"epoch": 0.1279317697228145,
"grad_norm": 7.399560928344727,
"kl": 0.8779296875,
"learning_rate": 1.9959742939952393e-05,
"loss": 0.0607,
"reward": 0.4462890625,
"reward_std": 0.32183003425598145,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.03515625,
"rewards/tag_count_reward": 0.3916015625,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 430.3125,
"epoch": 0.13219616204690832,
"grad_norm": 391.785888671875,
"kl": 8.494140625,
"learning_rate": 1.9945218953682736e-05,
"loss": 0.5494,
"reward": 0.4248046875,
"reward_std": 0.2781025320291519,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0234375,
"rewards/tag_count_reward": 0.3857421875,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 404.62890625,
"epoch": 0.13646055437100213,
"grad_norm": 199.300048828125,
"kl": 2.603515625,
"learning_rate": 1.9928469263418376e-05,
"loss": 0.235,
"reward": 0.3564453125,
"reward_std": 0.19321707263588905,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.0078125,
"rewards/tag_count_reward": 0.3447265625,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 432.109375,
"epoch": 0.14072494669509594,
"grad_norm": 3538.403564453125,
"kl": 31.28515625,
"learning_rate": 1.990949761767935e-05,
"loss": 2.1404,
"reward": 0.3896484375,
"reward_std": 0.24749910086393356,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.01953125,
"rewards/tag_count_reward": 0.3583984375,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 464.28515625,
"epoch": 0.14498933901918976,
"grad_norm": 111.8864517211914,
"kl": 1.9296875,
"learning_rate": 1.9888308262251286e-05,
"loss": 0.1906,
"reward": 0.345703125,
"reward_std": 0.1697397418320179,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.333984375,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 510.56640625,
"epoch": 0.14925373134328357,
"grad_norm": 14.984077453613281,
"kl": 1.08984375,
"learning_rate": 1.9864905939235215e-05,
"loss": 0.0739,
"reward": 0.3662109375,
"reward_std": 0.19745982438325882,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.00390625,
"rewards/tag_count_reward": 0.3427734375,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 547.35546875,
"epoch": 0.1535181236673774,
"grad_norm": 137.8433380126953,
"kl": 1.69921875,
"learning_rate": 1.98392958859863e-05,
"loss": 0.0645,
"reward": 0.3623046875,
"reward_std": 0.21899614110589027,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.0078125,
"rewards/tag_count_reward": 0.3427734375,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 647.1953125,
"epoch": 0.15778251599147122,
"grad_norm": 45.083709716796875,
"kl": 2.125,
"learning_rate": 1.9811483833941726e-05,
"loss": 0.132,
"reward": 0.3369140625,
"reward_std": 0.17930956557393074,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.00390625,
"rewards/tag_count_reward": 0.3251953125,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 651.08984375,
"epoch": 0.16204690831556504,
"grad_norm": 35.52851104736328,
"kl": 1.908203125,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.098,
"reward": 0.345703125,
"reward_std": 0.23194141685962677,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.00390625,
"rewards/tag_count_reward": 0.330078125,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 648.5234375,
"epoch": 0.16631130063965885,
"grad_norm": 22.79519271850586,
"kl": 2.15625,
"learning_rate": 1.9749279121818235e-05,
"loss": 0.058,
"reward": 0.3427734375,
"reward_std": 0.21174855902791023,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.00390625,
"rewards/tag_count_reward": 0.3349609375,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 725.0078125,
"epoch": 0.17057569296375266,
"grad_norm": 1158.721923828125,
"kl": 26.59375,
"learning_rate": 1.9714900382928674e-05,
"loss": 1.3132,
"reward": 0.3447265625,
"reward_std": 0.24672244489192963,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.00390625,
"rewards/tag_count_reward": 0.3212890625,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 674.56640625,
"epoch": 0.17484008528784648,
"grad_norm": 10.722182273864746,
"kl": 2.39453125,
"learning_rate": 1.9678347484506667e-05,
"loss": 0.0955,
"reward": 0.3056640625,
"reward_std": 0.2298230677843094,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2900390625,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 716.73046875,
"epoch": 0.1791044776119403,
"grad_norm": 22.29323387145996,
"kl": 2.66015625,
"learning_rate": 1.9639628606958535e-05,
"loss": 0.1796,
"reward": 0.2939453125,
"reward_std": 0.2149362936615944,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2939453125,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 770.56640625,
"epoch": 0.18336886993603413,
"grad_norm": 5175.0302734375,
"kl": 168.125,
"learning_rate": 1.9598752415428893e-05,
"loss": 8.1192,
"reward": 0.3203125,
"reward_std": 0.19998998567461967,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3203125,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 815.19140625,
"epoch": 0.18763326226012794,
"grad_norm": 59.13957977294922,
"kl": 2.7734375,
"learning_rate": 1.955572805786141e-05,
"loss": 0.1392,
"reward": 0.3330078125,
"reward_std": 0.19636105746030807,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3330078125,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 823.13671875,
"epoch": 0.19189765458422176,
"grad_norm": 56.28653335571289,
"kl": 2.48046875,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.1406,
"reward": 0.3310546875,
"reward_std": 0.20274027064442635,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3310546875,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 809.56640625,
"epoch": 0.19616204690831557,
"grad_norm": 13.485371589660645,
"kl": 2.076171875,
"learning_rate": 1.9463273837991643e-05,
"loss": 0.1466,
"reward": 0.3271484375,
"reward_std": 0.2578311152756214,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.0078125,
"rewards/tag_count_reward": 0.3115234375,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 839.78125,
"epoch": 0.20042643923240938,
"grad_norm": 6.2021918296813965,
"kl": 1.67578125,
"learning_rate": 1.9413864666609036e-05,
"loss": 0.1194,
"reward": 0.3603515625,
"reward_std": 0.21874134615063667,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0078125,
"rewards/tag_count_reward": 0.3525390625,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 935.88671875,
"epoch": 0.2046908315565032,
"grad_norm": 42.63424301147461,
"kl": 1.73828125,
"learning_rate": 1.9362348706397374e-05,
"loss": 0.1158,
"reward": 0.3427734375,
"reward_std": 0.23805152624845505,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3349609375,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 923.13671875,
"epoch": 0.208955223880597,
"grad_norm": 20.886306762695312,
"kl": 1.84765625,
"learning_rate": 1.9308737486442045e-05,
"loss": 0.0876,
"reward": 0.365234375,
"reward_std": 0.24175361543893814,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.0078125,
"rewards/tag_count_reward": 0.349609375,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 913.77734375,
"epoch": 0.21321961620469082,
"grad_norm": 1.078397512435913,
"kl": 0.9013671875,
"learning_rate": 1.9253043004739967e-05,
"loss": 0.0613,
"reward": 0.3681640625,
"reward_std": 0.2707056328654289,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0234375,
"rewards/tag_count_reward": 0.3447265625,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 924.3671875,
"epoch": 0.21748400852878466,
"grad_norm": 139370.5625,
"kl": 772.5048828125,
"learning_rate": 1.919527772551451e-05,
"loss": 34.5749,
"reward": 0.3916015625,
"reward_std": 0.2611350491642952,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.01953125,
"rewards/tag_count_reward": 0.3681640625,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 966.453125,
"epoch": 0.22174840085287847,
"grad_norm": 2.575350284576416,
"kl": 0.6396484375,
"learning_rate": 1.913545457642601e-05,
"loss": 0.0048,
"reward": 0.3994140625,
"reward_std": 0.26176000386476517,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.015625,
"rewards/tag_count_reward": 0.3759765625,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 935.90625,
"epoch": 0.2260127931769723,
"grad_norm": 0.6655358672142029,
"kl": 0.625,
"learning_rate": 1.907358694567865e-05,
"loss": 0.0332,
"reward": 0.3818359375,
"reward_std": 0.30670569837093353,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0234375,
"rewards/tag_count_reward": 0.3427734375,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 981.7109375,
"epoch": 0.2302771855010661,
"grad_norm": 0.49440306425094604,
"kl": 1.32421875,
"learning_rate": 1.900968867902419e-05,
"loss": 0.05,
"reward": 0.37890625,
"reward_std": 0.30825207754969597,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.04296875,
"rewards/tag_count_reward": 0.33203125,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1006.4765625,
"epoch": 0.2345415778251599,
"grad_norm": 0.6174039840698242,
"kl": 3.51171875,
"learning_rate": 1.8943774076663372e-05,
"loss": 0.1368,
"reward": 0.55078125,
"reward_std": 0.45677174627780914,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.09375,
"rewards/tag_count_reward": 0.421875,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 993.33984375,
"epoch": 0.23880597014925373,
"grad_norm": 0.6529553532600403,
"kl": 4.8984375,
"learning_rate": 1.8875857890045544e-05,
"loss": 0.1925,
"reward": 0.7568359375,
"reward_std": 0.5871296375989914,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.171875,
"rewards/tag_count_reward": 0.5654296875,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 1012.4765625,
"epoch": 0.24307036247334754,
"grad_norm": 133.8551788330078,
"kl": 4.53515625,
"learning_rate": 1.880595531856738e-05,
"loss": 0.1758,
"reward": 0.806640625,
"reward_std": 0.6075598150491714,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.22265625,
"rewards/tag_count_reward": 0.548828125,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 996.9453125,
"epoch": 0.24733475479744135,
"grad_norm": 0.891619861125946,
"kl": 2.80078125,
"learning_rate": 1.87340820061713e-05,
"loss": 0.0914,
"reward": 0.732421875,
"reward_std": 0.6083860993385315,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.2265625,
"rewards/tag_count_reward": 0.498046875,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 967.28515625,
"epoch": 0.2515991471215352,
"grad_norm": 1.4940592050552368,
"kl": 3.5234375,
"learning_rate": 1.866025403784439e-05,
"loss": 0.1145,
"reward": 0.79296875,
"reward_std": 0.6623349040746689,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.2890625,
"rewards/tag_count_reward": 0.5,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 911.89453125,
"epoch": 0.255863539445629,
"grad_norm": 25.662094116210938,
"kl": 4.6328125,
"learning_rate": 1.8584487936018663e-05,
"loss": 0.1482,
"reward": 0.9228515625,
"reward_std": 0.7258684784173965,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.33203125,
"rewards/tag_count_reward": 0.5400390625,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 838.140625,
"epoch": 0.2601279317697228,
"grad_norm": 23.566726684570312,
"kl": 6.2734375,
"learning_rate": 1.8506800656873397e-05,
"loss": 0.14,
"reward": 0.90625,
"reward_std": 0.6905761212110519,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.32421875,
"rewards/tag_count_reward": 0.53515625,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 593.88671875,
"epoch": 0.26439232409381663,
"grad_norm": 9.581720352172852,
"kl": 5.3984375,
"learning_rate": 1.8427209586540392e-05,
"loss": 0.0744,
"reward": 0.966796875,
"reward_std": 0.7168334871530533,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.37109375,
"rewards/tag_count_reward": 0.560546875,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 403.3515625,
"epoch": 0.26865671641791045,
"grad_norm": 3.977918863296509,
"kl": 5.6328125,
"learning_rate": 1.834573253721303e-05,
"loss": 0.0664,
"reward": 0.9931640625,
"reward_std": 0.7101524770259857,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.3671875,
"rewards/tag_count_reward": 0.5908203125,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 285.90625,
"epoch": 0.27292110874200426,
"grad_norm": 3.9532861709594727,
"kl": 4.4375,
"learning_rate": 1.826238774315995e-05,
"loss": -0.0383,
"reward": 1.2724609375,
"reward_std": 0.7493992298841476,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.5703125,
"rewards/tag_count_reward": 0.6552734375,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 219.54296875,
"epoch": 0.2771855010660981,
"grad_norm": 9.081878662109375,
"kl": 5.0,
"learning_rate": 1.8177193856644315e-05,
"loss": 0.029,
"reward": 1.5458984375,
"reward_std": 0.7314303368330002,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.69921875,
"rewards/tag_count_reward": 0.8076171875,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 172.1484375,
"epoch": 0.2814498933901919,
"grad_norm": 123.24443817138672,
"kl": 8.5546875,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.1148,
"reward": 1.3759765625,
"reward_std": 0.7652620077133179,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.6015625,
"rewards/tag_count_reward": 0.7705078125,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 155.49609375,
"epoch": 0.2857142857142857,
"grad_norm": 6.339594841003418,
"kl": 5.8203125,
"learning_rate": 1.8001335480112067e-05,
"loss": 0.0783,
"reward": 1.376953125,
"reward_std": 0.6871647387742996,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.56640625,
"rewards/tag_count_reward": 0.810546875,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 154.2734375,
"epoch": 0.2899786780383795,
"grad_norm": 181.54469299316406,
"kl": 9.875,
"learning_rate": 1.7910710346563417e-05,
"loss": 0.2282,
"reward": 1.689453125,
"reward_std": 0.574043981730938,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.80078125,
"rewards/tag_count_reward": 0.869140625,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 156.546875,
"epoch": 0.2942430703624733,
"grad_norm": 23.90792465209961,
"kl": 4.12890625,
"learning_rate": 1.78183148246803e-05,
"loss": -0.0867,
"reward": 1.49609375,
"reward_std": 0.7586368173360825,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.703125,
"rewards/tag_count_reward": 0.78125,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 154.2109375,
"epoch": 0.29850746268656714,
"grad_norm": 6.664966106414795,
"kl": 4.35546875,
"learning_rate": 1.7724169592245996e-05,
"loss": -0.1123,
"reward": 1.3564453125,
"reward_std": 0.7493429481983185,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.62890625,
"rewards/tag_count_reward": 0.7275390625,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 135.6171875,
"epoch": 0.302771855010661,
"grad_norm": 520.9791259765625,
"kl": 14.2890625,
"learning_rate": 1.7628295718622666e-05,
"loss": 0.2477,
"reward": 1.4765625,
"reward_std": 0.7755448371171951,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.67578125,
"rewards/tag_count_reward": 0.765625,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 137.9296875,
"epoch": 0.3070362473347548,
"grad_norm": 9.297532081604004,
"kl": 4.171875,
"learning_rate": 1.7530714660036112e-05,
"loss": -0.0591,
"reward": 1.576171875,
"reward_std": 0.702255368232727,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.7265625,
"rewards/tag_count_reward": 0.810546875,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 129.53515625,
"epoch": 0.31130063965884863,
"grad_norm": 481.652099609375,
"kl": 26.50390625,
"learning_rate": 1.7431448254773943e-05,
"loss": 0.4083,
"reward": 1.6875,
"reward_std": 0.5314841717481613,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.8125,
"rewards/tag_count_reward": 0.87109375,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 138.23046875,
"epoch": 0.31556503198294245,
"grad_norm": 7.763786315917969,
"kl": 4.296875,
"learning_rate": 1.7330518718298263e-05,
"loss": 0.0646,
"reward": 1.7587890625,
"reward_std": 0.5207200050354004,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.83203125,
"rewards/tag_count_reward": 0.8955078125,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 128.81640625,
"epoch": 0.31982942430703626,
"grad_norm": 17.239259719848633,
"kl": 5.734375,
"learning_rate": 1.7227948638273918e-05,
"loss": 0.135,
"reward": 1.919921875,
"reward_std": 0.3152705281972885,
"rewards/accuracy_reward": 0.04296875,
"rewards/format_reward": 0.921875,
"rewards/tag_count_reward": 0.955078125,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 111.79296875,
"epoch": 0.32409381663113007,
"grad_norm": 8.510507583618164,
"kl": 4.6171875,
"learning_rate": 1.712376096951345e-05,
"loss": 0.117,
"reward": 1.900390625,
"reward_std": 0.32303596287965775,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.91796875,
"rewards/tag_count_reward": 0.943359375,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 124.20703125,
"epoch": 0.3283582089552239,
"grad_norm": 13.03882884979248,
"kl": 5.109375,
"learning_rate": 1.7017979028839918e-05,
"loss": 0.1378,
"reward": 1.8974609375,
"reward_std": 0.3129582107067108,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.91796875,
"rewards/tag_count_reward": 0.9482421875,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 114.91015625,
"epoch": 0.3326226012793177,
"grad_norm": 14.898639678955078,
"kl": 5.84375,
"learning_rate": 1.691062648986865e-05,
"loss": 0.1679,
"reward": 1.8828125,
"reward_std": 0.28449319303035736,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.92578125,
"rewards/tag_count_reward": 0.953125,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 116.80859375,
"epoch": 0.3368869936034115,
"grad_norm": 41.616432189941406,
"kl": 6.03125,
"learning_rate": 1.6801727377709195e-05,
"loss": 0.1764,
"reward": 1.939453125,
"reward_std": 0.24111925438046455,
"rewards/accuracy_reward": 0.0234375,
"rewards/format_reward": 0.9453125,
"rewards/tag_count_reward": 0.970703125,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 178.0546875,
"epoch": 0.3411513859275053,
"grad_norm": 40.391815185546875,
"kl": 7.078125,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.1877,
"reward": 1.896484375,
"reward_std": 0.24799961294047534,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.94140625,
"rewards/tag_count_reward": 0.955078125,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 207.1484375,
"epoch": 0.34541577825159914,
"grad_norm": 53.5943489074707,
"kl": 7.96875,
"learning_rate": 1.657938725939713e-05,
"loss": 0.2265,
"reward": 1.8984375,
"reward_std": 0.25493185594677925,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.94140625,
"rewards/tag_count_reward": 0.953125,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 244.5625,
"epoch": 0.34968017057569295,
"grad_norm": 9.949625015258789,
"kl": 6.1015625,
"learning_rate": 1.6465996012157996e-05,
"loss": 0.1384,
"reward": 1.8525390625,
"reward_std": 0.3406095430254936,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.9140625,
"rewards/tag_count_reward": 0.9384765625,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 232.71875,
"epoch": 0.35394456289978676,
"grad_norm": 11.608429908752441,
"kl": 5.5390625,
"learning_rate": 1.635115769842179e-05,
"loss": 0.1642,
"reward": 1.8603515625,
"reward_std": 0.319538950920105,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.90234375,
"rewards/tag_count_reward": 0.9541015625,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 196.1328125,
"epoch": 0.3582089552238806,
"grad_norm": 7.774857044219971,
"kl": 5.359375,
"learning_rate": 1.6234898018587336e-05,
"loss": 0.1484,
"reward": 1.841796875,
"reward_std": 0.35672812163829803,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.89453125,
"rewards/tag_count_reward": 0.947265625,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 184.19921875,
"epoch": 0.3624733475479744,
"grad_norm": 5.315440654754639,
"kl": 5.0234375,
"learning_rate": 1.6117242991150064e-05,
"loss": 0.1525,
"reward": 1.9189453125,
"reward_std": 0.21933256834745407,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.94921875,
"rewards/tag_count_reward": 0.9619140625,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 193.97265625,
"epoch": 0.36673773987206826,
"grad_norm": 5.736013889312744,
"kl": 5.515625,
"learning_rate": 1.599821894687914e-05,
"loss": 0.1875,
"reward": 1.9306640625,
"reward_std": 0.20439787581562996,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.94921875,
"rewards/tag_count_reward": 0.9736328125,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 297.3984375,
"epoch": 0.37100213219616207,
"grad_norm": 8.90512752532959,
"kl": 5.5703125,
"learning_rate": 1.5877852522924733e-05,
"loss": 0.1698,
"reward": 1.91015625,
"reward_std": 0.33383994549512863,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.91796875,
"rewards/tag_count_reward": 0.953125,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 293.015625,
"epoch": 0.3752665245202559,
"grad_norm": 102.06912231445312,
"kl": 10.2578125,
"learning_rate": 1.575617065685674e-05,
"loss": 0.3011,
"reward": 1.890625,
"reward_std": 0.31814195960760117,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.90234375,
"rewards/tag_count_reward": 0.953125,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 336.6796875,
"epoch": 0.3795309168443497,
"grad_norm": 108.87593841552734,
"kl": 11.6640625,
"learning_rate": 1.563320058063622e-05,
"loss": 0.2676,
"reward": 1.8369140625,
"reward_std": 0.38644537329673767,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.8671875,
"rewards/tag_count_reward": 0.9345703125,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 409.29296875,
"epoch": 0.3837953091684435,
"grad_norm": 35.86373519897461,
"kl": 8.4296875,
"learning_rate": 1.5508969814521026e-05,
"loss": 0.2346,
"reward": 1.8154296875,
"reward_std": 0.4089268818497658,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.87109375,
"rewards/tag_count_reward": 0.9404296875,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 419.03125,
"epoch": 0.3880597014925373,
"grad_norm": 4.703104496002197,
"kl": 5.8359375,
"learning_rate": 1.5383506160906826e-05,
"loss": 0.1736,
"reward": 1.8583984375,
"reward_std": 0.37071677297353745,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.88671875,
"rewards/tag_count_reward": 0.9521484375,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 498.890625,
"epoch": 0.39232409381663114,
"grad_norm": 15.804770469665527,
"kl": 6.359375,
"learning_rate": 1.5256837698105047e-05,
"loss": 0.2056,
"reward": 1.896484375,
"reward_std": 0.264212965965271,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.921875,
"rewards/tag_count_reward": 0.962890625,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 541.484375,
"epoch": 0.39658848614072495,
"grad_norm": 43.44738006591797,
"kl": 7.046875,
"learning_rate": 1.5128992774059063e-05,
"loss": 0.1825,
"reward": 1.84375,
"reward_std": 0.37193765491247177,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.890625,
"rewards/tag_count_reward": 0.94140625,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 690.60546875,
"epoch": 0.40085287846481876,
"grad_norm": 3.9883878231048584,
"kl": 5.6875,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.1577,
"reward": 1.796875,
"reward_std": 0.449543721973896,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.8515625,
"rewards/tag_count_reward": 0.9140625,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 710.62890625,
"epoch": 0.4051172707889126,
"grad_norm": 13.03452205657959,
"kl": 5.1953125,
"learning_rate": 1.4869888244043674e-05,
"loss": 0.1824,
"reward": 1.794921875,
"reward_std": 0.44430477917194366,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.859375,
"rewards/tag_count_reward": 0.927734375,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 679.2578125,
"epoch": 0.4093816631130064,
"grad_norm": 4.490772724151611,
"kl": 5.296875,
"learning_rate": 1.4738686624729987e-05,
"loss": 0.1653,
"reward": 1.80859375,
"reward_std": 0.35829880461096764,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.8515625,
"rewards/tag_count_reward": 0.9296875,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 880.9765625,
"epoch": 0.4136460554371002,
"grad_norm": 150.7144317626953,
"kl": 6.5390625,
"learning_rate": 1.4606424504506325e-05,
"loss": 0.2454,
"reward": 1.5869140625,
"reward_std": 0.5404268652200699,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.76953125,
"rewards/tag_count_reward": 0.7822265625,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 945.01953125,
"epoch": 0.417910447761194,
"grad_norm": 216.19607543945312,
"kl": 15.546875,
"learning_rate": 1.4473131483156326e-05,
"loss": 0.3319,
"reward": 1.4111328125,
"reward_std": 0.6219311505556107,
"rewards/accuracy_reward": 0.0234375,
"rewards/format_reward": 0.6875,
"rewards/tag_count_reward": 0.7001953125,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 988.53515625,
"epoch": 0.42217484008528783,
"grad_norm": 137.95619201660156,
"kl": 6.5,
"learning_rate": 1.4338837391175582e-05,
"loss": 0.2236,
"reward": 1.45703125,
"reward_std": 0.6083492934703827,
"rewards/accuracy_reward": 0.0234375,
"rewards/format_reward": 0.70703125,
"rewards/tag_count_reward": 0.7265625,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 985.140625,
"epoch": 0.42643923240938164,
"grad_norm": 7.844208240509033,
"kl": 1.84375,
"learning_rate": 1.4203572283095657e-05,
"loss": 0.039,
"reward": 1.4658203125,
"reward_std": 0.6321621090173721,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.69140625,
"rewards/tag_count_reward": 0.7431640625,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 992.00390625,
"epoch": 0.43070362473347545,
"grad_norm": 13.683513641357422,
"kl": 1.30078125,
"learning_rate": 1.4067366430758004e-05,
"loss": 0.0205,
"reward": 1.4462890625,
"reward_std": 0.6124080866575241,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.71875,
"rewards/tag_count_reward": 0.6806640625,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 998.76171875,
"epoch": 0.4349680170575693,
"grad_norm": 30.369285583496094,
"kl": 1.1240234375,
"learning_rate": 1.3930250316539237e-05,
"loss": 0.0296,
"reward": 1.4365234375,
"reward_std": 0.6077300161123276,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.7109375,
"rewards/tag_count_reward": 0.6982421875,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1007.33984375,
"epoch": 0.43923240938166314,
"grad_norm": 22.23171615600586,
"kl": 3.173828125,
"learning_rate": 1.3792254626529286e-05,
"loss": 0.1053,
"reward": 1.39453125,
"reward_std": 0.5931012779474258,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.68359375,
"rewards/tag_count_reward": 0.6796875,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1018.3046875,
"epoch": 0.44349680170575695,
"grad_norm": 32.721920013427734,
"kl": 2.7041015625,
"learning_rate": 1.3653410243663953e-05,
"loss": 0.1054,
"reward": 1.4375,
"reward_std": 0.5352352559566498,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.74609375,
"rewards/tag_count_reward": 0.68359375,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 983.671875,
"epoch": 0.44776119402985076,
"grad_norm": 21.012828826904297,
"kl": 2.029296875,
"learning_rate": 1.3513748240813429e-05,
"loss": 0.0617,
"reward": 1.44140625,
"reward_std": 0.5487575381994247,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.71484375,
"rewards/tag_count_reward": 0.69140625,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1009.48828125,
"epoch": 0.4520255863539446,
"grad_norm": 8.625335693359375,
"kl": 1.4072265625,
"learning_rate": 1.3373299873828303e-05,
"loss": 0.0492,
"reward": 1.4296875,
"reward_std": 0.5546326637268066,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.703125,
"rewards/tag_count_reward": 0.69140625,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1024.0,
"epoch": 0.4562899786780384,
"grad_norm": 3.2243165969848633,
"kl": 0.8115234375,
"learning_rate": 1.3232096574544602e-05,
"loss": 0.0324,
"reward": 1.462890625,
"reward_std": 0.5354997888207436,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.72265625,
"rewards/tag_count_reward": 0.701171875,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 1020.35546875,
"epoch": 0.4605543710021322,
"grad_norm": 1.6773790121078491,
"kl": 0.9384765625,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.0316,
"reward": 1.3544921875,
"reward_std": 0.6066916137933731,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.6328125,
"rewards/tag_count_reward": 0.6943359375,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 934.0625,
"epoch": 0.464818763326226,
"grad_norm": 0.25028663873672485,
"kl": 3.5703125,
"learning_rate": 1.2947551744109044e-05,
"loss": 0.1428,
"reward": 1.7275390625,
"reward_std": 0.30690931528806686,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.94140625,
"rewards/tag_count_reward": 0.7158203125,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 894.3125,
"epoch": 0.4690831556503198,
"grad_norm": 0.25236231088638306,
"kl": 4.31640625,
"learning_rate": 1.2804273893060028e-05,
"loss": 0.1724,
"reward": 1.5966796875,
"reward_std": 0.3756791800260544,
"rewards/accuracy_reward": 0.0234375,
"rewards/format_reward": 0.86328125,
"rewards/tag_count_reward": 0.7099609375,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 843.46875,
"epoch": 0.47334754797441364,
"grad_norm": 0.30303165316581726,
"kl": 4.3515625,
"learning_rate": 1.2660368455666752e-05,
"loss": 0.174,
"reward": 1.6923828125,
"reward_std": 0.36458854377269745,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.9296875,
"rewards/tag_count_reward": 0.7041015625,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 846.1875,
"epoch": 0.47761194029850745,
"grad_norm": 0.30785125494003296,
"kl": 4.8515625,
"learning_rate": 1.2515867637445088e-05,
"loss": 0.1944,
"reward": 1.65234375,
"reward_std": 0.35947033017873764,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.91015625,
"rewards/tag_count_reward": 0.7421875,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 1001.59375,
"epoch": 0.48187633262260127,
"grad_norm": 0.5503849983215332,
"kl": 3.1875,
"learning_rate": 1.2370803777154976e-05,
"loss": 0.1275,
"reward": 0.92578125,
"reward_std": 0.40457524359226227,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.171875,
"rewards/tag_count_reward": 0.73828125,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 742.28125,
"epoch": 0.4861407249466951,
"grad_norm": 0.6809885501861572,
"kl": 5.1484375,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.2059,
"reward": 1.7626953125,
"reward_std": 0.38498707860708237,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.9140625,
"rewards/tag_count_reward": 0.8095703125,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 575.97265625,
"epoch": 0.4904051172707889,
"grad_norm": 0.8168994784355164,
"kl": 4.984375,
"learning_rate": 1.2079116908177592e-05,
"loss": 0.1925,
"reward": 1.8603515625,
"reward_std": 0.43856722861528397,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.84765625,
"rewards/tag_count_reward": 0.9462890625,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 859.1875,
"epoch": 0.4946695095948827,
"grad_norm": 0.32246819138526917,
"kl": 4.65625,
"learning_rate": 1.1932559177955533e-05,
"loss": 0.1858,
"reward": 1.6337890625,
"reward_std": 0.3074583485722542,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.890625,
"rewards/tag_count_reward": 0.7314453125,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1009.09375,
"epoch": 0.4989339019189765,
"grad_norm": 0.2883855402469635,
"kl": 2.7890625,
"learning_rate": 1.1785568947986368e-05,
"loss": 0.1117,
"reward": 1.8154296875,
"reward_std": 0.2801100164651871,
"rewards/accuracy_reward": 0.12109375,
"rewards/format_reward": 0.9609375,
"rewards/tag_count_reward": 0.7333984375,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1024.0,
"epoch": 0.5031982942430704,
"grad_norm": 0.5340821146965027,
"kl": 2.125,
"learning_rate": 1.1638179114151378e-05,
"loss": 0.0849,
"reward": 1.6708984375,
"reward_std": 0.2677147090435028,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.92578125,
"rewards/tag_count_reward": 0.7255859375,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 951.5625,
"epoch": 0.5074626865671642,
"grad_norm": 0.3258584141731262,
"kl": 2.87109375,
"learning_rate": 1.1490422661761744e-05,
"loss": 0.1149,
"reward": 1.71875,
"reward_std": 0.1409970298409462,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.96484375,
"rewards/tag_count_reward": 0.7421875,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 948.03125,
"epoch": 0.511727078891258,
"grad_norm": 0.20516642928123474,
"kl": 2.56640625,
"learning_rate": 1.1342332658176556e-05,
"loss": 0.1026,
"reward": 1.7783203125,
"reward_std": 0.21998512372374535,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.96484375,
"rewards/tag_count_reward": 0.7392578125,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 949.3125,
"epoch": 0.5159914712153518,
"grad_norm": 0.15999875962734222,
"kl": 2.73828125,
"learning_rate": 1.1193942245402443e-05,
"loss": 0.1093,
"reward": 1.7880859375,
"reward_std": 0.1588208805769682,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.96484375,
"rewards/tag_count_reward": 0.7451171875,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 996.125,
"epoch": 0.5202558635394456,
"grad_norm": 0.2390127331018448,
"kl": 2.75,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.1098,
"reward": 1.76171875,
"reward_std": 0.22831767983734608,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.953125,
"rewards/tag_count_reward": 0.73046875,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 972.6796875,
"epoch": 0.5245202558635395,
"grad_norm": 4.8665361404418945,
"kl": 3.66796875,
"learning_rate": 1.0896393089034336e-05,
"loss": 0.1313,
"reward": 1.6845703125,
"reward_std": 0.36246033012866974,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.88671875,
"rewards/tag_count_reward": 0.7041015625,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 888.3046875,
"epoch": 0.5287846481876333,
"grad_norm": 3.6568827629089355,
"kl": 2.06640625,
"learning_rate": 1.0747300935864245e-05,
"loss": 0.0925,
"reward": 1.68359375,
"reward_std": 0.4343060404062271,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.8828125,
"rewards/tag_count_reward": 0.75,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 516.43359375,
"epoch": 0.5330490405117271,
"grad_norm": 0.5316474437713623,
"kl": 0.61767578125,
"learning_rate": 1.0598041539450344e-05,
"loss": 0.2445,
"reward": 1.7685546875,
"reward_std": 0.4253704324364662,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.89453125,
"rewards/tag_count_reward": 0.8349609375,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 473.5546875,
"epoch": 0.5373134328358209,
"grad_norm": 2.9844846725463867,
"kl": 0.8173828125,
"learning_rate": 1.044864830350515e-05,
"loss": 0.2749,
"reward": 1.833984375,
"reward_std": 0.523324653506279,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.84375,
"rewards/tag_count_reward": 0.884765625,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 450.921875,
"epoch": 0.5415778251599147,
"grad_norm": 0.5743687748908997,
"kl": 0.84765625,
"learning_rate": 1.0299154661693987e-05,
"loss": 0.2714,
"reward": 1.771484375,
"reward_std": 0.5503488332033157,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.83203125,
"rewards/tag_count_reward": 0.880859375,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 488.37109375,
"epoch": 0.5458422174840085,
"grad_norm": 6.451872825622559,
"kl": 1.234375,
"learning_rate": 1.0149594070152638e-05,
"loss": 0.3969,
"reward": 1.7021484375,
"reward_std": 0.6392623782157898,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.78125,
"rewards/tag_count_reward": 0.8544921875,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 466.671875,
"epoch": 0.5501066098081023,
"grad_norm": 2.481407403945923,
"kl": 0.986328125,
"learning_rate": 1e-05,
"loss": 0.4003,
"reward": 1.615234375,
"reward_std": 0.6238291710615158,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.7421875,
"rewards/tag_count_reward": 0.837890625,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 414.88671875,
"epoch": 0.5543710021321961,
"grad_norm": 4.6952948570251465,
"kl": 0.55078125,
"learning_rate": 9.850405929847367e-06,
"loss": 0.413,
"reward": 1.654296875,
"reward_std": 0.6841937601566315,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.7421875,
"rewards/tag_count_reward": 0.841796875,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 374.3828125,
"epoch": 0.55863539445629,
"grad_norm": 10.949110984802246,
"kl": 0.5146484375,
"learning_rate": 9.700845338306018e-06,
"loss": 0.4342,
"reward": 1.7568359375,
"reward_std": 0.5640043765306473,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.81640625,
"rewards/tag_count_reward": 0.8935546875,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 379.46875,
"epoch": 0.5628997867803838,
"grad_norm": 7.129451274871826,
"kl": 0.41796875,
"learning_rate": 9.551351696494854e-06,
"loss": 0.462,
"reward": 1.6328125,
"reward_std": 0.7138571888208389,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.7265625,
"rewards/tag_count_reward": 0.83203125,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 378.390625,
"epoch": 0.5671641791044776,
"grad_norm": 5.389857769012451,
"kl": 0.6474609375,
"learning_rate": 9.401958460549658e-06,
"loss": 0.4062,
"reward": 1.69921875,
"reward_std": 0.5953380540013313,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.78515625,
"rewards/tag_count_reward": 0.8828125,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 374.63671875,
"epoch": 0.5714285714285714,
"grad_norm": 8.098217964172363,
"kl": 0.6748046875,
"learning_rate": 9.252699064135759e-06,
"loss": 0.5274,
"reward": 1.68359375,
"reward_std": 0.6231431663036346,
"rewards/accuracy_reward": 0.04296875,
"rewards/format_reward": 0.76953125,
"rewards/tag_count_reward": 0.87109375,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 371.6953125,
"epoch": 0.5756929637526652,
"grad_norm": 27.047813415527344,
"kl": 6.0146484375,
"learning_rate": 9.103606910965666e-06,
"loss": 0.4173,
"reward": 1.7587890625,
"reward_std": 0.477617509663105,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.7890625,
"rewards/tag_count_reward": 0.8759765625,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 388.03125,
"epoch": 0.579957356076759,
"grad_norm": 72.6392822265625,
"kl": 33.333984375,
"learning_rate": 8.954715367323468e-06,
"loss": 0.5359,
"reward": 1.5771484375,
"reward_std": 0.65767702460289,
"rewards/accuracy_reward": 0.0234375,
"rewards/format_reward": 0.71484375,
"rewards/tag_count_reward": 0.8388671875,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 520.26953125,
"epoch": 0.5842217484008528,
"grad_norm": 11.781960487365723,
"kl": 3.19140625,
"learning_rate": 8.806057754597559e-06,
"loss": 0.3497,
"reward": 1.1142578125,
"reward_std": 0.6293385028839111,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.40625,
"rewards/tag_count_reward": 0.6923828125,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 612.92578125,
"epoch": 0.5884861407249466,
"grad_norm": 10.908761978149414,
"kl": 2.56640625,
"learning_rate": 8.657667341823449e-06,
"loss": 0.089,
"reward": 0.552734375,
"reward_std": 0.22833332046866417,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.05078125,
"rewards/tag_count_reward": 0.501953125,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 164.50390625,
"epoch": 0.5927505330490405,
"grad_norm": 1.4357022047042847,
"kl": 0.38916015625,
"learning_rate": 8.509577338238255e-06,
"loss": 0.3546,
"reward": 0.4619140625,
"reward_std": 0.0770116988569498,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4619140625,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 236.9140625,
"epoch": 0.5970149253731343,
"grad_norm": 1.1441797018051147,
"kl": 0.32080078125,
"learning_rate": 8.361820885848623e-06,
"loss": 0.1043,
"reward": 0.3369140625,
"reward_std": 0.11966157145798206,
"rewards/accuracy_reward": 0.00390625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3330078125,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 304.41796875,
"epoch": 0.6012793176972282,
"grad_norm": 6.6605143547058105,
"kl": 0.4736328125,
"learning_rate": 8.214431052013636e-06,
"loss": 0.0359,
"reward": 0.6025390625,
"reward_std": 0.21631848067045212,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5361328125,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 291.44921875,
"epoch": 0.605543710021322,
"grad_norm": 29.841733932495117,
"kl": 0.37939453125,
"learning_rate": 8.06744082204447e-06,
"loss": 0.1329,
"reward": 0.7060546875,
"reward_std": 0.25769177079200745,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5888671875,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 259.703125,
"epoch": 0.6098081023454158,
"grad_norm": 22.494600296020508,
"kl": 1.2724609375,
"learning_rate": 7.92088309182241e-06,
"loss": -0.003,
"reward": 0.61328125,
"reward_std": 0.2214067205786705,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 272.9921875,
"epoch": 0.6140724946695096,
"grad_norm": 3.0637097358703613,
"kl": 0.34814453125,
"learning_rate": 7.774790660436857e-06,
"loss": -0.0925,
"reward": 0.5869140625,
"reward_std": 0.22308824211359024,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5478515625,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 303.21875,
"epoch": 0.6183368869936035,
"grad_norm": 3.038789987564087,
"kl": 0.57373046875,
"learning_rate": 7.629196222845027e-06,
"loss": -0.0695,
"reward": 0.6015625,
"reward_std": 0.1990872472524643,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 244.98828125,
"epoch": 0.6226012793176973,
"grad_norm": 3.982813835144043,
"kl": 1.84765625,
"learning_rate": 7.484132362554915e-06,
"loss": -0.1056,
"reward": 0.61328125,
"reward_std": 0.24527693167328835,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 304.8359375,
"epoch": 0.6268656716417911,
"grad_norm": 1.170094609260559,
"kl": 1.3681640625,
"learning_rate": 7.33963154433325e-06,
"loss": -0.1068,
"reward": 0.609375,
"reward_std": 0.203267153352499,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 344.03125,
"epoch": 0.6311300639658849,
"grad_norm": 3.609171152114868,
"kl": 1.33203125,
"learning_rate": 7.1957261069399745e-06,
"loss": -0.1631,
"reward": 0.6796875,
"reward_std": 0.21126757562160492,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.62109375,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 312.6640625,
"epoch": 0.6353944562899787,
"grad_norm": 1.0008127689361572,
"kl": 1.52734375,
"learning_rate": 7.052448255890958e-06,
"loss": -0.2083,
"reward": 0.7255859375,
"reward_std": 0.29479434341192245,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6083984375,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 472.80859375,
"epoch": 0.6396588486140725,
"grad_norm": 0.9753682017326355,
"kl": 0.650390625,
"learning_rate": 6.909830056250527e-06,
"loss": -0.196,
"reward": 0.748046875,
"reward_std": 0.2531566210091114,
"rewards/accuracy_reward": 0.08984375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.658203125,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 808.75,
"epoch": 0.6439232409381663,
"grad_norm": 0.5874699950218201,
"kl": 0.8330078125,
"learning_rate": 6.767903425455402e-06,
"loss": -0.122,
"reward": 0.6337890625,
"reward_std": 0.2824634090065956,
"rewards/accuracy_reward": 0.12890625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5048828125,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 793.7578125,
"epoch": 0.6481876332622601,
"grad_norm": 0.6901421546936035,
"kl": 1.171875,
"learning_rate": 6.6267001261717015e-06,
"loss": -0.0709,
"reward": 0.70703125,
"reward_std": 0.2805519849061966,
"rewards/accuracy_reward": 0.140625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 625.5,
"epoch": 0.652452025586354,
"grad_norm": 1.2642836570739746,
"kl": 6.0654296875,
"learning_rate": 6.486251759186573e-06,
"loss": -0.1338,
"reward": 0.72265625,
"reward_std": 0.28466814011335373,
"rewards/accuracy_reward": 0.23046875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4921875,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 625.890625,
"epoch": 0.6567164179104478,
"grad_norm": 0.3763836622238159,
"kl": 0.4189453125,
"learning_rate": 6.34658975633605e-06,
"loss": -0.0951,
"reward": 0.697265625,
"reward_std": 0.2551993578672409,
"rewards/accuracy_reward": 0.19921875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.498046875,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 591.17578125,
"epoch": 0.6609808102345416,
"grad_norm": 0.6317035555839539,
"kl": 0.52490234375,
"learning_rate": 6.207745373470717e-06,
"loss": -0.1346,
"reward": 0.7265625,
"reward_std": 0.32422181963920593,
"rewards/accuracy_reward": 0.23046875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.49609375,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 702.796875,
"epoch": 0.6652452025586354,
"grad_norm": 0.3873419165611267,
"kl": 0.81591796875,
"learning_rate": 6.069749683460765e-06,
"loss": -0.1725,
"reward": 0.6943359375,
"reward_std": 0.2808499410748482,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5810546875,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 655.75390625,
"epoch": 0.6695095948827292,
"grad_norm": 0.37135419249534607,
"kl": 0.52587890625,
"learning_rate": 5.932633569242e-06,
"loss": -0.041,
"reward": 0.6240234375,
"reward_std": 0.19439143873751163,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5732421875,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 676.12890625,
"epoch": 0.673773987206823,
"grad_norm": 0.3440045118331909,
"kl": 0.49658203125,
"learning_rate": 5.796427716904347e-06,
"loss": -0.1016,
"reward": 0.7001953125,
"reward_std": 0.252426378428936,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5830078125,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 611.26953125,
"epoch": 0.6780383795309168,
"grad_norm": 0.4159948527812958,
"kl": 0.88134765625,
"learning_rate": 5.66116260882442e-06,
"loss": -0.1264,
"reward": 0.642578125,
"reward_std": 0.18874739110469818,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.623046875,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 470.328125,
"epoch": 0.6823027718550106,
"grad_norm": 0.38798099756240845,
"kl": 0.79736328125,
"learning_rate": 5.526868516843673e-06,
"loss": -0.0926,
"reward": 0.708984375,
"reward_std": 0.22118790447711945,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.658203125,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 447.8125,
"epoch": 0.6865671641791045,
"grad_norm": 0.4147135615348816,
"kl": 0.64111328125,
"learning_rate": 5.393575495493679e-06,
"loss": -0.1075,
"reward": 0.732421875,
"reward_std": 0.18555288948118687,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.662109375,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 459.30078125,
"epoch": 0.6908315565031983,
"grad_norm": 0.5294929146766663,
"kl": 0.58642578125,
"learning_rate": 5.2613133752700145e-06,
"loss": -0.1284,
"reward": 0.66796875,
"reward_std": 0.1776830367743969,
"rewards/accuracy_reward": 0.01171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.65625,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 379.0625,
"epoch": 0.6950959488272921,
"grad_norm": 0.5291323065757751,
"kl": 0.830078125,
"learning_rate": 5.130111755956327e-06,
"loss": -0.1563,
"reward": 0.7099609375,
"reward_std": 0.19628439471125603,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6630859375,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 338.12890625,
"epoch": 0.6993603411513859,
"grad_norm": 0.4588962197303772,
"kl": 0.73583984375,
"learning_rate": 5.000000000000003e-06,
"loss": -0.1004,
"reward": 0.7626953125,
"reward_std": 0.17791462130844593,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6962890625,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 310.578125,
"epoch": 0.7036247334754797,
"grad_norm": 0.8297274708747864,
"kl": 1.7900390625,
"learning_rate": 4.87100722594094e-06,
"loss": -0.0991,
"reward": 0.7421875,
"reward_std": 0.17338587157428265,
"rewards/accuracy_reward": 0.04296875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.69921875,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 316.98828125,
"epoch": 0.7078891257995735,
"grad_norm": 0.9038926362991333,
"kl": 1.57470703125,
"learning_rate": 4.743162301894952e-06,
"loss": -0.0521,
"reward": 0.7744140625,
"reward_std": 0.14399663731455803,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7119140625,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 307.609375,
"epoch": 0.7121535181236673,
"grad_norm": 3.5091426372528076,
"kl": 1.41943359375,
"learning_rate": 4.616493839093179e-06,
"loss": -0.0394,
"reward": 0.791015625,
"reward_std": 0.1766387764364481,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.697265625,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 290.44921875,
"epoch": 0.7164179104477612,
"grad_norm": 2.227064847946167,
"kl": 1.279296875,
"learning_rate": 4.491030185478976e-06,
"loss": -0.0156,
"reward": 0.7197265625,
"reward_std": 0.10716542787849903,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7041015625,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 296.54296875,
"epoch": 0.720682302771855,
"grad_norm": 5.702210426330566,
"kl": 1.513671875,
"learning_rate": 4.3667994193637794e-06,
"loss": 0.0234,
"reward": 0.7626953125,
"reward_std": 0.12216670252382755,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7119140625,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 303.10546875,
"epoch": 0.7249466950959488,
"grad_norm": 0.5832945108413696,
"kl": 0.66162109375,
"learning_rate": 4.2438293431432665e-06,
"loss": 0.0051,
"reward": 0.806640625,
"reward_std": 0.13442331552505493,
"rewards/accuracy_reward": 0.0859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.720703125,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 269.23046875,
"epoch": 0.7292110874200426,
"grad_norm": 1.5207250118255615,
"kl": 0.60546875,
"learning_rate": 4.12214747707527e-06,
"loss": 0.0104,
"reward": 0.732421875,
"reward_std": 0.09527772478759289,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.716796875,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 221.65625,
"epoch": 0.7334754797441365,
"grad_norm": 2.143716335296631,
"kl": 1.3779296875,
"learning_rate": 4.001781053120863e-06,
"loss": -0.0052,
"reward": 0.7958984375,
"reward_std": 0.13394116796553135,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7021484375,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 175.17578125,
"epoch": 0.7377398720682303,
"grad_norm": 5.434141635894775,
"kl": 2.75,
"learning_rate": 3.882757008849936e-06,
"loss": 0.0388,
"reward": 0.685546875,
"reward_std": 0.16674507781863213,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.646484375,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 162.55859375,
"epoch": 0.7420042643923241,
"grad_norm": 27.080265045166016,
"kl": 3.57421875,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.0552,
"reward": 0.6416015625,
"reward_std": 0.133183553814888,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6337890625,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 161.41796875,
"epoch": 0.746268656716418,
"grad_norm": 4.6160430908203125,
"kl": 3.09375,
"learning_rate": 3.6488423015782128e-06,
"loss": 0.074,
"reward": 0.6455078125,
"reward_std": 0.15037459693849087,
"rewards/accuracy_reward": 0.0234375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6220703125,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 153.59375,
"epoch": 0.7505330490405118,
"grad_norm": 3.9284942150115967,
"kl": 2.150390625,
"learning_rate": 3.534003987842005e-06,
"loss": 0.0613,
"reward": 0.69921875,
"reward_std": 0.16477027162909508,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6640625,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 194.90625,
"epoch": 0.7547974413646056,
"grad_norm": 1.458369493484497,
"kl": 0.7900390625,
"learning_rate": 3.4206127406028744e-06,
"loss": 0.0115,
"reward": 0.78125,
"reward_std": 0.16565649397671223,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.69921875,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 185.91796875,
"epoch": 0.7590618336886994,
"grad_norm": 0.9649374485015869,
"kl": 0.466796875,
"learning_rate": 3.308693936411421e-06,
"loss": -0.0284,
"reward": 0.75,
"reward_std": 0.09331535268574953,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.73046875,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 179.1171875,
"epoch": 0.7633262260127932,
"grad_norm": 1.3005759716033936,
"kl": 0.43115234375,
"learning_rate": 3.1982726222908046e-06,
"loss": -0.0093,
"reward": 0.87890625,
"reward_std": 0.09297346090897918,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.73046875,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 198.36328125,
"epoch": 0.767590618336887,
"grad_norm": 0.8584280014038086,
"kl": 0.2998046875,
"learning_rate": 3.089373510131354e-06,
"loss": -0.0111,
"reward": 0.7822265625,
"reward_std": 0.10853294795379043,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7353515625,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 187.99609375,
"epoch": 0.7718550106609808,
"grad_norm": 0.5993466377258301,
"kl": 0.52099609375,
"learning_rate": 2.9820209711600858e-06,
"loss": -0.0401,
"reward": 0.7890625,
"reward_std": 0.1114540034905076,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.73046875,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 219.74609375,
"epoch": 0.7761194029850746,
"grad_norm": 3.0398595333099365,
"kl": 0.75439453125,
"learning_rate": 2.876239030486554e-06,
"loss": 0.0153,
"reward": 0.7724609375,
"reward_std": 0.1012349147349596,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7333984375,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 192.01171875,
"epoch": 0.7803837953091685,
"grad_norm": 2.972774028778076,
"kl": 1.375,
"learning_rate": 2.7720513617260857e-06,
"loss": 0.0664,
"reward": 0.814453125,
"reward_std": 0.11987380962818861,
"rewards/accuracy_reward": 0.0859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.728515625,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 185.59765625,
"epoch": 0.7846481876332623,
"grad_norm": 6.028680801391602,
"kl": 1.98046875,
"learning_rate": 2.669481281701739e-06,
"loss": 0.0526,
"reward": 0.7646484375,
"reward_std": 0.10079776309430599,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7333984375,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 206.2109375,
"epoch": 0.7889125799573561,
"grad_norm": 1.8360040187835693,
"kl": 1.3046875,
"learning_rate": 2.5685517452260566e-06,
"loss": -0.0217,
"reward": 0.7841796875,
"reward_std": 0.08804275188595057,
"rewards/accuracy_reward": 0.0546875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7294921875,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 211.5234375,
"epoch": 0.7931769722814499,
"grad_norm": 1.8284348249435425,
"kl": 1.18359375,
"learning_rate": 2.469285339963892e-06,
"loss": 0.0017,
"reward": 0.7744140625,
"reward_std": 0.09929579310119152,
"rewards/accuracy_reward": 0.04296875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7314453125,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 206.52734375,
"epoch": 0.7974413646055437,
"grad_norm": 2.95171856880188,
"kl": 1.2236328125,
"learning_rate": 2.371704281377335e-06,
"loss": 0.0348,
"reward": 0.73828125,
"reward_std": 0.09545402321964502,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.72265625,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 202.625,
"epoch": 0.8017057569296375,
"grad_norm": 0.737244725227356,
"kl": 0.52197265625,
"learning_rate": 2.275830407754006e-06,
"loss": 0.0328,
"reward": 0.8466796875,
"reward_std": 0.15702996030449867,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7333984375,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 214.828125,
"epoch": 0.8059701492537313,
"grad_norm": 0.781270444393158,
"kl": 0.302734375,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.0188,
"reward": 0.80078125,
"reward_std": 0.13719853153452277,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.73046875,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 212.7890625,
"epoch": 0.8102345415778252,
"grad_norm": 1.513720989227295,
"kl": 0.3876953125,
"learning_rate": 2.08928965343659e-06,
"loss": 0.0004,
"reward": 0.861328125,
"reward_std": 0.1351899290457368,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.736328125,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 220.57421875,
"epoch": 0.814498933901919,
"grad_norm": 4.133224964141846,
"kl": 1.0576171875,
"learning_rate": 1.9986645198879385e-06,
"loss": -0.0196,
"reward": 0.7626953125,
"reward_std": 0.14676811546087265,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7236328125,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 221.96875,
"epoch": 0.8187633262260128,
"grad_norm": 0.4540961682796478,
"kl": 0.3203125,
"learning_rate": 1.9098300562505266e-06,
"loss": -0.0199,
"reward": 0.8544921875,
"reward_std": 0.1271651964634657,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7373046875,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 219.1171875,
"epoch": 0.8230277185501066,
"grad_norm": 0.32064002752304077,
"kl": 0.45361328125,
"learning_rate": 1.8228061433556866e-06,
"loss": -0.0265,
"reward": 0.779296875,
"reward_std": 0.0899216216057539,
"rewards/accuracy_reward": 0.04296875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.736328125,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 214.43359375,
"epoch": 0.8272921108742004,
"grad_norm": 1.135198950767517,
"kl": 0.42529296875,
"learning_rate": 1.7376122568400533e-06,
"loss": -0.0286,
"reward": 0.8046875,
"reward_std": 0.16580088809132576,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7265625,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 218.921875,
"epoch": 0.8315565031982942,
"grad_norm": 0.5622548460960388,
"kl": 0.316650390625,
"learning_rate": 1.6542674627869738e-06,
"loss": 0.017,
"reward": 0.80078125,
"reward_std": 0.13944148644804955,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.73046875,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 251.47265625,
"epoch": 0.835820895522388,
"grad_norm": 0.7856387495994568,
"kl": 0.4052734375,
"learning_rate": 1.5727904134596084e-06,
"loss": 0.0162,
"reward": 0.8193359375,
"reward_std": 0.16033071093261242,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7060546875,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 243.625,
"epoch": 0.8400852878464818,
"grad_norm": 0.6806755065917969,
"kl": 0.49072265625,
"learning_rate": 1.4931993431266056e-06,
"loss": 0.0095,
"reward": 0.7890625,
"reward_std": 0.25723421946167946,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.67578125,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 273.87109375,
"epoch": 0.8443496801705757,
"grad_norm": 1.0873993635177612,
"kl": 0.375,
"learning_rate": 1.4155120639813392e-06,
"loss": 0.1037,
"reward": 0.7626953125,
"reward_std": 0.21218526735901833,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6689453125,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 239.97265625,
"epoch": 0.8486140724946695,
"grad_norm": 1.0088647603988647,
"kl": 0.37353515625,
"learning_rate": 1.339745962155613e-06,
"loss": 0.0416,
"reward": 0.7822265625,
"reward_std": 0.24759295210242271,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6806640625,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 268.73046875,
"epoch": 0.8528784648187633,
"grad_norm": 0.6826640367507935,
"kl": 0.43017578125,
"learning_rate": 1.2659179938287035e-06,
"loss": 0.0312,
"reward": 0.7685546875,
"reward_std": 0.18006664514541626,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6982421875,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 221.609375,
"epoch": 0.8571428571428571,
"grad_norm": 1.0566011667251587,
"kl": 0.45751953125,
"learning_rate": 1.19404468143262e-06,
"loss": -0.0104,
"reward": 0.796875,
"reward_std": 0.15738755092024803,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.71875,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 269.80859375,
"epoch": 0.8614072494669509,
"grad_norm": 0.7283450365066528,
"kl": 0.5009765625,
"learning_rate": 1.124142109954459e-06,
"loss": -0.0242,
"reward": 0.7705078125,
"reward_std": 0.13039706647396088,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7314453125,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 255.265625,
"epoch": 0.8656716417910447,
"grad_norm": 1.5315821170806885,
"kl": 1.380859375,
"learning_rate": 1.0562259233366334e-06,
"loss": -0.0731,
"reward": 0.78515625,
"reward_std": 0.2296939566731453,
"rewards/accuracy_reward": 0.08984375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6953125,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 218.21875,
"epoch": 0.8699360341151386,
"grad_norm": 1.1094874143600464,
"kl": 0.73046875,
"learning_rate": 9.903113209758098e-07,
"loss": 0.0012,
"reward": 0.8720703125,
"reward_std": 0.2057046014815569,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7236328125,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 281.12109375,
"epoch": 0.8742004264392325,
"grad_norm": 1.407812237739563,
"kl": 1.35546875,
"learning_rate": 9.264130543213512e-07,
"loss": -0.0625,
"reward": 0.8251953125,
"reward_std": 0.20766575261950493,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7080078125,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 252.62109375,
"epoch": 0.8784648187633263,
"grad_norm": 3.055626153945923,
"kl": 2.255859375,
"learning_rate": 8.645454235739903e-07,
"loss": -0.0862,
"reward": 0.80859375,
"reward_std": 0.2070464938879013,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.69921875,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 238.58203125,
"epoch": 0.8827292110874201,
"grad_norm": 1.8872811794281006,
"kl": 1.49609375,
"learning_rate": 8.047222744854943e-07,
"loss": 0.0217,
"reward": 0.8857421875,
"reward_std": 0.24977924302220345,
"rewards/accuracy_reward": 0.171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7138671875,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 258.48046875,
"epoch": 0.8869936034115139,
"grad_norm": 4.186584949493408,
"kl": 2.2333984375,
"learning_rate": 7.46956995260033e-07,
"loss": -0.0711,
"reward": 0.8271484375,
"reward_std": 0.18339894711971283,
"rewards/accuracy_reward": 0.12890625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6982421875,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 236.16796875,
"epoch": 0.8912579957356077,
"grad_norm": 2.354311466217041,
"kl": 1.609375,
"learning_rate": 6.912625135579587e-07,
"loss": -0.0062,
"reward": 0.791015625,
"reward_std": 0.17353365197777748,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.712890625,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 227.03125,
"epoch": 0.8955223880597015,
"grad_norm": 2.211200714111328,
"kl": 1.818359375,
"learning_rate": 6.37651293602628e-07,
"loss": -0.019,
"reward": 0.7958984375,
"reward_std": 0.19231459498405457,
"rewards/accuracy_reward": 0.0859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7099609375,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 259.16015625,
"epoch": 0.8997867803837953,
"grad_norm": 3.354318141937256,
"kl": 1.48828125,
"learning_rate": 5.861353333909692e-07,
"loss": -0.0305,
"reward": 0.8115234375,
"reward_std": 0.17966507747769356,
"rewards/accuracy_reward": 0.09765625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7138671875,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 262.46484375,
"epoch": 0.9040511727078892,
"grad_norm": 3.2571589946746826,
"kl": 2.1796875,
"learning_rate": 5.367261620083575e-07,
"loss": -0.0519,
"reward": 0.83984375,
"reward_std": 0.2149498090147972,
"rewards/accuracy_reward": 0.12109375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.71875,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 237.96875,
"epoch": 0.908315565031983,
"grad_norm": 1.6243290901184082,
"kl": 1.0390625,
"learning_rate": 4.894348370484648e-07,
"loss": 0.0014,
"reward": 0.7900390625,
"reward_std": 0.14244702830910683,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7314453125,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 238.0703125,
"epoch": 0.9125799573560768,
"grad_norm": 4.307506084442139,
"kl": 1.15576171875,
"learning_rate": 4.4427194213859216e-07,
"loss": 0.0194,
"reward": 0.833984375,
"reward_std": 0.19881774485111237,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.724609375,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 240.42578125,
"epoch": 0.9168443496801706,
"grad_norm": 0.588789701461792,
"kl": 0.70654296875,
"learning_rate": 4.012475845711106e-07,
"loss": -0.0109,
"reward": 0.8740234375,
"reward_std": 0.2335027940571308,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7255859375,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 245.46875,
"epoch": 0.9211087420042644,
"grad_norm": 2.772460460662842,
"kl": 1.4560546875,
"learning_rate": 3.603713930414676e-07,
"loss": -0.0346,
"reward": 0.7451171875,
"reward_std": 0.1310195019468665,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7177734375,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 276.265625,
"epoch": 0.9253731343283582,
"grad_norm": 2.068373918533325,
"kl": 1.5322265625,
"learning_rate": 3.2165251549333585e-07,
"loss": -0.0333,
"reward": 0.765625,
"reward_std": 0.15018462389707565,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.71875,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 274.9609375,
"epoch": 0.929637526652452,
"grad_norm": 1.8721359968185425,
"kl": 0.88037109375,
"learning_rate": 2.8509961707132496e-07,
"loss": -0.0266,
"reward": 0.8369140625,
"reward_std": 0.20388219691812992,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7197265625,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 268.0234375,
"epoch": 0.9339019189765458,
"grad_norm": 7.492040157318115,
"kl": 1.865234375,
"learning_rate": 2.507208781817638e-07,
"loss": -0.0152,
"reward": 0.859375,
"reward_std": 0.2039647325873375,
"rewards/accuracy_reward": 0.1328125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7265625,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 247.60546875,
"epoch": 0.9381663113006397,
"grad_norm": 1.1414939165115356,
"kl": 0.890625,
"learning_rate": 2.1852399266194312e-07,
"loss": -0.0075,
"reward": 0.8203125,
"reward_std": 0.19437766447663307,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7265625,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 252.56640625,
"epoch": 0.9424307036247335,
"grad_norm": 1.324097990989685,
"kl": 0.7802734375,
"learning_rate": 1.885161660582746e-07,
"loss": -0.0435,
"reward": 0.7861328125,
"reward_std": 0.1638173609972,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7275390625,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 234.42578125,
"epoch": 0.9466950959488273,
"grad_norm": 1.6171019077301025,
"kl": 1.47265625,
"learning_rate": 1.6070411401370335e-07,
"loss": -0.0326,
"reward": 0.771484375,
"reward_std": 0.17419602535665035,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.708984375,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 260.5859375,
"epoch": 0.9509594882729211,
"grad_norm": 4.242193698883057,
"kl": 0.716552734375,
"learning_rate": 1.350940607647866e-07,
"loss": 0.0139,
"reward": 0.822265625,
"reward_std": 0.16951362788677216,
"rewards/accuracy_reward": 0.08984375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.732421875,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 259.80078125,
"epoch": 0.9552238805970149,
"grad_norm": 5.544849395751953,
"kl": 1.04345703125,
"learning_rate": 1.1169173774871478e-07,
"loss": 0.0055,
"reward": 0.8037109375,
"reward_std": 0.17775351367890835,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7216796875,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 272.6328125,
"epoch": 0.9594882729211087,
"grad_norm": 0.6265246868133545,
"kl": 0.630859375,
"learning_rate": 9.0502382320653e-08,
"loss": -0.0349,
"reward": 0.9287109375,
"reward_std": 0.25279103592038155,
"rewards/accuracy_reward": 0.19921875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7294921875,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 240.05078125,
"epoch": 0.9637526652452025,
"grad_norm": 1.7309554815292358,
"kl": 1.01171875,
"learning_rate": 7.153073658162646e-08,
"loss": -0.0208,
"reward": 0.7783203125,
"reward_std": 0.1793037187308073,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7158203125,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 239.66796875,
"epoch": 0.9680170575692963,
"grad_norm": 1.2242202758789062,
"kl": 0.91845703125,
"learning_rate": 5.4781046317267103e-08,
"loss": -0.0061,
"reward": 0.8173828125,
"reward_std": 0.20180584490299225,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7236328125,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 275.390625,
"epoch": 0.9722814498933902,
"grad_norm": 0.6100110411643982,
"kl": 0.55224609375,
"learning_rate": 4.025706004760932e-08,
"loss": -0.0347,
"reward": 0.8251953125,
"reward_std": 0.15769800543785095,
"rewards/accuracy_reward": 0.08984375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7353515625,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 251.50390625,
"epoch": 0.976545842217484,
"grad_norm": 1.7776597738265991,
"kl": 0.861328125,
"learning_rate": 2.796202818819871e-08,
"loss": -0.0023,
"reward": 0.85546875,
"reward_std": 0.22671574354171753,
"rewards/accuracy_reward": 0.12890625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7265625,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 242.7734375,
"epoch": 0.9808102345415778,
"grad_norm": 1.0293753147125244,
"kl": 0.8193359375,
"learning_rate": 1.7898702322648453e-08,
"loss": -0.0337,
"reward": 0.828125,
"reward_std": 0.16718050092458725,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.72265625,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 265.5546875,
"epoch": 0.9850746268656716,
"grad_norm": 1.5529704093933105,
"kl": 1.0205078125,
"learning_rate": 1.0069334586854106e-08,
"loss": -0.0289,
"reward": 0.83203125,
"reward_std": 0.16101082926616073,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.71875,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 254.5,
"epoch": 0.9893390191897654,
"grad_norm": 2.5354487895965576,
"kl": 1.244140625,
"learning_rate": 4.475677164966774e-09,
"loss": -0.0043,
"reward": 0.810546875,
"reward_std": 0.18945523723959923,
"rewards/accuracy_reward": 0.0859375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.724609375,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 251.34765625,
"epoch": 0.9936034115138592,
"grad_norm": 1.5713036060333252,
"kl": 1.1103515625,
"learning_rate": 1.1189818972656697e-09,
"loss": -0.0032,
"reward": 0.8720703125,
"reward_std": 0.26588882878422737,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7158203125,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 269.0000114440918,
"epoch": 0.997867803837953,
"grad_norm": 1.1278139352798462,
"kl": 1.296875,
"learning_rate": 0.0,
"loss": -0.0335,
"reward": 0.8466796875,
"reward_std": 0.23463162407279015,
"rewards/accuracy_reward": 0.12890625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7177734375,
"step": 234
},
{
"epoch": 0.997867803837953,
"step": 234,
"total_flos": 0.0,
"train_loss": 0.3048181866761297,
"train_runtime": 9320.4205,
"train_samples_per_second": 0.805,
"train_steps_per_second": 0.025
}
],
"logging_steps": 1,
"max_steps": 234,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}