|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.997867803837953, |
|
"eval_steps": 500, |
|
"global_step": 234, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 576.5390625, |
|
"epoch": 0.0042643923240938165, |
|
"grad_norm": 0.32528209686279297, |
|
"kl": 0.0, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.0286, |
|
"reward": 0.2578125, |
|
"reward_std": 0.40137775242328644, |
|
"rewards/accuracy_reward": 0.18359375, |
|
"rewards/format_reward": 0.015625, |
|
"rewards/tag_count_reward": 0.05859375, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 547.66796875, |
|
"epoch": 0.008528784648187633, |
|
"grad_norm": 0.43932273983955383, |
|
"kl": 0.0, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0028, |
|
"reward": 0.265625, |
|
"reward_std": 0.40303920209407806, |
|
"rewards/accuracy_reward": 0.17578125, |
|
"rewards/format_reward": 0.02734375, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 484.96484375, |
|
"epoch": 0.01279317697228145, |
|
"grad_norm": 0.4568934440612793, |
|
"kl": 0.0001436471939086914, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0148, |
|
"reward": 0.3037109375, |
|
"reward_std": 0.41709331423044205, |
|
"rewards/accuracy_reward": 0.203125, |
|
"rewards/format_reward": 0.01953125, |
|
"rewards/tag_count_reward": 0.0810546875, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 469.796875, |
|
"epoch": 0.017057569296375266, |
|
"grad_norm": 0.5138208866119385, |
|
"kl": 0.0001804828643798828, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0222, |
|
"reward": 0.3076171875, |
|
"reward_std": 0.4279475286602974, |
|
"rewards/accuracy_reward": 0.1953125, |
|
"rewards/format_reward": 0.03515625, |
|
"rewards/tag_count_reward": 0.0771484375, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 498.140625, |
|
"epoch": 0.021321961620469083, |
|
"grad_norm": 0.4469239115715027, |
|
"kl": 0.0010051727294921875, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.0029, |
|
"reward": 0.318359375, |
|
"reward_std": 0.44843800365924835, |
|
"rewards/accuracy_reward": 0.18359375, |
|
"rewards/format_reward": 0.02734375, |
|
"rewards/tag_count_reward": 0.107421875, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 457.3125, |
|
"epoch": 0.0255863539445629, |
|
"grad_norm": 0.7152092456817627, |
|
"kl": 0.029693603515625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0403, |
|
"reward": 0.5029296875, |
|
"reward_std": 0.6177150011062622, |
|
"rewards/accuracy_reward": 0.171875, |
|
"rewards/format_reward": 0.10546875, |
|
"rewards/tag_count_reward": 0.2255859375, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.74609375, |
|
"epoch": 0.029850746268656716, |
|
"grad_norm": 2.3532168865203857, |
|
"kl": 0.09393310546875, |
|
"learning_rate": 5.833333333333334e-06, |
|
"loss": 0.048, |
|
"reward": 0.564453125, |
|
"reward_std": 0.6538278013467789, |
|
"rewards/accuracy_reward": 0.2578125, |
|
"rewards/format_reward": 0.10546875, |
|
"rewards/tag_count_reward": 0.201171875, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 492.9765625, |
|
"epoch": 0.03411513859275053, |
|
"grad_norm": 0.46021807193756104, |
|
"kl": 0.016357421875, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.0541, |
|
"reward": 0.5888671875, |
|
"reward_std": 0.592596247792244, |
|
"rewards/accuracy_reward": 0.19921875, |
|
"rewards/format_reward": 0.15234375, |
|
"rewards/tag_count_reward": 0.2373046875, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 478.79296875, |
|
"epoch": 0.03837953091684435, |
|
"grad_norm": 2.5319058895111084, |
|
"kl": 0.0784912109375, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.0801, |
|
"reward": 0.71875, |
|
"reward_std": 0.6128444075584412, |
|
"rewards/accuracy_reward": 0.3515625, |
|
"rewards/format_reward": 0.140625, |
|
"rewards/tag_count_reward": 0.2265625, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 563.734375, |
|
"epoch": 0.042643923240938165, |
|
"grad_norm": 1.262290596961975, |
|
"kl": 0.0523223876953125, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.0363, |
|
"reward": 0.5009765625, |
|
"reward_std": 0.5229385495185852, |
|
"rewards/accuracy_reward": 0.296875, |
|
"rewards/format_reward": 0.078125, |
|
"rewards/tag_count_reward": 0.1259765625, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 499.44140625, |
|
"epoch": 0.046908315565031986, |
|
"grad_norm": 1.3580890893936157, |
|
"kl": 0.0596923828125, |
|
"learning_rate": 9.166666666666666e-06, |
|
"loss": 0.11, |
|
"reward": 0.681640625, |
|
"reward_std": 0.6728685200214386, |
|
"rewards/accuracy_reward": 0.23828125, |
|
"rewards/format_reward": 0.18359375, |
|
"rewards/tag_count_reward": 0.259765625, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 567.75390625, |
|
"epoch": 0.0511727078891258, |
|
"grad_norm": 4.042412281036377, |
|
"kl": 0.047576904296875, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0732, |
|
"reward": 0.521484375, |
|
"reward_std": 0.5327698737382889, |
|
"rewards/accuracy_reward": 0.27734375, |
|
"rewards/format_reward": 0.08203125, |
|
"rewards/tag_count_reward": 0.162109375, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 559.421875, |
|
"epoch": 0.05543710021321962, |
|
"grad_norm": 0.3716038763523102, |
|
"kl": 0.0142669677734375, |
|
"learning_rate": 1.0833333333333334e-05, |
|
"loss": 0.1328, |
|
"reward": 0.599609375, |
|
"reward_std": 0.6229686141014099, |
|
"rewards/accuracy_reward": 0.234375, |
|
"rewards/format_reward": 0.14453125, |
|
"rewards/tag_count_reward": 0.220703125, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 437.3828125, |
|
"epoch": 0.05970149253731343, |
|
"grad_norm": 2.6377227306365967, |
|
"kl": 0.119598388671875, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 0.1456, |
|
"reward": 0.798828125, |
|
"reward_std": 0.7079743444919586, |
|
"rewards/accuracy_reward": 0.23046875, |
|
"rewards/format_reward": 0.21875, |
|
"rewards/tag_count_reward": 0.349609375, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 434.79296875, |
|
"epoch": 0.06396588486140725, |
|
"grad_norm": 0.4819924831390381, |
|
"kl": 0.035736083984375, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.0931, |
|
"reward": 0.8818359375, |
|
"reward_std": 0.6916099190711975, |
|
"rewards/accuracy_reward": 0.20703125, |
|
"rewards/format_reward": 0.234375, |
|
"rewards/tag_count_reward": 0.4404296875, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 324.69921875, |
|
"epoch": 0.06823027718550106, |
|
"grad_norm": 1.3913614749908447, |
|
"kl": 0.0682373046875, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.2251, |
|
"reward": 1.1982421875, |
|
"reward_std": 0.7941954433917999, |
|
"rewards/accuracy_reward": 0.0859375, |
|
"rewards/format_reward": 0.4375, |
|
"rewards/tag_count_reward": 0.6748046875, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 299.75, |
|
"epoch": 0.07249466950959488, |
|
"grad_norm": 0.9118645787239075, |
|
"kl": 0.0875244140625, |
|
"learning_rate": 1.416666666666667e-05, |
|
"loss": 0.2379, |
|
"reward": 1.59765625, |
|
"reward_std": 0.6924279183149338, |
|
"rewards/accuracy_reward": 0.11328125, |
|
"rewards/format_reward": 0.65625, |
|
"rewards/tag_count_reward": 0.828125, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.4921875, |
|
"epoch": 0.0767590618336887, |
|
"grad_norm": 1.340067982673645, |
|
"kl": 0.133056640625, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.1821, |
|
"reward": 1.8427734375, |
|
"reward_std": 0.4516802802681923, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.84375, |
|
"rewards/tag_count_reward": 0.9287109375, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 182.53515625, |
|
"epoch": 0.08102345415778252, |
|
"grad_norm": 3.030550003051758, |
|
"kl": 0.4296875, |
|
"learning_rate": 1.5833333333333333e-05, |
|
"loss": 0.0898, |
|
"reward": 1.986328125, |
|
"reward_std": 0.22909418493509293, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.95703125, |
|
"rewards/tag_count_reward": 0.958984375, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 214.65234375, |
|
"epoch": 0.08528784648187633, |
|
"grad_norm": 1.1024236679077148, |
|
"kl": 0.1187744140625, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.1787, |
|
"reward": 2.0078125, |
|
"reward_std": 0.170526759698987, |
|
"rewards/accuracy_reward": 0.06640625, |
|
"rewards/format_reward": 0.98046875, |
|
"rewards/tag_count_reward": 0.9609375, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 208.2421875, |
|
"epoch": 0.08955223880597014, |
|
"grad_norm": 22.709352493286133, |
|
"kl": 1.359375, |
|
"learning_rate": 1.7500000000000002e-05, |
|
"loss": 0.1298, |
|
"reward": 1.9345703125, |
|
"reward_std": 0.32205624878406525, |
|
"rewards/accuracy_reward": 0.0625, |
|
"rewards/format_reward": 0.91015625, |
|
"rewards/tag_count_reward": 0.9619140625, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 253.96484375, |
|
"epoch": 0.09381663113006397, |
|
"grad_norm": 8.92030143737793, |
|
"kl": 1.1494140625, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 0.1757, |
|
"reward": 1.4931640625, |
|
"reward_std": 0.6142828911542892, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.57421875, |
|
"rewards/tag_count_reward": 0.8720703125, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 217.625, |
|
"epoch": 0.09808102345415778, |
|
"grad_norm": 3.518479585647583, |
|
"kl": 0.38134765625, |
|
"learning_rate": 1.916666666666667e-05, |
|
"loss": 0.0203, |
|
"reward": 1.0087890625, |
|
"reward_std": 0.44057436287403107, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.171875, |
|
"rewards/tag_count_reward": 0.7861328125, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.28125, |
|
"epoch": 0.1023454157782516, |
|
"grad_norm": 1.0693765878677368, |
|
"kl": 0.19775390625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0202, |
|
"reward": 1.134765625, |
|
"reward_std": 0.5518650561571121, |
|
"rewards/accuracy_reward": 0.0234375, |
|
"rewards/format_reward": 0.296875, |
|
"rewards/tag_count_reward": 0.814453125, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 295.78515625, |
|
"epoch": 0.10660980810234541, |
|
"grad_norm": 9.523970603942871, |
|
"kl": 0.23779296875, |
|
"learning_rate": 1.9998881018102735e-05, |
|
"loss": 0.1602, |
|
"reward": 1.501953125, |
|
"reward_std": 0.6455793529748917, |
|
"rewards/accuracy_reward": 0.1015625, |
|
"rewards/format_reward": 0.50390625, |
|
"rewards/tag_count_reward": 0.896484375, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 345.72265625, |
|
"epoch": 0.11087420042643924, |
|
"grad_norm": 24690.228515625, |
|
"kl": 121.465576171875, |
|
"learning_rate": 1.9995524322835035e-05, |
|
"loss": 6.9741, |
|
"reward": 1.5634765625, |
|
"reward_std": 0.648356705904007, |
|
"rewards/accuracy_reward": 0.078125, |
|
"rewards/format_reward": 0.58984375, |
|
"rewards/tag_count_reward": 0.8955078125, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 452.1953125, |
|
"epoch": 0.11513859275053305, |
|
"grad_norm": 114.83018493652344, |
|
"kl": 1.1298828125, |
|
"learning_rate": 1.9989930665413148e-05, |
|
"loss": 0.3668, |
|
"reward": 1.2841796875, |
|
"reward_std": 0.7657907009124756, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.453125, |
|
"rewards/tag_count_reward": 0.7802734375, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 550.98828125, |
|
"epoch": 0.11940298507462686, |
|
"grad_norm": 32.12217330932617, |
|
"kl": 0.9326171875, |
|
"learning_rate": 1.998210129767735e-05, |
|
"loss": 0.3607, |
|
"reward": 1.0625, |
|
"reward_std": 0.7528532892465591, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.328125, |
|
"rewards/tag_count_reward": 0.68359375, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.66796875, |
|
"epoch": 0.12366737739872068, |
|
"grad_norm": 31.25490951538086, |
|
"kl": 0.796875, |
|
"learning_rate": 1.9972037971811802e-05, |
|
"loss": 0.1675, |
|
"reward": 0.6337890625, |
|
"reward_std": 0.5352872237563133, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.109375, |
|
"rewards/tag_count_reward": 0.4892578125, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 492.50390625, |
|
"epoch": 0.1279317697228145, |
|
"grad_norm": 7.399560928344727, |
|
"kl": 0.8779296875, |
|
"learning_rate": 1.9959742939952393e-05, |
|
"loss": 0.0607, |
|
"reward": 0.4462890625, |
|
"reward_std": 0.32183003425598145, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.03515625, |
|
"rewards/tag_count_reward": 0.3916015625, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 430.3125, |
|
"epoch": 0.13219616204690832, |
|
"grad_norm": 391.785888671875, |
|
"kl": 8.494140625, |
|
"learning_rate": 1.9945218953682736e-05, |
|
"loss": 0.5494, |
|
"reward": 0.4248046875, |
|
"reward_std": 0.2781025320291519, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0234375, |
|
"rewards/tag_count_reward": 0.3857421875, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 404.62890625, |
|
"epoch": 0.13646055437100213, |
|
"grad_norm": 199.300048828125, |
|
"kl": 2.603515625, |
|
"learning_rate": 1.9928469263418376e-05, |
|
"loss": 0.235, |
|
"reward": 0.3564453125, |
|
"reward_std": 0.19321707263588905, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.0078125, |
|
"rewards/tag_count_reward": 0.3447265625, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 432.109375, |
|
"epoch": 0.14072494669509594, |
|
"grad_norm": 3538.403564453125, |
|
"kl": 31.28515625, |
|
"learning_rate": 1.990949761767935e-05, |
|
"loss": 2.1404, |
|
"reward": 0.3896484375, |
|
"reward_std": 0.24749910086393356, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.01953125, |
|
"rewards/tag_count_reward": 0.3583984375, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 464.28515625, |
|
"epoch": 0.14498933901918976, |
|
"grad_norm": 111.8864517211914, |
|
"kl": 1.9296875, |
|
"learning_rate": 1.9888308262251286e-05, |
|
"loss": 0.1906, |
|
"reward": 0.345703125, |
|
"reward_std": 0.1697397418320179, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.333984375, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 510.56640625, |
|
"epoch": 0.14925373134328357, |
|
"grad_norm": 14.984077453613281, |
|
"kl": 1.08984375, |
|
"learning_rate": 1.9864905939235215e-05, |
|
"loss": 0.0739, |
|
"reward": 0.3662109375, |
|
"reward_std": 0.19745982438325882, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.00390625, |
|
"rewards/tag_count_reward": 0.3427734375, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 547.35546875, |
|
"epoch": 0.1535181236673774, |
|
"grad_norm": 137.8433380126953, |
|
"kl": 1.69921875, |
|
"learning_rate": 1.98392958859863e-05, |
|
"loss": 0.0645, |
|
"reward": 0.3623046875, |
|
"reward_std": 0.21899614110589027, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.0078125, |
|
"rewards/tag_count_reward": 0.3427734375, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 647.1953125, |
|
"epoch": 0.15778251599147122, |
|
"grad_norm": 45.083709716796875, |
|
"kl": 2.125, |
|
"learning_rate": 1.9811483833941726e-05, |
|
"loss": 0.132, |
|
"reward": 0.3369140625, |
|
"reward_std": 0.17930956557393074, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.00390625, |
|
"rewards/tag_count_reward": 0.3251953125, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 651.08984375, |
|
"epoch": 0.16204690831556504, |
|
"grad_norm": 35.52851104736328, |
|
"kl": 1.908203125, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.098, |
|
"reward": 0.345703125, |
|
"reward_std": 0.23194141685962677, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.00390625, |
|
"rewards/tag_count_reward": 0.330078125, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 648.5234375, |
|
"epoch": 0.16631130063965885, |
|
"grad_norm": 22.79519271850586, |
|
"kl": 2.15625, |
|
"learning_rate": 1.9749279121818235e-05, |
|
"loss": 0.058, |
|
"reward": 0.3427734375, |
|
"reward_std": 0.21174855902791023, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.00390625, |
|
"rewards/tag_count_reward": 0.3349609375, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 725.0078125, |
|
"epoch": 0.17057569296375266, |
|
"grad_norm": 1158.721923828125, |
|
"kl": 26.59375, |
|
"learning_rate": 1.9714900382928674e-05, |
|
"loss": 1.3132, |
|
"reward": 0.3447265625, |
|
"reward_std": 0.24672244489192963, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.00390625, |
|
"rewards/tag_count_reward": 0.3212890625, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 674.56640625, |
|
"epoch": 0.17484008528784648, |
|
"grad_norm": 10.722182273864746, |
|
"kl": 2.39453125, |
|
"learning_rate": 1.9678347484506667e-05, |
|
"loss": 0.0955, |
|
"reward": 0.3056640625, |
|
"reward_std": 0.2298230677843094, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.2900390625, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 716.73046875, |
|
"epoch": 0.1791044776119403, |
|
"grad_norm": 22.29323387145996, |
|
"kl": 2.66015625, |
|
"learning_rate": 1.9639628606958535e-05, |
|
"loss": 0.1796, |
|
"reward": 0.2939453125, |
|
"reward_std": 0.2149362936615944, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.2939453125, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 770.56640625, |
|
"epoch": 0.18336886993603413, |
|
"grad_norm": 5175.0302734375, |
|
"kl": 168.125, |
|
"learning_rate": 1.9598752415428893e-05, |
|
"loss": 8.1192, |
|
"reward": 0.3203125, |
|
"reward_std": 0.19998998567461967, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3203125, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 815.19140625, |
|
"epoch": 0.18763326226012794, |
|
"grad_norm": 59.13957977294922, |
|
"kl": 2.7734375, |
|
"learning_rate": 1.955572805786141e-05, |
|
"loss": 0.1392, |
|
"reward": 0.3330078125, |
|
"reward_std": 0.19636105746030807, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3330078125, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 823.13671875, |
|
"epoch": 0.19189765458422176, |
|
"grad_norm": 56.28653335571289, |
|
"kl": 2.48046875, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 0.1406, |
|
"reward": 0.3310546875, |
|
"reward_std": 0.20274027064442635, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3310546875, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 809.56640625, |
|
"epoch": 0.19616204690831557, |
|
"grad_norm": 13.485371589660645, |
|
"kl": 2.076171875, |
|
"learning_rate": 1.9463273837991643e-05, |
|
"loss": 0.1466, |
|
"reward": 0.3271484375, |
|
"reward_std": 0.2578311152756214, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.0078125, |
|
"rewards/tag_count_reward": 0.3115234375, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 839.78125, |
|
"epoch": 0.20042643923240938, |
|
"grad_norm": 6.2021918296813965, |
|
"kl": 1.67578125, |
|
"learning_rate": 1.9413864666609036e-05, |
|
"loss": 0.1194, |
|
"reward": 0.3603515625, |
|
"reward_std": 0.21874134615063667, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0078125, |
|
"rewards/tag_count_reward": 0.3525390625, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 935.88671875, |
|
"epoch": 0.2046908315565032, |
|
"grad_norm": 42.63424301147461, |
|
"kl": 1.73828125, |
|
"learning_rate": 1.9362348706397374e-05, |
|
"loss": 0.1158, |
|
"reward": 0.3427734375, |
|
"reward_std": 0.23805152624845505, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3349609375, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 923.13671875, |
|
"epoch": 0.208955223880597, |
|
"grad_norm": 20.886306762695312, |
|
"kl": 1.84765625, |
|
"learning_rate": 1.9308737486442045e-05, |
|
"loss": 0.0876, |
|
"reward": 0.365234375, |
|
"reward_std": 0.24175361543893814, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.0078125, |
|
"rewards/tag_count_reward": 0.349609375, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 913.77734375, |
|
"epoch": 0.21321961620469082, |
|
"grad_norm": 1.078397512435913, |
|
"kl": 0.9013671875, |
|
"learning_rate": 1.9253043004739967e-05, |
|
"loss": 0.0613, |
|
"reward": 0.3681640625, |
|
"reward_std": 0.2707056328654289, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0234375, |
|
"rewards/tag_count_reward": 0.3447265625, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 924.3671875, |
|
"epoch": 0.21748400852878466, |
|
"grad_norm": 139370.5625, |
|
"kl": 772.5048828125, |
|
"learning_rate": 1.919527772551451e-05, |
|
"loss": 34.5749, |
|
"reward": 0.3916015625, |
|
"reward_std": 0.2611350491642952, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.01953125, |
|
"rewards/tag_count_reward": 0.3681640625, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 966.453125, |
|
"epoch": 0.22174840085287847, |
|
"grad_norm": 2.575350284576416, |
|
"kl": 0.6396484375, |
|
"learning_rate": 1.913545457642601e-05, |
|
"loss": 0.0048, |
|
"reward": 0.3994140625, |
|
"reward_std": 0.26176000386476517, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.015625, |
|
"rewards/tag_count_reward": 0.3759765625, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 935.90625, |
|
"epoch": 0.2260127931769723, |
|
"grad_norm": 0.6655358672142029, |
|
"kl": 0.625, |
|
"learning_rate": 1.907358694567865e-05, |
|
"loss": 0.0332, |
|
"reward": 0.3818359375, |
|
"reward_std": 0.30670569837093353, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0234375, |
|
"rewards/tag_count_reward": 0.3427734375, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 981.7109375, |
|
"epoch": 0.2302771855010661, |
|
"grad_norm": 0.49440306425094604, |
|
"kl": 1.32421875, |
|
"learning_rate": 1.900968867902419e-05, |
|
"loss": 0.05, |
|
"reward": 0.37890625, |
|
"reward_std": 0.30825207754969597, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.04296875, |
|
"rewards/tag_count_reward": 0.33203125, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1006.4765625, |
|
"epoch": 0.2345415778251599, |
|
"grad_norm": 0.6174039840698242, |
|
"kl": 3.51171875, |
|
"learning_rate": 1.8943774076663372e-05, |
|
"loss": 0.1368, |
|
"reward": 0.55078125, |
|
"reward_std": 0.45677174627780914, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.09375, |
|
"rewards/tag_count_reward": 0.421875, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 993.33984375, |
|
"epoch": 0.23880597014925373, |
|
"grad_norm": 0.6529553532600403, |
|
"kl": 4.8984375, |
|
"learning_rate": 1.8875857890045544e-05, |
|
"loss": 0.1925, |
|
"reward": 0.7568359375, |
|
"reward_std": 0.5871296375989914, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.171875, |
|
"rewards/tag_count_reward": 0.5654296875, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1012.4765625, |
|
"epoch": 0.24307036247334754, |
|
"grad_norm": 133.8551788330078, |
|
"kl": 4.53515625, |
|
"learning_rate": 1.880595531856738e-05, |
|
"loss": 0.1758, |
|
"reward": 0.806640625, |
|
"reward_std": 0.6075598150491714, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.22265625, |
|
"rewards/tag_count_reward": 0.548828125, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 996.9453125, |
|
"epoch": 0.24733475479744135, |
|
"grad_norm": 0.891619861125946, |
|
"kl": 2.80078125, |
|
"learning_rate": 1.87340820061713e-05, |
|
"loss": 0.0914, |
|
"reward": 0.732421875, |
|
"reward_std": 0.6083860993385315, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.2265625, |
|
"rewards/tag_count_reward": 0.498046875, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 967.28515625, |
|
"epoch": 0.2515991471215352, |
|
"grad_norm": 1.4940592050552368, |
|
"kl": 3.5234375, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 0.1145, |
|
"reward": 0.79296875, |
|
"reward_std": 0.6623349040746689, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.2890625, |
|
"rewards/tag_count_reward": 0.5, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 911.89453125, |
|
"epoch": 0.255863539445629, |
|
"grad_norm": 25.662094116210938, |
|
"kl": 4.6328125, |
|
"learning_rate": 1.8584487936018663e-05, |
|
"loss": 0.1482, |
|
"reward": 0.9228515625, |
|
"reward_std": 0.7258684784173965, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.33203125, |
|
"rewards/tag_count_reward": 0.5400390625, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 838.140625, |
|
"epoch": 0.2601279317697228, |
|
"grad_norm": 23.566726684570312, |
|
"kl": 6.2734375, |
|
"learning_rate": 1.8506800656873397e-05, |
|
"loss": 0.14, |
|
"reward": 0.90625, |
|
"reward_std": 0.6905761212110519, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.32421875, |
|
"rewards/tag_count_reward": 0.53515625, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.88671875, |
|
"epoch": 0.26439232409381663, |
|
"grad_norm": 9.581720352172852, |
|
"kl": 5.3984375, |
|
"learning_rate": 1.8427209586540392e-05, |
|
"loss": 0.0744, |
|
"reward": 0.966796875, |
|
"reward_std": 0.7168334871530533, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.37109375, |
|
"rewards/tag_count_reward": 0.560546875, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 403.3515625, |
|
"epoch": 0.26865671641791045, |
|
"grad_norm": 3.977918863296509, |
|
"kl": 5.6328125, |
|
"learning_rate": 1.834573253721303e-05, |
|
"loss": 0.0664, |
|
"reward": 0.9931640625, |
|
"reward_std": 0.7101524770259857, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.3671875, |
|
"rewards/tag_count_reward": 0.5908203125, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 285.90625, |
|
"epoch": 0.27292110874200426, |
|
"grad_norm": 3.9532861709594727, |
|
"kl": 4.4375, |
|
"learning_rate": 1.826238774315995e-05, |
|
"loss": -0.0383, |
|
"reward": 1.2724609375, |
|
"reward_std": 0.7493992298841476, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.5703125, |
|
"rewards/tag_count_reward": 0.6552734375, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 219.54296875, |
|
"epoch": 0.2771855010660981, |
|
"grad_norm": 9.081878662109375, |
|
"kl": 5.0, |
|
"learning_rate": 1.8177193856644315e-05, |
|
"loss": 0.029, |
|
"reward": 1.5458984375, |
|
"reward_std": 0.7314303368330002, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.69921875, |
|
"rewards/tag_count_reward": 0.8076171875, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 172.1484375, |
|
"epoch": 0.2814498933901919, |
|
"grad_norm": 123.24443817138672, |
|
"kl": 8.5546875, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 0.1148, |
|
"reward": 1.3759765625, |
|
"reward_std": 0.7652620077133179, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.6015625, |
|
"rewards/tag_count_reward": 0.7705078125, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 155.49609375, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 6.339594841003418, |
|
"kl": 5.8203125, |
|
"learning_rate": 1.8001335480112067e-05, |
|
"loss": 0.0783, |
|
"reward": 1.376953125, |
|
"reward_std": 0.6871647387742996, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.56640625, |
|
"rewards/tag_count_reward": 0.810546875, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 154.2734375, |
|
"epoch": 0.2899786780383795, |
|
"grad_norm": 181.54469299316406, |
|
"kl": 9.875, |
|
"learning_rate": 1.7910710346563417e-05, |
|
"loss": 0.2282, |
|
"reward": 1.689453125, |
|
"reward_std": 0.574043981730938, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.80078125, |
|
"rewards/tag_count_reward": 0.869140625, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 156.546875, |
|
"epoch": 0.2942430703624733, |
|
"grad_norm": 23.90792465209961, |
|
"kl": 4.12890625, |
|
"learning_rate": 1.78183148246803e-05, |
|
"loss": -0.0867, |
|
"reward": 1.49609375, |
|
"reward_std": 0.7586368173360825, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.703125, |
|
"rewards/tag_count_reward": 0.78125, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 154.2109375, |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 6.664966106414795, |
|
"kl": 4.35546875, |
|
"learning_rate": 1.7724169592245996e-05, |
|
"loss": -0.1123, |
|
"reward": 1.3564453125, |
|
"reward_std": 0.7493429481983185, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.62890625, |
|
"rewards/tag_count_reward": 0.7275390625, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 135.6171875, |
|
"epoch": 0.302771855010661, |
|
"grad_norm": 520.9791259765625, |
|
"kl": 14.2890625, |
|
"learning_rate": 1.7628295718622666e-05, |
|
"loss": 0.2477, |
|
"reward": 1.4765625, |
|
"reward_std": 0.7755448371171951, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.67578125, |
|
"rewards/tag_count_reward": 0.765625, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 137.9296875, |
|
"epoch": 0.3070362473347548, |
|
"grad_norm": 9.297532081604004, |
|
"kl": 4.171875, |
|
"learning_rate": 1.7530714660036112e-05, |
|
"loss": -0.0591, |
|
"reward": 1.576171875, |
|
"reward_std": 0.702255368232727, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.7265625, |
|
"rewards/tag_count_reward": 0.810546875, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 129.53515625, |
|
"epoch": 0.31130063965884863, |
|
"grad_norm": 481.652099609375, |
|
"kl": 26.50390625, |
|
"learning_rate": 1.7431448254773943e-05, |
|
"loss": 0.4083, |
|
"reward": 1.6875, |
|
"reward_std": 0.5314841717481613, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.8125, |
|
"rewards/tag_count_reward": 0.87109375, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 138.23046875, |
|
"epoch": 0.31556503198294245, |
|
"grad_norm": 7.763786315917969, |
|
"kl": 4.296875, |
|
"learning_rate": 1.7330518718298263e-05, |
|
"loss": 0.0646, |
|
"reward": 1.7587890625, |
|
"reward_std": 0.5207200050354004, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.83203125, |
|
"rewards/tag_count_reward": 0.8955078125, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 128.81640625, |
|
"epoch": 0.31982942430703626, |
|
"grad_norm": 17.239259719848633, |
|
"kl": 5.734375, |
|
"learning_rate": 1.7227948638273918e-05, |
|
"loss": 0.135, |
|
"reward": 1.919921875, |
|
"reward_std": 0.3152705281972885, |
|
"rewards/accuracy_reward": 0.04296875, |
|
"rewards/format_reward": 0.921875, |
|
"rewards/tag_count_reward": 0.955078125, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 111.79296875, |
|
"epoch": 0.32409381663113007, |
|
"grad_norm": 8.510507583618164, |
|
"kl": 4.6171875, |
|
"learning_rate": 1.712376096951345e-05, |
|
"loss": 0.117, |
|
"reward": 1.900390625, |
|
"reward_std": 0.32303596287965775, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.91796875, |
|
"rewards/tag_count_reward": 0.943359375, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 124.20703125, |
|
"epoch": 0.3283582089552239, |
|
"grad_norm": 13.03882884979248, |
|
"kl": 5.109375, |
|
"learning_rate": 1.7017979028839918e-05, |
|
"loss": 0.1378, |
|
"reward": 1.8974609375, |
|
"reward_std": 0.3129582107067108, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.91796875, |
|
"rewards/tag_count_reward": 0.9482421875, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 114.91015625, |
|
"epoch": 0.3326226012793177, |
|
"grad_norm": 14.898639678955078, |
|
"kl": 5.84375, |
|
"learning_rate": 1.691062648986865e-05, |
|
"loss": 0.1679, |
|
"reward": 1.8828125, |
|
"reward_std": 0.28449319303035736, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.92578125, |
|
"rewards/tag_count_reward": 0.953125, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 116.80859375, |
|
"epoch": 0.3368869936034115, |
|
"grad_norm": 41.616432189941406, |
|
"kl": 6.03125, |
|
"learning_rate": 1.6801727377709195e-05, |
|
"loss": 0.1764, |
|
"reward": 1.939453125, |
|
"reward_std": 0.24111925438046455, |
|
"rewards/accuracy_reward": 0.0234375, |
|
"rewards/format_reward": 0.9453125, |
|
"rewards/tag_count_reward": 0.970703125, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 178.0546875, |
|
"epoch": 0.3411513859275053, |
|
"grad_norm": 40.391815185546875, |
|
"kl": 7.078125, |
|
"learning_rate": 1.6691306063588583e-05, |
|
"loss": 0.1877, |
|
"reward": 1.896484375, |
|
"reward_std": 0.24799961294047534, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.94140625, |
|
"rewards/tag_count_reward": 0.955078125, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 207.1484375, |
|
"epoch": 0.34541577825159914, |
|
"grad_norm": 53.5943489074707, |
|
"kl": 7.96875, |
|
"learning_rate": 1.657938725939713e-05, |
|
"loss": 0.2265, |
|
"reward": 1.8984375, |
|
"reward_std": 0.25493185594677925, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.94140625, |
|
"rewards/tag_count_reward": 0.953125, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.5625, |
|
"epoch": 0.34968017057569295, |
|
"grad_norm": 9.949625015258789, |
|
"kl": 6.1015625, |
|
"learning_rate": 1.6465996012157996e-05, |
|
"loss": 0.1384, |
|
"reward": 1.8525390625, |
|
"reward_std": 0.3406095430254936, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.9140625, |
|
"rewards/tag_count_reward": 0.9384765625, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 232.71875, |
|
"epoch": 0.35394456289978676, |
|
"grad_norm": 11.608429908752441, |
|
"kl": 5.5390625, |
|
"learning_rate": 1.635115769842179e-05, |
|
"loss": 0.1642, |
|
"reward": 1.8603515625, |
|
"reward_std": 0.319538950920105, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.90234375, |
|
"rewards/tag_count_reward": 0.9541015625, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 196.1328125, |
|
"epoch": 0.3582089552238806, |
|
"grad_norm": 7.774857044219971, |
|
"kl": 5.359375, |
|
"learning_rate": 1.6234898018587336e-05, |
|
"loss": 0.1484, |
|
"reward": 1.841796875, |
|
"reward_std": 0.35672812163829803, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.89453125, |
|
"rewards/tag_count_reward": 0.947265625, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 184.19921875, |
|
"epoch": 0.3624733475479744, |
|
"grad_norm": 5.315440654754639, |
|
"kl": 5.0234375, |
|
"learning_rate": 1.6117242991150064e-05, |
|
"loss": 0.1525, |
|
"reward": 1.9189453125, |
|
"reward_std": 0.21933256834745407, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.94921875, |
|
"rewards/tag_count_reward": 0.9619140625, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 193.97265625, |
|
"epoch": 0.36673773987206826, |
|
"grad_norm": 5.736013889312744, |
|
"kl": 5.515625, |
|
"learning_rate": 1.599821894687914e-05, |
|
"loss": 0.1875, |
|
"reward": 1.9306640625, |
|
"reward_std": 0.20439787581562996, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.94921875, |
|
"rewards/tag_count_reward": 0.9736328125, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 297.3984375, |
|
"epoch": 0.37100213219616207, |
|
"grad_norm": 8.90512752532959, |
|
"kl": 5.5703125, |
|
"learning_rate": 1.5877852522924733e-05, |
|
"loss": 0.1698, |
|
"reward": 1.91015625, |
|
"reward_std": 0.33383994549512863, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.91796875, |
|
"rewards/tag_count_reward": 0.953125, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 293.015625, |
|
"epoch": 0.3752665245202559, |
|
"grad_norm": 102.06912231445312, |
|
"kl": 10.2578125, |
|
"learning_rate": 1.575617065685674e-05, |
|
"loss": 0.3011, |
|
"reward": 1.890625, |
|
"reward_std": 0.31814195960760117, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.90234375, |
|
"rewards/tag_count_reward": 0.953125, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 336.6796875, |
|
"epoch": 0.3795309168443497, |
|
"grad_norm": 108.87593841552734, |
|
"kl": 11.6640625, |
|
"learning_rate": 1.563320058063622e-05, |
|
"loss": 0.2676, |
|
"reward": 1.8369140625, |
|
"reward_std": 0.38644537329673767, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.8671875, |
|
"rewards/tag_count_reward": 0.9345703125, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 409.29296875, |
|
"epoch": 0.3837953091684435, |
|
"grad_norm": 35.86373519897461, |
|
"kl": 8.4296875, |
|
"learning_rate": 1.5508969814521026e-05, |
|
"loss": 0.2346, |
|
"reward": 1.8154296875, |
|
"reward_std": 0.4089268818497658, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.87109375, |
|
"rewards/tag_count_reward": 0.9404296875, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 419.03125, |
|
"epoch": 0.3880597014925373, |
|
"grad_norm": 4.703104496002197, |
|
"kl": 5.8359375, |
|
"learning_rate": 1.5383506160906826e-05, |
|
"loss": 0.1736, |
|
"reward": 1.8583984375, |
|
"reward_std": 0.37071677297353745, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.88671875, |
|
"rewards/tag_count_reward": 0.9521484375, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 498.890625, |
|
"epoch": 0.39232409381663114, |
|
"grad_norm": 15.804770469665527, |
|
"kl": 6.359375, |
|
"learning_rate": 1.5256837698105047e-05, |
|
"loss": 0.2056, |
|
"reward": 1.896484375, |
|
"reward_std": 0.264212965965271, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.921875, |
|
"rewards/tag_count_reward": 0.962890625, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 541.484375, |
|
"epoch": 0.39658848614072495, |
|
"grad_norm": 43.44738006591797, |
|
"kl": 7.046875, |
|
"learning_rate": 1.5128992774059063e-05, |
|
"loss": 0.1825, |
|
"reward": 1.84375, |
|
"reward_std": 0.37193765491247177, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.890625, |
|
"rewards/tag_count_reward": 0.94140625, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 690.60546875, |
|
"epoch": 0.40085287846481876, |
|
"grad_norm": 3.9883878231048584, |
|
"kl": 5.6875, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.1577, |
|
"reward": 1.796875, |
|
"reward_std": 0.449543721973896, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.8515625, |
|
"rewards/tag_count_reward": 0.9140625, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 710.62890625, |
|
"epoch": 0.4051172707889126, |
|
"grad_norm": 13.03452205657959, |
|
"kl": 5.1953125, |
|
"learning_rate": 1.4869888244043674e-05, |
|
"loss": 0.1824, |
|
"reward": 1.794921875, |
|
"reward_std": 0.44430477917194366, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.859375, |
|
"rewards/tag_count_reward": 0.927734375, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 679.2578125, |
|
"epoch": 0.4093816631130064, |
|
"grad_norm": 4.490772724151611, |
|
"kl": 5.296875, |
|
"learning_rate": 1.4738686624729987e-05, |
|
"loss": 0.1653, |
|
"reward": 1.80859375, |
|
"reward_std": 0.35829880461096764, |
|
"rewards/accuracy_reward": 0.02734375, |
|
"rewards/format_reward": 0.8515625, |
|
"rewards/tag_count_reward": 0.9296875, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 880.9765625, |
|
"epoch": 0.4136460554371002, |
|
"grad_norm": 150.7144317626953, |
|
"kl": 6.5390625, |
|
"learning_rate": 1.4606424504506325e-05, |
|
"loss": 0.2454, |
|
"reward": 1.5869140625, |
|
"reward_std": 0.5404268652200699, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.76953125, |
|
"rewards/tag_count_reward": 0.7822265625, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 945.01953125, |
|
"epoch": 0.417910447761194, |
|
"grad_norm": 216.19607543945312, |
|
"kl": 15.546875, |
|
"learning_rate": 1.4473131483156326e-05, |
|
"loss": 0.3319, |
|
"reward": 1.4111328125, |
|
"reward_std": 0.6219311505556107, |
|
"rewards/accuracy_reward": 0.0234375, |
|
"rewards/format_reward": 0.6875, |
|
"rewards/tag_count_reward": 0.7001953125, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 988.53515625, |
|
"epoch": 0.42217484008528783, |
|
"grad_norm": 137.95619201660156, |
|
"kl": 6.5, |
|
"learning_rate": 1.4338837391175582e-05, |
|
"loss": 0.2236, |
|
"reward": 1.45703125, |
|
"reward_std": 0.6083492934703827, |
|
"rewards/accuracy_reward": 0.0234375, |
|
"rewards/format_reward": 0.70703125, |
|
"rewards/tag_count_reward": 0.7265625, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 985.140625, |
|
"epoch": 0.42643923240938164, |
|
"grad_norm": 7.844208240509033, |
|
"kl": 1.84375, |
|
"learning_rate": 1.4203572283095657e-05, |
|
"loss": 0.039, |
|
"reward": 1.4658203125, |
|
"reward_std": 0.6321621090173721, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.69140625, |
|
"rewards/tag_count_reward": 0.7431640625, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 992.00390625, |
|
"epoch": 0.43070362473347545, |
|
"grad_norm": 13.683513641357422, |
|
"kl": 1.30078125, |
|
"learning_rate": 1.4067366430758004e-05, |
|
"loss": 0.0205, |
|
"reward": 1.4462890625, |
|
"reward_std": 0.6124080866575241, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.71875, |
|
"rewards/tag_count_reward": 0.6806640625, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 998.76171875, |
|
"epoch": 0.4349680170575693, |
|
"grad_norm": 30.369285583496094, |
|
"kl": 1.1240234375, |
|
"learning_rate": 1.3930250316539237e-05, |
|
"loss": 0.0296, |
|
"reward": 1.4365234375, |
|
"reward_std": 0.6077300161123276, |
|
"rewards/accuracy_reward": 0.02734375, |
|
"rewards/format_reward": 0.7109375, |
|
"rewards/tag_count_reward": 0.6982421875, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1007.33984375, |
|
"epoch": 0.43923240938166314, |
|
"grad_norm": 22.23171615600586, |
|
"kl": 3.173828125, |
|
"learning_rate": 1.3792254626529286e-05, |
|
"loss": 0.1053, |
|
"reward": 1.39453125, |
|
"reward_std": 0.5931012779474258, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.68359375, |
|
"rewards/tag_count_reward": 0.6796875, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1018.3046875, |
|
"epoch": 0.44349680170575695, |
|
"grad_norm": 32.721920013427734, |
|
"kl": 2.7041015625, |
|
"learning_rate": 1.3653410243663953e-05, |
|
"loss": 0.1054, |
|
"reward": 1.4375, |
|
"reward_std": 0.5352352559566498, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.74609375, |
|
"rewards/tag_count_reward": 0.68359375, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 983.671875, |
|
"epoch": 0.44776119402985076, |
|
"grad_norm": 21.012828826904297, |
|
"kl": 2.029296875, |
|
"learning_rate": 1.3513748240813429e-05, |
|
"loss": 0.0617, |
|
"reward": 1.44140625, |
|
"reward_std": 0.5487575381994247, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.71484375, |
|
"rewards/tag_count_reward": 0.69140625, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1009.48828125, |
|
"epoch": 0.4520255863539446, |
|
"grad_norm": 8.625335693359375, |
|
"kl": 1.4072265625, |
|
"learning_rate": 1.3373299873828303e-05, |
|
"loss": 0.0492, |
|
"reward": 1.4296875, |
|
"reward_std": 0.5546326637268066, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.703125, |
|
"rewards/tag_count_reward": 0.69140625, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1024.0, |
|
"epoch": 0.4562899786780384, |
|
"grad_norm": 3.2243165969848633, |
|
"kl": 0.8115234375, |
|
"learning_rate": 1.3232096574544602e-05, |
|
"loss": 0.0324, |
|
"reward": 1.462890625, |
|
"reward_std": 0.5354997888207436, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.72265625, |
|
"rewards/tag_count_reward": 0.701171875, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1020.35546875, |
|
"epoch": 0.4605543710021322, |
|
"grad_norm": 1.6773790121078491, |
|
"kl": 0.9384765625, |
|
"learning_rate": 1.3090169943749475e-05, |
|
"loss": 0.0316, |
|
"reward": 1.3544921875, |
|
"reward_std": 0.6066916137933731, |
|
"rewards/accuracy_reward": 0.02734375, |
|
"rewards/format_reward": 0.6328125, |
|
"rewards/tag_count_reward": 0.6943359375, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 934.0625, |
|
"epoch": 0.464818763326226, |
|
"grad_norm": 0.25028663873672485, |
|
"kl": 3.5703125, |
|
"learning_rate": 1.2947551744109044e-05, |
|
"loss": 0.1428, |
|
"reward": 1.7275390625, |
|
"reward_std": 0.30690931528806686, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.94140625, |
|
"rewards/tag_count_reward": 0.7158203125, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 894.3125, |
|
"epoch": 0.4690831556503198, |
|
"grad_norm": 0.25236231088638306, |
|
"kl": 4.31640625, |
|
"learning_rate": 1.2804273893060028e-05, |
|
"loss": 0.1724, |
|
"reward": 1.5966796875, |
|
"reward_std": 0.3756791800260544, |
|
"rewards/accuracy_reward": 0.0234375, |
|
"rewards/format_reward": 0.86328125, |
|
"rewards/tag_count_reward": 0.7099609375, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 843.46875, |
|
"epoch": 0.47334754797441364, |
|
"grad_norm": 0.30303165316581726, |
|
"kl": 4.3515625, |
|
"learning_rate": 1.2660368455666752e-05, |
|
"loss": 0.174, |
|
"reward": 1.6923828125, |
|
"reward_std": 0.36458854377269745, |
|
"rewards/accuracy_reward": 0.05859375, |
|
"rewards/format_reward": 0.9296875, |
|
"rewards/tag_count_reward": 0.7041015625, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 846.1875, |
|
"epoch": 0.47761194029850745, |
|
"grad_norm": 0.30785125494003296, |
|
"kl": 4.8515625, |
|
"learning_rate": 1.2515867637445088e-05, |
|
"loss": 0.1944, |
|
"reward": 1.65234375, |
|
"reward_std": 0.35947033017873764, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.91015625, |
|
"rewards/tag_count_reward": 0.7421875, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1001.59375, |
|
"epoch": 0.48187633262260127, |
|
"grad_norm": 0.5503849983215332, |
|
"kl": 3.1875, |
|
"learning_rate": 1.2370803777154976e-05, |
|
"loss": 0.1275, |
|
"reward": 0.92578125, |
|
"reward_std": 0.40457524359226227, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.171875, |
|
"rewards/tag_count_reward": 0.73828125, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 742.28125, |
|
"epoch": 0.4861407249466951, |
|
"grad_norm": 0.6809885501861572, |
|
"kl": 5.1484375, |
|
"learning_rate": 1.2225209339563144e-05, |
|
"loss": 0.2059, |
|
"reward": 1.7626953125, |
|
"reward_std": 0.38498707860708237, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.9140625, |
|
"rewards/tag_count_reward": 0.8095703125, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 575.97265625, |
|
"epoch": 0.4904051172707889, |
|
"grad_norm": 0.8168994784355164, |
|
"kl": 4.984375, |
|
"learning_rate": 1.2079116908177592e-05, |
|
"loss": 0.1925, |
|
"reward": 1.8603515625, |
|
"reward_std": 0.43856722861528397, |
|
"rewards/accuracy_reward": 0.06640625, |
|
"rewards/format_reward": 0.84765625, |
|
"rewards/tag_count_reward": 0.9462890625, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 859.1875, |
|
"epoch": 0.4946695095948827, |
|
"grad_norm": 0.32246819138526917, |
|
"kl": 4.65625, |
|
"learning_rate": 1.1932559177955533e-05, |
|
"loss": 0.1858, |
|
"reward": 1.6337890625, |
|
"reward_std": 0.3074583485722542, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.890625, |
|
"rewards/tag_count_reward": 0.7314453125, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1009.09375, |
|
"epoch": 0.4989339019189765, |
|
"grad_norm": 0.2883855402469635, |
|
"kl": 2.7890625, |
|
"learning_rate": 1.1785568947986368e-05, |
|
"loss": 0.1117, |
|
"reward": 1.8154296875, |
|
"reward_std": 0.2801100164651871, |
|
"rewards/accuracy_reward": 0.12109375, |
|
"rewards/format_reward": 0.9609375, |
|
"rewards/tag_count_reward": 0.7333984375, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1024.0, |
|
"epoch": 0.5031982942430704, |
|
"grad_norm": 0.5340821146965027, |
|
"kl": 2.125, |
|
"learning_rate": 1.1638179114151378e-05, |
|
"loss": 0.0849, |
|
"reward": 1.6708984375, |
|
"reward_std": 0.2677147090435028, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.92578125, |
|
"rewards/tag_count_reward": 0.7255859375, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 951.5625, |
|
"epoch": 0.5074626865671642, |
|
"grad_norm": 0.3258584141731262, |
|
"kl": 2.87109375, |
|
"learning_rate": 1.1490422661761744e-05, |
|
"loss": 0.1149, |
|
"reward": 1.71875, |
|
"reward_std": 0.1409970298409462, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.96484375, |
|
"rewards/tag_count_reward": 0.7421875, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 948.03125, |
|
"epoch": 0.511727078891258, |
|
"grad_norm": 0.20516642928123474, |
|
"kl": 2.56640625, |
|
"learning_rate": 1.1342332658176556e-05, |
|
"loss": 0.1026, |
|
"reward": 1.7783203125, |
|
"reward_std": 0.21998512372374535, |
|
"rewards/accuracy_reward": 0.07421875, |
|
"rewards/format_reward": 0.96484375, |
|
"rewards/tag_count_reward": 0.7392578125, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 949.3125, |
|
"epoch": 0.5159914712153518, |
|
"grad_norm": 0.15999875962734222, |
|
"kl": 2.73828125, |
|
"learning_rate": 1.1193942245402443e-05, |
|
"loss": 0.1093, |
|
"reward": 1.7880859375, |
|
"reward_std": 0.1588208805769682, |
|
"rewards/accuracy_reward": 0.078125, |
|
"rewards/format_reward": 0.96484375, |
|
"rewards/tag_count_reward": 0.7451171875, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 996.125, |
|
"epoch": 0.5202558635394456, |
|
"grad_norm": 0.2390127331018448, |
|
"kl": 2.75, |
|
"learning_rate": 1.1045284632676535e-05, |
|
"loss": 0.1098, |
|
"reward": 1.76171875, |
|
"reward_std": 0.22831767983734608, |
|
"rewards/accuracy_reward": 0.078125, |
|
"rewards/format_reward": 0.953125, |
|
"rewards/tag_count_reward": 0.73046875, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 972.6796875, |
|
"epoch": 0.5245202558635395, |
|
"grad_norm": 4.8665361404418945, |
|
"kl": 3.66796875, |
|
"learning_rate": 1.0896393089034336e-05, |
|
"loss": 0.1313, |
|
"reward": 1.6845703125, |
|
"reward_std": 0.36246033012866974, |
|
"rewards/accuracy_reward": 0.09375, |
|
"rewards/format_reward": 0.88671875, |
|
"rewards/tag_count_reward": 0.7041015625, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 888.3046875, |
|
"epoch": 0.5287846481876333, |
|
"grad_norm": 3.6568827629089355, |
|
"kl": 2.06640625, |
|
"learning_rate": 1.0747300935864245e-05, |
|
"loss": 0.0925, |
|
"reward": 1.68359375, |
|
"reward_std": 0.4343060404062271, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.8828125, |
|
"rewards/tag_count_reward": 0.75, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 516.43359375, |
|
"epoch": 0.5330490405117271, |
|
"grad_norm": 0.5316474437713623, |
|
"kl": 0.61767578125, |
|
"learning_rate": 1.0598041539450344e-05, |
|
"loss": 0.2445, |
|
"reward": 1.7685546875, |
|
"reward_std": 0.4253704324364662, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.89453125, |
|
"rewards/tag_count_reward": 0.8349609375, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 473.5546875, |
|
"epoch": 0.5373134328358209, |
|
"grad_norm": 2.9844846725463867, |
|
"kl": 0.8173828125, |
|
"learning_rate": 1.044864830350515e-05, |
|
"loss": 0.2749, |
|
"reward": 1.833984375, |
|
"reward_std": 0.523324653506279, |
|
"rewards/accuracy_reward": 0.10546875, |
|
"rewards/format_reward": 0.84375, |
|
"rewards/tag_count_reward": 0.884765625, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 450.921875, |
|
"epoch": 0.5415778251599147, |
|
"grad_norm": 0.5743687748908997, |
|
"kl": 0.84765625, |
|
"learning_rate": 1.0299154661693987e-05, |
|
"loss": 0.2714, |
|
"reward": 1.771484375, |
|
"reward_std": 0.5503488332033157, |
|
"rewards/accuracy_reward": 0.05859375, |
|
"rewards/format_reward": 0.83203125, |
|
"rewards/tag_count_reward": 0.880859375, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 488.37109375, |
|
"epoch": 0.5458422174840085, |
|
"grad_norm": 6.451872825622559, |
|
"kl": 1.234375, |
|
"learning_rate": 1.0149594070152638e-05, |
|
"loss": 0.3969, |
|
"reward": 1.7021484375, |
|
"reward_std": 0.6392623782157898, |
|
"rewards/accuracy_reward": 0.06640625, |
|
"rewards/format_reward": 0.78125, |
|
"rewards/tag_count_reward": 0.8544921875, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 466.671875, |
|
"epoch": 0.5501066098081023, |
|
"grad_norm": 2.481407403945923, |
|
"kl": 0.986328125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4003, |
|
"reward": 1.615234375, |
|
"reward_std": 0.6238291710615158, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.7421875, |
|
"rewards/tag_count_reward": 0.837890625, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 414.88671875, |
|
"epoch": 0.5543710021321961, |
|
"grad_norm": 4.6952948570251465, |
|
"kl": 0.55078125, |
|
"learning_rate": 9.850405929847367e-06, |
|
"loss": 0.413, |
|
"reward": 1.654296875, |
|
"reward_std": 0.6841937601566315, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.7421875, |
|
"rewards/tag_count_reward": 0.841796875, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 374.3828125, |
|
"epoch": 0.55863539445629, |
|
"grad_norm": 10.949110984802246, |
|
"kl": 0.5146484375, |
|
"learning_rate": 9.700845338306018e-06, |
|
"loss": 0.4342, |
|
"reward": 1.7568359375, |
|
"reward_std": 0.5640043765306473, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.81640625, |
|
"rewards/tag_count_reward": 0.8935546875, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 379.46875, |
|
"epoch": 0.5628997867803838, |
|
"grad_norm": 7.129451274871826, |
|
"kl": 0.41796875, |
|
"learning_rate": 9.551351696494854e-06, |
|
"loss": 0.462, |
|
"reward": 1.6328125, |
|
"reward_std": 0.7138571888208389, |
|
"rewards/accuracy_reward": 0.07421875, |
|
"rewards/format_reward": 0.7265625, |
|
"rewards/tag_count_reward": 0.83203125, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 378.390625, |
|
"epoch": 0.5671641791044776, |
|
"grad_norm": 5.389857769012451, |
|
"kl": 0.6474609375, |
|
"learning_rate": 9.401958460549658e-06, |
|
"loss": 0.4062, |
|
"reward": 1.69921875, |
|
"reward_std": 0.5953380540013313, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.78515625, |
|
"rewards/tag_count_reward": 0.8828125, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 374.63671875, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 8.098217964172363, |
|
"kl": 0.6748046875, |
|
"learning_rate": 9.252699064135759e-06, |
|
"loss": 0.5274, |
|
"reward": 1.68359375, |
|
"reward_std": 0.6231431663036346, |
|
"rewards/accuracy_reward": 0.04296875, |
|
"rewards/format_reward": 0.76953125, |
|
"rewards/tag_count_reward": 0.87109375, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 371.6953125, |
|
"epoch": 0.5756929637526652, |
|
"grad_norm": 27.047813415527344, |
|
"kl": 6.0146484375, |
|
"learning_rate": 9.103606910965666e-06, |
|
"loss": 0.4173, |
|
"reward": 1.7587890625, |
|
"reward_std": 0.477617509663105, |
|
"rewards/accuracy_reward": 0.09375, |
|
"rewards/format_reward": 0.7890625, |
|
"rewards/tag_count_reward": 0.8759765625, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 388.03125, |
|
"epoch": 0.579957356076759, |
|
"grad_norm": 72.6392822265625, |
|
"kl": 33.333984375, |
|
"learning_rate": 8.954715367323468e-06, |
|
"loss": 0.5359, |
|
"reward": 1.5771484375, |
|
"reward_std": 0.65767702460289, |
|
"rewards/accuracy_reward": 0.0234375, |
|
"rewards/format_reward": 0.71484375, |
|
"rewards/tag_count_reward": 0.8388671875, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 520.26953125, |
|
"epoch": 0.5842217484008528, |
|
"grad_norm": 11.781960487365723, |
|
"kl": 3.19140625, |
|
"learning_rate": 8.806057754597559e-06, |
|
"loss": 0.3497, |
|
"reward": 1.1142578125, |
|
"reward_std": 0.6293385028839111, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.40625, |
|
"rewards/tag_count_reward": 0.6923828125, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.92578125, |
|
"epoch": 0.5884861407249466, |
|
"grad_norm": 10.908761978149414, |
|
"kl": 2.56640625, |
|
"learning_rate": 8.657667341823449e-06, |
|
"loss": 0.089, |
|
"reward": 0.552734375, |
|
"reward_std": 0.22833332046866417, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.05078125, |
|
"rewards/tag_count_reward": 0.501953125, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 164.50390625, |
|
"epoch": 0.5927505330490405, |
|
"grad_norm": 1.4357022047042847, |
|
"kl": 0.38916015625, |
|
"learning_rate": 8.509577338238255e-06, |
|
"loss": 0.3546, |
|
"reward": 0.4619140625, |
|
"reward_std": 0.0770116988569498, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4619140625, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 236.9140625, |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 1.1441797018051147, |
|
"kl": 0.32080078125, |
|
"learning_rate": 8.361820885848623e-06, |
|
"loss": 0.1043, |
|
"reward": 0.3369140625, |
|
"reward_std": 0.11966157145798206, |
|
"rewards/accuracy_reward": 0.00390625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3330078125, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 304.41796875, |
|
"epoch": 0.6012793176972282, |
|
"grad_norm": 6.6605143547058105, |
|
"kl": 0.4736328125, |
|
"learning_rate": 8.214431052013636e-06, |
|
"loss": 0.0359, |
|
"reward": 0.6025390625, |
|
"reward_std": 0.21631848067045212, |
|
"rewards/accuracy_reward": 0.06640625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5361328125, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 291.44921875, |
|
"epoch": 0.605543710021322, |
|
"grad_norm": 29.841733932495117, |
|
"kl": 0.37939453125, |
|
"learning_rate": 8.06744082204447e-06, |
|
"loss": 0.1329, |
|
"reward": 0.7060546875, |
|
"reward_std": 0.25769177079200745, |
|
"rewards/accuracy_reward": 0.1171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5888671875, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 259.703125, |
|
"epoch": 0.6098081023454158, |
|
"grad_norm": 22.494600296020508, |
|
"kl": 1.2724609375, |
|
"learning_rate": 7.92088309182241e-06, |
|
"loss": -0.003, |
|
"reward": 0.61328125, |
|
"reward_std": 0.2214067205786705, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 272.9921875, |
|
"epoch": 0.6140724946695096, |
|
"grad_norm": 3.0637097358703613, |
|
"kl": 0.34814453125, |
|
"learning_rate": 7.774790660436857e-06, |
|
"loss": -0.0925, |
|
"reward": 0.5869140625, |
|
"reward_std": 0.22308824211359024, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5478515625, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 303.21875, |
|
"epoch": 0.6183368869936035, |
|
"grad_norm": 3.038789987564087, |
|
"kl": 0.57373046875, |
|
"learning_rate": 7.629196222845027e-06, |
|
"loss": -0.0695, |
|
"reward": 0.6015625, |
|
"reward_std": 0.1990872472524643, |
|
"rewards/accuracy_reward": 0.02734375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.98828125, |
|
"epoch": 0.6226012793176973, |
|
"grad_norm": 3.982813835144043, |
|
"kl": 1.84765625, |
|
"learning_rate": 7.484132362554915e-06, |
|
"loss": -0.1056, |
|
"reward": 0.61328125, |
|
"reward_std": 0.24527693167328835, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 304.8359375, |
|
"epoch": 0.6268656716417911, |
|
"grad_norm": 1.170094609260559, |
|
"kl": 1.3681640625, |
|
"learning_rate": 7.33963154433325e-06, |
|
"loss": -0.1068, |
|
"reward": 0.609375, |
|
"reward_std": 0.203267153352499, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 344.03125, |
|
"epoch": 0.6311300639658849, |
|
"grad_norm": 3.609171152114868, |
|
"kl": 1.33203125, |
|
"learning_rate": 7.1957261069399745e-06, |
|
"loss": -0.1631, |
|
"reward": 0.6796875, |
|
"reward_std": 0.21126757562160492, |
|
"rewards/accuracy_reward": 0.05859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.62109375, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 312.6640625, |
|
"epoch": 0.6353944562899787, |
|
"grad_norm": 1.0008127689361572, |
|
"kl": 1.52734375, |
|
"learning_rate": 7.052448255890958e-06, |
|
"loss": -0.2083, |
|
"reward": 0.7255859375, |
|
"reward_std": 0.29479434341192245, |
|
"rewards/accuracy_reward": 0.1171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6083984375, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 472.80859375, |
|
"epoch": 0.6396588486140725, |
|
"grad_norm": 0.9753682017326355, |
|
"kl": 0.650390625, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": -0.196, |
|
"reward": 0.748046875, |
|
"reward_std": 0.2531566210091114, |
|
"rewards/accuracy_reward": 0.08984375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.658203125, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 808.75, |
|
"epoch": 0.6439232409381663, |
|
"grad_norm": 0.5874699950218201, |
|
"kl": 0.8330078125, |
|
"learning_rate": 6.767903425455402e-06, |
|
"loss": -0.122, |
|
"reward": 0.6337890625, |
|
"reward_std": 0.2824634090065956, |
|
"rewards/accuracy_reward": 0.12890625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5048828125, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 793.7578125, |
|
"epoch": 0.6481876332622601, |
|
"grad_norm": 0.6901421546936035, |
|
"kl": 1.171875, |
|
"learning_rate": 6.6267001261717015e-06, |
|
"loss": -0.0709, |
|
"reward": 0.70703125, |
|
"reward_std": 0.2805519849061966, |
|
"rewards/accuracy_reward": 0.140625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 625.5, |
|
"epoch": 0.652452025586354, |
|
"grad_norm": 1.2642836570739746, |
|
"kl": 6.0654296875, |
|
"learning_rate": 6.486251759186573e-06, |
|
"loss": -0.1338, |
|
"reward": 0.72265625, |
|
"reward_std": 0.28466814011335373, |
|
"rewards/accuracy_reward": 0.23046875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4921875, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 625.890625, |
|
"epoch": 0.6567164179104478, |
|
"grad_norm": 0.3763836622238159, |
|
"kl": 0.4189453125, |
|
"learning_rate": 6.34658975633605e-06, |
|
"loss": -0.0951, |
|
"reward": 0.697265625, |
|
"reward_std": 0.2551993578672409, |
|
"rewards/accuracy_reward": 0.19921875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.498046875, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 591.17578125, |
|
"epoch": 0.6609808102345416, |
|
"grad_norm": 0.6317035555839539, |
|
"kl": 0.52490234375, |
|
"learning_rate": 6.207745373470717e-06, |
|
"loss": -0.1346, |
|
"reward": 0.7265625, |
|
"reward_std": 0.32422181963920593, |
|
"rewards/accuracy_reward": 0.23046875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.49609375, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 702.796875, |
|
"epoch": 0.6652452025586354, |
|
"grad_norm": 0.3873419165611267, |
|
"kl": 0.81591796875, |
|
"learning_rate": 6.069749683460765e-06, |
|
"loss": -0.1725, |
|
"reward": 0.6943359375, |
|
"reward_std": 0.2808499410748482, |
|
"rewards/accuracy_reward": 0.11328125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5810546875, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 655.75390625, |
|
"epoch": 0.6695095948827292, |
|
"grad_norm": 0.37135419249534607, |
|
"kl": 0.52587890625, |
|
"learning_rate": 5.932633569242e-06, |
|
"loss": -0.041, |
|
"reward": 0.6240234375, |
|
"reward_std": 0.19439143873751163, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5732421875, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 676.12890625, |
|
"epoch": 0.673773987206823, |
|
"grad_norm": 0.3440045118331909, |
|
"kl": 0.49658203125, |
|
"learning_rate": 5.796427716904347e-06, |
|
"loss": -0.1016, |
|
"reward": 0.7001953125, |
|
"reward_std": 0.252426378428936, |
|
"rewards/accuracy_reward": 0.1171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5830078125, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 611.26953125, |
|
"epoch": 0.6780383795309168, |
|
"grad_norm": 0.4159948527812958, |
|
"kl": 0.88134765625, |
|
"learning_rate": 5.66116260882442e-06, |
|
"loss": -0.1264, |
|
"reward": 0.642578125, |
|
"reward_std": 0.18874739110469818, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.623046875, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 470.328125, |
|
"epoch": 0.6823027718550106, |
|
"grad_norm": 0.38798099756240845, |
|
"kl": 0.79736328125, |
|
"learning_rate": 5.526868516843673e-06, |
|
"loss": -0.0926, |
|
"reward": 0.708984375, |
|
"reward_std": 0.22118790447711945, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.658203125, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 447.8125, |
|
"epoch": 0.6865671641791045, |
|
"grad_norm": 0.4147135615348816, |
|
"kl": 0.64111328125, |
|
"learning_rate": 5.393575495493679e-06, |
|
"loss": -0.1075, |
|
"reward": 0.732421875, |
|
"reward_std": 0.18555288948118687, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.662109375, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 459.30078125, |
|
"epoch": 0.6908315565031983, |
|
"grad_norm": 0.5294929146766663, |
|
"kl": 0.58642578125, |
|
"learning_rate": 5.2613133752700145e-06, |
|
"loss": -0.1284, |
|
"reward": 0.66796875, |
|
"reward_std": 0.1776830367743969, |
|
"rewards/accuracy_reward": 0.01171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.65625, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 379.0625, |
|
"epoch": 0.6950959488272921, |
|
"grad_norm": 0.5291323065757751, |
|
"kl": 0.830078125, |
|
"learning_rate": 5.130111755956327e-06, |
|
"loss": -0.1563, |
|
"reward": 0.7099609375, |
|
"reward_std": 0.19628439471125603, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6630859375, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 338.12890625, |
|
"epoch": 0.6993603411513859, |
|
"grad_norm": 0.4588962197303772, |
|
"kl": 0.73583984375, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": -0.1004, |
|
"reward": 0.7626953125, |
|
"reward_std": 0.17791462130844593, |
|
"rewards/accuracy_reward": 0.06640625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6962890625, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 310.578125, |
|
"epoch": 0.7036247334754797, |
|
"grad_norm": 0.8297274708747864, |
|
"kl": 1.7900390625, |
|
"learning_rate": 4.87100722594094e-06, |
|
"loss": -0.0991, |
|
"reward": 0.7421875, |
|
"reward_std": 0.17338587157428265, |
|
"rewards/accuracy_reward": 0.04296875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.69921875, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 316.98828125, |
|
"epoch": 0.7078891257995735, |
|
"grad_norm": 0.9038926362991333, |
|
"kl": 1.57470703125, |
|
"learning_rate": 4.743162301894952e-06, |
|
"loss": -0.0521, |
|
"reward": 0.7744140625, |
|
"reward_std": 0.14399663731455803, |
|
"rewards/accuracy_reward": 0.0625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7119140625, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 307.609375, |
|
"epoch": 0.7121535181236673, |
|
"grad_norm": 3.5091426372528076, |
|
"kl": 1.41943359375, |
|
"learning_rate": 4.616493839093179e-06, |
|
"loss": -0.0394, |
|
"reward": 0.791015625, |
|
"reward_std": 0.1766387764364481, |
|
"rewards/accuracy_reward": 0.09375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.697265625, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 290.44921875, |
|
"epoch": 0.7164179104477612, |
|
"grad_norm": 2.227064847946167, |
|
"kl": 1.279296875, |
|
"learning_rate": 4.491030185478976e-06, |
|
"loss": -0.0156, |
|
"reward": 0.7197265625, |
|
"reward_std": 0.10716542787849903, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7041015625, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 296.54296875, |
|
"epoch": 0.720682302771855, |
|
"grad_norm": 5.702210426330566, |
|
"kl": 1.513671875, |
|
"learning_rate": 4.3667994193637794e-06, |
|
"loss": 0.0234, |
|
"reward": 0.7626953125, |
|
"reward_std": 0.12216670252382755, |
|
"rewards/accuracy_reward": 0.05078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7119140625, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 303.10546875, |
|
"epoch": 0.7249466950959488, |
|
"grad_norm": 0.5832945108413696, |
|
"kl": 0.66162109375, |
|
"learning_rate": 4.2438293431432665e-06, |
|
"loss": 0.0051, |
|
"reward": 0.806640625, |
|
"reward_std": 0.13442331552505493, |
|
"rewards/accuracy_reward": 0.0859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.720703125, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 269.23046875, |
|
"epoch": 0.7292110874200426, |
|
"grad_norm": 1.5207250118255615, |
|
"kl": 0.60546875, |
|
"learning_rate": 4.12214747707527e-06, |
|
"loss": 0.0104, |
|
"reward": 0.732421875, |
|
"reward_std": 0.09527772478759289, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.716796875, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 221.65625, |
|
"epoch": 0.7334754797441365, |
|
"grad_norm": 2.143716335296631, |
|
"kl": 1.3779296875, |
|
"learning_rate": 4.001781053120863e-06, |
|
"loss": -0.0052, |
|
"reward": 0.7958984375, |
|
"reward_std": 0.13394116796553135, |
|
"rewards/accuracy_reward": 0.09375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7021484375, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 175.17578125, |
|
"epoch": 0.7377398720682303, |
|
"grad_norm": 5.434141635894775, |
|
"kl": 2.75, |
|
"learning_rate": 3.882757008849936e-06, |
|
"loss": 0.0388, |
|
"reward": 0.685546875, |
|
"reward_std": 0.16674507781863213, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.646484375, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 162.55859375, |
|
"epoch": 0.7420042643923241, |
|
"grad_norm": 27.080265045166016, |
|
"kl": 3.57421875, |
|
"learning_rate": 3.7651019814126656e-06, |
|
"loss": 0.0552, |
|
"reward": 0.6416015625, |
|
"reward_std": 0.133183553814888, |
|
"rewards/accuracy_reward": 0.0078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6337890625, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 161.41796875, |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 4.6160430908203125, |
|
"kl": 3.09375, |
|
"learning_rate": 3.6488423015782128e-06, |
|
"loss": 0.074, |
|
"reward": 0.6455078125, |
|
"reward_std": 0.15037459693849087, |
|
"rewards/accuracy_reward": 0.0234375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6220703125, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 153.59375, |
|
"epoch": 0.7505330490405118, |
|
"grad_norm": 3.9284942150115967, |
|
"kl": 2.150390625, |
|
"learning_rate": 3.534003987842005e-06, |
|
"loss": 0.0613, |
|
"reward": 0.69921875, |
|
"reward_std": 0.16477027162909508, |
|
"rewards/accuracy_reward": 0.03515625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6640625, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 194.90625, |
|
"epoch": 0.7547974413646056, |
|
"grad_norm": 1.458369493484497, |
|
"kl": 0.7900390625, |
|
"learning_rate": 3.4206127406028744e-06, |
|
"loss": 0.0115, |
|
"reward": 0.78125, |
|
"reward_std": 0.16565649397671223, |
|
"rewards/accuracy_reward": 0.08203125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.69921875, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 185.91796875, |
|
"epoch": 0.7590618336886994, |
|
"grad_norm": 0.9649374485015869, |
|
"kl": 0.466796875, |
|
"learning_rate": 3.308693936411421e-06, |
|
"loss": -0.0284, |
|
"reward": 0.75, |
|
"reward_std": 0.09331535268574953, |
|
"rewards/accuracy_reward": 0.01953125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.73046875, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 179.1171875, |
|
"epoch": 0.7633262260127932, |
|
"grad_norm": 1.3005759716033936, |
|
"kl": 0.43115234375, |
|
"learning_rate": 3.1982726222908046e-06, |
|
"loss": -0.0093, |
|
"reward": 0.87890625, |
|
"reward_std": 0.09297346090897918, |
|
"rewards/accuracy_reward": 0.1484375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.73046875, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 198.36328125, |
|
"epoch": 0.767590618336887, |
|
"grad_norm": 0.8584280014038086, |
|
"kl": 0.2998046875, |
|
"learning_rate": 3.089373510131354e-06, |
|
"loss": -0.0111, |
|
"reward": 0.7822265625, |
|
"reward_std": 0.10853294795379043, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7353515625, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 187.99609375, |
|
"epoch": 0.7718550106609808, |
|
"grad_norm": 0.5993466377258301, |
|
"kl": 0.52099609375, |
|
"learning_rate": 2.9820209711600858e-06, |
|
"loss": -0.0401, |
|
"reward": 0.7890625, |
|
"reward_std": 0.1114540034905076, |
|
"rewards/accuracy_reward": 0.05859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.73046875, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 219.74609375, |
|
"epoch": 0.7761194029850746, |
|
"grad_norm": 3.0398595333099365, |
|
"kl": 0.75439453125, |
|
"learning_rate": 2.876239030486554e-06, |
|
"loss": 0.0153, |
|
"reward": 0.7724609375, |
|
"reward_std": 0.1012349147349596, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7333984375, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 192.01171875, |
|
"epoch": 0.7803837953091685, |
|
"grad_norm": 2.972774028778076, |
|
"kl": 1.375, |
|
"learning_rate": 2.7720513617260857e-06, |
|
"loss": 0.0664, |
|
"reward": 0.814453125, |
|
"reward_std": 0.11987380962818861, |
|
"rewards/accuracy_reward": 0.0859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.728515625, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 185.59765625, |
|
"epoch": 0.7846481876332623, |
|
"grad_norm": 6.028680801391602, |
|
"kl": 1.98046875, |
|
"learning_rate": 2.669481281701739e-06, |
|
"loss": 0.0526, |
|
"reward": 0.7646484375, |
|
"reward_std": 0.10079776309430599, |
|
"rewards/accuracy_reward": 0.03125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7333984375, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 206.2109375, |
|
"epoch": 0.7889125799573561, |
|
"grad_norm": 1.8360040187835693, |
|
"kl": 1.3046875, |
|
"learning_rate": 2.5685517452260566e-06, |
|
"loss": -0.0217, |
|
"reward": 0.7841796875, |
|
"reward_std": 0.08804275188595057, |
|
"rewards/accuracy_reward": 0.0546875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7294921875, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 211.5234375, |
|
"epoch": 0.7931769722814499, |
|
"grad_norm": 1.8284348249435425, |
|
"kl": 1.18359375, |
|
"learning_rate": 2.469285339963892e-06, |
|
"loss": 0.0017, |
|
"reward": 0.7744140625, |
|
"reward_std": 0.09929579310119152, |
|
"rewards/accuracy_reward": 0.04296875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7314453125, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 206.52734375, |
|
"epoch": 0.7974413646055437, |
|
"grad_norm": 2.95171856880188, |
|
"kl": 1.2236328125, |
|
"learning_rate": 2.371704281377335e-06, |
|
"loss": 0.0348, |
|
"reward": 0.73828125, |
|
"reward_std": 0.09545402321964502, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.72265625, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 202.625, |
|
"epoch": 0.8017057569296375, |
|
"grad_norm": 0.737244725227356, |
|
"kl": 0.52197265625, |
|
"learning_rate": 2.275830407754006e-06, |
|
"loss": 0.0328, |
|
"reward": 0.8466796875, |
|
"reward_std": 0.15702996030449867, |
|
"rewards/accuracy_reward": 0.11328125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7333984375, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 214.828125, |
|
"epoch": 0.8059701492537313, |
|
"grad_norm": 0.781270444393158, |
|
"kl": 0.302734375, |
|
"learning_rate": 2.1816851753197023e-06, |
|
"loss": 0.0188, |
|
"reward": 0.80078125, |
|
"reward_std": 0.13719853153452277, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.73046875, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 212.7890625, |
|
"epoch": 0.8102345415778252, |
|
"grad_norm": 1.513720989227295, |
|
"kl": 0.3876953125, |
|
"learning_rate": 2.08928965343659e-06, |
|
"loss": 0.0004, |
|
"reward": 0.861328125, |
|
"reward_std": 0.1351899290457368, |
|
"rewards/accuracy_reward": 0.125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.736328125, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.57421875, |
|
"epoch": 0.814498933901919, |
|
"grad_norm": 4.133224964141846, |
|
"kl": 1.0576171875, |
|
"learning_rate": 1.9986645198879385e-06, |
|
"loss": -0.0196, |
|
"reward": 0.7626953125, |
|
"reward_std": 0.14676811546087265, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7236328125, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 221.96875, |
|
"epoch": 0.8187633262260128, |
|
"grad_norm": 0.4540961682796478, |
|
"kl": 0.3203125, |
|
"learning_rate": 1.9098300562505266e-06, |
|
"loss": -0.0199, |
|
"reward": 0.8544921875, |
|
"reward_std": 0.1271651964634657, |
|
"rewards/accuracy_reward": 0.1171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7373046875, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 219.1171875, |
|
"epoch": 0.8230277185501066, |
|
"grad_norm": 0.32064002752304077, |
|
"kl": 0.45361328125, |
|
"learning_rate": 1.8228061433556866e-06, |
|
"loss": -0.0265, |
|
"reward": 0.779296875, |
|
"reward_std": 0.0899216216057539, |
|
"rewards/accuracy_reward": 0.04296875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.736328125, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 214.43359375, |
|
"epoch": 0.8272921108742004, |
|
"grad_norm": 1.135198950767517, |
|
"kl": 0.42529296875, |
|
"learning_rate": 1.7376122568400533e-06, |
|
"loss": -0.0286, |
|
"reward": 0.8046875, |
|
"reward_std": 0.16580088809132576, |
|
"rewards/accuracy_reward": 0.078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7265625, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 218.921875, |
|
"epoch": 0.8315565031982942, |
|
"grad_norm": 0.5622548460960388, |
|
"kl": 0.316650390625, |
|
"learning_rate": 1.6542674627869738e-06, |
|
"loss": 0.017, |
|
"reward": 0.80078125, |
|
"reward_std": 0.13944148644804955, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.73046875, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.47265625, |
|
"epoch": 0.835820895522388, |
|
"grad_norm": 0.7856387495994568, |
|
"kl": 0.4052734375, |
|
"learning_rate": 1.5727904134596084e-06, |
|
"loss": 0.0162, |
|
"reward": 0.8193359375, |
|
"reward_std": 0.16033071093261242, |
|
"rewards/accuracy_reward": 0.11328125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7060546875, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 243.625, |
|
"epoch": 0.8400852878464818, |
|
"grad_norm": 0.6806755065917969, |
|
"kl": 0.49072265625, |
|
"learning_rate": 1.4931993431266056e-06, |
|
"loss": 0.0095, |
|
"reward": 0.7890625, |
|
"reward_std": 0.25723421946167946, |
|
"rewards/accuracy_reward": 0.11328125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.67578125, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 273.87109375, |
|
"epoch": 0.8443496801705757, |
|
"grad_norm": 1.0873993635177612, |
|
"kl": 0.375, |
|
"learning_rate": 1.4155120639813392e-06, |
|
"loss": 0.1037, |
|
"reward": 0.7626953125, |
|
"reward_std": 0.21218526735901833, |
|
"rewards/accuracy_reward": 0.09375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6689453125, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 239.97265625, |
|
"epoch": 0.8486140724946695, |
|
"grad_norm": 1.0088647603988647, |
|
"kl": 0.37353515625, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 0.0416, |
|
"reward": 0.7822265625, |
|
"reward_std": 0.24759295210242271, |
|
"rewards/accuracy_reward": 0.1015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6806640625, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 268.73046875, |
|
"epoch": 0.8528784648187633, |
|
"grad_norm": 0.6826640367507935, |
|
"kl": 0.43017578125, |
|
"learning_rate": 1.2659179938287035e-06, |
|
"loss": 0.0312, |
|
"reward": 0.7685546875, |
|
"reward_std": 0.18006664514541626, |
|
"rewards/accuracy_reward": 0.0703125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6982421875, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 221.609375, |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 1.0566011667251587, |
|
"kl": 0.45751953125, |
|
"learning_rate": 1.19404468143262e-06, |
|
"loss": -0.0104, |
|
"reward": 0.796875, |
|
"reward_std": 0.15738755092024803, |
|
"rewards/accuracy_reward": 0.078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.71875, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 269.80859375, |
|
"epoch": 0.8614072494669509, |
|
"grad_norm": 0.7283450365066528, |
|
"kl": 0.5009765625, |
|
"learning_rate": 1.124142109954459e-06, |
|
"loss": -0.0242, |
|
"reward": 0.7705078125, |
|
"reward_std": 0.13039706647396088, |
|
"rewards/accuracy_reward": 0.0390625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7314453125, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.265625, |
|
"epoch": 0.8656716417910447, |
|
"grad_norm": 1.5315821170806885, |
|
"kl": 1.380859375, |
|
"learning_rate": 1.0562259233366334e-06, |
|
"loss": -0.0731, |
|
"reward": 0.78515625, |
|
"reward_std": 0.2296939566731453, |
|
"rewards/accuracy_reward": 0.08984375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6953125, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 218.21875, |
|
"epoch": 0.8699360341151386, |
|
"grad_norm": 1.1094874143600464, |
|
"kl": 0.73046875, |
|
"learning_rate": 9.903113209758098e-07, |
|
"loss": 0.0012, |
|
"reward": 0.8720703125, |
|
"reward_std": 0.2057046014815569, |
|
"rewards/accuracy_reward": 0.1484375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7236328125, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 281.12109375, |
|
"epoch": 0.8742004264392325, |
|
"grad_norm": 1.407812237739563, |
|
"kl": 1.35546875, |
|
"learning_rate": 9.264130543213512e-07, |
|
"loss": -0.0625, |
|
"reward": 0.8251953125, |
|
"reward_std": 0.20766575261950493, |
|
"rewards/accuracy_reward": 0.1171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7080078125, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 252.62109375, |
|
"epoch": 0.8784648187633263, |
|
"grad_norm": 3.055626153945923, |
|
"kl": 2.255859375, |
|
"learning_rate": 8.645454235739903e-07, |
|
"loss": -0.0862, |
|
"reward": 0.80859375, |
|
"reward_std": 0.2070464938879013, |
|
"rewards/accuracy_reward": 0.109375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.69921875, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.58203125, |
|
"epoch": 0.8827292110874201, |
|
"grad_norm": 1.8872811794281006, |
|
"kl": 1.49609375, |
|
"learning_rate": 8.047222744854943e-07, |
|
"loss": 0.0217, |
|
"reward": 0.8857421875, |
|
"reward_std": 0.24977924302220345, |
|
"rewards/accuracy_reward": 0.171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7138671875, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 258.48046875, |
|
"epoch": 0.8869936034115139, |
|
"grad_norm": 4.186584949493408, |
|
"kl": 2.2333984375, |
|
"learning_rate": 7.46956995260033e-07, |
|
"loss": -0.0711, |
|
"reward": 0.8271484375, |
|
"reward_std": 0.18339894711971283, |
|
"rewards/accuracy_reward": 0.12890625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6982421875, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 236.16796875, |
|
"epoch": 0.8912579957356077, |
|
"grad_norm": 2.354311466217041, |
|
"kl": 1.609375, |
|
"learning_rate": 6.912625135579587e-07, |
|
"loss": -0.0062, |
|
"reward": 0.791015625, |
|
"reward_std": 0.17353365197777748, |
|
"rewards/accuracy_reward": 0.078125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.712890625, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 227.03125, |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 2.211200714111328, |
|
"kl": 1.818359375, |
|
"learning_rate": 6.37651293602628e-07, |
|
"loss": -0.019, |
|
"reward": 0.7958984375, |
|
"reward_std": 0.19231459498405457, |
|
"rewards/accuracy_reward": 0.0859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7099609375, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 259.16015625, |
|
"epoch": 0.8997867803837953, |
|
"grad_norm": 3.354318141937256, |
|
"kl": 1.48828125, |
|
"learning_rate": 5.861353333909692e-07, |
|
"loss": -0.0305, |
|
"reward": 0.8115234375, |
|
"reward_std": 0.17966507747769356, |
|
"rewards/accuracy_reward": 0.09765625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7138671875, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 262.46484375, |
|
"epoch": 0.9040511727078892, |
|
"grad_norm": 3.2571589946746826, |
|
"kl": 2.1796875, |
|
"learning_rate": 5.367261620083575e-07, |
|
"loss": -0.0519, |
|
"reward": 0.83984375, |
|
"reward_std": 0.2149498090147972, |
|
"rewards/accuracy_reward": 0.12109375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.71875, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 237.96875, |
|
"epoch": 0.908315565031983, |
|
"grad_norm": 1.6243290901184082, |
|
"kl": 1.0390625, |
|
"learning_rate": 4.894348370484648e-07, |
|
"loss": 0.0014, |
|
"reward": 0.7900390625, |
|
"reward_std": 0.14244702830910683, |
|
"rewards/accuracy_reward": 0.05859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7314453125, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.0703125, |
|
"epoch": 0.9125799573560768, |
|
"grad_norm": 4.307506084442139, |
|
"kl": 1.15576171875, |
|
"learning_rate": 4.4427194213859216e-07, |
|
"loss": 0.0194, |
|
"reward": 0.833984375, |
|
"reward_std": 0.19881774485111237, |
|
"rewards/accuracy_reward": 0.109375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.724609375, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.42578125, |
|
"epoch": 0.9168443496801706, |
|
"grad_norm": 0.588789701461792, |
|
"kl": 0.70654296875, |
|
"learning_rate": 4.012475845711106e-07, |
|
"loss": -0.0109, |
|
"reward": 0.8740234375, |
|
"reward_std": 0.2335027940571308, |
|
"rewards/accuracy_reward": 0.1484375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7255859375, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 245.46875, |
|
"epoch": 0.9211087420042644, |
|
"grad_norm": 2.772460460662842, |
|
"kl": 1.4560546875, |
|
"learning_rate": 3.603713930414676e-07, |
|
"loss": -0.0346, |
|
"reward": 0.7451171875, |
|
"reward_std": 0.1310195019468665, |
|
"rewards/accuracy_reward": 0.02734375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7177734375, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 276.265625, |
|
"epoch": 0.9253731343283582, |
|
"grad_norm": 2.068373918533325, |
|
"kl": 1.5322265625, |
|
"learning_rate": 3.2165251549333585e-07, |
|
"loss": -0.0333, |
|
"reward": 0.765625, |
|
"reward_std": 0.15018462389707565, |
|
"rewards/accuracy_reward": 0.046875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.71875, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 274.9609375, |
|
"epoch": 0.929637526652452, |
|
"grad_norm": 1.8721359968185425, |
|
"kl": 0.88037109375, |
|
"learning_rate": 2.8509961707132496e-07, |
|
"loss": -0.0266, |
|
"reward": 0.8369140625, |
|
"reward_std": 0.20388219691812992, |
|
"rewards/accuracy_reward": 0.1171875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7197265625, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 268.0234375, |
|
"epoch": 0.9339019189765458, |
|
"grad_norm": 7.492040157318115, |
|
"kl": 1.865234375, |
|
"learning_rate": 2.507208781817638e-07, |
|
"loss": -0.0152, |
|
"reward": 0.859375, |
|
"reward_std": 0.2039647325873375, |
|
"rewards/accuracy_reward": 0.1328125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7265625, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 247.60546875, |
|
"epoch": 0.9381663113006397, |
|
"grad_norm": 1.1414939165115356, |
|
"kl": 0.890625, |
|
"learning_rate": 2.1852399266194312e-07, |
|
"loss": -0.0075, |
|
"reward": 0.8203125, |
|
"reward_std": 0.19437766447663307, |
|
"rewards/accuracy_reward": 0.09375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7265625, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 252.56640625, |
|
"epoch": 0.9424307036247335, |
|
"grad_norm": 1.324097990989685, |
|
"kl": 0.7802734375, |
|
"learning_rate": 1.885161660582746e-07, |
|
"loss": -0.0435, |
|
"reward": 0.7861328125, |
|
"reward_std": 0.1638173609972, |
|
"rewards/accuracy_reward": 0.05859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7275390625, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 234.42578125, |
|
"epoch": 0.9466950959488273, |
|
"grad_norm": 1.6171019077301025, |
|
"kl": 1.47265625, |
|
"learning_rate": 1.6070411401370335e-07, |
|
"loss": -0.0326, |
|
"reward": 0.771484375, |
|
"reward_std": 0.17419602535665035, |
|
"rewards/accuracy_reward": 0.0625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.708984375, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 260.5859375, |
|
"epoch": 0.9509594882729211, |
|
"grad_norm": 4.242193698883057, |
|
"kl": 0.716552734375, |
|
"learning_rate": 1.350940607647866e-07, |
|
"loss": 0.0139, |
|
"reward": 0.822265625, |
|
"reward_std": 0.16951362788677216, |
|
"rewards/accuracy_reward": 0.08984375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.732421875, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 259.80078125, |
|
"epoch": 0.9552238805970149, |
|
"grad_norm": 5.544849395751953, |
|
"kl": 1.04345703125, |
|
"learning_rate": 1.1169173774871478e-07, |
|
"loss": 0.0055, |
|
"reward": 0.8037109375, |
|
"reward_std": 0.17775351367890835, |
|
"rewards/accuracy_reward": 0.08203125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7216796875, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 272.6328125, |
|
"epoch": 0.9594882729211087, |
|
"grad_norm": 0.6265246868133545, |
|
"kl": 0.630859375, |
|
"learning_rate": 9.0502382320653e-08, |
|
"loss": -0.0349, |
|
"reward": 0.9287109375, |
|
"reward_std": 0.25279103592038155, |
|
"rewards/accuracy_reward": 0.19921875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7294921875, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 240.05078125, |
|
"epoch": 0.9637526652452025, |
|
"grad_norm": 1.7309554815292358, |
|
"kl": 1.01171875, |
|
"learning_rate": 7.153073658162646e-08, |
|
"loss": -0.0208, |
|
"reward": 0.7783203125, |
|
"reward_std": 0.1793037187308073, |
|
"rewards/accuracy_reward": 0.0625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7158203125, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 239.66796875, |
|
"epoch": 0.9680170575692963, |
|
"grad_norm": 1.2242202758789062, |
|
"kl": 0.91845703125, |
|
"learning_rate": 5.4781046317267103e-08, |
|
"loss": -0.0061, |
|
"reward": 0.8173828125, |
|
"reward_std": 0.20180584490299225, |
|
"rewards/accuracy_reward": 0.09375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7236328125, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 275.390625, |
|
"epoch": 0.9722814498933902, |
|
"grad_norm": 0.6100110411643982, |
|
"kl": 0.55224609375, |
|
"learning_rate": 4.025706004760932e-08, |
|
"loss": -0.0347, |
|
"reward": 0.8251953125, |
|
"reward_std": 0.15769800543785095, |
|
"rewards/accuracy_reward": 0.08984375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7353515625, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.50390625, |
|
"epoch": 0.976545842217484, |
|
"grad_norm": 1.7776597738265991, |
|
"kl": 0.861328125, |
|
"learning_rate": 2.796202818819871e-08, |
|
"loss": -0.0023, |
|
"reward": 0.85546875, |
|
"reward_std": 0.22671574354171753, |
|
"rewards/accuracy_reward": 0.12890625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7265625, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 242.7734375, |
|
"epoch": 0.9808102345415778, |
|
"grad_norm": 1.0293753147125244, |
|
"kl": 0.8193359375, |
|
"learning_rate": 1.7898702322648453e-08, |
|
"loss": -0.0337, |
|
"reward": 0.828125, |
|
"reward_std": 0.16718050092458725, |
|
"rewards/accuracy_reward": 0.10546875, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.72265625, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 265.5546875, |
|
"epoch": 0.9850746268656716, |
|
"grad_norm": 1.5529704093933105, |
|
"kl": 1.0205078125, |
|
"learning_rate": 1.0069334586854106e-08, |
|
"loss": -0.0289, |
|
"reward": 0.83203125, |
|
"reward_std": 0.16101082926616073, |
|
"rewards/accuracy_reward": 0.11328125, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.71875, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 254.5, |
|
"epoch": 0.9893390191897654, |
|
"grad_norm": 2.5354487895965576, |
|
"kl": 1.244140625, |
|
"learning_rate": 4.475677164966774e-09, |
|
"loss": -0.0043, |
|
"reward": 0.810546875, |
|
"reward_std": 0.18945523723959923, |
|
"rewards/accuracy_reward": 0.0859375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.724609375, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.34765625, |
|
"epoch": 0.9936034115138592, |
|
"grad_norm": 1.5713036060333252, |
|
"kl": 1.1103515625, |
|
"learning_rate": 1.1189818972656697e-09, |
|
"loss": -0.0032, |
|
"reward": 0.8720703125, |
|
"reward_std": 0.26588882878422737, |
|
"rewards/accuracy_reward": 0.15625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7158203125, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 269.0000114440918, |
|
"epoch": 0.997867803837953, |
|
"grad_norm": 1.1278139352798462, |
|
"kl": 1.296875, |
|
"learning_rate": 0.0, |
|
"loss": -0.0335, |
|
"reward": 0.8466796875, |
|
"reward_std": 0.23463162407279015, |
|
"rewards/accuracy_reward": 0.12890625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.7177734375, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.997867803837953, |
|
"step": 234, |
|
"total_flos": 0.0, |
|
"train_loss": 0.3048181866761297, |
|
"train_runtime": 9320.4205, |
|
"train_samples_per_second": 0.805, |
|
"train_steps_per_second": 0.025 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 234, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|