{ "best_metric": null, "best_model_checkpoint": null, "episode": 24000, "epoch": 0.14379695869432363, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 16, "epoch": 9.586463912954908e-05, "loss/policy_avg": 0.015691569074988365, "lr": 1e-05, "objective/entropy": 136.889404296875, "objective/kl": 13.172518730163574, "objective/non_score_reward": -0.6586259603500366, "objective/rlhf_reward": -1.2559016580260813, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 330.0568542480469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.75, "step": 0, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999426007270813 }, { "episode": 32, "epoch": 0.00019172927825909816, "loss/policy_avg": 0.021727558225393295, "lr": 9.999360940695298e-06, "objective/entropy": -4.705432891845703, "objective/kl": 4.4086012840271, "objective/non_score_reward": -0.22043009102344513, "objective/rlhf_reward": 0.49688179692854306, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 25.247615814208984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4375, "step": 1, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005669593811035 }, { "episode": 48, "epoch": 0.00028759391738864725, "loss/policy_avg": 0.05422616004943848, "lr": 9.998721881390595e-06, "objective/entropy": 26.511795043945312, "objective/kl": 10.364278793334961, "objective/non_score_reward": -0.5182140469551086, "objective/rlhf_reward": -0.6222579917923059, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 174.7788543701172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6328125, "step": 2, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001580238342285 }, { "episode": 64, "epoch": 0.0003834585565181963, "loss/policy_avg": 0.1031150370836258, "lr": 9.99808282208589e-06, "objective/entropy": -6.2874298095703125, "objective/kl": 7.10389518737793, "objective/non_score_reward": -0.35519474744796753, "objective/rlhf_reward": 0.24108044284523888, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 107.51742553710938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.806640625, "step": 3, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999995231628418 }, { "episode": 80, "epoch": 0.0004793231956477454, "loss/policy_avg": 0.020609447732567787, "lr": 9.997443762781187e-06, "objective/entropy": 63.54547882080078, "objective/kl": 1.458254337310791, "objective/non_score_reward": -0.07291271537542343, "objective/rlhf_reward": 1.224120924828116, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 14.240117073059082, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4150390625, "step": 4, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000563383102417 }, { "episode": 96, "epoch": 0.0005751878347772945, "loss/policy_avg": 0.1277482807636261, "lr": 9.996804703476484e-06, "objective/entropy": 55.068546295166016, "objective/kl": 8.753851890563965, "objective/non_score_reward": -0.43769264221191406, "objective/rlhf_reward": -0.37216834077010735, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 100.08578491210938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.447265625, "step": 5, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999474048614502 }, { "episode": 112, "epoch": 0.0006710524739068436, "loss/policy_avg": 0.3148539662361145, "lr": 9.99616564417178e-06, "objective/entropy": 21.463600158691406, "objective/kl": 9.847577095031738, "objective/non_score_reward": -0.4923788607120514, "objective/rlhf_reward": -0.02210425861352272, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 82.89840698242188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 6, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998772382736206 }, { "episode": 128, "epoch": 0.0007669171130363926, "loss/policy_avg": -9.760260581970215e-06, "lr": 9.995526584867077e-06, "objective/entropy": 43.514984130859375, "objective/kl": 6.468422889709473, "objective/non_score_reward": -0.3234211802482605, "objective/rlhf_reward": 0.18726797867262368, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 53.660911560058594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.595703125, "step": 7, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0024185180664062 }, { "episode": 144, "epoch": 0.0008627817521659417, "loss/policy_avg": 0.07420124113559723, "lr": 9.994887525562374e-06, "objective/entropy": 111.558837890625, "objective/kl": 5.765064716339111, "objective/non_score_reward": -0.2882532477378845, "objective/rlhf_reward": 0.7943982454372089, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 38.34186935424805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4462890625, "step": 8, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975563287734985 }, { "episode": 160, "epoch": 0.0009586463912954908, "loss/policy_avg": 0.22252294421195984, "lr": 9.99424846625767e-06, "objective/entropy": 99.2086181640625, "objective/kl": 8.770297050476074, "objective/non_score_reward": -0.4385148584842682, "objective/rlhf_reward": -0.35405938923358926, "objective/scores": 0.35, "policy/approxkl_avg": 98.07421112060547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.75, "step": 9, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9961905479431152 }, { "episode": 176, "epoch": 0.0010545110304250398, "loss/policy_avg": 0.05278925597667694, "lr": 9.993609406952966e-06, "objective/entropy": 192.25936889648438, "objective/kl": 5.483057975769043, "objective/non_score_reward": -0.27415287494659424, "objective/rlhf_reward": 1.3033885151147842, "objective/scores": 0.6, "policy/approxkl_avg": 54.852699279785156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.73046875, "step": 10, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001378059387207 }, { "episode": 192, "epoch": 0.001150375669554589, "loss/policy_avg": 0.01604432426393032, "lr": 9.992970347648263e-06, "objective/entropy": 91.4354476928711, "objective/kl": 1.6482281684875488, "objective/non_score_reward": -0.08241140842437744, "objective/rlhf_reward": 1.1513069728358984, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.662862777709961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 11, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994404315948486 }, { "episode": 208, "epoch": 0.001246240308684138, "loss/policy_avg": 0.17367278039455414, "lr": 9.992331288343558e-06, "objective/entropy": 148.37680053710938, "objective/kl": 9.977045059204102, "objective/non_score_reward": -0.4988522529602051, "objective/rlhf_reward": -0.4796372515880427, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 132.6361083984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4619140625, "step": 12, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9963808059692383 }, { "episode": 224, "epoch": 0.0013421049478136871, "loss/policy_avg": -0.12138635665178299, "lr": 9.991692229038855e-06, "objective/entropy": -70.20156860351562, "objective/kl": 3.8376624584198, "objective/non_score_reward": -0.1918831169605255, "objective/rlhf_reward": 0.6324675619602202, "objective/scores": 0.35, "policy/approxkl_avg": 15.127391815185547, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.505859375, "step": 13, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.017153739929199 }, { "episode": 240, "epoch": 0.001437969586943236, "loss/policy_avg": 0.1106414794921875, "lr": 9.991053169734152e-06, "objective/entropy": 129.54013061523438, "objective/kl": 12.085613250732422, "objective/non_score_reward": -0.6042807102203369, "objective/rlhf_reward": -0.6837895224491755, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 178.22561645507812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 14, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999481201171875 }, { "episode": 256, "epoch": 0.0015338342260727853, "loss/policy_avg": 0.01672934927046299, "lr": 9.990414110429449e-06, "objective/entropy": 177.98126220703125, "objective/kl": 7.125063896179199, "objective/non_score_reward": -0.3562532067298889, "objective/rlhf_reward": -0.025012841820716947, "objective/scores": 0.35, "policy/approxkl_avg": 91.47238159179688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.716796875, "step": 15, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000582218170166 }, { "episode": 272, "epoch": 0.0016296988652023342, "loss/policy_avg": 0.14258402585983276, "lr": 9.989775051124744e-06, "objective/entropy": 197.2217559814453, "objective/kl": 12.70147705078125, "objective/non_score_reward": -0.6350738406181335, "objective/rlhf_reward": -1.1616931343949852, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 84.26277160644531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.630859375, "step": 16, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9964921474456787 }, { "episode": 288, "epoch": 0.0017255635043318834, "loss/policy_avg": -0.0007228106260299683, "lr": 9.989135991820041e-06, "objective/entropy": -9.756143569946289, "objective/kl": 7.940765380859375, "objective/non_score_reward": -0.3970382809638977, "objective/rlhf_reward": -0.07238138595455501, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 42.61369323730469, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.64453125, "step": 17, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0011234283447266 }, { "episode": 304, "epoch": 0.0018214281434614326, "loss/policy_avg": 0.13892704248428345, "lr": 9.988496932515338e-06, "objective/entropy": 14.549068450927734, "objective/kl": 9.783748626708984, "objective/non_score_reward": -0.48918741941452026, "objective/rlhf_reward": -0.5781475538886606, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 73.81009674072266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.607421875, "step": 18, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998319149017334 }, { "episode": 320, "epoch": 0.0019172927825909815, "loss/policy_avg": 0.12347989529371262, "lr": 9.987857873210635e-06, "objective/entropy": 197.0328369140625, "objective/kl": 9.07555103302002, "objective/non_score_reward": -0.453777551651001, "objective/rlhf_reward": -0.15325071436225013, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 74.28388214111328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5625, "step": 19, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001952648162842 }, { "episode": 336, "epoch": 0.0020131574217205307, "loss/policy_avg": 0.06666804850101471, "lr": 9.987218813905932e-06, "objective/entropy": 180.56707763671875, "objective/kl": 10.346174240112305, "objective/non_score_reward": -0.5173087120056152, "objective/rlhf_reward": -0.6454025848704257, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 88.01742553710938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.595703125, "step": 20, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9958171844482422 }, { "episode": 352, "epoch": 0.0021090220608500796, "loss/policy_avg": 0.12632718682289124, "lr": 9.986579754601228e-06, "objective/entropy": 165.49900817871094, "objective/kl": 10.707776069641113, "objective/non_score_reward": -0.5353888273239136, "objective/rlhf_reward": -0.7629530663169442, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 118.42108917236328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.78125, "step": 21, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9964122772216797 }, { "episode": 368, "epoch": 0.0022048866999796286, "loss/policy_avg": 0.012576747685670853, "lr": 9.985940695296524e-06, "objective/entropy": -133.83059692382812, "objective/kl": 6.06254768371582, "objective/non_score_reward": -0.3031274080276489, "objective/rlhf_reward": 0.21132251183215, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.497255325317383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.552734375, "step": 22, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017709732055664 }, { "episode": 384, "epoch": 0.002300751339109178, "loss/policy_avg": 0.21566970646381378, "lr": 9.98530163599182e-06, "objective/entropy": 80.05180358886719, "objective/kl": 18.019107818603516, "objective/non_score_reward": -0.9009554386138916, "objective/rlhf_reward": -2.1799896850186267, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 244.3957061767578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.72265625, "step": 23, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975435733795166 }, { "episode": 400, "epoch": 0.002396615978238727, "loss/policy_avg": 0.21825431287288666, "lr": 9.984662576687117e-06, "objective/entropy": 22.858154296875, "objective/kl": 7.889187812805176, "objective/non_score_reward": -0.39445942640304565, "objective/rlhf_reward": 0.5448686011871957, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 45.33286666870117, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.54296875, "step": 24, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998900890350342 }, { "episode": 416, "epoch": 0.002492480617368276, "loss/policy_avg": 0.2645857036113739, "lr": 9.984023517382414e-06, "objective/entropy": 37.619895935058594, "objective/kl": 11.23090934753418, "objective/non_score_reward": -0.5615454316139221, "objective/rlhf_reward": 0.15381827354431143, "objective/scores": 0.6, "policy/approxkl_avg": 88.95787811279297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.552734375, "step": 25, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996498703956604 }, { "episode": 432, "epoch": 0.002588345256497825, "loss/policy_avg": 0.04753335565328598, "lr": 9.983384458077711e-06, "objective/entropy": 156.34921264648438, "objective/kl": 7.371222496032715, "objective/non_score_reward": -0.36856111884117126, "objective/rlhf_reward": -0.14873159292332616, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 35.437461853027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 26, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979305267333984 }, { "episode": 448, "epoch": 0.0026842098956273742, "loss/policy_avg": -0.010932949371635914, "lr": 9.982745398773006e-06, "objective/entropy": 16.393407821655273, "objective/kl": 16.967132568359375, "objective/non_score_reward": -0.8483567237854004, "objective/rlhf_reward": -2.051791122465759, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 207.71142578125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.564453125, "step": 27, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9935011863708496 }, { "episode": 464, "epoch": 0.002780074534756923, "loss/policy_avg": 0.23893436789512634, "lr": 9.982106339468303e-06, "objective/entropy": 170.59136962890625, "objective/kl": 15.129783630371094, "objective/non_score_reward": -0.7564891576766968, "objective/rlhf_reward": -1.469697265830591, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 135.97763061523438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.72265625, "step": 28, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975056648254395 }, { "episode": 480, "epoch": 0.002875939173886472, "loss/policy_avg": 0.03272615000605583, "lr": 9.9814672801636e-06, "objective/entropy": 6.700323104858398, "objective/kl": 10.701581954956055, "objective/non_score_reward": -0.5350791215896606, "objective/rlhf_reward": -0.6897181971982564, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 63.513145446777344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 29, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998331069946289 }, { "episode": 496, "epoch": 0.0029718038130160216, "loss/policy_avg": 0.07188314199447632, "lr": 9.980828220858897e-06, "objective/entropy": -47.331199645996094, "objective/kl": 12.874979019165039, "objective/non_score_reward": -0.6437489986419678, "objective/rlhf_reward": -1.1963937664903224, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 77.876220703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 30, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967491626739502 }, { "episode": 512, "epoch": 0.0030676684521455705, "loss/policy_avg": 0.04047826677560806, "lr": 9.980189161554194e-06, "objective/entropy": 282.3853759765625, "objective/kl": 9.654375076293945, "objective/non_score_reward": -0.4827187657356262, "objective/rlhf_reward": -0.5716251668676566, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 64.11791229248047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.89453125, "step": 31, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997191429138184 }, { "episode": 528, "epoch": 0.0031635330912751195, "loss/policy_avg": 0.07097287476062775, "lr": 9.97955010224949e-06, "objective/entropy": 116.042236328125, "objective/kl": 14.595599174499512, "objective/non_score_reward": -0.7297799587249756, "objective/rlhf_reward": -0.7964137478926516, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 272.6925048828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3857421875, "step": 32, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0027289390563965 }, { "episode": 544, "epoch": 0.0032593977304046684, "loss/policy_avg": 0.5246497392654419, "lr": 9.978911042944786e-06, "objective/entropy": 8.318304061889648, "objective/kl": 16.622827529907227, "objective/non_score_reward": -0.831141471862793, "objective/rlhf_reward": -1.9990529752074906, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 159.0550079345703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.703125, "step": 33, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971305131912231 }, { "episode": 560, "epoch": 0.003355262369534218, "loss/policy_avg": 0.20073390007019043, "lr": 9.978271983640083e-06, "objective/entropy": 92.97464752197266, "objective/kl": 10.66767692565918, "objective/non_score_reward": -0.5333837866783142, "objective/rlhf_reward": 2.2664648383855823, "objective/scores": 1.1, "policy/approxkl_avg": 89.14144134521484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.544921875, "step": 34, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000143051147461 }, { "episode": 576, "epoch": 0.0034511270086637668, "loss/policy_avg": 0.04765152558684349, "lr": 9.977632924335378e-06, "objective/entropy": 149.43089294433594, "objective/kl": 16.67333221435547, "objective/non_score_reward": -0.8336665630340576, "objective/rlhf_reward": -0.9346663713455201, "objective/scores": 0.6, "policy/approxkl_avg": 189.3590850830078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4765625, "step": 35, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9986698627471924 }, { "episode": 592, "epoch": 0.0035469916477933157, "loss/policy_avg": 0.40008074045181274, "lr": 9.976993865030675e-06, "objective/entropy": 157.10501098632812, "objective/kl": 13.927867889404297, "objective/non_score_reward": -0.6963933706283569, "objective/rlhf_reward": -1.406971328941685, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 121.78231811523438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 36, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974275827407837 }, { "episode": 608, "epoch": 0.003642856286922865, "loss/policy_avg": 0.08663024008274078, "lr": 9.976354805725972e-06, "objective/entropy": 47.76446533203125, "objective/kl": 13.560833930969238, "objective/non_score_reward": -0.6780416965484619, "objective/rlhf_reward": -0.5894605539002753, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 43.71810531616211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5078125, "step": 37, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991490840911865 }, { "episode": 624, "epoch": 0.003738720926052414, "loss/policy_avg": 0.08268876373767853, "lr": 9.975715746421269e-06, "objective/entropy": 192.41729736328125, "objective/kl": 6.687016010284424, "objective/non_score_reward": -0.3343508243560791, "objective/rlhf_reward": 0.021846643354015427, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 67.82701873779297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 38, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999939203262329 }, { "episode": 640, "epoch": 0.003834585565181963, "loss/policy_avg": 0.05995899811387062, "lr": 9.975076687116566e-06, "objective/entropy": -98.350341796875, "objective/kl": 9.015666961669922, "objective/non_score_reward": -0.450783371925354, "objective/rlhf_reward": 0.14427768908268623, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 51.733055114746094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 39, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974713325500488 }, { "episode": 656, "epoch": 0.003930450204311512, "loss/policy_avg": 0.18854951858520508, "lr": 9.97443762781186e-06, "objective/entropy": 141.67947387695312, "objective/kl": 10.309185028076172, "objective/non_score_reward": -0.5154592990875244, "objective/rlhf_reward": -0.6618371069431306, "objective/scores": 0.35, "policy/approxkl_avg": 71.02857208251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.744140625, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993364810943604 }, { "episode": 672, "epoch": 0.004026314843441061, "loss/policy_avg": 0.05062849074602127, "lr": 9.973798568507158e-06, "objective/entropy": -38.6858024597168, "objective/kl": 9.445882797241211, "objective/non_score_reward": -0.4722941517829895, "objective/rlhf_reward": -1.8891766667366028, "objective/scores": 0.0, "policy/approxkl_avg": 5.4856438636779785, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62890625, "step": 41, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984209537506104 }, { "episode": 688, "epoch": 0.00412217948257061, "loss/policy_avg": 0.09501229226589203, "lr": 9.973159509202454e-06, "objective/entropy": 17.35771942138672, "objective/kl": 10.873266220092773, "objective/non_score_reward": -0.5436632633209229, "objective/rlhf_reward": -0.44131985406080876, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 98.38662719726562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 42, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995697736740112 }, { "episode": 704, "epoch": 0.004218044121700159, "loss/policy_avg": 0.32498252391815186, "lr": 9.972520449897751e-06, "objective/entropy": 174.98866271972656, "objective/kl": 11.279447555541992, "objective/non_score_reward": -0.5639723539352417, "objective/rlhf_reward": -0.7749369321421384, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 62.73210144042969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.552734375, "step": 43, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0007286071777344 }, { "episode": 720, "epoch": 0.004313908760829708, "loss/policy_avg": 0.3995896577835083, "lr": 9.971881390593048e-06, "objective/entropy": 36.609832763671875, "objective/kl": 19.769756317138672, "objective/non_score_reward": -0.9884878993034363, "objective/rlhf_reward": -2.1291227295723667, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 164.33892822265625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 44, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9945415258407593 }, { "episode": 736, "epoch": 0.004409773399959257, "loss/policy_avg": 0.17710548639297485, "lr": 9.971242331288345e-06, "objective/entropy": 93.23808288574219, "objective/kl": 16.88797378540039, "objective/non_score_reward": -0.8443987965583801, "objective/rlhf_reward": -1.7157356492882831, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 54.64923858642578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.779296875, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981857538223267 }, { "episode": 752, "epoch": 0.004505638039088807, "loss/policy_avg": 0.32767364382743835, "lr": 9.97060327198364e-06, "objective/entropy": 202.11843872070312, "objective/kl": 14.050471305847168, "objective/non_score_reward": -0.7025235295295715, "objective/rlhf_reward": -1.484581295281572, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 76.14016723632812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 46, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997007846832275 }, { "episode": 768, "epoch": 0.004601502678218356, "loss/policy_avg": 0.08174459636211395, "lr": 9.969964212678937e-06, "objective/entropy": 54.37752151489258, "objective/kl": 15.1139497756958, "objective/non_score_reward": -0.75569748878479, "objective/rlhf_reward": -1.6635400888666343, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 83.4612045288086, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4296875, "step": 47, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972434043884277 }, { "episode": 784, "epoch": 0.004697367317347905, "loss/policy_avg": 0.03365965187549591, "lr": 9.969325153374234e-06, "objective/entropy": 85.39935302734375, "objective/kl": 13.452342987060547, "objective/non_score_reward": -0.6726170778274536, "objective/rlhf_reward": -0.74305723138326, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 61.629390716552734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.572265625, "step": 48, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998470544815063 }, { "episode": 800, "epoch": 0.004793231956477454, "loss/policy_avg": 0.009335246868431568, "lr": 9.968686094069531e-06, "objective/entropy": 288.22564697265625, "objective/kl": 19.127742767333984, "objective/non_score_reward": -0.9563871026039124, "objective/rlhf_reward": -0.9018295153391089, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 176.43731689453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.892578125, "step": 49, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9936624765396118 }, { "episode": 816, "epoch": 0.004889096595607003, "loss/policy_avg": 0.13336139917373657, "lr": 9.968047034764828e-06, "objective/entropy": -38.686851501464844, "objective/kl": 18.06523895263672, "objective/non_score_reward": -0.9032620191574097, "objective/rlhf_reward": -2.1320952503041024, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 179.73486328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 50, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996016263961792 }, { "episode": 832, "epoch": 0.004984961234736552, "loss/policy_avg": 0.09758515655994415, "lr": 9.967407975460123e-06, "objective/entropy": -32.55284881591797, "objective/kl": 10.72513198852539, "objective/non_score_reward": -0.5362565517425537, "objective/rlhf_reward": -0.721194286544887, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 44.48727798461914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736328125, "step": 51, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976041316986084 }, { "episode": 848, "epoch": 0.005080825873866101, "loss/policy_avg": 0.5202991366386414, "lr": 9.96676891615542e-06, "objective/entropy": 45.2802734375, "objective/kl": 16.129152297973633, "objective/non_score_reward": -0.8064576387405396, "objective/rlhf_reward": -1.2784193260239918, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 124.33740234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 52, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978928565979004 }, { "episode": 864, "epoch": 0.00517669051299565, "loss/policy_avg": 0.28677505254745483, "lr": 9.966129856850717e-06, "objective/entropy": -76.81179809570312, "objective/kl": 15.223251342773438, "objective/non_score_reward": -0.761162519454956, "objective/rlhf_reward": -1.5288782207094989, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 69.77767944335938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7890625, "step": 53, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999229907989502 }, { "episode": 880, "epoch": 0.0052725551521251995, "loss/policy_avg": 0.20859162509441376, "lr": 9.965490797546014e-06, "objective/entropy": -21.344478607177734, "objective/kl": 10.70494556427002, "objective/non_score_reward": -0.535247266292572, "objective/rlhf_reward": -0.7623869264997064, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 98.75808715820312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.56640625, "step": 54, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975996017456055 }, { "episode": 896, "epoch": 0.0053684197912547485, "loss/policy_avg": 1.2579694986343384, "lr": 9.96485173824131e-06, "objective/entropy": 164.7299346923828, "objective/kl": 18.096805572509766, "objective/non_score_reward": -0.9048402309417725, "objective/rlhf_reward": -2.0152409709134873, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 95.78445434570312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736328125, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9966285228729248 }, { "episode": 912, "epoch": 0.0054642844303842975, "loss/policy_avg": 0.3564913868904114, "lr": 9.964212678936606e-06, "objective/entropy": 85.46858215332031, "objective/kl": 17.930484771728516, "objective/non_score_reward": -0.89652419090271, "objective/rlhf_reward": -1.4633905313172677, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 79.41477966308594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4091796875, "step": 56, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984946250915527 }, { "episode": 928, "epoch": 0.005560149069513846, "loss/policy_avg": 0.03960660099983215, "lr": 9.963573619631903e-06, "objective/entropy": 205.954833984375, "objective/kl": 17.15917205810547, "objective/non_score_reward": -0.8579585552215576, "objective/rlhf_reward": -1.3091281972089148, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.591196060180664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.791015625, "step": 57, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997645378112793 }, { "episode": 944, "epoch": 0.005656013708643395, "loss/policy_avg": -0.00983378104865551, "lr": 9.9629345603272e-06, "objective/entropy": -1.1022186279296875, "objective/kl": 16.26142692565918, "objective/non_score_reward": -0.8130713105201721, "objective/rlhf_reward": 1.1477148175239567, "objective/scores": 1.1, "policy/approxkl_avg": 81.65092468261719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.552734375, "step": 58, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99957275390625 }, { "episode": 960, "epoch": 0.005751878347772944, "loss/policy_avg": 0.32060182094573975, "lr": 9.962295501022495e-06, "objective/entropy": 48.09014892578125, "objective/kl": 7.438636302947998, "objective/non_score_reward": -0.3719318211078644, "objective/rlhf_reward": 0.6349789739391469, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.77626895904541, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.822265625, "step": 59, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.008031129837036 }, { "episode": 976, "epoch": 0.005847742986902493, "loss/policy_avg": 0.2516993582248688, "lr": 9.961656441717792e-06, "objective/entropy": -46.64883804321289, "objective/kl": 19.601835250854492, "objective/non_score_reward": -0.9800918102264404, "objective/rlhf_reward": -2.594854134946985, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 181.5974578857422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.609375, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988071918487549 }, { "episode": 992, "epoch": 0.005943607626032043, "loss/policy_avg": 0.1109720841050148, "lr": 9.961017382413088e-06, "objective/entropy": 97.6422348022461, "objective/kl": 13.844486236572266, "objective/non_score_reward": -0.692224383354187, "objective/rlhf_reward": -1.2126380791335847, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 96.34603118896484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 61, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974918365478516 }, { "episode": 1008, "epoch": 0.006039472265161592, "loss/policy_avg": -0.05115126073360443, "lr": 9.960378323108385e-06, "objective/entropy": 34.42061996459961, "objective/kl": 14.079090118408203, "objective/non_score_reward": -0.7039545774459839, "objective/rlhf_reward": -1.4565682944997977, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 49.87873840332031, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.677734375, "step": 62, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982357025146484 }, { "episode": 1024, "epoch": 0.006135336904291141, "loss/policy_avg": 0.22280101478099823, "lr": 9.959739263803682e-06, "objective/entropy": -24.89067840576172, "objective/kl": 19.501176834106445, "objective/non_score_reward": -0.9750589728355408, "objective/rlhf_reward": -2.4496376319841, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 243.47512817382812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.888671875, "step": 63, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999916911125183 }, { "episode": 1040, "epoch": 0.00623120154342069, "loss/policy_avg": 0.36840492486953735, "lr": 9.959100204498979e-06, "objective/entropy": 134.6929931640625, "objective/kl": 22.332670211791992, "objective/non_score_reward": -1.1166335344314575, "objective/rlhf_reward": -2.641705389293741, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 136.65045166015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 64, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981276988983154 }, { "episode": 1056, "epoch": 0.006327066182550239, "loss/policy_avg": 0.09098342061042786, "lr": 9.958461145194274e-06, "objective/entropy": -26.864063262939453, "objective/kl": 13.052759170532227, "objective/non_score_reward": -0.6526379585266113, "objective/rlhf_reward": -0.7857228770580997, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 62.885929107666016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.603515625, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997183084487915 }, { "episode": 1072, "epoch": 0.006422930821679788, "loss/policy_avg": 0.27086368203163147, "lr": 9.957822085889571e-06, "objective/entropy": -58.01667404174805, "objective/kl": 16.48623275756836, "objective/non_score_reward": -0.8243115544319153, "objective/rlhf_reward": -1.635386770189391, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 153.92050170898438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 66, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005505084991455 }, { "episode": 1088, "epoch": 0.006518795460809337, "loss/policy_avg": 1.2388324737548828, "lr": 9.957183026584868e-06, "objective/entropy": 99.91399383544922, "objective/kl": 21.524110794067383, "objective/non_score_reward": -1.0762056112289429, "objective/rlhf_reward": -2.6429626993542774, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 170.69760131835938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 67, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9946773052215576 }, { "episode": 1104, "epoch": 0.006614660099938887, "loss/policy_avg": 0.330521821975708, "lr": 9.956543967280165e-06, "objective/entropy": -76.99481201171875, "objective/kl": 15.58948802947998, "objective/non_score_reward": -0.7794743776321411, "objective/rlhf_reward": -1.7178976856172086, "objective/scores": 0.35, "policy/approxkl_avg": 218.45574951171875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 68, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997715950012207 }, { "episode": 1120, "epoch": 0.006710524739068436, "loss/policy_avg": 0.11920663714408875, "lr": 9.955904907975462e-06, "objective/entropy": 70.55160522460938, "objective/kl": 20.134777069091797, "objective/non_score_reward": -1.0067389011383057, "objective/rlhf_reward": -2.6853197722727353, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 62.195674896240234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.34765625, "step": 69, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001072406768799 }, { "episode": 1136, "epoch": 0.006806389378197985, "loss/policy_avg": -0.17695794999599457, "lr": 9.955265848670757e-06, "objective/entropy": 101.99272918701172, "objective/kl": 12.69788932800293, "objective/non_score_reward": -0.6348943710327148, "objective/rlhf_reward": -2.539577692747116, "objective/scores": 0.0, "policy/approxkl_avg": 64.835693359375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.44140625, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0115315914154053 }, { "episode": 1152, "epoch": 0.0069022540173275335, "loss/policy_avg": 0.35137245059013367, "lr": 9.954626789366054e-06, "objective/entropy": 79.80499267578125, "objective/kl": 21.120101928710938, "objective/non_score_reward": -1.0560050010681152, "objective/rlhf_reward": -2.1013141296067577, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 124.16864776611328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.68359375, "step": 71, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998917818069458 }, { "episode": 1168, "epoch": 0.0069981186564570825, "loss/policy_avg": 0.07422849535942078, "lr": 9.95398773006135e-06, "objective/entropy": 9.376724243164062, "objective/kl": 15.093628883361816, "objective/non_score_reward": -0.7546814680099487, "objective/rlhf_reward": -1.6594760653719138, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 47.567962646484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65234375, "step": 72, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9958832263946533 }, { "episode": 1184, "epoch": 0.0070939832955866314, "loss/policy_avg": 0.11969298124313354, "lr": 9.953348670756648e-06, "objective/entropy": 133.57423400878906, "objective/kl": 20.2343807220459, "objective/non_score_reward": -1.0117191076278687, "objective/rlhf_reward": -1.1231571778070655, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 93.79672241210938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.423828125, "step": 73, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0005669593811035 }, { "episode": 1200, "epoch": 0.00718984793471618, "loss/policy_avg": 0.2395152747631073, "lr": 9.952709611451944e-06, "objective/entropy": 31.68697166442871, "objective/kl": 20.96116828918457, "objective/non_score_reward": -1.0480585098266602, "objective/rlhf_reward": -2.711281481202006, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 194.83474731445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.669921875, "step": 74, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9953577518463135 }, { "episode": 1216, "epoch": 0.00728571257384573, "loss/policy_avg": 0.27856501936912537, "lr": 9.952070552147241e-06, "objective/entropy": 119.42091369628906, "objective/kl": 11.30095100402832, "objective/non_score_reward": -0.5650476217269897, "objective/rlhf_reward": -0.9185547738367612, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 59.14590835571289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.75, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9953703880310059 }, { "episode": 1232, "epoch": 0.007381577212975279, "loss/policy_avg": 0.21030786633491516, "lr": 9.951431492842536e-06, "objective/entropy": 7.310768127441406, "objective/kl": 6.645857810974121, "objective/non_score_reward": -0.3322928845882416, "objective/rlhf_reward": 0.04943063011993787, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 14.611559867858887, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.591796875, "step": 76, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996391534805298 }, { "episode": 1248, "epoch": 0.007477441852104828, "loss/policy_avg": 0.4117072820663452, "lr": 9.950792433537833e-06, "objective/entropy": -109.53082275390625, "objective/kl": 11.825650215148926, "objective/non_score_reward": -0.5912825465202332, "objective/rlhf_reward": 0.03486987352371207, "objective/scores": 0.6, "policy/approxkl_avg": 19.0810604095459, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6171875, "step": 77, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981213808059692 }, { "episode": 1264, "epoch": 0.007573306491234377, "loss/policy_avg": 0.2597622275352478, "lr": 9.950153374233129e-06, "objective/entropy": -29.7529296875, "objective/kl": 18.43012809753418, "objective/non_score_reward": -0.9215063452720642, "objective/rlhf_reward": -2.2860254704952236, "objective/scores": 0.35, "policy/approxkl_avg": 267.2847900390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 78, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997544765472412 }, { "episode": 1280, "epoch": 0.007669171130363926, "loss/policy_avg": 0.2407466471195221, "lr": 9.949514314928425e-06, "objective/entropy": 14.07373046875, "objective/kl": 20.781753540039062, "objective/non_score_reward": -1.0390876531600952, "objective/rlhf_reward": -1.2326316579591956, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 147.4822235107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.724609375, "step": 79, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987099170684814 }, { "episode": 1296, "epoch": 0.007765035769493475, "loss/policy_avg": 0.17344285547733307, "lr": 9.948875255623722e-06, "objective/entropy": 112.44259643554688, "objective/kl": 10.0985746383667, "objective/non_score_reward": -0.504928708076477, "objective/rlhf_reward": 0.38028510808944693, "objective/scores": 0.6, "policy/approxkl_avg": 4.8866167068481445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.443359375, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0143842697143555 }, { "episode": 1312, "epoch": 0.007860900408623025, "loss/policy_avg": 0.14816004037857056, "lr": 9.94823619631902e-06, "objective/entropy": 67.11033630371094, "objective/kl": 17.487518310546875, "objective/non_score_reward": -0.8743758797645569, "objective/rlhf_reward": -2.1558679251963193, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 18.69343376159668, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4619140625, "step": 81, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998113751411438 }, { "episode": 1328, "epoch": 0.007956765047752574, "loss/policy_avg": 0.2536642849445343, "lr": 9.947597137014316e-06, "objective/entropy": -71.85224914550781, "objective/kl": 11.223343849182129, "objective/non_score_reward": -0.5611672401428223, "objective/rlhf_reward": -0.7637163875654935, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 37.78028869628906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48828125, "step": 82, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0003702640533447 }, { "episode": 1344, "epoch": 0.008052629686882123, "loss/policy_avg": 0.3479039669036865, "lr": 9.946958077709611e-06, "objective/entropy": 146.41241455078125, "objective/kl": 20.458145141601562, "objective/non_score_reward": -1.0229072570800781, "objective/rlhf_reward": -2.732379042838497, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 64.28889465332031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.705078125, "step": 83, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976742267608643 }, { "episode": 1360, "epoch": 0.008148494326011672, "loss/policy_avg": 0.10525624454021454, "lr": 9.946319018404908e-06, "objective/entropy": -43.42662048339844, "objective/kl": 13.858359336853027, "objective/non_score_reward": -0.6929180026054382, "objective/rlhf_reward": -0.6489658228316642, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 61.37925720214844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48828125, "step": 84, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0012595653533936 }, { "episode": 1376, "epoch": 0.00824435896514122, "loss/policy_avg": 0.3409525156021118, "lr": 9.945679959100205e-06, "objective/entropy": 1.5508041381835938, "objective/kl": 19.05010223388672, "objective/non_score_reward": -0.9525051116943359, "objective/rlhf_reward": -2.205900583330708, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 97.6533203125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000422477722168 }, { "episode": 1392, "epoch": 0.00834022360427077, "loss/policy_avg": 0.3110717535018921, "lr": 9.945040899795502e-06, "objective/entropy": 215.75965881347656, "objective/kl": 18.800819396972656, "objective/non_score_reward": -0.9400409460067749, "objective/rlhf_reward": -2.156043860975819, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 84.93620300292969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.669921875, "step": 86, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9962902069091797 }, { "episode": 1408, "epoch": 0.008436088243400319, "loss/policy_avg": 0.02868543565273285, "lr": 9.944401840490799e-06, "objective/entropy": 154.10025024414062, "objective/kl": 13.492873191833496, "objective/non_score_reward": -0.6746436357498169, "objective/rlhf_reward": -0.9652413214246431, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 42.483882904052734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.44921875, "step": 87, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9983662366867065 }, { "episode": 1424, "epoch": 0.008531952882529868, "loss/policy_avg": 0.07607420533895493, "lr": 9.943762781186096e-06, "objective/entropy": 202.40365600585938, "objective/kl": 13.719297409057617, "objective/non_score_reward": -0.685964822769165, "objective/rlhf_reward": 1.6561407089233402, "objective/scores": 1.1, "policy/approxkl_avg": 20.57819175720215, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.728515625, "step": 88, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999366283416748 }, { "episode": 1440, "epoch": 0.008627817521659416, "loss/policy_avg": 0.16665664315223694, "lr": 9.94312372188139e-06, "objective/entropy": -100.20193481445312, "objective/kl": 15.216776847839355, "objective/non_score_reward": -0.7608388662338257, "objective/rlhf_reward": -1.4392355120817002, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 85.36731719970703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 89, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990196228027344 }, { "episode": 1456, "epoch": 0.008723682160788965, "loss/policy_avg": 0.19817781448364258, "lr": 9.942484662576688e-06, "objective/entropy": -0.7409725189208984, "objective/kl": 10.389724731445312, "objective/non_score_reward": -0.5194862484931946, "objective/rlhf_reward": 2.3220549762248996, "objective/scores": 1.1, "policy/approxkl_avg": 12.642692565917969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.52734375, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989277124404907 }, { "episode": 1472, "epoch": 0.008819546799918514, "loss/policy_avg": 0.2365586757659912, "lr": 9.941845603271985e-06, "objective/entropy": 152.64306640625, "objective/kl": 21.58309555053711, "objective/non_score_reward": -1.0791547298431396, "objective/rlhf_reward": -2.9573691723093223, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 87.72661590576172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.771484375, "step": 91, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999784231185913 }, { "episode": 1488, "epoch": 0.008915411439048063, "loss/policy_avg": 0.059907689690589905, "lr": 9.941206543967281e-06, "objective/entropy": 89.6580810546875, "objective/kl": 16.996726989746094, "objective/non_score_reward": -0.8498364686965942, "objective/rlhf_reward": -1.9755135669308581, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 72.40145874023438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.861328125, "step": 92, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003039836883545 }, { "episode": 1504, "epoch": 0.009011276078177614, "loss/policy_avg": 0.14265713095664978, "lr": 9.940567484662578e-06, "objective/entropy": -33.708492279052734, "objective/kl": 15.94516372680664, "objective/non_score_reward": -0.797258198261261, "objective/rlhf_reward": -0.2653137638580527, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 78.95989990234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53125, "step": 93, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997810959815979 }, { "episode": 1520, "epoch": 0.009107140717307163, "loss/policy_avg": -0.018713245168328285, "lr": 9.939928425357874e-06, "objective/entropy": -3.091245651245117, "objective/kl": 14.482427597045898, "objective/non_score_reward": -0.7241213917732239, "objective/rlhf_reward": -1.2346261046534641, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 56.76847839355469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.501953125, "step": 94, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993091821670532 }, { "episode": 1536, "epoch": 0.009203005356436712, "loss/policy_avg": -0.0069353943690657616, "lr": 9.93928936605317e-06, "objective/entropy": 95.46006774902344, "objective/kl": 20.928672790527344, "objective/non_score_reward": -1.046433687210083, "objective/rlhf_reward": -2.360906060012888, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 103.58160400390625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974098205566406 }, { "episode": 1552, "epoch": 0.009298869995566261, "loss/policy_avg": 0.0523187518119812, "lr": 9.938650306748467e-06, "objective/entropy": 16.342994689941406, "objective/kl": 20.205509185791016, "objective/non_score_reward": -1.0102753639221191, "objective/rlhf_reward": 0.35889836549758947, "objective/scores": 1.1, "policy/approxkl_avg": 84.55277252197266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4697265625, "step": 96, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000453233718872 }, { "episode": 1568, "epoch": 0.00939473463469581, "loss/policy_avg": 0.18428044021129608, "lr": 9.938011247443764e-06, "objective/entropy": -31.386062622070312, "objective/kl": 19.641075134277344, "objective/non_score_reward": -0.9820537567138672, "objective/rlhf_reward": -1.8055088541665412, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 92.56884002685547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59765625, "step": 97, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001230239868164 }, { "episode": 1584, "epoch": 0.009490599273825359, "loss/policy_avg": -0.11768925935029984, "lr": 9.937372188139061e-06, "objective/entropy": -29.0854434967041, "objective/kl": 16.647226333618164, "objective/non_score_reward": -0.8323614001274109, "objective/rlhf_reward": -1.9701957342371177, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.0866272449493408, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.541015625, "step": 98, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0018460750579834 }, { "episode": 1600, "epoch": 0.009586463912954908, "loss/policy_avg": 0.06727765500545502, "lr": 9.936733128834358e-06, "objective/entropy": 96.53413391113281, "objective/kl": 21.015684127807617, "objective/non_score_reward": -1.0507843494415283, "objective/rlhf_reward": -2.8031371593475343, "objective/scores": 0.35, "policy/approxkl_avg": 36.56340026855469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 99, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9949061870574951 }, { "episode": 1616, "epoch": 0.009682328552084457, "loss/policy_avg": 0.28386813402175903, "lr": 9.936094069529653e-06, "objective/entropy": 33.901954650878906, "objective/kl": 19.533782958984375, "objective/non_score_reward": -0.9766892194747925, "objective/rlhf_reward": -2.425804230387568, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 162.0339813232422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985501766204834 }, { "episode": 1632, "epoch": 0.009778193191214006, "loss/policy_avg": 0.11220409721136093, "lr": 9.93545501022495e-06, "objective/entropy": -3.93096923828125, "objective/kl": 22.981700897216797, "objective/non_score_reward": -1.1490850448608398, "objective/rlhf_reward": -3.1725080504017744, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 46.0514030456543, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6328125, "step": 101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0070252418518066 }, { "episode": 1648, "epoch": 0.009874057830343555, "loss/policy_avg": 0.20420242846012115, "lr": 9.934815950920245e-06, "objective/entropy": 198.98751831054688, "objective/kl": 17.92270278930664, "objective/non_score_reward": -0.8961352109909058, "objective/rlhf_reward": -1.759712155136179, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 55.74137878417969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65234375, "step": 102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980387687683105 }, { "episode": 1664, "epoch": 0.009969922469473104, "loss/policy_avg": 0.27041423320770264, "lr": 9.934176891615542e-06, "objective/entropy": 1.5637626647949219, "objective/kl": 12.633028030395508, "objective/non_score_reward": -0.6316514015197754, "objective/rlhf_reward": -0.7017769768563022, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 13.92137622833252, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4208984375, "step": 103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987752437591553 }, { "episode": 1680, "epoch": 0.010065787108602653, "loss/policy_avg": 0.318324476480484, "lr": 9.933537832310839e-06, "objective/entropy": 218.76858520507812, "objective/kl": 21.40100860595703, "objective/non_score_reward": -1.0700504779815674, "objective/rlhf_reward": -2.9385662584597165, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 90.99249267578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998801827430725 }, { "episode": 1696, "epoch": 0.010161651747732202, "loss/policy_avg": 0.3075984716415405, "lr": 9.932898773006136e-06, "objective/entropy": -56.81090545654297, "objective/kl": 10.457717895507812, "objective/non_score_reward": -0.5228859186172485, "objective/rlhf_reward": -0.7129414687431871, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 48.63943862915039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.995776653289795 }, { "episode": 1712, "epoch": 0.01025751638686175, "loss/policy_avg": 0.5551585555076599, "lr": 9.932259713701433e-06, "objective/entropy": -48.12900924682617, "objective/kl": 21.915470123291016, "objective/non_score_reward": -1.0957735776901245, "objective/rlhf_reward": -1.459375207067701, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 33.369083404541016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.71484375, "step": 106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995157241821289 }, { "episode": 1728, "epoch": 0.0103533810259913, "loss/policy_avg": 0.252463161945343, "lr": 9.931620654396728e-06, "objective/entropy": -69.64755249023438, "objective/kl": 15.248108863830566, "objective/non_score_reward": -0.7624054551124573, "objective/rlhf_reward": -1.707986166983276, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 59.05755615234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578125, "step": 107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9963257312774658 }, { "episode": 1744, "epoch": 0.01044924566512085, "loss/policy_avg": 0.13919854164123535, "lr": 9.930981595092025e-06, "objective/entropy": -133.55258178710938, "objective/kl": 17.2213134765625, "objective/non_score_reward": -0.8610656261444092, "objective/rlhf_reward": -2.0850126979097556, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.41887664794922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5234375, "step": 108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992578029632568 }, { "episode": 1760, "epoch": 0.010545110304250399, "loss/policy_avg": 0.5300755500793457, "lr": 9.930342535787322e-06, "objective/entropy": -9.471179962158203, "objective/kl": 18.607471466064453, "objective/non_score_reward": -0.9303736090660095, "objective/rlhf_reward": -2.3214945554733273, "objective/scores": 0.35, "policy/approxkl_avg": 31.75185203552246, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.654296875, "step": 109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994070529937744 }, { "episode": 1776, "epoch": 0.010640974943379948, "loss/policy_avg": 0.17107412219047546, "lr": 9.929703476482619e-06, "objective/entropy": 72.44110107421875, "objective/kl": 16.862125396728516, "objective/non_score_reward": -0.8431062698364258, "objective/rlhf_reward": -3.372425138950348, "objective/scores": 0.0, "policy/approxkl_avg": 66.22834777832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995293378829956 }, { "episode": 1792, "epoch": 0.010736839582509497, "loss/policy_avg": -0.11443672329187393, "lr": 9.929064417177915e-06, "objective/entropy": 80.82670593261719, "objective/kl": 18.79993438720703, "objective/non_score_reward": -0.9399967789649963, "objective/rlhf_reward": -2.336154927213756, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 31.270248413085938, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5625, "step": 111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.007622241973877 }, { "episode": 1808, "epoch": 0.010832704221639046, "loss/policy_avg": 0.0878123939037323, "lr": 9.928425357873212e-06, "objective/entropy": -118.92440795898438, "objective/kl": 17.83495330810547, "objective/non_score_reward": -0.8917477130889893, "objective/rlhf_reward": -2.2253551392847593, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 20.88257598876953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996976613998413 }, { "episode": 1824, "epoch": 0.010928568860768595, "loss/policy_avg": 0.18364591896533966, "lr": 9.927786298568507e-06, "objective/entropy": 8.144821166992188, "objective/kl": 14.821235656738281, "objective/non_score_reward": -0.741061806678772, "objective/rlhf_reward": -1.2309138337771097, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.778968811035156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000706911087036 }, { "episode": 1840, "epoch": 0.011024433499898144, "loss/policy_avg": 0.06979192793369293, "lr": 9.927147239263804e-06, "objective/entropy": -2.9724502563476562, "objective/kl": 17.076000213623047, "objective/non_score_reward": -0.8538000583648682, "objective/rlhf_reward": -1.8994284508549533, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 46.98078918457031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.798828125, "step": 114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999396800994873 }, { "episode": 1856, "epoch": 0.011120298139027693, "loss/policy_avg": 0.27465301752090454, "lr": 9.926508179959101e-06, "objective/entropy": 40.056610107421875, "objective/kl": 22.515907287597656, "objective/non_score_reward": -1.1257953643798828, "objective/rlhf_reward": -2.8413221291905506, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 81.93817138671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0007479190826416 }, { "episode": 1872, "epoch": 0.011216162778157242, "loss/policy_avg": 0.3945024013519287, "lr": 9.925869120654398e-06, "objective/entropy": 69.15873718261719, "objective/kl": 21.74050521850586, "objective/non_score_reward": -1.0870254039764404, "objective/rlhf_reward": -3.0225888824760148, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 38.46895980834961, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59765625, "step": 116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014419555664062 }, { "episode": 1888, "epoch": 0.01131202741728679, "loss/policy_avg": 0.5689772367477417, "lr": 9.925230061349695e-06, "objective/entropy": 144.26678466796875, "objective/kl": 14.530990600585938, "objective/non_score_reward": -0.726549506187439, "objective/rlhf_reward": -1.1728648702303568, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.715579628944397, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8203125, "step": 117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0058815479278564 }, { "episode": 1904, "epoch": 0.01140789205641634, "loss/policy_avg": -0.025625256821513176, "lr": 9.92459100204499e-06, "objective/entropy": -91.6683120727539, "objective/kl": 16.61312484741211, "objective/non_score_reward": -0.8306561708450317, "objective/rlhf_reward": -1.944022663918835, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 18.064186096191406, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4990234375, "step": 118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999868392944336 }, { "episode": 1920, "epoch": 0.011503756695545889, "loss/policy_avg": 0.4135175943374634, "lr": 9.923951942740287e-06, "objective/entropy": 145.33905029296875, "objective/kl": 18.559207916259766, "objective/non_score_reward": -0.9279603958129883, "objective/rlhf_reward": -1.5891353509583808, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 19.033662796020508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981472492218018 }, { "episode": 1936, "epoch": 0.011599621334675438, "loss/policy_avg": 0.3322446942329407, "lr": 9.923312883435584e-06, "objective/entropy": 109.6761474609375, "objective/kl": 18.231651306152344, "objective/non_score_reward": -0.9115825891494751, "objective/rlhf_reward": -1.2463304907083512, "objective/scores": 0.6, "policy/approxkl_avg": 108.51126098632812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.65625, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996593952178955 }, { "episode": 1952, "epoch": 0.011695485973804987, "loss/policy_avg": 0.22522342205047607, "lr": 9.92267382413088e-06, "objective/entropy": 95.46246337890625, "objective/kl": 16.838998794555664, "objective/non_score_reward": -0.841949999332428, "objective/rlhf_reward": -1.8520282743298375, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 14.038084983825684, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8046875, "step": 121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997774362564087 }, { "episode": 1968, "epoch": 0.011791350612934537, "loss/policy_avg": 0.18379229307174683, "lr": 9.922034764826178e-06, "objective/entropy": 138.12388610839844, "objective/kl": 25.93743324279785, "objective/non_score_reward": -1.2968716621398926, "objective/rlhf_reward": -3.828236812089367, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 26.206398010253906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0011606216430664 }, { "episode": 1984, "epoch": 0.011887215252064086, "loss/policy_avg": 0.31653979420661926, "lr": 9.921395705521473e-06, "objective/entropy": -44.61676788330078, "objective/kl": 21.166324615478516, "objective/non_score_reward": -1.0583162307739258, "objective/rlhf_reward": -2.9077520704566666, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 29.74887466430664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.521484375, "step": 123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996273517608643 }, { "episode": 2000, "epoch": 0.011983079891193635, "loss/policy_avg": 0.1589316874742508, "lr": 9.92075664621677e-06, "objective/entropy": -77.4912109375, "objective/kl": 20.79126739501953, "objective/non_score_reward": -1.0395634174346924, "objective/rlhf_reward": -2.4249199191729227, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 133.58343505859375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66015625, "step": 124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9961724281311035 }, { "episode": 2016, "epoch": 0.012078944530323184, "loss/policy_avg": 0.2586688995361328, "lr": 9.920117586912067e-06, "objective/entropy": 139.38818359375, "objective/kl": 21.455245971679688, "objective/non_score_reward": -1.072762370109558, "objective/rlhf_reward": -2.775277876647648, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 47.609947204589844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8125, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975237846374512 }, { "episode": 2032, "epoch": 0.012174809169452733, "loss/policy_avg": 0.16066747903823853, "lr": 9.919478527607362e-06, "objective/entropy": 72.43231201171875, "objective/kl": 20.59688377380371, "objective/non_score_reward": -1.0298442840576172, "objective/rlhf_reward": 0.28062304258346593, "objective/scores": 1.1, "policy/approxkl_avg": 75.74966430664062, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.529296875, "step": 126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998389720916748 }, { "episode": 2048, "epoch": 0.012270673808582282, "loss/policy_avg": 0.07932023704051971, "lr": 9.918839468302659e-06, "objective/entropy": -12.7745361328125, "objective/kl": 20.53061294555664, "objective/non_score_reward": -1.0265307426452637, "objective/rlhf_reward": -2.7275206232942164, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.110069274902344, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.55859375, "step": 127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984248876571655 }, { "episode": 2064, "epoch": 0.012366538447711831, "loss/policy_avg": 0.27331969141960144, "lr": 9.918200408997956e-06, "objective/entropy": 101.82013702392578, "objective/kl": 18.18286895751953, "objective/non_score_reward": -0.9091434478759766, "objective/rlhf_reward": -2.2579716230310023, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 6.703115463256836, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.556640625, "step": 128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009617805480957 }, { "episode": 2080, "epoch": 0.01246240308684138, "loss/policy_avg": 0.4916057586669922, "lr": 9.917561349693252e-06, "objective/entropy": 88.1321029663086, "objective/kl": 23.30657958984375, "objective/non_score_reward": -1.165329098701477, "objective/rlhf_reward": -3.3020663795217704, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 142.93795776367188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967398643493652 }, { "episode": 2096, "epoch": 0.012558267725970929, "loss/policy_avg": 0.16071423888206482, "lr": 9.91692229038855e-06, "objective/entropy": 136.1899871826172, "objective/kl": 15.380975723266602, "objective/non_score_reward": -0.769048810005188, "objective/rlhf_reward": -0.6761951804161073, "objective/scores": 0.6, "policy/approxkl_avg": 28.551767349243164, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.56640625, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.03311824798584 }, { "episode": 2112, "epoch": 0.012654132365100478, "loss/policy_avg": 0.0021135974675416946, "lr": 9.916283231083844e-06, "objective/entropy": -71.15084838867188, "objective/kl": 18.961715698242188, "objective/non_score_reward": -0.9480857849121094, "objective/rlhf_reward": -2.1304838709241016, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.844127893447876, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4833984375, "step": 131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009913444519043 }, { "episode": 2128, "epoch": 0.012749997004230027, "loss/policy_avg": 0.042635850608348846, "lr": 9.915644171779141e-06, "objective/entropy": 20.673603057861328, "objective/kl": 15.986173629760742, "objective/non_score_reward": -0.7993086576461792, "objective/rlhf_reward": -1.8555989473158414, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 36.049034118652344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.67578125, "step": 132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998380422592163 }, { "episode": 2144, "epoch": 0.012845861643359576, "loss/policy_avg": 0.46513473987579346, "lr": 9.915005112474438e-06, "objective/entropy": 5.5274505615234375, "objective/kl": 19.590290069580078, "objective/non_score_reward": -0.979514479637146, "objective/rlhf_reward": -2.5394558692849696, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 12.074180603027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.642578125, "step": 133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011277198791504 }, { "episode": 2160, "epoch": 0.012941726282489125, "loss/policy_avg": 0.245748370885849, "lr": 9.914366053169735e-06, "objective/entropy": 65.60797119140625, "objective/kl": 19.637710571289062, "objective/non_score_reward": -0.9818854928016663, "objective/rlhf_reward": -1.980130786971982, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 50.17578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.791015625, "step": 134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983665943145752 }, { "episode": 2176, "epoch": 0.013037590921618674, "loss/policy_avg": 0.02180427499115467, "lr": 9.913726993865032e-06, "objective/entropy": 0.8936500549316406, "objective/kl": 24.33076286315918, "objective/non_score_reward": -1.2165381908416748, "objective/rlhf_reward": -3.524517109900146, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 69.30375671386719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5009765625, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99745774269104 }, { "episode": 2192, "epoch": 0.013133455560748224, "loss/policy_avg": 0.36717042326927185, "lr": 9.913087934560329e-06, "objective/entropy": 83.415283203125, "objective/kl": 21.930896759033203, "objective/non_score_reward": -1.0965447425842285, "objective/rlhf_reward": -1.4624603136789527, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 79.15277862548828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.546875, "step": 136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998199701309204 }, { "episode": 2208, "epoch": 0.013229320199877773, "loss/policy_avg": 0.2460360825061798, "lr": 9.912448875255624e-06, "objective/entropy": 137.11976623535156, "objective/kl": 21.218502044677734, "objective/non_score_reward": -1.060925006866455, "objective/rlhf_reward": -2.8198681666451373, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 67.851806640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.666015625, "step": 137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969704151153564 }, { "episode": 2224, "epoch": 0.013325184839007322, "loss/policy_avg": 0.21244561672210693, "lr": 9.911809815950921e-06, "objective/entropy": 175.0180206298828, "objective/kl": 16.889467239379883, "objective/non_score_reward": -0.8444733619689941, "objective/rlhf_reward": -1.4304821593331654, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 78.4537353515625, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.515625, "step": 138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9985227584838867 }, { "episode": 2240, "epoch": 0.013421049478136871, "loss/policy_avg": 0.18417471647262573, "lr": 9.911170756646218e-06, "objective/entropy": 224.734619140625, "objective/kl": 33.112342834472656, "objective/non_score_reward": -1.6556169986724854, "objective/rlhf_reward": -4.889135018984477, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 160.8165283203125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7109375, "step": 139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992772340774536 }, { "episode": 2256, "epoch": 0.01351691411726642, "loss/policy_avg": 0.40639203786849976, "lr": 9.910531697341515e-06, "objective/entropy": 69.94343566894531, "objective/kl": 24.266616821289062, "objective/non_score_reward": -1.2133309841156006, "objective/rlhf_reward": -3.40272543868576, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 126.5036392211914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5625, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999225378036499 }, { "episode": 2272, "epoch": 0.01361277875639597, "loss/policy_avg": 0.28501349687576294, "lr": 9.909892638036812e-06, "objective/entropy": 61.523101806640625, "objective/kl": 17.776689529418945, "objective/non_score_reward": -0.8888344764709473, "objective/rlhf_reward": -1.8220045725504557, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 87.0567398071289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.537109375, "step": 141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000370740890503 }, { "episode": 2288, "epoch": 0.013708643395525518, "loss/policy_avg": 0.30668091773986816, "lr": 9.909253578732107e-06, "objective/entropy": 227.46041870117188, "objective/kl": 20.17832374572754, "objective/non_score_reward": -1.0089161396026611, "objective/rlhf_reward": -2.5198930142247047, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 50.498268127441406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.685546875, "step": 142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999216079711914 }, { "episode": 2304, "epoch": 0.013804508034655067, "loss/policy_avg": 0.3348355293273926, "lr": 9.908614519427404e-06, "objective/entropy": 164.50863647460938, "objective/kl": 13.646249771118164, "objective/non_score_reward": -0.6823124885559082, "objective/rlhf_reward": -1.1251298821607407, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 63.31299591064453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.95703125, "step": 143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986834526062012 }, { "episode": 2320, "epoch": 0.013900372673784616, "loss/policy_avg": 0.7517778277397156, "lr": 9.9079754601227e-06, "objective/entropy": -69.42684936523438, "objective/kl": 13.007519721984863, "objective/non_score_reward": -0.6503760814666748, "objective/rlhf_reward": -0.2015041172504426, "objective/scores": 0.6, "policy/approxkl_avg": 15.501136779785156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.533203125, "step": 144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9969980716705322 }, { "episode": 2336, "epoch": 0.013996237312914165, "loss/policy_avg": 0.1666509509086609, "lr": 9.907336400817996e-06, "objective/entropy": 175.3941192626953, "objective/kl": 20.383106231689453, "objective/non_score_reward": -1.0191553831100464, "objective/rlhf_reward": -2.414762055099593, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 102.40309143066406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65625, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960074424743652 }, { "episode": 2352, "epoch": 0.014092101952043714, "loss/policy_avg": 0.08111919462680817, "lr": 9.906697341513293e-06, "objective/entropy": 66.45804595947266, "objective/kl": 20.63641357421875, "objective/non_score_reward": -1.0318206548690796, "objective/rlhf_reward": -2.7680326637968253, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 16.144962310791016, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.44921875, "step": 146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003509521484375 }, { "episode": 2368, "epoch": 0.014187966591173263, "loss/policy_avg": 0.2162848860025406, "lr": 9.90605828220859e-06, "objective/entropy": 66.34003448486328, "objective/kl": 21.03724479675293, "objective/non_score_reward": -1.051862359046936, "objective/rlhf_reward": -1.8074494361877442, "objective/scores": 0.6, "policy/approxkl_avg": 56.59767150878906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.544921875, "step": 147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9967288970947266 }, { "episode": 2384, "epoch": 0.014283831230302812, "loss/policy_avg": 0.13452857732772827, "lr": 9.905419222903886e-06, "objective/entropy": 160.91929626464844, "objective/kl": 22.133365631103516, "objective/non_score_reward": -1.10666823387146, "objective/rlhf_reward": -2.693339631954829, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 64.49358367919922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988274574279785 }, { "episode": 2400, "epoch": 0.01437969586943236, "loss/policy_avg": 1.6826289892196655, "lr": 9.904780163599183e-06, "objective/entropy": -182.28018188476562, "objective/kl": 22.543842315673828, "objective/non_score_reward": -1.1271920204162598, "objective/rlhf_reward": -3.084936280449001, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 70.59880828857422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62890625, "step": 149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008223056793213 }, { "episode": 2416, "epoch": 0.01447556050856191, "loss/policy_avg": 0.4059183597564697, "lr": 9.904141104294478e-06, "objective/entropy": 225.73135375976562, "objective/kl": 23.115840911865234, "objective/non_score_reward": -1.1557921171188354, "objective/rlhf_reward": -2.8898351351420084, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 45.14168930053711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9997532367706299 }, { "episode": 2432, "epoch": 0.01457142514769146, "loss/policy_avg": 0.10681919753551483, "lr": 9.903502044989775e-06, "objective/entropy": 213.69598388671875, "objective/kl": 26.178190231323242, "objective/non_score_reward": -1.3089096546173096, "objective/rlhf_reward": -3.894002726584106, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 92.52935791015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975218772888184 }, { "episode": 2448, "epoch": 0.01466728978682101, "loss/policy_avg": -0.2853464186191559, "lr": 9.902862985685072e-06, "objective/entropy": 58.680572509765625, "objective/kl": 17.81705665588379, "objective/non_score_reward": -0.8908528089523315, "objective/rlhf_reward": -0.6396921619188514, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 89.08941650390625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.669921875, "step": 152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0143747329711914 }, { "episode": 2464, "epoch": 0.014763154425950558, "loss/policy_avg": 0.07825072109699249, "lr": 9.902223926380369e-06, "objective/entropy": 198.86288452148438, "objective/kl": 28.436542510986328, "objective/non_score_reward": -1.4218271970748901, "objective/rlhf_reward": -2.7635896548044414, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 44.41461181640625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9994783401489258 }, { "episode": 2480, "epoch": 0.014859019065080107, "loss/policy_avg": 0.27155977487564087, "lr": 9.901584867075666e-06, "objective/entropy": 89.04707336425781, "objective/kl": 21.113758087158203, "objective/non_score_reward": -1.0556879043579102, "objective/rlhf_reward": -1.2990326031458106, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 58.70441818237305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971623420715332 }, { "episode": 2496, "epoch": 0.014954883704209656, "loss/policy_avg": 0.3080964982509613, "lr": 9.900945807770961e-06, "objective/entropy": 35.38983154296875, "objective/kl": 21.02568817138672, "objective/non_score_reward": -1.0512844324111938, "objective/rlhf_reward": -2.7241851715401406, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 52.82551193237305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.560546875, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9960044622421265 }, { "episode": 2512, "epoch": 0.015050748343339205, "loss/policy_avg": 4.562356472015381, "lr": 9.900306748466258e-06, "objective/entropy": 253.11752319335938, "objective/kl": 22.01451301574707, "objective/non_score_reward": -1.1007256507873535, "objective/rlhf_reward": -2.798782501284199, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 74.26364135742188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.765625, "step": 156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9965643882751465 }, { "episode": 2528, "epoch": 0.015146612982468754, "loss/policy_avg": 0.21197248995304108, "lr": 9.899667689161555e-06, "objective/entropy": 149.58770751953125, "objective/kl": 23.317626953125, "objective/non_score_reward": -1.1658812761306763, "objective/rlhf_reward": -2.2635251045227047, "objective/scores": 0.6, "policy/approxkl_avg": 51.574981689453125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4736328125, "step": 157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.995574951171875 }, { "episode": 2544, "epoch": 0.015242477621598303, "loss/policy_avg": 0.20880039036273956, "lr": 9.899028629856852e-06, "objective/entropy": -64.38532257080078, "objective/kl": 25.92443084716797, "objective/non_score_reward": -1.2962216138839722, "objective/rlhf_reward": -3.784886217117309, "objective/scores": 0.35, "policy/approxkl_avg": 138.45706176757812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.568359375, "step": 158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968822002410889 }, { "episode": 2560, "epoch": 0.015338342260727852, "loss/policy_avg": 0.21600359678268433, "lr": 9.898389570552149e-06, "objective/entropy": 3.545970916748047, "objective/kl": 23.09051513671875, "objective/non_score_reward": -1.1545257568359375, "objective/rlhf_reward": -2.6706922007369354, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 36.885650634765625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55859375, "step": 159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993218183517456 }, { "episode": 2576, "epoch": 0.015434206899857401, "loss/policy_avg": 0.5031390190124512, "lr": 9.897750511247446e-06, "objective/entropy": 98.00604248046875, "objective/kl": 25.33047866821289, "objective/non_score_reward": -1.2665239572525024, "objective/rlhf_reward": -3.4619760847726635, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 83.63774871826172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.671875, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000274896621704 }, { "episode": 2592, "epoch": 0.01553007153898695, "loss/policy_avg": 0.018053412437438965, "lr": 9.89711145194274e-06, "objective/entropy": 2.8434524536132812, "objective/kl": 24.395084381103516, "objective/non_score_reward": -1.2197542190551758, "objective/rlhf_reward": -3.2171576074963673, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.6353378295898438, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.64453125, "step": 161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001478433609009 }, { "episode": 2608, "epoch": 0.0156259361781165, "loss/policy_avg": 0.25576311349868774, "lr": 9.896472392638038e-06, "objective/entropy": -64.24278259277344, "objective/kl": 16.287256240844727, "objective/non_score_reward": -0.8143627643585205, "objective/rlhf_reward": -1.5241178731123606, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 25.824050903320312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6953125, "step": 162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984066486358643 }, { "episode": 2624, "epoch": 0.01572180081724605, "loss/policy_avg": 0.2750253677368164, "lr": 9.895833333333334e-06, "objective/entropy": 170.5203857421875, "objective/kl": 35.09113693237305, "objective/non_score_reward": -1.7545567750930786, "objective/rlhf_reward": -4.094508086086485, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 91.88323974609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.76171875, "step": 163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9978190660476685 }, { "episode": 2640, "epoch": 0.0158176654563756, "loss/policy_avg": 0.2685161828994751, "lr": 9.895194274028631e-06, "objective/entropy": 107.911376953125, "objective/kl": 21.708637237548828, "objective/non_score_reward": -1.0854318141937256, "objective/rlhf_reward": -2.8911290570214834, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 48.546165466308594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.603515625, "step": 164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9939230680465698 }, { "episode": 2656, "epoch": 0.015913530095505148, "loss/policy_avg": 0.3802343010902405, "lr": 9.894555214723928e-06, "objective/entropy": 137.427978515625, "objective/kl": 20.673809051513672, "objective/non_score_reward": -1.0336904525756836, "objective/rlhf_reward": -2.793125978022247, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 36.90850830078125, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.63671875, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987661838531494 }, { "episode": 2672, "epoch": 0.016009394734634697, "loss/policy_avg": 0.0008638650178909302, "lr": 9.893916155419225e-06, "objective/entropy": 159.45681762695312, "objective/kl": 20.339492797851562, "objective/non_score_reward": -1.016974687576294, "objective/rlhf_reward": -2.7086488542303275, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 6.459288597106934, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.515625, "step": 166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9977226257324219 }, { "episode": 2688, "epoch": 0.016105259373764245, "loss/policy_avg": 0.3463206887245178, "lr": 9.89327709611452e-06, "objective/entropy": -75.2735824584961, "objective/kl": 27.865215301513672, "objective/non_score_reward": -1.3932607173919678, "objective/rlhf_reward": -4.173042631149292, "objective/scores": 0.35, "policy/approxkl_avg": 139.90060424804688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016684532165527 }, { "episode": 2704, "epoch": 0.016201124012893794, "loss/policy_avg": 0.07642253488302231, "lr": 9.892638036809815e-06, "objective/entropy": 38.99913787841797, "objective/kl": 19.061498641967773, "objective/non_score_reward": -0.9530749320983887, "objective/rlhf_reward": -1.987470920356821, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.035629272460938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.484375, "step": 168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0013604164123535 }, { "episode": 2720, "epoch": 0.016296988652023343, "loss/policy_avg": 0.2990867495536804, "lr": 9.891998977505112e-06, "objective/entropy": 199.7046661376953, "objective/kl": 23.46067237854004, "objective/non_score_reward": -1.1730337142944336, "objective/rlhf_reward": -3.268302519519893, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 19.572267532348633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6171875, "step": 169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998270034790039 }, { "episode": 2736, "epoch": 0.016392853291152892, "loss/policy_avg": 0.3040146231651306, "lr": 9.89135991820041e-06, "objective/entropy": 84.5781021118164, "objective/kl": 24.218996047973633, "objective/non_score_reward": -1.2109497785568237, "objective/rlhf_reward": -2.896387885289128, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 91.4429931640625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007870197296143 }, { "episode": 2752, "epoch": 0.01648871793028244, "loss/policy_avg": 0.24132516980171204, "lr": 9.890720858895706e-06, "objective/entropy": 25.26891326904297, "objective/kl": 12.311616897583008, "objective/non_score_reward": -0.6155807971954346, "objective/rlhf_reward": -2.4623232781887054, "objective/scores": 0.0, "policy/approxkl_avg": 4.089572906494141, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984550476074219 }, { "episode": 2768, "epoch": 0.01658458256941199, "loss/policy_avg": 0.07815683633089066, "lr": 9.890081799591003e-06, "objective/entropy": -2.7739601135253906, "objective/kl": 20.480499267578125, "objective/non_score_reward": -1.0240248441696167, "objective/rlhf_reward": -2.6151468185738325, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 11.766371726989746, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.52734375, "step": 172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999391794204712 }, { "episode": 2784, "epoch": 0.01668044720854154, "loss/policy_avg": 0.31003671884536743, "lr": 9.8894427402863e-06, "objective/entropy": -5.804538726806641, "objective/kl": 23.551572799682617, "objective/non_score_reward": -1.1775786876678467, "objective/rlhf_reward": -3.2597167297319025, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 241.19540405273438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.587890625, "step": 173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990514516830444 }, { "episode": 2800, "epoch": 0.016776311847671088, "loss/policy_avg": 0.027285143733024597, "lr": 9.888803680981595e-06, "objective/entropy": 91.14071655273438, "objective/kl": 19.611085891723633, "objective/non_score_reward": -0.9805543422698975, "objective/rlhf_reward": -2.44126462471044, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 60.10600662231445, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.537109375, "step": 174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972784519195557 }, { "episode": 2816, "epoch": 0.016872176486800637, "loss/policy_avg": 0.2845172882080078, "lr": 9.888164621676892e-06, "objective/entropy": 30.190153121948242, "objective/kl": 24.783939361572266, "objective/non_score_reward": -1.239197015762329, "objective/rlhf_reward": -3.578185775367123, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 76.30748748779297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.443359375, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994440078735352 }, { "episode": 2832, "epoch": 0.016968041125930186, "loss/policy_avg": 0.5662503838539124, "lr": 9.887525562372189e-06, "objective/entropy": 60.807342529296875, "objective/kl": 12.370782852172852, "objective/non_score_reward": -0.6185390949249268, "objective/rlhf_reward": -1.0503242506581225, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 14.155126571655273, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.525390625, "step": 176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987331628799438 }, { "episode": 2848, "epoch": 0.017063905765059735, "loss/policy_avg": 0.08586982637643814, "lr": 9.886886503067486e-06, "objective/entropy": 43.38105010986328, "objective/kl": 24.246856689453125, "objective/non_score_reward": -1.2123429775238037, "objective/rlhf_reward": -3.470769503203732, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 141.50592041015625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.64453125, "step": 177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969239234924316 }, { "episode": 2864, "epoch": 0.017159770404189284, "loss/policy_avg": 0.26094895601272583, "lr": 9.886247443762783e-06, "objective/entropy": 54.85191345214844, "objective/kl": 20.912307739257812, "objective/non_score_reward": -1.0456154346466064, "objective/rlhf_reward": -2.7824616193771363, "objective/scores": 0.35, "policy/approxkl_avg": 19.43996810913086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4755859375, "step": 178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007808208465576 }, { "episode": 2880, "epoch": 0.017255635043318833, "loss/policy_avg": -0.0008885636925697327, "lr": 9.88560838445808e-06, "objective/entropy": 1.5364952087402344, "objective/kl": 18.547964096069336, "objective/non_score_reward": -0.9273982048034668, "objective/rlhf_reward": -1.762181530671056, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 103.84625244140625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.52734375, "step": 179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0031652450561523 }, { "episode": 2896, "epoch": 0.017351499682448382, "loss/policy_avg": 0.07095308601856232, "lr": 9.884969325153375e-06, "objective/entropy": -57.707908630371094, "objective/kl": 17.486156463623047, "objective/non_score_reward": -0.8743079304695129, "objective/rlhf_reward": -1.3745254895844794, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 35.78956604003906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.63671875, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995075464248657 }, { "episode": 2912, "epoch": 0.01744736432157793, "loss/policy_avg": 0.42247164249420166, "lr": 9.884330265848671e-06, "objective/entropy": 194.7113037109375, "objective/kl": 21.53358268737793, "objective/non_score_reward": -1.0766791105270386, "objective/rlhf_reward": -2.750457256045893, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 58.89783477783203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.720703125, "step": 181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996431827545166 }, { "episode": 2928, "epoch": 0.01754322896070748, "loss/policy_avg": 0.3189627528190613, "lr": 9.883691206543968e-06, "objective/entropy": 125.43355560302734, "objective/kl": 20.729223251342773, "objective/non_score_reward": -1.0364612340927124, "objective/rlhf_reward": -2.767242708293301, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 31.974578857421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984747171401978 }, { "episode": 2944, "epoch": 0.01763909359983703, "loss/policy_avg": 0.19416040182113647, "lr": 9.883052147239265e-06, "objective/entropy": 127.4957275390625, "objective/kl": 23.107641220092773, "objective/non_score_reward": -1.1553820371627808, "objective/rlhf_reward": -3.2429258609689295, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 41.45734786987305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.376953125, "step": 183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999535322189331 }, { "episode": 2960, "epoch": 0.017734958238966578, "loss/policy_avg": 0.04916887357831001, "lr": 9.882413087934562e-06, "objective/entropy": -16.33904266357422, "objective/kl": 15.624849319458008, "objective/non_score_reward": -0.7812424898147583, "objective/rlhf_reward": -1.002263667360816, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 86.75860595703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8203125, "step": 184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967164993286133 }, { "episode": 2976, "epoch": 0.017830822878096127, "loss/policy_avg": 0.15854808688163757, "lr": 9.881774028629857e-06, "objective/entropy": -9.968147277832031, "objective/kl": 20.46514320373535, "objective/non_score_reward": -1.0232571363449097, "objective/rlhf_reward": -2.35969527165095, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 16.395225524902344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5859375, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976359605789185 }, { "episode": 2992, "epoch": 0.017926687517225676, "loss/policy_avg": 0.36498603224754333, "lr": 9.881134969325154e-06, "objective/entropy": 209.59991455078125, "objective/kl": 18.690290451049805, "objective/non_score_reward": -0.9345145225524902, "objective/rlhf_reward": -2.338058030605316, "objective/scores": 0.35, "policy/approxkl_avg": 12.64120101928711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.623046875, "step": 186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9994118213653564 }, { "episode": 3008, "epoch": 0.018022552156355228, "loss/policy_avg": 0.15073028206825256, "lr": 9.880495910020451e-06, "objective/entropy": 33.50044250488281, "objective/kl": 21.099205017089844, "objective/non_score_reward": -1.0549602508544922, "objective/rlhf_reward": 0.1801587581634525, "objective/scores": 1.1, "policy/approxkl_avg": 28.017484664916992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.666015625, "step": 187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000263214111328 }, { "episode": 3024, "epoch": 0.018118416795484777, "loss/policy_avg": 0.04914219304919243, "lr": 9.879856850715748e-06, "objective/entropy": 109.99685668945312, "objective/kl": 23.795440673828125, "objective/non_score_reward": -1.1897720098495483, "objective/rlhf_reward": -0.3590880990028378, "objective/scores": 1.1, "policy/approxkl_avg": 17.797225952148438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.529296875, "step": 188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0002386569976807 }, { "episode": 3040, "epoch": 0.018214281434614326, "loss/policy_avg": 0.26782599091529846, "lr": 9.879217791411043e-06, "objective/entropy": 46.40031051635742, "objective/kl": 15.295504570007324, "objective/non_score_reward": -0.764775276184082, "objective/rlhf_reward": -1.6998512086614799, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.033124923706055, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4287109375, "step": 189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0006394386291504 }, { "episode": 3056, "epoch": 0.018310146073743875, "loss/policy_avg": -0.0003484562039375305, "lr": 9.87857873210634e-06, "objective/entropy": -128.13638305664062, "objective/kl": 23.236797332763672, "objective/non_score_reward": -1.1618399620056152, "objective/rlhf_reward": -2.985500340879546, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 122.61852264404297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.521484375, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998673439025879 }, { "episode": 3072, "epoch": 0.018406010712873424, "loss/policy_avg": 0.285878986120224, "lr": 9.877939672801637e-06, "objective/entropy": -155.79151916503906, "objective/kl": 17.15728187561035, "objective/non_score_reward": -0.8578640818595886, "objective/rlhf_reward": -1.6981231282154718, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 27.024686813354492, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578125, "step": 191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977209568023682 }, { "episode": 3088, "epoch": 0.018501875352002973, "loss/policy_avg": 0.03845605254173279, "lr": 9.877300613496934e-06, "objective/entropy": -79.23377227783203, "objective/kl": 24.854154586791992, "objective/non_score_reward": -1.2427077293395996, "objective/rlhf_reward": -3.4145718505054266, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 108.08650970458984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.54296875, "step": 192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9965288639068604 }, { "episode": 3104, "epoch": 0.018597739991132522, "loss/policy_avg": 0.22054271399974823, "lr": 9.876661554192229e-06, "objective/entropy": 58.46562576293945, "objective/kl": 18.69571876525879, "objective/non_score_reward": -0.9347859621047974, "objective/rlhf_reward": -1.3391437292099, "objective/scores": 0.6, "policy/approxkl_avg": 17.535587310791016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996351957321167 }, { "episode": 3120, "epoch": 0.01869360463026207, "loss/policy_avg": 0.46004775166511536, "lr": 9.876022494887526e-06, "objective/entropy": 208.6689453125, "objective/kl": 24.537294387817383, "objective/non_score_reward": -1.2268648147583008, "objective/rlhf_reward": -3.3511998941570074, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 103.11289978027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6171875, "step": 194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980366230010986 }, { "episode": 3136, "epoch": 0.01878946926939162, "loss/policy_avg": 0.14284425973892212, "lr": 9.875383435582823e-06, "objective/entropy": -140.25045776367188, "objective/kl": 21.156387329101562, "objective/non_score_reward": -1.0578192472457886, "objective/rlhf_reward": -1.8312772423028945, "objective/scores": 0.6, "policy/approxkl_avg": 95.11038208007812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69921875, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0021564960479736 }, { "episode": 3152, "epoch": 0.01888533390852117, "loss/policy_avg": 0.4036502540111542, "lr": 9.87474437627812e-06, "objective/entropy": 97.97139739990234, "objective/kl": 20.765098571777344, "objective/non_score_reward": -1.038254737854004, "objective/rlhf_reward": -1.7530193686485291, "objective/scores": 0.6, "policy/approxkl_avg": 33.61680603027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.75, "step": 196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9960455894470215 }, { "episode": 3168, "epoch": 0.018981198547650718, "loss/policy_avg": 0.03367016091942787, "lr": 9.874105316973416e-06, "objective/entropy": 110.7692642211914, "objective/kl": 32.466636657714844, "objective/non_score_reward": -1.6233320236206055, "objective/rlhf_reward": -4.668499465259623, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.905399322509766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66015625, "step": 197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000126361846924 }, { "episode": 3184, "epoch": 0.019077063186780267, "loss/policy_avg": 0.3382406532764435, "lr": 9.873466257668712e-06, "objective/entropy": -46.87655258178711, "objective/kl": 23.83783531188965, "objective/non_score_reward": -1.1918917894363403, "objective/rlhf_reward": -3.44205424550168, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 26.46108055114746, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4814453125, "step": 198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9974579811096191 }, { "episode": 3200, "epoch": 0.019172927825909816, "loss/policy_avg": 0.05052588880062103, "lr": 9.872827198364009e-06, "objective/entropy": -62.79549789428711, "objective/kl": 19.587276458740234, "objective/non_score_reward": -0.9793638586997986, "objective/rlhf_reward": -0.9937364205133643, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.62165069580078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.564453125, "step": 199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972220659255981 }, { "episode": 3216, "epoch": 0.019268792465039365, "loss/policy_avg": 0.2230260968208313, "lr": 9.872188139059305e-06, "objective/entropy": -37.75834655761719, "objective/kl": 23.102069854736328, "objective/non_score_reward": -1.1551035642623901, "objective/rlhf_reward": -3.2787786035830075, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 56.49012756347656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.583984375, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000248908996582 }, { "episode": 3232, "epoch": 0.019364657104168913, "loss/policy_avg": 0.4118785858154297, "lr": 9.871549079754602e-06, "objective/entropy": 85.49769592285156, "objective/kl": 25.69809913635254, "objective/non_score_reward": -1.284904956817627, "objective/rlhf_reward": -3.5833605816036016, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 56.752174377441406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66015625, "step": 201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987632036209106 }, { "episode": 3248, "epoch": 0.019460521743298462, "loss/policy_avg": 0.06031988561153412, "lr": 9.8709100204499e-06, "objective/entropy": 16.456554412841797, "objective/kl": 25.35955047607422, "objective/non_score_reward": -1.2679774761199951, "objective/rlhf_reward": -3.6213118239358515, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 21.745624542236328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.529296875, "step": 202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980382919311523 }, { "episode": 3264, "epoch": 0.01955638638242801, "loss/policy_avg": 0.06312263011932373, "lr": 9.870270961145196e-06, "objective/entropy": 132.99948120117188, "objective/kl": 22.432659149169922, "objective/non_score_reward": -1.1216330528259277, "objective/rlhf_reward": -2.8246725253468616, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 93.43849182128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.568359375, "step": 203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995737075805664 }, { "episode": 3280, "epoch": 0.01965225102155756, "loss/policy_avg": 0.6064414978027344, "lr": 9.869631901840491e-06, "objective/entropy": -19.207683563232422, "objective/kl": 18.83993148803711, "objective/non_score_reward": -0.9419965744018555, "objective/rlhf_reward": -2.3173880978540033, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 90.60572052001953, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4931640625, "step": 204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000382661819458 }, { "episode": 3296, "epoch": 0.01974811566068711, "loss/policy_avg": 0.2940763831138611, "lr": 9.868992842535788e-06, "objective/entropy": 83.77371978759766, "objective/kl": 25.884700775146484, "objective/non_score_reward": -1.2942349910736084, "objective/rlhf_reward": -3.3521112903681507, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 39.873409271240234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.447265625, "step": 205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972522258758545 }, { "episode": 3312, "epoch": 0.019843980299816658, "loss/policy_avg": 0.18257562816143036, "lr": 9.868353783231085e-06, "objective/entropy": 119.6646728515625, "objective/kl": 27.568458557128906, "objective/non_score_reward": -1.3784228563308716, "objective/rlhf_reward": -1.1136915445327755, "objective/scores": 1.1, "policy/approxkl_avg": 48.24208068847656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66796875, "step": 206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987564086914062 }, { "episode": 3328, "epoch": 0.019939844938946207, "loss/policy_avg": -0.011964879930019379, "lr": 9.867714723926382e-06, "objective/entropy": 79.78416442871094, "objective/kl": 24.409799575805664, "objective/non_score_reward": -1.2204899787902832, "objective/rlhf_reward": -3.5033578658975184, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.269145965576172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4658203125, "step": 207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000608205795288 }, { "episode": 3344, "epoch": 0.020035709578075756, "loss/policy_avg": 0.04908262565732002, "lr": 9.867075664621679e-06, "objective/entropy": 174.413818359375, "objective/kl": 24.83539581298828, "objective/non_score_reward": -1.241769790649414, "objective/rlhf_reward": -3.3629594779649548, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 14.995980262756348, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.54296875, "step": 208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9985637664794922 }, { "episode": 3360, "epoch": 0.020131574217205305, "loss/policy_avg": 0.14710021018981934, "lr": 9.866436605316974e-06, "objective/entropy": 132.51194763183594, "objective/kl": 29.743432998657227, "objective/non_score_reward": -1.4871716499328613, "objective/rlhf_reward": -4.344566795889454, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 65.08041381835938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.490234375, "step": 209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0021185874938965 }, { "episode": 3376, "epoch": 0.020227438856334854, "loss/policy_avg": 0.0796532854437828, "lr": 9.86579754601227e-06, "objective/entropy": 1.3461151123046875, "objective/kl": 26.279298782348633, "objective/non_score_reward": -1.313965082168579, "objective/rlhf_reward": -0.8558599710464474, "objective/scores": 1.1, "policy/approxkl_avg": 105.49284362792969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989545345306396 }, { "episode": 3392, "epoch": 0.020323303495464403, "loss/policy_avg": -0.03664415329694748, "lr": 9.865158486707568e-06, "objective/entropy": -37.266082763671875, "objective/kl": 19.48423957824707, "objective/non_score_reward": -0.9742119908332825, "objective/rlhf_reward": -0.9731288298380103, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.304027557373047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.638671875, "step": 211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003216028213501 }, { "episode": 3408, "epoch": 0.020419168134593952, "loss/policy_avg": 0.30985838174819946, "lr": 9.864519427402863e-06, "objective/entropy": 94.80859375, "objective/kl": 29.94342041015625, "objective/non_score_reward": -1.4971709251403809, "objective/rlhf_reward": -4.564851482112971, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 115.7642593383789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.74609375, "step": 212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9959279298782349 }, { "episode": 3424, "epoch": 0.0205150327737235, "loss/policy_avg": 0.23234406113624573, "lr": 9.86388036809816e-06, "objective/entropy": 125.32878875732422, "objective/kl": 33.22450637817383, "objective/non_score_reward": -1.6612253189086914, "objective/rlhf_reward": -4.820072407993387, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 82.43852233886719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001713752746582 }, { "episode": 3440, "epoch": 0.02061089741285305, "loss/policy_avg": 1.5097947120666504, "lr": 9.863241308793457e-06, "objective/entropy": 132.66845703125, "objective/kl": 27.622318267822266, "objective/non_score_reward": -1.3811159133911133, "objective/rlhf_reward": -3.6996345475044956, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 26.179336547851562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993813037872314 }, { "episode": 3456, "epoch": 0.0207067620519826, "loss/policy_avg": 0.12209601700305939, "lr": 9.862602249488753e-06, "objective/entropy": 132.88406372070312, "objective/kl": 26.24971580505371, "objective/non_score_reward": -1.312485694885254, "objective/rlhf_reward": -5.249942898750305, "objective/scores": 0.0, "policy/approxkl_avg": 41.524139404296875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7109375, "step": 215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990661144256592 }, { "episode": 3472, "epoch": 0.02080262669111215, "loss/policy_avg": 0.3654727339744568, "lr": 9.86196319018405e-06, "objective/entropy": 39.344974517822266, "objective/kl": 23.619754791259766, "objective/non_score_reward": -1.18098783493042, "objective/rlhf_reward": -1.8002320870172706, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.19040584564209, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4951171875, "step": 216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990694522857666 }, { "episode": 3488, "epoch": 0.0208984913302417, "loss/policy_avg": 0.05907230079174042, "lr": 9.861324130879346e-06, "objective/entropy": -49.055564880371094, "objective/kl": 27.70423126220703, "objective/non_score_reward": -1.3852115869522095, "objective/rlhf_reward": -3.8789869598752125, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 62.16511917114258, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973804950714111 }, { "episode": 3504, "epoch": 0.02099435596937125, "loss/policy_avg": 0.5758800506591797, "lr": 9.860685071574642e-06, "objective/entropy": 18.1787166595459, "objective/kl": 25.688358306884766, "objective/non_score_reward": -1.2844178676605225, "objective/rlhf_reward": -2.2139523147952285, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 23.39984130859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.498046875, "step": 218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974064826965332 }, { "episode": 3520, "epoch": 0.021090220608500798, "loss/policy_avg": 0.2610527575016022, "lr": 9.86004601226994e-06, "objective/entropy": -68.09791564941406, "objective/kl": 26.7615966796875, "objective/non_score_reward": -1.3380796909332275, "objective/rlhf_reward": -4.026806149512453, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 124.13450622558594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4599609375, "step": 219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986741542816162 }, { "episode": 3536, "epoch": 0.021186085247630347, "loss/policy_avg": 0.1624567210674286, "lr": 9.859406952965236e-06, "objective/entropy": -113.99856567382812, "objective/kl": 19.689868927001953, "objective/non_score_reward": -0.9844935536384583, "objective/rlhf_reward": -2.113145466121744, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 45.295875549316406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004196166992188 }, { "episode": 3552, "epoch": 0.021281949886759896, "loss/policy_avg": 0.13548433780670166, "lr": 9.858767893660533e-06, "objective/entropy": 154.66708374023438, "objective/kl": 31.08365249633789, "objective/non_score_reward": -1.554182529449463, "objective/rlhf_reward": -4.554870968282805, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 43.560997009277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7265625, "step": 221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972419738769531 }, { "episode": 3568, "epoch": 0.021377814525889445, "loss/policy_avg": 0.04025420919060707, "lr": 9.858128834355828e-06, "objective/entropy": 145.02468872070312, "objective/kl": 31.459678649902344, "objective/non_score_reward": -1.572983980178833, "objective/rlhf_reward": -4.932686292861385, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 41.05935287475586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4560546875, "step": 222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0009706020355225 }, { "episode": 3584, "epoch": 0.021473679165018994, "loss/policy_avg": 1.5885295867919922, "lr": 9.857489775051125e-06, "objective/entropy": 141.5781707763672, "objective/kl": 34.53314971923828, "objective/non_score_reward": -1.726657509803772, "objective/rlhf_reward": -5.244770532072174, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 37.03607177734375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658203125, "step": 223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.992570400238037 }, { "episode": 3600, "epoch": 0.021569543804148543, "loss/policy_avg": 0.9811650514602661, "lr": 9.856850715746422e-06, "objective/entropy": -30.946441650390625, "objective/kl": 29.145998001098633, "objective/non_score_reward": -1.4572999477386475, "objective/rlhf_reward": -4.450597622481686, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.481060028076172, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.462890625, "step": 224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983983039855957 }, { "episode": 3616, "epoch": 0.021665408443278092, "loss/policy_avg": 0.5196128487586975, "lr": 9.856211656441719e-06, "objective/entropy": -16.55962371826172, "objective/kl": 28.4706974029541, "objective/non_score_reward": -1.423534870147705, "objective/rlhf_reward": -3.5714332482972484, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 117.12289428710938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.732421875, "step": 225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975980520248413 }, { "episode": 3632, "epoch": 0.02176127308240764, "loss/policy_avg": 0.6528609395027161, "lr": 9.855572597137016e-06, "objective/entropy": 136.64077758789062, "objective/kl": 32.46646499633789, "objective/non_score_reward": -1.6233232021331787, "objective/rlhf_reward": -2.093292927742004, "objective/scores": 1.1, "policy/approxkl_avg": 44.35145950317383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.669921875, "step": 226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994146823883057 }, { "episode": 3648, "epoch": 0.02185713772153719, "loss/policy_avg": 0.9434906244277954, "lr": 9.854933537832313e-06, "objective/entropy": -36.75615310668945, "objective/kl": 31.890575408935547, "objective/non_score_reward": -1.5945286750793457, "objective/rlhf_reward": -5.052601966887636, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 65.19577026367188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.59375, "step": 227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979374408721924 }, { "episode": 3664, "epoch": 0.02195300236066674, "loss/policy_avg": 0.36130765080451965, "lr": 9.854294478527608e-06, "objective/entropy": 47.61101531982422, "objective/kl": 18.669593811035156, "objective/non_score_reward": -0.9334796071052551, "objective/rlhf_reward": -2.3339184284210206, "objective/scores": 0.35, "policy/approxkl_avg": 15.266149520874023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.578125, "step": 228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9959979057312012 }, { "episode": 3680, "epoch": 0.022048866999796288, "loss/policy_avg": 0.18321090936660767, "lr": 9.853655419222905e-06, "objective/entropy": 116.60293579101562, "objective/kl": 27.56112289428711, "objective/non_score_reward": -1.378056287765503, "objective/rlhf_reward": -3.5648136837052657, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 29.471284866333008, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.537109375, "step": 229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991540908813477 }, { "episode": 3696, "epoch": 0.022144731638925837, "loss/policy_avg": -0.044996485114097595, "lr": 9.853016359918202e-06, "objective/entropy": 38.275238037109375, "objective/kl": 28.720836639404297, "objective/non_score_reward": -1.4360418319702148, "objective/rlhf_reward": -4.187907754388407, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 173.6102752685547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.552734375, "step": 230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997882604598999 }, { "episode": 3712, "epoch": 0.022240596278055386, "loss/policy_avg": 0.027855467051267624, "lr": 9.852377300613498e-06, "objective/entropy": 123.59611511230469, "objective/kl": 30.175601959228516, "objective/non_score_reward": -1.5087801218032837, "objective/rlhf_reward": -4.478861062732294, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 50.733642578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.37109375, "step": 231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0003466606140137 }, { "episode": 3728, "epoch": 0.022336460917184935, "loss/policy_avg": -0.3093503713607788, "lr": 9.851738241308795e-06, "objective/entropy": 0.438995361328125, "objective/kl": 27.025171279907227, "objective/non_score_reward": -1.3512585163116455, "objective/rlhf_reward": -5.405034303665161, "objective/scores": 0.0, "policy/approxkl_avg": 13.092641830444336, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.615234375, "step": 232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000675678253174 }, { "episode": 3744, "epoch": 0.022432325556314484, "loss/policy_avg": -0.05236402899026871, "lr": 9.85109918200409e-06, "objective/entropy": 112.74819946289062, "objective/kl": 24.94538688659668, "objective/non_score_reward": -1.2472693920135498, "objective/rlhf_reward": -3.473305845054325, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 19.200075149536133, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.3583984375, "step": 233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.002117395401001 }, { "episode": 3760, "epoch": 0.022528190195444033, "loss/policy_avg": 0.21103611588478088, "lr": 9.850460122699387e-06, "objective/entropy": 73.77043151855469, "objective/kl": 28.00216293334961, "objective/non_score_reward": -1.4001080989837646, "objective/rlhf_reward": -3.6530211669968917, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 13.291183471679688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5009765625, "step": 234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995331764221191 }, { "episode": 3776, "epoch": 0.02262405483457358, "loss/policy_avg": 0.6418443918228149, "lr": 9.849821063394683e-06, "objective/entropy": 19.92426300048828, "objective/kl": 31.282997131347656, "objective/non_score_reward": -1.5641499757766724, "objective/rlhf_reward": -4.931086901456041, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 98.59768676757812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.34375, "step": 235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005524158477783 }, { "episode": 3792, "epoch": 0.02271991947370313, "loss/policy_avg": 0.20836295187473297, "lr": 9.84918200408998e-06, "objective/entropy": 28.238201141357422, "objective/kl": 29.105060577392578, "objective/non_score_reward": -1.455253005027771, "objective/rlhf_reward": -4.264752714839533, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 34.374176025390625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.51171875, "step": 236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989919662475586 }, { "episode": 3808, "epoch": 0.02281578411283268, "loss/policy_avg": 0.43571943044662476, "lr": 9.848542944785276e-06, "objective/entropy": 144.94302368164062, "objective/kl": 33.369178771972656, "objective/non_score_reward": -1.6684589385986328, "objective/rlhf_reward": -5.314585768912716, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 113.68771362304688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.607421875, "step": 237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996192216873169 }, { "episode": 3824, "epoch": 0.02291164875196223, "loss/policy_avg": 0.14893671870231628, "lr": 9.847903885480573e-06, "objective/entropy": 186.38681030273438, "objective/kl": 41.077842712402344, "objective/non_score_reward": -2.0538923740386963, "objective/rlhf_reward": -6.611449215475636, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 168.3666229248047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.755859375, "step": 238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984937906265259 }, { "episode": 3840, "epoch": 0.023007513391091777, "loss/policy_avg": 0.07648584991693497, "lr": 9.84726482617587e-06, "objective/entropy": -37.23631286621094, "objective/kl": 25.318248748779297, "objective/non_score_reward": -1.2659125328063965, "objective/rlhf_reward": -3.5073907067447454, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 50.266414642333984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48828125, "step": 239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979995489120483 }, { "episode": 3856, "epoch": 0.023103378030221326, "loss/policy_avg": -0.15926438570022583, "lr": 9.846625766871167e-06, "objective/entropy": 37.868736267089844, "objective/kl": 27.493305206298828, "objective/non_score_reward": -1.3746652603149414, "objective/rlhf_reward": -4.173148546248598, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.63505220413208, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0081896781921387 }, { "episode": 3872, "epoch": 0.023199242669350875, "loss/policy_avg": 0.14562831819057465, "lr": 9.845986707566462e-06, "objective/entropy": 15.188220977783203, "objective/kl": 28.046958923339844, "objective/non_score_reward": -1.4023480415344238, "objective/rlhf_reward": -4.1587937875703425, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 43.238990783691406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578125, "step": 241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996706247329712 }, { "episode": 3888, "epoch": 0.023295107308480424, "loss/policy_avg": 0.11054911464452744, "lr": 9.845347648261759e-06, "objective/entropy": 65.03858947753906, "objective/kl": 30.087387084960938, "objective/non_score_reward": -1.5043694972991943, "objective/rlhf_reward": -4.070066402630742, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.83949613571167, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988956451416016 }, { "episode": 3904, "epoch": 0.023390971947609973, "loss/policy_avg": 0.3941475749015808, "lr": 9.844708588957056e-06, "objective/entropy": 59.93316650390625, "objective/kl": 25.623512268066406, "objective/non_score_reward": -1.2811756134033203, "objective/rlhf_reward": -3.52058264977129, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 78.30380249023438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5859375, "step": 243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990283250808716 }, { "episode": 3920, "epoch": 0.023486836586739522, "loss/policy_avg": 0.19095474481582642, "lr": 9.844069529652353e-06, "objective/entropy": 31.422988891601562, "objective/kl": 24.865825653076172, "objective/non_score_reward": -1.2432913780212402, "objective/rlhf_reward": -3.2398319403330484, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 38.12981033325195, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.53125, "step": 244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.004727840423584 }, { "episode": 3936, "epoch": 0.023582701225869074, "loss/policy_avg": 0.049357250332832336, "lr": 9.84343047034765e-06, "objective/entropy": 21.297576904296875, "objective/kl": 35.60150146484375, "objective/non_score_reward": -1.7800750732421875, "objective/rlhf_reward": -5.720300531387329, "objective/scores": 0.35, "policy/approxkl_avg": 38.869449615478516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4716796875, "step": 245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0019991397857666 }, { "episode": 3952, "epoch": 0.023678565864998623, "loss/policy_avg": 0.7713517546653748, "lr": 9.842791411042945e-06, "objective/entropy": 53.62720489501953, "objective/kl": 31.218942642211914, "objective/non_score_reward": -1.5609471797943115, "objective/rlhf_reward": -4.296377490239079, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 48.73869323730469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7265625, "step": 246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975836277008057 }, { "episode": 3968, "epoch": 0.023774430504128172, "loss/policy_avg": 0.008143262937664986, "lr": 9.842152351738242e-06, "objective/entropy": 171.02789306640625, "objective/kl": 34.79176330566406, "objective/non_score_reward": -1.7395880222320557, "objective/rlhf_reward": -5.296492939413176, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.7828369140625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.57421875, "step": 247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9989376068115234 }, { "episode": 3984, "epoch": 0.02387029514325772, "loss/policy_avg": -0.12264247238636017, "lr": 9.841513292433539e-06, "objective/entropy": 80.24577331542969, "objective/kl": 33.11949920654297, "objective/non_score_reward": -1.6559748649597168, "objective/rlhf_reward": -4.799071069034646, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 61.87395477294922, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4599609375, "step": 248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003005027770996 }, { "episode": 4000, "epoch": 0.02396615978238727, "loss/policy_avg": 0.2658330202102661, "lr": 9.840874233128836e-06, "objective/entropy": 149.58941650390625, "objective/kl": 29.3863525390625, "objective/non_score_reward": -1.4693175554275513, "objective/rlhf_reward": -4.273150358263569, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 58.66055679321289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51171875, "step": 249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972658157348633 }, { "episode": 4016, "epoch": 0.02406202442151682, "loss/policy_avg": 0.09115779399871826, "lr": 9.840235173824132e-06, "objective/entropy": 147.28927612304688, "objective/kl": 31.492679595947266, "objective/non_score_reward": -1.5746338367462158, "objective/rlhf_reward": -4.939285838340206, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 28.799278259277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.796875, "step": 250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002530574798584 }, { "episode": 4032, "epoch": 0.024157889060646368, "loss/policy_avg": 0.09398385882377625, "lr": 9.83959611451943e-06, "objective/entropy": -45.248435974121094, "objective/kl": 28.402175903320312, "objective/non_score_reward": -1.4201087951660156, "objective/rlhf_reward": -4.018575882137405, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 19.838550567626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.517578125, "step": 251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973764419555664 }, { "episode": 4048, "epoch": 0.024253753699775917, "loss/policy_avg": 0.19270983338356018, "lr": 9.838957055214724e-06, "objective/entropy": 77.1705093383789, "objective/kl": 34.050987243652344, "objective/non_score_reward": -1.7025493383407593, "objective/rlhf_reward": -5.076863960425058, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 18.725093841552734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4814453125, "step": 252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001063823699951 }, { "episode": 4064, "epoch": 0.024349618338905466, "loss/policy_avg": 0.4652649164199829, "lr": 9.838317995910021e-06, "objective/entropy": 257.7345886230469, "objective/kl": 24.133747100830078, "objective/non_score_reward": -1.2066874504089355, "objective/rlhf_reward": -3.4481475735581935, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 41.46368408203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.80078125, "step": 253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9955878257751465 }, { "episode": 4080, "epoch": 0.024445482978035015, "loss/policy_avg": 0.14692571759223938, "lr": 9.837678936605318e-06, "objective/entropy": 43.00188064575195, "objective/kl": 24.73518180847168, "objective/non_score_reward": -1.236759066581726, "objective/rlhf_reward": -3.568433978644711, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 75.05264282226562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990639686584473 }, { "episode": 4096, "epoch": 0.024541347617164564, "loss/policy_avg": 0.08271847665309906, "lr": 9.837039877300615e-06, "objective/entropy": -79.57066345214844, "objective/kl": 26.90784454345703, "objective/non_score_reward": -1.3453922271728516, "objective/rlhf_reward": -3.648235575358073, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 24.23294448852539, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984873533248901 }, { "episode": 4112, "epoch": 0.024637212256294113, "loss/policy_avg": 0.12403183430433273, "lr": 9.83640081799591e-06, "objective/entropy": 87.87326049804688, "objective/kl": 29.708419799804688, "objective/non_score_reward": -1.4854209423065186, "objective/rlhf_reward": -4.116855438026499, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 32.65428161621094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62890625, "step": 256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981478452682495 }, { "episode": 4128, "epoch": 0.024733076895423662, "loss/policy_avg": -0.17764857411384583, "lr": 9.835761758691207e-06, "objective/entropy": 130.6345977783203, "objective/kl": 34.35237121582031, "objective/non_score_reward": -1.717618465423584, "objective/rlhf_reward": -5.314214794841364, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 118.99533081054688, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.3974609375, "step": 257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.014057159423828 }, { "episode": 4144, "epoch": 0.02482894153455321, "loss/policy_avg": 2.400163173675537, "lr": 9.835122699386504e-06, "objective/entropy": 123.72301483154297, "objective/kl": 21.25601577758789, "objective/non_score_reward": -1.0628007650375366, "objective/rlhf_reward": 0.1487968802452091, "objective/scores": 1.1, "policy/approxkl_avg": 36.07887268066406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.572265625, "step": 258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998108148574829 }, { "episode": 4160, "epoch": 0.02492480617368276, "loss/policy_avg": 0.3900964856147766, "lr": 9.8344836400818e-06, "objective/entropy": 233.3748321533203, "objective/kl": 42.447425842285156, "objective/non_score_reward": -2.1223714351654053, "objective/rlhf_reward": -5.5657667263757915, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 19.722026824951172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.74609375, "step": 259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000584363937378 }, { "episode": 4176, "epoch": 0.02502067081281231, "loss/policy_avg": 0.3361247181892395, "lr": 9.833844580777096e-06, "objective/entropy": 135.13961791992188, "objective/kl": 31.25783920288086, "objective/non_score_reward": -1.5628920793533325, "objective/rlhf_reward": -4.426739449771952, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.49414825439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.486328125, "step": 260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986810684204102 }, { "episode": 4192, "epoch": 0.025116535451941858, "loss/policy_avg": 0.1438344419002533, "lr": 9.833205521472393e-06, "objective/entropy": 104.18168640136719, "objective/kl": 35.72525405883789, "objective/non_score_reward": -1.7862627506256104, "objective/rlhf_reward": -5.320222015651773, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.100770950317383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996044635772705 }, { "episode": 4208, "epoch": 0.025212400091071407, "loss/policy_avg": 2.402132034301758, "lr": 9.83256646216769e-06, "objective/entropy": 91.16908264160156, "objective/kl": 29.633235931396484, "objective/non_score_reward": -1.4816619157791138, "objective/rlhf_reward": -4.476049522967681, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 43.586891174316406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.004427433013916 }, { "episode": 4224, "epoch": 0.025308264730200956, "loss/policy_avg": 0.7259080410003662, "lr": 9.831927402862987e-06, "objective/entropy": 154.68115234375, "objective/kl": 37.00696563720703, "objective/non_score_reward": -1.8503483533859253, "objective/rlhf_reward": -5.576564307483743, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.052043914794922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.484375, "step": 263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9974215030670166 }, { "episode": 4240, "epoch": 0.025404129369330505, "loss/policy_avg": 0.09373458474874496, "lr": 9.831288343558284e-06, "objective/entropy": 72.85606384277344, "objective/kl": 27.522302627563477, "objective/non_score_reward": -1.376115083694458, "objective/rlhf_reward": -3.679631943973612, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 142.1138916015625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.310546875, "step": 264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991629123687744 }, { "episode": 4256, "epoch": 0.025499994008460054, "loss/policy_avg": 0.7555310130119324, "lr": 9.830649284253579e-06, "objective/entropy": 72.61222076416016, "objective/kl": 30.647029876708984, "objective/non_score_reward": -1.5323514938354492, "objective/rlhf_reward": -4.705573756893244, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 54.394874572753906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.587890625, "step": 265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006964206695557 }, { "episode": 4272, "epoch": 0.025595858647589603, "loss/policy_avg": 0.6551899313926697, "lr": 9.830010224948876e-06, "objective/entropy": 121.19924926757812, "objective/kl": 33.96527099609375, "objective/non_score_reward": -1.6982636451721191, "objective/rlhf_reward": -5.131195192754852, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 40.39656066894531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.470703125, "step": 266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999192476272583 }, { "episode": 4288, "epoch": 0.02569172328671915, "loss/policy_avg": 1.1016074419021606, "lr": 9.829371165644173e-06, "objective/entropy": 132.00601196289062, "objective/kl": 43.09049987792969, "objective/non_score_reward": -2.154524803161621, "objective/rlhf_reward": -7.102327191623386, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 126.27546691894531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3935546875, "step": 267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990718364715576 }, { "episode": 4304, "epoch": 0.0257875879258487, "loss/policy_avg": 0.08981708437204361, "lr": 9.82873210633947e-06, "objective/entropy": 140.80239868164062, "objective/kl": 26.626178741455078, "objective/non_score_reward": -1.3313090801239014, "objective/rlhf_reward": -0.9252360224723812, "objective/scores": 1.1, "policy/approxkl_avg": 84.53665924072266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.841796875, "step": 268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997150182723999 }, { "episode": 4320, "epoch": 0.02588345256497825, "loss/policy_avg": 0.565528929233551, "lr": 9.828093047034766e-06, "objective/entropy": 138.6593017578125, "objective/kl": 32.08763885498047, "objective/non_score_reward": -1.604382038116455, "objective/rlhf_reward": -4.813408408228474, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 34.42543029785156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.427734375, "step": 269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0018558502197266 }, { "episode": 4336, "epoch": 0.0259793172041078, "loss/policy_avg": 0.4312899708747864, "lr": 9.827453987730061e-06, "objective/entropy": 20.17654800415039, "objective/kl": 23.528181076049805, "objective/non_score_reward": -1.176409125328064, "objective/rlhf_reward": -2.5829304478326183, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 20.440711975097656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7890625, "step": 270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989676475524902 }, { "episode": 4352, "epoch": 0.026075181843237347, "loss/policy_avg": 0.20729105174541473, "lr": 9.826814928425358e-06, "objective/entropy": 166.21115112304688, "objective/kl": 31.01326560974121, "objective/non_score_reward": -1.5506633520126343, "objective/rlhf_reward": -6.202653288841248, "objective/scores": 0.0, "policy/approxkl_avg": 34.41830825805664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003887176513672 }, { "episode": 4368, "epoch": 0.026171046482366896, "loss/policy_avg": 3.2944061756134033, "lr": 9.826175869120655e-06, "objective/entropy": 28.755096435546875, "objective/kl": 31.482175827026367, "objective/non_score_reward": -1.5741088390350342, "objective/rlhf_reward": -4.917833187667233, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.366632461547852, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3701171875, "step": 272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0053317546844482 }, { "episode": 4384, "epoch": 0.02626691112149645, "loss/policy_avg": 0.23004142940044403, "lr": 9.825536809815952e-06, "objective/entropy": 54.82402038574219, "objective/kl": 32.45307922363281, "objective/non_score_reward": -1.6226541996002197, "objective/rlhf_reward": -5.148980966120391, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 31.775432586669922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.525390625, "step": 273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995795488357544 }, { "episode": 4400, "epoch": 0.026362775760625998, "loss/policy_avg": -0.08435960114002228, "lr": 9.824897750511249e-06, "objective/entropy": 98.25897216796875, "objective/kl": 28.68474578857422, "objective/non_score_reward": -1.4342372417449951, "objective/rlhf_reward": -5.73694920539856, "objective/scores": 0.0, "policy/approxkl_avg": 72.97157287597656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.517578125, "step": 274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0012598037719727 }, { "episode": 4416, "epoch": 0.026458640399755547, "loss/policy_avg": 0.41626134514808655, "lr": 9.824258691206546e-06, "objective/entropy": 83.60694885253906, "objective/kl": 30.977035522460938, "objective/non_score_reward": -1.548851728439331, "objective/rlhf_reward": -4.795407152175903, "objective/scores": 0.35, "policy/approxkl_avg": 39.04691696166992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60546875, "step": 275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0021119117736816 }, { "episode": 4432, "epoch": 0.026554505038885096, "loss/policy_avg": 0.43957769870758057, "lr": 9.823619631901841e-06, "objective/entropy": 127.34529113769531, "objective/kl": 35.28544616699219, "objective/non_score_reward": -1.7642724514007568, "objective/rlhf_reward": -5.606491903872833, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 150.78646850585938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.748046875, "step": 276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970619678497314 }, { "episode": 4448, "epoch": 0.026650369678014645, "loss/policy_avg": 0.8086847066879272, "lr": 9.822980572597138e-06, "objective/entropy": -119.74644470214844, "objective/kl": 26.706302642822266, "objective/non_score_reward": -1.335315227508545, "objective/rlhf_reward": -3.9412606716156002, "objective/scores": 0.35, "policy/approxkl_avg": 65.78569793701172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.568359375, "step": 277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986886978149414 }, { "episode": 4464, "epoch": 0.026746234317144194, "loss/policy_avg": 0.09760895371437073, "lr": 9.822341513292433e-06, "objective/entropy": 209.31890869140625, "objective/kl": 41.666831970214844, "objective/non_score_reward": -2.083341598510742, "objective/rlhf_reward": -6.7292466498056225, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 14.525606155395508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980053901672363 }, { "episode": 4480, "epoch": 0.026842098956273742, "loss/policy_avg": 0.0820450559258461, "lr": 9.82170245398773e-06, "objective/entropy": 152.01095581054688, "objective/kl": 29.104724884033203, "objective/non_score_reward": -1.4552361965179443, "objective/rlhf_reward": -4.159085219324218, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.12679100036621, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4560546875, "step": 279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.99649977684021 }, { "episode": 4496, "epoch": 0.02693796359540329, "loss/policy_avg": 0.08112587034702301, "lr": 9.821063394683027e-06, "objective/entropy": 49.22539138793945, "objective/kl": 32.40191650390625, "objective/non_score_reward": -1.6200958490371704, "objective/rlhf_reward": -5.029785375209197, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.874902725219727, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.404296875, "step": 280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0027780532836914 }, { "episode": 4512, "epoch": 0.02703382823453284, "loss/policy_avg": 0.41851094365119934, "lr": 9.820424335378324e-06, "objective/entropy": 108.13827514648438, "objective/kl": 44.792015075683594, "objective/non_score_reward": -2.239600658416748, "objective/rlhf_reward": -7.133574362072061, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 67.72032165527344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.537109375, "step": 281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979641437530518 }, { "episode": 4528, "epoch": 0.02712969287366239, "loss/policy_avg": 0.8327301144599915, "lr": 9.81978527607362e-06, "objective/entropy": 70.98486328125, "objective/kl": 43.82145690917969, "objective/non_score_reward": -2.191072702407837, "objective/rlhf_reward": -7.283338430340647, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.1268585920333862, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.427734375, "step": 282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002413749694824 }, { "episode": 4544, "epoch": 0.02722555751279194, "loss/policy_avg": 0.26003268361091614, "lr": 9.819146216768916e-06, "objective/entropy": 59.813140869140625, "objective/kl": 32.33997344970703, "objective/non_score_reward": -1.6169987916946411, "objective/rlhf_reward": -4.643166418346476, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 108.00172424316406, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60546875, "step": 283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996032953262329 }, { "episode": 4560, "epoch": 0.027321422151921487, "loss/policy_avg": 0.06828334182500839, "lr": 9.818507157464213e-06, "objective/entropy": 164.7733154296875, "objective/kl": 36.976539611816406, "objective/non_score_reward": -1.8488272428512573, "objective/rlhf_reward": -5.791188750330525, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 22.712989807128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.498046875, "step": 284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998927354812622 }, { "episode": 4576, "epoch": 0.027417286791051036, "loss/policy_avg": 0.346102774143219, "lr": 9.81786809815951e-06, "objective/entropy": 141.91213989257812, "objective/kl": 29.89690589904785, "objective/non_score_reward": -1.4948452711105347, "objective/rlhf_reward": -4.5793810248374935, "objective/scores": 0.35, "policy/approxkl_avg": 4.914261817932129, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.583984375, "step": 285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991732835769653 }, { "episode": 4592, "epoch": 0.027513151430180585, "loss/policy_avg": 0.07111110538244247, "lr": 9.817229038854806e-06, "objective/entropy": -41.44879150390625, "objective/kl": 29.296417236328125, "objective/non_score_reward": -1.4648208618164062, "objective/rlhf_reward": -4.4806815172113, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 70.16557312011719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5625, "step": 286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982048273086548 }, { "episode": 4608, "epoch": 0.027609016069310134, "loss/policy_avg": 0.6204440593719482, "lr": 9.816589979550103e-06, "objective/entropy": 10.609687805175781, "objective/kl": 34.5562744140625, "objective/non_score_reward": -1.727813720703125, "objective/rlhf_reward": -5.552005314563198, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 44.11948776245117, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4677734375, "step": 287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960062503814697 }, { "episode": 4624, "epoch": 0.027704880708439683, "loss/policy_avg": -0.3703474700450897, "lr": 9.8159509202454e-06, "objective/entropy": 16.20748519897461, "objective/kl": 40.348899841308594, "objective/non_score_reward": -2.0174450874328613, "objective/rlhf_reward": -5.6697804689407345, "objective/scores": 0.6, "policy/approxkl_avg": 58.94084167480469, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.451171875, "step": 288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000488519668579 }, { "episode": 4640, "epoch": 0.027800745347569232, "loss/policy_avg": 0.691341757774353, "lr": 9.815311860940695e-06, "objective/entropy": 164.64894104003906, "objective/kl": 35.96034240722656, "objective/non_score_reward": -1.7980170249938965, "objective/rlhf_reward": -2.792067980766296, "objective/scores": 1.1, "policy/approxkl_avg": 105.621826171875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.53125, "step": 289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9972904920578003 }, { "episode": 4656, "epoch": 0.02789660998669878, "loss/policy_avg": 0.05122673511505127, "lr": 9.814672801635992e-06, "objective/entropy": 143.17758178710938, "objective/kl": 27.651023864746094, "objective/non_score_reward": -1.3825511932373047, "objective/rlhf_reward": -2.6064857586633887, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.806257247924805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.462890625, "step": 290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996225118637085 }, { "episode": 4672, "epoch": 0.02799247462582833, "loss/policy_avg": -0.021466929465532303, "lr": 9.81403374233129e-06, "objective/entropy": 123.44010925292969, "objective/kl": 18.645748138427734, "objective/non_score_reward": -0.9322873950004578, "objective/rlhf_reward": -2.403636608153505, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 24.915597915649414, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.638671875, "step": 291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.995213270187378 }, { "episode": 4688, "epoch": 0.02808833926495788, "loss/policy_avg": 0.700859785079956, "lr": 9.813394683026586e-06, "objective/entropy": 58.48292922973633, "objective/kl": 28.2305965423584, "objective/non_score_reward": -1.411529779434204, "objective/rlhf_reward": -4.24611941576004, "objective/scores": 0.35, "policy/approxkl_avg": 21.04977035522461, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4208984375, "step": 292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981340169906616 }, { "episode": 4704, "epoch": 0.028184203904087428, "loss/policy_avg": 0.9605820775032043, "lr": 9.812755623721883e-06, "objective/entropy": -33.6519775390625, "objective/kl": 33.635501861572266, "objective/non_score_reward": -1.6817750930786133, "objective/rlhf_reward": -5.065241103590118, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.019363403320312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4619140625, "step": 293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000504493713379 }, { "episode": 4720, "epoch": 0.028280068543216977, "loss/policy_avg": 0.44443511962890625, "lr": 9.81211656441718e-06, "objective/entropy": 61.81305694580078, "objective/kl": 37.54548263549805, "objective/non_score_reward": -1.8772742748260498, "objective/rlhf_reward": -6.130494453994137, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 34.736690521240234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.701171875, "step": 294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981006383895874 }, { "episode": 4736, "epoch": 0.028375933182346526, "loss/policy_avg": -0.004817202687263489, "lr": 9.811477505112475e-06, "objective/entropy": -85.25079345703125, "objective/kl": 22.125272750854492, "objective/non_score_reward": -1.1062637567520142, "objective/rlhf_reward": -3.0658050415262412, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 39.945377349853516, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.619140625, "step": 295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001112937927246 }, { "episode": 4752, "epoch": 0.028471797821476075, "loss/policy_avg": -0.018911486491560936, "lr": 9.810838445807772e-06, "objective/entropy": 187.50953674316406, "objective/kl": 31.752737045288086, "objective/non_score_reward": -1.587636947631836, "objective/rlhf_reward": -4.525718684467386, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 41.095298767089844, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.560546875, "step": 296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0318245887756348 }, { "episode": 4768, "epoch": 0.028567662460605624, "loss/policy_avg": 0.5813855528831482, "lr": 9.810199386503069e-06, "objective/entropy": 13.395767211914062, "objective/kl": 29.76428985595703, "objective/non_score_reward": -1.4882144927978516, "objective/rlhf_reward": -4.219524757067362, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 58.40808868408203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971027374267578 }, { "episode": 4784, "epoch": 0.028663527099735173, "loss/policy_avg": 0.25174012780189514, "lr": 9.809560327198366e-06, "objective/entropy": 93.99857330322266, "objective/kl": 31.07823944091797, "objective/non_score_reward": -1.5539120435714722, "objective/rlhf_reward": -4.482314721743266, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 56.219329833984375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973026514053345 }, { "episode": 4800, "epoch": 0.02875939173886472, "loss/policy_avg": -0.05966740474104881, "lr": 9.808921267893663e-06, "objective/entropy": 199.3701934814453, "objective/kl": 26.15532684326172, "objective/non_score_reward": -1.3077664375305176, "objective/rlhf_reward": -3.7152936098896827, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 18.272422790527344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002091646194458 }, { "episode": 4816, "epoch": 0.02885525637799427, "loss/policy_avg": 0.19725301861763, "lr": 9.808282208588958e-06, "objective/entropy": 112.11613464355469, "objective/kl": 33.344722747802734, "objective/non_score_reward": -1.667236089706421, "objective/rlhf_reward": -6.668944478034973, "objective/scores": 0.0, "policy/approxkl_avg": 29.54242706298828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0038766860961914 }, { "episode": 4832, "epoch": 0.02895112101712382, "loss/policy_avg": -0.17506346106529236, "lr": 9.807643149284255e-06, "objective/entropy": 70.48281860351562, "objective/kl": 29.51511573791504, "objective/non_score_reward": -1.4757558107376099, "objective/rlhf_reward": -4.387251400741276, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 12.791141510009766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4814453125, "step": 301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999895691871643 }, { "episode": 4848, "epoch": 0.029046985656253372, "loss/policy_avg": 0.38140204548835754, "lr": 9.80700408997955e-06, "objective/entropy": 23.643152236938477, "objective/kl": 27.579925537109375, "objective/non_score_reward": -1.3789963722229004, "objective/rlhf_reward": -3.854125951946364, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 8.89024543762207, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984774589538574 }, { "episode": 4864, "epoch": 0.02914285029538292, "loss/policy_avg": 0.18466374278068542, "lr": 9.806365030674847e-06, "objective/entropy": -30.63671875, "objective/kl": 25.678733825683594, "objective/non_score_reward": -1.2839367389678955, "objective/rlhf_reward": -3.6199750540577735, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.08036470413208, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53515625, "step": 303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999252200126648 }, { "episode": 4880, "epoch": 0.02923871493451247, "loss/policy_avg": 0.20352232456207275, "lr": 9.805725971370144e-06, "objective/entropy": -14.465229034423828, "objective/kl": 16.88151741027832, "objective/non_score_reward": -0.8440757989883423, "objective/rlhf_reward": 1.023696751892567, "objective/scores": 1.1, "policy/approxkl_avg": 16.945369720458984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51171875, "step": 304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997638463973999 }, { "episode": 4896, "epoch": 0.02933457957364202, "loss/policy_avg": 0.36892420053482056, "lr": 9.80508691206544e-06, "objective/entropy": 136.53363037109375, "objective/kl": 30.262548446655273, "objective/non_score_reward": -1.513127326965332, "objective/rlhf_reward": -3.1287905319940776, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.166175842285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.615234375, "step": 305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0001025199890137 }, { "episode": 4912, "epoch": 0.029430444212771568, "loss/policy_avg": 0.07577557861804962, "lr": 9.804447852760737e-06, "objective/entropy": 77.17935943603516, "objective/kl": 28.32352638244629, "objective/non_score_reward": -1.4161763191223145, "objective/rlhf_reward": -4.148933493884739, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.6957955360412598, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.63671875, "step": 306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0025148391723633 }, { "episode": 4928, "epoch": 0.029526308851901117, "loss/policy_avg": 0.1559610664844513, "lr": 9.803808793456034e-06, "objective/entropy": -16.938400268554688, "objective/kl": 21.827743530273438, "objective/non_score_reward": -1.091387152671814, "objective/rlhf_reward": -2.703689043939696, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.885660171508789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.431640625, "step": 307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0027849674224854 }, { "episode": 4944, "epoch": 0.029622173491030666, "loss/policy_avg": -0.17305535078048706, "lr": 9.80316973415133e-06, "objective/entropy": -31.412694931030273, "objective/kl": 23.805431365966797, "objective/non_score_reward": -1.1902716159820557, "objective/rlhf_reward": -3.1569663322606853, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 35.29633331298828, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.654296875, "step": 308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0007619857788086 }, { "episode": 4960, "epoch": 0.029718038130160215, "loss/policy_avg": 0.13406828045845032, "lr": 9.802530674846626e-06, "objective/entropy": 68.0604248046875, "objective/kl": 31.641517639160156, "objective/non_score_reward": -1.582075834274292, "objective/rlhf_reward": -4.949701407042843, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 32.652069091796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3017578125, "step": 309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981597661972046 }, { "episode": 4976, "epoch": 0.029813902769289764, "loss/policy_avg": 0.3640270233154297, "lr": 9.801891615541923e-06, "objective/entropy": 73.73117065429688, "objective/kl": 22.181957244873047, "objective/non_score_reward": -1.109097957611084, "objective/rlhf_reward": -4.436391651630402, "objective/scores": 0.0, "policy/approxkl_avg": 24.474929809570312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.77734375, "step": 310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988360404968262 }, { "episode": 4992, "epoch": 0.029909767408419313, "loss/policy_avg": 0.598778486251831, "lr": 9.80125255623722e-06, "objective/entropy": 77.45819854736328, "objective/kl": 31.91500473022461, "objective/non_score_reward": -1.5957502126693726, "objective/rlhf_reward": -4.558172132047723, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.392116546630859, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976756572723389 }, { "episode": 5008, "epoch": 0.03000563204754886, "loss/policy_avg": -0.14829277992248535, "lr": 9.800613496932517e-06, "objective/entropy": 73.91107940673828, "objective/kl": 22.043235778808594, "objective/non_score_reward": -1.1021617650985718, "objective/rlhf_reward": -3.0086471796035763, "objective/scores": 0.35, "policy/approxkl_avg": 7.375496864318848, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.548828125, "step": 312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0021305084228516 }, { "episode": 5024, "epoch": 0.03010149668667841, "loss/policy_avg": 0.34449532628059387, "lr": 9.799974437627812e-06, "objective/entropy": 27.04425048828125, "objective/kl": 31.98007583618164, "objective/non_score_reward": -1.599003791809082, "objective/rlhf_reward": -4.996015524864196, "objective/scores": 0.35, "policy/approxkl_avg": 53.630210876464844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.529296875, "step": 313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990134239196777 }, { "episode": 5040, "epoch": 0.03019736132580796, "loss/policy_avg": 0.029857225716114044, "lr": 9.799335378323109e-06, "objective/entropy": 147.96096801757812, "objective/kl": 27.342838287353516, "objective/non_score_reward": -1.3671419620513916, "objective/rlhf_reward": -4.017969946475372, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.108400344848633, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.638671875, "step": 314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988747835159302 }, { "episode": 5056, "epoch": 0.03029322596493751, "loss/policy_avg": 0.05283927917480469, "lr": 9.798696319018406e-06, "objective/entropy": -46.846099853515625, "objective/kl": 30.715242385864258, "objective/non_score_reward": -1.535762071609497, "objective/rlhf_reward": -4.538928542200642, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 66.26033020019531, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6484375, "step": 315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992979764938354 }, { "episode": 5072, "epoch": 0.030389090604067057, "loss/policy_avg": 0.2858242094516754, "lr": 9.798057259713703e-06, "objective/entropy": -156.9435577392578, "objective/kl": 31.284622192382812, "objective/non_score_reward": -1.5642311573028564, "objective/rlhf_reward": -4.915288856535583, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 74.38943481445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7421875, "step": 316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992833137512207 }, { "episode": 5088, "epoch": 0.030484955243196606, "loss/policy_avg": 0.28274843096733093, "lr": 9.797418200409e-06, "objective/entropy": -214.69573974609375, "objective/kl": 22.27606201171875, "objective/non_score_reward": -1.1138031482696533, "objective/rlhf_reward": -2.3325063607850414, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 35.48945236206055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.505859375, "step": 317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9962687492370605 }, { "episode": 5104, "epoch": 0.030580819882326155, "loss/policy_avg": -0.08736838400363922, "lr": 9.796779141104296e-06, "objective/entropy": -18.148971557617188, "objective/kl": 27.546077728271484, "objective/non_score_reward": -1.377303957939148, "objective/rlhf_reward": -4.1306134844697535, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 76.84832000732422, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6171875, "step": 318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0014290809631348 }, { "episode": 5120, "epoch": 0.030676684521455704, "loss/policy_avg": 0.031098078936338425, "lr": 9.796140081799592e-06, "objective/entropy": 103.30211639404297, "objective/kl": 27.747032165527344, "objective/non_score_reward": -1.3873515129089355, "objective/rlhf_reward": -4.033634447845158, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 54.69970703125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985463619232178 }, { "episode": 5136, "epoch": 0.030772549160585253, "loss/policy_avg": 0.3622899651527405, "lr": 9.795501022494888e-06, "objective/entropy": 66.0567398071289, "objective/kl": 26.39444351196289, "objective/non_score_reward": -1.3197221755981445, "objective/rlhf_reward": -3.7226295759349615, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.640605449676514, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992847442626953 }, { "episode": 5152, "epoch": 0.030868413799714802, "loss/policy_avg": -0.10469883680343628, "lr": 9.794861963190185e-06, "objective/entropy": 35.81920623779297, "objective/kl": 25.668739318847656, "objective/non_score_reward": -1.2834370136260986, "objective/rlhf_reward": -3.7337480843067166, "objective/scores": 0.35, "policy/approxkl_avg": 5.808808326721191, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6484375, "step": 321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999007225036621 }, { "episode": 5168, "epoch": 0.03096427843884435, "loss/policy_avg": -0.2741212248802185, "lr": 9.794222903885482e-06, "objective/entropy": 52.38888168334961, "objective/kl": 34.969974517822266, "objective/non_score_reward": -1.748498797416687, "objective/rlhf_reward": -5.652359655409484, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.913843154907227, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3798828125, "step": 322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0103840827941895 }, { "episode": 5184, "epoch": 0.0310601430779739, "loss/policy_avg": 0.30122414231300354, "lr": 9.793583844580777e-06, "objective/entropy": 134.16075134277344, "objective/kl": 25.608116149902344, "objective/non_score_reward": -1.280405879020691, "objective/rlhf_reward": -3.1742123318480804, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 74.33633422851562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.014462471008301 }, { "episode": 5200, "epoch": 0.03115600771710345, "loss/policy_avg": 0.26204991340637207, "lr": 9.792944785276074e-06, "objective/entropy": 2.559833526611328, "objective/kl": 25.519519805908203, "objective/non_score_reward": -1.2759759426116943, "objective/rlhf_reward": -3.74465426180212, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 92.09954071044922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998145580291748 }, { "episode": 5216, "epoch": 0.031251872356233, "loss/policy_avg": 0.18864840269088745, "lr": 9.792305725971371e-06, "objective/entropy": 48.99184036254883, "objective/kl": 28.022377014160156, "objective/non_score_reward": -1.4011187553405762, "objective/rlhf_reward": -4.123522403653025, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 22.120746612548828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984712600708008 }, { "episode": 5232, "epoch": 0.03134773699536255, "loss/policy_avg": 0.42162489891052246, "lr": 9.791666666666666e-06, "objective/entropy": -129.23065185546875, "objective/kl": 31.687660217285156, "objective/non_score_reward": -1.5843830108642578, "objective/rlhf_reward": -4.821760052236256, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 111.98194885253906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996734857559204 }, { "episode": 5248, "epoch": 0.0314436016344921, "loss/policy_avg": -0.07900102436542511, "lr": 9.791027607361963e-06, "objective/entropy": 31.351696014404297, "objective/kl": 27.038206100463867, "objective/non_score_reward": -1.3519103527069092, "objective/rlhf_reward": -5.407641291618347, "objective/scores": 0.0, "policy/approxkl_avg": 9.7061767578125, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4423828125, "step": 327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0043118000030518 }, { "episode": 5264, "epoch": 0.03153946627362165, "loss/policy_avg": 0.16587843000888824, "lr": 9.79038854805726e-06, "objective/entropy": 143.86651611328125, "objective/kl": 27.42593765258789, "objective/non_score_reward": -1.3712968826293945, "objective/rlhf_reward": -4.125937962268276, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 119.49800872802734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999497890472412 }, { "episode": 5280, "epoch": 0.0316353309127512, "loss/policy_avg": 0.29106539487838745, "lr": 9.789749488752557e-06, "objective/entropy": 67.8651351928711, "objective/kl": 32.114479064941406, "objective/non_score_reward": -1.6057239770889282, "objective/rlhf_reward": -5.08126013567987, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.976801872253418, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.470703125, "step": 329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0011985301971436 }, { "episode": 5296, "epoch": 0.031731195551880746, "loss/policy_avg": 0.5780457258224487, "lr": 9.789110429447854e-06, "objective/entropy": 104.15371704101562, "objective/kl": 30.92220687866211, "objective/non_score_reward": -1.5461102724075317, "objective/rlhf_reward": -3.784441030025482, "objective/scores": 0.6, "policy/approxkl_avg": 52.566375732421875, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.55859375, "step": 330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993044137954712 }, { "episode": 5312, "epoch": 0.031827060191010295, "loss/policy_avg": 0.24728742241859436, "lr": 9.78847137014315e-06, "objective/entropy": -95.75634765625, "objective/kl": 30.755779266357422, "objective/non_score_reward": -1.5377888679504395, "objective/rlhf_reward": -4.791905486319942, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.567970275878906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.666015625, "step": 331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991464614868164 }, { "episode": 5328, "epoch": 0.031922924830139844, "loss/policy_avg": 1.9531396627426147, "lr": 9.787832310838446e-06, "objective/entropy": 18.057151794433594, "objective/kl": 21.966590881347656, "objective/non_score_reward": -1.0983295440673828, "objective/rlhf_reward": -2.993318116664886, "objective/scores": 0.35, "policy/approxkl_avg": 11.555295944213867, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.716796875, "step": 332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.004971981048584 }, { "episode": 5344, "epoch": 0.03201878946926939, "loss/policy_avg": 0.0304682869464159, "lr": 9.787193251533743e-06, "objective/entropy": -100.86114501953125, "objective/kl": 21.19540023803711, "objective/non_score_reward": -1.0597699880599976, "objective/rlhf_reward": -2.8604777837670863, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 36.17786407470703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46875, "step": 333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997187852859497 }, { "episode": 5360, "epoch": 0.03211465410839894, "loss/policy_avg": 0.2974792718887329, "lr": 9.78655419222904e-06, "objective/entropy": 59.0064697265625, "objective/kl": 23.83527183532715, "objective/non_score_reward": -1.1917636394500732, "objective/rlhf_reward": -3.2861017016724343, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 27.08124542236328, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.76171875, "step": 334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005297660827637 }, { "episode": 5376, "epoch": 0.03221051874752849, "loss/policy_avg": 0.20310130715370178, "lr": 9.785915132924337e-06, "objective/entropy": 51.579200744628906, "objective/kl": 26.064043045043945, "objective/non_score_reward": -1.3032021522521973, "objective/rlhf_reward": -3.656549363341883, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.1224026679992676, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.51171875, "step": 335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0020689964294434 }, { "episode": 5392, "epoch": 0.03230638338665804, "loss/policy_avg": -0.22360196709632874, "lr": 9.785276073619633e-06, "objective/entropy": 8.019195556640625, "objective/kl": 34.267356872558594, "objective/non_score_reward": -1.7133680582046509, "objective/rlhf_reward": -5.40287409266983, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 27.402694702148438, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.517578125, "step": 336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99736487865448 }, { "episode": 5408, "epoch": 0.03240224802578759, "loss/policy_avg": 0.394004225730896, "lr": 9.784637014314929e-06, "objective/entropy": -7.316375732421875, "objective/kl": 34.60337829589844, "objective/non_score_reward": -1.7301688194274902, "objective/rlhf_reward": -3.9969565018427105, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 60.58606719970703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990360736846924 }, { "episode": 5424, "epoch": 0.03249811266491714, "loss/policy_avg": 0.08118537068367004, "lr": 9.783997955010226e-06, "objective/entropy": 3.808826446533203, "objective/kl": 33.9757080078125, "objective/non_score_reward": -1.6987853050231934, "objective/rlhf_reward": -5.3713093592720895, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 49.47349548339844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.546875, "step": 338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974095821380615 }, { "episode": 5440, "epoch": 0.03259397730404669, "loss/policy_avg": 0.1250596046447754, "lr": 9.783358895705522e-06, "objective/entropy": -42.7471809387207, "objective/kl": 27.222618103027344, "objective/non_score_reward": -1.361130952835083, "objective/rlhf_reward": -3.9287524459683265, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.669515609741211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.509765625, "step": 339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002639293670654 }, { "episode": 5456, "epoch": 0.032689841943176236, "loss/policy_avg": 1.2977867126464844, "lr": 9.78271983640082e-06, "objective/entropy": -60.51675796508789, "objective/kl": 27.726932525634766, "objective/non_score_reward": -1.3863465785980225, "objective/rlhf_reward": -4.064433994706034, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 52.59510803222656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4345703125, "step": 340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984922409057617 }, { "episode": 5472, "epoch": 0.032785706582305785, "loss/policy_avg": 0.10771232098340988, "lr": 9.782080777096116e-06, "objective/entropy": 39.22501754760742, "objective/kl": 38.581573486328125, "objective/non_score_reward": -1.9290788173675537, "objective/rlhf_reward": -6.374679616003662, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 16.336502075195312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.599609375, "step": 341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990124702453613 }, { "episode": 5488, "epoch": 0.032881571221435334, "loss/policy_avg": 0.029969744384288788, "lr": 9.781441717791413e-06, "objective/entropy": 54.763675689697266, "objective/kl": 27.586057662963867, "objective/non_score_reward": -1.379302978515625, "objective/rlhf_reward": -3.7838785807291666, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 29.997591018676758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4462890625, "step": 342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992620944976807 }, { "episode": 5504, "epoch": 0.03297743586056488, "loss/policy_avg": -0.003006638027727604, "lr": 9.780802658486708e-06, "objective/entropy": 4.6327056884765625, "objective/kl": 25.01122283935547, "objective/non_score_reward": -1.250560998916626, "objective/rlhf_reward": -3.054833005146916, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.332850694656372, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.51171875, "step": 343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0042150020599365 }, { "episode": 5520, "epoch": 0.03307330049969443, "loss/policy_avg": -0.2595655918121338, "lr": 9.780163599182005e-06, "objective/entropy": -9.382579803466797, "objective/kl": 25.310394287109375, "objective/non_score_reward": -1.2655197381973267, "objective/rlhf_reward": -3.611480812640533, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 35.86376190185547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991991519927979 }, { "episode": 5536, "epoch": 0.03316916513882398, "loss/policy_avg": 1.6723182201385498, "lr": 9.7795245398773e-06, "objective/entropy": 167.249267578125, "objective/kl": 38.30883026123047, "objective/non_score_reward": -1.915441632270813, "objective/rlhf_reward": -6.283164360610348, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 27.648231506347656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.521484375, "step": 345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9966659545898438 }, { "episode": 5552, "epoch": 0.03326502977795353, "loss/policy_avg": 0.21136921644210815, "lr": 9.778885480572597e-06, "objective/entropy": 202.48263549804688, "objective/kl": 28.62633514404297, "objective/non_score_reward": -1.4313167333602905, "objective/rlhf_reward": -4.169007628169611, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 28.591995239257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9962763786315918 }, { "episode": 5568, "epoch": 0.03336089441708308, "loss/policy_avg": 0.030091844499111176, "lr": 9.778246421267894e-06, "objective/entropy": 178.1235809326172, "objective/kl": 37.731300354003906, "objective/non_score_reward": -1.8865652084350586, "objective/rlhf_reward": -5.990001528468683, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 17.381601333618164, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.52734375, "step": 347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001326322555542 }, { "episode": 5584, "epoch": 0.03345675905621263, "loss/policy_avg": 0.40717682242393494, "lr": 9.777607361963191e-06, "objective/entropy": 90.73904418945312, "objective/kl": 31.88462257385254, "objective/non_score_reward": -1.594231128692627, "objective/rlhf_reward": -5.0176747677072715, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 37.96768569946289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5703125, "step": 348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991170167922974 }, { "episode": 5600, "epoch": 0.033552623695342176, "loss/policy_avg": 0.5422201156616211, "lr": 9.776968302658488e-06, "objective/entropy": 80.41102600097656, "objective/kl": 34.64447021484375, "objective/non_score_reward": -1.7322235107421875, "objective/rlhf_reward": -5.478295783610687, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 117.23408508300781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.54296875, "step": 349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983158111572266 }, { "episode": 5616, "epoch": 0.033648488334471725, "loss/policy_avg": 0.3756037950515747, "lr": 9.776329243353783e-06, "objective/entropy": 61.65838623046875, "objective/kl": 44.269325256347656, "objective/non_score_reward": -2.213466167449951, "objective/rlhf_reward": -7.40326676806961, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 19.3502254486084, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988865852355957 }, { "episode": 5632, "epoch": 0.033744352973601274, "loss/policy_avg": 0.9775654673576355, "lr": 9.77569018404908e-06, "objective/entropy": 57.90337371826172, "objective/kl": 41.80830383300781, "objective/non_score_reward": -2.0904150009155273, "objective/rlhf_reward": -6.628326908747354, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 84.0235824584961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.84765625, "step": 351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9960914850234985 }, { "episode": 5648, "epoch": 0.03384021761273082, "loss/policy_avg": -0.20816992223262787, "lr": 9.775051124744377e-06, "objective/entropy": -118.41542053222656, "objective/kl": 23.201061248779297, "objective/non_score_reward": -1.160053014755249, "objective/rlhf_reward": -2.8153834894028416, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.062729835510254, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.4453125, "step": 352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0035219192504883 }, { "episode": 5664, "epoch": 0.03393608225186037, "loss/policy_avg": 0.035901207476854324, "lr": 9.774412065439674e-06, "objective/entropy": 154.33920288085938, "objective/kl": 28.773828506469727, "objective/non_score_reward": -1.4386913776397705, "objective/rlhf_reward": -2.8310468539011207, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.329944610595703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4755859375, "step": 353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0003747940063477 }, { "episode": 5680, "epoch": 0.03403194689098992, "loss/policy_avg": 0.7185342311859131, "lr": 9.77377300613497e-06, "objective/entropy": 45.80010986328125, "objective/kl": 35.51177215576172, "objective/non_score_reward": -1.7755887508392334, "objective/rlhf_reward": -5.586583339961704, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 69.95939636230469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51953125, "step": 354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996915340423584 }, { "episode": 5696, "epoch": 0.03412781153011947, "loss/policy_avg": 0.871320903301239, "lr": 9.773133946830267e-06, "objective/entropy": 136.34942626953125, "objective/kl": 37.25979995727539, "objective/non_score_reward": -1.862990140914917, "objective/rlhf_reward": -5.504549334721501, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 58.879180908203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990177154541016 }, { "episode": 5712, "epoch": 0.03422367616924902, "loss/policy_avg": 0.14556461572647095, "lr": 9.772494887525563e-06, "objective/entropy": -10.28516960144043, "objective/kl": 29.231609344482422, "objective/non_score_reward": -1.461580514907837, "objective/rlhf_reward": -4.021493013176035, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 39.2762451171875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985218048095703 }, { "episode": 5728, "epoch": 0.03431954080837857, "loss/policy_avg": 0.27659082412719727, "lr": 9.77185582822086e-06, "objective/entropy": -36.31108093261719, "objective/kl": 32.386661529541016, "objective/non_score_reward": -1.619333028793335, "objective/rlhf_reward": -6.47733199596405, "objective/scores": 0.0, "policy/approxkl_avg": 10.265704154968262, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71484375, "step": 357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992687702178955 }, { "episode": 5744, "epoch": 0.03441540544750812, "loss/policy_avg": 0.10546956956386566, "lr": 9.771216768916156e-06, "objective/entropy": 79.19872283935547, "objective/kl": 22.353626251220703, "objective/non_score_reward": -1.1176813840866089, "objective/rlhf_reward": -3.0201275154069513, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 20.73809051513672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4873046875, "step": 358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974703788757324 }, { "episode": 5760, "epoch": 0.034511270086637666, "loss/policy_avg": 0.5648351311683655, "lr": 9.770577709611453e-06, "objective/entropy": 38.47356033325195, "objective/kl": 23.87390899658203, "objective/non_score_reward": -1.1936955451965332, "objective/rlhf_reward": -3.4331463485056455, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.14659595489502, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53125, "step": 359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011978149414062 }, { "episode": 5776, "epoch": 0.034607134725767215, "loss/policy_avg": 0.5912380814552307, "lr": 9.76993865030675e-06, "objective/entropy": 116.97152709960938, "objective/kl": 40.231689453125, "objective/non_score_reward": -2.011584758758545, "objective/rlhf_reward": -6.565385702069163, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 117.33955383300781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.666015625, "step": 360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969582557678223 }, { "episode": 5792, "epoch": 0.034702999364896764, "loss/policy_avg": -0.019477106630802155, "lr": 9.769299591002045e-06, "objective/entropy": -144.96791076660156, "objective/kl": 27.773448944091797, "objective/non_score_reward": -1.3886725902557373, "objective/rlhf_reward": -5.554690062999725, "objective/scores": 0.0, "policy/approxkl_avg": 7.48216438293457, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000136137008667 }, { "episode": 5808, "epoch": 0.03479886400402631, "loss/policy_avg": -0.5155759453773499, "lr": 9.768660531697342e-06, "objective/entropy": 78.00074768066406, "objective/kl": 34.501590728759766, "objective/non_score_reward": -1.7250795364379883, "objective/rlhf_reward": -5.521715917674404, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 122.40145874023438, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.513671875, "step": 362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.02427077293396 }, { "episode": 5824, "epoch": 0.03489472864315586, "loss/policy_avg": 0.3520805537700653, "lr": 9.768021472392639e-06, "objective/entropy": -66.29779815673828, "objective/kl": 23.767650604248047, "objective/non_score_reward": -1.188382625579834, "objective/rlhf_reward": -3.3535303235054013, "objective/scores": 0.35, "policy/approxkl_avg": 66.86349487304688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.51953125, "step": 363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973565340042114 }, { "episode": 5840, "epoch": 0.03499059328228541, "loss/policy_avg": 0.25808075070381165, "lr": 9.767382413087936e-06, "objective/entropy": 55.69321060180664, "objective/kl": 32.73713684082031, "objective/non_score_reward": -1.6368569135665894, "objective/rlhf_reward": -4.147427594661712, "objective/scores": 0.6, "policy/approxkl_avg": 17.00968360900879, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998908519744873 }, { "episode": 5856, "epoch": 0.03508645792141496, "loss/policy_avg": -0.33678027987480164, "lr": 9.766743353783233e-06, "objective/entropy": 63.459205627441406, "objective/kl": 36.74503707885742, "objective/non_score_reward": -1.837251901626587, "objective/rlhf_reward": -5.226301344410453, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 63.5507926940918, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7734375, "step": 365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0005531311035156 }, { "episode": 5872, "epoch": 0.03518232256054451, "loss/policy_avg": 0.397920161485672, "lr": 9.76610429447853e-06, "objective/entropy": -11.37314224243164, "objective/kl": 32.99299240112305, "objective/non_score_reward": -1.6496496200561523, "objective/rlhf_reward": -5.174766202171413, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 28.19782257080078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.65625, "step": 366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984650611877441 }, { "episode": 5888, "epoch": 0.03527818719967406, "loss/policy_avg": 0.5101684331893921, "lr": 9.765465235173825e-06, "objective/entropy": 122.12913513183594, "objective/kl": 39.20099639892578, "objective/non_score_reward": -1.9600497484207153, "objective/rlhf_reward": -6.480949008201046, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 10.180255889892578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.45703125, "step": 367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976162910461426 }, { "episode": 5904, "epoch": 0.03537405183880361, "loss/policy_avg": -0.46757811307907104, "lr": 9.764826175869122e-06, "objective/entropy": -108.47764587402344, "objective/kl": 25.862443923950195, "objective/non_score_reward": -1.2931220531463623, "objective/rlhf_reward": -3.6162289073138982, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.3750016689300537, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.703125, "step": 368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0032150745391846 }, { "episode": 5920, "epoch": 0.035469916477933155, "loss/policy_avg": 0.12928390502929688, "lr": 9.764187116564417e-06, "objective/entropy": 47.25078201293945, "objective/kl": 23.20449447631836, "objective/non_score_reward": -1.1602246761322021, "objective/rlhf_reward": -2.240898942947388, "objective/scores": 0.6, "policy/approxkl_avg": 2.1992838382720947, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.544921875, "step": 369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0065484046936035 }, { "episode": 5936, "epoch": 0.035565781117062704, "loss/policy_avg": 0.15939241647720337, "lr": 9.763548057259714e-06, "objective/entropy": -19.609264373779297, "objective/kl": 28.25977325439453, "objective/non_score_reward": -1.4129884243011475, "objective/rlhf_reward": -4.273351618138653, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 59.99807357788086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0036120414733887 }, { "episode": 5952, "epoch": 0.03566164575619225, "loss/policy_avg": 0.1767190843820572, "lr": 9.76290899795501e-06, "objective/entropy": -11.536600112915039, "objective/kl": 36.28870391845703, "objective/non_score_reward": -1.8144354820251465, "objective/rlhf_reward": -7.257741451263428, "objective/scores": 0.0, "policy/approxkl_avg": 11.846475601196289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99981689453125 }, { "episode": 5968, "epoch": 0.0357575103953218, "loss/policy_avg": 0.3314260244369507, "lr": 9.762269938650308e-06, "objective/entropy": -30.279476165771484, "objective/kl": 28.756494522094727, "objective/non_score_reward": -1.4378247261047363, "objective/rlhf_reward": -4.300700943084106, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 39.342529296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998626708984375 }, { "episode": 5984, "epoch": 0.03585337503445135, "loss/policy_avg": 0.18494009971618652, "lr": 9.761630879345604e-06, "objective/entropy": 68.65098571777344, "objective/kl": 36.555747985839844, "objective/non_score_reward": -1.8277872800827026, "objective/rlhf_reward": -5.486320610317301, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.346623420715332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.517578125, "step": 373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000062942504883 }, { "episode": 6000, "epoch": 0.03594923967358091, "loss/policy_avg": -0.05254024267196655, "lr": 9.7609918200409e-06, "objective/entropy": -30.816913604736328, "objective/kl": 26.80430793762207, "objective/non_score_reward": -1.3402154445648193, "objective/rlhf_reward": -3.845090114864048, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.3415722846984863, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991049766540527 }, { "episode": 6016, "epoch": 0.036045104312710456, "loss/policy_avg": 0.48673489689826965, "lr": 9.760352760736196e-06, "objective/entropy": -54.172760009765625, "objective/kl": 26.726612091064453, "objective/non_score_reward": -1.3363306522369385, "objective/rlhf_reward": -0.945322489738464, "objective/scores": 1.1, "policy/approxkl_avg": 36.357444763183594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.708984375, "step": 375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999312400817871 }, { "episode": 6032, "epoch": 0.036140968951840005, "loss/policy_avg": -0.06733483076095581, "lr": 9.759713701431493e-06, "objective/entropy": 135.20721435546875, "objective/kl": 37.13209915161133, "objective/non_score_reward": -1.856605052947998, "objective/rlhf_reward": -4.5027009590875835, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 17.97521209716797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4873046875, "step": 376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002077341079712 }, { "episode": 6048, "epoch": 0.036236833590969554, "loss/policy_avg": -0.041654448956251144, "lr": 9.75907464212679e-06, "objective/entropy": -167.12548828125, "objective/kl": 25.773399353027344, "objective/non_score_reward": -1.2886700630187988, "objective/rlhf_reward": -0.7546801328659054, "objective/scores": 1.1, "policy/approxkl_avg": 0.800922691822052, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.521484375, "step": 377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000338315963745 }, { "episode": 6064, "epoch": 0.0363326982300991, "loss/policy_avg": 0.03024141490459442, "lr": 9.758435582822087e-06, "objective/entropy": -73.82417297363281, "objective/kl": 26.33017349243164, "objective/non_score_reward": -1.3165086507797241, "objective/rlhf_reward": -3.14332831122068, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.01593780517578, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.63671875, "step": 378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002521276473999 }, { "episode": 6080, "epoch": 0.03642856286922865, "loss/policy_avg": 0.285569429397583, "lr": 9.757796523517384e-06, "objective/entropy": -111.42575073242188, "objective/kl": 28.885374069213867, "objective/non_score_reward": -1.4442687034606934, "objective/rlhf_reward": -4.398472824183804, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 79.57511901855469, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.61328125, "step": 379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979077577590942 }, { "episode": 6096, "epoch": 0.0365244275083582, "loss/policy_avg": -0.022392742335796356, "lr": 9.75715746421268e-06, "objective/entropy": -79.86695098876953, "objective/kl": 17.694236755371094, "objective/non_score_reward": -0.8847118616104126, "objective/rlhf_reward": -1.7140187576142063, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.339657306671143, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.767578125, "step": 380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0013508796691895 }, { "episode": 6112, "epoch": 0.03662029214748775, "loss/policy_avg": 0.4459357261657715, "lr": 9.756518404907976e-06, "objective/entropy": -148.62872314453125, "objective/kl": 21.098934173583984, "objective/non_score_reward": -1.054946780204773, "objective/rlhf_reward": 0.1802129983901981, "objective/scores": 1.1, "policy/approxkl_avg": 6.359186172485352, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.576171875, "step": 381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992458820343018 }, { "episode": 6128, "epoch": 0.0367161567866173, "loss/policy_avg": -0.012147974222898483, "lr": 9.755879345603273e-06, "objective/entropy": 152.35232543945312, "objective/kl": 31.486684799194336, "objective/non_score_reward": -1.5743342638015747, "objective/rlhf_reward": -3.3736180409204692, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 35.51153564453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999483585357666 }, { "episode": 6144, "epoch": 0.03681202142574685, "loss/policy_avg": 0.012859173119068146, "lr": 9.75524028629857e-06, "objective/entropy": 26.343887329101562, "objective/kl": 33.34328079223633, "objective/non_score_reward": -1.6671642065048218, "objective/rlhf_reward": -4.721245358662541, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.614994049072266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.82421875, "step": 383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0018911361694336 }, { "episode": 6160, "epoch": 0.0369078860648764, "loss/policy_avg": 0.21653258800506592, "lr": 9.754601226993867e-06, "objective/entropy": 109.49678039550781, "objective/kl": 43.73469543457031, "objective/non_score_reward": -2.186734676361084, "objective/rlhf_reward": -7.296340326876983, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 31.000137329101562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001706123352051 }, { "episode": 6176, "epoch": 0.037003750704005946, "loss/policy_avg": 0.17637991905212402, "lr": 9.753962167689162e-06, "objective/entropy": -57.256038665771484, "objective/kl": 20.548786163330078, "objective/non_score_reward": -1.0274393558502197, "objective/rlhf_reward": -1.9870514891305304, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.945226669311523, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.71484375, "step": 385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000217914581299 }, { "episode": 6192, "epoch": 0.037099615343135495, "loss/policy_avg": 0.23474755883216858, "lr": 9.753323108384459e-06, "objective/entropy": -67.67970275878906, "objective/kl": 29.886417388916016, "objective/non_score_reward": -1.4943209886550903, "objective/rlhf_reward": -4.461511933597263, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 30.2872314453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.517578125, "step": 386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968770742416382 }, { "episode": 6208, "epoch": 0.037195479982265044, "loss/policy_avg": 3.0326309204101562, "lr": 9.752684049079756e-06, "objective/entropy": -30.304298400878906, "objective/kl": 34.21199035644531, "objective/non_score_reward": -1.710599660873413, "objective/rlhf_reward": -5.391800324530944, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 191.66567993164062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998448133468628 }, { "episode": 6224, "epoch": 0.03729134462139459, "loss/policy_avg": 0.020012550055980682, "lr": 9.752044989775053e-06, "objective/entropy": -44.4876594543457, "objective/kl": 30.23657989501953, "objective/non_score_reward": -1.5118290185928345, "objective/rlhf_reward": -4.099904905037816, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 21.57486915588379, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.767578125, "step": 388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002869129180908 }, { "episode": 6240, "epoch": 0.03738720926052414, "loss/policy_avg": 0.33562996983528137, "lr": 9.751405930470348e-06, "objective/entropy": -154.47891235351562, "objective/kl": 18.6168155670166, "objective/non_score_reward": -0.9308407306671143, "objective/rlhf_reward": -3.723362982273102, "objective/scores": 0.0, "policy/approxkl_avg": 13.14146614074707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002055168151855 }, { "episode": 6256, "epoch": 0.03748307389965369, "loss/policy_avg": 0.037651438266038895, "lr": 9.750766871165645e-06, "objective/entropy": -6.050981521606445, "objective/kl": 26.29869270324707, "objective/non_score_reward": -1.3149347305297852, "objective/rlhf_reward": -5.25973904132843, "objective/scores": 0.0, "policy/approxkl_avg": 27.001697540283203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.759765625, "step": 390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982492923736572 }, { "episode": 6272, "epoch": 0.03757893853878324, "loss/policy_avg": 0.1277342140674591, "lr": 9.750127811860941e-06, "objective/entropy": -114.59310913085938, "objective/kl": 33.31782531738281, "objective/non_score_reward": -1.6658912897109985, "objective/rlhf_reward": -3.739846025348875, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.69461441040039, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.755859375, "step": 391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998853206634521 }, { "episode": 6288, "epoch": 0.03767480317791279, "loss/policy_avg": 0.08161749690771103, "lr": 9.749488752556238e-06, "objective/entropy": 28.02770233154297, "objective/kl": 25.580188751220703, "objective/non_score_reward": -1.279009461402893, "objective/rlhf_reward": -3.6654397054627985, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.5637845993042, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3642578125, "step": 392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969701766967773 }, { "episode": 6304, "epoch": 0.03777066781704234, "loss/policy_avg": 0.013617899268865585, "lr": 9.748849693251534e-06, "objective/entropy": 137.66958618164062, "objective/kl": 36.88829040527344, "objective/non_score_reward": -1.8444143533706665, "objective/rlhf_reward": -5.999055602637631, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.8839926719665527, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.703125, "step": 393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998722076416016 }, { "episode": 6320, "epoch": 0.037866532456171886, "loss/policy_avg": 0.7664667963981628, "lr": 9.74821063394683e-06, "objective/entropy": 12.1875, "objective/kl": 27.703767776489258, "objective/non_score_reward": -1.385188341140747, "objective/rlhf_reward": -4.181503379081173, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 29.00311279296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983487129211426 }, { "episode": 6336, "epoch": 0.037962397095301435, "loss/policy_avg": 0.13891640305519104, "lr": 9.747571574642127e-06, "objective/entropy": -52.291236877441406, "objective/kl": 29.62856101989746, "objective/non_score_reward": -1.4814281463623047, "objective/rlhf_reward": -4.10088383701713, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 45.48643112182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.763671875, "step": 395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967341423034668 }, { "episode": 6352, "epoch": 0.038058261734430984, "loss/policy_avg": -0.5259265899658203, "lr": 9.746932515337424e-06, "objective/entropy": -14.848602294921875, "objective/kl": 36.51825714111328, "objective/non_score_reward": -1.8259128332138062, "objective/rlhf_reward": -5.180944981352363, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 133.36766052246094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.673828125, "step": 396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.008728504180908 }, { "episode": 6368, "epoch": 0.03815412637356053, "loss/policy_avg": 0.1340530812740326, "lr": 9.746293456032721e-06, "objective/entropy": -13.48861312866211, "objective/kl": 24.147233963012695, "objective/non_score_reward": -1.2073616981506348, "objective/rlhf_reward": -3.0961134592692057, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.865433216094971, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.787109375, "step": 397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0002119541168213 }, { "episode": 6384, "epoch": 0.03824999101269008, "loss/policy_avg": 0.036313191056251526, "lr": 9.745654396728016e-06, "objective/entropy": -118.45596313476562, "objective/kl": 26.90463638305664, "objective/non_score_reward": -1.3452317714691162, "objective/rlhf_reward": -3.5560982182350864, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.153594017028809, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009984970092773 }, { "episode": 6400, "epoch": 0.03834585565181963, "loss/policy_avg": 0.07543957978487015, "lr": 9.745015337423313e-06, "objective/entropy": 5.307586669921875, "objective/kl": 29.030933380126953, "objective/non_score_reward": -1.4515466690063477, "objective/rlhf_reward": -2.88246778094885, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 14.018705368041992, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.611328125, "step": 399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984686374664307 }, { "episode": 6416, "epoch": 0.03844172029094918, "loss/policy_avg": 0.11864852905273438, "lr": 9.74437627811861e-06, "objective/entropy": 10.484695434570312, "objective/kl": 24.462554931640625, "objective/non_score_reward": -1.2231277227401733, "objective/rlhf_reward": -3.376739227565464, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 14.423017501831055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.685546875, "step": 400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988946914672852 }, { "episode": 6432, "epoch": 0.03853758493007873, "loss/policy_avg": -0.036792345345020294, "lr": 9.743737218813907e-06, "objective/entropy": -181.87400817871094, "objective/kl": 23.07555389404297, "objective/non_score_reward": -1.153777837753296, "objective/rlhf_reward": -3.191279132564632, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 20.132736206054688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00081729888916 }, { "episode": 6448, "epoch": 0.03863344956920828, "loss/policy_avg": 0.22927281260490417, "lr": 9.743098159509204e-06, "objective/entropy": -88.96450805664062, "objective/kl": 32.569129943847656, "objective/non_score_reward": -1.6284565925598145, "objective/rlhf_reward": -4.780492917696634, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 49.499900817871094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982388019561768 }, { "episode": 6464, "epoch": 0.03872931420833783, "loss/policy_avg": 0.30984753370285034, "lr": 9.7424591002045e-06, "objective/entropy": -18.365474700927734, "objective/kl": 31.77776336669922, "objective/non_score_reward": -1.5888882875442505, "objective/rlhf_reward": -5.030040267735643, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 36.973690032958984, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.607421875, "step": 403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970709085464478 }, { "episode": 6480, "epoch": 0.038825178847467376, "loss/policy_avg": 0.06557717174291611, "lr": 9.741820040899796e-06, "objective/entropy": -141.13568115234375, "objective/kl": 28.107177734375, "objective/non_score_reward": -1.405358910560608, "objective/rlhf_reward": -3.674024294094975, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 31.192813873291016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.791015625, "step": 404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9952688217163086 }, { "episode": 6496, "epoch": 0.038921043486596925, "loss/policy_avg": 0.05502002686262131, "lr": 9.741180981595093e-06, "objective/entropy": 32.80726623535156, "objective/kl": 44.297119140625, "objective/non_score_reward": -2.2148561477661133, "objective/rlhf_reward": -7.5001741287454795, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.57358169555664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.650390625, "step": 405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999144077301025 }, { "episode": 6512, "epoch": 0.039016908125726474, "loss/policy_avg": 0.026680059731006622, "lr": 9.74054192229039e-06, "objective/entropy": 119.29817962646484, "objective/kl": 39.39287567138672, "objective/non_score_reward": -1.9696437120437622, "objective/rlhf_reward": -6.536938837080627, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 0.6370775699615479, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6171875, "step": 406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0018253326416016 }, { "episode": 6528, "epoch": 0.03911277276485602, "loss/policy_avg": 0.6271831393241882, "lr": 9.739902862985686e-06, "objective/entropy": 6.752727508544922, "objective/kl": 25.43050765991211, "objective/non_score_reward": -1.2715253829956055, "objective/rlhf_reward": -5.086101770401001, "objective/scores": 0.0, "policy/approxkl_avg": 17.81015396118164, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.74609375, "step": 407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977548122406006 }, { "episode": 6544, "epoch": 0.03920863740398557, "loss/policy_avg": 0.30096232891082764, "lr": 9.739263803680983e-06, "objective/entropy": -24.516462326049805, "objective/kl": 38.53913116455078, "objective/non_score_reward": -1.9269566535949707, "objective/rlhf_reward": -5.585120143667732, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 15.016406059265137, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.603515625, "step": 408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.994694709777832 }, { "episode": 6560, "epoch": 0.03930450204311512, "loss/policy_avg": 0.03762083500623703, "lr": 9.73862474437628e-06, "objective/entropy": -218.5489501953125, "objective/kl": 26.699615478515625, "objective/non_score_reward": -1.3349807262420654, "objective/rlhf_reward": -3.6780635170346363, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 59.4561653137207, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.54296875, "step": 409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982738494873047 }, { "episode": 6576, "epoch": 0.03940036668224467, "loss/policy_avg": 0.2932765483856201, "lr": 9.737985685071575e-06, "objective/entropy": -25.477672576904297, "objective/kl": 35.529788970947266, "objective/non_score_reward": -1.776489496231079, "objective/rlhf_reward": -5.372624413172403, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 39.98287582397461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55859375, "step": 410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999699354171753 }, { "episode": 6592, "epoch": 0.03949623132137422, "loss/policy_avg": -0.2486688196659088, "lr": 9.737346625766872e-06, "objective/entropy": -12.952373504638672, "objective/kl": 33.62919616699219, "objective/non_score_reward": -1.681459903717041, "objective/rlhf_reward": -4.778428207116063, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 28.626731872558594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.56640625, "step": 411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003122568130493 }, { "episode": 6608, "epoch": 0.03959209596050377, "loss/policy_avg": 0.3249208629131317, "lr": 9.736707566462167e-06, "objective/entropy": -52.927459716796875, "objective/kl": 33.82263946533203, "objective/non_score_reward": -1.6911320686340332, "objective/rlhf_reward": -4.364528393745422, "objective/scores": 0.6, "policy/approxkl_avg": 41.674591064453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.791015625, "step": 412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000582695007324 }, { "episode": 6624, "epoch": 0.039687960599633317, "loss/policy_avg": 0.15019002556800842, "lr": 9.736068507157464e-06, "objective/entropy": -22.71458625793457, "objective/kl": 32.99541473388672, "objective/non_score_reward": -1.6497704982757568, "objective/rlhf_reward": -5.257446458845763, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 6.256417274475098, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978971481323242 }, { "episode": 6640, "epoch": 0.039783825238762865, "loss/policy_avg": 0.296099990606308, "lr": 9.735429447852761e-06, "objective/entropy": -10.485054016113281, "objective/kl": 28.53786277770996, "objective/non_score_reward": -1.4268931150436401, "objective/rlhf_reward": -3.9742393652598063, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.458545684814453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996311664581299 }, { "episode": 6656, "epoch": 0.039879689877892414, "loss/policy_avg": 0.3615366816520691, "lr": 9.734790388548058e-06, "objective/entropy": -102.9046859741211, "objective/kl": 19.901390075683594, "objective/non_score_reward": -0.9950695037841797, "objective/rlhf_reward": -2.3184185675984486, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.427024841308594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.82421875, "step": 415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999051809310913 }, { "episode": 6672, "epoch": 0.03997555451702196, "loss/policy_avg": 0.14772659540176392, "lr": 9.734151329243355e-06, "objective/entropy": -148.49395751953125, "objective/kl": 26.190744400024414, "objective/non_score_reward": -1.3095372915267944, "objective/rlhf_reward": -3.6340291834512524, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 59.936073303222656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.806640625, "step": 416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001544237136841 }, { "episode": 6688, "epoch": 0.04007141915615151, "loss/policy_avg": 0.23557257652282715, "lr": 9.73351226993865e-06, "objective/entropy": -145.32284545898438, "objective/kl": 30.992046356201172, "objective/non_score_reward": -1.5496025085449219, "objective/rlhf_reward": -3.7984096765518185, "objective/scores": 0.6, "policy/approxkl_avg": 7.065143585205078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.904296875, "step": 417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989118576049805 }, { "episode": 6704, "epoch": 0.04016728379528106, "loss/policy_avg": 0.12179827690124512, "lr": 9.732873210633947e-06, "objective/entropy": -64.65836334228516, "objective/kl": 35.22796630859375, "objective/non_score_reward": -1.7613983154296875, "objective/rlhf_reward": -5.686343335841579, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 61.170570373535156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5546875, "step": 418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985511302947998 }, { "episode": 6720, "epoch": 0.04026314843441061, "loss/policy_avg": -0.043803490698337555, "lr": 9.732234151329244e-06, "objective/entropy": -87.70707702636719, "objective/kl": 28.95832061767578, "objective/non_score_reward": -1.447916030883789, "objective/rlhf_reward": -4.275892340930637, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.8885676860809326, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.763671875, "step": 419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9999752044677734 }, { "episode": 6736, "epoch": 0.04035901307354016, "loss/policy_avg": 0.18042519688606262, "lr": 9.73159509202454e-06, "objective/entropy": -4.936176300048828, "objective/kl": 30.613567352294922, "objective/non_score_reward": -1.5306785106658936, "objective/rlhf_reward": -4.722713804244995, "objective/scores": 0.35, "policy/approxkl_avg": 209.10888671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993661642074585 }, { "episode": 6752, "epoch": 0.04045487771266971, "loss/policy_avg": 0.6567588448524475, "lr": 9.730956032719838e-06, "objective/entropy": -162.10116577148438, "objective/kl": 33.140079498291016, "objective/non_score_reward": -1.6570039987564087, "objective/rlhf_reward": -4.505309881941352, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 33.703067779541016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578125, "step": 421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9999632835388184 }, { "episode": 6768, "epoch": 0.04055074235179926, "loss/policy_avg": 0.5961964130401611, "lr": 9.730316973415135e-06, "objective/entropy": 18.374740600585938, "objective/kl": 36.82442092895508, "objective/non_score_reward": -1.8412209749221802, "objective/rlhf_reward": -4.441164646984312, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 62.1960334777832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999306201934814 }, { "episode": 6784, "epoch": 0.040646606990928806, "loss/policy_avg": 0.19755011796951294, "lr": 9.72967791411043e-06, "objective/entropy": -57.290000915527344, "objective/kl": 30.764808654785156, "objective/non_score_reward": -1.5382404327392578, "objective/rlhf_reward": -4.811326077490478, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 37.60175323486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.67578125, "step": 423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990873336791992 }, { "episode": 6800, "epoch": 0.040742471630058355, "loss/policy_avg": 0.2760317325592041, "lr": 9.729038854805727e-06, "objective/entropy": -54.2406005859375, "objective/kl": 28.681961059570312, "objective/non_score_reward": -1.4340981245040894, "objective/rlhf_reward": -3.7889812094735458, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 13.514376640319824, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.548828125, "step": 424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004029273986816 }, { "episode": 6816, "epoch": 0.040838336269187904, "loss/policy_avg": 0.05885821580886841, "lr": 9.728399795501023e-06, "objective/entropy": -30.280364990234375, "objective/kl": 31.102825164794922, "objective/non_score_reward": -1.5551413297653198, "objective/rlhf_reward": -4.820565319061279, "objective/scores": 0.35, "policy/approxkl_avg": 61.290470123291016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.767578125, "step": 425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986295700073242 }, { "episode": 6832, "epoch": 0.04093420090831745, "loss/policy_avg": 0.044344570487737656, "lr": 9.72776073619632e-06, "objective/entropy": -223.16510009765625, "objective/kl": 11.546382904052734, "objective/non_score_reward": -0.5773191452026367, "objective/rlhf_reward": -0.3618654114770252, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.5684561729431152, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7109375, "step": 426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0002474784851074 }, { "episode": 6848, "epoch": 0.041030065547447, "loss/policy_avg": 0.11938305199146271, "lr": 9.727121676891617e-06, "objective/entropy": -84.6756362915039, "objective/kl": 32.253173828125, "objective/non_score_reward": -1.6126585006713867, "objective/rlhf_reward": -5.000035624118194, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 54.86524963378906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.693359375, "step": 427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985135793685913 }, { "episode": 6864, "epoch": 0.04112593018657655, "loss/policy_avg": -0.02704887092113495, "lr": 9.726482617586912e-06, "objective/entropy": 61.31664276123047, "objective/kl": 50.535186767578125, "objective/non_score_reward": -2.526759624481201, "objective/rlhf_reward": -8.765402606039672, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 87.70621490478516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4794921875, "step": 428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009117126464844 }, { "episode": 6880, "epoch": 0.0412217948257061, "loss/policy_avg": 0.3563253581523895, "lr": 9.72584355828221e-06, "objective/entropy": -201.59555053710938, "objective/kl": 26.542133331298828, "objective/non_score_reward": -1.3271067142486572, "objective/rlhf_reward": -2.384707783104154, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.606565475463867, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60546875, "step": 429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991586208343506 }, { "episode": 6896, "epoch": 0.04131765946483565, "loss/policy_avg": 0.3849369287490845, "lr": 9.725204498977506e-06, "objective/entropy": -172.11151123046875, "objective/kl": 31.27842140197754, "objective/non_score_reward": -1.5639209747314453, "objective/rlhf_reward": -4.52235098282496, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 35.41864776611328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.794921875, "step": 430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9977457523345947 }, { "episode": 6912, "epoch": 0.0414135241039652, "loss/policy_avg": 0.5410929918289185, "lr": 9.724565439672803e-06, "objective/entropy": -53.43696594238281, "objective/kl": 36.75939178466797, "objective/non_score_reward": -1.8379695415496826, "objective/rlhf_reward": -5.229172053114448, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.017414093017578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.51953125, "step": 431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9961919784545898 }, { "episode": 6928, "epoch": 0.041509388743094754, "loss/policy_avg": 0.5185568332672119, "lr": 9.7239263803681e-06, "objective/entropy": -42.49586486816406, "objective/kl": 31.465147018432617, "objective/non_score_reward": -1.5732574462890625, "objective/rlhf_reward": -4.914427437869412, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.669852614402771, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998645782470703 }, { "episode": 6944, "epoch": 0.0416052533822243, "loss/policy_avg": -0.09886922687292099, "lr": 9.723287321063397e-06, "objective/entropy": -182.28286743164062, "objective/kl": 27.1431884765625, "objective/non_score_reward": -1.3571594953536987, "objective/rlhf_reward": -3.6953046480814615, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 25.096237182617188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0018882751464844 }, { "episode": 6960, "epoch": 0.04170111802135385, "loss/policy_avg": 0.39349502325057983, "lr": 9.722648261758692e-06, "objective/entropy": 28.20358657836914, "objective/kl": 38.92597198486328, "objective/non_score_reward": -1.946298599243164, "objective/rlhf_reward": -6.385194158554077, "objective/scores": 0.35, "policy/approxkl_avg": 46.153385162353516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4912109375, "step": 434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992406368255615 }, { "episode": 6976, "epoch": 0.0417969826604834, "loss/policy_avg": 0.3586619198322296, "lr": 9.722009202453989e-06, "objective/entropy": -126.02680206298828, "objective/kl": 32.40974807739258, "objective/non_score_reward": -1.6204874515533447, "objective/rlhf_reward": -4.534538338856633, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 10.944326400756836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.666015625, "step": 435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971150159835815 }, { "episode": 6992, "epoch": 0.04189284729961295, "loss/policy_avg": -0.4687817692756653, "lr": 9.721370143149284e-06, "objective/entropy": -69.42359924316406, "objective/kl": 20.10685157775879, "objective/non_score_reward": -1.0053426027297974, "objective/rlhf_reward": -2.6427683430291236, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 22.483867645263672, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6328125, "step": 436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.041172504425049 }, { "episode": 7008, "epoch": 0.0419887119387425, "loss/policy_avg": 0.0906272605061531, "lr": 9.720731083844581e-06, "objective/entropy": -149.47274780273438, "objective/kl": 26.28115463256836, "objective/non_score_reward": -1.3140578269958496, "objective/rlhf_reward": -3.1335249564805365, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.7223973274230957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.599609375, "step": 437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000173568725586 }, { "episode": 7024, "epoch": 0.04208457657787205, "loss/policy_avg": 0.3348531126976013, "lr": 9.720092024539878e-06, "objective/entropy": 22.56686782836914, "objective/kl": 36.523582458496094, "objective/non_score_reward": -1.8261791467666626, "objective/rlhf_reward": -5.700596723620015, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 20.443164825439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.515625, "step": 438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979515075683594 }, { "episode": 7040, "epoch": 0.042180441217001596, "loss/policy_avg": 0.04725319519639015, "lr": 9.719452965235175e-06, "objective/entropy": -71.08361053466797, "objective/kl": 20.915573120117188, "objective/non_score_reward": -1.045778751373291, "objective/rlhf_reward": -2.0604087731995917, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.088305473327637, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4462890625, "step": 439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0024590492248535 }, { "episode": 7056, "epoch": 0.042276305856131145, "loss/policy_avg": 0.18381188809871674, "lr": 9.718813905930472e-06, "objective/entropy": 25.569873809814453, "objective/kl": 38.07762145996094, "objective/non_score_reward": -1.9038809537887573, "objective/rlhf_reward": -3.215523815155029, "objective/scores": 1.1, "policy/approxkl_avg": 30.962854385375977, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62890625, "step": 440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0011448860168457 }, { "episode": 7072, "epoch": 0.042372170495260694, "loss/policy_avg": 0.1967303454875946, "lr": 9.718174846625767e-06, "objective/entropy": -103.38803100585938, "objective/kl": 29.222076416015625, "objective/non_score_reward": -1.4611037969589233, "objective/rlhf_reward": -4.240295205179768, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.899417877197266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986486434936523 }, { "episode": 7088, "epoch": 0.04246803513439024, "loss/policy_avg": -0.07635466754436493, "lr": 9.717535787321064e-06, "objective/entropy": -54.58887481689453, "objective/kl": 35.043663024902344, "objective/non_score_reward": -1.752183198928833, "objective/rlhf_reward": -5.527779820378184, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.18149185180664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.580078125, "step": 442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0013041496276855 }, { "episode": 7104, "epoch": 0.04256389977351979, "loss/policy_avg": 0.3104819059371948, "lr": 9.71689672801636e-06, "objective/entropy": -53.842830657958984, "objective/kl": 23.18008804321289, "objective/non_score_reward": -1.1590044498443604, "objective/rlhf_reward": -3.0797587921291143, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 86.82899475097656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.91796875, "step": 443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991846084594727 }, { "episode": 7120, "epoch": 0.04265976441264934, "loss/policy_avg": 0.6317604780197144, "lr": 9.716257668711657e-06, "objective/entropy": -21.19356918334961, "objective/kl": 30.069751739501953, "objective/non_score_reward": -1.503487467765808, "objective/rlhf_reward": -4.6353477025903285, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 128.40951538085938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.76953125, "step": 444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997659683227539 }, { "episode": 7136, "epoch": 0.04275562905177889, "loss/policy_avg": 0.33194229006767273, "lr": 9.715618609406954e-06, "objective/entropy": -102.48907470703125, "objective/kl": 32.374549865722656, "objective/non_score_reward": -1.6187275648117065, "objective/rlhf_reward": -6.474910318851471, "objective/scores": 0.0, "policy/approxkl_avg": 7.681756973266602, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998705506324768 }, { "episode": 7152, "epoch": 0.04285149369090844, "loss/policy_avg": 0.26850253343582153, "lr": 9.714979550102251e-06, "objective/entropy": 69.35136413574219, "objective/kl": 26.097612380981445, "objective/non_score_reward": -1.3048806190490723, "objective/rlhf_reward": -3.738569977696299, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 62.56462097167969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6640625, "step": 446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99786376953125 }, { "episode": 7168, "epoch": 0.04294735833003799, "loss/policy_avg": -0.1885017603635788, "lr": 9.714340490797546e-06, "objective/entropy": -16.98421859741211, "objective/kl": 30.90627670288086, "objective/non_score_reward": -1.5453139543533325, "objective/rlhf_reward": -4.577135715548115, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.766645431518555, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.73828125, "step": 447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003262996673584 }, { "episode": 7184, "epoch": 0.04304322296916754, "loss/policy_avg": 0.24147900938987732, "lr": 9.713701431492843e-06, "objective/entropy": -196.87869262695312, "objective/kl": 23.231670379638672, "objective/non_score_reward": -1.161583662033081, "objective/rlhf_reward": -3.1305624780976142, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 19.03369903564453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996755123138428 }, { "episode": 7200, "epoch": 0.043139087608297086, "loss/policy_avg": 0.3051467537879944, "lr": 9.71306237218814e-06, "objective/entropy": -54.2137565612793, "objective/kl": 33.54918670654297, "objective/non_score_reward": -1.6774592399597168, "objective/rlhf_reward": -5.047977810323822, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 74.37176513671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.783203125, "step": 449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967325925827026 }, { "episode": 7216, "epoch": 0.043234952247426635, "loss/policy_avg": 0.0008301436901092529, "lr": 9.712423312883437e-06, "objective/entropy": -37.864322662353516, "objective/kl": 24.052818298339844, "objective/non_score_reward": -1.2026410102844238, "objective/rlhf_reward": -2.9857349946823826, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.6498993635177612, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.525390625, "step": 450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001569986343384 }, { "episode": 7232, "epoch": 0.043330816886556184, "loss/policy_avg": 0.10217726975679398, "lr": 9.711784253578734e-06, "objective/entropy": -97.12496948242188, "objective/kl": 20.143707275390625, "objective/non_score_reward": -1.007185459136963, "objective/rlhf_reward": -2.669491672252102, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 37.34214401245117, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.783203125, "step": 451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993760585784912 }, { "episode": 7248, "epoch": 0.04342668152568573, "loss/policy_avg": 0.2181258350610733, "lr": 9.711145194274029e-06, "objective/entropy": -187.07266235351562, "objective/kl": 22.520824432373047, "objective/non_score_reward": -1.1260414123535156, "objective/rlhf_reward": -2.9000454283395585, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 80.40426635742188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000328540802002 }, { "episode": 7264, "epoch": 0.04352254616481528, "loss/policy_avg": 0.28700706362724304, "lr": 9.710506134969326e-06, "objective/entropy": -119.91871643066406, "objective/kl": 30.88311004638672, "objective/non_score_reward": -1.5441553592681885, "objective/rlhf_reward": -4.834986022024779, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 14.897968292236328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.787109375, "step": 453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9972997903823853 }, { "episode": 7280, "epoch": 0.04361841080394483, "loss/policy_avg": 0.013649387285113335, "lr": 9.709867075664623e-06, "objective/entropy": -137.84861755371094, "objective/kl": 35.624549865722656, "objective/non_score_reward": -1.781227707862854, "objective/rlhf_reward": -7.124910950660706, "objective/scores": 0.0, "policy/approxkl_avg": 77.14759826660156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999969720840454 }, { "episode": 7296, "epoch": 0.04371427544307438, "loss/policy_avg": 0.9055305123329163, "lr": 9.70922801635992e-06, "objective/entropy": -177.1896514892578, "objective/kl": 34.19129943847656, "objective/non_score_reward": -1.7095649242401123, "objective/rlhf_reward": -5.387661199183807, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 51.92662811279297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66796875, "step": 455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976481199264526 }, { "episode": 7312, "epoch": 0.04381014008220393, "loss/policy_avg": -0.14486947655677795, "lr": 9.708588957055215e-06, "objective/entropy": -91.43609619140625, "objective/kl": 30.12580108642578, "objective/non_score_reward": -1.5062901973724365, "objective/rlhf_reward": -4.509388887675938, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 24.85628890991211, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.611328125, "step": 456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.011446952819824 }, { "episode": 7328, "epoch": 0.04390600472133348, "loss/policy_avg": 0.3115137815475464, "lr": 9.707949897750512e-06, "objective/entropy": -33.496673583984375, "objective/kl": 24.4674072265625, "objective/non_score_reward": -1.2233703136444092, "objective/rlhf_reward": -3.377709650787052, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 9.057685852050781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.751953125, "step": 457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009520053863525 }, { "episode": 7344, "epoch": 0.044001869360463026, "loss/policy_avg": 1.4892723560333252, "lr": 9.707310838445809e-06, "objective/entropy": -35.618934631347656, "objective/kl": 27.64456558227539, "objective/non_score_reward": -1.3822282552719116, "objective/rlhf_reward": -3.5815017921494796, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.899414300918579, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999825954437256 }, { "episode": 7360, "epoch": 0.044097733999592575, "loss/policy_avg": 0.022264737635850906, "lr": 9.706671779141105e-06, "objective/entropy": 31.060089111328125, "objective/kl": 34.85979461669922, "objective/non_score_reward": -1.7429897785186768, "objective/rlhf_reward": -5.367839369837361, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 7.1077799797058105, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.751953125, "step": 459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993565082550049 }, { "episode": 7376, "epoch": 0.044193598638722124, "loss/policy_avg": 0.08219340443611145, "lr": 9.7060327198364e-06, "objective/entropy": -69.6414566040039, "objective/kl": 35.42669677734375, "objective/non_score_reward": -1.7713346481323242, "objective/rlhf_reward": -5.726088785861416, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 21.27887535095215, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.712890625, "step": 460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0088043212890625 }, { "episode": 7392, "epoch": 0.04428946327785167, "loss/policy_avg": 0.03685396909713745, "lr": 9.705393660531698e-06, "objective/entropy": -245.04380798339844, "objective/kl": 21.42380142211914, "objective/non_score_reward": -1.0711899995803833, "objective/rlhf_reward": -2.1620538852372504, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.849046230316162, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.008730411529541 }, { "episode": 7408, "epoch": 0.04438532791698122, "loss/policy_avg": 0.5492111444473267, "lr": 9.704754601226994e-06, "objective/entropy": 9.25466537475586, "objective/kl": 20.997167587280273, "objective/non_score_reward": -1.0498583316802979, "objective/rlhf_reward": -1.2757146700632302, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 36.03380584716797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9296875, "step": 462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000826120376587 }, { "episode": 7424, "epoch": 0.04448119255611077, "loss/policy_avg": 0.22961178421974182, "lr": 9.704115541922291e-06, "objective/entropy": -2.9236984252929688, "objective/kl": 26.89717674255371, "objective/non_score_reward": -1.3448588848114014, "objective/rlhf_reward": -3.717576061905013, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 133.2696075439453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8984375, "step": 463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999076008796692 }, { "episode": 7440, "epoch": 0.04457705719524032, "loss/policy_avg": 0.1330358386039734, "lr": 9.703476482617588e-06, "objective/entropy": -155.3049774169922, "objective/kl": 32.32700729370117, "objective/non_score_reward": -1.6163502931594849, "objective/rlhf_reward": -2.0654012918472286, "objective/scores": 1.1, "policy/approxkl_avg": 352.436767578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.86328125, "step": 464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973843097686768 }, { "episode": 7456, "epoch": 0.04467292183436987, "loss/policy_avg": 0.13191767036914825, "lr": 9.702837423312883e-06, "objective/entropy": -130.06350708007812, "objective/kl": 31.98480987548828, "objective/non_score_reward": -1.5992405414581299, "objective/rlhf_reward": -5.07144889596097, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.149503707885742, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979965686798096 }, { "episode": 7472, "epoch": 0.04476878647349942, "loss/policy_avg": 0.11230316013097763, "lr": 9.70219836400818e-06, "objective/entropy": 11.579151153564453, "objective/kl": 34.1675910949707, "objective/non_score_reward": -1.7083796262741089, "objective/rlhf_reward": -5.3525657681778664, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 13.865779876708984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00162410736084 }, { "episode": 7488, "epoch": 0.04486465111262897, "loss/policy_avg": 0.2810555398464203, "lr": 9.701559304703477e-06, "objective/entropy": -138.13914489746094, "objective/kl": 22.91815948486328, "objective/non_score_reward": -1.145907998085022, "objective/rlhf_reward": -3.205029585448605, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 97.98136901855469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.775390625, "step": 467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984302520751953 }, { "episode": 7504, "epoch": 0.044960515751758516, "loss/policy_avg": -0.09679757058620453, "lr": 9.700920245398774e-06, "objective/entropy": -44.23152160644531, "objective/kl": 34.52162170410156, "objective/non_score_reward": -1.726081132888794, "objective/rlhf_reward": -5.170991019407907, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 12.573694229125977, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.3896484375, "step": 468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995930194854736 }, { "episode": 7520, "epoch": 0.045056380390888065, "loss/policy_avg": 0.2740531265735626, "lr": 9.700281186094071e-06, "objective/entropy": -64.87997436523438, "objective/kl": 30.31191062927246, "objective/non_score_reward": -1.5155954360961914, "objective/rlhf_reward": -4.329048738876978, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 12.677139282226562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.630859375, "step": 469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981954097747803 }, { "episode": 7536, "epoch": 0.045152245030017614, "loss/policy_avg": 0.4849107265472412, "lr": 9.699642126789368e-06, "objective/entropy": -136.48355102539062, "objective/kl": 20.618619918823242, "objective/non_score_reward": -1.030930995941162, "objective/rlhf_reward": -2.6998918845253863, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 95.56924438476562, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.607421875, "step": 470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9975948333740234 }, { "episode": 7552, "epoch": 0.04524810966914716, "loss/policy_avg": 0.05032477527856827, "lr": 9.699003067484663e-06, "objective/entropy": -116.99330139160156, "objective/kl": 31.927814483642578, "objective/non_score_reward": -1.596390724182129, "objective/rlhf_reward": -5.026312672828121, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.1943883895874023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.744140625, "step": 471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004005432128906 }, { "episode": 7568, "epoch": 0.04534397430827671, "loss/policy_avg": 0.23768550157546997, "lr": 9.69836400817996e-06, "objective/entropy": -56.441200256347656, "objective/kl": 35.956565856933594, "objective/non_score_reward": -1.7978280782699585, "objective/rlhf_reward": -5.587192330423909, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 18.25104522705078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.75390625, "step": 472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001668930053711 }, { "episode": 7584, "epoch": 0.04543983894740626, "loss/policy_avg": 0.18428431451320648, "lr": 9.697724948875257e-06, "objective/entropy": -12.911811828613281, "objective/kl": 31.440038681030273, "objective/non_score_reward": -1.5720020532608032, "objective/rlhf_reward": -4.554674939314523, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 33.68145751953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.673828125, "step": 473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997875690460205 }, { "episode": 7600, "epoch": 0.04553570358653581, "loss/policy_avg": 1.0267724990844727, "lr": 9.697085889570554e-06, "objective/entropy": -155.81759643554688, "objective/kl": 15.551814079284668, "objective/non_score_reward": -0.7775906920433044, "objective/rlhf_reward": -1.7317606593049586, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.7084851264953613, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.673828125, "step": 474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998857975006104 }, { "episode": 7616, "epoch": 0.04563156822566536, "loss/policy_avg": 0.5301028490066528, "lr": 9.69644683026585e-06, "objective/entropy": -186.65789794921875, "objective/kl": 37.16144561767578, "objective/non_score_reward": -1.858072280883789, "objective/rlhf_reward": -5.876029699054316, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 48.150047302246094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9972370862960815 }, { "episode": 7632, "epoch": 0.04572743286479491, "loss/policy_avg": 0.2144310474395752, "lr": 9.695807770961146e-06, "objective/entropy": -153.16233825683594, "objective/kl": 31.742645263671875, "objective/non_score_reward": -1.5871323347091675, "objective/rlhf_reward": -4.832757556232151, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 43.260581970214844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996619701385498 }, { "episode": 7648, "epoch": 0.04582329750392446, "loss/policy_avg": 0.1423683762550354, "lr": 9.695168711656443e-06, "objective/entropy": -101.34695434570312, "objective/kl": 34.40277099609375, "objective/non_score_reward": -1.7201385498046875, "objective/rlhf_reward": -5.555040988951845, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.133903503417969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.740234375, "step": 477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991512298583984 }, { "episode": 7664, "epoch": 0.045919162143054006, "loss/policy_avg": -0.20567180216312408, "lr": 9.694529652351738e-06, "objective/entropy": 1.8477153778076172, "objective/kl": 34.25542068481445, "objective/non_score_reward": -1.7127711772918701, "objective/rlhf_reward": -5.451084411144256, "objective/scores": 0.35, "policy/approxkl_avg": 90.96925354003906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.79296875, "step": 478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978113174438477 }, { "episode": 7680, "epoch": 0.046015026782183555, "loss/policy_avg": 0.04285082221031189, "lr": 9.693890593047035e-06, "objective/entropy": -163.51800537109375, "objective/kl": 39.76237487792969, "objective/non_score_reward": -1.9881186485290527, "objective/rlhf_reward": -6.47152245324409, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 41.795677185058594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989352226257324 }, { "episode": 7696, "epoch": 0.046110891421313104, "loss/policy_avg": 0.30679094791412354, "lr": 9.693251533742331e-06, "objective/entropy": -137.21139526367188, "objective/kl": 24.817203521728516, "objective/non_score_reward": -1.2408602237701416, "objective/rlhf_reward": -3.407181172576502, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 7.010622024536133, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.740234375, "step": 480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998010516166687 }, { "episode": 7712, "epoch": 0.04620675606044265, "loss/policy_avg": 0.14935311675071716, "lr": 9.692612474437628e-06, "objective/entropy": -133.61581420898438, "objective/kl": 28.18117904663086, "objective/non_score_reward": -1.4090590476989746, "objective/rlhf_reward": -4.276986324523373, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 41.72409439086914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.720703125, "step": 481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976277351379395 }, { "episode": 7728, "epoch": 0.0463026206995722, "loss/policy_avg": 0.4503282606601715, "lr": 9.691973415132925e-06, "objective/entropy": -185.92971801757812, "objective/kl": 24.44643783569336, "objective/non_score_reward": -1.22232186794281, "objective/rlhf_reward": -4.88928747177124, "objective/scores": 0.0, "policy/approxkl_avg": 26.91709327697754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55078125, "step": 482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986282587051392 }, { "episode": 7744, "epoch": 0.04639848533870175, "loss/policy_avg": 0.7586182355880737, "lr": 9.691334355828222e-06, "objective/entropy": -136.83555603027344, "objective/kl": 27.66883087158203, "objective/non_score_reward": -1.38344144821167, "objective/rlhf_reward": -3.41105959035543, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 39.446250915527344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4287109375, "step": 483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9947845935821533 }, { "episode": 7760, "epoch": 0.0464943499778313, "loss/policy_avg": 0.47291696071624756, "lr": 9.690695296523517e-06, "objective/entropy": 10.135929107666016, "objective/kl": 31.171567916870117, "objective/non_score_reward": -1.558578372001648, "objective/rlhf_reward": -4.572453921259033, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 15.718633651733398, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.611328125, "step": 484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997343897819519 }, { "episode": 7776, "epoch": 0.04659021461696085, "loss/policy_avg": 0.19839856028556824, "lr": 9.690056237218814e-06, "objective/entropy": -64.7506332397461, "objective/kl": 25.45448112487793, "objective/non_score_reward": -1.2727241516113281, "objective/rlhf_reward": -2.690896427631378, "objective/scores": 0.6, "policy/approxkl_avg": 29.054779052734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.76953125, "step": 485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977914094924927 }, { "episode": 7792, "epoch": 0.0466860792560904, "loss/policy_avg": 0.16692940890789032, "lr": 9.689417177914111e-06, "objective/entropy": -200.1573028564453, "objective/kl": 16.24359893798828, "objective/non_score_reward": -0.8121800422668457, "objective/rlhf_reward": -1.6446000672021683, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.7478506565093994, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.58984375, "step": 486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997503757476807 }, { "episode": 7808, "epoch": 0.046781943895219946, "loss/policy_avg": 0.20832450687885284, "lr": 9.688778118609408e-06, "objective/entropy": -229.8734893798828, "objective/kl": 24.610809326171875, "objective/non_score_reward": -1.2305405139923096, "objective/rlhf_reward": -3.3180417156854443, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 50.22547912597656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9977538585662842 }, { "episode": 7824, "epoch": 0.046877808534349495, "loss/policy_avg": 0.584824800491333, "lr": 9.688139059304705e-06, "objective/entropy": -159.94088745117188, "objective/kl": 32.78782653808594, "objective/non_score_reward": -1.6393911838531494, "objective/rlhf_reward": -5.041793072017368, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 53.52165985107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4560546875, "step": 488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974932670593262 }, { "episode": 7840, "epoch": 0.046973673173479044, "loss/policy_avg": 0.10657641291618347, "lr": 9.6875e-06, "objective/entropy": -117.46031188964844, "objective/kl": 22.680068969726562, "objective/non_score_reward": -1.1340034008026123, "objective/rlhf_reward": -2.802680269877116, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 31.437467575073242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984140396118164 }, { "episode": 7856, "epoch": 0.0470695378126086, "loss/policy_avg": 0.05225694179534912, "lr": 9.686860940695297e-06, "objective/entropy": -102.69722747802734, "objective/kl": 35.890769958496094, "objective/non_score_reward": -1.7945387363433838, "objective/rlhf_reward": -2.7781547069549557, "objective/scores": 1.1, "policy/approxkl_avg": 8.238727569580078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.693359375, "step": 490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996579885482788 }, { "episode": 7872, "epoch": 0.04716540245173815, "loss/policy_avg": 0.3118276000022888, "lr": 9.686221881390594e-06, "objective/entropy": -42.73939895629883, "objective/kl": 22.486095428466797, "objective/non_score_reward": -1.1243047714233398, "objective/rlhf_reward": -3.0733869268494525, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 29.32803726196289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.609375, "step": 491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991399049758911 }, { "episode": 7888, "epoch": 0.0472612670908677, "loss/policy_avg": 0.621738076210022, "lr": 9.68558282208589e-06, "objective/entropy": -26.77874755859375, "objective/kl": 33.77405548095703, "objective/non_score_reward": -1.688702940940857, "objective/rlhf_reward": -5.198552160468653, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.273128509521484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73046875, "step": 492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988332986831665 }, { "episode": 7904, "epoch": 0.04735713172999725, "loss/policy_avg": 0.16049662232398987, "lr": 9.684943762781188e-06, "objective/entropy": -84.04755401611328, "objective/kl": 25.384605407714844, "objective/non_score_reward": -1.2692303657531738, "objective/rlhf_reward": -2.1532023891222205, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.7223958373069763, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.646484375, "step": 493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005576610565186 }, { "episode": 7920, "epoch": 0.047452996369126796, "loss/policy_avg": 0.3413264751434326, "lr": 9.684304703476484e-06, "objective/entropy": -118.85188293457031, "objective/kl": 30.77880859375, "objective/non_score_reward": -1.5389404296875, "objective/rlhf_reward": -4.422428623835246, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 19.30898666381836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997934103012085 }, { "episode": 7936, "epoch": 0.047548861008256345, "loss/policy_avg": -0.016445789486169815, "lr": 9.68366564417178e-06, "objective/entropy": -211.39361572265625, "objective/kl": 26.587682723999023, "objective/non_score_reward": -1.3293840885162354, "objective/rlhf_reward": -2.917536354064941, "objective/scores": 0.6, "policy/approxkl_avg": 50.449562072753906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5234375, "step": 495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99893319606781 }, { "episode": 7952, "epoch": 0.047644725647385894, "loss/policy_avg": -0.2565712034702301, "lr": 9.683026584867076e-06, "objective/entropy": -49.41560363769531, "objective/kl": 27.722068786621094, "objective/non_score_reward": -1.3861035108566284, "objective/rlhf_reward": -3.882554417074309, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 16.277629852294922, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.703125, "step": 496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003061294555664 }, { "episode": 7968, "epoch": 0.04774059028651544, "loss/policy_avg": 0.17001637816429138, "lr": 9.682387525562373e-06, "objective/entropy": -40.254676818847656, "objective/kl": 25.527742385864258, "objective/non_score_reward": -1.2763869762420654, "objective/rlhf_reward": -5.10554826259613, "objective/scores": 0.0, "policy/approxkl_avg": 19.284744262695312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9972081184387207 }, { "episode": 7984, "epoch": 0.04783645492564499, "loss/policy_avg": 0.08028728514909744, "lr": 9.68174846625767e-06, "objective/entropy": -23.79485321044922, "objective/kl": 23.14282989501953, "objective/non_score_reward": -1.1571415662765503, "objective/rlhf_reward": -4.628566324710846, "objective/scores": 0.0, "policy/approxkl_avg": 25.781452178955078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4580078125, "step": 498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980249404907227 }, { "episode": 8000, "epoch": 0.04793231956477454, "loss/policy_avg": 0.2174569070339203, "lr": 9.681109406952967e-06, "objective/entropy": -109.13389587402344, "objective/kl": 36.64985656738281, "objective/non_score_reward": -1.8324928283691406, "objective/rlhf_reward": -5.951369323817593, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 27.508981704711914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.525390625, "step": 499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99745512008667 }, { "episode": 8016, "epoch": 0.04802818420390409, "loss/policy_avg": 0.13631635904312134, "lr": 9.680470347648262e-06, "objective/entropy": -99.519775390625, "objective/kl": 41.364810943603516, "objective/non_score_reward": -2.0682406425476074, "objective/rlhf_reward": -6.448134417804788, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 102.98858642578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4970703125, "step": 500, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998145341873169 }, { "episode": 8032, "epoch": 0.04812404884303364, "loss/policy_avg": 0.059351589530706406, "lr": 9.67983128834356e-06, "objective/entropy": -226.86756896972656, "objective/kl": 27.588150024414062, "objective/non_score_reward": -1.379407525062561, "objective/rlhf_reward": -4.001858436855015, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.536296844482422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 501, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976544380187988 }, { "episode": 8048, "epoch": 0.04821991348216319, "loss/policy_avg": 0.5408469438552856, "lr": 9.679192229038854e-06, "objective/entropy": 4.518913269042969, "objective/kl": 37.552825927734375, "objective/non_score_reward": -1.8776414394378662, "objective/rlhf_reward": -5.777232364813486, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 8.410907745361328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.787109375, "step": 502, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991774559020996 }, { "episode": 8064, "epoch": 0.048315778121292736, "loss/policy_avg": 1.089150071144104, "lr": 9.678553169734151e-06, "objective/entropy": -70.22102355957031, "objective/kl": 36.886138916015625, "objective/non_score_reward": -1.8443071842193604, "objective/rlhf_reward": -5.254522027746711, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.696430206298828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.662109375, "step": 503, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9944283962249756 }, { "episode": 8080, "epoch": 0.048411642760422285, "loss/policy_avg": 0.04815336689352989, "lr": 9.677914110429448e-06, "objective/entropy": -206.61251831054688, "objective/kl": 19.784542083740234, "objective/non_score_reward": -0.9892270565032959, "objective/rlhf_reward": -2.4411365626179538, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 10.987642288208008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 504, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982428550720215 }, { "episode": 8096, "epoch": 0.048507507399551834, "loss/policy_avg": 0.4511667788028717, "lr": 9.677275051124745e-06, "objective/entropy": -44.11040496826172, "objective/kl": 32.054603576660156, "objective/non_score_reward": -1.6027300357818604, "objective/rlhf_reward": -4.8951483605229225, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 161.647705078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 505, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990203380584717 }, { "episode": 8112, "epoch": 0.04860337203868138, "loss/policy_avg": 0.43728113174438477, "lr": 9.676635991820042e-06, "objective/entropy": -167.46401977539062, "objective/kl": 25.358474731445312, "objective/non_score_reward": -1.2679238319396973, "objective/rlhf_reward": -5.071695148944855, "objective/scores": 0.0, "policy/approxkl_avg": 6.505180358886719, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.67578125, "step": 506, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999653697013855 }, { "episode": 8128, "epoch": 0.04869923667781093, "loss/policy_avg": 0.049704909324645996, "lr": 9.675996932515339e-06, "objective/entropy": -68.84889221191406, "objective/kl": 23.506563186645508, "objective/non_score_reward": -1.1753281354904175, "objective/rlhf_reward": -3.3227105523027003, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.8750853538513184, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.783203125, "step": 507, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99936842918396 }, { "episode": 8144, "epoch": 0.04879510131694048, "loss/policy_avg": 0.23126532137393951, "lr": 9.675357873210634e-06, "objective/entropy": -193.32493591308594, "objective/kl": 30.975135803222656, "objective/non_score_reward": -1.5487568378448486, "objective/rlhf_reward": -4.072320940271888, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 30.721832275390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.599609375, "step": 508, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.994474172592163 }, { "episode": 8160, "epoch": 0.04889096595607003, "loss/policy_avg": 0.6136177778244019, "lr": 9.67471881390593e-06, "objective/entropy": 35.12611770629883, "objective/kl": 24.636138916015625, "objective/non_score_reward": -1.2318068742752075, "objective/rlhf_reward": -2.979816268162663, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 31.945526123046875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736328125, "step": 509, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001587390899658 }, { "episode": 8176, "epoch": 0.04898683059519958, "loss/policy_avg": 0.07654842734336853, "lr": 9.674079754601228e-06, "objective/entropy": -218.7822265625, "objective/kl": 30.072967529296875, "objective/non_score_reward": -1.5036484003067017, "objective/rlhf_reward": -3.8918873689332347, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 42.21351623535156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.763671875, "step": 510, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9961347579956055 }, { "episode": 8192, "epoch": 0.04908269523432913, "loss/policy_avg": 0.4642539322376251, "lr": 9.673440695296525e-06, "objective/entropy": -61.26002502441406, "objective/kl": 28.09502410888672, "objective/non_score_reward": -1.4047513008117676, "objective/rlhf_reward": -4.168407420726165, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 28.139495849609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.50390625, "step": 511, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988558292388916 }, { "episode": 8208, "epoch": 0.04917855987345868, "loss/policy_avg": -0.1496490240097046, "lr": 9.672801635991821e-06, "objective/entropy": -237.9604034423828, "objective/kl": 24.80710220336914, "objective/non_score_reward": -1.2403552532196045, "objective/rlhf_reward": -3.5828184867776454, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.494747161865234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.671875, "step": 512, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000725507736206 }, { "episode": 8224, "epoch": 0.049274424512588226, "loss/policy_avg": -0.18209466338157654, "lr": 9.672162576687117e-06, "objective/entropy": -180.66116333007812, "objective/kl": 25.97962188720703, "objective/non_score_reward": -1.2989810705184937, "objective/rlhf_reward": -3.073217930571113, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 41.079193115234375, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.73046875, "step": 513, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997527837753296 }, { "episode": 8240, "epoch": 0.049370289151717775, "loss/policy_avg": 0.3504701852798462, "lr": 9.671523517382413e-06, "objective/entropy": -98.80787658691406, "objective/kl": 26.576587677001953, "objective/non_score_reward": -1.3288295269012451, "objective/rlhf_reward": -0.9153180480003353, "objective/scores": 1.1, "policy/approxkl_avg": 13.758487701416016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 514, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998854160308838 }, { "episode": 8256, "epoch": 0.049466153790847324, "loss/policy_avg": 0.48611417412757874, "lr": 9.67088445807771e-06, "objective/entropy": -128.45774841308594, "objective/kl": 29.784334182739258, "objective/non_score_reward": -1.4892168045043945, "objective/rlhf_reward": -4.223533527056375, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.2566263675689697, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 515, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001587867736816 }, { "episode": 8272, "epoch": 0.04956201842997687, "loss/policy_avg": -0.13057222962379456, "lr": 9.670245398773007e-06, "objective/entropy": -146.07781982421875, "objective/kl": 31.182106018066406, "objective/non_score_reward": -1.5591052770614624, "objective/rlhf_reward": -3.8364211082458493, "objective/scores": 0.6, "policy/approxkl_avg": 15.76829719543457, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.576171875, "step": 516, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0000805854797363 }, { "episode": 8288, "epoch": 0.04965788306910642, "loss/policy_avg": 0.637583315372467, "lr": 9.669606339468304e-06, "objective/entropy": -144.37762451171875, "objective/kl": 27.648868560791016, "objective/non_score_reward": -1.3824436664581299, "objective/rlhf_reward": -4.0140026448094215, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.933715343475342, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 517, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995014667510986 }, { "episode": 8304, "epoch": 0.04975374770823597, "loss/policy_avg": 0.23517751693725586, "lr": 9.668967280163601e-06, "objective/entropy": -130.0078125, "objective/kl": 26.889904022216797, "objective/non_score_reward": -1.344495415687561, "objective/rlhf_reward": -3.927383343787536, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 35.43697738647461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.81640625, "step": 518, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9984192848205566 }, { "episode": 8320, "epoch": 0.04984961234736552, "loss/policy_avg": -0.05650443956255913, "lr": 9.668328220858896e-06, "objective/entropy": -214.1605682373047, "objective/kl": 21.148624420166016, "objective/non_score_reward": -1.0574312210083008, "objective/rlhf_reward": -2.673465876784876, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 18.935588836669922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.599609375, "step": 519, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993261098861694 }, { "episode": 8336, "epoch": 0.04994547698649507, "loss/policy_avg": -0.034447960555553436, "lr": 9.667689161554193e-06, "objective/entropy": -158.14088439941406, "objective/kl": 32.29146957397461, "objective/non_score_reward": -1.61457359790802, "objective/rlhf_reward": -4.902035086360529, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.876145362854004, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58984375, "step": 520, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993882179260254 }, { "episode": 8352, "epoch": 0.05004134162562462, "loss/policy_avg": -0.13744737207889557, "lr": 9.66705010224949e-06, "objective/entropy": -204.13546752929688, "objective/kl": 28.699504852294922, "objective/non_score_reward": -1.4349753856658936, "objective/rlhf_reward": -4.361299076167446, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.3828086853027344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.66796875, "step": 521, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017244815826416 }, { "episode": 8368, "epoch": 0.05013720626475417, "loss/policy_avg": 0.13512714207172394, "lr": 9.666411042944787e-06, "objective/entropy": -234.03375244140625, "objective/kl": 27.24090576171875, "objective/non_score_reward": -1.3620452880859375, "objective/rlhf_reward": -3.932409131320652, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 27.1795654296875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.689453125, "step": 522, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999112844467163 }, { "episode": 8384, "epoch": 0.050233070903883716, "loss/policy_avg": -0.011349002830684185, "lr": 9.665771983640082e-06, "objective/entropy": -252.35935974121094, "objective/kl": 35.68749237060547, "objective/non_score_reward": -1.784374713897705, "objective/rlhf_reward": -5.7588963890946925, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 13.969385147094727, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 523, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9982863664627075 }, { "episode": 8400, "epoch": 0.050328935543013265, "loss/policy_avg": 0.03610409051179886, "lr": 9.665132924335379e-06, "objective/entropy": -18.527732849121094, "objective/kl": 31.889944076538086, "objective/non_score_reward": -1.5944972038269043, "objective/rlhf_reward": -4.927390317530975, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 69.35887145996094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.86328125, "step": 524, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999839425086975 }, { "episode": 8416, "epoch": 0.050424800182142814, "loss/policy_avg": 0.4427942633628845, "lr": 9.664493865030676e-06, "objective/entropy": -203.7809295654297, "objective/kl": 25.36702537536621, "objective/non_score_reward": -1.2683511972427368, "objective/rlhf_reward": -3.6495729281502642, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 22.38974380493164, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.828125, "step": 525, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989018440246582 }, { "episode": 8432, "epoch": 0.05052066482127236, "loss/policy_avg": 1.6773953437805176, "lr": 9.663854805725971e-06, "objective/entropy": -146.93841552734375, "objective/kl": 37.069419860839844, "objective/non_score_reward": -1.853471040725708, "objective/rlhf_reward": -5.990052063663569, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.231493949890137, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.53515625, "step": 526, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981741905212402 }, { "episode": 8448, "epoch": 0.05061652946040191, "loss/policy_avg": -0.08897572010755539, "lr": 9.663215746421268e-06, "objective/entropy": -158.65708923339844, "objective/kl": 23.60004997253418, "objective/non_score_reward": -1.1800025701522827, "objective/rlhf_reward": -3.394497547179384, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.824882507324219, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6953125, "step": 527, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995331764221191 }, { "episode": 8464, "epoch": 0.05071239409953146, "loss/policy_avg": 0.024341005831956863, "lr": 9.662576687116565e-06, "objective/entropy": -174.72035217285156, "objective/kl": 29.104461669921875, "objective/non_score_reward": -1.4552230834960938, "objective/rlhf_reward": -4.479256918936401, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 17.054231643676758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.470703125, "step": 528, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999741554260254 }, { "episode": 8480, "epoch": 0.05080825873866101, "loss/policy_avg": 0.257159948348999, "lr": 9.661937627811862e-06, "objective/entropy": -200.30184936523438, "objective/kl": 23.69171905517578, "objective/non_score_reward": -1.1845859289169312, "objective/rlhf_reward": -3.338343775272369, "objective/scores": 0.35, "policy/approxkl_avg": 6.550008773803711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.595703125, "step": 529, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9984736442565918 }, { "episode": 8496, "epoch": 0.05090412337779056, "loss/policy_avg": 0.4184650182723999, "lr": 9.661298568507158e-06, "objective/entropy": -344.7420959472656, "objective/kl": 24.219188690185547, "objective/non_score_reward": -1.2109594345092773, "objective/rlhf_reward": -3.4652354205525935, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 67.58980560302734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 530, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985125064849854 }, { "episode": 8512, "epoch": 0.05099998801692011, "loss/policy_avg": -0.0187949538230896, "lr": 9.660659509202455e-06, "objective/entropy": -14.01883316040039, "objective/kl": 29.49643325805664, "objective/non_score_reward": -1.47482168674469, "objective/rlhf_reward": -4.520684697715145, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 6.090343475341797, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.59765625, "step": 531, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014190673828125 }, { "episode": 8528, "epoch": 0.051095852656049656, "loss/policy_avg": 0.5480527877807617, "lr": 9.66002044989775e-06, "objective/entropy": -169.82949829101562, "objective/kl": 34.57899475097656, "objective/non_score_reward": -1.728949785232544, "objective/rlhf_reward": -5.434846642430186, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 7.255028247833252, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.638671875, "step": 532, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971615076065063 }, { "episode": 8544, "epoch": 0.051191717295179205, "loss/policy_avg": 0.2761814594268799, "lr": 9.659381390593047e-06, "objective/entropy": -100.77452850341797, "objective/kl": 36.835365295410156, "objective/non_score_reward": -1.8417682647705078, "objective/rlhf_reward": -6.007823192809505, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 50.438026428222656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 533, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985384941101074 }, { "episode": 8560, "epoch": 0.051287581934308754, "loss/policy_avg": 0.4119563698768616, "lr": 9.658742331288344e-06, "objective/entropy": -65.70556640625, "objective/kl": 29.577213287353516, "objective/non_score_reward": -1.47886061668396, "objective/rlhf_reward": -3.792736174837623, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.75493049621582, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4990234375, "step": 534, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002697467803955 }, { "episode": 8576, "epoch": 0.0513834465734383, "loss/policy_avg": 0.12609338760375977, "lr": 9.658103271983641e-06, "objective/entropy": -150.71954345703125, "objective/kl": 28.952709197998047, "objective/non_score_reward": -1.447635531425476, "objective/rlhf_reward": -4.3905422449111935, "objective/scores": 0.35, "policy/approxkl_avg": 34.924835205078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 535, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004661083221436 }, { "episode": 8592, "epoch": 0.05147931121256785, "loss/policy_avg": 0.014640828594565392, "lr": 9.657464212678938e-06, "objective/entropy": -37.74507141113281, "objective/kl": 25.910266876220703, "objective/non_score_reward": -1.295513391494751, "objective/rlhf_reward": -0.7820532083511349, "objective/scores": 1.1, "policy/approxkl_avg": 2.0191965103149414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.751953125, "step": 536, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0010244846343994 }, { "episode": 8608, "epoch": 0.0515751758516974, "loss/policy_avg": 0.04429921880364418, "lr": 9.656825153374235e-06, "objective/entropy": -26.176483154296875, "objective/kl": 32.8004264831543, "objective/non_score_reward": -1.6400213241577148, "objective/rlhf_reward": -4.826751814285913, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 42.128135681152344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.908203125, "step": 537, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0013113021850586 }, { "episode": 8624, "epoch": 0.05167104049082695, "loss/policy_avg": 0.46547916531562805, "lr": 9.65618609406953e-06, "objective/entropy": 7.776313781738281, "objective/kl": 28.19791030883789, "objective/non_score_reward": -1.4098955392837524, "objective/rlhf_reward": -3.906248764197031, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.504173755645752, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8828125, "step": 538, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998396635055542 }, { "episode": 8640, "epoch": 0.0517669051299565, "loss/policy_avg": 0.0001214742660522461, "lr": 9.655547034764827e-06, "objective/entropy": -112.6850357055664, "objective/kl": 31.756372451782227, "objective/non_score_reward": -1.5878187417984009, "objective/rlhf_reward": -4.228568734900032, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.7504100799560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.546875, "step": 539, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001778602600098 }, { "episode": 8656, "epoch": 0.05186276976908605, "loss/policy_avg": 0.41524794697761536, "lr": 9.654907975460124e-06, "objective/entropy": -135.01878356933594, "objective/kl": 23.119266510009766, "objective/non_score_reward": -1.1559633016586304, "objective/rlhf_reward": -3.0675939609676153, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 26.581480026245117, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6015625, "step": 540, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9962246417999268 }, { "episode": 8672, "epoch": 0.0519586344082156, "loss/policy_avg": 0.3321428894996643, "lr": 9.65426891615542e-06, "objective/entropy": -5.44740104675293, "objective/kl": 39.89240264892578, "objective/non_score_reward": -1.9946203231811523, "objective/rlhf_reward": -7.97848105430603, "objective/scores": 0.0, "policy/approxkl_avg": 67.52932739257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 541, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9986319541931152 }, { "episode": 8688, "epoch": 0.052054499047345146, "loss/policy_avg": 0.22704890370368958, "lr": 9.653629856850718e-06, "objective/entropy": 23.631000518798828, "objective/kl": 22.43924331665039, "objective/non_score_reward": -1.121962308883667, "objective/rlhf_reward": -3.109246918050152, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 40.600868225097656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 542, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0008883476257324 }, { "episode": 8704, "epoch": 0.052150363686474695, "loss/policy_avg": 0.6167892217636108, "lr": 9.652990797546013e-06, "objective/entropy": 8.02947998046875, "objective/kl": 34.78337478637695, "objective/non_score_reward": -1.739168643951416, "objective/rlhf_reward": -5.556674695014953, "objective/scores": 0.35, "policy/approxkl_avg": 7.763035774230957, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.55859375, "step": 543, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983458518981934 }, { "episode": 8720, "epoch": 0.052246228325604244, "loss/policy_avg": 0.1720658838748932, "lr": 9.65235173824131e-06, "objective/entropy": 0.5252876281738281, "objective/kl": 31.73941993713379, "objective/non_score_reward": -1.5869710445404053, "objective/rlhf_reward": -4.79162499209936, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.366281509399414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.76953125, "step": 544, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988775253295898 }, { "episode": 8736, "epoch": 0.05234209296473379, "loss/policy_avg": 0.07084909081459045, "lr": 9.651712678936605e-06, "objective/entropy": -50.734527587890625, "objective/kl": 24.657032012939453, "objective/non_score_reward": -1.2328516244888306, "objective/rlhf_reward": -3.1065776899185886, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 12.337860107421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.521484375, "step": 545, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985425472259521 }, { "episode": 8752, "epoch": 0.05243795760386334, "loss/policy_avg": -0.053861357271671295, "lr": 9.651073619631902e-06, "objective/entropy": -242.29559326171875, "objective/kl": 21.178913116455078, "objective/non_score_reward": -1.058945655822754, "objective/rlhf_reward": -2.6316629386583146, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 23.818538665771484, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.62109375, "step": 546, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0112152099609375 }, { "episode": 8768, "epoch": 0.0525338222429929, "loss/policy_avg": -0.008508548140525818, "lr": 9.650434560327199e-06, "objective/entropy": -46.92424011230469, "objective/kl": 39.04132843017578, "objective/non_score_reward": -1.952066421508789, "objective/rlhf_reward": -6.429663398352963, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 15.27535629272461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4677734375, "step": 547, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982926845550537 }, { "episode": 8784, "epoch": 0.052629686882122446, "loss/policy_avg": 0.17654258012771606, "lr": 9.649795501022496e-06, "objective/entropy": -44.7242431640625, "objective/kl": 19.804813385009766, "objective/non_score_reward": -0.9902406930923462, "objective/rlhf_reward": -2.635450038939638, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 39.75682067871094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.76953125, "step": 548, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002223014831543 }, { "episode": 8800, "epoch": 0.052725551521251995, "loss/policy_avg": 0.46367156505584717, "lr": 9.649156441717792e-06, "objective/entropy": -132.18556213378906, "objective/kl": 38.18450927734375, "objective/non_score_reward": -1.909225344657898, "objective/rlhf_reward": -6.0327816343942455, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 24.263263702392578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7421875, "step": 549, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9974383115768433 }, { "episode": 8816, "epoch": 0.052821416160381544, "loss/policy_avg": 0.2747136950492859, "lr": 9.64851738241309e-06, "objective/entropy": -91.26388549804688, "objective/kl": 28.735111236572266, "objective/non_score_reward": -1.4367555379867554, "objective/rlhf_reward": -4.085162764013396, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.113122940063477, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.626953125, "step": 550, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000535011291504 }, { "episode": 8832, "epoch": 0.05291728079951109, "loss/policy_avg": 0.031243963167071342, "lr": 9.647878323108384e-06, "objective/entropy": -40.358192443847656, "objective/kl": 31.673667907714844, "objective/non_score_reward": -1.5836834907531738, "objective/rlhf_reward": -4.993098309546142, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 78.17581939697266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.796875, "step": 551, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989157915115356 }, { "episode": 8848, "epoch": 0.05301314543864064, "loss/policy_avg": -0.28017422556877136, "lr": 9.647239263803681e-06, "objective/entropy": -100.97856140136719, "objective/kl": 33.18678283691406, "objective/non_score_reward": -1.659339189529419, "objective/rlhf_reward": -6.637356638908386, "objective/scores": 0.0, "policy/approxkl_avg": 6.006505012512207, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.572265625, "step": 552, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003824234008789 }, { "episode": 8864, "epoch": 0.05310901007777019, "loss/policy_avg": 0.04892890527844429, "lr": 9.646600204498978e-06, "objective/entropy": -136.31918334960938, "objective/kl": 19.06879997253418, "objective/non_score_reward": -0.9534400105476379, "objective/rlhf_reward": -2.2575007369190008, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.5354987382888794, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.572265625, "step": 553, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003046989440918 }, { "episode": 8880, "epoch": 0.05320487471689974, "loss/policy_avg": 0.1114959716796875, "lr": 9.645961145194275e-06, "objective/entropy": -125.14915466308594, "objective/kl": 41.65575408935547, "objective/non_score_reward": -2.0827877521514893, "objective/rlhf_reward": -6.383740137295659, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 12.4759521484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66796875, "step": 554, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973247051239014 }, { "episode": 8896, "epoch": 0.05330073935602929, "loss/policy_avg": 0.2784144878387451, "lr": 9.645322085889572e-06, "objective/entropy": -42.213340759277344, "objective/kl": 34.43170928955078, "objective/non_score_reward": -1.7215855121612549, "objective/rlhf_reward": -6.8863421976566315, "objective/scores": 0.0, "policy/approxkl_avg": 37.5791015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.546875, "step": 555, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974336624145508 }, { "episode": 8912, "epoch": 0.05339660399515884, "loss/policy_avg": -0.0683375895023346, "lr": 9.644683026584867e-06, "objective/entropy": -94.292724609375, "objective/kl": 29.925048828125, "objective/non_score_reward": -1.4962522983551025, "objective/rlhf_reward": -4.4287500669627935, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 0.9679741263389587, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.53125, "step": 556, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002323627471924 }, { "episode": 8928, "epoch": 0.05349246863428839, "loss/policy_avg": 0.3528517484664917, "lr": 9.644043967280164e-06, "objective/entropy": 100.1601791381836, "objective/kl": 29.87194061279297, "objective/non_score_reward": -1.4935970306396484, "objective/rlhf_reward": -4.493435802872538, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 21.40321922302246, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.54296875, "step": 557, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999911904335022 }, { "episode": 8944, "epoch": 0.053588333273417936, "loss/policy_avg": 0.15664523839950562, "lr": 9.643404907975461e-06, "objective/entropy": -163.13458251953125, "objective/kl": 43.485382080078125, "objective/non_score_reward": -2.174269199371338, "objective/rlhf_reward": -6.297076797485351, "objective/scores": 0.6, "policy/approxkl_avg": 28.333932876586914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.52734375, "step": 558, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9964426755905151 }, { "episode": 8960, "epoch": 0.053684197912547485, "loss/policy_avg": 0.6344835162162781, "lr": 9.642765848670758e-06, "objective/entropy": -252.752685546875, "objective/kl": 33.16960144042969, "objective/non_score_reward": -1.658479928970337, "objective/rlhf_reward": -5.255317785827023, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 52.37012481689453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.771484375, "step": 559, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0004310607910156 }, { "episode": 8976, "epoch": 0.053780062551677034, "loss/policy_avg": 0.19869406521320343, "lr": 9.642126789366055e-06, "objective/entropy": -50.086647033691406, "objective/kl": 30.926883697509766, "objective/non_score_reward": -1.5463443994522095, "objective/rlhf_reward": -4.629118292537287, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 26.995628356933594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 560, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9950168132781982 }, { "episode": 8992, "epoch": 0.05387592719080658, "loss/policy_avg": -0.010918349027633667, "lr": 9.641487730061352e-06, "objective/entropy": -168.9771728515625, "objective/kl": 22.5106201171875, "objective/non_score_reward": -1.1255309581756592, "objective/rlhf_reward": -3.1604882984453733, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 20.162094116210938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.763671875, "step": 561, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001406669616699 }, { "episode": 9008, "epoch": 0.05397179182993613, "loss/policy_avg": 0.4963573217391968, "lr": 9.640848670756647e-06, "objective/entropy": -159.58302307128906, "objective/kl": 34.39787673950195, "objective/non_score_reward": -1.7198940515518188, "objective/rlhf_reward": -5.455743868549433, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 32.154441833496094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 562, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99989652633667 }, { "episode": 9024, "epoch": 0.05406765646906568, "loss/policy_avg": 0.4512660503387451, "lr": 9.640209611451944e-06, "objective/entropy": -112.33628845214844, "objective/kl": 34.371681213378906, "objective/non_score_reward": -1.7185840606689453, "objective/rlhf_reward": -5.515086495612545, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 4.578237056732178, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.599609375, "step": 563, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984909296035767 }, { "episode": 9040, "epoch": 0.05416352110819523, "loss/policy_avg": 0.08781366050243378, "lr": 9.63957055214724e-06, "objective/entropy": -39.49800491333008, "objective/kl": 33.1617431640625, "objective/non_score_reward": -1.6580872535705566, "objective/rlhf_reward": -4.232348775863647, "objective/scores": 0.6, "policy/approxkl_avg": 4.19449520111084, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 564, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000304698944092 }, { "episode": 9056, "epoch": 0.05425938574732478, "loss/policy_avg": 0.02701903134584427, "lr": 9.638931492842537e-06, "objective/entropy": -135.10118103027344, "objective/kl": 34.19304656982422, "objective/non_score_reward": -1.7096521854400635, "objective/rlhf_reward": -5.388010840030059, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.33478546142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.61328125, "step": 565, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999070644378662 }, { "episode": 9072, "epoch": 0.05435525038645433, "loss/policy_avg": 0.2804332375526428, "lr": 9.638292433537834e-06, "objective/entropy": -100.01052856445312, "objective/kl": 28.388795852661133, "objective/non_score_reward": -1.4194397926330566, "objective/rlhf_reward": -5.677759170532227, "objective/scores": 0.0, "policy/approxkl_avg": 7.587360382080078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.68359375, "step": 566, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005486011505127 }, { "episode": 9088, "epoch": 0.05445111502558388, "loss/policy_avg": 0.4314262866973877, "lr": 9.63765337423313e-06, "objective/entropy": -130.2495574951172, "objective/kl": 35.38700866699219, "objective/non_score_reward": -1.7693501710891724, "objective/rlhf_reward": -5.4155414156323545, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 44.93388366699219, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.75390625, "step": 567, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968568086624146 }, { "episode": 9104, "epoch": 0.054546979664713426, "loss/policy_avg": 0.3399587869644165, "lr": 9.637014314928426e-06, "objective/entropy": -247.61073303222656, "objective/kl": 28.445119857788086, "objective/non_score_reward": -1.4222559928894043, "objective/rlhf_reward": -3.864195342334818, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.162724018096924, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.607421875, "step": 568, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984673261642456 }, { "episode": 9120, "epoch": 0.054642844303842975, "loss/policy_avg": 0.5520263314247131, "lr": 9.636375255623721e-06, "objective/entropy": -97.92376708984375, "objective/kl": 26.055057525634766, "objective/non_score_reward": -1.30275297164917, "objective/rlhf_reward": -3.088305356279884, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 36.18694305419922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.494140625, "step": 569, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0012941360473633 }, { "episode": 9136, "epoch": 0.054738708942972523, "loss/policy_avg": 0.09734541922807693, "lr": 9.635736196319018e-06, "objective/entropy": -196.53872680664062, "objective/kl": 23.71702003479004, "objective/non_score_reward": -1.185850977897644, "objective/rlhf_reward": -4.743403911590576, "objective/scores": 0.0, "policy/approxkl_avg": 2.213500738143921, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.580078125, "step": 570, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993475675582886 }, { "episode": 9152, "epoch": 0.05483457358210207, "loss/policy_avg": 0.4516823887825012, "lr": 9.635097137014315e-06, "objective/entropy": -126.11761474609375, "objective/kl": 28.336185455322266, "objective/non_score_reward": -1.4168094396591187, "objective/rlhf_reward": -1.2672375202178952, "objective/scores": 1.1, "policy/approxkl_avg": 44.684326171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.775390625, "step": 571, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990687370300293 }, { "episode": 9168, "epoch": 0.05493043822123162, "loss/policy_avg": 0.34894299507141113, "lr": 9.634458077709612e-06, "objective/entropy": -3.410472869873047, "objective/kl": 35.99509048461914, "objective/non_score_reward": -1.7997545003890991, "objective/rlhf_reward": -5.87350514891736, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.621858596801758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.634765625, "step": 572, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0019896030426025 }, { "episode": 9184, "epoch": 0.05502630286036117, "loss/policy_avg": 0.1023169457912445, "lr": 9.633819018404909e-06, "objective/entropy": -180.73724365234375, "objective/kl": 24.693328857421875, "objective/non_score_reward": -1.2346664667129517, "objective/rlhf_reward": -3.5386658668518063, "objective/scores": 0.35, "policy/approxkl_avg": 22.89309310913086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 573, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981255531311035 }, { "episode": 9200, "epoch": 0.05512216749949072, "loss/policy_avg": 0.2509443163871765, "lr": 9.633179959100206e-06, "objective/entropy": -268.43072509765625, "objective/kl": 28.437435150146484, "objective/non_score_reward": -1.4218716621398926, "objective/rlhf_reward": -4.131227611508921, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 60.228729248046875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.78125, "step": 574, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000219345092773 }, { "episode": 9216, "epoch": 0.05521803213862027, "loss/policy_avg": -0.04683633893728256, "lr": 9.632540899795501e-06, "objective/entropy": -70.71329498291016, "objective/kl": 38.51101303100586, "objective/non_score_reward": -1.9255508184432983, "objective/rlhf_reward": -5.877374465736459, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.3532156944274902, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.615234375, "step": 575, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000458002090454 }, { "episode": 9232, "epoch": 0.05531389677774982, "loss/policy_avg": 0.25571292638778687, "lr": 9.631901840490798e-06, "objective/entropy": -197.88787841796875, "objective/kl": 25.574037551879883, "objective/non_score_reward": -1.278701901435852, "objective/rlhf_reward": -3.3814741532007853, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.096738815307617, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.671875, "step": 576, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001577854156494 }, { "episode": 9248, "epoch": 0.055409761416879366, "loss/policy_avg": 0.7064580917358398, "lr": 9.631262781186095e-06, "objective/entropy": -150.29953002929688, "objective/kl": 30.821884155273438, "objective/non_score_reward": -1.5410943031311035, "objective/rlhf_reward": -4.43104387919108, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 43.45115280151367, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.60546875, "step": 577, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9980316162109375 }, { "episode": 9264, "epoch": 0.055505626056008915, "loss/policy_avg": 0.20062510669231415, "lr": 9.630623721881392e-06, "objective/entropy": -158.88388061523438, "objective/kl": 28.73421859741211, "objective/non_score_reward": -1.4367109537124634, "objective/rlhf_reward": -4.346843814849853, "objective/scores": 0.35, "policy/approxkl_avg": 12.110857963562012, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 578, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998002290725708 }, { "episode": 9280, "epoch": 0.055601490695138464, "loss/policy_avg": 0.08450721949338913, "lr": 9.629984662576689e-06, "objective/entropy": -250.45445251464844, "objective/kl": 27.57752227783203, "objective/non_score_reward": -1.3788762092590332, "objective/rlhf_reward": -4.064906816096649, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 17.175188064575195, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5859375, "step": 579, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997262716293335 }, { "episode": 9296, "epoch": 0.05569735533426801, "loss/policy_avg": 0.41482874751091003, "lr": 9.629345603271984e-06, "objective/entropy": -177.06607055664062, "objective/kl": 29.43456268310547, "objective/non_score_reward": -1.4717282056808472, "objective/rlhf_reward": -2.9631939872514934, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 50.86977005004883, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 580, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005576610565186 }, { "episode": 9312, "epoch": 0.05579321997339756, "loss/policy_avg": 0.20043331384658813, "lr": 9.62870654396728e-06, "objective/entropy": -224.79660034179688, "objective/kl": 23.171340942382812, "objective/non_score_reward": -1.1585670709609985, "objective/rlhf_reward": -2.6868569953011825, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.841948986053467, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 581, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00144362449646 }, { "episode": 9328, "epoch": 0.05588908461252711, "loss/policy_avg": 0.28447139263153076, "lr": 9.628067484662578e-06, "objective/entropy": -44.1309814453125, "objective/kl": 42.387351989746094, "objective/non_score_reward": -2.1193675994873047, "objective/rlhf_reward": -7.151957724124117, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 20.72610092163086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.419921875, "step": 582, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971909523010254 }, { "episode": 9344, "epoch": 0.05598494925165666, "loss/policy_avg": 0.09533769637346268, "lr": 9.627428425357874e-06, "objective/entropy": -218.9058380126953, "objective/kl": 27.360652923583984, "objective/non_score_reward": -1.368032693862915, "objective/rlhf_reward": -4.021532396884307, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.28432846069336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 583, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9974052906036377 }, { "episode": 9360, "epoch": 0.05608081389078621, "loss/policy_avg": 0.5065032243728638, "lr": 9.626789366053171e-06, "objective/entropy": -231.38427734375, "objective/kl": 32.08224105834961, "objective/non_score_reward": -1.604112148284912, "objective/rlhf_reward": -5.0748127012545154, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 40.948760986328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69921875, "step": 584, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988150596618652 }, { "episode": 9376, "epoch": 0.05617667852991576, "loss/policy_avg": 0.6530688405036926, "lr": 9.626150306748468e-06, "objective/entropy": -116.65798950195312, "objective/kl": 31.407730102539062, "objective/non_score_reward": -1.570386528968811, "objective/rlhf_reward": -4.902944007006985, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 13.348186492919922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.54296875, "step": 585, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000188112258911 }, { "episode": 9392, "epoch": 0.05627254316904531, "loss/policy_avg": -0.06093317270278931, "lr": 9.625511247443763e-06, "objective/entropy": -245.7208251953125, "objective/kl": 22.28873634338379, "objective/non_score_reward": -1.1144368648529053, "objective/rlhf_reward": -2.33504098869947, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.7080774307250977, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.681640625, "step": 586, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002075672149658 }, { "episode": 9408, "epoch": 0.056368407808174856, "loss/policy_avg": 0.4493389129638672, "lr": 9.62487218813906e-06, "objective/entropy": -11.156410217285156, "objective/kl": 29.71312141418457, "objective/non_score_reward": -1.4856561422348022, "objective/rlhf_reward": -4.117795641693186, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 18.012893676757812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.759765625, "step": 587, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000352382659912 }, { "episode": 9424, "epoch": 0.056464272447304405, "loss/policy_avg": 0.3274408280849457, "lr": 9.624233128834357e-06, "objective/entropy": -116.3506088256836, "objective/kl": 35.94437026977539, "objective/non_score_reward": -1.7972185611724854, "objective/rlhf_reward": -4.788874185085296, "objective/scores": 0.6, "policy/approxkl_avg": 17.158645629882812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.69140625, "step": 588, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996552467346191 }, { "episode": 9440, "epoch": 0.056560137086433954, "loss/policy_avg": 0.879096508026123, "lr": 9.623594069529654e-06, "objective/entropy": -152.50155639648438, "objective/kl": 32.464576721191406, "objective/non_score_reward": -1.623228669166565, "objective/rlhf_reward": -5.069082756240931, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 70.49058532714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.849609375, "step": 589, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001586437225342 }, { "episode": 9456, "epoch": 0.0566560017255635, "loss/policy_avg": 0.2921786904335022, "lr": 9.62295501022495e-06, "objective/entropy": -177.27088928222656, "objective/kl": 39.783531188964844, "objective/non_score_reward": -1.989176630973816, "objective/rlhf_reward": -6.57810423621307, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 77.26689147949219, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6796875, "step": 590, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989970922470093 }, { "episode": 9472, "epoch": 0.05675186636469305, "loss/policy_avg": 0.3912142515182495, "lr": 9.622315950920246e-06, "objective/entropy": -120.1540756225586, "objective/kl": 31.21270179748535, "objective/non_score_reward": -1.5606350898742676, "objective/rlhf_reward": -3.842540299892425, "objective/scores": 0.6, "policy/approxkl_avg": 25.256790161132812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 591, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980988502502441 }, { "episode": 9488, "epoch": 0.0568477310038226, "loss/policy_avg": 0.04369340091943741, "lr": 9.621676891615543e-06, "objective/entropy": -277.40753173828125, "objective/kl": 29.685585021972656, "objective/non_score_reward": -1.4842792749404907, "objective/rlhf_reward": -1.5371170997619625, "objective/scores": 1.1, "policy/approxkl_avg": 7.890674591064453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.677734375, "step": 592, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981930255889893 }, { "episode": 9504, "epoch": 0.05694359564295215, "loss/policy_avg": 0.05721379816532135, "lr": 9.621037832310838e-06, "objective/entropy": -257.69232177734375, "objective/kl": 23.966060638427734, "objective/non_score_reward": -1.19830322265625, "objective/rlhf_reward": -3.0598793412248293, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 20.133102416992188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.732421875, "step": 593, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001232624053955 }, { "episode": 9520, "epoch": 0.0570394602820817, "loss/policy_avg": 0.5772296786308289, "lr": 9.620398773006135e-06, "objective/entropy": -89.6330795288086, "objective/kl": 31.078372955322266, "objective/non_score_reward": -1.5539186000823975, "objective/rlhf_reward": -4.734722021038889, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 21.1763916015625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.87109375, "step": 594, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000513792037964 }, { "episode": 9536, "epoch": 0.05713532492121125, "loss/policy_avg": -0.026315592229366302, "lr": 9.619759713701432e-06, "objective/entropy": -219.30979919433594, "objective/kl": 26.461135864257812, "objective/non_score_reward": -1.323056697845459, "objective/rlhf_reward": -3.9329772827371787, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 8.585318565368652, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 595, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008788108825684 }, { "episode": 9552, "epoch": 0.057231189560340796, "loss/policy_avg": 0.2548080384731293, "lr": 9.619120654396729e-06, "objective/entropy": -37.27716827392578, "objective/kl": 44.03446960449219, "objective/non_score_reward": -2.201723575592041, "objective/rlhf_reward": -7.356295923800811, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 21.06201934814453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 596, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991514682769775 }, { "episode": 9568, "epoch": 0.057327054199470345, "loss/policy_avg": 2.5911049842834473, "lr": 9.618481595092026e-06, "objective/entropy": -171.7782745361328, "objective/kl": 20.800029754638672, "objective/non_score_reward": -1.0400015115737915, "objective/rlhf_reward": -1.760006046295166, "objective/scores": 0.6, "policy/approxkl_avg": 2.9469943046569824, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 597, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.014529228210449 }, { "episode": 9584, "epoch": 0.057422918838599894, "loss/policy_avg": -0.1166892945766449, "lr": 9.617842535787323e-06, "objective/entropy": -109.67333221435547, "objective/kl": 34.37934494018555, "objective/non_score_reward": -1.7189671993255615, "objective/rlhf_reward": -6.8758686780929565, "objective/scores": 0.0, "policy/approxkl_avg": 17.377391815185547, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4794921875, "step": 598, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002119541168213 }, { "episode": 9600, "epoch": 0.05751878347772944, "loss/policy_avg": -0.15396325290203094, "lr": 9.617203476482618e-06, "objective/entropy": -128.05728149414062, "objective/kl": 29.42688751220703, "objective/non_score_reward": -1.4713443517684937, "objective/rlhf_reward": -4.060548658641886, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.408236026763916, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.71484375, "step": 599, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002739429473877 }, { "episode": 9616, "epoch": 0.05761464811685899, "loss/policy_avg": 0.14407247304916382, "lr": 9.616564417177915e-06, "objective/entropy": -272.3529357910156, "objective/kl": 21.596874237060547, "objective/non_score_reward": -1.0798437595367432, "objective/rlhf_reward": -1.3956560238611426, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.104412078857422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.58984375, "step": 600, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001608371734619 }, { "episode": 9632, "epoch": 0.05771051275598854, "loss/policy_avg": 0.20445303618907928, "lr": 9.615925357873211e-06, "objective/entropy": -291.0384521484375, "objective/kl": 28.06856918334961, "objective/non_score_reward": -1.403428554534912, "objective/rlhf_reward": -4.235111692038876, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.333198547363281, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.720703125, "step": 601, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0009381771087646 }, { "episode": 9648, "epoch": 0.05780637739511809, "loss/policy_avg": 0.7656448483467102, "lr": 9.615286298568508e-06, "objective/entropy": -4.355806350708008, "objective/kl": 34.863006591796875, "objective/non_score_reward": -1.7431503534317017, "objective/rlhf_reward": -5.548769433696833, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 10.645190238952637, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 602, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971041679382324 }, { "episode": 9664, "epoch": 0.05790224203424764, "loss/policy_avg": 0.1100698709487915, "lr": 9.614647239263805e-06, "objective/entropy": -203.49618530273438, "objective/kl": 19.046649932861328, "objective/non_score_reward": -0.9523325562477112, "objective/rlhf_reward": -2.4307281161225855, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.499467670917511, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.638671875, "step": 603, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0019733905792236 }, { "episode": 9680, "epoch": 0.05799810667337719, "loss/policy_avg": 0.17878472805023193, "lr": 9.6140081799591e-06, "objective/entropy": -162.996826171875, "objective/kl": 23.458127975463867, "objective/non_score_reward": -1.172906517982483, "objective/rlhf_reward": -3.3661131596862504, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 8.434497833251953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5390625, "step": 604, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9986371994018555 }, { "episode": 9696, "epoch": 0.058093971312506744, "loss/policy_avg": 0.5608217716217041, "lr": 9.613369120654397e-06, "objective/entropy": -168.91802978515625, "objective/kl": 31.90495491027832, "objective/non_score_reward": -1.5952478647232056, "objective/rlhf_reward": -3.4572724446069927, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 10.658321380615234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 605, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999322772026062 }, { "episode": 9712, "epoch": 0.05818983595163629, "loss/policy_avg": 0.10194225609302521, "lr": 9.612730061349694e-06, "objective/entropy": -138.00286865234375, "objective/kl": 34.8355712890625, "objective/non_score_reward": -1.7417783737182617, "objective/rlhf_reward": -5.641600999861879, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 19.823665618896484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.765625, "step": 606, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000347852706909 }, { "episode": 9728, "epoch": 0.05828570059076584, "loss/policy_avg": 1.170401930809021, "lr": 9.612091002044991e-06, "objective/entropy": -171.179443359375, "objective/kl": 23.883764266967773, "objective/non_score_reward": -1.1941882371902466, "objective/rlhf_reward": -3.2609813449704017, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.674392819404602, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 607, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004193782806396 }, { "episode": 9744, "epoch": 0.05838156522989539, "loss/policy_avg": 0.05054464191198349, "lr": 9.611451942740288e-06, "objective/entropy": -196.56436157226562, "objective/kl": 23.218883514404297, "objective/non_score_reward": -1.1609442234039307, "objective/rlhf_reward": -3.1931789918855276, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.145727157592773, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 608, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998692512512207 }, { "episode": 9760, "epoch": 0.05847742986902494, "loss/policy_avg": 0.054385945200920105, "lr": 9.610812883435585e-06, "objective/entropy": -244.93141174316406, "objective/kl": 29.985477447509766, "objective/non_score_reward": -1.4992740154266357, "objective/rlhf_reward": -4.637845957015438, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.703460693359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 609, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0000925064086914 }, { "episode": 9776, "epoch": 0.05857329450815449, "loss/policy_avg": -0.05685323104262352, "lr": 9.61017382413088e-06, "objective/entropy": -65.63417053222656, "objective/kl": 31.53623390197754, "objective/non_score_reward": -1.5768117904663086, "objective/rlhf_reward": -3.383528147579405, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.860790252685547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.59375, "step": 610, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001614809036255 }, { "episode": 9792, "epoch": 0.05866915914728404, "loss/policy_avg": 0.20876801013946533, "lr": 9.609534764826177e-06, "objective/entropy": -112.53227996826172, "objective/kl": 41.12568664550781, "objective/non_score_reward": -2.0562844276428223, "objective/rlhf_reward": -5.825137710571289, "objective/scores": 0.6, "policy/approxkl_avg": 33.385337829589844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 611, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000166654586792 }, { "episode": 9808, "epoch": 0.05876502378641359, "loss/policy_avg": 0.2722185552120209, "lr": 9.608895705521472e-06, "objective/entropy": -124.71205139160156, "objective/kl": 38.9796257019043, "objective/non_score_reward": -1.9489812850952148, "objective/rlhf_reward": -5.395925498008728, "objective/scores": 0.6, "policy/approxkl_avg": 19.52260971069336, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.623046875, "step": 612, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988607168197632 }, { "episode": 9824, "epoch": 0.058860888425543136, "loss/policy_avg": 0.7936792969703674, "lr": 9.608256646216769e-06, "objective/entropy": -150.9628448486328, "objective/kl": 32.946922302246094, "objective/non_score_reward": -1.6473462581634521, "objective/rlhf_reward": -5.165553171833125, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 23.228769302368164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.537109375, "step": 613, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001022815704346 }, { "episode": 9840, "epoch": 0.058956753064672685, "loss/policy_avg": 0.8288295269012451, "lr": 9.607617586912066e-06, "objective/entropy": -145.37136840820312, "objective/kl": 37.17048645019531, "objective/non_score_reward": -1.8585245609283447, "objective/rlhf_reward": -5.6092691376534205, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.95422077178955, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.568359375, "step": 614, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995349645614624 }, { "episode": 9856, "epoch": 0.05905261770380223, "loss/policy_avg": 0.19199243187904358, "lr": 9.606978527607363e-06, "objective/entropy": -158.26043701171875, "objective/kl": 31.016521453857422, "objective/non_score_reward": -1.550826072692871, "objective/rlhf_reward": -4.8440544244989585, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.4004452228546143, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5546875, "step": 615, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00236439704895 }, { "episode": 9872, "epoch": 0.05914848234293178, "loss/policy_avg": 0.29752206802368164, "lr": 9.60633946830266e-06, "objective/entropy": -141.43800354003906, "objective/kl": 27.8808536529541, "objective/non_score_reward": -1.394042730331421, "objective/rlhf_reward": -3.842837558190028, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 10.629474639892578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5078125, "step": 616, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00028657913208 }, { "episode": 9888, "epoch": 0.05924434698206133, "loss/policy_avg": 0.2227097749710083, "lr": 9.605700408997955e-06, "objective/entropy": -97.0810775756836, "objective/kl": 34.3601188659668, "objective/non_score_reward": -1.718005895614624, "objective/rlhf_reward": -5.4481916024285235, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 16.432331085205078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.548828125, "step": 617, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975783824920654 }, { "episode": 9904, "epoch": 0.05934021162119088, "loss/policy_avg": 0.17975842952728271, "lr": 9.605061349693252e-06, "objective/entropy": -200.100830078125, "objective/kl": 28.51620864868164, "objective/non_score_reward": -1.4258103370666504, "objective/rlhf_reward": -3.8784127190438022, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.591612815856934, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 618, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000221729278564 }, { "episode": 9920, "epoch": 0.05943607626032043, "loss/policy_avg": 0.4452857971191406, "lr": 9.604422290388548e-06, "objective/entropy": -87.9361572265625, "objective/kl": 34.174217224121094, "objective/non_score_reward": -1.7087109088897705, "objective/rlhf_reward": -5.278584449496821, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 24.203800201416016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.587890625, "step": 619, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989922046661377 }, { "episode": 9936, "epoch": 0.05953194089944998, "loss/policy_avg": 0.31785786151885986, "lr": 9.603783231083845e-06, "objective/entropy": -56.93491744995117, "objective/kl": 34.28547286987305, "objective/non_score_reward": -1.7142736911773682, "objective/rlhf_reward": -5.032265897068094, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 12.636474609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.85546875, "step": 620, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996368408203125 }, { "episode": 9952, "epoch": 0.05962780553857953, "loss/policy_avg": 0.6350647211074829, "lr": 9.603144171779142e-06, "objective/entropy": -129.3587188720703, "objective/kl": 41.710655212402344, "objective/non_score_reward": -2.0855326652526855, "objective/rlhf_reward": -6.219424667135749, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.748146057128906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4775390625, "step": 621, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979774951934814 }, { "episode": 9968, "epoch": 0.059723670177709076, "loss/policy_avg": 0.9843254089355469, "lr": 9.602505112474439e-06, "objective/entropy": -95.34288024902344, "objective/kl": 49.37370300292969, "objective/non_score_reward": -2.4686851501464844, "objective/rlhf_reward": -8.049912209781716, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 31.02006721496582, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4970703125, "step": 622, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9965976476669312 }, { "episode": 9984, "epoch": 0.059819534816838625, "loss/policy_avg": 0.6165390610694885, "lr": 9.601866053169734e-06, "objective/entropy": -100.56966400146484, "objective/kl": 33.22990036010742, "objective/non_score_reward": -1.6614950895309448, "objective/rlhf_reward": -5.286730491851253, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.85442066192627, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.61328125, "step": 623, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971177577972412 }, { "episode": 10000, "epoch": 0.059915399455968174, "loss/policy_avg": 0.3318287134170532, "lr": 9.601226993865031e-06, "objective/entropy": -212.1555938720703, "objective/kl": 25.822668075561523, "objective/non_score_reward": -1.2911334037780762, "objective/rlhf_reward": -2.2408145412218303, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.2788864374160767, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.525390625, "step": 624, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995461702346802 }, { "episode": 10016, "epoch": 0.06001126409509772, "loss/policy_avg": 0.35671815276145935, "lr": 9.600587934560328e-06, "objective/entropy": -96.60403442382812, "objective/kl": 42.28247833251953, "objective/non_score_reward": -2.114124059677124, "objective/rlhf_reward": -6.6316679671135645, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 9.525958061218262, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.55859375, "step": 625, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999858021736145 }, { "episode": 10032, "epoch": 0.06010712873422727, "loss/policy_avg": 0.026430530473589897, "lr": 9.599948875255625e-06, "objective/entropy": -96.45112609863281, "objective/kl": 30.055763244628906, "objective/non_score_reward": -1.5027881860733032, "objective/rlhf_reward": -4.56055448493515, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.234503746032715, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.521484375, "step": 626, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002078056335449 }, { "episode": 10048, "epoch": 0.06020299337335682, "loss/policy_avg": -0.07770150899887085, "lr": 9.599309815950922e-06, "objective/entropy": -78.50785827636719, "objective/kl": 33.19765090942383, "objective/non_score_reward": -1.6598827838897705, "objective/rlhf_reward": -5.158578279431223, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 60.745849609375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5546875, "step": 627, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0026586055755615 }, { "episode": 10064, "epoch": 0.06029885801248637, "loss/policy_avg": 0.045525066554546356, "lr": 9.598670756646217e-06, "objective/entropy": -207.98727416992188, "objective/kl": 34.44676208496094, "objective/non_score_reward": -1.7223379611968994, "objective/rlhf_reward": -5.489351963996887, "objective/scores": 0.35, "policy/approxkl_avg": 2.952592372894287, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.671875, "step": 628, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989871978759766 }, { "episode": 10080, "epoch": 0.06039472265161592, "loss/policy_avg": 0.32521092891693115, "lr": 9.598031697341514e-06, "objective/entropy": -71.00718688964844, "objective/kl": 27.00582504272461, "objective/non_score_reward": -1.3502912521362305, "objective/rlhf_reward": -3.977332849701015, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.865281105041504, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578125, "step": 629, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001133918762207 }, { "episode": 10096, "epoch": 0.06049058729074547, "loss/policy_avg": 0.22257700562477112, "lr": 9.59739263803681e-06, "objective/entropy": -87.40052795410156, "objective/kl": 31.356922149658203, "objective/non_score_reward": -1.5678460597991943, "objective/rlhf_reward": -4.32397324867719, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 27.549453735351562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.546875, "step": 630, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999577522277832 }, { "episode": 10112, "epoch": 0.06058645192987502, "loss/policy_avg": 0.4591647982597351, "lr": 9.596753578732108e-06, "objective/entropy": -35.01010513305664, "objective/kl": 28.93059539794922, "objective/non_score_reward": -1.4465298652648926, "objective/rlhf_reward": -4.42686941597311, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 10.006196975708008, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8203125, "step": 631, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9968595504760742 }, { "episode": 10128, "epoch": 0.060682316569004566, "loss/policy_avg": 0.9483177661895752, "lr": 9.596114519427405e-06, "objective/entropy": -152.91030883789062, "objective/kl": 30.360069274902344, "objective/non_score_reward": -1.5180034637451172, "objective/rlhf_reward": -4.338680283228555, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 15.410400390625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.599609375, "step": 632, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9943327903747559 }, { "episode": 10144, "epoch": 0.060778181208134115, "loss/policy_avg": 0.4167541265487671, "lr": 9.595475460122701e-06, "objective/entropy": -154.04684448242188, "objective/kl": 33.39550018310547, "objective/non_score_reward": -1.6697750091552734, "objective/rlhf_reward": -5.074980471197682, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 53.406578063964844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.576171875, "step": 633, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9963878393173218 }, { "episode": 10160, "epoch": 0.060874045847263664, "loss/policy_avg": -0.021846026182174683, "lr": 9.594836400817997e-06, "objective/entropy": -22.81509780883789, "objective/kl": 23.709880828857422, "objective/non_score_reward": -1.1854941844940186, "objective/rlhf_reward": -2.917147810730051, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.839837908744812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6953125, "step": 634, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000304937362671 }, { "episode": 10176, "epoch": 0.06096991048639321, "loss/policy_avg": 0.014755940064787865, "lr": 9.594197341513293e-06, "objective/entropy": -198.07839965820312, "objective/kl": 21.79191017150879, "objective/non_score_reward": -1.0895954370498657, "objective/rlhf_reward": -1.9583818078041078, "objective/scores": 0.6, "policy/approxkl_avg": 0.6484163999557495, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 635, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002074241638184 }, { "episode": 10192, "epoch": 0.06106577512552276, "loss/policy_avg": 0.13533297181129456, "lr": 9.593558282208589e-06, "objective/entropy": -201.26246643066406, "objective/kl": 26.135250091552734, "objective/non_score_reward": -1.3067626953125, "objective/rlhf_reward": -3.885414889364868, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 11.92165756225586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.740234375, "step": 636, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993889331817627 }, { "episode": 10208, "epoch": 0.06116163976465231, "loss/policy_avg": 0.4021642506122589, "lr": 9.592919222903886e-06, "objective/entropy": -286.0339050292969, "objective/kl": 14.542181968688965, "objective/non_score_reward": -0.7271090745925903, "objective/rlhf_reward": -1.484604258735744, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.031335353851318, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.701171875, "step": 637, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.003591775894165 }, { "episode": 10224, "epoch": 0.06125750440378186, "loss/policy_avg": 0.2514651417732239, "lr": 9.592280163599182e-06, "objective/entropy": -132.75355529785156, "objective/kl": 25.25128173828125, "objective/non_score_reward": -1.2625641822814941, "objective/rlhf_reward": -3.5996581717446894, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 14.74315071105957, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.712890625, "step": 638, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000126361846924 }, { "episode": 10240, "epoch": 0.06135336904291141, "loss/policy_avg": 0.012995198369026184, "lr": 9.59164110429448e-06, "objective/entropy": -181.2290496826172, "objective/kl": 22.253154754638672, "objective/non_score_reward": -1.1126577854156494, "objective/rlhf_reward": -3.026798923214046, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 0.9591898918151855, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53515625, "step": 639, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993176460266113 }, { "episode": 10256, "epoch": 0.06144923368204096, "loss/policy_avg": 0.15271592140197754, "lr": 9.591002044989776e-06, "objective/entropy": -105.57412719726562, "objective/kl": 38.59171676635742, "objective/non_score_reward": -1.9295859336853027, "objective/rlhf_reward": -6.16208431026037, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.626259803771973, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.734375, "step": 640, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996435284614563 }, { "episode": 10272, "epoch": 0.061545098321170506, "loss/policy_avg": -0.11524446308612823, "lr": 9.590362985685071e-06, "objective/entropy": -123.53447723388672, "objective/kl": 26.7266845703125, "objective/non_score_reward": -1.336334228515625, "objective/rlhf_reward": -3.222630920187507, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.8472533226013184, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46484375, "step": 641, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.011606216430664 }, { "episode": 10288, "epoch": 0.061640962960300055, "loss/policy_avg": 0.4013972282409668, "lr": 9.589723926380368e-06, "objective/entropy": -128.90103149414062, "objective/kl": 31.007064819335938, "objective/non_score_reward": -1.5503532886505127, "objective/rlhf_reward": -4.685641431602177, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.671117782592773, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.556640625, "step": 642, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970167875289917 }, { "episode": 10304, "epoch": 0.061736827599429604, "loss/policy_avg": 0.7907944321632385, "lr": 9.589084867075665e-06, "objective/entropy": -58.220497131347656, "objective/kl": 41.770606994628906, "objective/non_score_reward": -2.0885305404663086, "objective/rlhf_reward": -6.620788232485452, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.74094581604004, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.466796875, "step": 643, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995949149131775 }, { "episode": 10320, "epoch": 0.06183269223855915, "loss/policy_avg": 0.017528323456645012, "lr": 9.588445807770962e-06, "objective/entropy": -208.79119873046875, "objective/kl": 23.041034698486328, "objective/non_score_reward": -1.1520518064498901, "objective/rlhf_reward": -3.092435383590397, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.83624267578125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.732421875, "step": 644, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0010547637939453 }, { "episode": 10336, "epoch": 0.0619285568776887, "loss/policy_avg": 0.15500307083129883, "lr": 9.587806748466259e-06, "objective/entropy": -124.78570556640625, "objective/kl": 34.243202209472656, "objective/non_score_reward": -1.7121602296829224, "objective/rlhf_reward": -3.92492190444586, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.4558181762695312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5078125, "step": 645, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997308254241943 }, { "episode": 10352, "epoch": 0.06202442151681825, "loss/policy_avg": 0.2161247283220291, "lr": 9.587167689161556e-06, "objective/entropy": -163.63064575195312, "objective/kl": 25.873336791992188, "objective/non_score_reward": -1.293666958808899, "objective/rlhf_reward": -3.7960657263673365, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 8.89102840423584, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5234375, "step": 646, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998837947845459 }, { "episode": 10368, "epoch": 0.0621202861559478, "loss/policy_avg": 0.08966261148452759, "lr": 9.586528629856851e-06, "objective/entropy": -104.2444076538086, "objective/kl": 33.29509735107422, "objective/non_score_reward": -1.664754867553711, "objective/rlhf_reward": -4.925686256090799, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.3677499294281006, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 647, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993959665298462 }, { "episode": 10384, "epoch": 0.06221615079507735, "loss/policy_avg": -0.02724701538681984, "lr": 9.585889570552148e-06, "objective/entropy": -133.99429321289062, "objective/kl": 27.543067932128906, "objective/non_score_reward": -1.3771533966064453, "objective/rlhf_reward": -3.1086136460304257, "objective/scores": 0.6, "policy/approxkl_avg": 7.215035438537598, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 648, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986003637313843 }, { "episode": 10400, "epoch": 0.0623120154342069, "loss/policy_avg": -0.23539991676807404, "lr": 9.585250511247445e-06, "objective/entropy": -167.906494140625, "objective/kl": 25.879772186279297, "objective/non_score_reward": -1.293988585472107, "objective/rlhf_reward": -3.571834478441792, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.0954341888427734, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.57421875, "step": 649, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997683763504028 }, { "episode": 10416, "epoch": 0.06240788007333645, "loss/policy_avg": 0.30569222569465637, "lr": 9.584611451942742e-06, "objective/entropy": -226.60678100585938, "objective/kl": 28.675113677978516, "objective/non_score_reward": -1.433755874633789, "objective/rlhf_reward": -3.7876121503877, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 52.77922058105469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6484375, "step": 650, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9980084896087646 }, { "episode": 10432, "epoch": 0.062503744712466, "loss/policy_avg": -0.24214023351669312, "lr": 9.583972392638038e-06, "objective/entropy": -121.17498779296875, "objective/kl": 38.84062957763672, "objective/non_score_reward": -1.9420316219329834, "objective/rlhf_reward": -5.820715139584477, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.8967432975769043, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.623046875, "step": 651, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0004329681396484 }, { "episode": 10448, "epoch": 0.06259960935159554, "loss/policy_avg": -0.3156575858592987, "lr": 9.583333333333335e-06, "objective/entropy": -146.38143920898438, "objective/kl": 32.020687103271484, "objective/non_score_reward": -1.60103440284729, "objective/rlhf_reward": -5.062502017527251, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.199296236038208, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.642578125, "step": 652, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0025553703308105 }, { "episode": 10464, "epoch": 0.0626954739907251, "loss/policy_avg": 0.07271748781204224, "lr": 9.58269427402863e-06, "objective/entropy": -196.48562622070312, "objective/kl": 28.001068115234375, "objective/non_score_reward": -1.4000535011291504, "objective/rlhf_reward": -4.2002141833305355, "objective/scores": 0.35, "policy/approxkl_avg": 24.475753784179688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 653, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0010745525360107 }, { "episode": 10480, "epoch": 0.06279133862985464, "loss/policy_avg": 0.17373695969581604, "lr": 9.582055214723927e-06, "objective/entropy": -275.5335388183594, "objective/kl": 27.79926300048828, "objective/non_score_reward": -1.3899632692337036, "objective/rlhf_reward": -5.5598530769348145, "objective/scores": 0.0, "policy/approxkl_avg": 17.22200584411621, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 654, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987027645111084 }, { "episode": 10496, "epoch": 0.0628872032689842, "loss/policy_avg": 0.15186084806919098, "lr": 9.581416155419224e-06, "objective/entropy": -197.2568817138672, "objective/kl": 23.105377197265625, "objective/non_score_reward": -1.1552690267562866, "objective/rlhf_reward": -2.796247239383768, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 35.64599609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7734375, "step": 655, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985601902008057 }, { "episode": 10512, "epoch": 0.06298306790811374, "loss/policy_avg": 0.09821736067533493, "lr": 9.58077709611452e-06, "objective/entropy": -192.20767211914062, "objective/kl": 28.659635543823242, "objective/non_score_reward": -1.4329817295074463, "objective/rlhf_reward": -4.070067649305449, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.6847333908081055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 656, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977080821990967 }, { "episode": 10528, "epoch": 0.0630789325472433, "loss/policy_avg": 0.24115119874477386, "lr": 9.580138036809816e-06, "objective/entropy": -171.08619689941406, "objective/kl": 26.453920364379883, "objective/non_score_reward": -1.3226962089538574, "objective/rlhf_reward": -3.8907844781875607, "objective/scores": 0.35, "policy/approxkl_avg": 11.276920318603516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 657, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999925136566162 }, { "episode": 10544, "epoch": 0.06317479718637284, "loss/policy_avg": -0.04878993332386017, "lr": 9.579498977505113e-06, "objective/entropy": -95.69158172607422, "objective/kl": 26.445575714111328, "objective/non_score_reward": -1.3222787380218506, "objective/rlhf_reward": -3.94747917941156, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.285589218139648, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.625, "step": 658, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0013298988342285 }, { "episode": 10560, "epoch": 0.0632706618255024, "loss/policy_avg": -0.10105658322572708, "lr": 9.57885991820041e-06, "objective/entropy": -209.01065063476562, "objective/kl": 27.234224319458008, "objective/non_score_reward": -1.3617112636566162, "objective/rlhf_reward": -4.046844816207885, "objective/scores": 0.35, "policy/approxkl_avg": 2.436962366104126, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6328125, "step": 659, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001246690750122 }, { "episode": 10576, "epoch": 0.06336652646463194, "loss/policy_avg": -0.3218346834182739, "lr": 9.578220858895705e-06, "objective/entropy": -3.9748001098632812, "objective/kl": 18.186880111694336, "objective/non_score_reward": -0.9093440771102905, "objective/rlhf_reward": -1.5146698824324945, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 28.07345962524414, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8359375, "step": 660, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002238750457764 }, { "episode": 10592, "epoch": 0.06346239110376149, "loss/policy_avg": -0.19762462377548218, "lr": 9.577581799591002e-06, "objective/entropy": -204.72760009765625, "objective/kl": 18.785112380981445, "objective/non_score_reward": -0.9392555356025696, "objective/rlhf_reward": -1.6343160293259955, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.8940598964691162, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.560546875, "step": 661, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0025925636291504 }, { "episode": 10608, "epoch": 0.06355825574289103, "loss/policy_avg": -0.45743584632873535, "lr": 9.576942740286299e-06, "objective/entropy": -134.4844970703125, "objective/kl": 33.7373046875, "objective/non_score_reward": -1.6868653297424316, "objective/rlhf_reward": -5.296863298030242, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.153486967086792, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.521484375, "step": 662, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00048828125 }, { "episode": 10624, "epoch": 0.06365412038202059, "loss/policy_avg": 0.2565079629421234, "lr": 9.576303680981596e-06, "objective/entropy": -180.13528442382812, "objective/kl": 17.24534034729004, "objective/non_score_reward": -0.862267017364502, "objective/rlhf_reward": -2.089818143580837, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.433453559875488, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 663, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994475841522217 }, { "episode": 10640, "epoch": 0.06374998502115013, "loss/policy_avg": 0.17452527582645416, "lr": 9.575664621676893e-06, "objective/entropy": -64.2728271484375, "objective/kl": 21.405649185180664, "objective/non_score_reward": -1.0702824592590332, "objective/rlhf_reward": -2.9218800303682517, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.6351606845855713, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.71484375, "step": 664, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0012965202331543 }, { "episode": 10656, "epoch": 0.06384584966027969, "loss/policy_avg": 0.6966801881790161, "lr": 9.57502556237219e-06, "objective/entropy": -251.04238891601562, "objective/kl": 27.693851470947266, "objective/non_score_reward": -1.384692668914795, "objective/rlhf_reward": -3.934650454584675, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.390886306762695, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5859375, "step": 665, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001401662826538 }, { "episode": 10672, "epoch": 0.06394171429940923, "loss/policy_avg": 0.16458481550216675, "lr": 9.574386503067485e-06, "objective/entropy": -219.99136352539062, "objective/kl": 13.308931350708008, "objective/non_score_reward": -0.6654465198516846, "objective/rlhf_reward": -0.7143749100732166, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.77976131439209, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 666, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000656843185425 }, { "episode": 10688, "epoch": 0.06403757893853879, "loss/policy_avg": -0.009436726570129395, "lr": 9.573747443762782e-06, "objective/entropy": -162.25047302246094, "objective/kl": 23.977962493896484, "objective/non_score_reward": -1.1988980770111084, "objective/rlhf_reward": -2.8481810791062667, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.450942993164062, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.763671875, "step": 667, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0019123554229736 }, { "episode": 10704, "epoch": 0.06413344357766833, "loss/policy_avg": 0.4135128855705261, "lr": 9.573108384458079e-06, "objective/entropy": -63.0797119140625, "objective/kl": 41.37904739379883, "objective/non_score_reward": -2.0689523220062256, "objective/rlhf_reward": -6.542475895086923, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 88.98745727539062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.779296875, "step": 668, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999923706054688 }, { "episode": 10720, "epoch": 0.06422930821679788, "loss/policy_avg": 0.6821532845497131, "lr": 9.572469325153375e-06, "objective/entropy": -196.7287139892578, "objective/kl": 30.88260269165039, "objective/non_score_reward": -1.5441300868988037, "objective/rlhf_reward": -4.660748505386051, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 23.963293075561523, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.78515625, "step": 669, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989004135131836 }, { "episode": 10736, "epoch": 0.06432517285592743, "loss/policy_avg": 0.3629915118217468, "lr": 9.571830265848672e-06, "objective/entropy": -205.541259765625, "objective/kl": 24.442432403564453, "objective/non_score_reward": -1.2221217155456543, "objective/rlhf_reward": -3.155153171221415, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 15.010305404663086, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.63671875, "step": 670, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991261959075928 }, { "episode": 10752, "epoch": 0.06442103749505698, "loss/policy_avg": 0.3024546504020691, "lr": 9.571191206543968e-06, "objective/entropy": -184.0182647705078, "objective/kl": 28.46197509765625, "objective/non_score_reward": -1.4230988025665283, "objective/rlhf_reward": -3.744983862118657, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.1509013175964355, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 671, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998319149017334 }, { "episode": 10768, "epoch": 0.06451690213418652, "loss/policy_avg": -0.12359270453453064, "lr": 9.570552147239264e-06, "objective/entropy": -107.1251220703125, "objective/kl": 24.85216522216797, "objective/non_score_reward": -1.2426085472106934, "objective/rlhf_reward": -3.611183964942379, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.815180540084839, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.62890625, "step": 672, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002232074737549 }, { "episode": 10784, "epoch": 0.06461276677331608, "loss/policy_avg": 0.3783743977546692, "lr": 9.569913087934561e-06, "objective/entropy": -155.0634765625, "objective/kl": 33.26643371582031, "objective/non_score_reward": -1.663321852684021, "objective/rlhf_reward": -5.294037544463558, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.487679958343506, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.751953125, "step": 673, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9973864555358887 }, { "episode": 10800, "epoch": 0.06470863141244562, "loss/policy_avg": 0.12491178512573242, "lr": 9.569274028629858e-06, "objective/entropy": -202.8880157470703, "objective/kl": 23.53227996826172, "objective/non_score_reward": -1.1766140460968018, "objective/rlhf_reward": -2.9731229106585184, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 5.709697246551514, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.576171875, "step": 674, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979543685913086 }, { "episode": 10816, "epoch": 0.06480449605157518, "loss/policy_avg": -0.01751142367720604, "lr": 9.568634969325155e-06, "objective/entropy": -217.27896118164062, "objective/kl": 27.020957946777344, "objective/non_score_reward": -1.3510478734970093, "objective/rlhf_reward": -3.4567805034684493, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 0.6378078460693359, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.63671875, "step": 675, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0012476444244385 }, { "episode": 10832, "epoch": 0.06490036069070472, "loss/policy_avg": 0.28126630187034607, "lr": 9.567995910020452e-06, "objective/entropy": -230.15963745117188, "objective/kl": 24.95879364013672, "objective/non_score_reward": -1.2479398250579834, "objective/rlhf_reward": -3.329899912298308, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 10.301782608032227, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.744140625, "step": 676, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993771314620972 }, { "episode": 10848, "epoch": 0.06499622532983428, "loss/policy_avg": 0.12287623435258865, "lr": 9.567356850715747e-06, "objective/entropy": -263.37542724609375, "objective/kl": 23.937744140625, "objective/non_score_reward": -1.1968872547149658, "objective/rlhf_reward": -0.3875493764877316, "objective/scores": 1.1, "policy/approxkl_avg": 45.05952453613281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.703125, "step": 677, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995876550674438 }, { "episode": 10864, "epoch": 0.06509208996896382, "loss/policy_avg": 0.6470179557800293, "lr": 9.566717791411044e-06, "objective/entropy": -65.45881652832031, "objective/kl": 23.807559967041016, "objective/non_score_reward": -1.190378189086914, "objective/rlhf_reward": -3.419877028375297, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 10.65350341796875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.744140625, "step": 678, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999696969985962 }, { "episode": 10880, "epoch": 0.06518795460809337, "loss/policy_avg": 0.2790781855583191, "lr": 9.56607873210634e-06, "objective/entropy": -161.4605712890625, "objective/kl": 41.620460510253906, "objective/non_score_reward": -2.0810232162475586, "objective/rlhf_reward": -3.924092388153076, "objective/scores": 1.1, "policy/approxkl_avg": 5.482306480407715, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.57421875, "step": 679, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986392259597778 }, { "episode": 10896, "epoch": 0.06528381924722292, "loss/policy_avg": 0.042992569506168365, "lr": 9.565439672801636e-06, "objective/entropy": -162.92010498046875, "objective/kl": 26.902143478393555, "objective/non_score_reward": -1.3451071977615356, "objective/rlhf_reward": -4.001826503363949, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.27599573135376, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 680, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998090147972107 }, { "episode": 10912, "epoch": 0.06537968388635247, "loss/policy_avg": 0.20157073438167572, "lr": 9.564800613496933e-06, "objective/entropy": -265.3901672363281, "objective/kl": 29.956632614135742, "objective/non_score_reward": -1.4978315830230713, "objective/rlhf_reward": -3.868620397821937, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 68.22042846679688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 681, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982101917266846 }, { "episode": 10928, "epoch": 0.06547554852548201, "loss/policy_avg": 1.519484281539917, "lr": 9.56416155419223e-06, "objective/entropy": -127.62720489501953, "objective/kl": 23.382505416870117, "objective/non_score_reward": -1.1691253185272217, "objective/rlhf_reward": -2.2765009164810177, "objective/scores": 0.6, "policy/approxkl_avg": 17.878856658935547, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.419921875, "step": 682, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984099864959717 }, { "episode": 10944, "epoch": 0.06557141316461157, "loss/policy_avg": 0.3158057928085327, "lr": 9.563522494887527e-06, "objective/entropy": -190.45260620117188, "objective/kl": 25.518230438232422, "objective/non_score_reward": -1.275911569595337, "objective/rlhf_reward": -3.622693660672068, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 34.12330627441406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.63671875, "step": 683, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000800371170044 }, { "episode": 10960, "epoch": 0.06566727780374111, "loss/policy_avg": 1.1294161081314087, "lr": 9.562883435582822e-06, "objective/entropy": -107.20721435546875, "objective/kl": 32.379913330078125, "objective/non_score_reward": -1.6189957857131958, "objective/rlhf_reward": -5.13434737017694, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.272080421447754, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5234375, "step": 684, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998885154724121 }, { "episode": 10976, "epoch": 0.06576314244287067, "loss/policy_avg": 0.44281357526779175, "lr": 9.562244376278119e-06, "objective/entropy": -128.0640869140625, "objective/kl": 20.03044891357422, "objective/non_score_reward": -1.0015225410461426, "objective/rlhf_reward": -1.082371120096418, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.73418140411377, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.634765625, "step": 685, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999821424484253 }, { "episode": 10992, "epoch": 0.06585900708200021, "loss/policy_avg": 0.2683737576007843, "lr": 9.561605316973416e-06, "objective/entropy": -258.8201904296875, "objective/kl": 27.295347213745117, "objective/non_score_reward": -1.3647674322128296, "objective/rlhf_reward": -2.535350595356199, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.86362886428833, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71875, "step": 686, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981340169906616 }, { "episode": 11008, "epoch": 0.06595487172112977, "loss/policy_avg": -0.14624132215976715, "lr": 9.560966257668713e-06, "objective/entropy": -96.99462890625, "objective/kl": 30.466350555419922, "objective/non_score_reward": -1.523317575454712, "objective/rlhf_reward": -4.57749816158646, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.779112815856934, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.494140625, "step": 687, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981212615966797 }, { "episode": 11024, "epoch": 0.06605073636025931, "loss/policy_avg": 0.12842759490013123, "lr": 9.56032719836401e-06, "objective/entropy": -166.20689392089844, "objective/kl": 26.250516891479492, "objective/non_score_reward": -1.312525749206543, "objective/rlhf_reward": -2.8501029968261715, "objective/scores": 0.6, "policy/approxkl_avg": 7.160890102386475, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5703125, "step": 688, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990025758743286 }, { "episode": 11040, "epoch": 0.06614660099938886, "loss/policy_avg": 0.2923339009284973, "lr": 9.559688139059306e-06, "objective/entropy": -236.72100830078125, "objective/kl": 33.81795883178711, "objective/non_score_reward": -1.6908979415893555, "objective/rlhf_reward": -5.4219562321001575, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 16.3193359375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 689, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99735426902771 }, { "episode": 11056, "epoch": 0.0662424656385184, "loss/policy_avg": -0.10266150534152985, "lr": 9.559049079754601e-06, "objective/entropy": -85.62126159667969, "objective/kl": 31.331233978271484, "objective/non_score_reward": -1.5665616989135742, "objective/rlhf_reward": -4.143540324942146, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.518294811248779, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.794921875, "step": 690, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.006378650665283 }, { "episode": 11072, "epoch": 0.06633833027764796, "loss/policy_avg": 0.17208513617515564, "lr": 9.558410020449898e-06, "objective/entropy": -175.00662231445312, "objective/kl": 33.992698669433594, "objective/non_score_reward": -1.6996350288391113, "objective/rlhf_reward": -5.4392902490839194, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.03794860839844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.716796875, "step": 691, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989235401153564 }, { "episode": 11088, "epoch": 0.06643419491677752, "loss/policy_avg": 0.01335047371685505, "lr": 9.557770961145195e-06, "objective/entropy": -248.65049743652344, "objective/kl": 22.41885757446289, "objective/non_score_reward": -1.1209429502487183, "objective/rlhf_reward": -2.536360512452062, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.7352328300476074, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.673828125, "step": 692, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0011818408966064 }, { "episode": 11104, "epoch": 0.06653005955590706, "loss/policy_avg": 0.14417897164821625, "lr": 9.557131901840492e-06, "objective/entropy": -218.454345703125, "objective/kl": 15.86509895324707, "objective/non_score_reward": -0.7932549715042114, "objective/rlhf_reward": 1.226980143785477, "objective/scores": 1.1, "policy/approxkl_avg": 1.0328912734985352, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.640625, "step": 693, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0003151893615723 }, { "episode": 11120, "epoch": 0.06662592419503661, "loss/policy_avg": 0.09597369283437729, "lr": 9.556492842535789e-06, "objective/entropy": -175.68487548828125, "objective/kl": 32.48929977416992, "objective/non_score_reward": -1.624464988708496, "objective/rlhf_reward": -2.0978601336479183, "objective/scores": 1.1, "policy/approxkl_avg": 3.689056396484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.705078125, "step": 694, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998678207397461 }, { "episode": 11136, "epoch": 0.06672178883416616, "loss/policy_avg": -0.004386359825730324, "lr": 9.555853783231084e-06, "objective/entropy": 122.54474639892578, "objective/kl": 42.134315490722656, "objective/non_score_reward": -2.106715679168701, "objective/rlhf_reward": -6.822743091646748, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.307683944702148, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7421875, "step": 695, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999419450759888 }, { "episode": 11152, "epoch": 0.06681765347329571, "loss/policy_avg": 0.3615373373031616, "lr": 9.555214723926381e-06, "objective/entropy": -260.84075927734375, "objective/kl": 35.725467681884766, "objective/non_score_reward": -1.7862732410430908, "objective/rlhf_reward": -5.664140108044505, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 45.438873291015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.685546875, "step": 696, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996116280555725 }, { "episode": 11168, "epoch": 0.06691351811242525, "loss/policy_avg": 0.24602335691452026, "lr": 9.554575664621678e-06, "objective/entropy": -71.92741394042969, "objective/kl": 30.083784103393555, "objective/non_score_reward": -1.5041892528533936, "objective/rlhf_reward": -4.6575073835596275, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 5.438946723937988, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4765625, "step": 697, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998178482055664 }, { "episode": 11184, "epoch": 0.06700938275155481, "loss/policy_avg": 0.034039177000522614, "lr": 9.553936605316975e-06, "objective/entropy": -198.67774963378906, "objective/kl": 23.375925064086914, "objective/non_score_reward": -1.1687963008880615, "objective/rlhf_reward": -1.7514660700571265, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.5530495643615723, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.53125, "step": 698, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00034236907959 }, { "episode": 11200, "epoch": 0.06710524739068435, "loss/policy_avg": 0.5306535959243774, "lr": 9.553297546012272e-06, "objective/entropy": -143.43771362304688, "objective/kl": 35.411888122558594, "objective/non_score_reward": -1.7705943584442139, "objective/rlhf_reward": -5.63177965125595, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.416120529174805, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66796875, "step": 699, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994237422943115 }, { "episode": 11216, "epoch": 0.06720111202981391, "loss/policy_avg": 0.2092888504266739, "lr": 9.552658486707569e-06, "objective/entropy": -169.036376953125, "objective/kl": 30.64543914794922, "objective/non_score_reward": -1.5322721004486084, "objective/rlhf_reward": -1.7290880441665646, "objective/scores": 1.1, "policy/approxkl_avg": 132.6121063232422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.564453125, "step": 700, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992992877960205 }, { "episode": 11232, "epoch": 0.06729697666894345, "loss/policy_avg": 0.2553282380104065, "lr": 9.552019427402864e-06, "objective/entropy": -145.8370361328125, "objective/kl": 31.58509063720703, "objective/non_score_reward": -1.5792546272277832, "objective/rlhf_reward": -4.760759084430292, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 23.342622756958008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 701, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0007388591766357 }, { "episode": 11248, "epoch": 0.067392841308073, "loss/policy_avg": 0.1272473782300949, "lr": 9.55138036809816e-06, "objective/entropy": -283.0919494628906, "objective/kl": 18.825233459472656, "objective/non_score_reward": -0.9412617683410645, "objective/rlhf_reward": -2.4057970878824424, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.5947492122650146, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 702, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999638557434082 }, { "episode": 11264, "epoch": 0.06748870594720255, "loss/policy_avg": 0.2034430205821991, "lr": 9.550741308793456e-06, "objective/entropy": -274.40478515625, "objective/kl": 20.724695205688477, "objective/non_score_reward": -1.0362348556518555, "objective/rlhf_reward": -1.221220110298368, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.738941192626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.61328125, "step": 703, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997761607170105 }, { "episode": 11280, "epoch": 0.0675845705863321, "loss/policy_avg": 0.7114033699035645, "lr": 9.550102249488753e-06, "objective/entropy": -135.6627960205078, "objective/kl": 27.718311309814453, "objective/non_score_reward": -1.3859155178070068, "objective/rlhf_reward": -3.5962508422898605, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 32.94233703613281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 704, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985225200653076 }, { "episode": 11296, "epoch": 0.06768043522546165, "loss/policy_avg": -0.08856553584337234, "lr": 9.54946319018405e-06, "objective/entropy": -172.419921875, "objective/kl": 31.078826904296875, "objective/non_score_reward": -1.553941249847412, "objective/rlhf_reward": -4.765167097659454, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 27.00151824951172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.76171875, "step": 705, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003095865249634 }, { "episode": 11312, "epoch": 0.0677762998645912, "loss/policy_avg": -0.1016867533326149, "lr": 9.548824130879346e-06, "objective/entropy": -186.52476501464844, "objective/kl": 30.371601104736328, "objective/non_score_reward": -1.5185801982879639, "objective/rlhf_reward": -4.593368175442576, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 7.805020332336426, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.68359375, "step": 706, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0018138885498047 }, { "episode": 11328, "epoch": 0.06787216450372074, "loss/policy_avg": 0.3950710892677307, "lr": 9.548185071574643e-06, "objective/entropy": -169.30099487304688, "objective/kl": 26.604206085205078, "objective/non_score_reward": -1.3302103281021118, "objective/rlhf_reward": -3.9422390843308985, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.9309802055358887, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.650390625, "step": 707, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0003857612609863 }, { "episode": 11344, "epoch": 0.0679680291428503, "loss/policy_avg": 0.15957045555114746, "lr": 9.547546012269938e-06, "objective/entropy": -152.48211669921875, "objective/kl": 28.93355941772461, "objective/non_score_reward": -1.4466780424118042, "objective/rlhf_reward": -4.124852543295012, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 30.355663299560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.728515625, "step": 708, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9971683025360107 }, { "episode": 11360, "epoch": 0.06806389378197984, "loss/policy_avg": 0.1635814905166626, "lr": 9.546906952965235e-06, "objective/entropy": -225.05284118652344, "objective/kl": 32.07009506225586, "objective/non_score_reward": -1.6035047769546509, "objective/rlhf_reward": -5.088506314784212, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 25.63396453857422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 709, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9975805282592773 }, { "episode": 11376, "epoch": 0.0681597584211094, "loss/policy_avg": 0.22918304800987244, "lr": 9.546267893660532e-06, "objective/entropy": -245.11099243164062, "objective/kl": 31.21074867248535, "objective/non_score_reward": -1.560537576675415, "objective/rlhf_reward": -4.5802904419308765, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.6522216796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658203125, "step": 710, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971773624420166 }, { "episode": 11392, "epoch": 0.06825562306023894, "loss/policy_avg": -0.15267148613929749, "lr": 9.545628834355829e-06, "objective/entropy": -26.006134033203125, "objective/kl": 25.76430320739746, "objective/non_score_reward": -1.288215160369873, "objective/rlhf_reward": -3.2054496509599044, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.9515511989593506, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.75, "step": 711, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0021400451660156 }, { "episode": 11408, "epoch": 0.0683514876993685, "loss/policy_avg": 0.03201477974653244, "lr": 9.544989775051126e-06, "objective/entropy": -229.9574737548828, "objective/kl": 31.691633224487305, "objective/non_score_reward": -1.5845816135406494, "objective/rlhf_reward": -4.887728492827758, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 81.25225830078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.74609375, "step": 712, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0012998580932617 }, { "episode": 11424, "epoch": 0.06844735233849804, "loss/policy_avg": 0.5598920583724976, "lr": 9.544350715746423e-06, "objective/entropy": -198.39407348632812, "objective/kl": 22.02547264099121, "objective/non_score_reward": -1.1012736558914185, "objective/rlhf_reward": -3.045844846700115, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.494403839111328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6640625, "step": 713, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001235008239746 }, { "episode": 11440, "epoch": 0.0685432169776276, "loss/policy_avg": 0.14270013570785522, "lr": 9.543711656441718e-06, "objective/entropy": -281.67730712890625, "objective/kl": 30.167518615722656, "objective/non_score_reward": -1.5083760023117065, "objective/rlhf_reward": -4.517732465060886, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 42.272212982177734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6171875, "step": 714, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981521368026733 }, { "episode": 11456, "epoch": 0.06863908161675714, "loss/policy_avg": 0.23854002356529236, "lr": 9.543072597137015e-06, "objective/entropy": -205.70501708984375, "objective/kl": 26.037616729736328, "objective/non_score_reward": -1.3018807172775269, "objective/rlhf_reward": -3.603402886454182, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 21.1671085357666, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736328125, "step": 715, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999593734741211 }, { "episode": 11472, "epoch": 0.06873494625588669, "loss/policy_avg": 0.25810641050338745, "lr": 9.542433537832312e-06, "objective/entropy": -202.4583740234375, "objective/kl": 26.777297973632812, "objective/non_score_reward": -1.338865041732788, "objective/rlhf_reward": -3.7513400054612926, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.448478698730469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7890625, "step": 716, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999415636062622 }, { "episode": 11488, "epoch": 0.06883081089501623, "loss/policy_avg": 0.16866181790828705, "lr": 9.541794478527609e-06, "objective/entropy": -174.37855529785156, "objective/kl": 34.941444396972656, "objective/non_score_reward": -1.7470722198486328, "objective/rlhf_reward": -5.43202957412298, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 0.9149700403213501, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 717, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000814437866211 }, { "episode": 11504, "epoch": 0.06892667553414579, "loss/policy_avg": 0.20718123018741608, "lr": 9.541155419222906e-06, "objective/entropy": -75.93595123291016, "objective/kl": 37.52787780761719, "objective/non_score_reward": -1.8763937950134277, "objective/rlhf_reward": -6.024622860367655, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.859286308288574, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 718, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990894794464111 }, { "episode": 11520, "epoch": 0.06902254017327533, "loss/policy_avg": -0.14078834652900696, "lr": 9.5405163599182e-06, "objective/entropy": -111.06301879882812, "objective/kl": 37.833980560302734, "objective/non_score_reward": -1.8916990756988525, "objective/rlhf_reward": -5.44408971287397, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.0138969421386719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.646484375, "step": 719, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016212463378906 }, { "episode": 11536, "epoch": 0.06911840481240489, "loss/policy_avg": -0.02326921373605728, "lr": 9.539877300613498e-06, "objective/entropy": -7.474525451660156, "objective/kl": 37.21611785888672, "objective/non_score_reward": -1.860805869102478, "objective/rlhf_reward": -7.443223357200623, "objective/scores": 0.0, "policy/approxkl_avg": 0.989769458770752, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5546875, "step": 720, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0015361309051514 }, { "episode": 11552, "epoch": 0.06921426945153443, "loss/policy_avg": 0.9960123896598816, "lr": 9.539238241308795e-06, "objective/entropy": -102.21640014648438, "objective/kl": 29.624881744384766, "objective/non_score_reward": -1.4812440872192383, "objective/rlhf_reward": -3.9775650007294967, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.5700416564941406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.568359375, "step": 721, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000302314758301 }, { "episode": 11568, "epoch": 0.06931013409066399, "loss/policy_avg": -0.022494332864880562, "lr": 9.538599182004091e-06, "objective/entropy": -97.00556182861328, "objective/kl": 34.23220443725586, "objective/non_score_reward": -1.7116100788116455, "objective/rlhf_reward": -5.520927820235414, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.0028605461120605, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.625, "step": 722, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014429092407227 }, { "episode": 11584, "epoch": 0.06940599872979353, "loss/policy_avg": 0.2970792055130005, "lr": 9.537960122699387e-06, "objective/entropy": -218.43130493164062, "objective/kl": 23.677339553833008, "objective/non_score_reward": -1.1838669776916504, "objective/rlhf_reward": -0.335467970371246, "objective/scores": 1.1, "policy/approxkl_avg": 35.85502624511719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.744140625, "step": 723, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9957175254821777 }, { "episode": 11600, "epoch": 0.06950186336892308, "loss/policy_avg": 0.09062906354665756, "lr": 9.537321063394683e-06, "objective/entropy": -145.62179565429688, "objective/kl": 19.510597229003906, "objective/non_score_reward": -0.9755299091339111, "objective/rlhf_reward": -2.560484102278381, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.657525539398193, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.671875, "step": 724, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002211809158325 }, { "episode": 11616, "epoch": 0.06959772800805263, "loss/policy_avg": 0.5650205612182617, "lr": 9.53668200408998e-06, "objective/entropy": -189.58197021484375, "objective/kl": 22.43151092529297, "objective/non_score_reward": -1.1215755939483643, "objective/rlhf_reward": -3.1446669011408384, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 16.189781188964844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69140625, "step": 725, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978678226470947 }, { "episode": 11632, "epoch": 0.06969359264718218, "loss/policy_avg": 0.10538655519485474, "lr": 9.536042944785277e-06, "objective/entropy": -262.17254638671875, "objective/kl": 21.21435546875, "objective/non_score_reward": -1.0607177019119263, "objective/rlhf_reward": -2.1201648137727123, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.554556846618652, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7578125, "step": 726, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994568824768066 }, { "episode": 11648, "epoch": 0.06978945728631172, "loss/policy_avg": 0.08264347910881042, "lr": 9.535403885480572e-06, "objective/entropy": -144.35389709472656, "objective/kl": 23.849288940429688, "objective/non_score_reward": -1.1924644708633423, "objective/rlhf_reward": -3.2889052657440896, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 0.2577582895755768, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60546875, "step": 727, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009419918060303 }, { "episode": 11664, "epoch": 0.06988532192544128, "loss/policy_avg": -0.11442309617996216, "lr": 9.53476482617587e-06, "objective/entropy": -161.91555786132812, "objective/kl": 29.32978057861328, "objective/non_score_reward": -1.4664889574050903, "objective/rlhf_reward": -4.132622496287028, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.162350654602051, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 728, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0256590843200684 }, { "episode": 11680, "epoch": 0.06998118656457082, "loss/policy_avg": 0.15979725122451782, "lr": 9.534125766871166e-06, "objective/entropy": -46.392860412597656, "objective/kl": 34.71672058105469, "objective/non_score_reward": -1.7358360290527344, "objective/rlhf_reward": -5.601708403139739, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 26.208736419677734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.837890625, "step": 729, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9947504997253418 }, { "episode": 11696, "epoch": 0.07007705120370038, "loss/policy_avg": 0.01945001818239689, "lr": 9.533486707566463e-06, "objective/entropy": -199.32308959960938, "objective/kl": 20.052722930908203, "objective/non_score_reward": -1.002636194229126, "objective/rlhf_reward": -2.586712677677242, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.049467086791992, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.623046875, "step": 730, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999655485153198 }, { "episode": 11712, "epoch": 0.07017291584282992, "loss/policy_avg": 0.22911685705184937, "lr": 9.53284764826176e-06, "objective/entropy": -199.43820190429688, "objective/kl": 29.375852584838867, "objective/non_score_reward": -1.4687926769256592, "objective/rlhf_reward": -3.4751707077026364, "objective/scores": 0.6, "policy/approxkl_avg": 1.4132235050201416, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 731, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989359378814697 }, { "episode": 11728, "epoch": 0.07026878048195948, "loss/policy_avg": 0.045667171478271484, "lr": 9.532208588957055e-06, "objective/entropy": -156.77005004882812, "objective/kl": 28.574951171875, "objective/non_score_reward": -1.4287474155426025, "objective/rlhf_reward": -4.110870037142353, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.299884796142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.587890625, "step": 732, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989423751831055 }, { "episode": 11744, "epoch": 0.07036464512108902, "loss/policy_avg": -0.07621235400438309, "lr": 9.531569529652352e-06, "objective/entropy": -211.5927734375, "objective/kl": 25.139881134033203, "objective/non_score_reward": -1.2569940090179443, "objective/rlhf_reward": -3.2031475856629124, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.0796079635620117, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.548828125, "step": 733, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00140118598938 }, { "episode": 11760, "epoch": 0.07046050976021857, "loss/policy_avg": 0.3665542006492615, "lr": 9.530930470347649e-06, "objective/entropy": -136.42066955566406, "objective/kl": 28.39642333984375, "objective/non_score_reward": -1.4198211431503296, "objective/rlhf_reward": -5.679284453392029, "objective/scores": 0.0, "policy/approxkl_avg": 2.8006393909454346, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.673828125, "step": 734, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988253116607666 }, { "episode": 11776, "epoch": 0.07055637439934812, "loss/policy_avg": -0.16624964773654938, "lr": 9.530291411042946e-06, "objective/entropy": -172.16896057128906, "objective/kl": 32.62467956542969, "objective/non_score_reward": -1.6312339305877686, "objective/rlhf_reward": -5.183300068884521, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.176142692565918, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 735, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0014700889587402 }, { "episode": 11792, "epoch": 0.07065223903847767, "loss/policy_avg": -0.01751716434955597, "lr": 9.529652351738243e-06, "objective/entropy": -244.469970703125, "objective/kl": 21.34896469116211, "objective/non_score_reward": -1.0674481391906738, "objective/rlhf_reward": -1.346073900104734, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.2310829162597656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.73828125, "step": 736, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0015974044799805 }, { "episode": 11808, "epoch": 0.07074810367760721, "loss/policy_avg": -0.13727766275405884, "lr": 9.52901329243354e-06, "objective/entropy": -152.7752227783203, "objective/kl": 30.841548919677734, "objective/non_score_reward": -1.5420774221420288, "objective/rlhf_reward": -1.7683096885681149, "objective/scores": 1.1, "policy/approxkl_avg": 2.1432337760925293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.732421875, "step": 737, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000338315963745 }, { "episode": 11824, "epoch": 0.07084396831673677, "loss/policy_avg": 0.24724145233631134, "lr": 9.528374233128835e-06, "objective/entropy": -249.35003662109375, "objective/kl": 41.97819519042969, "objective/non_score_reward": -2.098909854888916, "objective/rlhf_reward": -6.945041160197601, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 14.357757568359375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7421875, "step": 738, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984996318817139 }, { "episode": 11840, "epoch": 0.07093983295586631, "loss/policy_avg": -0.1166142150759697, "lr": 9.527735173824132e-06, "objective/entropy": 16.65149688720703, "objective/kl": 28.71587371826172, "objective/non_score_reward": -1.4357936382293701, "objective/rlhf_reward": -4.401539257078796, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.7607579231262207, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 739, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990723133087158 }, { "episode": 11856, "epoch": 0.07103569759499587, "loss/policy_avg": 0.035362888127565384, "lr": 9.527096114519428e-06, "objective/entropy": -227.2210235595703, "objective/kl": 27.349641799926758, "objective/non_score_reward": -1.36748206615448, "objective/rlhf_reward": -3.865808401171284, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 9.06348705291748, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6640625, "step": 740, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996118545532227 }, { "episode": 11872, "epoch": 0.07113156223412541, "loss/policy_avg": 0.31989267468452454, "lr": 9.526457055214725e-06, "objective/entropy": -213.7845458984375, "objective/kl": 34.27381896972656, "objective/non_score_reward": -1.713691234588623, "objective/rlhf_reward": -4.732058527246986, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 26.892040252685547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.67578125, "step": 741, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0020368099212646 }, { "episode": 11888, "epoch": 0.07122742687325496, "loss/policy_avg": 0.18080441653728485, "lr": 9.525817995910022e-06, "objective/entropy": -164.34909057617188, "objective/kl": 29.15081024169922, "objective/non_score_reward": -1.457540512084961, "objective/rlhf_reward": -4.379564206214294, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 19.9893798828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 742, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979908466339111 }, { "episode": 11904, "epoch": 0.0713232915123845, "loss/policy_avg": 0.06947439908981323, "lr": 9.525178936605317e-06, "objective/entropy": -35.78013610839844, "objective/kl": 30.88395118713379, "objective/non_score_reward": -1.5441975593566895, "objective/rlhf_reward": -4.620531051364496, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.903773307800293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.658203125, "step": 743, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000925064086914 }, { "episode": 11920, "epoch": 0.07141915615151406, "loss/policy_avg": 0.4868197441101074, "lr": 9.524539877300614e-06, "objective/entropy": -185.67857360839844, "objective/kl": 30.794139862060547, "objective/non_score_reward": -1.5397069454193115, "objective/rlhf_reward": -4.833315048247499, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 23.752399444580078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5703125, "step": 744, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9970831871032715 }, { "episode": 11936, "epoch": 0.0715150207906436, "loss/policy_avg": 0.4937871992588043, "lr": 9.52390081799591e-06, "objective/entropy": -196.15248107910156, "objective/kl": 32.130393981933594, "objective/non_score_reward": -1.6065199375152588, "objective/rlhf_reward": -5.084443858175903, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.993836402893066, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.509765625, "step": 745, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994457960128784 }, { "episode": 11952, "epoch": 0.07161088542977316, "loss/policy_avg": 0.10673123598098755, "lr": 9.523261758691206e-06, "objective/entropy": -74.68463134765625, "objective/kl": 34.281944274902344, "objective/non_score_reward": -1.7140971422195435, "objective/rlhf_reward": -3.9326697334062786, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.657389640808105, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4892578125, "step": 746, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998197078704834 }, { "episode": 11968, "epoch": 0.0717067500689027, "loss/policy_avg": 0.08303539454936981, "lr": 9.522622699386503e-06, "objective/entropy": -234.022705078125, "objective/kl": 26.956684112548828, "objective/non_score_reward": -1.3478342294692993, "objective/rlhf_reward": -3.26863074518827, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 9.614282608032227, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.77734375, "step": 747, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977872371673584 }, { "episode": 11984, "epoch": 0.07180261470803226, "loss/policy_avg": 0.006275704130530357, "lr": 9.5219836400818e-06, "objective/entropy": -179.78111267089844, "objective/kl": 24.191059112548828, "objective/non_score_reward": -1.2095528841018677, "objective/rlhf_reward": -3.4789615509256553, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.5060572624206543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48828125, "step": 748, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001844882965088 }, { "episode": 12000, "epoch": 0.07189847934716181, "loss/policy_avg": 0.05262988060712814, "lr": 9.521344580777097e-06, "objective/entropy": -61.52648162841797, "objective/kl": 24.345882415771484, "objective/non_score_reward": -1.2172942161560059, "objective/rlhf_reward": -2.7464705727258067, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 26.343456268310547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 749, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996674060821533 }, { "episode": 12016, "epoch": 0.07199434398629136, "loss/policy_avg": 0.1489763706922531, "lr": 9.520705521472394e-06, "objective/entropy": -179.14523315429688, "objective/kl": 25.692440032958984, "objective/non_score_reward": -1.284622073173523, "objective/rlhf_reward": -3.19107700415128, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.4589556455612183, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.517578125, "step": 750, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989516735076904 }, { "episode": 12032, "epoch": 0.07209020862542091, "loss/policy_avg": 0.06708867847919464, "lr": 9.520066462167689e-06, "objective/entropy": -56.47541427612305, "objective/kl": 42.95630645751953, "objective/non_score_reward": -2.147815227508545, "objective/rlhf_reward": -6.7664322808113795, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 6.856327056884766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 751, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9969167709350586 }, { "episode": 12048, "epoch": 0.07218607326455045, "loss/policy_avg": 0.3973958194255829, "lr": 9.519427402862986e-06, "objective/entropy": -244.11431884765625, "objective/kl": 25.62933921813965, "objective/non_score_reward": -1.2814669609069824, "objective/rlhf_reward": -3.301039035591196, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 48.01885223388672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.671875, "step": 752, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983410835266113 }, { "episode": 12064, "epoch": 0.07228193790368001, "loss/policy_avg": 0.016892850399017334, "lr": 9.518788343558283e-06, "objective/entropy": -233.80613708496094, "objective/kl": 33.0050048828125, "objective/non_score_reward": -1.6502504348754883, "objective/rlhf_reward": -4.653590510563786, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 15.416328430175781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.560546875, "step": 753, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999025821685791 }, { "episode": 12080, "epoch": 0.07237780254280955, "loss/policy_avg": 0.10087546706199646, "lr": 9.51814928425358e-06, "objective/entropy": -283.5254211425781, "objective/kl": 25.051952362060547, "objective/non_score_reward": -1.2525975704193115, "objective/rlhf_reward": -2.6103905797004696, "objective/scores": 0.6, "policy/approxkl_avg": 19.29462432861328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 754, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981729984283447 }, { "episode": 12096, "epoch": 0.07247366718193911, "loss/policy_avg": 0.24108710885047913, "lr": 9.517510224948877e-06, "objective/entropy": -211.13575744628906, "objective/kl": 35.66078186035156, "objective/non_score_reward": -1.7830390930175781, "objective/rlhf_reward": -5.708324392040339, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.15980339050293, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60546875, "step": 755, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9982430934906006 }, { "episode": 12112, "epoch": 0.07256953182106865, "loss/policy_avg": 0.6718421578407288, "lr": 9.516871165644172e-06, "objective/entropy": -148.00872802734375, "objective/kl": 30.348403930664062, "objective/non_score_reward": -1.5174202919006348, "objective/rlhf_reward": -4.669681048393249, "objective/scores": 0.35, "policy/approxkl_avg": 24.264657974243164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 756, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989105463027954 }, { "episode": 12128, "epoch": 0.0726653964601982, "loss/policy_avg": 0.17684796452522278, "lr": 9.516232106339469e-06, "objective/entropy": -220.75283813476562, "objective/kl": 18.81310272216797, "objective/non_score_reward": -0.9406551122665405, "objective/rlhf_reward": -2.3840183998025477, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.753880500793457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.498046875, "step": 757, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982280731201172 }, { "episode": 12144, "epoch": 0.07276126109932775, "loss/policy_avg": 0.5594636797904968, "lr": 9.515593047034765e-06, "objective/entropy": -182.7705535888672, "objective/kl": 19.829849243164062, "objective/non_score_reward": -0.991492509841919, "objective/rlhf_reward": 0.4340301394462589, "objective/scores": 1.1, "policy/approxkl_avg": 28.46674346923828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.572265625, "step": 758, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991915225982666 }, { "episode": 12160, "epoch": 0.0728571257384573, "loss/policy_avg": 0.6502060890197754, "lr": 9.514953987730062e-06, "objective/entropy": -112.33629608154297, "objective/kl": 39.52580642700195, "objective/non_score_reward": -1.9762903451919556, "objective/rlhf_reward": -5.78245514847425, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.3783769607543945, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 759, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975318908691406 }, { "episode": 12176, "epoch": 0.07295299037758685, "loss/policy_avg": 0.88495934009552, "lr": 9.51431492842536e-06, "objective/entropy": -201.14666748046875, "objective/kl": 27.90923309326172, "objective/non_score_reward": -1.3954615592956543, "objective/rlhf_reward": -4.240210583716064, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.1258697509765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 760, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0020923614501953 }, { "episode": 12192, "epoch": 0.0730488550167164, "loss/policy_avg": 0.3271714448928833, "lr": 9.513675869120656e-06, "objective/entropy": -236.55361938476562, "objective/kl": 28.77971839904785, "objective/non_score_reward": -1.43898606300354, "objective/rlhf_reward": -4.35594413280487, "objective/scores": 0.35, "policy/approxkl_avg": 5.469420909881592, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6171875, "step": 761, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997791051864624 }, { "episode": 12208, "epoch": 0.07314471965584594, "loss/policy_avg": 0.0032866448163986206, "lr": 9.513036809815951e-06, "objective/entropy": -200.22227478027344, "objective/kl": 28.73204803466797, "objective/non_score_reward": -1.4366023540496826, "objective/rlhf_reward": -4.142289552752095, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.5752939581871033, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.587890625, "step": 762, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0023627281188965 }, { "episode": 12224, "epoch": 0.0732405842949755, "loss/policy_avg": 0.21868771314620972, "lr": 9.512397750511248e-06, "objective/entropy": -187.9447784423828, "objective/kl": 20.44854736328125, "objective/non_score_reward": -1.0224274396896362, "objective/rlhf_reward": -2.5739379761540255, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 11.789055824279785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 763, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979248046875 }, { "episode": 12240, "epoch": 0.07333644893410504, "loss/policy_avg": 0.3879333734512329, "lr": 9.511758691206545e-06, "objective/entropy": -267.96685791015625, "objective/kl": 28.91057586669922, "objective/non_score_reward": -1.4455287456512451, "objective/rlhf_reward": -3.8347037536668136, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.813044548034668, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6953125, "step": 764, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0009827613830566 }, { "episode": 12256, "epoch": 0.0734323135732346, "loss/policy_avg": 0.06569409370422363, "lr": 9.511119631901842e-06, "objective/entropy": -207.83352661132812, "objective/kl": 24.208805084228516, "objective/non_score_reward": -1.2104402780532837, "objective/rlhf_reward": -3.2855019261508733, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.039762258529663, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.658203125, "step": 765, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0008108615875244 }, { "episode": 12272, "epoch": 0.07352817821236414, "loss/policy_avg": 0.9109029769897461, "lr": 9.510480572597139e-06, "objective/entropy": -85.82101440429688, "objective/kl": 31.18517303466797, "objective/non_score_reward": -1.5592585802078247, "objective/rlhf_reward": -4.50370092789332, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.987689018249512, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.56640625, "step": 766, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997962236404419 }, { "episode": 12288, "epoch": 0.0736240428514937, "loss/policy_avg": 0.44006603956222534, "lr": 9.509841513292434e-06, "objective/entropy": -254.5596923828125, "objective/kl": 26.123559951782227, "objective/non_score_reward": -1.3061779737472534, "objective/rlhf_reward": -3.6684524109035284, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 13.005337715148926, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.552734375, "step": 767, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978489875793457 }, { "episode": 12304, "epoch": 0.07371990749062324, "loss/policy_avg": 0.14191022515296936, "lr": 9.509202453987731e-06, "objective/entropy": -185.1569061279297, "objective/kl": 38.093666076660156, "objective/non_score_reward": -1.9046835899353027, "objective/rlhf_reward": -7.618734002113342, "objective/scores": 0.0, "policy/approxkl_avg": 60.80290603637695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 768, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9955925941467285 }, { "episode": 12320, "epoch": 0.0738157721297528, "loss/policy_avg": -0.31537145376205444, "lr": 9.508563394683026e-06, "objective/entropy": -164.9215087890625, "objective/kl": 30.594449996948242, "objective/non_score_reward": -1.5297224521636963, "objective/rlhf_reward": -4.63793725055015, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.2754226922988892, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6171875, "step": 769, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001909017562866 }, { "episode": 12336, "epoch": 0.07391163676888234, "loss/policy_avg": 0.034731436520814896, "lr": 9.507924335378323e-06, "objective/entropy": -200.43959045410156, "objective/kl": 36.4830436706543, "objective/non_score_reward": -1.8241522312164307, "objective/rlhf_reward": -5.8727765872078805, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.3153905868530273, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 770, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0031118392944336 }, { "episode": 12352, "epoch": 0.07400750140801189, "loss/policy_avg": 0.29965466260910034, "lr": 9.50728527607362e-06, "objective/entropy": -168.58261108398438, "objective/kl": 34.881736755371094, "objective/non_score_reward": -1.7440869808197021, "objective/rlhf_reward": -5.314488296926605, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 12.419918060302734, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.744140625, "step": 771, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999511957168579 }, { "episode": 12368, "epoch": 0.07410336604714143, "loss/policy_avg": 0.5840628743171692, "lr": 9.506646216768917e-06, "objective/entropy": -149.50210571289062, "objective/kl": 26.40768051147461, "objective/non_score_reward": -1.3203840255737305, "objective/rlhf_reward": -3.8005837230042214, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 10.453241348266602, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.560546875, "step": 772, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990813732147217 }, { "episode": 12384, "epoch": 0.07419923068627099, "loss/policy_avg": -0.20146791636943817, "lr": 9.506007157464214e-06, "objective/entropy": -206.66688537597656, "objective/kl": 25.146541595458984, "objective/non_score_reward": -1.2573271989822388, "objective/rlhf_reward": -5.029308795928955, "objective/scores": 0.0, "policy/approxkl_avg": 55.61228561401367, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.728515625, "step": 773, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993964433670044 }, { "episode": 12400, "epoch": 0.07429509532540053, "loss/policy_avg": 2.0998456478118896, "lr": 9.50536809815951e-06, "objective/entropy": -135.09249877929688, "objective/kl": 26.86371612548828, "objective/non_score_reward": -1.3431859016418457, "objective/rlhf_reward": -3.922145526023254, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 7.190234184265137, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.548828125, "step": 774, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000576972961426 }, { "episode": 12416, "epoch": 0.07439095996453009, "loss/policy_avg": 0.024284163489937782, "lr": 9.504729038854806e-06, "objective/entropy": -269.6484375, "objective/kl": 21.226428985595703, "objective/non_score_reward": -1.061321496963501, "objective/rlhf_reward": -2.7946879669145197, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.07242488861084, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.66796875, "step": 775, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999997615814209 }, { "episode": 12432, "epoch": 0.07448682460365963, "loss/policy_avg": 0.03317616134881973, "lr": 9.504089979550103e-06, "objective/entropy": -234.43389892578125, "objective/kl": 27.79866600036621, "objective/non_score_reward": -1.3899333477020264, "objective/rlhf_reward": -3.8264001766840616, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.3638486862182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 776, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996023178100586 }, { "episode": 12448, "epoch": 0.07458268924278919, "loss/policy_avg": 0.16213266551494598, "lr": 9.5034509202454e-06, "objective/entropy": -203.708740234375, "objective/kl": 38.612911224365234, "objective/non_score_reward": -1.9306457042694092, "objective/rlhf_reward": -6.271984438510284, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.698218584060669, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 777, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995825290679932 }, { "episode": 12464, "epoch": 0.07467855388191873, "loss/policy_avg": 0.2597602605819702, "lr": 9.502811860940696e-06, "objective/entropy": -250.4356231689453, "objective/kl": 30.581310272216797, "objective/non_score_reward": -1.529065489768982, "objective/rlhf_reward": -4.737660029021603, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 8.781853675842285, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.615234375, "step": 778, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989944696426392 }, { "episode": 12480, "epoch": 0.07477441852104828, "loss/policy_avg": -0.24061758816242218, "lr": 9.502172801635993e-06, "objective/entropy": -98.61205291748047, "objective/kl": 26.375612258911133, "objective/non_score_reward": -1.3187806606292725, "objective/rlhf_reward": -3.794169786389231, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.955351829528809, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.7265625, "step": 779, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.003558397293091 }, { "episode": 12496, "epoch": 0.07487028316017783, "loss/policy_avg": 0.48288995027542114, "lr": 9.50153374233129e-06, "objective/entropy": -230.7918701171875, "objective/kl": 37.52941131591797, "objective/non_score_reward": -1.8764704465866089, "objective/rlhf_reward": -6.024929526265025, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.408464431762695, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.68359375, "step": 780, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990174770355225 }, { "episode": 12512, "epoch": 0.07496614779930738, "loss/policy_avg": 0.27871203422546387, "lr": 9.500894683026585e-06, "objective/entropy": -159.85903930664062, "objective/kl": 25.038909912109375, "objective/non_score_reward": -1.2519454956054688, "objective/rlhf_reward": -2.607781863212585, "objective/scores": 0.6, "policy/approxkl_avg": 46.26438903808594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 781, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000070095062256 }, { "episode": 12528, "epoch": 0.07506201243843692, "loss/policy_avg": 0.06291055679321289, "lr": 9.500255623721882e-06, "objective/entropy": -163.0406494140625, "objective/kl": 27.101749420166016, "objective/non_score_reward": -1.3550875186920166, "objective/rlhf_reward": -4.061099970076961, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.61475658416748, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 782, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986028671264648 }, { "episode": 12544, "epoch": 0.07515787707756648, "loss/policy_avg": 0.07766500115394592, "lr": 9.499616564417179e-06, "objective/entropy": -264.68377685546875, "objective/kl": 26.38882827758789, "objective/non_score_reward": -1.319441556930542, "objective/rlhf_reward": -2.3540468558084697, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.816272735595703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 783, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993165731430054 }, { "episode": 12560, "epoch": 0.07525374171669602, "loss/policy_avg": -0.25779616832733154, "lr": 9.498977505112476e-06, "objective/entropy": -192.4373016357422, "objective/kl": 30.569807052612305, "objective/non_score_reward": -1.528490424156189, "objective/rlhf_reward": -4.5098417139688305, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.409776210784912, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.765625, "step": 784, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0031237602233887 }, { "episode": 12576, "epoch": 0.07534960635582558, "loss/policy_avg": -0.23182180523872375, "lr": 9.498338445807773e-06, "objective/entropy": -116.57367706298828, "objective/kl": 30.319534301757812, "objective/non_score_reward": -1.5159766674041748, "objective/rlhf_reward": -4.704657160972042, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.2308108806610107, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.3955078125, "step": 785, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001164436340332 }, { "episode": 12592, "epoch": 0.07544547099495512, "loss/policy_avg": 0.270114541053772, "lr": 9.497699386503068e-06, "objective/entropy": -213.6279296875, "objective/kl": 34.02395248413086, "objective/non_score_reward": -1.701197624206543, "objective/rlhf_reward": -3.8810713633310527, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.159467697143555, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.556640625, "step": 786, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999849557876587 }, { "episode": 12608, "epoch": 0.07554133563408467, "loss/policy_avg": 0.01593317836523056, "lr": 9.497060327198365e-06, "objective/entropy": -83.6307601928711, "objective/kl": 28.397233963012695, "objective/non_score_reward": -1.4198617935180664, "objective/rlhf_reward": -4.198494317944407, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 13.974614143371582, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.576171875, "step": 787, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974963665008545 }, { "episode": 12624, "epoch": 0.07563720027321422, "loss/policy_avg": 0.122782863676548, "lr": 9.496421267893662e-06, "objective/entropy": -66.27203369140625, "objective/kl": 20.0443115234375, "objective/non_score_reward": -1.0022156238555908, "objective/rlhf_reward": -2.6302602673448146, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.128955364227295, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.78125, "step": 788, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002767562866211 }, { "episode": 12640, "epoch": 0.07573306491234377, "loss/policy_avg": 0.06789802759885788, "lr": 9.495782208588959e-06, "objective/entropy": -174.1296844482422, "objective/kl": 28.25243377685547, "objective/non_score_reward": -1.4126217365264893, "objective/rlhf_reward": -3.988627438963042, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.132152557373047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.802734375, "step": 789, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0010154247283936 }, { "episode": 12656, "epoch": 0.07582892955147331, "loss/policy_avg": 0.1666814684867859, "lr": 9.495143149284254e-06, "objective/entropy": -226.70257568359375, "objective/kl": 28.976097106933594, "objective/non_score_reward": -1.4488048553466797, "objective/rlhf_reward": -4.371387500961391, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.0613138675689697, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62890625, "step": 790, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994125366210938 }, { "episode": 12672, "epoch": 0.07592479419060287, "loss/policy_avg": 0.1284073442220688, "lr": 9.49450408997955e-06, "objective/entropy": -215.84002685546875, "objective/kl": 28.486852645874023, "objective/non_score_reward": -1.4243427515029907, "objective/rlhf_reward": -4.355735114126831, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.659012317657471, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 791, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002176761627197 }, { "episode": 12688, "epoch": 0.07602065882973241, "loss/policy_avg": -0.04723303020000458, "lr": 9.493865030674848e-06, "objective/entropy": -227.61280822753906, "objective/kl": 28.772476196289062, "objective/non_score_reward": -1.4386236667633057, "objective/rlhf_reward": -2.830775891185972, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.8349313735961914, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.609375, "step": 792, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002286911010742 }, { "episode": 12704, "epoch": 0.07611652346886197, "loss/policy_avg": -0.01974731869995594, "lr": 9.493225971370144e-06, "objective/entropy": -168.45291137695312, "objective/kl": 32.674957275390625, "objective/non_score_reward": -1.633747935295105, "objective/rlhf_reward": -5.209478828936739, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 0.8098639249801636, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 793, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004916191101074 }, { "episode": 12720, "epoch": 0.07621238810799151, "loss/policy_avg": 0.3524478077888489, "lr": 9.49258691206544e-06, "objective/entropy": -170.04669189453125, "objective/kl": 35.1775016784668, "objective/non_score_reward": -1.7588751316070557, "objective/rlhf_reward": -5.479241101947382, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.70783805847168, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.658203125, "step": 794, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981024265289307 }, { "episode": 12736, "epoch": 0.07630825274712107, "loss/policy_avg": 0.14937232434749603, "lr": 9.491947852760736e-06, "objective/entropy": -258.00518798828125, "objective/kl": 30.382396697998047, "objective/non_score_reward": -1.5191197395324707, "objective/rlhf_reward": -4.472358975473957, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.522323608398438, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.68359375, "step": 795, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.00205659866333 }, { "episode": 12752, "epoch": 0.07640411738625061, "loss/policy_avg": 0.4101511240005493, "lr": 9.491308793456033e-06, "objective/entropy": -97.3719482421875, "objective/kl": 49.89447021484375, "objective/non_score_reward": -2.4947237968444824, "objective/rlhf_reward": -7.578894591331482, "objective/scores": 0.6, "policy/approxkl_avg": 19.377134323120117, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.498046875, "step": 796, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981412887573242 }, { "episode": 12768, "epoch": 0.07649998202538016, "loss/policy_avg": -0.0627971962094307, "lr": 9.49066973415133e-06, "objective/entropy": -110.8655776977539, "objective/kl": 44.73468017578125, "objective/non_score_reward": -2.23673415184021, "objective/rlhf_reward": -6.546936726570129, "objective/scores": 0.6, "policy/approxkl_avg": 5.804272651672363, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.806640625, "step": 797, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971017837524414 }, { "episode": 12784, "epoch": 0.0765958466645097, "loss/policy_avg": 0.3731452226638794, "lr": 9.490030674846627e-06, "objective/entropy": -15.07757568359375, "objective/kl": 24.15683364868164, "objective/non_score_reward": -1.2078416347503662, "objective/rlhf_reward": -2.4313664793968197, "objective/scores": 0.6, "policy/approxkl_avg": 5.745340347290039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.638671875, "step": 798, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993362426757812 }, { "episode": 12800, "epoch": 0.07669171130363926, "loss/policy_avg": 0.3336324691772461, "lr": 9.489391615541922e-06, "objective/entropy": -249.59414672851562, "objective/kl": 28.68617820739746, "objective/non_score_reward": -1.4343090057373047, "objective/rlhf_reward": -2.8135166510355205, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.9479708671569824, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.693359375, "step": 799, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993374347686768 }, { "episode": 12816, "epoch": 0.0767875759427688, "loss/policy_avg": 0.12261458486318588, "lr": 9.488752556237219e-06, "objective/entropy": -207.68580627441406, "objective/kl": 33.91386413574219, "objective/non_score_reward": -1.6956932544708252, "objective/rlhf_reward": -5.301820400174021, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 25.18114471435547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 800, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9971894025802612 }, { "episode": 12832, "epoch": 0.07688344058189836, "loss/policy_avg": 0.1192292720079422, "lr": 9.488113496932516e-06, "objective/entropy": -268.4300842285156, "objective/kl": 26.710205078125, "objective/non_score_reward": -1.3355103731155396, "objective/rlhf_reward": -4.000405719786316, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.064979553222656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.638671875, "step": 801, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994006156921387 }, { "episode": 12848, "epoch": 0.0769793052210279, "loss/policy_avg": 0.4274081587791443, "lr": 9.487474437627813e-06, "objective/entropy": -125.00625610351562, "objective/kl": 36.30561065673828, "objective/non_score_reward": -1.815280795097351, "objective/rlhf_reward": -5.901873194907589, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 9.215574264526367, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.55078125, "step": 802, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999779224395752 }, { "episode": 12864, "epoch": 0.07707516986015746, "loss/policy_avg": 0.02082793414592743, "lr": 9.48683537832311e-06, "objective/entropy": 49.048545837402344, "objective/kl": 31.830245971679688, "objective/non_score_reward": -1.5915122032165527, "objective/rlhf_reward": -4.915450672717437, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.6811680793762207, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4384765625, "step": 803, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994103908538818 }, { "episode": 12880, "epoch": 0.077171034499287, "loss/policy_avg": 0.1582624763250351, "lr": 9.486196319018407e-06, "objective/entropy": -110.25260925292969, "objective/kl": 31.00435447692871, "objective/non_score_reward": -1.550217866897583, "objective/rlhf_reward": -3.8008712291717526, "objective/scores": 0.6, "policy/approxkl_avg": 3.5253429412841797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55859375, "step": 804, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0027151107788086 }, { "episode": 12896, "epoch": 0.07726689913841656, "loss/policy_avg": 0.09249435365200043, "lr": 9.485557259713702e-06, "objective/entropy": -203.63662719726562, "objective/kl": 31.04816436767578, "objective/non_score_reward": -1.552408218383789, "objective/rlhf_reward": -4.547773247182952, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.3485993146896362, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 805, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999431848526001 }, { "episode": 12912, "epoch": 0.0773627637775461, "loss/policy_avg": 0.44563794136047363, "lr": 9.484918200408999e-06, "objective/entropy": -163.74508666992188, "objective/kl": 31.982746124267578, "objective/non_score_reward": -1.599137306213379, "objective/rlhf_reward": -3.472829972149107, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 87.72571563720703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 806, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001553535461426 }, { "episode": 12928, "epoch": 0.07745862841667565, "loss/policy_avg": -0.017649848014116287, "lr": 9.484279141104296e-06, "objective/entropy": -266.5451965332031, "objective/kl": 27.058134078979492, "objective/non_score_reward": -1.3529068231582642, "objective/rlhf_reward": -1.0116270542144772, "objective/scores": 1.1, "policy/approxkl_avg": 5.037982940673828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 807, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0039749145507812 }, { "episode": 12944, "epoch": 0.07755449305580521, "loss/policy_avg": 5.042888641357422, "lr": 9.483640081799592e-06, "objective/entropy": -212.65740966796875, "objective/kl": 24.790084838867188, "objective/non_score_reward": -1.2395042181015015, "objective/rlhf_reward": -3.4770642546967263, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 11.046760559082031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.533203125, "step": 808, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002431869506836 }, { "episode": 12960, "epoch": 0.07765035769493475, "loss/policy_avg": -0.07623002678155899, "lr": 9.48300102249489e-06, "objective/entropy": -167.7131805419922, "objective/kl": 31.204689025878906, "objective/non_score_reward": -1.5602343082427979, "objective/rlhf_reward": -4.790339152427062, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 5.110037803649902, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5234375, "step": 809, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989296197891235 }, { "episode": 12976, "epoch": 0.07774622233406431, "loss/policy_avg": 0.0697702169418335, "lr": 9.482361963190185e-06, "objective/entropy": -99.56057739257812, "objective/kl": 40.95980453491211, "objective/non_score_reward": -2.047990322113037, "objective/rlhf_reward": -5.268241856933805, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.0177828073501587, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 810, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999544620513916 }, { "episode": 12992, "epoch": 0.07784208697319385, "loss/policy_avg": 0.011765815317630768, "lr": 9.481722903885481e-06, "objective/entropy": -270.2078857421875, "objective/kl": 32.53266906738281, "objective/non_score_reward": -1.6266334056854248, "objective/rlhf_reward": -4.950274675098017, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 10.882495880126953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.685546875, "step": 811, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997708797454834 }, { "episode": 13008, "epoch": 0.0779379516123234, "loss/policy_avg": 0.4012794494628906, "lr": 9.481083844580777e-06, "objective/entropy": -139.22914123535156, "objective/kl": 37.05573272705078, "objective/non_score_reward": -1.8527867794036865, "objective/rlhf_reward": -5.586318249973367, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 210.83877563476562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6640625, "step": 812, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001636505126953 }, { "episode": 13024, "epoch": 0.07803381625145295, "loss/policy_avg": 0.2699980139732361, "lr": 9.480444785276073e-06, "objective/entropy": -196.59963989257812, "objective/kl": 30.699893951416016, "objective/non_score_reward": -1.5349947214126587, "objective/rlhf_reward": -3.7399788856506344, "objective/scores": 0.6, "policy/approxkl_avg": 2.332146167755127, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.619140625, "step": 813, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989255666732788 }, { "episode": 13040, "epoch": 0.0781296808905825, "loss/policy_avg": 0.20207370817661285, "lr": 9.47980572597137e-06, "objective/entropy": -267.2593994140625, "objective/kl": 33.34029006958008, "objective/non_score_reward": -1.6670145988464355, "objective/rlhf_reward": -5.342545185118837, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.632169246673584, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.564453125, "step": 814, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993088245391846 }, { "episode": 13056, "epoch": 0.07822554552971205, "loss/policy_avg": 0.1745888739824295, "lr": 9.479166666666667e-06, "objective/entropy": -108.20680236816406, "objective/kl": 35.203025817871094, "objective/non_score_reward": -1.7601512670516968, "objective/rlhf_reward": -5.484345762935236, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 8.32550048828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.736328125, "step": 815, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0030176639556885 }, { "episode": 13072, "epoch": 0.0783214101688416, "loss/policy_avg": 0.2600640654563904, "lr": 9.478527607361964e-06, "objective/entropy": -204.03048706054688, "objective/kl": 40.41114807128906, "objective/non_score_reward": -2.020557403564453, "objective/rlhf_reward": -6.74059360316339, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.140628814697266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 816, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000810623168945 }, { "episode": 13088, "epoch": 0.07841727480797114, "loss/policy_avg": 0.5273202061653137, "lr": 9.477888548057261e-06, "objective/entropy": -241.156494140625, "objective/kl": 24.541404724121094, "objective/non_score_reward": -1.2270702123641968, "objective/rlhf_reward": -3.246421401918517, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.965031623840332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.61328125, "step": 817, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990627765655518 }, { "episode": 13104, "epoch": 0.0785131394471007, "loss/policy_avg": -0.09151424467563629, "lr": 9.477249488752556e-06, "objective/entropy": -219.21754455566406, "objective/kl": 31.261905670166016, "objective/non_score_reward": -1.5630953311920166, "objective/rlhf_reward": -4.4275525763359775, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.8227334022521973, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.833984375, "step": 818, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0036370754241943 }, { "episode": 13120, "epoch": 0.07860900408623024, "loss/policy_avg": 0.13953115046024323, "lr": 9.476610429447853e-06, "objective/entropy": -186.8937530517578, "objective/kl": 27.69632339477539, "objective/non_score_reward": -1.3848161697387695, "objective/rlhf_reward": -3.1392647981643673, "objective/scores": 0.6, "policy/approxkl_avg": 3.2056455612182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 819, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.003466844558716 }, { "episode": 13136, "epoch": 0.0787048687253598, "loss/policy_avg": 0.6420396566390991, "lr": 9.47597137014315e-06, "objective/entropy": -134.00025939941406, "objective/kl": 22.993852615356445, "objective/non_score_reward": -1.1496926546096802, "objective/rlhf_reward": -2.651359389500554, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.607414722442627, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 820, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000748634338379 }, { "episode": 13152, "epoch": 0.07880073336448934, "loss/policy_avg": 0.08356916159391403, "lr": 9.475332310838447e-06, "objective/entropy": -189.72003173828125, "objective/kl": 26.506973266601562, "objective/non_score_reward": -1.3253486156463623, "objective/rlhf_reward": -3.959759166746765, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.290050029754639, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 821, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9971721172332764 }, { "episode": 13168, "epoch": 0.0788965980036189, "loss/policy_avg": 0.11917827278375626, "lr": 9.474693251533744e-06, "objective/entropy": -207.30722045898438, "objective/kl": 35.41877746582031, "objective/non_score_reward": -1.7709391117095947, "objective/rlhf_reward": -5.683756327629089, "objective/scores": 0.35, "policy/approxkl_avg": 6.870448112487793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 822, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9955227375030518 }, { "episode": 13184, "epoch": 0.07899246264274844, "loss/policy_avg": -0.3528624475002289, "lr": 9.474054192229039e-06, "objective/entropy": -138.19627380371094, "objective/kl": 27.491954803466797, "objective/non_score_reward": -1.3745976686477661, "objective/rlhf_reward": -4.156755199938446, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.025694727897644, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.52734375, "step": 823, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0017786026000977 }, { "episode": 13200, "epoch": 0.079088327281878, "loss/policy_avg": 0.3965766727924347, "lr": 9.473415132924336e-06, "objective/entropy": -244.5587921142578, "objective/kl": 28.361434936523438, "objective/non_score_reward": -1.4180717468261719, "objective/rlhf_reward": -3.549580337778602, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 22.821792602539062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.53515625, "step": 824, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997540831565857 }, { "episode": 13216, "epoch": 0.07918419192100754, "loss/policy_avg": 0.183881938457489, "lr": 9.472776073619633e-06, "objective/entropy": -235.81063842773438, "objective/kl": 35.635047912597656, "objective/non_score_reward": -1.7817524671554565, "objective/rlhf_reward": -5.785373976736694, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 13.993101119995117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 825, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987282752990723 }, { "episode": 13232, "epoch": 0.07928005656013709, "loss/policy_avg": 0.13472305238246918, "lr": 9.47213701431493e-06, "objective/entropy": -209.61251831054688, "objective/kl": 32.511722564697266, "objective/non_score_reward": -1.6255862712860107, "objective/rlhf_reward": -5.176832351714296, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.514575958251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.693359375, "step": 826, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980220794677734 }, { "episode": 13248, "epoch": 0.07937592119926663, "loss/policy_avg": 0.28974202275276184, "lr": 9.471497955010226e-06, "objective/entropy": -277.55413818359375, "objective/kl": 23.343517303466797, "objective/non_score_reward": -1.1671757698059082, "objective/rlhf_reward": -4.668703377246857, "objective/scores": 0.0, "policy/approxkl_avg": 4.868777275085449, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.595703125, "step": 827, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984409809112549 }, { "episode": 13264, "epoch": 0.07947178583839619, "loss/policy_avg": 0.0649593323469162, "lr": 9.470858895705523e-06, "objective/entropy": -168.09161376953125, "objective/kl": 32.58544921875, "objective/non_score_reward": -1.6292723417282104, "objective/rlhf_reward": -5.001317584308323, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 13.682709693908691, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.630859375, "step": 828, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994233846664429 }, { "episode": 13280, "epoch": 0.07956765047752573, "loss/policy_avg": 0.01223127543926239, "lr": 9.470219836400818e-06, "objective/entropy": -24.52312469482422, "objective/kl": 37.070613861083984, "objective/non_score_reward": -1.8535306453704834, "objective/rlhf_reward": -5.963524679751739, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.4948031902313232, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 829, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0006215572357178 }, { "episode": 13296, "epoch": 0.07966351511665529, "loss/policy_avg": 0.08012821525335312, "lr": 9.469580777096115e-06, "objective/entropy": -222.74710083007812, "objective/kl": 29.31437873840332, "objective/non_score_reward": -1.4657189846038818, "objective/rlhf_reward": -5.862875819206238, "objective/scores": 0.0, "policy/approxkl_avg": 7.948197364807129, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.720703125, "step": 830, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999361515045166 }, { "episode": 13312, "epoch": 0.07975937975578483, "loss/policy_avg": 0.25244101881980896, "lr": 9.468941717791412e-06, "objective/entropy": -256.2400817871094, "objective/kl": 25.82564926147461, "objective/non_score_reward": -1.2912824153900146, "objective/rlhf_reward": -5.165129542350769, "objective/scores": 0.0, "policy/approxkl_avg": 25.767894744873047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.625, "step": 831, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9958332777023315 }, { "episode": 13328, "epoch": 0.07985524439491438, "loss/policy_avg": 0.20151713490486145, "lr": 9.468302658486709e-06, "objective/entropy": -176.53012084960938, "objective/kl": 31.989328384399414, "objective/non_score_reward": -1.5994665622711182, "objective/rlhf_reward": -4.94726787051712, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.573209762573242, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 832, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0001397132873535 }, { "episode": 13344, "epoch": 0.07995110903404393, "loss/policy_avg": 0.40637868642807007, "lr": 9.467663599182006e-06, "objective/entropy": -157.83944702148438, "objective/kl": 26.236248016357422, "objective/non_score_reward": -1.311812400817871, "objective/rlhf_reward": -2.847249662876129, "objective/scores": 0.6, "policy/approxkl_avg": 41.408966064453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69921875, "step": 833, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979543685913086 }, { "episode": 13360, "epoch": 0.08004697367317348, "loss/policy_avg": 0.4117756485939026, "lr": 9.467024539877301e-06, "objective/entropy": -154.52528381347656, "objective/kl": 34.40885925292969, "objective/non_score_reward": -1.7204430103302002, "objective/rlhf_reward": -3.958053027034971, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.24909782409668, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.552734375, "step": 834, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996882677078247 }, { "episode": 13376, "epoch": 0.08014283831230302, "loss/policy_avg": 0.25968849658966064, "lr": 9.466385480572598e-06, "objective/entropy": -35.48725509643555, "objective/kl": 48.416969299316406, "objective/non_score_reward": -2.4208483695983887, "objective/rlhf_reward": -8.324143612121029, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.6608781814575195, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4736328125, "step": 835, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974427223205566 }, { "episode": 13392, "epoch": 0.08023870295143258, "loss/policy_avg": 0.6013174057006836, "lr": 9.465746421267893e-06, "objective/entropy": -131.218994140625, "objective/kl": 40.460113525390625, "objective/non_score_reward": -2.023005723953247, "objective/rlhf_reward": -6.267194564613412, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 5.2574968338012695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.765625, "step": 836, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985507726669312 }, { "episode": 13408, "epoch": 0.08033456759056212, "loss/policy_avg": 0.024118170142173767, "lr": 9.46510736196319e-06, "objective/entropy": -219.2191162109375, "objective/kl": 37.4605712890625, "objective/non_score_reward": -1.8730283975601196, "objective/rlhf_reward": -5.544702480511601, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.715839385986328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 837, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003450393676758 }, { "episode": 13424, "epoch": 0.08043043222969168, "loss/policy_avg": 0.3022631108760834, "lr": 9.464468302658487e-06, "objective/entropy": -122.02997589111328, "objective/kl": 32.87577438354492, "objective/non_score_reward": -1.6437886953353882, "objective/rlhf_reward": -5.196552612868649, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.3451852798461914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 838, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992682933807373 }, { "episode": 13440, "epoch": 0.08052629686882122, "loss/policy_avg": 0.09435372054576874, "lr": 9.463829243353784e-06, "objective/entropy": -228.3193817138672, "objective/kl": 27.057086944580078, "objective/non_score_reward": -1.3528543710708618, "objective/rlhf_reward": -3.7495579771405323, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 64.43006896972656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.642578125, "step": 839, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9953296184539795 }, { "episode": 13456, "epoch": 0.08062216150795078, "loss/policy_avg": 1.2935261726379395, "lr": 9.46319018404908e-06, "objective/entropy": -160.080322265625, "objective/kl": 34.4007568359375, "objective/non_score_reward": -1.7200379371643066, "objective/rlhf_reward": -5.538515916376738, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 131.64187622070312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6015625, "step": 840, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983408451080322 }, { "episode": 13472, "epoch": 0.08071802614708032, "loss/policy_avg": 0.5178288817405701, "lr": 9.462551124744378e-06, "objective/entropy": -140.98907470703125, "objective/kl": 32.42417526245117, "objective/non_score_reward": -1.621208906173706, "objective/rlhf_reward": -4.084835386276245, "objective/scores": 0.6, "policy/approxkl_avg": 2.9638893604278564, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 841, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993921518325806 }, { "episode": 13488, "epoch": 0.08081389078620987, "loss/policy_avg": 1.674887776374817, "lr": 9.461912065439673e-06, "objective/entropy": -140.6572723388672, "objective/kl": 33.64493179321289, "objective/non_score_reward": -1.682246446609497, "objective/rlhf_reward": -2.3289861440658566, "objective/scores": 1.1, "policy/approxkl_avg": 2.7393760681152344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.73828125, "step": 842, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0029544830322266 }, { "episode": 13504, "epoch": 0.08090975542533942, "loss/policy_avg": 0.10809826105833054, "lr": 9.46127300613497e-06, "objective/entropy": 33.49109649658203, "objective/kl": 46.121177673339844, "objective/non_score_reward": -2.3060591220855713, "objective/rlhf_reward": -7.399407501491616, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.247078895568848, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7578125, "step": 843, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997997283935547 }, { "episode": 13520, "epoch": 0.08100562006446897, "loss/policy_avg": 0.08235388994216919, "lr": 9.460633946830267e-06, "objective/entropy": -232.94918823242188, "objective/kl": 29.242427825927734, "objective/non_score_reward": -1.4621214866638184, "objective/rlhf_reward": -5.848485827445984, "objective/scores": 0.0, "policy/approxkl_avg": 7.9668121337890625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.708984375, "step": 844, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998790979385376 }, { "episode": 13536, "epoch": 0.08110148470359851, "loss/policy_avg": 0.15575401484966278, "lr": 9.459994887525563e-06, "objective/entropy": -230.47235107421875, "objective/kl": 39.588829040527344, "objective/non_score_reward": -1.9794416427612305, "objective/rlhf_reward": -6.401994669231113, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.700314521789551, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.708984375, "step": 845, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9979462623596191 }, { "episode": 13552, "epoch": 0.08119734934272807, "loss/policy_avg": 0.13659973442554474, "lr": 9.45935582822086e-06, "objective/entropy": -174.33474731445312, "objective/kl": 28.351903915405273, "objective/non_score_reward": -1.4175951480865479, "objective/rlhf_reward": -2.746661697269651, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.24754524230957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.705078125, "step": 846, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0036792755126953 }, { "episode": 13568, "epoch": 0.08129321398185761, "loss/policy_avg": -0.0010715574026107788, "lr": 9.458716768916156e-06, "objective/entropy": -106.94636535644531, "objective/kl": 43.695289611816406, "objective/non_score_reward": -2.1847643852233887, "objective/rlhf_reward": -7.077198391378509, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.114851474761963, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 847, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0022311210632324 }, { "episode": 13584, "epoch": 0.08138907862098717, "loss/policy_avg": -0.020745811983942986, "lr": 9.458077709611452e-06, "objective/entropy": -274.30377197265625, "objective/kl": 29.099441528320312, "objective/non_score_reward": -1.4549720287322998, "objective/rlhf_reward": -4.215768191877919, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.374234199523926, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.55078125, "step": 848, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0002601146698 }, { "episode": 13600, "epoch": 0.08148494326011671, "loss/policy_avg": 0.08369505405426025, "lr": 9.45743865030675e-06, "objective/entropy": -90.9344482421875, "objective/kl": 32.62782669067383, "objective/non_score_reward": -1.6313912868499756, "objective/rlhf_reward": -4.921445462767201, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.873699426651001, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 849, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983861446380615 }, { "episode": 13616, "epoch": 0.08158080789924627, "loss/policy_avg": 0.12610237300395966, "lr": 9.456799591002046e-06, "objective/entropy": -216.01071166992188, "objective/kl": 31.95155906677246, "objective/non_score_reward": -1.5975778102874756, "objective/rlhf_reward": -5.048675945311218, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 17.690187454223633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.630859375, "step": 850, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975669384002686 }, { "episode": 13632, "epoch": 0.08167667253837581, "loss/policy_avg": 0.09207138419151306, "lr": 9.456160531697343e-06, "objective/entropy": -213.504638671875, "objective/kl": 33.958152770996094, "objective/non_score_reward": -1.69790780544281, "objective/rlhf_reward": -5.413029053298336, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.278407096862793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 851, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982938766479492 }, { "episode": 13648, "epoch": 0.08177253717750536, "loss/policy_avg": 0.7879657745361328, "lr": 9.45552147239264e-06, "objective/entropy": -179.40536499023438, "objective/kl": 38.20147705078125, "objective/non_score_reward": -1.91007399559021, "objective/rlhf_reward": -6.216463644702998, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.275893211364746, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.52734375, "step": 852, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000800609588623 }, { "episode": 13664, "epoch": 0.0818684018166349, "loss/policy_avg": -0.05168546736240387, "lr": 9.454882413087935e-06, "objective/entropy": -252.6636505126953, "objective/kl": 36.603004455566406, "objective/non_score_reward": -1.8301501274108887, "objective/rlhf_reward": -5.65874100250064, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.8799333572387695, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.734375, "step": 853, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000197410583496 }, { "episode": 13680, "epoch": 0.08196426645576446, "loss/policy_avg": 0.35540589690208435, "lr": 9.454243353783232e-06, "objective/entropy": -263.326171875, "objective/kl": 31.936683654785156, "objective/non_score_reward": -1.5968341827392578, "objective/rlhf_reward": -5.0618239379226395, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 9.10447883605957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 854, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981427192687988 }, { "episode": 13696, "epoch": 0.082060131094894, "loss/policy_avg": -0.01314299926161766, "lr": 9.453604294478529e-06, "objective/entropy": -50.54448699951172, "objective/kl": 27.010623931884766, "objective/non_score_reward": -1.3505312204360962, "objective/rlhf_reward": -4.002124941349029, "objective/scores": 0.35, "policy/approxkl_avg": 72.71121215820312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.755859375, "step": 855, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988174438476562 }, { "episode": 13712, "epoch": 0.08215599573402356, "loss/policy_avg": 0.507459282875061, "lr": 9.452965235173824e-06, "objective/entropy": -196.7661590576172, "objective/kl": 41.39533615112305, "objective/non_score_reward": -2.0697667598724365, "objective/rlhf_reward": -6.331655929760869, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.633426666259766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.734375, "step": 856, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997053146362305 }, { "episode": 13728, "epoch": 0.0822518603731531, "loss/policy_avg": 0.01022842712700367, "lr": 9.452326175869121e-06, "objective/entropy": -165.575439453125, "objective/kl": 28.162111282348633, "objective/non_score_reward": -1.408105492591858, "objective/rlhf_reward": -4.253819801894528, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.566072463989258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.673828125, "step": 857, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994425773620605 }, { "episode": 13744, "epoch": 0.08234772501228266, "loss/policy_avg": 0.5199975371360779, "lr": 9.451687116564418e-06, "objective/entropy": -191.289794921875, "objective/kl": 25.639848709106445, "objective/non_score_reward": -1.2819924354553223, "objective/rlhf_reward": -3.6121978996121253, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.2938754558563232, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4873046875, "step": 858, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993865489959717 }, { "episode": 13760, "epoch": 0.0824435896514122, "loss/policy_avg": -0.09089094400405884, "lr": 9.451048057259715e-06, "objective/entropy": -222.6432647705078, "objective/kl": 35.101905822753906, "objective/non_score_reward": -1.7550954818725586, "objective/rlhf_reward": -5.641779580203396, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.5215179920196533, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.701171875, "step": 859, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0004208087921143 }, { "episode": 13776, "epoch": 0.08253945429054176, "loss/policy_avg": 0.3994244635105133, "lr": 9.45040899795501e-06, "objective/entropy": -232.05795288085938, "objective/kl": 35.13083267211914, "objective/non_score_reward": -1.7565417289733887, "objective/rlhf_reward": -2.6261669158935543, "objective/scores": 1.1, "policy/approxkl_avg": 7.337094306945801, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736328125, "step": 860, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000622272491455 }, { "episode": 13792, "epoch": 0.0826353189296713, "loss/policy_avg": 0.241072878241539, "lr": 9.449769938650307e-06, "objective/entropy": -235.5231475830078, "objective/kl": 42.96981430053711, "objective/non_score_reward": -2.1484906673431396, "objective/rlhf_reward": -6.860629336039224, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 5.666136264801025, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 861, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980902671813965 }, { "episode": 13808, "epoch": 0.08273118356880085, "loss/policy_avg": 0.06892701238393784, "lr": 9.449130879345604e-06, "objective/entropy": -43.37392044067383, "objective/kl": 28.94279670715332, "objective/non_score_reward": -1.447139859199524, "objective/rlhf_reward": -4.446923902540832, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 13.682140350341797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.873046875, "step": 862, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998504638671875 }, { "episode": 13824, "epoch": 0.0828270482079304, "loss/policy_avg": 0.05104389786720276, "lr": 9.4484918200409e-06, "objective/entropy": -274.24462890625, "objective/kl": 26.58008575439453, "objective/non_score_reward": -1.3290044069290161, "objective/rlhf_reward": -3.6541578821545704, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.673041343688965, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796875, "step": 863, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998407244682312 }, { "episode": 13840, "epoch": 0.08292291284705995, "loss/policy_avg": 2.0433521270751953, "lr": 9.447852760736197e-06, "objective/entropy": -141.08175659179688, "objective/kl": 38.66474151611328, "objective/non_score_reward": -1.933237075805664, "objective/rlhf_reward": -6.282350401492462, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.4866771697998047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.849609375, "step": 864, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0020241737365723 }, { "episode": 13856, "epoch": 0.08301877748618951, "loss/policy_avg": 0.5822303891181946, "lr": 9.447213701431494e-06, "objective/entropy": -75.44483184814453, "objective/kl": 42.41981506347656, "objective/non_score_reward": -2.1209909915924072, "objective/rlhf_reward": -7.033365587802276, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.0502395629882812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.763671875, "step": 865, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002496242523193 }, { "episode": 13872, "epoch": 0.08311464212531905, "loss/policy_avg": 1.5961978435516357, "lr": 9.44657464212679e-06, "objective/entropy": -102.62336730957031, "objective/kl": 32.63710021972656, "objective/non_score_reward": -1.6318550109863281, "objective/rlhf_reward": -4.702591176303934, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 44.34449005126953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.583984375, "step": 866, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997666597366333 }, { "episode": 13888, "epoch": 0.0832105067644486, "loss/policy_avg": -0.06377097964286804, "lr": 9.445935582822086e-06, "objective/entropy": -179.53016662597656, "objective/kl": 27.1846981048584, "objective/non_score_reward": -1.3592349290847778, "objective/rlhf_reward": -3.3142334840455394, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.25791072845459, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.67578125, "step": 867, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001521110534668 }, { "episode": 13904, "epoch": 0.08330637140357815, "loss/policy_avg": 0.06122337281703949, "lr": 9.445296523517383e-06, "objective/entropy": -160.8975830078125, "objective/kl": 37.28607940673828, "objective/non_score_reward": -1.8643040657043457, "objective/rlhf_reward": -6.131703171759767, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.380110263824463, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.685546875, "step": 868, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993302822113037 }, { "episode": 13920, "epoch": 0.0834022360427077, "loss/policy_avg": 0.06397978216409683, "lr": 9.44465746421268e-06, "objective/entropy": -279.75146484375, "objective/kl": 36.54051971435547, "objective/non_score_reward": -1.8270260095596313, "objective/rlhf_reward": -5.3606928093003585, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 9.166413307189941, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 869, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998555064201355 }, { "episode": 13936, "epoch": 0.08349810068183725, "loss/policy_avg": 0.18339544534683228, "lr": 9.444018404907977e-06, "objective/entropy": -197.06088256835938, "objective/kl": 35.413883209228516, "objective/non_score_reward": -1.7706942558288574, "objective/rlhf_reward": -5.420917516172516, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.4228196144104004, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.55859375, "step": 870, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997130632400513 }, { "episode": 13952, "epoch": 0.0835939653209668, "loss/policy_avg": 0.7395508885383606, "lr": 9.443379345603272e-06, "objective/entropy": -175.5420684814453, "objective/kl": 27.310260772705078, "objective/non_score_reward": -1.3655130863189697, "objective/rlhf_reward": -3.9057928611903936, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 20.016393661499023, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.744140625, "step": 871, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980931282043457 }, { "episode": 13968, "epoch": 0.08368982996009634, "loss/policy_avg": 0.11419187486171722, "lr": 9.442740286298569e-06, "objective/entropy": -202.19219970703125, "objective/kl": 26.73446273803711, "objective/non_score_reward": -1.3367230892181396, "objective/rlhf_reward": -0.9468923568725582, "objective/scores": 1.1, "policy/approxkl_avg": 1.4593892097473145, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.505859375, "step": 872, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997605085372925 }, { "episode": 13984, "epoch": 0.0837856945992259, "loss/policy_avg": 0.10254265367984772, "lr": 9.442101226993866e-06, "objective/entropy": -181.49607849121094, "objective/kl": 34.489620208740234, "objective/non_score_reward": -1.7244811058044434, "objective/rlhf_reward": -5.2938043213525585, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.685236930847168, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.572265625, "step": 873, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004138946533203 }, { "episode": 14000, "epoch": 0.08388155923835544, "loss/policy_avg": -0.11048807948827744, "lr": 9.441462167689163e-06, "objective/entropy": -233.58718872070312, "objective/kl": 27.196325302124023, "objective/non_score_reward": -1.359816312789917, "objective/rlhf_reward": -4.080015146468563, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.074767112731934, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.599609375, "step": 874, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000337600708008 }, { "episode": 14016, "epoch": 0.083977423877485, "loss/policy_avg": -0.04991217330098152, "lr": 9.44082310838446e-06, "objective/entropy": -147.29574584960938, "objective/kl": 39.145992279052734, "objective/non_score_reward": -1.9572995901107788, "objective/rlhf_reward": -6.429198360443115, "objective/scores": 0.35, "policy/approxkl_avg": 2.3655714988708496, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.703125, "step": 875, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0024516582489014 }, { "episode": 14032, "epoch": 0.08407328851661454, "loss/policy_avg": 0.018214020878076553, "lr": 9.440184049079757e-06, "objective/entropy": -225.25274658203125, "objective/kl": 28.496929168701172, "objective/non_score_reward": -1.4248464107513428, "objective/rlhf_reward": -4.248787502856597, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.280494689941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.78515625, "step": 876, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0028023719787598 }, { "episode": 14048, "epoch": 0.0841691531557441, "loss/policy_avg": -0.0712839737534523, "lr": 9.439544989775052e-06, "objective/entropy": -111.49925231933594, "objective/kl": 33.307395935058594, "objective/non_score_reward": -1.6653697490692139, "objective/rlhf_reward": -5.237647135456172, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 10.649118423461914, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.744140625, "step": 877, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0005428791046143 }, { "episode": 14064, "epoch": 0.08426501779487364, "loss/policy_avg": 0.416260302066803, "lr": 9.438905930470349e-06, "objective/entropy": -91.5921630859375, "objective/kl": 36.07551193237305, "objective/non_score_reward": -1.8037755489349365, "objective/rlhf_reward": -5.658842890468195, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.971528053283691, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.546875, "step": 878, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99779212474823 }, { "episode": 14080, "epoch": 0.08436088243400319, "loss/policy_avg": 0.6945221424102783, "lr": 9.438266871165644e-06, "objective/entropy": -103.2996597290039, "objective/kl": 29.02838706970215, "objective/non_score_reward": -1.4514193534851074, "objective/rlhf_reward": -4.249418287482813, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.5951104164123535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6015625, "step": 879, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996235370635986 }, { "episode": 14096, "epoch": 0.08445674707313273, "loss/policy_avg": 0.14096970856189728, "lr": 9.43762781186094e-06, "objective/entropy": -250.6915283203125, "objective/kl": 24.03522491455078, "objective/non_score_reward": -1.2017613649368286, "objective/rlhf_reward": -3.4284433508790553, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 141.8468017578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.681640625, "step": 880, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993736743927002 }, { "episode": 14112, "epoch": 0.08455261171226229, "loss/policy_avg": 0.3699185848236084, "lr": 9.436988752556238e-06, "objective/entropy": -159.3045196533203, "objective/kl": 40.019386291503906, "objective/non_score_reward": -2.000969409942627, "objective/rlhf_reward": -6.553278903575286, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.20317554473877, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.775390625, "step": 881, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977927207946777 }, { "episode": 14128, "epoch": 0.08464847635139183, "loss/policy_avg": 0.41995298862457275, "lr": 9.436349693251534e-06, "objective/entropy": 76.95626068115234, "objective/kl": 39.00627899169922, "objective/non_score_reward": -1.9503138065338135, "objective/rlhf_reward": -6.139395838201629, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 31.75859832763672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.568359375, "step": 882, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9953019618988037 }, { "episode": 14144, "epoch": 0.08474434099052139, "loss/policy_avg": 0.5355075597763062, "lr": 9.435710633946831e-06, "objective/entropy": -164.35186767578125, "objective/kl": 42.27740478515625, "objective/non_score_reward": -2.113870143890381, "objective/rlhf_reward": -7.113845041304259, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 20.66805648803711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.72265625, "step": 883, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9979139566421509 }, { "episode": 14160, "epoch": 0.08484020562965093, "loss/policy_avg": 0.12046757340431213, "lr": 9.435071574642126e-06, "objective/entropy": -139.48226928710938, "objective/kl": 35.96446228027344, "objective/non_score_reward": -1.7982230186462402, "objective/rlhf_reward": -5.833642208312435, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 5.584999084472656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.763671875, "step": 884, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985802173614502 }, { "episode": 14176, "epoch": 0.08493607026878049, "loss/policy_avg": 0.20259422063827515, "lr": 9.434432515337423e-06, "objective/entropy": -194.32472229003906, "objective/kl": 29.422592163085938, "objective/non_score_reward": -1.4711295366287231, "objective/rlhf_reward": -4.151184813181559, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 7.590093612670898, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.705078125, "step": 885, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000078678131104 }, { "episode": 14192, "epoch": 0.08503193490791003, "loss/policy_avg": 0.38378089666366577, "lr": 9.43379345603272e-06, "objective/entropy": -204.80718994140625, "objective/kl": 26.858444213867188, "objective/non_score_reward": -1.3429222106933594, "objective/rlhf_reward": -3.947856862743465, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 54.279869079589844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8828125, "step": 886, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000415086746216 }, { "episode": 14208, "epoch": 0.08512779954703958, "loss/policy_avg": 0.27804744243621826, "lr": 9.433154396728017e-06, "objective/entropy": -216.76026916503906, "objective/kl": 31.35245704650879, "objective/non_score_reward": -1.5676229000091553, "objective/rlhf_reward": -4.928855529337554, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 44.15214157104492, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.841796875, "step": 887, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985570907592773 }, { "episode": 14224, "epoch": 0.08522366418616913, "loss/policy_avg": 0.1285010725259781, "lr": 9.432515337423314e-06, "objective/entropy": -256.2292175292969, "objective/kl": 22.457351684570312, "objective/non_score_reward": -1.1228675842285156, "objective/rlhf_reward": -2.5440589291619613, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.694319725036621, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 888, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9982428550720215 }, { "episode": 14240, "epoch": 0.08531952882529868, "loss/policy_avg": 0.1620079129934311, "lr": 9.431876278118611e-06, "objective/entropy": -246.3665313720703, "objective/kl": 32.27862548828125, "objective/non_score_reward": -1.6139311790466309, "objective/rlhf_reward": -5.03189285536584, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.128833293914795, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.626953125, "step": 889, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017335414886475 }, { "episode": 14256, "epoch": 0.08541539346442822, "loss/policy_avg": 0.6714350581169128, "lr": 9.431237218813906e-06, "objective/entropy": -87.00444793701172, "objective/kl": 30.12160873413086, "objective/non_score_reward": -1.5060807466506958, "objective/rlhf_reward": -4.600490768154231, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 30.543041229248047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 890, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9955544471740723 }, { "episode": 14272, "epoch": 0.08551125810355778, "loss/policy_avg": 0.5368032455444336, "lr": 9.430598159509203e-06, "objective/entropy": -151.2410125732422, "objective/kl": 23.1306095123291, "objective/non_score_reward": -1.1565306186676025, "objective/rlhf_reward": -3.266872340176983, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 18.648775100708008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 891, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999136209487915 }, { "episode": 14288, "epoch": 0.08560712274268732, "loss/policy_avg": -0.4043048024177551, "lr": 9.4299591002045e-06, "objective/entropy": -214.12281799316406, "objective/kl": 38.173484802246094, "objective/non_score_reward": -1.9086743593215942, "objective/rlhf_reward": -5.972837810934173, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.6675243377685547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.580078125, "step": 892, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000812292098999 }, { "episode": 14304, "epoch": 0.08570298738181688, "loss/policy_avg": 1.0885683298110962, "lr": 9.429320040899797e-06, "objective/entropy": -234.37998962402344, "objective/kl": 27.995094299316406, "objective/non_score_reward": -1.3997547626495361, "objective/rlhf_reward": -4.239769363139553, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.2649099826812744, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6640625, "step": 893, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0003528594970703 }, { "episode": 14320, "epoch": 0.08579885202094642, "loss/policy_avg": -0.1013278141617775, "lr": 9.428680981595094e-06, "objective/entropy": -156.33245849609375, "objective/kl": 35.587982177734375, "objective/non_score_reward": -1.779399037361145, "objective/rlhf_reward": -5.738993861762387, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 8.389669418334961, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.666015625, "step": 894, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99942946434021 }, { "episode": 14336, "epoch": 0.08589471666007598, "loss/policy_avg": -0.006531273480504751, "lr": 9.42804192229039e-06, "objective/entropy": -197.26820373535156, "objective/kl": 21.04766082763672, "objective/non_score_reward": -1.0523829460144043, "objective/rlhf_reward": -2.6532727172046453, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.4280903339385986, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69140625, "step": 895, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001786708831787 }, { "episode": 14352, "epoch": 0.08599058129920552, "loss/policy_avg": 0.10259456932544708, "lr": 9.427402862985686e-06, "objective/entropy": -120.49540710449219, "objective/kl": 37.17432403564453, "objective/non_score_reward": -1.858716368675232, "objective/rlhf_reward": -3.034865355491638, "objective/scores": 1.1, "policy/approxkl_avg": 6.6070685386657715, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 896, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999729871749878 }, { "episode": 14368, "epoch": 0.08608644593833507, "loss/policy_avg": 0.18344524502754211, "lr": 9.426763803680982e-06, "objective/entropy": -84.0172348022461, "objective/kl": 32.38622283935547, "objective/non_score_reward": -1.6193112134933472, "objective/rlhf_reward": -5.117994987700863, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 33.82829284667969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.529296875, "step": 897, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998817443847656 }, { "episode": 14384, "epoch": 0.08618231057746462, "loss/policy_avg": 0.7863380312919617, "lr": 9.42612474437628e-06, "objective/entropy": -94.4057388305664, "objective/kl": 31.75823402404785, "objective/non_score_reward": -1.58791184425354, "objective/rlhf_reward": -3.4279281839143962, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.406008243560791, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.8125, "step": 898, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0038909912109375 }, { "episode": 14400, "epoch": 0.08627817521659417, "loss/policy_avg": 0.5351603031158447, "lr": 9.425485685071576e-06, "objective/entropy": -265.2181396484375, "objective/kl": 29.21182632446289, "objective/non_score_reward": -1.460591197013855, "objective/rlhf_reward": -4.1090314547220865, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.054888725280762, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 899, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998013973236084 }, { "episode": 14416, "epoch": 0.08637403985572371, "loss/policy_avg": 0.013534091413021088, "lr": 9.424846625766873e-06, "objective/entropy": -194.56564331054688, "objective/kl": 24.970386505126953, "objective/non_score_reward": -1.2485194206237793, "objective/rlhf_reward": -3.6348278162225913, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 0.42985397577285767, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.53515625, "step": 900, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002646446228027 }, { "episode": 14432, "epoch": 0.08646990449485327, "loss/policy_avg": -0.026506464928388596, "lr": 9.424207566462168e-06, "objective/entropy": -121.82954406738281, "objective/kl": 38.97528839111328, "objective/non_score_reward": -1.9487645626068115, "objective/rlhf_reward": -7.795057892799377, "objective/scores": 0.0, "policy/approxkl_avg": 18.97709846496582, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.3564453125, "step": 901, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988110065460205 }, { "episode": 14448, "epoch": 0.08656576913398281, "loss/policy_avg": 0.04643288254737854, "lr": 9.423568507157465e-06, "objective/entropy": -97.38468170166016, "objective/kl": 28.042333602905273, "objective/non_score_reward": -1.4021167755126953, "objective/rlhf_reward": -4.184634823997585, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.1407618522644043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60546875, "step": 902, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999892711639404 }, { "episode": 14464, "epoch": 0.08666163377311237, "loss/policy_avg": 0.5154027342796326, "lr": 9.42292944785276e-06, "objective/entropy": -250.2370147705078, "objective/kl": 25.91543960571289, "objective/non_score_reward": -1.2957720756530762, "objective/rlhf_reward": -3.759255845745174, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.9840008020401, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.607421875, "step": 903, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984569549560547 }, { "episode": 14480, "epoch": 0.08675749841224191, "loss/policy_avg": -0.12090878188610077, "lr": 9.422290388548057e-06, "objective/entropy": -224.9342041015625, "objective/kl": 21.860130310058594, "objective/non_score_reward": -1.0930064916610718, "objective/rlhf_reward": -2.42461485691541, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.253545761108398, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.751953125, "step": 904, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000030755996704 }, { "episode": 14496, "epoch": 0.08685336305137147, "loss/policy_avg": 0.2192097306251526, "lr": 9.421651329243354e-06, "objective/entropy": -116.75704956054688, "objective/kl": 40.641937255859375, "objective/non_score_reward": -2.0320968627929688, "objective/rlhf_reward": -6.786751320868163, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.1222383975982666, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.525390625, "step": 905, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990143775939941 }, { "episode": 14512, "epoch": 0.08694922769050101, "loss/policy_avg": 0.014911421574652195, "lr": 9.421012269938651e-06, "objective/entropy": -169.34967041015625, "objective/kl": 19.47471809387207, "objective/non_score_reward": -0.9737359285354614, "objective/rlhf_reward": -1.7722373626389838, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.2120165824890137, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 906, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0021252632141113 }, { "episode": 14528, "epoch": 0.08704509232963056, "loss/policy_avg": -0.06861399114131927, "lr": 9.420373210633948e-06, "objective/entropy": -199.73748779296875, "objective/kl": 32.33728790283203, "objective/non_score_reward": -1.6168644428253174, "objective/rlhf_reward": -5.088855722037655, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 7.329561233520508, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.58203125, "step": 907, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.1414122581481934 }, { "episode": 14544, "epoch": 0.0871409569687601, "loss/policy_avg": -0.0006491807289421558, "lr": 9.419734151329245e-06, "objective/entropy": -241.5037078857422, "objective/kl": 26.676612854003906, "objective/non_score_reward": -1.3338308334350586, "objective/rlhf_reward": -0.9353229761123654, "objective/scores": 1.1, "policy/approxkl_avg": 2.882882595062256, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.568359375, "step": 908, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995675086975098 }, { "episode": 14560, "epoch": 0.08723682160788966, "loss/policy_avg": -0.30844664573669434, "lr": 9.41909509202454e-06, "objective/entropy": -193.48281860351562, "objective/kl": 32.22890853881836, "objective/non_score_reward": -1.6114455461502075, "objective/rlhf_reward": -4.712448493639627, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 9.586688995361328, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.537109375, "step": 909, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0032057762145996 }, { "episode": 14576, "epoch": 0.0873326862470192, "loss/policy_avg": 0.10456671565771103, "lr": 9.418456032719837e-06, "objective/entropy": -214.8862762451172, "objective/kl": 30.845359802246094, "objective/non_score_reward": -1.5422677993774414, "objective/rlhf_reward": -4.769071197509765, "objective/scores": 0.35, "policy/approxkl_avg": 48.766883850097656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.560546875, "step": 910, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0017874240875244 }, { "episode": 14592, "epoch": 0.08742855088614876, "loss/policy_avg": 0.011322952806949615, "lr": 9.417816973415134e-06, "objective/entropy": -148.18869018554688, "objective/kl": 34.653785705566406, "objective/non_score_reward": -1.7326891422271729, "objective/rlhf_reward": -5.5069247080880075, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.303962230682373, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.740234375, "step": 911, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001537561416626 }, { "episode": 14608, "epoch": 0.0875244155252783, "loss/policy_avg": 1.4446654319763184, "lr": 9.41717791411043e-06, "objective/entropy": -151.7039337158203, "objective/kl": 36.139678955078125, "objective/non_score_reward": -1.8069840669631958, "objective/rlhf_reward": -5.623816165987568, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.342704772949219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 912, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997730016708374 }, { "episode": 14624, "epoch": 0.08762028016440786, "loss/policy_avg": 0.007501431740820408, "lr": 9.416538854805727e-06, "objective/entropy": -192.82723999023438, "objective/kl": 28.006526947021484, "objective/non_score_reward": -1.4003264904022217, "objective/rlhf_reward": -3.776477153572153, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 9.397720336914062, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.74609375, "step": 913, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988361597061157 }, { "episode": 14640, "epoch": 0.0877161448035374, "loss/policy_avg": 0.7067223787307739, "lr": 9.415899795501023e-06, "objective/entropy": -199.13888549804688, "objective/kl": 40.245330810546875, "objective/non_score_reward": -2.0122666358947754, "objective/rlhf_reward": -6.387206798017608, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.6032171249389648, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.626953125, "step": 914, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0032029151916504 }, { "episode": 14656, "epoch": 0.08781200944266696, "loss/policy_avg": 0.7447987198829651, "lr": 9.41526073619632e-06, "objective/entropy": -192.03024291992188, "objective/kl": 33.84302520751953, "objective/non_score_reward": -1.6921510696411133, "objective/rlhf_reward": -5.212345330920771, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 12.58854866027832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.708984375, "step": 915, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982028007507324 }, { "episode": 14672, "epoch": 0.0879078740817965, "loss/policy_avg": -0.12448902428150177, "lr": 9.414621676891616e-06, "objective/entropy": -108.39199829101562, "objective/kl": 27.540185928344727, "objective/non_score_reward": -1.3770092725753784, "objective/rlhf_reward": -3.3853308580079418, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 0.6809393763542175, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.771484375, "step": 916, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0027718544006348 }, { "episode": 14688, "epoch": 0.08800373872092605, "loss/policy_avg": 0.09778769314289093, "lr": 9.413982617586913e-06, "objective/entropy": -83.20165252685547, "objective/kl": 27.68124008178711, "objective/non_score_reward": -1.3840619325637817, "objective/rlhf_reward": -3.7114191010323276, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.001269340515137, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 917, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994606971740723 }, { "episode": 14704, "epoch": 0.0880996033600556, "loss/policy_avg": 0.7267050743103027, "lr": 9.41334355828221e-06, "objective/entropy": -174.48663330078125, "objective/kl": 32.38935089111328, "objective/non_score_reward": -1.6194674968719482, "objective/rlhf_reward": -6.477869987487793, "objective/scores": 0.0, "policy/approxkl_avg": 9.753436088562012, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6796875, "step": 918, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973618984222412 }, { "episode": 14720, "epoch": 0.08819546799918515, "loss/policy_avg": 0.18099595606327057, "lr": 9.412704498977507e-06, "objective/entropy": -232.4264373779297, "objective/kl": 37.20670700073242, "objective/non_score_reward": -1.860335350036621, "objective/rlhf_reward": -6.115828309088869, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 41.893341064453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.79296875, "step": 919, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002889633178711 }, { "episode": 14736, "epoch": 0.08829133263831469, "loss/policy_avg": 0.43639302253723145, "lr": 9.412065439672802e-06, "objective/entropy": -183.69644165039062, "objective/kl": 24.13558006286621, "objective/non_score_reward": -1.2067790031433105, "objective/rlhf_reward": -2.8797047836350753, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 30.2447509765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.55078125, "step": 920, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992506504058838 }, { "episode": 14752, "epoch": 0.08838719727744425, "loss/policy_avg": 0.5567411780357361, "lr": 9.411426380368099e-06, "objective/entropy": -285.06512451171875, "objective/kl": 32.89839553833008, "objective/non_score_reward": -1.644919753074646, "objective/rlhf_reward": -4.8463457385698945, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 18.15423583984375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.609375, "step": 921, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997804880142212 }, { "episode": 14768, "epoch": 0.0884830619165738, "loss/policy_avg": -0.024971559643745422, "lr": 9.410787321063396e-06, "objective/entropy": -144.00473022460938, "objective/kl": 27.061277389526367, "objective/non_score_reward": -1.353063941001892, "objective/rlhf_reward": -4.033653714743954, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.530630111694336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 922, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0000722408294678 }, { "episode": 14784, "epoch": 0.08857892655570335, "loss/policy_avg": -0.49618157744407654, "lr": 9.410148261758691e-06, "objective/entropy": -37.43824768066406, "objective/kl": 35.81788635253906, "objective/non_score_reward": -1.7908943891525269, "objective/rlhf_reward": -5.501717870653259, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.131357192993164, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.4287109375, "step": 923, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0173897743225098 }, { "episode": 14800, "epoch": 0.0886747911948329, "loss/policy_avg": 0.05783979594707489, "lr": 9.409509202453988e-06, "objective/entropy": -154.13516235351562, "objective/kl": 46.57448196411133, "objective/non_score_reward": -2.3287243843078613, "objective/rlhf_reward": -7.653037791669952, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.5200886726379395, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5390625, "step": 924, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987819194793701 }, { "episode": 14816, "epoch": 0.08877065583396244, "loss/policy_avg": 0.034926094114780426, "lr": 9.408870143149285e-06, "objective/entropy": -221.52577209472656, "objective/kl": 35.47760772705078, "objective/non_score_reward": -1.7738804817199707, "objective/rlhf_reward": -5.614569070752024, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.4324332475662231, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 925, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995288848876953 }, { "episode": 14832, "epoch": 0.088866520473092, "loss/policy_avg": 0.32427555322647095, "lr": 9.408231083844582e-06, "objective/entropy": -130.25445556640625, "objective/kl": 34.63972473144531, "objective/non_score_reward": -1.7319860458374023, "objective/rlhf_reward": -5.371684878078058, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.6408345699310303, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.568359375, "step": 926, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0012738704681396 }, { "episode": 14848, "epoch": 0.08896238511222154, "loss/policy_avg": -0.27763280272483826, "lr": 9.407592024539877e-06, "objective/entropy": -244.65667724609375, "objective/kl": 27.930646896362305, "objective/non_score_reward": -1.396532416343689, "objective/rlhf_reward": -3.7613009765473118, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 65.45894622802734, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.66015625, "step": 927, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002328872680664 }, { "episode": 14864, "epoch": 0.0890582497513511, "loss/policy_avg": 0.39164024591445923, "lr": 9.406952965235174e-06, "objective/entropy": -92.6754150390625, "objective/kl": 40.35970687866211, "objective/non_score_reward": -2.0179853439331055, "objective/rlhf_reward": -5.148222361446592, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.222280502319336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.587890625, "step": 928, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9965031147003174 }, { "episode": 14880, "epoch": 0.08915411439048064, "loss/policy_avg": 0.018820755183696747, "lr": 9.40631390593047e-06, "objective/entropy": -221.75802612304688, "objective/kl": 32.733848571777344, "objective/non_score_reward": -1.6366922855377197, "objective/rlhf_reward": -4.942649397913533, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.5601739883422852, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.67578125, "step": 929, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0016555786132812 }, { "episode": 14896, "epoch": 0.0892499790296102, "loss/policy_avg": 0.02956710010766983, "lr": 9.405674846625768e-06, "objective/entropy": -225.1991729736328, "objective/kl": 27.00541114807129, "objective/non_score_reward": -1.3502705097198486, "objective/rlhf_reward": -4.059446623831421, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 6.755413055419922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8046875, "step": 930, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001051664352417 }, { "episode": 14912, "epoch": 0.08934584366873974, "loss/policy_avg": 0.08642945438623428, "lr": 9.405035787321065e-06, "objective/entropy": -179.3356475830078, "objective/kl": 36.390193939208984, "objective/non_score_reward": -1.8195096254348755, "objective/rlhf_reward": -5.330627392010625, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 10.583852767944336, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4501953125, "step": 931, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999453067779541 }, { "episode": 14928, "epoch": 0.0894417083078693, "loss/policy_avg": 0.08466912060976028, "lr": 9.404396728016361e-06, "objective/entropy": -160.34024047851562, "objective/kl": 48.99607849121094, "objective/non_score_reward": -2.4498043060302734, "objective/rlhf_reward": -8.195096407000142, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.9886335134506226, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.666015625, "step": 932, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0025954246520996 }, { "episode": 14944, "epoch": 0.08953757294699884, "loss/policy_avg": 0.3508598804473877, "lr": 9.403757668711657e-06, "objective/entropy": -177.20993041992188, "objective/kl": 32.381324768066406, "objective/non_score_reward": -1.6190659999847412, "objective/rlhf_reward": -5.150751504927797, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 31.277324676513672, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4716796875, "step": 933, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997992515563965 }, { "episode": 14960, "epoch": 0.08963343758612839, "loss/policy_avg": 0.11015394330024719, "lr": 9.403118609406953e-06, "objective/entropy": -203.39776611328125, "objective/kl": 32.743534088134766, "objective/non_score_reward": -1.637176752090454, "objective/rlhf_reward": -4.94458726412447, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.4484572410583496, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.734375, "step": 934, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9993422031402588 }, { "episode": 14976, "epoch": 0.08972930222525793, "loss/policy_avg": -0.10944172739982605, "lr": 9.40247955010225e-06, "objective/entropy": -241.4989013671875, "objective/kl": 21.90488052368164, "objective/non_score_reward": -1.0952439308166504, "objective/rlhf_reward": -3.0217259762033652, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.7654926776885986, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.689453125, "step": 935, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002392292022705 }, { "episode": 14992, "epoch": 0.08982516686438749, "loss/policy_avg": 0.9405217170715332, "lr": 9.401840490797547e-06, "objective/entropy": -237.89816284179688, "objective/kl": 25.436769485473633, "objective/non_score_reward": -1.2718384265899658, "objective/rlhf_reward": -3.663521905143825, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 6.3816022872924805, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.60546875, "step": 936, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998328685760498 }, { "episode": 15008, "epoch": 0.08992103150351703, "loss/policy_avg": 0.3327906131744385, "lr": 9.401201431492844e-06, "objective/entropy": -268.6925354003906, "objective/kl": 37.998870849609375, "objective/non_score_reward": -1.899943470954895, "objective/rlhf_reward": -6.0435144593387395, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 10.16036605834961, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.607421875, "step": 937, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984662532806396 }, { "episode": 15024, "epoch": 0.09001689614264659, "loss/policy_avg": -0.26467132568359375, "lr": 9.40056237218814e-06, "objective/entropy": -231.59254455566406, "objective/kl": 26.266529083251953, "objective/non_score_reward": -1.3133264780044556, "objective/rlhf_reward": -3.737534248622593, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.63685417175293, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6328125, "step": 938, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999743938446045 }, { "episode": 15040, "epoch": 0.09011276078177613, "loss/policy_avg": 0.2447420209646225, "lr": 9.399923312883436e-06, "objective/entropy": -278.01153564453125, "objective/kl": 27.628671646118164, "objective/non_score_reward": -1.3814334869384766, "objective/rlhf_reward": -4.147132196513516, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.7261061668395996, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.669921875, "step": 939, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990365505218506 }, { "episode": 15056, "epoch": 0.09020862542090569, "loss/policy_avg": 0.2600797414779663, "lr": 9.399284253578733e-06, "objective/entropy": -242.6852264404297, "objective/kl": 40.91444396972656, "objective/non_score_reward": -2.045722484588623, "objective/rlhf_reward": -6.060183467642341, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.501818656921387, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 940, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971075057983398 }, { "episode": 15072, "epoch": 0.09030449006003523, "loss/policy_avg": 0.3729836940765381, "lr": 9.39864519427403e-06, "objective/entropy": -225.56338500976562, "objective/kl": 34.106658935546875, "objective/non_score_reward": -1.7053331136703491, "objective/rlhf_reward": -5.340379836972117, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.6144325733184814, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73828125, "step": 941, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977765083312988 }, { "episode": 15088, "epoch": 0.09040035469916478, "loss/policy_avg": 0.571183443069458, "lr": 9.398006134969327e-06, "objective/entropy": -109.51638793945312, "objective/kl": 57.49871826171875, "objective/non_score_reward": -2.8749358654022217, "objective/rlhf_reward": -9.895623478952961, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.06275749206543, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736328125, "step": 942, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000272512435913 }, { "episode": 15104, "epoch": 0.09049621933829433, "loss/policy_avg": 0.7253443002700806, "lr": 9.397367075664624e-06, "objective/entropy": -69.86570739746094, "objective/kl": 40.12030029296875, "objective/non_score_reward": -2.0060153007507324, "objective/rlhf_reward": -6.362201397836792, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 66.08172607421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 943, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997114896774292 }, { "episode": 15120, "epoch": 0.09059208397742388, "loss/policy_avg": 0.7548943758010864, "lr": 9.396728016359919e-06, "objective/entropy": -264.1029357910156, "objective/kl": 29.125934600830078, "objective/non_score_reward": -1.456296682357788, "objective/rlhf_reward": -4.268927424159601, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.555539846420288, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 944, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0007224082946777 }, { "episode": 15136, "epoch": 0.09068794861655342, "loss/policy_avg": -0.06224450469017029, "lr": 9.396088957055216e-06, "objective/entropy": -215.80255126953125, "objective/kl": 36.1290283203125, "objective/non_score_reward": -1.8064515590667725, "objective/rlhf_reward": -5.7100345728718604, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.062628746032715, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5703125, "step": 945, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0022828578948975 }, { "episode": 15152, "epoch": 0.09078381325568298, "loss/policy_avg": -0.34320878982543945, "lr": 9.395449897750511e-06, "objective/entropy": -254.14260864257812, "objective/kl": 24.163818359375, "objective/non_score_reward": -1.20819091796875, "objective/rlhf_reward": -2.4327639102935787, "objective/scores": 0.6, "policy/approxkl_avg": 3.011139392852783, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.62109375, "step": 946, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0023422241210938 }, { "episode": 15168, "epoch": 0.09087967789481252, "loss/policy_avg": 0.08071097731590271, "lr": 9.394810838445808e-06, "objective/entropy": -269.91180419921875, "objective/kl": 29.857431411743164, "objective/non_score_reward": -1.4928715229034424, "objective/rlhf_reward": -3.8487801573434215, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.305149555206299, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 947, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971578121185303 }, { "episode": 15184, "epoch": 0.09097554253394208, "loss/policy_avg": -0.019624732434749603, "lr": 9.394171779141105e-06, "objective/entropy": -274.10198974609375, "objective/kl": 33.219993591308594, "objective/non_score_reward": -1.6609996557235718, "objective/rlhf_reward": -6.643998503684998, "objective/scores": 0.0, "policy/approxkl_avg": 4.708046913146973, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 948, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000514030456543 }, { "episode": 15200, "epoch": 0.09107140717307162, "loss/policy_avg": -0.5435956716537476, "lr": 9.393532719836402e-06, "objective/entropy": -245.58270263671875, "objective/kl": 26.876476287841797, "objective/non_score_reward": -1.3438239097595215, "objective/rlhf_reward": -3.771175924603062, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.178674697875977, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.630859375, "step": 949, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0033979415893555 }, { "episode": 15216, "epoch": 0.09116727181220118, "loss/policy_avg": 0.6083466410636902, "lr": 9.392893660531698e-06, "objective/entropy": -169.32357788085938, "objective/kl": 38.449127197265625, "objective/non_score_reward": -1.9224563837051392, "objective/rlhf_reward": -6.133565931525782, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 8.572129249572754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.685546875, "step": 950, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000035285949707 }, { "episode": 15232, "epoch": 0.09126313645133072, "loss/policy_avg": 0.1515914499759674, "lr": 9.392254601226994e-06, "objective/entropy": -181.75010681152344, "objective/kl": 31.95659637451172, "objective/non_score_reward": -1.5978299379348755, "objective/rlhf_reward": -5.04968385985437, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 12.761173248291016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.798828125, "step": 951, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977487325668335 }, { "episode": 15248, "epoch": 0.09135900109046027, "loss/policy_avg": 0.7638048529624939, "lr": 9.39161554192229e-06, "objective/entropy": -158.99050903320312, "objective/kl": 39.69103240966797, "objective/non_score_reward": -1.9845517873764038, "objective/rlhf_reward": -5.815500917212043, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.06544303894043, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.541015625, "step": 952, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981579780578613 }, { "episode": 15264, "epoch": 0.09145486572958982, "loss/policy_avg": 0.764492392539978, "lr": 9.390976482617587e-06, "objective/entropy": -159.26947021484375, "objective/kl": 28.415475845336914, "objective/non_score_reward": -1.4207737445831299, "objective/rlhf_reward": -5.683095276355743, "objective/scores": 0.0, "policy/approxkl_avg": 7.907594680786133, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6328125, "step": 953, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997883677482605 }, { "episode": 15280, "epoch": 0.09155073036871937, "loss/policy_avg": 0.3368009328842163, "lr": 9.390337423312884e-06, "objective/entropy": -173.85415649414062, "objective/kl": 35.513309478759766, "objective/non_score_reward": -1.775665521621704, "objective/rlhf_reward": -5.49854234224947, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.337751388549805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.541015625, "step": 954, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994667768478394 }, { "episode": 15296, "epoch": 0.09164659500784891, "loss/policy_avg": 0.0456845797598362, "lr": 9.389698364008181e-06, "objective/entropy": 16.725250244140625, "objective/kl": 36.44686508178711, "objective/non_score_reward": -1.822343349456787, "objective/rlhf_reward": -5.865541179378596, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 21.832763671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.548828125, "step": 955, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999651551246643 }, { "episode": 15312, "epoch": 0.09174245964697847, "loss/policy_avg": 0.0268879272043705, "lr": 9.389059304703478e-06, "objective/entropy": -219.0832977294922, "objective/kl": 25.021286010742188, "objective/non_score_reward": -1.2510643005371094, "objective/rlhf_reward": -3.1794285133209934, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 13.525361061096191, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.673828125, "step": 956, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0001580715179443 }, { "episode": 15328, "epoch": 0.09183832428610801, "loss/policy_avg": 0.25198429822921753, "lr": 9.388420245398773e-06, "objective/entropy": -216.4515838623047, "objective/kl": 29.98337173461914, "objective/non_score_reward": -1.4991683959960938, "objective/rlhf_reward": -3.0729548081171245, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.199630737304688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.634765625, "step": 957, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9965362548828125 }, { "episode": 15344, "epoch": 0.09193418892523757, "loss/policy_avg": 0.035516731441020966, "lr": 9.38778118609407e-06, "objective/entropy": -250.8704833984375, "objective/kl": 30.556961059570312, "objective/non_score_reward": -1.5278480052947998, "objective/rlhf_reward": -4.73278991231094, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.100607395172119, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.607421875, "step": 958, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005221366882324 }, { "episode": 15360, "epoch": 0.09203005356436711, "loss/policy_avg": 0.6594608426094055, "lr": 9.387142126789367e-06, "objective/entropy": -190.2021942138672, "objective/kl": 29.693756103515625, "objective/non_score_reward": -1.4846878051757812, "objective/rlhf_reward": -4.38249173661764, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 11.999906539916992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.55859375, "step": 959, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9964005947113037 }, { "episode": 15376, "epoch": 0.09212591820349667, "loss/policy_avg": 0.16847842931747437, "lr": 9.386503067484664e-06, "objective/entropy": -220.72311401367188, "objective/kl": 22.618806838989258, "objective/non_score_reward": -1.1309404373168945, "objective/rlhf_reward": -3.0731633110955805, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.5775080919265747, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.63671875, "step": 960, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0047149658203125 }, { "episode": 15392, "epoch": 0.09222178284262621, "loss/policy_avg": 0.37361010909080505, "lr": 9.38586400817996e-06, "objective/entropy": -219.60760498046875, "objective/kl": 31.668062210083008, "objective/non_score_reward": -1.58340322971344, "objective/rlhf_reward": -4.852660181935191, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.965027809143066, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.615234375, "step": 961, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9987406730651855 }, { "episode": 15408, "epoch": 0.09231764748175576, "loss/policy_avg": 0.3272181749343872, "lr": 9.385224948875256e-06, "objective/entropy": -200.26370239257812, "objective/kl": 38.33747100830078, "objective/non_score_reward": -1.916873574256897, "objective/rlhf_reward": -5.720082710461552, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.9499969482421875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.583984375, "step": 962, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983041286468506 }, { "episode": 15424, "epoch": 0.0924135121208853, "loss/policy_avg": 0.02453005313873291, "lr": 9.384585889570553e-06, "objective/entropy": -259.0159606933594, "objective/kl": 32.376686096191406, "objective/non_score_reward": -1.6188342571258545, "objective/rlhf_reward": -5.051504810054866, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.491250038146973, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.634765625, "step": 963, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0016989707946777 }, { "episode": 15440, "epoch": 0.09250937676001486, "loss/policy_avg": -0.1082817018032074, "lr": 9.38394683026585e-06, "objective/entropy": -136.52200317382812, "objective/kl": 34.37030792236328, "objective/non_score_reward": -1.718515396118164, "objective/rlhf_reward": -5.212202077329742, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.610563278198242, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.65234375, "step": 964, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993445873260498 }, { "episode": 15456, "epoch": 0.0926052413991444, "loss/policy_avg": 0.3635658025741577, "lr": 9.383307770961147e-06, "objective/entropy": -242.04705810546875, "objective/kl": 26.167871475219727, "objective/non_score_reward": -1.3083934783935547, "objective/rlhf_reward": -3.8335740923881527, "objective/scores": 0.35, "policy/approxkl_avg": 10.497917175292969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.505859375, "step": 965, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998986840248108 }, { "episode": 15472, "epoch": 0.09270110603827396, "loss/policy_avg": 0.4805383086204529, "lr": 9.382668711656443e-06, "objective/entropy": -130.80931091308594, "objective/kl": 43.840057373046875, "objective/non_score_reward": -2.192002773284912, "objective/rlhf_reward": -6.368011450767517, "objective/scores": 0.6, "policy/approxkl_avg": 1.2675271034240723, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 966, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001443862915039 }, { "episode": 15488, "epoch": 0.0927969706774035, "loss/policy_avg": 0.9434456825256348, "lr": 9.382029652351739e-06, "objective/entropy": -116.85310363769531, "objective/kl": 55.79869842529297, "objective/non_score_reward": -2.7899351119995117, "objective/rlhf_reward": -9.426406518618265, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.6991868019104004, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.791015625, "step": 967, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00309157371521 }, { "episode": 15504, "epoch": 0.09289283531653306, "loss/policy_avg": 0.2830507755279541, "lr": 9.381390593047035e-06, "objective/entropy": -260.5260925292969, "objective/kl": 34.16276550292969, "objective/non_score_reward": -1.7081382274627686, "objective/rlhf_reward": -5.381954531283721, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.792706489562988, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 968, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985530376434326 }, { "episode": 15520, "epoch": 0.0929886999556626, "loss/policy_avg": 0.19756931066513062, "lr": 9.380751533742332e-06, "objective/entropy": -234.741455078125, "objective/kl": 25.891204833984375, "objective/non_score_reward": -1.2945603132247925, "objective/rlhf_reward": -3.055534782187019, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.262695789337158, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 969, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0007271766662598 }, { "episode": 15536, "epoch": 0.09308456459479215, "loss/policy_avg": 0.0513734444975853, "lr": 9.380112474437628e-06, "objective/entropy": -195.60171508789062, "objective/kl": 35.50217819213867, "objective/non_score_reward": -1.775109052658081, "objective/rlhf_reward": -5.741186105941219, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.6989755630493164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.607421875, "step": 970, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0012567043304443 }, { "episode": 15552, "epoch": 0.0931804292339217, "loss/policy_avg": 0.1513216644525528, "lr": 9.379473415132924e-06, "objective/entropy": -245.57977294921875, "objective/kl": 23.89773941040039, "objective/non_score_reward": -1.1948869228363037, "objective/rlhf_reward": -4.7795480489730835, "objective/scores": 0.0, "policy/approxkl_avg": 6.129580020904541, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.69921875, "step": 971, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000563621520996 }, { "episode": 15568, "epoch": 0.09327629387305125, "loss/policy_avg": 0.041885554790496826, "lr": 9.378834355828221e-06, "objective/entropy": -261.82769775390625, "objective/kl": 24.18181037902832, "objective/non_score_reward": -1.2090904712677002, "objective/rlhf_reward": -3.457759955016476, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.62070369720459, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.533203125, "step": 972, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9969148635864258 }, { "episode": 15584, "epoch": 0.0933721585121808, "loss/policy_avg": 0.012015002779662609, "lr": 9.378195296523518e-06, "objective/entropy": -251.767333984375, "objective/kl": 27.563173294067383, "objective/non_score_reward": -1.378158688545227, "objective/rlhf_reward": -3.908514711920338, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.0967427492141724, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.634765625, "step": 973, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0006260871887207 }, { "episode": 15600, "epoch": 0.09346802315131035, "loss/policy_avg": -0.31819072365760803, "lr": 9.377556237218815e-06, "objective/entropy": -175.70556640625, "objective/kl": 28.285152435302734, "objective/non_score_reward": -1.4142576456069946, "objective/rlhf_reward": -4.052910540167408, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.37001371383667, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.505859375, "step": 974, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995312690734863 }, { "episode": 15616, "epoch": 0.09356388779043989, "loss/policy_avg": 0.6060304641723633, "lr": 9.37691717791411e-06, "objective/entropy": -34.974281311035156, "objective/kl": 35.56610107421875, "objective/non_score_reward": -1.7783050537109375, "objective/rlhf_reward": -5.59744867065781, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.845120906829834, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.853515625, "step": 975, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995267391204834 }, { "episode": 15632, "epoch": 0.09365975242956945, "loss/policy_avg": 0.1691616326570511, "lr": 9.376278118609407e-06, "objective/entropy": -173.51535034179688, "objective/kl": 40.181976318359375, "objective/non_score_reward": -2.009099006652832, "objective/rlhf_reward": -6.657793619719845, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.46673262119293213, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71875, "step": 976, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0020508766174316 }, { "episode": 15648, "epoch": 0.09375561706869899, "loss/policy_avg": 0.12263473123311996, "lr": 9.375639059304704e-06, "objective/entropy": -244.26974487304688, "objective/kl": 29.573442459106445, "objective/non_score_reward": -1.4786722660064697, "objective/rlhf_reward": -4.358429758754328, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.748386859893799, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.693359375, "step": 977, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9990627765655518 }, { "episode": 15664, "epoch": 0.09385148170782855, "loss/policy_avg": 1.4557695388793945, "lr": 9.375000000000001e-06, "objective/entropy": -133.55853271484375, "objective/kl": 45.2318229675293, "objective/non_score_reward": -2.2615909576416016, "objective/rlhf_reward": -7.530592167171177, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.7986626625061035, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 978, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999971389770508 }, { "episode": 15680, "epoch": 0.09394734634695809, "loss/policy_avg": 0.04724450409412384, "lr": 9.374360940695298e-06, "objective/entropy": -291.25103759765625, "objective/kl": 28.29153823852539, "objective/non_score_reward": -1.4145770072937012, "objective/rlhf_reward": -3.710896800236638, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.313387393951416, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 979, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993375539779663 }, { "episode": 15696, "epoch": 0.09404321098608764, "loss/policy_avg": 0.2293320745229721, "lr": 9.373721881390595e-06, "objective/entropy": -136.44857788085938, "objective/kl": 38.36898422241211, "objective/non_score_reward": -1.9184492826461792, "objective/rlhf_reward": -5.551090779081855, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.303453207015991, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.61328125, "step": 980, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999606966972351 }, { "episode": 15712, "epoch": 0.0941390756252172, "loss/policy_avg": 0.16989938914775848, "lr": 9.37308282208589e-06, "objective/entropy": -171.79864501953125, "objective/kl": 32.806495666503906, "objective/non_score_reward": -1.640324592590332, "objective/rlhf_reward": -4.613887022213872, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.31067180633545, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 981, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984290599822998 }, { "episode": 15728, "epoch": 0.09423494026434674, "loss/policy_avg": 0.7234645485877991, "lr": 9.372443762781187e-06, "objective/entropy": -219.93374633789062, "objective/kl": 26.91738510131836, "objective/non_score_reward": -1.3458693027496338, "objective/rlhf_reward": -0.9834773302078244, "objective/scores": 1.1, "policy/approxkl_avg": 1.4521507024765015, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.646484375, "step": 982, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003150463104248 }, { "episode": 15744, "epoch": 0.0943308049034763, "loss/policy_avg": 0.48133015632629395, "lr": 9.371804703476484e-06, "objective/entropy": -282.47552490234375, "objective/kl": 39.29179763793945, "objective/non_score_reward": -1.9645898342132568, "objective/rlhf_reward": -6.125026241938272, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.169063568115234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.720703125, "step": 983, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997374415397644 }, { "episode": 15760, "epoch": 0.09442666954260584, "loss/policy_avg": 0.1187177523970604, "lr": 9.37116564417178e-06, "objective/entropy": -158.33642578125, "objective/kl": 40.20547103881836, "objective/non_score_reward": -2.0102736949920654, "objective/rlhf_reward": -6.69945864966455, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.5165886878967285, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 984, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981753826141357 }, { "episode": 15776, "epoch": 0.0945225341817354, "loss/policy_avg": 0.16677279770374298, "lr": 9.370526584867077e-06, "objective/entropy": -162.21728515625, "objective/kl": 33.61964797973633, "objective/non_score_reward": -1.6809823513031006, "objective/rlhf_reward": -5.323929286003112, "objective/scores": 0.35, "policy/approxkl_avg": 5.913999557495117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 985, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9968831539154053 }, { "episode": 15792, "epoch": 0.09461839882086494, "loss/policy_avg": 0.22338780760765076, "lr": 9.369887525562373e-06, "objective/entropy": -191.39588928222656, "objective/kl": 50.39151382446289, "objective/non_score_reward": -2.519575595855713, "objective/rlhf_reward": -8.416443472326385, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 45.444732666015625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.755859375, "step": 986, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998542308807373 }, { "episode": 15808, "epoch": 0.0947142634599945, "loss/policy_avg": 0.37791919708251953, "lr": 9.36924846625767e-06, "objective/entropy": -270.806396484375, "objective/kl": 29.205078125, "objective/non_score_reward": -1.4602539539337158, "objective/rlhf_reward": -5.841015696525574, "objective/scores": 0.0, "policy/approxkl_avg": 8.895004272460938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 987, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979805946350098 }, { "episode": 15824, "epoch": 0.09481012809912404, "loss/policy_avg": 0.7314577102661133, "lr": 9.368609406952966e-06, "objective/entropy": -174.33633422851562, "objective/kl": 41.00555419921875, "objective/non_score_reward": -2.0502774715423584, "objective/rlhf_reward": -6.77727790613946, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.151052474975586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 988, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998262882232666 }, { "episode": 15840, "epoch": 0.09490599273825359, "loss/policy_avg": 0.1200692355632782, "lr": 9.367970347648263e-06, "objective/entropy": -259.9232177734375, "objective/kl": 32.56160354614258, "objective/non_score_reward": -1.628080129623413, "objective/rlhf_reward": -5.112320518493652, "objective/scores": 0.35, "policy/approxkl_avg": 3.3896703720092773, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.615234375, "step": 989, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0001492500305176 }, { "episode": 15856, "epoch": 0.09500185737738313, "loss/policy_avg": 0.7871278524398804, "lr": 9.367331288343558e-06, "objective/entropy": -162.90664672851562, "objective/kl": 37.55353927612305, "objective/non_score_reward": -1.8776767253875732, "objective/rlhf_reward": -6.086875279148188, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 24.93891716003418, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7890625, "step": 990, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9959328174591064 }, { "episode": 15872, "epoch": 0.09509772201651269, "loss/policy_avg": -0.12516134977340698, "lr": 9.366692229038855e-06, "objective/entropy": -238.83116149902344, "objective/kl": 37.03616714477539, "objective/non_score_reward": -1.8518084287643433, "objective/rlhf_reward": -6.047983967994137, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 15.576482772827148, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.681640625, "step": 991, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985809326171875 }, { "episode": 15888, "epoch": 0.09519358665564223, "loss/policy_avg": -0.04968651384115219, "lr": 9.366053169734152e-06, "objective/entropy": -183.43231201171875, "objective/kl": 35.40851593017578, "objective/non_score_reward": -1.77042555809021, "objective/rlhf_reward": -5.756189737349672, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 0.5774535536766052, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.583984375, "step": 992, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002680778503418 }, { "episode": 15904, "epoch": 0.09528945129477179, "loss/policy_avg": 0.009859908372163773, "lr": 9.365414110429449e-06, "objective/entropy": -14.670166015625, "objective/kl": 53.70581817626953, "objective/non_score_reward": -2.685290813446045, "objective/rlhf_reward": -8.3411630153656, "objective/scores": 0.6, "policy/approxkl_avg": 1.3184102773666382, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 993, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005533695220947 }, { "episode": 15920, "epoch": 0.09538531593390133, "loss/policy_avg": 0.3695295453071594, "lr": 9.364775051124744e-06, "objective/entropy": -288.468505859375, "objective/kl": 32.96984100341797, "objective/non_score_reward": -1.6484923362731934, "objective/rlhf_reward": -5.0781975624882545, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.1653892993927, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.642578125, "step": 994, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999734878540039 }, { "episode": 15936, "epoch": 0.09548118057303089, "loss/policy_avg": 0.3992432951927185, "lr": 9.364135991820041e-06, "objective/entropy": -231.646728515625, "objective/kl": 34.67195510864258, "objective/non_score_reward": -1.733597755432129, "objective/rlhf_reward": -5.510559280117121, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 19.767539978027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.849609375, "step": 995, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987037181854248 }, { "episode": 15952, "epoch": 0.09557704521216043, "loss/policy_avg": 0.03356311097741127, "lr": 9.363496932515338e-06, "objective/entropy": -210.72410583496094, "objective/kl": 27.1010799407959, "objective/non_score_reward": -1.3550540208816528, "objective/rlhf_reward": -3.595387215885233, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.0958271026611328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7265625, "step": 996, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994826316833496 }, { "episode": 15968, "epoch": 0.09567290985128998, "loss/policy_avg": 1.1218140125274658, "lr": 9.362857873210635e-06, "objective/entropy": -71.63316345214844, "objective/kl": 40.19666290283203, "objective/non_score_reward": -2.009833335876465, "objective/rlhf_reward": -8.03933310508728, "objective/scores": 0.0, "policy/approxkl_avg": 3.4838500022888184, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66015625, "step": 997, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991953372955322 }, { "episode": 15984, "epoch": 0.09576877449041953, "loss/policy_avg": 0.23440885543823242, "lr": 9.362218813905932e-06, "objective/entropy": -217.69229125976562, "objective/kl": 26.445728302001953, "objective/non_score_reward": -1.3222863674163818, "objective/rlhf_reward": -3.773374044688877, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 11.445338249206543, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.623046875, "step": 998, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9970027208328247 }, { "episode": 16000, "epoch": 0.09586463912954908, "loss/policy_avg": -0.3169388175010681, "lr": 9.361579754601227e-06, "objective/entropy": -116.28077697753906, "objective/kl": 44.722564697265625, "objective/non_score_reward": -2.236128091812134, "objective/rlhf_reward": -6.997101019101079, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.412589073181152, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.734375, "step": 999, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000357151031494 }, { "episode": 16016, "epoch": 0.09596050376867862, "loss/policy_avg": 0.49583154916763306, "lr": 9.360940695296524e-06, "objective/entropy": -255.0631561279297, "objective/kl": 37.207157135009766, "objective/non_score_reward": -1.8603577613830566, "objective/rlhf_reward": -4.517711792827818, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.7410383224487305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 1000, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983508586883545 }, { "episode": 16032, "epoch": 0.09605636840780818, "loss/policy_avg": 0.2908029556274414, "lr": 9.36030163599182e-06, "objective/entropy": -158.05224609375, "objective/kl": 43.559486389160156, "objective/non_score_reward": -2.177974224090576, "objective/rlhf_reward": -7.386384520560426, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.997418403625488, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49609375, "step": 1001, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000509738922119 }, { "episode": 16048, "epoch": 0.09615223304693772, "loss/policy_avg": 0.0880887508392334, "lr": 9.359662576687117e-06, "objective/entropy": -159.17636108398438, "objective/kl": 32.491432189941406, "objective/non_score_reward": -1.6245718002319336, "objective/rlhf_reward": -5.1196852708734095, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 23.146318435668945, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.615234375, "step": 1002, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992117881774902 }, { "episode": 16064, "epoch": 0.09624809768606728, "loss/policy_avg": -0.1608562171459198, "lr": 9.359023517382414e-06, "objective/entropy": 31.09607696533203, "objective/kl": 48.06477355957031, "objective/non_score_reward": -2.4032387733459473, "objective/rlhf_reward": -7.78812610653312, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.198085784912109, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.71484375, "step": 1003, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001403331756592 }, { "episode": 16080, "epoch": 0.09634396232519682, "loss/policy_avg": -0.09791003167629242, "lr": 9.358384458077711e-06, "objective/entropy": -204.42648315429688, "objective/kl": 32.63614273071289, "objective/non_score_reward": -1.6318070888519287, "objective/rlhf_reward": -5.127228474617004, "objective/scores": 0.35, "policy/approxkl_avg": 3.644939422607422, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.623046875, "step": 1004, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0014867782592773 }, { "episode": 16096, "epoch": 0.09643982696432638, "loss/policy_avg": 0.3904947340488434, "lr": 9.357745398773006e-06, "objective/entropy": -230.99227905273438, "objective/kl": 26.775943756103516, "objective/non_score_reward": -1.3387972116470337, "objective/rlhf_reward": -3.955188965797424, "objective/scores": 0.35, "policy/approxkl_avg": 6.282003402709961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.859375, "step": 1005, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999232292175293 }, { "episode": 16112, "epoch": 0.09653569160345592, "loss/policy_avg": 0.7725321054458618, "lr": 9.357106339468303e-06, "objective/entropy": -164.7260284423828, "objective/kl": 36.20423889160156, "objective/non_score_reward": -1.8102120161056519, "objective/rlhf_reward": -5.416019315990518, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.2319459915161133, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69921875, "step": 1006, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000358819961548 }, { "episode": 16128, "epoch": 0.09663155624258547, "loss/policy_avg": 0.4622969627380371, "lr": 9.3564672801636e-06, "objective/entropy": -133.11448669433594, "objective/kl": 46.60032272338867, "objective/non_score_reward": -2.3300158977508545, "objective/rlhf_reward": -7.920063829421997, "objective/scores": 0.35, "policy/approxkl_avg": 4.947162628173828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.63671875, "step": 1007, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9968760013580322 }, { "episode": 16144, "epoch": 0.09672742088171501, "loss/policy_avg": 0.28032606840133667, "lr": 9.355828220858897e-06, "objective/entropy": -185.09371948242188, "objective/kl": 38.272674560546875, "objective/non_score_reward": -1.9136335849761963, "objective/rlhf_reward": -6.275932290641171, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.263652801513672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.650390625, "step": 1008, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0010600090026855 }, { "episode": 16160, "epoch": 0.09682328552084457, "loss/policy_avg": 0.18294349312782288, "lr": 9.355189161554194e-06, "objective/entropy": -147.19964599609375, "objective/kl": 32.98589324951172, "objective/non_score_reward": -1.6492946147918701, "objective/rlhf_reward": -4.935319071233856, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.73829460144043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.67578125, "step": 1009, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979850053787231 }, { "episode": 16176, "epoch": 0.09691915015997411, "loss/policy_avg": -0.004333788529038429, "lr": 9.35455010224949e-06, "objective/entropy": -197.96774291992188, "objective/kl": 37.333194732666016, "objective/non_score_reward": -1.8666596412658691, "objective/rlhf_reward": -4.5429196699869365, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.3020401000976562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8203125, "step": 1010, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9998223781585693 }, { "episode": 16192, "epoch": 0.09701501479910367, "loss/policy_avg": -0.052329957485198975, "lr": 9.353911042944786e-06, "objective/entropy": -197.37957763671875, "objective/kl": 30.12477684020996, "objective/non_score_reward": -1.5062386989593506, "objective/rlhf_reward": -4.077543805317815, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.2824825048446655, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.619140625, "step": 1011, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000406265258789 }, { "episode": 16208, "epoch": 0.09711087943823321, "loss/policy_avg": -0.058374106884002686, "lr": 9.353271983640083e-06, "objective/entropy": -196.46224975585938, "objective/kl": 28.03622817993164, "objective/non_score_reward": -1.4018113613128662, "objective/rlhf_reward": -4.126293065960764, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.1209321022033691, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.57421875, "step": 1012, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001298427581787 }, { "episode": 16224, "epoch": 0.09720674407736277, "loss/policy_avg": 0.7006990909576416, "lr": 9.352632924335378e-06, "objective/entropy": -285.3323974609375, "objective/kl": 28.77189826965332, "objective/non_score_reward": -1.4385948181152344, "objective/rlhf_reward": -4.4288665390311905, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.1271591186523438, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.693359375, "step": 1013, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002638339996338 }, { "episode": 16240, "epoch": 0.09730260871649231, "loss/policy_avg": 0.07051658630371094, "lr": 9.351993865030675e-06, "objective/entropy": -198.2432098388672, "objective/kl": 24.557363510131836, "objective/non_score_reward": -1.2278680801391602, "objective/rlhf_reward": -3.5859598255454728, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 29.07752227783203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 1014, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990439414978027 }, { "episode": 16256, "epoch": 0.09739847335562186, "loss/policy_avg": -0.5166081190109253, "lr": 9.351354805725972e-06, "objective/entropy": -63.29674530029297, "objective/kl": 38.85722351074219, "objective/non_score_reward": -1.9428613185882568, "objective/rlhf_reward": -6.167325112883168, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.718572616577148, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.61328125, "step": 1015, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0040817260742188 }, { "episode": 16272, "epoch": 0.0974943379947514, "loss/policy_avg": 0.462972491979599, "lr": 9.350715746421269e-06, "objective/entropy": -214.515380859375, "objective/kl": 33.796573638916016, "objective/non_score_reward": -1.689828634262085, "objective/rlhf_reward": -5.203055350986078, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.360330581665039, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.79296875, "step": 1016, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.996612310409546 }, { "episode": 16288, "epoch": 0.09759020263388096, "loss/policy_avg": -0.1453489363193512, "lr": 9.350076687116566e-06, "objective/entropy": -235.11651611328125, "objective/kl": 33.26921081542969, "objective/non_score_reward": -1.663460612297058, "objective/rlhf_reward": -4.7064312202500656, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.160917282104492, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.703125, "step": 1017, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000718593597412 }, { "episode": 16304, "epoch": 0.0976860672730105, "loss/policy_avg": 0.19937211275100708, "lr": 9.34943762781186e-06, "objective/entropy": -255.98963928222656, "objective/kl": 37.99565887451172, "objective/non_score_reward": -1.8997828960418701, "objective/rlhf_reward": -6.257495990305571, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 18.184246063232422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 1018, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998490810394287 }, { "episode": 16320, "epoch": 0.09778193191214006, "loss/policy_avg": -0.04537857323884964, "lr": 9.348798568507158e-06, "objective/entropy": -208.28750610351562, "objective/kl": 29.751262664794922, "objective/non_score_reward": -1.4875633716583252, "objective/rlhf_reward": -4.002841900067265, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 35.739540100097656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.787109375, "step": 1019, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980144500732422 }, { "episode": 16336, "epoch": 0.0978777965512696, "loss/policy_avg": 0.15292394161224365, "lr": 9.348159509202455e-06, "objective/entropy": -234.64700317382812, "objective/kl": 29.85890769958496, "objective/non_score_reward": -1.4929454326629639, "objective/rlhf_reward": -4.3676616287866405, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.772150993347168, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.677734375, "step": 1020, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000951290130615 }, { "episode": 16352, "epoch": 0.09797366119039916, "loss/policy_avg": 0.3814322352409363, "lr": 9.347520449897751e-06, "objective/entropy": -124.42337799072266, "objective/kl": 36.442901611328125, "objective/non_score_reward": -1.8221449851989746, "objective/rlhf_reward": -5.8885798215866085, "objective/scores": 0.35, "policy/approxkl_avg": 5.533565998077393, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.68359375, "step": 1021, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9961100816726685 }, { "episode": 16368, "epoch": 0.0980695258295287, "loss/policy_avg": 0.4999345541000366, "lr": 9.346881390593048e-06, "objective/entropy": -192.25704956054688, "objective/kl": 24.090442657470703, "objective/non_score_reward": -1.2045221328735352, "objective/rlhf_reward": -3.3023168084942665, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 16.40319061279297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.556640625, "step": 1022, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997682809829712 }, { "episode": 16384, "epoch": 0.09816539046865826, "loss/policy_avg": 0.22556136548519135, "lr": 9.346242331288345e-06, "objective/entropy": -280.6515197753906, "objective/kl": 30.555099487304688, "objective/non_score_reward": -1.5277550220489502, "objective/rlhf_reward": -4.506900105539875, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.8321056365966797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.650390625, "step": 1023, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9995068311691284 }, { "episode": 16400, "epoch": 0.0982612551077878, "loss/policy_avg": 0.1927730292081833, "lr": 9.34560327198364e-06, "objective/entropy": -114.62777709960938, "objective/kl": 41.009063720703125, "objective/non_score_reward": -2.0504534244537354, "objective/rlhf_reward": -6.5399538926488034, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 12.904714584350586, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.55859375, "step": 1024, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970085620880127 }, { "episode": 16416, "epoch": 0.09835711974691735, "loss/policy_avg": 0.004962563514709473, "lr": 9.344964212678937e-06, "objective/entropy": -175.405029296875, "objective/kl": 32.8451042175293, "objective/non_score_reward": -1.6422550678253174, "objective/rlhf_reward": -4.835687295595805, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.176795244216919, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.763671875, "step": 1025, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991182088851929 }, { "episode": 16432, "epoch": 0.0984529843860469, "loss/policy_avg": 0.3356385827064514, "lr": 9.344325153374234e-06, "objective/entropy": -179.56375122070312, "objective/kl": 44.559669494628906, "objective/non_score_reward": -2.2279834747314453, "objective/rlhf_reward": -7.1786006848017365, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.5793884992599487, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 1026, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000429153442383 }, { "episode": 16448, "epoch": 0.09854884902517645, "loss/policy_avg": 0.062264252454042435, "lr": 9.343686094069531e-06, "objective/entropy": -124.67230224609375, "objective/kl": 32.24571228027344, "objective/non_score_reward": -1.6122857332229614, "objective/rlhf_reward": -4.049142932891845, "objective/scores": 0.6, "policy/approxkl_avg": 4.209178924560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.68359375, "step": 1027, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998481273651123 }, { "episode": 16464, "epoch": 0.098644713664306, "loss/policy_avg": 0.27750128507614136, "lr": 9.343047034764828e-06, "objective/entropy": -280.3656005859375, "objective/kl": 36.0235710144043, "objective/non_score_reward": -1.8011784553527832, "objective/rlhf_reward": -5.863078525572448, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 9.040508270263672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.705078125, "step": 1028, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9958012104034424 }, { "episode": 16480, "epoch": 0.09874057830343555, "loss/policy_avg": -0.08439403772354126, "lr": 9.342407975460123e-06, "objective/entropy": -159.83497619628906, "objective/kl": 42.88642120361328, "objective/non_score_reward": -2.1443209648132324, "objective/rlhf_reward": -7.1266858383134455, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.443965911865234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.701171875, "step": 1029, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003908157348633 }, { "episode": 16496, "epoch": 0.09883644294256509, "loss/policy_avg": 0.6222244501113892, "lr": 9.34176891615542e-06, "objective/entropy": -148.41481018066406, "objective/kl": 38.87040710449219, "objective/non_score_reward": -1.943520188331604, "objective/rlhf_reward": -6.258308732303318, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 28.20026397705078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.87109375, "step": 1030, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9953274726867676 }, { "episode": 16512, "epoch": 0.09893230758169465, "loss/policy_avg": 0.04845335707068443, "lr": 9.341129856850717e-06, "objective/entropy": -236.35935974121094, "objective/kl": 28.790306091308594, "objective/non_score_reward": -1.4395153522491455, "objective/rlhf_reward": -3.8106498820351913, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.143889427185059, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.642578125, "step": 1031, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9981472492218018 }, { "episode": 16528, "epoch": 0.09902817222082419, "loss/policy_avg": 0.1800106167793274, "lr": 9.340490797546014e-06, "objective/entropy": -234.52456665039062, "objective/kl": 38.6103515625, "objective/non_score_reward": -1.9305176734924316, "objective/rlhf_reward": -7.722070813179016, "objective/scores": 0.0, "policy/approxkl_avg": 2.025315761566162, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.58984375, "step": 1032, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001583099365234 }, { "episode": 16544, "epoch": 0.09912403685995375, "loss/policy_avg": 0.1573864221572876, "lr": 9.33985173824131e-06, "objective/entropy": -206.30435180664062, "objective/kl": 29.538883209228516, "objective/non_score_reward": -1.4769442081451416, "objective/rlhf_reward": -3.507777070999145, "objective/scores": 0.6, "policy/approxkl_avg": 3.956908702850342, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7734375, "step": 1033, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989354610443115 }, { "episode": 16560, "epoch": 0.09921990149908329, "loss/policy_avg": 0.3316153883934021, "lr": 9.339212678936606e-06, "objective/entropy": -158.2957763671875, "objective/kl": 27.869169235229492, "objective/non_score_reward": -1.393458366394043, "objective/rlhf_reward": -5.5738338232040405, "objective/scores": 0.0, "policy/approxkl_avg": 2.423194169998169, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.60546875, "step": 1034, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998276710510254 }, { "episode": 16576, "epoch": 0.09931576613821284, "loss/policy_avg": 0.540399432182312, "lr": 9.338573619631903e-06, "objective/entropy": -278.6914367675781, "objective/kl": 24.516807556152344, "objective/non_score_reward": -1.2258403301239014, "objective/rlhf_reward": -3.4795294596749224, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.2752022743225098, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 1035, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993618726730347 }, { "episode": 16592, "epoch": 0.09941163077734239, "loss/policy_avg": 0.17466121912002563, "lr": 9.3379345603272e-06, "objective/entropy": -273.7776794433594, "objective/kl": 35.438560485839844, "objective/non_score_reward": -1.7719281911849976, "objective/rlhf_reward": -5.571941101344761, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 19.215896606445312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.650390625, "step": 1036, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990078210830688 }, { "episode": 16608, "epoch": 0.09950749541647194, "loss/policy_avg": 0.6281372308731079, "lr": 9.337295501022495e-06, "objective/entropy": -54.27313232421875, "objective/kl": 45.946815490722656, "objective/non_score_reward": -2.2973408699035645, "objective/rlhf_reward": -7.6735920546376075, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 10.886024475097656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.931640625, "step": 1037, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997375011444092 }, { "episode": 16624, "epoch": 0.0996033600556015, "loss/policy_avg": 0.5044693350791931, "lr": 9.336656441717792e-06, "objective/entropy": -51.8316650390625, "objective/kl": 34.80516815185547, "objective/non_score_reward": -1.7402584552764893, "objective/rlhf_reward": -5.5104356213525385, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.0943219661712646, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.837890625, "step": 1038, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017142295837402 }, { "episode": 16640, "epoch": 0.09969922469473104, "loss/policy_avg": 0.050643354654312134, "lr": 9.336017382413088e-06, "objective/entropy": -289.61761474609375, "objective/kl": 35.579490661621094, "objective/non_score_reward": -1.7789745330810547, "objective/rlhf_reward": -5.559638767448023, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 29.854312896728516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.71484375, "step": 1039, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99898362159729 }, { "episode": 16656, "epoch": 0.0997950893338606, "loss/policy_avg": 0.66060471534729, "lr": 9.335378323108385e-06, "objective/entropy": -253.1927490234375, "objective/kl": 31.551429748535156, "objective/non_score_reward": -1.5775716304779053, "objective/rlhf_reward": -4.9847734308540055, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 74.64668273925781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.619140625, "step": 1040, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998822569847107 }, { "episode": 16672, "epoch": 0.09989095397299014, "loss/policy_avg": 0.9751706123352051, "lr": 9.334739263803682e-06, "objective/entropy": -148.04188537597656, "objective/kl": 32.937591552734375, "objective/non_score_reward": -1.6468796730041504, "objective/rlhf_reward": -6.587518572807312, "objective/scores": 0.0, "policy/approxkl_avg": 6.001709461212158, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.728515625, "step": 1041, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000730037689209 }, { "episode": 16688, "epoch": 0.0999868186121197, "loss/policy_avg": 0.37717461585998535, "lr": 9.334100204498977e-06, "objective/entropy": -37.40810012817383, "objective/kl": 31.557598114013672, "objective/non_score_reward": -1.5778799057006836, "objective/rlhf_reward": -4.364108632283147, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 75.23666381835938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.60546875, "step": 1042, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990663528442383 }, { "episode": 16704, "epoch": 0.10008268325124924, "loss/policy_avg": 0.21707114577293396, "lr": 9.333461145194274e-06, "objective/entropy": -185.875732421875, "objective/kl": 31.279882431030273, "objective/non_score_reward": -1.563994288444519, "objective/rlhf_reward": -4.133270683065925, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 13.107833862304688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 1043, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998514175415039 }, { "episode": 16720, "epoch": 0.10017854789037879, "loss/policy_avg": 0.19673524796962738, "lr": 9.332822085889571e-06, "objective/entropy": -271.62109375, "objective/kl": 31.95672607421875, "objective/non_score_reward": -1.5978362560272217, "objective/rlhf_reward": -1.9913449048995968, "objective/scores": 1.1, "policy/approxkl_avg": 8.022303581237793, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.669921875, "step": 1044, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9962241649627686 }, { "episode": 16736, "epoch": 0.10027441252950833, "loss/policy_avg": 0.36011672019958496, "lr": 9.332183026584868e-06, "objective/entropy": -189.5650634765625, "objective/kl": 27.331592559814453, "objective/non_score_reward": -1.3665797710418701, "objective/rlhf_reward": -3.34361249424604, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 12.111129760742188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7890625, "step": 1045, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9945616722106934 }, { "episode": 16752, "epoch": 0.10037027716863789, "loss/policy_avg": 0.24991941452026367, "lr": 9.331543967280165e-06, "objective/entropy": -269.1661682128906, "objective/kl": 29.150144577026367, "objective/non_score_reward": -1.4575071334838867, "objective/rlhf_reward": -4.314256751331028, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 39.73731231689453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.591796875, "step": 1046, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9968986511230469 }, { "episode": 16768, "epoch": 0.10046614180776743, "loss/policy_avg": 0.018538065254688263, "lr": 9.330904907975462e-06, "objective/entropy": -128.5980224609375, "objective/kl": 42.25013732910156, "objective/non_score_reward": -2.112506866455078, "objective/rlhf_reward": -6.050027823448181, "objective/scores": 0.6, "policy/approxkl_avg": 1.4199237823486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 1047, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000657320022583 }, { "episode": 16784, "epoch": 0.10056200644689699, "loss/policy_avg": 0.35199424624443054, "lr": 9.330265848670757e-06, "objective/entropy": -282.9249572753906, "objective/kl": 34.62944793701172, "objective/non_score_reward": -1.7314722537994385, "objective/rlhf_reward": -4.002170358539793, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.832670211791992, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 1048, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9977006912231445 }, { "episode": 16800, "epoch": 0.10065787108602653, "loss/policy_avg": -0.12381379306316376, "lr": 9.329626789366054e-06, "objective/entropy": -177.63133239746094, "objective/kl": 29.458477020263672, "objective/non_score_reward": -1.472923755645752, "objective/rlhf_reward": -4.229835753858673, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.280195713043213, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.638671875, "step": 1049, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002951145172119 }, { "episode": 16816, "epoch": 0.10075373572515609, "loss/policy_avg": 0.06033053621649742, "lr": 9.32898773006135e-06, "objective/entropy": -229.76272583007812, "objective/kl": 25.89266586303711, "objective/non_score_reward": -1.294633388519287, "objective/rlhf_reward": -3.7547014548378863, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.814189910888672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6640625, "step": 1050, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981303215026855 }, { "episode": 16832, "epoch": 0.10084960036428563, "loss/policy_avg": -0.14406134188175201, "lr": 9.328348670756648e-06, "objective/entropy": -121.60057067871094, "objective/kl": 34.72946548461914, "objective/non_score_reward": -1.7364733219146729, "objective/rlhf_reward": -5.522061069210139, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.643096446990967, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 1051, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0071685314178467 }, { "episode": 16848, "epoch": 0.10094546500341518, "loss/policy_avg": 0.3516131639480591, "lr": 9.327709611451944e-06, "objective/entropy": -290.5709228515625, "objective/kl": 32.417964935302734, "objective/non_score_reward": -1.6208982467651367, "objective/rlhf_reward": -2.0835929870605465, "objective/scores": 1.1, "policy/approxkl_avg": 106.68559265136719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.578125, "step": 1052, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990932941436768 }, { "episode": 16864, "epoch": 0.10104132964254472, "loss/policy_avg": -0.2397887408733368, "lr": 9.32707055214724e-06, "objective/entropy": -130.25076293945312, "objective/kl": 37.00995635986328, "objective/non_score_reward": -1.850497841835022, "objective/rlhf_reward": -5.978159268100825, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.758305072784424, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6875, "step": 1053, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002975463867188 }, { "episode": 16880, "epoch": 0.10113719428167428, "loss/policy_avg": 0.07710824906826019, "lr": 9.326431492842537e-06, "objective/entropy": -265.08575439453125, "objective/kl": 30.579792022705078, "objective/non_score_reward": -1.528989553451538, "objective/rlhf_reward": -3.192239318729612, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.1249363422393799, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.720703125, "step": 1054, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.99985933303833 }, { "episode": 16896, "epoch": 0.10123305892080382, "loss/policy_avg": 0.5552304983139038, "lr": 9.325792433537833e-06, "objective/entropy": -214.11900329589844, "objective/kl": 49.237579345703125, "objective/non_score_reward": -2.461879014968872, "objective/rlhf_reward": -8.447516059875488, "objective/scores": 0.35, "policy/approxkl_avg": 28.872817993164062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66796875, "step": 1055, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9940861463546753 }, { "episode": 16912, "epoch": 0.10132892355993338, "loss/policy_avg": 0.4369004964828491, "lr": 9.325153374233129e-06, "objective/entropy": -218.92349243164062, "objective/kl": 31.91252899169922, "objective/non_score_reward": -1.5956264734268188, "objective/rlhf_reward": -4.435094545559819, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 20.476360321044922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 1056, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99814772605896 }, { "episode": 16928, "epoch": 0.10142478819906292, "loss/policy_avg": 0.11664807796478271, "lr": 9.324514314928425e-06, "objective/entropy": -241.1952667236328, "objective/kl": 33.52198791503906, "objective/non_score_reward": -1.6760993003845215, "objective/rlhf_reward": -4.971064166227976, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.343099594116211, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.822265625, "step": 1057, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999535083770752 }, { "episode": 16944, "epoch": 0.10152065283819248, "loss/policy_avg": -0.033681720495224, "lr": 9.323875255623722e-06, "objective/entropy": -244.3253173828125, "objective/kl": 26.85427474975586, "objective/non_score_reward": -1.3427138328552246, "objective/rlhf_reward": -3.7667349911371044, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.019390106201172, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.677734375, "step": 1058, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000871419906616 }, { "episode": 16960, "epoch": 0.10161651747732202, "loss/policy_avg": -0.006691465154290199, "lr": 9.32323619631902e-06, "objective/entropy": -193.07406616210938, "objective/kl": 22.30344009399414, "objective/non_score_reward": -1.115172028541565, "objective/rlhf_reward": -2.9044288088947083, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.373213768005371, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.734375, "step": 1059, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997581243515015 }, { "episode": 16976, "epoch": 0.10171238211645157, "loss/policy_avg": 0.03293745219707489, "lr": 9.322597137014316e-06, "objective/entropy": -276.60870361328125, "objective/kl": 35.162376403808594, "objective/non_score_reward": -1.7581188678741455, "objective/rlhf_reward": -5.632475113868713, "objective/scores": 0.35, "policy/approxkl_avg": 6.92661714553833, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 1060, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994102716445923 }, { "episode": 16992, "epoch": 0.10180824675558112, "loss/policy_avg": 0.009452302008867264, "lr": 9.321958077709611e-06, "objective/entropy": -167.18348693847656, "objective/kl": 33.525054931640625, "objective/non_score_reward": -1.676252841949463, "objective/rlhf_reward": -5.043151860654937, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 0.7187179923057556, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.98828125, "step": 1061, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002134323120117 }, { "episode": 17008, "epoch": 0.10190411139471067, "loss/policy_avg": 0.2391328066587448, "lr": 9.321319018404908e-06, "objective/entropy": -251.56936645507812, "objective/kl": 31.454349517822266, "objective/non_score_reward": -1.5727174282073975, "objective/rlhf_reward": -4.466040725978922, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.082510471343994, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.634765625, "step": 1062, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0036981105804443 }, { "episode": 17024, "epoch": 0.10199997603384021, "loss/policy_avg": 0.2995299696922302, "lr": 9.320679959100205e-06, "objective/entropy": -240.9496307373047, "objective/kl": 36.60504913330078, "objective/non_score_reward": -1.8302524089813232, "objective/rlhf_reward": -5.716889891687947, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.9207489490509033, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60546875, "step": 1063, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988347291946411 }, { "episode": 17040, "epoch": 0.10209584067296977, "loss/policy_avg": 0.14015616476535797, "lr": 9.320040899795502e-06, "objective/entropy": -262.3077392578125, "objective/kl": 22.77030372619629, "objective/non_score_reward": -1.1385152339935303, "objective/rlhf_reward": -3.2124252229029233, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.2123262882232666, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 1064, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995969533920288 }, { "episode": 17056, "epoch": 0.10219170531209931, "loss/policy_avg": 0.14029760658740997, "lr": 9.319401840490799e-06, "objective/entropy": -303.0190734863281, "objective/kl": 25.82904815673828, "objective/non_score_reward": -1.2914522886276245, "objective/rlhf_reward": -3.609549908843592, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.374150276184082, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.634765625, "step": 1065, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9978749752044678 }, { "episode": 17072, "epoch": 0.10228756995122887, "loss/policy_avg": 0.3477242588996887, "lr": 9.318762781186094e-06, "objective/entropy": -70.10704040527344, "objective/kl": 36.12684631347656, "objective/non_score_reward": -1.806342363357544, "objective/rlhf_reward": -5.883733919172911, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.796685695648193, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.560546875, "step": 1066, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998852014541626 }, { "episode": 17088, "epoch": 0.10238343459035841, "loss/policy_avg": 0.07034695893526077, "lr": 9.318123721881391e-06, "objective/entropy": -297.8764343261719, "objective/kl": 27.875173568725586, "objective/non_score_reward": -1.3937586545944214, "objective/rlhf_reward": -5.575034737586975, "objective/scores": 0.0, "policy/approxkl_avg": 2.2109901905059814, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 1067, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.002016305923462 }, { "episode": 17104, "epoch": 0.10247929922948797, "loss/policy_avg": 1.4407649040222168, "lr": 9.317484662576688e-06, "objective/entropy": -241.74539184570312, "objective/kl": 19.868005752563477, "objective/non_score_reward": -0.9934003353118896, "objective/rlhf_reward": -2.369481120173054, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.7092839479446411, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.673828125, "step": 1068, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.004781723022461 }, { "episode": 17120, "epoch": 0.10257516386861751, "loss/policy_avg": 0.2252398431301117, "lr": 9.316845603271985e-06, "objective/entropy": -238.30023193359375, "objective/kl": 36.790252685546875, "objective/non_score_reward": -1.839512586593628, "objective/rlhf_reward": -5.410638998227055, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.8241536617279053, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.71875, "step": 1069, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990484714508057 }, { "episode": 17136, "epoch": 0.10267102850774706, "loss/policy_avg": 0.2009587585926056, "lr": 9.316206543967282e-06, "objective/entropy": -281.51422119140625, "objective/kl": 31.799592971801758, "objective/non_score_reward": -1.589979648590088, "objective/rlhf_reward": -3.9599182963371273, "objective/scores": 0.6, "policy/approxkl_avg": 11.409127235412598, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73046875, "step": 1070, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992220401763916 }, { "episode": 17152, "epoch": 0.1027668931468766, "loss/policy_avg": 0.07947662472724915, "lr": 9.315567484662578e-06, "objective/entropy": -224.4807891845703, "objective/kl": 26.412246704101562, "objective/non_score_reward": -1.3206123113632202, "objective/rlhf_reward": -3.1597432515778876, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 0.5046712756156921, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.568359375, "step": 1071, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00076961517334 }, { "episode": 17168, "epoch": 0.10286275778600616, "loss/policy_avg": 0.06411048024892807, "lr": 9.314928425357874e-06, "objective/entropy": -184.87181091308594, "objective/kl": 18.737346649169922, "objective/non_score_reward": -0.9368672370910645, "objective/rlhf_reward": -1.6247628948846198, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.1642158031463623, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.650390625, "step": 1072, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995265007019043 }, { "episode": 17184, "epoch": 0.1029586224251357, "loss/policy_avg": 0.12491060793399811, "lr": 9.31428936605317e-06, "objective/entropy": -264.9185791015625, "objective/kl": 33.87244415283203, "objective/non_score_reward": -1.693622350692749, "objective/rlhf_reward": -4.651782932058845, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.8209168910980225, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.638671875, "step": 1073, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998373985290527 }, { "episode": 17200, "epoch": 0.10305448706426526, "loss/policy_avg": 0.18550439178943634, "lr": 9.313650306748467e-06, "objective/entropy": -263.8056335449219, "objective/kl": 32.30176544189453, "objective/non_score_reward": -1.6150879859924316, "objective/rlhf_reward": -4.060352301597595, "objective/scores": 0.6, "policy/approxkl_avg": 9.517640113830566, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 1074, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998751163482666 }, { "episode": 17216, "epoch": 0.1031503517033948, "loss/policy_avg": 0.03002159669995308, "lr": 9.313011247443764e-06, "objective/entropy": -127.8392562866211, "objective/kl": 34.593231201171875, "objective/non_score_reward": -1.729661464691162, "objective/rlhf_reward": -4.518645679950714, "objective/scores": 0.6, "policy/approxkl_avg": 8.971546173095703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4599609375, "step": 1075, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981253147125244 }, { "episode": 17232, "epoch": 0.10324621634252436, "loss/policy_avg": 0.13241755962371826, "lr": 9.312372188139061e-06, "objective/entropy": -202.40301513671875, "objective/kl": 18.52395248413086, "objective/non_score_reward": -0.9261976480484009, "objective/rlhf_reward": -2.2238379148796796, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 9.288294792175293, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8125, "step": 1076, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001105785369873 }, { "episode": 17248, "epoch": 0.1033420809816539, "loss/policy_avg": 1.6102979183197021, "lr": 9.311733128834356e-06, "objective/entropy": -234.32969665527344, "objective/kl": 31.251758575439453, "objective/non_score_reward": -1.5625879764556885, "objective/rlhf_reward": -4.3029405576752975, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 20.491464614868164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.724609375, "step": 1077, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996706485748291 }, { "episode": 17264, "epoch": 0.10343794562078346, "loss/policy_avg": -0.1527136266231537, "lr": 9.311094069529653e-06, "objective/entropy": -268.0172119140625, "objective/kl": 27.41750144958496, "objective/non_score_reward": -1.3708750009536743, "objective/rlhf_reward": -1.083500242233276, "objective/scores": 1.1, "policy/approxkl_avg": 2.5196101665496826, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.677734375, "step": 1078, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0008769035339355 }, { "episode": 17280, "epoch": 0.103533810259913, "loss/policy_avg": 0.062209486961364746, "lr": 9.310455010224948e-06, "objective/entropy": -160.53085327148438, "objective/kl": 35.78590774536133, "objective/non_score_reward": -1.7892953157424927, "objective/rlhf_reward": -5.676228764469981, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.662154674530029, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 1079, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9969273805618286 }, { "episode": 17296, "epoch": 0.10362967489904255, "loss/policy_avg": 0.8675416707992554, "lr": 9.309815950920245e-06, "objective/entropy": -288.6915283203125, "objective/kl": 25.7120418548584, "objective/non_score_reward": -1.28560209274292, "objective/rlhf_reward": -3.1949969632195785, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.7964463233947754, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.748046875, "step": 1080, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999279260635376 }, { "episode": 17312, "epoch": 0.1037255395381721, "loss/policy_avg": 0.022417806088924408, "lr": 9.309176891615542e-06, "objective/entropy": -234.59405517578125, "objective/kl": 29.527116775512695, "objective/non_score_reward": -1.476355791091919, "objective/rlhf_reward": -4.2435640148526295, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.2056889533996582, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 1081, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0002384185791016 }, { "episode": 17328, "epoch": 0.10382140417730165, "loss/policy_avg": 1.0629796981811523, "lr": 9.308537832310839e-06, "objective/entropy": -235.58709716796875, "objective/kl": 24.657703399658203, "objective/non_score_reward": -1.2328851222991943, "objective/rlhf_reward": -3.531540727615356, "objective/scores": 0.35, "policy/approxkl_avg": 5.628866195678711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 1082, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997105598449707 }, { "episode": 17344, "epoch": 0.1039172688164312, "loss/policy_avg": 0.43491989374160767, "lr": 9.307898773006136e-06, "objective/entropy": -116.438232421875, "objective/kl": 31.854278564453125, "objective/non_score_reward": -1.5927139520645142, "objective/rlhf_reward": -4.970855867862701, "objective/scores": 0.35, "policy/approxkl_avg": 11.138096809387207, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.716796875, "step": 1083, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005722045898438 }, { "episode": 17360, "epoch": 0.10401313345556075, "loss/policy_avg": 1.154296636581421, "lr": 9.307259713701433e-06, "objective/entropy": -104.04910278320312, "objective/kl": 33.66610336303711, "objective/non_score_reward": -1.6833051443099976, "objective/rlhf_reward": -3.8095016225588054, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 24.187870025634766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 1084, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990277290344238 }, { "episode": 17376, "epoch": 0.10410899809469029, "loss/policy_avg": 2.80964732170105, "lr": 9.306620654396728e-06, "objective/entropy": -223.38082885742188, "objective/kl": 42.09947967529297, "objective/non_score_reward": -2.1049740314483643, "objective/rlhf_reward": -6.47248501606458, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.849597454071045, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.630859375, "step": 1085, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001406192779541 }, { "episode": 17392, "epoch": 0.10420486273381985, "loss/policy_avg": 0.4371190667152405, "lr": 9.305981595092025e-06, "objective/entropy": -209.35194396972656, "objective/kl": 23.755962371826172, "objective/non_score_reward": -1.187798023223877, "objective/rlhf_reward": -3.1470724082628063, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.6081452369689941, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 1086, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9983155727386475 }, { "episode": 17408, "epoch": 0.10430072737294939, "loss/policy_avg": 0.27756333351135254, "lr": 9.305342535787322e-06, "objective/entropy": -262.8760986328125, "objective/kl": 32.76499938964844, "objective/non_score_reward": -1.6382498741149902, "objective/rlhf_reward": -4.948879156176167, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 24.257652282714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.61328125, "step": 1087, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991706609725952 }, { "episode": 17424, "epoch": 0.10439659201207895, "loss/policy_avg": -0.05298028513789177, "lr": 9.304703476482619e-06, "objective/entropy": -69.1202163696289, "objective/kl": 30.052305221557617, "objective/non_score_reward": -1.5026153326034546, "objective/rlhf_reward": -6.010461330413818, "objective/scores": 0.0, "policy/approxkl_avg": 2.539027214050293, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4375, "step": 1088, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001836061477661 }, { "episode": 17440, "epoch": 0.10449245665120849, "loss/policy_avg": 0.7193084955215454, "lr": 9.304064417177915e-06, "objective/entropy": -143.99217224121094, "objective/kl": 29.456846237182617, "objective/non_score_reward": -1.4728422164916992, "objective/rlhf_reward": -4.229509656847107, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.5728912353515625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6484375, "step": 1089, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997127890586853 }, { "episode": 17456, "epoch": 0.10458832129033804, "loss/policy_avg": 0.17522019147872925, "lr": 9.30342535787321e-06, "objective/entropy": -233.08404541015625, "objective/kl": 32.47724914550781, "objective/non_score_reward": -1.6238625049591064, "objective/rlhf_reward": -5.116847612944943, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 33.11177444458008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 1090, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9972889423370361 }, { "episode": 17472, "epoch": 0.10468418592946759, "loss/policy_avg": 0.15333101153373718, "lr": 9.302786298568508e-06, "objective/entropy": -160.20663452148438, "objective/kl": 36.02931594848633, "objective/non_score_reward": -1.801465630531311, "objective/rlhf_reward": -5.690090739520725, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 9.341711044311523, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.564453125, "step": 1091, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000162124633789 }, { "episode": 17488, "epoch": 0.10478005056859714, "loss/policy_avg": 0.13975301384925842, "lr": 9.302147239263804e-06, "objective/entropy": -148.38388061523438, "objective/kl": 37.94308853149414, "objective/non_score_reward": -1.8971545696258545, "objective/rlhf_reward": -5.76378917244346, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 20.583585739135742, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6875, "step": 1092, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984629154205322 }, { "episode": 17504, "epoch": 0.10487591520772668, "loss/policy_avg": 0.06423387676477432, "lr": 9.301508179959101e-06, "objective/entropy": -251.20310974121094, "objective/kl": 30.99344825744629, "objective/non_score_reward": -1.5496724843978882, "objective/rlhf_reward": -4.873177084952516, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 74.65060424804688, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6484375, "step": 1093, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9979256391525269 }, { "episode": 17520, "epoch": 0.10497177984685624, "loss/policy_avg": 0.045309893786907196, "lr": 9.300869120654398e-06, "objective/entropy": -231.59390258789062, "objective/kl": 39.9537353515625, "objective/non_score_reward": -1.9976863861083984, "objective/rlhf_reward": -5.590746021270752, "objective/scores": 0.6, "policy/approxkl_avg": 1.6203057765960693, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.744140625, "step": 1094, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998950958251953 }, { "episode": 17536, "epoch": 0.1050676444859858, "loss/policy_avg": 0.784805953502655, "lr": 9.300230061349695e-06, "objective/entropy": -211.55604553222656, "objective/kl": 30.87300682067871, "objective/non_score_reward": -1.5436503887176514, "objective/rlhf_reward": -3.250882480980131, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 40.055843353271484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 1095, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0004684925079346 }, { "episode": 17552, "epoch": 0.10516350912511534, "loss/policy_avg": -0.08781934529542923, "lr": 9.29959100204499e-06, "objective/entropy": -234.98513793945312, "objective/kl": 32.781734466552734, "objective/non_score_reward": -1.6390867233276367, "objective/rlhf_reward": -5.156346833705902, "objective/scores": 0.35, "policy/approxkl_avg": 6.987787246704102, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 1096, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0007686614990234 }, { "episode": 17568, "epoch": 0.10525937376424489, "loss/policy_avg": 0.01477903313934803, "lr": 9.298951942740287e-06, "objective/entropy": -247.9517822265625, "objective/kl": 34.785831451416016, "objective/non_score_reward": -1.7392916679382324, "objective/rlhf_reward": -5.009755204396184, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.2163832187652588, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 1097, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9993152618408203 }, { "episode": 17584, "epoch": 0.10535523840337443, "loss/policy_avg": 0.1219930574297905, "lr": 9.298312883435584e-06, "objective/entropy": -219.2138671875, "objective/kl": 25.922840118408203, "objective/non_score_reward": -1.2961418628692627, "objective/rlhf_reward": -3.703615131790995, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 97.83702087402344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 1098, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998638391494751 }, { "episode": 17600, "epoch": 0.10545110304250399, "loss/policy_avg": 0.9850329756736755, "lr": 9.29767382413088e-06, "objective/entropy": -280.9995422363281, "objective/kl": 37.09015655517578, "objective/non_score_reward": -1.8545079231262207, "objective/rlhf_reward": -5.470620225148137, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 15.378658294677734, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 1099, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981629848480225 }, { "episode": 17616, "epoch": 0.10554696768163353, "loss/policy_avg": 0.16606320440769196, "lr": 9.297034764826178e-06, "objective/entropy": -260.265625, "objective/kl": 19.693069458007812, "objective/non_score_reward": -0.9846534132957458, "objective/rlhf_reward": -2.3344938195386704, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.92020320892334, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6640625, "step": 1100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996177315711975 }, { "episode": 17632, "epoch": 0.10564283232076309, "loss/policy_avg": 0.01635119318962097, "lr": 9.296395705521473e-06, "objective/entropy": -205.85324096679688, "objective/kl": 33.84467697143555, "objective/non_score_reward": -1.6922338008880615, "objective/rlhf_reward": -5.390333392707211, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.969751358032227, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.669921875, "step": 1101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0005996227264404 }, { "episode": 17648, "epoch": 0.10573869695989263, "loss/policy_avg": -0.09314411878585815, "lr": 9.29575664621677e-06, "objective/entropy": -140.11074829101562, "objective/kl": 30.367794036865234, "objective/non_score_reward": -1.5183897018432617, "objective/rlhf_reward": -4.592606308873057, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 20.063873291015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 1102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999199628829956 }, { "episode": 17664, "epoch": 0.10583456159902219, "loss/policy_avg": 0.6026681661605835, "lr": 9.295117586912065e-06, "objective/entropy": -229.55003356933594, "objective/kl": 40.14759826660156, "objective/non_score_reward": -2.0073800086975098, "objective/rlhf_reward": -6.296186701456705, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 5.752803802490234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.609375, "step": 1103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99612295627594 }, { "episode": 17680, "epoch": 0.10593042623815173, "loss/policy_avg": 0.4246598184108734, "lr": 9.294478527607362e-06, "objective/entropy": -282.4384460449219, "objective/kl": 41.07707977294922, "objective/non_score_reward": -2.053853988647461, "objective/rlhf_reward": -6.856166326735897, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 23.673992156982422, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.650390625, "step": 1104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999070405960083 }, { "episode": 17696, "epoch": 0.10602629087728128, "loss/policy_avg": 0.37388309836387634, "lr": 9.293839468302659e-06, "objective/entropy": 24.34271240234375, "objective/kl": 43.73130798339844, "objective/non_score_reward": -2.186565399169922, "objective/rlhf_reward": -7.295662741275176, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 5.244170188903809, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 1105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000379800796509 }, { "episode": 17712, "epoch": 0.10612215551641083, "loss/policy_avg": 0.48876816034317017, "lr": 9.293200408997956e-06, "objective/entropy": -201.01852416992188, "objective/kl": 26.633869171142578, "objective/non_score_reward": -1.3316935300827026, "objective/rlhf_reward": -3.9481719518579066, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.9823970794677734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.552734375, "step": 1106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.996859073638916 }, { "episode": 17728, "epoch": 0.10621802015554038, "loss/policy_avg": -0.03377959132194519, "lr": 9.292561349693252e-06, "objective/entropy": -243.04660034179688, "objective/kl": 32.35979080200195, "objective/non_score_reward": -1.6179895401000977, "objective/rlhf_reward": -5.048125703533259, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.477148175239563, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.65625, "step": 1107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001762628555298 }, { "episode": 17744, "epoch": 0.10631388479466992, "loss/policy_avg": -0.23846808075904846, "lr": 9.29192229038855e-06, "objective/entropy": -251.7974395751953, "objective/kl": 30.760231018066406, "objective/non_score_reward": -1.5380115509033203, "objective/rlhf_reward": -4.701447825045928, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.709911346435547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.52734375, "step": 1108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0022435188293457 }, { "episode": 17760, "epoch": 0.10640974943379948, "loss/policy_avg": 0.19507169723510742, "lr": 9.291283231083845e-06, "objective/entropy": -236.431396484375, "objective/kl": 29.49862289428711, "objective/non_score_reward": -1.474931240081787, "objective/rlhf_reward": -4.237865214765654, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.90414047241211, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6953125, "step": 1109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998689889907837 }, { "episode": 17776, "epoch": 0.10650561407292902, "loss/policy_avg": 0.08301146328449249, "lr": 9.290644171779141e-06, "objective/entropy": -275.0250244140625, "objective/kl": 41.055580139160156, "objective/non_score_reward": -2.052779197692871, "objective/rlhf_reward": -6.477783219019571, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 14.971565246582031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 1110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0006604194641113 }, { "episode": 17792, "epoch": 0.10660147871205858, "loss/policy_avg": 1.2557047605514526, "lr": 9.290005112474438e-06, "objective/entropy": -183.14273071289062, "objective/kl": 28.433589935302734, "objective/non_score_reward": -1.4216796159744263, "objective/rlhf_reward": -4.28671840429306, "objective/scores": 0.35, "policy/approxkl_avg": 7.521367073059082, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.771484375, "step": 1111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00211763381958 }, { "episode": 17808, "epoch": 0.10669734335118812, "loss/policy_avg": -0.1782451868057251, "lr": 9.289366053169735e-06, "objective/entropy": -279.40826416015625, "objective/kl": 18.467693328857422, "objective/non_score_reward": -0.9233846068382263, "objective/rlhf_reward": -2.334288516376896, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 4.5754899978637695, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.685546875, "step": 1112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0029964447021484 }, { "episode": 17824, "epoch": 0.10679320799031768, "loss/policy_avg": 0.03669451177120209, "lr": 9.288726993865032e-06, "objective/entropy": -223.73326110839844, "objective/kl": 29.530508041381836, "objective/non_score_reward": -1.4765253067016602, "objective/rlhf_reward": -3.783395232931648, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.630830764770508, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 1113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987865686416626 }, { "episode": 17840, "epoch": 0.10688907262944722, "loss/policy_avg": 0.8654987215995789, "lr": 9.288087934560327e-06, "objective/entropy": -210.11935424804688, "objective/kl": 29.22211456298828, "objective/non_score_reward": -1.4611058235168457, "objective/rlhf_reward": -4.288163571563318, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 0.9125807285308838, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.591796875, "step": 1114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.004485845565796 }, { "episode": 17856, "epoch": 0.10698493726857677, "loss/policy_avg": -0.06222856044769287, "lr": 9.287448875255624e-06, "objective/entropy": -234.88995361328125, "objective/kl": 29.992103576660156, "objective/non_score_reward": -1.4996052980422974, "objective/rlhf_reward": -3.074702118278715, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.34584903717041, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7265625, "step": 1115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993617534637451 }, { "episode": 17872, "epoch": 0.10708080190770632, "loss/policy_avg": 0.20112337172031403, "lr": 9.286809815950921e-06, "objective/entropy": -230.16200256347656, "objective/kl": 30.825511932373047, "objective/non_score_reward": -1.5412755012512207, "objective/rlhf_reward": -4.714504103274688, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 12.865804672241211, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6953125, "step": 1116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9985527992248535 }, { "episode": 17888, "epoch": 0.10717666654683587, "loss/policy_avg": 0.6556056141853333, "lr": 9.286170756646218e-06, "objective/entropy": -280.40069580078125, "objective/kl": 28.695655822753906, "objective/non_score_reward": -1.434782862663269, "objective/rlhf_reward": -3.6164251587548595, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.831923007965088, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796875, "step": 1117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997923374176025 }, { "episode": 17904, "epoch": 0.10727253118596541, "loss/policy_avg": 0.1591615378856659, "lr": 9.285531697341515e-06, "objective/entropy": -208.41720581054688, "objective/kl": 32.10327911376953, "objective/non_score_reward": -1.605163812637329, "objective/rlhf_reward": -5.042053201285702, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 16.582778930664062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5390625, "step": 1118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988048076629639 }, { "episode": 17920, "epoch": 0.10736839582509497, "loss/policy_avg": 0.6213997602462769, "lr": 9.284892638036812e-06, "objective/entropy": -174.9388427734375, "objective/kl": 22.156795501708984, "objective/non_score_reward": -1.107839584350586, "objective/rlhf_reward": -2.698025361696879, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.6573128700256348, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4931640625, "step": 1119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001469612121582 }, { "episode": 17936, "epoch": 0.10746426046422451, "loss/policy_avg": 0.15051786601543427, "lr": 9.284253578732107e-06, "objective/entropy": -67.49928283691406, "objective/kl": 43.85652160644531, "objective/non_score_reward": -2.19282603263855, "objective/rlhf_reward": -7.3207059904054255, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.555420875549316, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6640625, "step": 1120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001720666885376 }, { "episode": 17952, "epoch": 0.10756012510335407, "loss/policy_avg": -0.04347284138202667, "lr": 9.283614519427404e-06, "objective/entropy": -228.60853576660156, "objective/kl": 27.952720642089844, "objective/non_score_reward": -1.39763605594635, "objective/rlhf_reward": -5.590544044971466, "objective/scores": 0.0, "policy/approxkl_avg": 23.599834442138672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.76953125, "step": 1121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0020194053649902 }, { "episode": 17968, "epoch": 0.10765598974248361, "loss/policy_avg": -0.053687386214733124, "lr": 9.2829754601227e-06, "objective/entropy": -207.92953491210938, "objective/kl": 39.524742126464844, "objective/non_score_reward": -1.9762370586395264, "objective/rlhf_reward": -5.957537005619939, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 9.628499984741211, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6171875, "step": 1122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000182867050171 }, { "episode": 17984, "epoch": 0.10775185438161317, "loss/policy_avg": -0.1910426765680313, "lr": 9.282336400817996e-06, "objective/entropy": -152.73464965820312, "objective/kl": 33.28754425048828, "objective/non_score_reward": -1.664376974105835, "objective/rlhf_reward": -5.141736590655979, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 5.299195289611816, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.73828125, "step": 1123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9999160766601562 }, { "episode": 18000, "epoch": 0.10784771902074271, "loss/policy_avg": 0.23040008544921875, "lr": 9.281697341513293e-06, "objective/entropy": -260.4175109863281, "objective/kl": 27.83688735961914, "objective/non_score_reward": -1.391844391822815, "objective/rlhf_reward": -2.64365843379614, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.275976538658142, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.642578125, "step": 1124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999161720275879 }, { "episode": 18016, "epoch": 0.10794358365987226, "loss/policy_avg": 0.38624101877212524, "lr": 9.28105828220859e-06, "objective/entropy": -278.8191833496094, "objective/kl": 41.93511962890625, "objective/non_score_reward": -2.0967559814453125, "objective/rlhf_reward": -5.987024164199829, "objective/scores": 0.6, "policy/approxkl_avg": 5.510004043579102, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 1125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000148057937622 }, { "episode": 18032, "epoch": 0.1080394482990018, "loss/policy_avg": 0.07502768188714981, "lr": 9.280419222903886e-06, "objective/entropy": -261.2082824707031, "objective/kl": 36.19464111328125, "objective/non_score_reward": -1.80973219871521, "objective/rlhf_reward": -5.291517208294804, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 31.399539947509766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.599609375, "step": 1126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981523752212524 }, { "episode": 18048, "epoch": 0.10813531293813136, "loss/policy_avg": 0.027504732832312584, "lr": 9.279780163599183e-06, "objective/entropy": -173.93919372558594, "objective/kl": 38.43782424926758, "objective/non_score_reward": -1.921891212463379, "objective/rlhf_reward": -6.236966948123321, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.174002647399902, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578125, "step": 1127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981164932250977 }, { "episode": 18064, "epoch": 0.1082311775772609, "loss/policy_avg": -0.27174612879753113, "lr": 9.279141104294478e-06, "objective/entropy": -244.70285034179688, "objective/kl": 29.41028594970703, "objective/non_score_reward": -1.4705145359039307, "objective/rlhf_reward": -5.8820579051971436, "objective/scores": 0.0, "policy/approxkl_avg": 13.355351448059082, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.76171875, "step": 1128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0021891593933105 }, { "episode": 18080, "epoch": 0.10832704221639046, "loss/policy_avg": 0.1301630437374115, "lr": 9.278502044989775e-06, "objective/entropy": -217.2534942626953, "objective/kl": 24.805774688720703, "objective/non_score_reward": -1.2402887344360352, "objective/rlhf_reward": -3.5105568572000116, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 179.41348266601562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.681640625, "step": 1129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9998412132263184 }, { "episode": 18096, "epoch": 0.10842290685552, "loss/policy_avg": 0.29972007870674133, "lr": 9.277862985685072e-06, "objective/entropy": -165.94686889648438, "objective/kl": 33.62857437133789, "objective/non_score_reward": -1.6814286708831787, "objective/rlhf_reward": -5.063855295599089, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 9.789844512939453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.54296875, "step": 1130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996779441833496 }, { "episode": 18112, "epoch": 0.10851877149464956, "loss/policy_avg": -0.1860085129737854, "lr": 9.277223926380369e-06, "objective/entropy": -216.37200927734375, "objective/kl": 34.99008560180664, "objective/non_score_reward": -1.7495043277740479, "objective/rlhf_reward": -5.517064335759043, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.947920799255371, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.591796875, "step": 1131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001011371612549 }, { "episode": 18128, "epoch": 0.1086146361337791, "loss/policy_avg": 1.0164711475372314, "lr": 9.276584867075666e-06, "objective/entropy": -198.08203125, "objective/kl": 27.897228240966797, "objective/non_score_reward": -1.3948614597320557, "objective/rlhf_reward": -4.253932688265962, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.5322012901306152, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5859375, "step": 1132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0027313232421875 }, { "episode": 18144, "epoch": 0.10871050077290866, "loss/policy_avg": -0.12127675116062164, "lr": 9.275945807770961e-06, "objective/entropy": -220.23248291015625, "objective/kl": 32.97924041748047, "objective/non_score_reward": -1.6489620208740234, "objective/rlhf_reward": -4.648436854557927, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.976924896240234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.69140625, "step": 1133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001431465148926 }, { "episode": 18160, "epoch": 0.1088063654120382, "loss/policy_avg": 0.2887868881225586, "lr": 9.275306748466258e-06, "objective/entropy": -276.16912841796875, "objective/kl": 37.935035705566406, "objective/non_score_reward": -1.8967517614364624, "objective/rlhf_reward": -6.10605442803657, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 18.96986961364746, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.625, "step": 1134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982385635375977 }, { "episode": 18176, "epoch": 0.10890223005116775, "loss/policy_avg": 0.448369562625885, "lr": 9.274667689161555e-06, "objective/entropy": -169.45448303222656, "objective/kl": 37.67509078979492, "objective/non_score_reward": -1.8837544918060303, "objective/rlhf_reward": -6.084420065493926, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 27.178815841674805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.671875, "step": 1135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000478506088257 }, { "episode": 18192, "epoch": 0.1089980946902973, "loss/policy_avg": 0.5679232478141785, "lr": 9.274028629856852e-06, "objective/entropy": -180.7431182861328, "objective/kl": 39.18467330932617, "objective/non_score_reward": -1.9592337608337402, "objective/rlhf_reward": -6.103601590792337, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 27.431682586669922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.58203125, "step": 1136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9959231615066528 }, { "episode": 18208, "epoch": 0.10909395932942685, "loss/policy_avg": 0.08655049651861191, "lr": 9.273389570552149e-06, "objective/entropy": -251.33828735351562, "objective/kl": 30.559293746948242, "objective/non_score_reward": -1.52796471118927, "objective/rlhf_reward": -4.164447735028203, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.2445521354675293, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.689453125, "step": 1137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996728897094727 }, { "episode": 18224, "epoch": 0.1091898239685564, "loss/policy_avg": 0.4302634000778198, "lr": 9.272750511247446e-06, "objective/entropy": -201.8494873046875, "objective/kl": 29.424352645874023, "objective/non_score_reward": -1.4712176322937012, "objective/rlhf_reward": -1.4848706483840939, "objective/scores": 1.1, "policy/approxkl_avg": 20.697341918945312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8203125, "step": 1138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989333152770996 }, { "episode": 18240, "epoch": 0.10928568860768595, "loss/policy_avg": 0.9915270209312439, "lr": 9.27211145194274e-06, "objective/entropy": -195.59429931640625, "objective/kl": 21.045230865478516, "objective/non_score_reward": -1.052261471748352, "objective/rlhf_reward": -2.80904603600502, "objective/scores": 0.35, "policy/approxkl_avg": 28.377094268798828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71484375, "step": 1139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998765230178833 }, { "episode": 18256, "epoch": 0.10938155324681549, "loss/policy_avg": 0.49453747272491455, "lr": 9.271472392638038e-06, "objective/entropy": -245.22964477539062, "objective/kl": 32.85436248779297, "objective/non_score_reward": -1.6427181959152222, "objective/rlhf_reward": -5.089920165951609, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 14.714433670043945, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.599609375, "step": 1140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996521234512329 }, { "episode": 18272, "epoch": 0.10947741788594505, "loss/policy_avg": 1.36152184009552, "lr": 9.270833333333334e-06, "objective/entropy": -272.47137451171875, "objective/kl": 34.61804962158203, "objective/non_score_reward": -1.7309024333953857, "objective/rlhf_reward": -5.581974318533568, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.44586181640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.654296875, "step": 1141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9999563694000244 }, { "episode": 18288, "epoch": 0.10957328252507459, "loss/policy_avg": 0.2819780111312866, "lr": 9.270194274028631e-06, "objective/entropy": -202.5043487548828, "objective/kl": 25.666091918945312, "objective/non_score_reward": -1.2833045721054077, "objective/rlhf_reward": -3.7739683029398154, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.4244799613952637, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.552734375, "step": 1142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999279260635376 }, { "episode": 18304, "epoch": 0.10966914716420414, "loss/policy_avg": 0.25256872177124023, "lr": 9.269555214723928e-06, "objective/entropy": -231.06277465820312, "objective/kl": 30.289072036743164, "objective/non_score_reward": -1.514453649520874, "objective/rlhf_reward": -4.657814359664917, "objective/scores": 0.35, "policy/approxkl_avg": 17.748353958129883, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71875, "step": 1143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983248710632324 }, { "episode": 18320, "epoch": 0.10976501180333369, "loss/policy_avg": -0.33820840716362, "lr": 9.268916155419223e-06, "objective/entropy": -73.95364379882812, "objective/kl": 28.924686431884766, "objective/non_score_reward": -1.4462342262268066, "objective/rlhf_reward": -4.303984346802592, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 2.515535831451416, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.779296875, "step": 1144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0056748390197754 }, { "episode": 18336, "epoch": 0.10986087644246324, "loss/policy_avg": 0.6078078746795654, "lr": 9.26827709611452e-06, "objective/entropy": -114.01469421386719, "objective/kl": 33.08042526245117, "objective/non_score_reward": -1.6540212631225586, "objective/rlhf_reward": -5.13513237517631, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 18.6502628326416, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796875, "step": 1145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998767614364624 }, { "episode": 18352, "epoch": 0.10995674108159278, "loss/policy_avg": 0.34172698855400085, "lr": 9.267638036809816e-06, "objective/entropy": -220.97189331054688, "objective/kl": 30.25277328491211, "objective/non_score_reward": -1.512638807296753, "objective/rlhf_reward": -4.317221657435099, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.9541758298873901, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.603515625, "step": 1146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0016610622406006 }, { "episode": 18368, "epoch": 0.11005260572072234, "loss/policy_avg": 0.29632118344306946, "lr": 9.266998977505112e-06, "objective/entropy": -200.36410522460938, "objective/kl": 26.179067611694336, "objective/non_score_reward": -1.3089535236358643, "objective/rlhf_reward": -2.835813796520233, "objective/scores": 0.6, "policy/approxkl_avg": 3.2951159477233887, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.576171875, "step": 1147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.004591464996338 }, { "episode": 18384, "epoch": 0.11014847035985188, "loss/policy_avg": 0.011747203767299652, "lr": 9.26635991820041e-06, "objective/entropy": -194.40054321289062, "objective/kl": 31.329753875732422, "objective/non_score_reward": -1.5664877891540527, "objective/rlhf_reward": -4.784998300488352, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.8787118196487427, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.466796875, "step": 1148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0023856163024902 }, { "episode": 18400, "epoch": 0.11024433499898144, "loss/policy_avg": 0.46494680643081665, "lr": 9.265720858895706e-06, "objective/entropy": -223.58827209472656, "objective/kl": 28.735855102539062, "objective/non_score_reward": -1.4367928504943848, "objective/rlhf_reward": -4.347171103954315, "objective/scores": 0.35, "policy/approxkl_avg": 3.884065866470337, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.490234375, "step": 1149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982936382293701 }, { "episode": 18416, "epoch": 0.11034019963811098, "loss/policy_avg": 0.28439557552337646, "lr": 9.265081799591003e-06, "objective/entropy": -147.24366760253906, "objective/kl": 34.880985260009766, "objective/non_score_reward": -1.7440491914749146, "objective/rlhf_reward": -5.242863551775614, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 79.35762023925781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 1150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9970107078552246 }, { "episode": 18432, "epoch": 0.11043606427724054, "loss/policy_avg": 0.3585757613182068, "lr": 9.2644427402863e-06, "objective/entropy": -71.19611358642578, "objective/kl": 26.725967407226562, "objective/non_score_reward": -1.3362984657287598, "objective/rlhf_reward": -3.8642411856011147, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.22227668762207, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4990234375, "step": 1151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982261657714844 }, { "episode": 18448, "epoch": 0.11053192891637008, "loss/policy_avg": -0.044132016599178314, "lr": 9.263803680981595e-06, "objective/entropy": -228.6917724609375, "objective/kl": 28.40880584716797, "objective/non_score_reward": -1.4204403162002563, "objective/rlhf_reward": -5.6817615032196045, "objective/scores": 0.0, "policy/approxkl_avg": 1.9962811470031738, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.66015625, "step": 1152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992928504943848 }, { "episode": 18464, "epoch": 0.11062779355549963, "loss/policy_avg": 0.0064825452864170074, "lr": 9.263164621676892e-06, "objective/entropy": -258.4649658203125, "objective/kl": 27.05806541442871, "objective/non_score_reward": -1.3529033660888672, "objective/rlhf_reward": -3.9306607274368996, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.10453462600708, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.546875, "step": 1153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005645751953125 }, { "episode": 18480, "epoch": 0.11072365819462919, "loss/policy_avg": -0.0035511665046215057, "lr": 9.262525562372189e-06, "objective/entropy": -282.88446044921875, "objective/kl": 30.65878677368164, "objective/non_score_reward": -1.5329391956329346, "objective/rlhf_reward": -4.575497834888056, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.0608371496200562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.693359375, "step": 1154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0007755756378174 }, { "episode": 18496, "epoch": 0.11081952283375873, "loss/policy_avg": 0.02788732573390007, "lr": 9.261886503067486e-06, "objective/entropy": -116.1088638305664, "objective/kl": 30.207550048828125, "objective/non_score_reward": -1.5103774070739746, "objective/rlhf_reward": -3.918803753630195, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 75.21327209472656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 1155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990315437316895 }, { "episode": 18512, "epoch": 0.11091538747288829, "loss/policy_avg": 0.32094255089759827, "lr": 9.261247443762783e-06, "objective/entropy": -214.9591064453125, "objective/kl": 27.392032623291016, "objective/non_score_reward": -1.3696017265319824, "objective/rlhf_reward": -3.355700316206489, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 28.679149627685547, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.572265625, "step": 1156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985802173614502 }, { "episode": 18528, "epoch": 0.11101125211201783, "loss/policy_avg": 0.20208770036697388, "lr": 9.260608384458078e-06, "objective/entropy": -160.56893920898438, "objective/kl": 37.55027770996094, "objective/non_score_reward": -1.8775138854980469, "objective/rlhf_reward": -5.848196154058563, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 12.605989456176758, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.50390625, "step": 1157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999098777770996 }, { "episode": 18544, "epoch": 0.11110711675114739, "loss/policy_avg": 0.3753480613231659, "lr": 9.259969325153375e-06, "objective/entropy": -242.1776123046875, "objective/kl": 39.135337829589844, "objective/non_score_reward": -1.9567670822143555, "objective/rlhf_reward": -5.427068269252777, "objective/scores": 0.6, "policy/approxkl_avg": 35.09158706665039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.68359375, "step": 1158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99853515625 }, { "episode": 18560, "epoch": 0.11120298139027693, "loss/policy_avg": -0.17678791284561157, "lr": 9.259330265848672e-06, "objective/entropy": -155.45452880859375, "objective/kl": 29.033279418945312, "objective/non_score_reward": -1.4516640901565552, "objective/rlhf_reward": -4.40665636062622, "objective/scores": 0.35, "policy/approxkl_avg": 9.420263290405273, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.640625, "step": 1159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994099140167236 }, { "episode": 18576, "epoch": 0.11129884602940648, "loss/policy_avg": 0.2095283716917038, "lr": 9.258691206543968e-06, "objective/entropy": -245.34713745117188, "objective/kl": 27.264514923095703, "objective/non_score_reward": -1.3632256984710693, "objective/rlhf_reward": -3.896643607822016, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.997028350830078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 1160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9966801404953003 }, { "episode": 18592, "epoch": 0.11139471066853603, "loss/policy_avg": 0.35818007588386536, "lr": 9.258052147239265e-06, "objective/entropy": -235.89605712890625, "objective/kl": 27.769607543945312, "objective/non_score_reward": -1.3884804248809814, "objective/rlhf_reward": -1.153921282291412, "objective/scores": 1.1, "policy/approxkl_avg": 22.18886375427246, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.771484375, "step": 1161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9975862503051758 }, { "episode": 18608, "epoch": 0.11149057530766558, "loss/policy_avg": 0.31447115540504456, "lr": 9.257413087934562e-06, "objective/entropy": -129.99705505371094, "objective/kl": 39.8328742980957, "objective/non_score_reward": -1.9916437864303589, "objective/rlhf_reward": -6.410315840449885, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 15.095479011535645, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7265625, "step": 1162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973938465118408 }, { "episode": 18624, "epoch": 0.11158643994679512, "loss/policy_avg": 0.0677080750465393, "lr": 9.256774028629857e-06, "objective/entropy": -159.806884765625, "objective/kl": 28.60342788696289, "objective/non_score_reward": -1.4301713705062866, "objective/rlhf_reward": -4.3420833135522425, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.259771347045898, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.65625, "step": 1163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000201940536499 }, { "episode": 18640, "epoch": 0.11168230458592468, "loss/policy_avg": 0.19306568801403046, "lr": 9.256134969325154e-06, "objective/entropy": -209.5618133544922, "objective/kl": 19.490875244140625, "objective/non_score_reward": -0.9745436906814575, "objective/rlhf_reward": -0.9744557484400003, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.33440613746643066, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.59765625, "step": 1164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9999253749847412 }, { "episode": 18656, "epoch": 0.11177816922505422, "loss/policy_avg": 0.11631269752979279, "lr": 9.255495910020451e-06, "objective/entropy": -141.29168701171875, "objective/kl": 37.15015411376953, "objective/non_score_reward": -1.8575077056884766, "objective/rlhf_reward": -5.873771755900934, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.6151018142700195, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51953125, "step": 1165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0023040771484375 }, { "episode": 18672, "epoch": 0.11187403386418378, "loss/policy_avg": 0.01576380617916584, "lr": 9.254856850715748e-06, "objective/entropy": -215.0299835205078, "objective/kl": 14.439537048339844, "objective/non_score_reward": -0.7219768762588501, "objective/rlhf_reward": -2.8879075050354004, "objective/scores": 0.0, "policy/approxkl_avg": 1.3693623542785645, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.509765625, "step": 1166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.003568172454834 }, { "episode": 18688, "epoch": 0.11196989850331332, "loss/policy_avg": 0.08836716413497925, "lr": 9.254217791411043e-06, "objective/entropy": -220.81651306152344, "objective/kl": 27.33843994140625, "objective/non_score_reward": -1.3669219017028809, "objective/rlhf_reward": -3.951916241439518, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.9005239009857178, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7109375, "step": 1167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998638391494751 }, { "episode": 18704, "epoch": 0.11206576314244288, "loss/policy_avg": 0.1386057436466217, "lr": 9.25357873210634e-06, "objective/entropy": -206.8209686279297, "objective/kl": 30.620820999145508, "objective/non_score_reward": -1.531041145324707, "objective/rlhf_reward": -4.390831009546916, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.888638973236084, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 1168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985263347625732 }, { "episode": 18720, "epoch": 0.11216162778157242, "loss/policy_avg": 0.17286451160907745, "lr": 9.252939672801637e-06, "objective/entropy": -276.5692138671875, "objective/kl": 31.233203887939453, "objective/non_score_reward": -1.5616602897644043, "objective/rlhf_reward": -4.730869614871677, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 7.134778022766113, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.671875, "step": 1169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994456768035889 }, { "episode": 18736, "epoch": 0.11225749242070197, "loss/policy_avg": 0.31586384773254395, "lr": 9.252300613496932e-06, "objective/entropy": -248.99765014648438, "objective/kl": 33.04867172241211, "objective/non_score_reward": -1.6524336338043213, "objective/rlhf_reward": -5.005614492956715, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.374646186828613, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 1170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998569369316101 }, { "episode": 18752, "epoch": 0.11235335705983152, "loss/policy_avg": -0.09263397008180618, "lr": 9.251661554192229e-06, "objective/entropy": -183.73135375976562, "objective/kl": 29.070640563964844, "objective/non_score_reward": -1.4535319805145264, "objective/rlhf_reward": -3.9892991736260166, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 47.9519157409668, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.751953125, "step": 1171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0019617080688477 }, { "episode": 18768, "epoch": 0.11244922169896107, "loss/policy_avg": -0.012390676885843277, "lr": 9.251022494887526e-06, "objective/entropy": -198.5019073486328, "objective/kl": 33.66993713378906, "objective/non_score_reward": -1.6834967136383057, "objective/rlhf_reward": -5.333986735343933, "objective/scores": 0.35, "policy/approxkl_avg": 7.644756317138672, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6640625, "step": 1172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0004210472106934 }, { "episode": 18784, "epoch": 0.11254508633809061, "loss/policy_avg": -0.12474697828292847, "lr": 9.250383435582823e-06, "objective/entropy": -258.70025634765625, "objective/kl": 36.01386260986328, "objective/non_score_reward": -1.8006932735443115, "objective/rlhf_reward": -5.824170806495053, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 11.808134078979492, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.671875, "step": 1173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0037684440612793 }, { "episode": 18800, "epoch": 0.11264095097722017, "loss/policy_avg": 0.06612593680620193, "lr": 9.24974437627812e-06, "objective/entropy": -211.03541564941406, "objective/kl": 28.66901397705078, "objective/non_score_reward": -1.4334505796432495, "objective/rlhf_reward": -4.39216672471109, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.856602191925049, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6796875, "step": 1174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004348754882812 }, { "episode": 18816, "epoch": 0.11273681561634971, "loss/policy_avg": 0.17440900206565857, "lr": 9.249105316973417e-06, "objective/entropy": -233.4525146484375, "objective/kl": 26.882205963134766, "objective/non_score_reward": -1.344110369682312, "objective/rlhf_reward": -3.9978394890702784, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 91.22392272949219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 1175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991576671600342 }, { "episode": 18832, "epoch": 0.11283268025547927, "loss/policy_avg": 0.5238691568374634, "lr": 9.248466257668712e-06, "objective/entropy": -173.6719970703125, "objective/kl": 34.459197998046875, "objective/non_score_reward": -1.7229597568511963, "objective/rlhf_reward": -5.335579781737879, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 25.83188247680664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796875, "step": 1176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986010789871216 }, { "episode": 18848, "epoch": 0.11292854489460881, "loss/policy_avg": -0.12078897655010223, "lr": 9.247827198364009e-06, "objective/entropy": -134.12008666992188, "objective/kl": 34.92095184326172, "objective/non_score_reward": -1.7460476160049438, "objective/rlhf_reward": -5.159361715587686, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.610663414001465, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.751953125, "step": 1177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0009992122650146 }, { "episode": 18864, "epoch": 0.11302440953373837, "loss/policy_avg": 0.3817252516746521, "lr": 9.247188139059305e-06, "objective/entropy": -96.26307678222656, "objective/kl": 44.49664306640625, "objective/non_score_reward": -2.224832057952881, "objective/rlhf_reward": -8.899328708648682, "objective/scores": 0.0, "policy/approxkl_avg": 10.693860054016113, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.80859375, "step": 1178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.997776985168457 }, { "episode": 18880, "epoch": 0.11312027417286791, "loss/policy_avg": 0.07123968750238419, "lr": 9.246549079754602e-06, "objective/entropy": -199.199951171875, "objective/kl": 27.166889190673828, "objective/non_score_reward": -1.358344554901123, "objective/rlhf_reward": -3.98277972182785, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.361668586730957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 1179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971075057983398 }, { "episode": 18896, "epoch": 0.11321613881199746, "loss/policy_avg": 0.14846912026405334, "lr": 9.2459100204499e-06, "objective/entropy": -175.1884765625, "objective/kl": 31.658098220825195, "objective/non_score_reward": -1.5829048156738281, "objective/rlhf_reward": -4.6697597555523975, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 0.9070639610290527, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 1180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0005788803100586 }, { "episode": 18912, "epoch": 0.113312003451127, "loss/policy_avg": 0.3216549754142761, "lr": 9.245270961145194e-06, "objective/entropy": -182.9542236328125, "objective/kl": 31.30569839477539, "objective/non_score_reward": -1.5652849674224854, "objective/rlhf_reward": -4.70488056441839, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.701448440551758, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.619140625, "step": 1181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980485439300537 }, { "episode": 18928, "epoch": 0.11340786809025656, "loss/policy_avg": 0.4251779019832611, "lr": 9.244631901840491e-06, "objective/entropy": -195.88975524902344, "objective/kl": 28.441465377807617, "objective/non_score_reward": -1.4220733642578125, "objective/rlhf_reward": -4.0841732359567455, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 14.071691513061523, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 1182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986789226531982 }, { "episode": 18944, "epoch": 0.1135037327293861, "loss/policy_avg": 0.02354581654071808, "lr": 9.243992842535788e-06, "objective/entropy": -164.94105529785156, "objective/kl": 30.754886627197266, "objective/non_score_reward": -1.537744402885437, "objective/rlhf_reward": -3.227258478046629, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.801190972328186, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.669921875, "step": 1183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002558708190918 }, { "episode": 18960, "epoch": 0.11359959736851566, "loss/policy_avg": 0.04112057387828827, "lr": 9.243353783231085e-06, "objective/entropy": -246.19515991210938, "objective/kl": 34.75521469116211, "objective/non_score_reward": -1.7377607822418213, "objective/rlhf_reward": -5.609407058268218, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 22.861713409423828, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.66015625, "step": 1184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9987796545028687 }, { "episode": 18976, "epoch": 0.1136954620076452, "loss/policy_avg": 0.21404039859771729, "lr": 9.242714723926382e-06, "objective/entropy": -209.3376922607422, "objective/kl": 35.15364074707031, "objective/non_score_reward": -1.7576820850372314, "objective/rlhf_reward": -5.083317349629338, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 45.4697265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.650390625, "step": 1185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996396541595459 }, { "episode": 18992, "epoch": 0.11379132664677476, "loss/policy_avg": -0.016785871237516403, "lr": 9.242075664621679e-06, "objective/entropy": -135.11508178710938, "objective/kl": 44.11357879638672, "objective/non_score_reward": -2.205678939819336, "objective/rlhf_reward": -7.160855894506561, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 20.996137619018555, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.609375, "step": 1186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9976164102554321 }, { "episode": 19008, "epoch": 0.1138871912859043, "loss/policy_avg": -0.0332149937748909, "lr": 9.241436605316974e-06, "objective/entropy": -85.4975814819336, "objective/kl": 33.72305679321289, "objective/non_score_reward": -1.6861528158187866, "objective/rlhf_reward": -2.3446113824844357, "objective/scores": 1.1, "policy/approxkl_avg": 115.56517028808594, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.849609375, "step": 1187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9999394416809082 }, { "episode": 19024, "epoch": 0.11398305592503385, "loss/policy_avg": 0.10150502622127533, "lr": 9.240797546012271e-06, "objective/entropy": -228.79638671875, "objective/kl": 24.752819061279297, "objective/non_score_reward": -1.2376409769058228, "objective/rlhf_reward": -4.9505637884140015, "objective/scores": 0.0, "policy/approxkl_avg": 1.5263676643371582, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.517578125, "step": 1188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0000452995300293 }, { "episode": 19040, "epoch": 0.1140789205641634, "loss/policy_avg": 1.6640098094940186, "lr": 9.240158486707568e-06, "objective/entropy": -222.82388305664062, "objective/kl": 37.1962890625, "objective/non_score_reward": -1.8598144054412842, "objective/rlhf_reward": -5.958305004055857, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 26.163982391357422, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.583984375, "step": 1189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998447299003601 }, { "episode": 19056, "epoch": 0.11417478520329295, "loss/policy_avg": 0.24511002004146576, "lr": 9.239519427402863e-06, "objective/entropy": -169.49942016601562, "objective/kl": 23.688583374023438, "objective/non_score_reward": -1.1844291687011719, "objective/rlhf_reward": -3.0758574656849964, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 25.747421264648438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.83984375, "step": 1190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997087001800537 }, { "episode": 19072, "epoch": 0.1142706498424225, "loss/policy_avg": 0.05934782326221466, "lr": 9.23888036809816e-06, "objective/entropy": -195.64088439941406, "objective/kl": 33.113624572753906, "objective/non_score_reward": -1.6556813716888428, "objective/rlhf_reward": -5.172127108188018, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 15.268495559692383, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.765625, "step": 1191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998260736465454 }, { "episode": 19088, "epoch": 0.11436651448155205, "loss/policy_avg": 0.15776914358139038, "lr": 9.238241308793457e-06, "objective/entropy": -227.42486572265625, "objective/kl": 30.864715576171875, "objective/non_score_reward": -1.5432357788085938, "objective/rlhf_reward": -4.722344736667022, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.075873613357544, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 1192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991707801818848 }, { "episode": 19104, "epoch": 0.11446237912068159, "loss/policy_avg": 0.07561061531305313, "lr": 9.237602249488754e-06, "objective/entropy": -201.01284790039062, "objective/kl": 36.205970764160156, "objective/non_score_reward": -1.8102984428405762, "objective/rlhf_reward": -5.790595392794952, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 0.3898843228816986, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.640625, "step": 1193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0015416145324707 }, { "episode": 19120, "epoch": 0.11455824375981115, "loss/policy_avg": -0.5701497793197632, "lr": 9.236963190184049e-06, "objective/entropy": -103.09819030761719, "objective/kl": 30.238616943359375, "objective/non_score_reward": -1.5119309425354004, "objective/rlhf_reward": -4.56677112263, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 4.551431655883789, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.59375, "step": 1194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.004034996032715 }, { "episode": 19136, "epoch": 0.11465410839894069, "loss/policy_avg": 0.46027839183807373, "lr": 9.236324130879346e-06, "objective/entropy": -208.53213500976562, "objective/kl": 33.96599197387695, "objective/non_score_reward": -1.6982996463775635, "objective/rlhf_reward": -4.968369956287455, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 32.79998016357422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.76953125, "step": 1195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9957493543624878 }, { "episode": 19152, "epoch": 0.11474997303807025, "loss/policy_avg": 0.7104591131210327, "lr": 9.235685071574642e-06, "objective/entropy": -161.5511932373047, "objective/kl": 21.182106018066406, "objective/non_score_reward": -1.0591052770614624, "objective/rlhf_reward": -2.7554683117226357, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 113.69915771484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.810546875, "step": 1196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9965519905090332 }, { "episode": 19168, "epoch": 0.11484583767719979, "loss/policy_avg": -0.09501040726900101, "lr": 9.23504601226994e-06, "objective/entropy": -149.43408203125, "objective/kl": 36.27130126953125, "objective/non_score_reward": -1.8135650157928467, "objective/rlhf_reward": -7.254260301589966, "objective/scores": 0.0, "policy/approxkl_avg": 4.328031539916992, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 1197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0035271644592285 }, { "episode": 19184, "epoch": 0.11494170231632934, "loss/policy_avg": 0.566871702671051, "lr": 9.234406952965236e-06, "objective/entropy": -217.4463653564453, "objective/kl": 29.27811050415039, "objective/non_score_reward": -1.4639055728912354, "objective/rlhf_reward": -4.51398654869142, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 53.84545135498047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.58203125, "step": 1198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9971461296081543 }, { "episode": 19200, "epoch": 0.11503756695545889, "loss/policy_avg": 0.048794396221637726, "lr": 9.233767893660533e-06, "objective/entropy": -174.531005859375, "objective/kl": 29.087738037109375, "objective/non_score_reward": -1.4543869495391846, "objective/rlhf_reward": -2.893828962684843, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.782432556152344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.791015625, "step": 1199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999170303344727 }, { "episode": 19216, "epoch": 0.11513343159458844, "loss/policy_avg": -0.18217583000659943, "lr": 9.233128834355828e-06, "objective/entropy": -135.63037109375, "objective/kl": 40.30628967285156, "objective/non_score_reward": -2.0153145790100098, "objective/rlhf_reward": -6.545486414226231, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.027623176574707, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.75, "step": 1200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.00277042388916 }, { "episode": 19232, "epoch": 0.11522929623371798, "loss/policy_avg": 0.19046634435653687, "lr": 9.232489775051125e-06, "objective/entropy": -245.74639892578125, "objective/kl": 29.675251007080078, "objective/non_score_reward": -1.483762502670288, "objective/rlhf_reward": -4.575800263617916, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.7879266738891602, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.61328125, "step": 1201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00600528717041 }, { "episode": 19248, "epoch": 0.11532516087284754, "loss/policy_avg": 0.21539102494716644, "lr": 9.231850715746422e-06, "objective/entropy": -225.9239044189453, "objective/kl": 28.502708435058594, "objective/non_score_reward": -1.4251353740692139, "objective/rlhf_reward": -4.144282071796015, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 23.979963302612305, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.712890625, "step": 1202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997289180755615 }, { "episode": 19264, "epoch": 0.11542102551197708, "loss/policy_avg": 0.04965958744287491, "lr": 9.231211656441719e-06, "objective/entropy": -216.04248046875, "objective/kl": 31.314760208129883, "objective/non_score_reward": -1.5657379627227783, "objective/rlhf_reward": -4.1402458570161205, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 8.937564849853516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5234375, "step": 1203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976413249969482 }, { "episode": 19280, "epoch": 0.11551689015110664, "loss/policy_avg": 0.02601933479309082, "lr": 9.230572597137016e-06, "objective/entropy": -148.66250610351562, "objective/kl": 32.67079162597656, "objective/non_score_reward": -1.6335396766662598, "objective/rlhf_reward": -5.018386685641941, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 18.645111083984375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.79296875, "step": 1204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.004350185394287 }, { "episode": 19296, "epoch": 0.11561275479023618, "loss/policy_avg": 1.4007536172866821, "lr": 9.229933537832311e-06, "objective/entropy": -258.147705078125, "objective/kl": 34.21760559082031, "objective/non_score_reward": -1.7108802795410156, "objective/rlhf_reward": -6.843520998954773, "objective/scores": 0.0, "policy/approxkl_avg": 5.961824417114258, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.619140625, "step": 1205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000836849212646 }, { "episode": 19312, "epoch": 0.11570861942936574, "loss/policy_avg": -0.39194512367248535, "lr": 9.229294478527608e-06, "objective/entropy": -100.05964660644531, "objective/kl": 36.88145065307617, "objective/non_score_reward": -1.8440725803375244, "objective/rlhf_reward": -5.551461453708718, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.472518920898438, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.599609375, "step": 1206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0011942386627197 }, { "episode": 19328, "epoch": 0.11580448406849528, "loss/policy_avg": 0.31982097029685974, "lr": 9.228655419222905e-06, "objective/entropy": -219.31304931640625, "objective/kl": 38.748992919921875, "objective/non_score_reward": -1.9374498128890991, "objective/rlhf_reward": -3.349799251556396, "objective/scores": 1.1, "policy/approxkl_avg": 78.4788818359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.701171875, "step": 1207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9975740909576416 }, { "episode": 19344, "epoch": 0.11590034870762483, "loss/policy_avg": 0.28548291325569153, "lr": 9.228016359918202e-06, "objective/entropy": -157.26446533203125, "objective/kl": 40.80043029785156, "objective/non_score_reward": -2.0400216579437256, "objective/rlhf_reward": -6.335257883342813, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.33885383605957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 1208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999432563781738 }, { "episode": 19360, "epoch": 0.11599621334675438, "loss/policy_avg": 0.03618919104337692, "lr": 9.227377300613499e-06, "objective/entropy": -179.19644165039062, "objective/kl": 33.20772933959961, "objective/non_score_reward": -1.6603864431381226, "objective/rlhf_reward": -4.908212439219157, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.311634540557861, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 1209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987711906433105 }, { "episode": 19376, "epoch": 0.11609207798588393, "loss/policy_avg": 0.15800103545188904, "lr": 9.226738241308795e-06, "objective/entropy": -270.1763916015625, "objective/kl": 29.480056762695312, "objective/non_score_reward": -1.4740028381347656, "objective/rlhf_reward": -4.570498857527895, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.025053024291992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.603515625, "step": 1210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997854232788086 }, { "episode": 19392, "epoch": 0.11618794262501349, "loss/policy_avg": 0.08228084444999695, "lr": 9.22609918200409e-06, "objective/entropy": -265.70428466796875, "objective/kl": 27.694522857666016, "objective/non_score_reward": -1.3847262859344482, "objective/rlhf_reward": -4.1389049053192135, "objective/scores": 0.35, "policy/approxkl_avg": 11.403460502624512, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.546875, "step": 1211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995503425598145 }, { "episode": 19408, "epoch": 0.11628380726414303, "loss/policy_avg": 0.24947790801525116, "lr": 9.225460122699387e-06, "objective/entropy": -214.40487670898438, "objective/kl": 36.13543701171875, "objective/non_score_reward": -1.8067721128463745, "objective/rlhf_reward": -7.227088212966919, "objective/scores": 0.0, "policy/approxkl_avg": 13.953628540039062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.603515625, "step": 1212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9974710941314697 }, { "episode": 19424, "epoch": 0.11637967190327259, "loss/policy_avg": 0.25788062810897827, "lr": 9.224821063394683e-06, "objective/entropy": -189.17974853515625, "objective/kl": 29.4897518157959, "objective/non_score_reward": -1.4744876623153687, "objective/rlhf_reward": -4.29383042818697, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.7698781490325928, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.58984375, "step": 1213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973881244659424 }, { "episode": 19440, "epoch": 0.11647553654240213, "loss/policy_avg": -0.10094030201435089, "lr": 9.22418200408998e-06, "objective/entropy": -178.26290893554688, "objective/kl": 36.244503021240234, "objective/non_score_reward": -1.812225103378296, "objective/rlhf_reward": -5.644780430857258, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.1554298400878906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.673828125, "step": 1214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0035483837127686 }, { "episode": 19456, "epoch": 0.11657140118153168, "loss/policy_avg": -0.2695544958114624, "lr": 9.223542944785276e-06, "objective/entropy": -224.8712158203125, "objective/kl": 32.469635009765625, "objective/non_score_reward": -1.6234817504882812, "objective/rlhf_reward": -3.57020804727194, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.8041396141052246, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.65234375, "step": 1215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0009067058563232 }, { "episode": 19472, "epoch": 0.11666726582066123, "loss/policy_avg": 0.19596442580223083, "lr": 9.222903885480573e-06, "objective/entropy": -216.5953369140625, "objective/kl": 32.196380615234375, "objective/non_score_reward": -1.6098190546035767, "objective/rlhf_reward": -5.039275979995727, "objective/scores": 0.35, "policy/approxkl_avg": 3.0333876609802246, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 1216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0020110607147217 }, { "episode": 19488, "epoch": 0.11676313045979078, "loss/policy_avg": -0.0265303086489439, "lr": 9.22226482617587e-06, "objective/entropy": -177.5919189453125, "objective/kl": 27.259849548339844, "objective/non_score_reward": -1.362992525100708, "objective/rlhf_reward": -2.528251205326292, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.3001770973205566, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.611328125, "step": 1217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996999502182007 }, { "episode": 19504, "epoch": 0.11685899509892032, "loss/policy_avg": 0.21211574971675873, "lr": 9.221625766871165e-06, "objective/entropy": -239.07907104492188, "objective/kl": 20.40753936767578, "objective/non_score_reward": -1.0203769207000732, "objective/rlhf_reward": -2.681507921218872, "objective/scores": 0.35, "policy/approxkl_avg": 2.7712390422821045, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.564453125, "step": 1218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0028648376464844 }, { "episode": 19520, "epoch": 0.11695485973804988, "loss/policy_avg": 0.07235918194055557, "lr": 9.220986707566462e-06, "objective/entropy": -152.53878784179688, "objective/kl": 21.97917366027832, "objective/non_score_reward": -1.0989587306976318, "objective/rlhf_reward": -1.4721155508768287, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.2449498176574707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 1219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997578501701355 }, { "episode": 19536, "epoch": 0.11705072437717942, "loss/policy_avg": 0.5660937428474426, "lr": 9.220347648261759e-06, "objective/entropy": -249.38014221191406, "objective/kl": 41.259254455566406, "objective/non_score_reward": -2.062962532043457, "objective/rlhf_reward": -6.873248794165951, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.481976509094238, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66796875, "step": 1220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982693195343018 }, { "episode": 19552, "epoch": 0.11714658901630898, "loss/policy_avg": 0.01150442287325859, "lr": 9.219708588957056e-06, "objective/entropy": -215.35882568359375, "objective/kl": 34.312686920166016, "objective/non_score_reward": -1.7156343460083008, "objective/rlhf_reward": -5.346765959056553, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 17.738174438476562, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.69140625, "step": 1221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997079372406006 }, { "episode": 19568, "epoch": 0.11724245365543852, "loss/policy_avg": -0.059517666697502136, "lr": 9.219069529652353e-06, "objective/entropy": -210.27809143066406, "objective/kl": 27.43333625793457, "objective/non_score_reward": -1.3716667890548706, "objective/rlhf_reward": -4.005714538510203, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 16.074115753173828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 1222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001340389251709 }, { "episode": 19584, "epoch": 0.11733831829456808, "loss/policy_avg": -0.15733516216278076, "lr": 9.21843047034765e-06, "objective/entropy": -235.85507202148438, "objective/kl": 28.09206199645996, "objective/non_score_reward": -1.4046030044555664, "objective/rlhf_reward": -4.167813996882781, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 98.53274536132812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 1223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0014100074768066 }, { "episode": 19600, "epoch": 0.11743418293369762, "loss/policy_avg": 0.035943709313869476, "lr": 9.217791411042945e-06, "objective/entropy": -244.24017333984375, "objective/kl": 37.203941345214844, "objective/non_score_reward": -1.8601970672607422, "objective/rlhf_reward": -6.115275297194643, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.5133166313171387, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.697265625, "step": 1224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994046688079834 }, { "episode": 19616, "epoch": 0.11753004757282717, "loss/policy_avg": 0.1306331604719162, "lr": 9.217152351738242e-06, "objective/entropy": -190.14393615722656, "objective/kl": 33.84905242919922, "objective/non_score_reward": -1.6924527883529663, "objective/rlhf_reward": -5.254039132388767, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.3994088172912598, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.705078125, "step": 1225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9983464479446411 }, { "episode": 19632, "epoch": 0.11762591221195672, "loss/policy_avg": 0.0009730234742164612, "lr": 9.216513292433539e-06, "objective/entropy": -216.55715942382812, "objective/kl": 30.103256225585938, "objective/non_score_reward": -1.5051627159118652, "objective/rlhf_reward": -3.8979449889817577, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.51433527469635, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 1226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0015969276428223 }, { "episode": 19648, "epoch": 0.11772177685108627, "loss/policy_avg": 0.16437333822250366, "lr": 9.215874233128836e-06, "objective/entropy": -255.0314178466797, "objective/kl": 45.41230010986328, "objective/non_score_reward": -2.2706151008605957, "objective/rlhf_reward": -7.478340182367878, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 13.10407829284668, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.693359375, "step": 1227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988343715667725 }, { "episode": 19664, "epoch": 0.11781764149021581, "loss/policy_avg": 0.0678139179944992, "lr": 9.215235173824132e-06, "objective/entropy": -190.25567626953125, "objective/kl": 31.204730987548828, "objective/non_score_reward": -1.5602366924285889, "objective/rlhf_reward": -4.725174867900547, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 0.9944963455200195, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.640625, "step": 1228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.003727436065674 }, { "episode": 19680, "epoch": 0.11791350612934537, "loss/policy_avg": 0.10750436782836914, "lr": 9.21459611451943e-06, "objective/entropy": -212.99404907226562, "objective/kl": 31.576601028442383, "objective/non_score_reward": -1.5788300037384033, "objective/rlhf_reward": -3.391601239086363, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.086095333099365, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71875, "step": 1229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975237846374512 }, { "episode": 19696, "epoch": 0.11800937076847491, "loss/policy_avg": 0.26910707354545593, "lr": 9.213957055214725e-06, "objective/entropy": -264.12017822265625, "objective/kl": 27.552576065063477, "objective/non_score_reward": -1.3776288032531738, "objective/rlhf_reward": -3.7771820584932962, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 14.264134407043457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.671875, "step": 1230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9975985288619995 }, { "episode": 19712, "epoch": 0.11810523540760447, "loss/policy_avg": 0.09155163168907166, "lr": 9.213317995910021e-06, "objective/entropy": -141.91424560546875, "objective/kl": 32.08643341064453, "objective/non_score_reward": -1.6043215990066528, "objective/rlhf_reward": -4.5924574091759425, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.6272682547569275, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5390625, "step": 1231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0007212162017822 }, { "episode": 19728, "epoch": 0.11820110004673401, "loss/policy_avg": 0.028797071427106857, "lr": 9.212678936605318e-06, "objective/entropy": -301.3397216796875, "objective/kl": 29.216651916503906, "objective/non_score_reward": -1.4608327150344849, "objective/rlhf_reward": -4.287071674075678, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 11.006927490234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 1232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975908994674683 }, { "episode": 19744, "epoch": 0.11829696468586356, "loss/policy_avg": 0.12966430187225342, "lr": 9.212039877300615e-06, "objective/entropy": -220.30935668945312, "objective/kl": 42.5980224609375, "objective/non_score_reward": -2.12990140914917, "objective/rlhf_reward": -6.396899165884529, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 10.670743942260742, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.68359375, "step": 1233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994142055511475 }, { "episode": 19760, "epoch": 0.1183928293249931, "loss/policy_avg": 0.3277433514595032, "lr": 9.21140081799591e-06, "objective/entropy": -144.93858337402344, "objective/kl": 34.81742858886719, "objective/non_score_reward": -1.740871548652649, "objective/rlhf_reward": -5.447714292796787, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 144.9310302734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.67578125, "step": 1234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978101253509521 }, { "episode": 19776, "epoch": 0.11848869396412266, "loss/policy_avg": 0.6404599547386169, "lr": 9.210761758691207e-06, "objective/entropy": -259.30499267578125, "objective/kl": 39.584476470947266, "objective/non_score_reward": -1.9792238473892212, "objective/rlhf_reward": -6.18356229464213, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 9.638875961303711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 1235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9967763423919678 }, { "episode": 19792, "epoch": 0.1185845586032522, "loss/policy_avg": 0.20158489048480988, "lr": 9.210122699386504e-06, "objective/entropy": -137.64532470703125, "objective/kl": 37.92731475830078, "objective/non_score_reward": -1.8963658809661865, "objective/rlhf_reward": -6.22621377680151, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.0997841358184814, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.603515625, "step": 1236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9978444576263428 }, { "episode": 19808, "epoch": 0.11868042324238176, "loss/policy_avg": 0.013250820338726044, "lr": 9.2094836400818e-06, "objective/entropy": -204.8336944580078, "objective/kl": 25.06024169921875, "objective/non_score_reward": -1.2530121803283691, "objective/rlhf_reward": -3.4962769387089576, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.133713960647583, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55859375, "step": 1237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000356435775757 }, { "episode": 19824, "epoch": 0.1187762878815113, "loss/policy_avg": 0.23657885193824768, "lr": 9.208844580777096e-06, "objective/entropy": -257.93719482421875, "objective/kl": 34.67414855957031, "objective/non_score_reward": -1.7337074279785156, "objective/rlhf_reward": -5.201496616999308, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 40.29893112182617, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 1238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9974265098571777 }, { "episode": 19840, "epoch": 0.11887215252064086, "loss/policy_avg": 1.0746341943740845, "lr": 9.208205521472393e-06, "objective/entropy": -137.0782928466797, "objective/kl": 38.25480270385742, "objective/non_score_reward": -1.9127401113510132, "objective/rlhf_reward": -6.200362186045989, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.038956165313721, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 1239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000051736831665 }, { "episode": 19856, "epoch": 0.1189680171597704, "loss/policy_avg": 0.2902667224407196, "lr": 9.20756646216769e-06, "objective/entropy": -241.73587036132812, "objective/kl": 32.18947982788086, "objective/non_score_reward": -1.6094739437103271, "objective/rlhf_reward": -4.613067384037088, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.234606742858887, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.615234375, "step": 1240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977645874023438 }, { "episode": 19872, "epoch": 0.11906388179889996, "loss/policy_avg": 0.02851104736328125, "lr": 9.206927402862987e-06, "objective/entropy": -160.71896362304688, "objective/kl": 47.23845291137695, "objective/non_score_reward": -2.3619225025177, "objective/rlhf_reward": -7.966737869198679, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 35.956363677978516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 1241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000300645828247 }, { "episode": 19888, "epoch": 0.1191597464380295, "loss/policy_avg": 0.2741260528564453, "lr": 9.206288343558284e-06, "objective/entropy": -148.2718963623047, "objective/kl": 38.57466125488281, "objective/non_score_reward": -1.9287331104278564, "objective/rlhf_reward": -6.264334539981231, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.452293872833252, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.59375, "step": 1242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0030832290649414 }, { "episode": 19904, "epoch": 0.11925561107715905, "loss/policy_avg": 0.5994369387626648, "lr": 9.205649284253579e-06, "objective/entropy": -123.61450958251953, "objective/kl": 36.576622009277344, "objective/non_score_reward": -1.8288313150405884, "objective/rlhf_reward": -5.973689547091155, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 12.081830024719238, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.626953125, "step": 1243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998105764389038 }, { "episode": 19920, "epoch": 0.1193514757162886, "loss/policy_avg": -0.38412266969680786, "lr": 9.205010224948876e-06, "objective/entropy": -250.1025848388672, "objective/kl": 33.524559020996094, "objective/non_score_reward": -1.6762280464172363, "objective/rlhf_reward": -2.3049123644828793, "objective/scores": 1.1, "policy/approxkl_avg": 16.83417510986328, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.736328125, "step": 1244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003070592880249 }, { "episode": 19936, "epoch": 0.11944734035541815, "loss/policy_avg": 2.035850763320923, "lr": 9.204371165644173e-06, "objective/entropy": -190.210693359375, "objective/kl": 26.431785583496094, "objective/non_score_reward": -1.321589469909668, "objective/rlhf_reward": -3.6244980148678883, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 25.93347930908203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736328125, "step": 1245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.997817039489746 }, { "episode": 19952, "epoch": 0.1195432049945477, "loss/policy_avg": 1.105665683746338, "lr": 9.20373210633947e-06, "objective/entropy": -201.83714294433594, "objective/kl": 35.3839225769043, "objective/non_score_reward": -1.7691962718963623, "objective/rlhf_reward": -5.698182680693966, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 6.776236534118652, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.857421875, "step": 1246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000749349594116 }, { "episode": 19968, "epoch": 0.11963906963367725, "loss/policy_avg": -0.04859113693237305, "lr": 9.203093047034766e-06, "objective/entropy": -258.0498046875, "objective/kl": 28.967775344848633, "objective/non_score_reward": -1.4483888149261475, "objective/rlhf_reward": -3.393555378913879, "objective/scores": 0.6, "policy/approxkl_avg": 2.895939826965332, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.681640625, "step": 1247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000812292098999 }, { "episode": 19984, "epoch": 0.11973493427280679, "loss/policy_avg": 0.644065797328949, "lr": 9.202453987730062e-06, "objective/entropy": -258.9081726074219, "objective/kl": 38.442054748535156, "objective/non_score_reward": -1.922102689743042, "objective/rlhf_reward": -6.264578659732905, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.295760154724121, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.67578125, "step": 1248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992396831512451 }, { "episode": 20000, "epoch": 0.11983079891193635, "loss/policy_avg": 0.9093930125236511, "lr": 9.201814928425358e-06, "objective/entropy": -194.09771728515625, "objective/kl": 41.89799499511719, "objective/non_score_reward": -2.0948996543884277, "objective/rlhf_reward": -6.432187746243413, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.594277381896973, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53515625, "step": 1249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973087310791016 }, { "episode": 20016, "epoch": 0.11992666355106589, "loss/policy_avg": 0.09421442449092865, "lr": 9.201175869120655e-06, "objective/entropy": -276.9185485839844, "objective/kl": 33.968955993652344, "objective/non_score_reward": -1.6984477043151855, "objective/rlhf_reward": -5.23753175040777, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.0292129516601562, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.736328125, "step": 1250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9998764991760254 }, { "episode": 20032, "epoch": 0.12002252819019545, "loss/policy_avg": 0.053171977400779724, "lr": 9.200536809815952e-06, "objective/entropy": -104.78028869628906, "objective/kl": 29.34747314453125, "objective/non_score_reward": -1.4673736095428467, "objective/rlhf_reward": -4.388541939671397, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 16.4470157623291, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.76171875, "step": 1251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982876777648926 }, { "episode": 20048, "epoch": 0.12011839282932499, "loss/policy_avg": 0.21370352804660797, "lr": 9.199897750511249e-06, "objective/entropy": -211.57241821289062, "objective/kl": 34.70026779174805, "objective/non_score_reward": -1.735013484954834, "objective/rlhf_reward": -5.561452009765011, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.8732821941375732, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.634765625, "step": 1252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002889394760132 }, { "episode": 20064, "epoch": 0.12021425746845454, "loss/policy_avg": -0.06851379573345184, "lr": 9.199258691206546e-06, "objective/entropy": -247.22412109375, "objective/kl": 24.82408905029297, "objective/non_score_reward": -1.2412043809890747, "objective/rlhf_reward": -4.964817762374878, "objective/scores": 0.0, "policy/approxkl_avg": 2.531271457672119, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.59375, "step": 1253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0064773559570312 }, { "episode": 20080, "epoch": 0.12031012210758409, "loss/policy_avg": 0.9840347766876221, "lr": 9.198619631901841e-06, "objective/entropy": -122.53502655029297, "objective/kl": 40.514495849609375, "objective/non_score_reward": -2.0257248878479004, "objective/rlhf_reward": -6.652301590056762, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 12.03805923461914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6640625, "step": 1254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9971917867660522 }, { "episode": 20096, "epoch": 0.12040598674671364, "loss/policy_avg": 0.18231819570064545, "lr": 9.197980572597138e-06, "objective/entropy": -241.79513549804688, "objective/kl": 38.14476776123047, "objective/non_score_reward": -1.907238483428955, "objective/rlhf_reward": -6.250351646033627, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.30513334274292, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 1255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998397827148438 }, { "episode": 20112, "epoch": 0.12050185138584318, "loss/policy_avg": 0.23248505592346191, "lr": 9.197341513292433e-06, "objective/entropy": -235.57354736328125, "objective/kl": 23.809890747070312, "objective/non_score_reward": -1.1904945373535156, "objective/rlhf_reward": -1.838259015918943, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.297555923461914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 1256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000349521636963 }, { "episode": 20128, "epoch": 0.12059771602497274, "loss/policy_avg": 0.06544123589992523, "lr": 9.19670245398773e-06, "objective/entropy": -148.2562255859375, "objective/kl": 42.84388732910156, "objective/non_score_reward": -2.1421945095062256, "objective/rlhf_reward": -7.144946057994929, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.275466918945312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.556640625, "step": 1257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001687526702881 }, { "episode": 20144, "epoch": 0.12069358066410228, "loss/policy_avg": -0.2555674612522125, "lr": 9.196063394683027e-06, "objective/entropy": -262.81939697265625, "objective/kl": 37.51679229736328, "objective/non_score_reward": -1.8758397102355957, "objective/rlhf_reward": -5.380652429834877, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.35495662689209, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.658203125, "step": 1258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0027287006378174 }, { "episode": 20160, "epoch": 0.12078944530323184, "loss/policy_avg": -0.2871710956096649, "lr": 9.195424335378324e-06, "objective/entropy": -230.1177978515625, "objective/kl": 37.040069580078125, "objective/non_score_reward": -1.8520034551620483, "objective/rlhf_reward": -7.408013701438904, "objective/scores": 0.0, "policy/approxkl_avg": 33.58317947387695, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.498046875, "step": 1259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0012998580932617 }, { "episode": 20176, "epoch": 0.12088530994236138, "loss/policy_avg": 0.29908883571624756, "lr": 9.19478527607362e-06, "objective/entropy": -172.86453247070312, "objective/kl": 45.35060501098633, "objective/non_score_reward": -2.2675304412841797, "objective/rlhf_reward": -7.408261661947357, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.0800933837890625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69921875, "step": 1260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9999351501464844 }, { "episode": 20192, "epoch": 0.12098117458149094, "loss/policy_avg": 0.25280916690826416, "lr": 9.194146216768916e-06, "objective/entropy": -241.52896118164062, "objective/kl": 48.261566162109375, "objective/non_score_reward": -2.4130783081054688, "objective/rlhf_reward": -8.20171544990097, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.6377339363098145, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.705078125, "step": 1261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9999206066131592 }, { "episode": 20208, "epoch": 0.12107703922062048, "loss/policy_avg": 0.3357711136341095, "lr": 9.193507157464213e-06, "objective/entropy": -187.2262725830078, "objective/kl": 40.54385757446289, "objective/non_score_reward": -2.0271928310394287, "objective/rlhf_reward": -6.749521457885189, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.054933786392212, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.537109375, "step": 1262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999709129333496 }, { "episode": 20224, "epoch": 0.12117290385975003, "loss/policy_avg": 0.036879949271678925, "lr": 9.19286809815951e-06, "objective/entropy": -289.2770690917969, "objective/kl": 36.072391510009766, "objective/non_score_reward": -1.803619384765625, "objective/rlhf_reward": -5.790645618637171, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.7514324188232422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 1263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9988840818405151 }, { "episode": 20240, "epoch": 0.12126876849887958, "loss/policy_avg": 0.022309046238660812, "lr": 9.192229038854807e-06, "objective/entropy": -295.97265625, "objective/kl": 34.17414093017578, "objective/non_score_reward": -1.7087069749832153, "objective/rlhf_reward": -5.509315047293825, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.405184507369995, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.634765625, "step": 1264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0027103424072266 }, { "episode": 20256, "epoch": 0.12136463313800913, "loss/policy_avg": 0.11772053688764572, "lr": 9.191589979550103e-06, "objective/entropy": -216.94451904296875, "objective/kl": 29.35517692565918, "objective/non_score_reward": -1.4677588939666748, "objective/rlhf_reward": -3.923624227719243, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 0.7640889883041382, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.63671875, "step": 1265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0017471313476562 }, { "episode": 20272, "epoch": 0.12146049777713867, "loss/policy_avg": 0.45337143540382385, "lr": 9.1909509202454e-06, "objective/entropy": -248.708984375, "objective/kl": 25.6322021484375, "objective/non_score_reward": -1.2816100120544434, "objective/rlhf_reward": -3.7848046331697995, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 0.588313102722168, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6875, "step": 1266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001028299331665 }, { "episode": 20288, "epoch": 0.12155636241626823, "loss/policy_avg": 0.4821030795574188, "lr": 9.190311860940695e-06, "objective/entropy": -246.07826232910156, "objective/kl": 25.80655288696289, "objective/non_score_reward": -1.2903276681900024, "objective/rlhf_reward": -2.7613106727600094, "objective/scores": 0.6, "policy/approxkl_avg": 10.890335083007812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 1267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9990689754486084 }, { "episode": 20304, "epoch": 0.12165222705539779, "loss/policy_avg": 0.28960275650024414, "lr": 9.189672801635992e-06, "objective/entropy": -265.9043273925781, "objective/kl": 30.99881362915039, "objective/non_score_reward": -1.5499407052993774, "objective/rlhf_reward": -4.077056469694648, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 13.907394409179688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.716796875, "step": 1268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9994606971740723 }, { "episode": 20320, "epoch": 0.12174809169452733, "loss/policy_avg": 0.03770780563354492, "lr": 9.18903374233129e-06, "objective/entropy": -197.77639770507812, "objective/kl": 29.34738540649414, "objective/non_score_reward": -1.4673693180084229, "objective/rlhf_reward": -4.1361438194910685, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 0.8509318828582764, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.701171875, "step": 1269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0020077228546143 }, { "episode": 20336, "epoch": 0.12184395633365688, "loss/policy_avg": 0.06795699894428253, "lr": 9.188394683026586e-06, "objective/entropy": -213.62989807128906, "objective/kl": 31.280406951904297, "objective/non_score_reward": -1.5640202760696411, "objective/rlhf_reward": -4.930568132430238, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 8.444547653198242, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5703125, "step": 1270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9978091716766357 }, { "episode": 20352, "epoch": 0.12193982097278643, "loss/policy_avg": 0.43714067339897156, "lr": 9.187755623721883e-06, "objective/entropy": -243.02096557617188, "objective/kl": 39.39186477661133, "objective/non_score_reward": -1.9695932865142822, "objective/rlhf_reward": -6.39742028992927, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 44.565582275390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.56640625, "step": 1271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9978324174880981 }, { "episode": 20368, "epoch": 0.12203568561191598, "loss/policy_avg": -0.05719127878546715, "lr": 9.187116564417178e-06, "objective/entropy": -192.6077880859375, "objective/kl": 32.60759735107422, "objective/non_score_reward": -1.6303796768188477, "objective/rlhf_reward": -5.097686667640773, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.910696983337402, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.572265625, "step": 1272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.005018472671509 }, { "episode": 20384, "epoch": 0.12213155025104552, "loss/policy_avg": 0.10466927289962769, "lr": 9.186477505112475e-06, "objective/entropy": -230.49244689941406, "objective/kl": 27.570762634277344, "objective/non_score_reward": -1.3785381317138672, "objective/rlhf_reward": -3.9983805058323707, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 43.770347595214844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58984375, "step": 1273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983129501342773 }, { "episode": 20400, "epoch": 0.12222741489017508, "loss/policy_avg": 0.32652002573013306, "lr": 9.185838445807772e-06, "objective/entropy": -237.1881866455078, "objective/kl": 35.98992919921875, "objective/non_score_reward": -1.7994965314865112, "objective/rlhf_reward": -5.872473273307008, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.772566795349121, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7265625, "step": 1274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0020248889923096 }, { "episode": 20416, "epoch": 0.12232327952930462, "loss/policy_avg": -0.041273415088653564, "lr": 9.185199386503069e-06, "objective/entropy": -249.03428649902344, "objective/kl": 30.465728759765625, "objective/non_score_reward": -1.5232863426208496, "objective/rlhf_reward": -4.359812394777934, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 34.99227523803711, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.72265625, "step": 1275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000488758087158 }, { "episode": 20432, "epoch": 0.12241914416843418, "loss/policy_avg": 0.07646825909614563, "lr": 9.184560327198366e-06, "objective/entropy": -274.52752685546875, "objective/kl": 28.405258178710938, "objective/non_score_reward": -1.4202628135681152, "objective/rlhf_reward": -4.321801745627804, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 0.582542896270752, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6015625, "step": 1276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000093936920166 }, { "episode": 20448, "epoch": 0.12251500880756372, "loss/policy_avg": 0.6634305119514465, "lr": 9.183921267893663e-06, "objective/entropy": -237.85279846191406, "objective/kl": 37.730873107910156, "objective/non_score_reward": -1.88654363155365, "objective/rlhf_reward": -6.065221848900675, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.300227165222168, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 1277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9993836879730225 }, { "episode": 20464, "epoch": 0.12261087344669327, "loss/policy_avg": 0.32286834716796875, "lr": 9.183282208588958e-06, "objective/entropy": -170.94064331054688, "objective/kl": 35.21946716308594, "objective/non_score_reward": -1.7609732151031494, "objective/rlhf_reward": -2.643892979621887, "objective/scores": 1.1, "policy/approxkl_avg": 21.075477600097656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69921875, "step": 1278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9982943534851074 }, { "episode": 20480, "epoch": 0.12270673808582282, "loss/policy_avg": -0.0007353071123361588, "lr": 9.182643149284255e-06, "objective/entropy": -208.5531005859375, "objective/kl": 36.26404571533203, "objective/non_score_reward": -1.8132022619247437, "objective/rlhf_reward": -5.874206879226071, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.7173817157745361, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.51953125, "step": 1279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998679161071777 }, { "episode": 20496, "epoch": 0.12280260272495237, "loss/policy_avg": 0.0016644150018692017, "lr": 9.18200408997955e-06, "objective/entropy": -276.2265930175781, "objective/kl": 37.951438903808594, "objective/non_score_reward": -1.8975720405578613, "objective/rlhf_reward": -5.765459294590066, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.626516580581665, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6484375, "step": 1280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0007171630859375 }, { "episode": 20512, "epoch": 0.12289846736408191, "loss/policy_avg": 0.9792773723602295, "lr": 9.181365030674847e-06, "objective/entropy": -181.45407104492188, "objective/kl": 47.48221969604492, "objective/non_score_reward": -2.3741111755371094, "objective/rlhf_reward": -8.117842295256953, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 7.093747138977051, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796875, "step": 1281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997685194015503 }, { "episode": 20528, "epoch": 0.12299433200321147, "loss/policy_avg": 0.35386669635772705, "lr": 9.180725971370144e-06, "objective/entropy": -225.07867431640625, "objective/kl": 32.99415588378906, "objective/non_score_reward": -1.6497077941894531, "objective/rlhf_reward": -5.174998839099971, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 36.31614685058594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578125, "step": 1282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996511936187744 }, { "episode": 20544, "epoch": 0.12309019664234101, "loss/policy_avg": 0.9949113130569458, "lr": 9.18008691206544e-06, "objective/entropy": -144.422119140625, "objective/kl": 41.22947311401367, "objective/non_score_reward": -2.061473846435547, "objective/rlhf_reward": -6.886644804213924, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 29.33792495727539, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.70703125, "step": 1283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9961674213409424 }, { "episode": 20560, "epoch": 0.12318606128147057, "loss/policy_avg": 0.01584434136748314, "lr": 9.179447852760737e-06, "objective/entropy": -218.75259399414062, "objective/kl": 29.35763168334961, "objective/non_score_reward": -1.467881679534912, "objective/rlhf_reward": -4.512276732657833, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 0.7336653470993042, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 1284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0014827251434326 }, { "episode": 20576, "epoch": 0.12328192592060011, "loss/policy_avg": 0.1053546816110611, "lr": 9.178808793456033e-06, "objective/entropy": -253.1468963623047, "objective/kl": 34.82318878173828, "objective/non_score_reward": -1.741159439086914, "objective/rlhf_reward": -4.5646381139755245, "objective/scores": 0.6, "policy/approxkl_avg": 2.2562737464904785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 1285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984190464019775 }, { "episode": 20592, "epoch": 0.12337779055972967, "loss/policy_avg": 1.1642229557037354, "lr": 9.17816973415133e-06, "objective/entropy": -255.337646484375, "objective/kl": 33.243751525878906, "objective/non_score_reward": -1.6621875762939453, "objective/rlhf_reward": -6.648750364780426, "objective/scores": 0.0, "policy/approxkl_avg": 38.7473030090332, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.775390625, "step": 1286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.997887134552002 }, { "episode": 20608, "epoch": 0.12347365519885921, "loss/policy_avg": -0.24089229106903076, "lr": 9.177530674846626e-06, "objective/entropy": -243.97262573242188, "objective/kl": 25.183528900146484, "objective/non_score_reward": -1.259176254272461, "objective/rlhf_reward": -3.6128731562691607, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 9.022109031677246, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.642578125, "step": 1287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003718376159668 }, { "episode": 20624, "epoch": 0.12356951983798876, "loss/policy_avg": 0.2587750554084778, "lr": 9.176891615541923e-06, "objective/entropy": -264.50152587890625, "objective/kl": 47.71129608154297, "objective/non_score_reward": -2.3855648040771484, "objective/rlhf_reward": -7.142259335517883, "objective/scores": 0.6, "policy/approxkl_avg": 4.712902069091797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.607421875, "step": 1288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9966511726379395 }, { "episode": 20640, "epoch": 0.1236653844771183, "loss/policy_avg": 0.3948793411254883, "lr": 9.17625255623722e-06, "objective/entropy": -154.65003967285156, "objective/kl": 40.509239196777344, "objective/non_score_reward": -2.0254621505737305, "objective/rlhf_reward": -6.7018486022949215, "objective/scores": 0.35, "policy/approxkl_avg": 3.50528621673584, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.533203125, "step": 1289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000455617904663 }, { "episode": 20656, "epoch": 0.12376124911624786, "loss/policy_avg": 0.20847059786319733, "lr": 9.175613496932517e-06, "objective/entropy": -233.9412078857422, "objective/kl": 41.79835510253906, "objective/non_score_reward": -2.0899178981781006, "objective/rlhf_reward": -7.0180361776644276, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.814239501953125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.607421875, "step": 1290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9992281198501587 }, { "episode": 20672, "epoch": 0.1238571137553774, "loss/policy_avg": 0.34916895627975464, "lr": 9.174974437627812e-06, "objective/entropy": -225.4031982421875, "objective/kl": 40.641937255859375, "objective/non_score_reward": -2.0320968627929688, "objective/rlhf_reward": -6.572128622737482, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 49.47692108154297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796875, "step": 1291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9976723194122314 }, { "episode": 20688, "epoch": 0.12395297839450696, "loss/policy_avg": 2.849823474884033, "lr": 9.174335378323109e-06, "objective/entropy": -330.77435302734375, "objective/kl": 23.790363311767578, "objective/non_score_reward": -1.1895182132720947, "objective/rlhf_reward": -3.307474712939605, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 46.90863800048828, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.603515625, "step": 1292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00309157371521 }, { "episode": 20704, "epoch": 0.1240488430336365, "loss/policy_avg": 0.20439790189266205, "lr": 9.173696319018406e-06, "objective/entropy": -275.361328125, "objective/kl": 32.65497589111328, "objective/non_score_reward": -1.6327489614486694, "objective/rlhf_reward": -2.1309958457946774, "objective/scores": 1.1, "policy/approxkl_avg": 3.454024314880371, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.591796875, "step": 1293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983692169189453 }, { "episode": 20720, "epoch": 0.12414470767276606, "loss/policy_avg": 0.6102030277252197, "lr": 9.173057259713703e-06, "objective/entropy": -218.39520263671875, "objective/kl": 27.34351348876953, "objective/non_score_reward": -1.3671756982803345, "objective/rlhf_reward": -3.9877501754120583, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.690610885620117, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.697265625, "step": 1294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999359130859375 }, { "episode": 20736, "epoch": 0.1242405723118956, "loss/policy_avg": 0.12826263904571533, "lr": 9.172418200409e-06, "objective/entropy": -288.58819580078125, "objective/kl": 36.94340133666992, "objective/non_score_reward": -1.847170114517212, "objective/rlhf_reward": -5.9648483588295855, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 9.69528579711914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 1295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996252059936523 }, { "episode": 20752, "epoch": 0.12433643695102516, "loss/policy_avg": -0.08016486465930939, "lr": 9.171779141104295e-06, "objective/entropy": -226.99656677246094, "objective/kl": 41.39111328125, "objective/non_score_reward": -2.0695557594299316, "objective/rlhf_reward": -5.878223037719726, "objective/scores": 0.6, "policy/approxkl_avg": 1.6657519340515137, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.677734375, "step": 1296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000385046005249 }, { "episode": 20768, "epoch": 0.1244323015901547, "loss/policy_avg": 0.44178086519241333, "lr": 9.171140081799592e-06, "objective/entropy": -236.19082641601562, "objective/kl": 32.86880111694336, "objective/non_score_reward": -1.6434402465820312, "objective/rlhf_reward": -5.232125094442992, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.834670066833496, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.701171875, "step": 1297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988516569137573 }, { "episode": 20784, "epoch": 0.12452816622928425, "loss/policy_avg": 0.5984504818916321, "lr": 9.170501022494889e-06, "objective/entropy": -274.540771484375, "objective/kl": 29.187076568603516, "objective/non_score_reward": -1.4593539237976074, "objective/rlhf_reward": -4.478166067336483, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.234905242919922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73828125, "step": 1298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.004179000854492 }, { "episode": 20800, "epoch": 0.1246240308684138, "loss/policy_avg": -0.41880375146865845, "lr": 9.169861963190185e-06, "objective/entropy": -268.99920654296875, "objective/kl": 33.32693862915039, "objective/non_score_reward": -1.6663470268249512, "objective/rlhf_reward": -4.265388345718383, "objective/scores": 0.6, "policy/approxkl_avg": 9.45730972290039, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.66796875, "step": 1299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0022175312042236 }, { "episode": 20816, "epoch": 0.12471989550754335, "loss/policy_avg": 0.5496609807014465, "lr": 9.169222903885482e-06, "objective/entropy": -217.79193115234375, "objective/kl": 33.751773834228516, "objective/non_score_reward": -1.6875885725021362, "objective/rlhf_reward": -5.408718755751281, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.2236084938049316, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53125, "step": 1300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001005172729492 }, { "episode": 20832, "epoch": 0.1248157601466729, "loss/policy_avg": 0.017860662192106247, "lr": 9.168583844580777e-06, "objective/entropy": -255.07095336914062, "objective/kl": 19.090106964111328, "objective/non_score_reward": -0.9545053839683533, "objective/rlhf_reward": -1.4180216252803803, "objective/scores": 0.6, "policy/approxkl_avg": 11.007080078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 1301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001258611679077 }, { "episode": 20848, "epoch": 0.12491162478580245, "loss/policy_avg": 0.02041742019355297, "lr": 9.167944785276074e-06, "objective/entropy": -255.44552612304688, "objective/kl": 40.95478057861328, "objective/non_score_reward": -2.047739028930664, "objective/rlhf_reward": -8.190956592559814, "objective/scores": 0.0, "policy/approxkl_avg": 5.125746250152588, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5703125, "step": 1302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999814748764038 }, { "episode": 20864, "epoch": 0.125007489424932, "loss/policy_avg": 0.26476216316223145, "lr": 9.167305725971371e-06, "objective/entropy": -234.08668518066406, "objective/kl": 31.28912353515625, "objective/non_score_reward": -1.5644559860229492, "objective/rlhf_reward": -4.310412774758275, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 26.78909683227539, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66015625, "step": 1303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000152111053467 }, { "episode": 20880, "epoch": 0.12510335406406153, "loss/policy_avg": 0.7206395864486694, "lr": 9.166666666666666e-06, "objective/entropy": -257.04144287109375, "objective/kl": 32.617130279541016, "objective/non_score_reward": -1.6308565139770508, "objective/rlhf_reward": -4.967166869845942, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.7028008699417114, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.76171875, "step": 1304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000716209411621 }, { "episode": 20896, "epoch": 0.1251992187031911, "loss/policy_avg": 0.9150592088699341, "lr": 9.166027607361963e-06, "objective/entropy": -226.0206298828125, "objective/kl": 28.190610885620117, "objective/non_score_reward": -1.409530520439148, "objective/rlhf_reward": -2.714403127075407, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.585868835449219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578125, "step": 1305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991176128387451 }, { "episode": 20912, "epoch": 0.12529508334232065, "loss/policy_avg": 0.6741877198219299, "lr": 9.16538854805726e-06, "objective/entropy": -244.3083953857422, "objective/kl": 30.657371520996094, "objective/non_score_reward": -1.5328686237335205, "objective/rlhf_reward": -4.731474375724792, "objective/scores": 0.35, "policy/approxkl_avg": 5.305037498474121, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.642578125, "step": 1306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968469142913818 }, { "episode": 20928, "epoch": 0.1253909479814502, "loss/policy_avg": 0.09786906093358994, "lr": 9.164749488752557e-06, "objective/entropy": -290.24542236328125, "objective/kl": 33.52435302734375, "objective/non_score_reward": -1.6762176752090454, "objective/rlhf_reward": -5.254272381873474, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.0794928073883057, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.671875, "step": 1307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9996331930160522 }, { "episode": 20944, "epoch": 0.12548681262057973, "loss/policy_avg": -0.041130807250738144, "lr": 9.164110429447854e-06, "objective/entropy": -246.7947235107422, "objective/kl": 30.54619598388672, "objective/non_score_reward": -1.5273098945617676, "objective/rlhf_reward": -4.658641557307586, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 59.35724639892578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.64453125, "step": 1308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9982240200042725 }, { "episode": 20960, "epoch": 0.12558267725970929, "loss/policy_avg": 0.05561627447605133, "lr": 9.163471370143149e-06, "objective/entropy": -219.60110473632812, "objective/kl": 30.905031204223633, "objective/non_score_reward": -1.5452516078948975, "objective/rlhf_reward": -4.757174451549616, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 7.214193344116211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.58203125, "step": 1309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004239082336426 }, { "episode": 20976, "epoch": 0.12567854189883884, "loss/policy_avg": 0.42176759243011475, "lr": 9.162832310838446e-06, "objective/entropy": -211.8623504638672, "objective/kl": 39.876808166503906, "objective/non_score_reward": -1.9938405752182007, "objective/rlhf_reward": -6.649849448233766, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.471524238586426, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.599609375, "step": 1310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0001091957092285 }, { "episode": 20992, "epoch": 0.1257744065379684, "loss/policy_avg": 0.1641611009836197, "lr": 9.162193251533743e-06, "objective/entropy": -272.44757080078125, "objective/kl": 35.32935333251953, "objective/non_score_reward": -1.7664676904678345, "objective/rlhf_reward": -5.706620895598812, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 16.05602264404297, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.732421875, "step": 1311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9982774257659912 }, { "episode": 21008, "epoch": 0.12587027117709793, "loss/policy_avg": 0.10128459334373474, "lr": 9.16155419222904e-06, "objective/entropy": -218.8691864013672, "objective/kl": 34.10152053833008, "objective/non_score_reward": -1.7050760984420776, "objective/rlhf_reward": -5.304532730373081, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.924506664276123, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.765625, "step": 1312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003185272216797 }, { "episode": 21024, "epoch": 0.12596613581622748, "loss/policy_avg": -0.07266978919506073, "lr": 9.160915132924337e-06, "objective/entropy": -176.869140625, "objective/kl": 32.847267150878906, "objective/non_score_reward": -1.6423635482788086, "objective/rlhf_reward": -4.907594447553741, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.819439888000488, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.646484375, "step": 1313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008370876312256 }, { "episode": 21040, "epoch": 0.12606200045535704, "loss/policy_avg": 0.4377824664115906, "lr": 9.160276073619634e-06, "objective/entropy": -227.95974731445312, "objective/kl": 32.87003707885742, "objective/non_score_reward": -1.6435017585754395, "objective/rlhf_reward": -5.2484944200813, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 101.58186340332031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.771484375, "step": 1314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001458168029785 }, { "episode": 21056, "epoch": 0.1261578650944866, "loss/policy_avg": 0.029068514704704285, "lr": 9.159637014314929e-06, "objective/entropy": -196.00814819335938, "objective/kl": 41.65742492675781, "objective/non_score_reward": -2.082871437072754, "objective/rlhf_reward": -6.208778919950996, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.3673386573791504, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.68359375, "step": 1315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980723857879639 }, { "episode": 21072, "epoch": 0.12625372973361612, "loss/policy_avg": 0.9132063984870911, "lr": 9.158997955010226e-06, "objective/entropy": -196.53677368164062, "objective/kl": 34.47105026245117, "objective/non_score_reward": -1.7235524654388428, "objective/rlhf_reward": -6.8942097425460815, "objective/scores": 0.0, "policy/approxkl_avg": 2.7787117958068848, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4892578125, "step": 1316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.003011703491211 }, { "episode": 21088, "epoch": 0.12634959437274568, "loss/policy_avg": -0.14771617949008942, "lr": 9.158358895705522e-06, "objective/entropy": -225.04312133789062, "objective/kl": 25.410263061523438, "objective/non_score_reward": -1.2705130577087402, "objective/rlhf_reward": -5.08205258846283, "objective/scores": 0.0, "policy/approxkl_avg": 5.080203056335449, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6953125, "step": 1317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0016398429870605 }, { "episode": 21104, "epoch": 0.12644545901187523, "loss/policy_avg": -0.03344951570034027, "lr": 9.15771983640082e-06, "objective/entropy": -264.9842529296875, "objective/kl": 34.70489501953125, "objective/non_score_reward": -1.7352447509765625, "objective/rlhf_reward": -4.9935674173402145, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 10.510156631469727, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62109375, "step": 1318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990448951721191 }, { "episode": 21120, "epoch": 0.1265413236510048, "loss/policy_avg": 0.14975669980049133, "lr": 9.157080777096116e-06, "objective/entropy": -281.6861572265625, "objective/kl": 38.669654846191406, "objective/non_score_reward": -1.9334828853607178, "objective/rlhf_reward": -6.000597850481668, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.9204273223876953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.576171875, "step": 1319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9977920055389404 }, { "episode": 21136, "epoch": 0.12663718829013432, "loss/policy_avg": -0.23473092913627625, "lr": 9.156441717791411e-06, "objective/entropy": -186.2064208984375, "objective/kl": 33.10087203979492, "objective/non_score_reward": -1.655043601989746, "objective/rlhf_reward": -5.294661674529237, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 26.52355194091797, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.578125, "step": 1320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002742290496826 }, { "episode": 21152, "epoch": 0.12673305292926387, "loss/policy_avg": 0.8872619867324829, "lr": 9.155802658486708e-06, "objective/entropy": -265.64398193359375, "objective/kl": 33.104408264160156, "objective/non_score_reward": -1.6552205085754395, "objective/rlhf_reward": -4.887548581759134, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 29.555377960205078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.69921875, "step": 1321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9992411136627197 }, { "episode": 21168, "epoch": 0.12682891756839343, "loss/policy_avg": -0.05859680473804474, "lr": 9.155163599182005e-06, "objective/entropy": -179.13717651367188, "objective/kl": 27.85260581970215, "objective/non_score_reward": -1.3926303386688232, "objective/rlhf_reward": -3.966401312414723, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.4925612211227417, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 1322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0010387897491455 }, { "episode": 21184, "epoch": 0.12692478220752298, "loss/policy_avg": -0.12246014177799225, "lr": 9.1545245398773e-06, "objective/entropy": -237.7357177734375, "objective/kl": 34.5874137878418, "objective/non_score_reward": -1.7293705940246582, "objective/rlhf_reward": -5.466884355159149, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.104196548461914, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6328125, "step": 1323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0004804134368896 }, { "episode": 21200, "epoch": 0.1270206468466525, "loss/policy_avg": 0.22411450743675232, "lr": 9.153885480572597e-06, "objective/entropy": -273.3883361816406, "objective/kl": 42.53919219970703, "objective/non_score_reward": -2.1269593238830566, "objective/rlhf_reward": -6.845978026807892, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.946126937866211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.642578125, "step": 1324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9990512132644653 }, { "episode": 21216, "epoch": 0.12711651148578207, "loss/policy_avg": 0.06539204716682434, "lr": 9.153246421267894e-06, "objective/entropy": -167.3392333984375, "objective/kl": 33.024253845214844, "objective/non_score_reward": -1.6512128114700317, "objective/rlhf_reward": -5.048591821399286, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.672647476196289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.40234375, "step": 1325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.005505084991455 }, { "episode": 21232, "epoch": 0.12721237612491162, "loss/policy_avg": 0.17451216280460358, "lr": 9.152607361963191e-06, "objective/entropy": -168.8487548828125, "objective/kl": 26.45303726196289, "objective/non_score_reward": -1.3226518630981445, "objective/rlhf_reward": -3.965094718962831, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.9421508312225342, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51953125, "step": 1326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0126333236694336 }, { "episode": 21248, "epoch": 0.12730824076404118, "loss/policy_avg": -0.005517004989087582, "lr": 9.151968302658488e-06, "objective/entropy": -176.31719970703125, "objective/kl": 18.665822982788086, "objective/non_score_reward": -0.9332911968231201, "objective/rlhf_reward": -2.3093326284485736, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.5738120079040527, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.525390625, "step": 1327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999518394470215 }, { "episode": 21264, "epoch": 0.12740410540317074, "loss/policy_avg": 0.1618424952030182, "lr": 9.151329243353783e-06, "objective/entropy": -217.6151123046875, "objective/kl": 24.312286376953125, "objective/non_score_reward": -1.2156143188476562, "objective/rlhf_reward": -2.9150461656617477, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 30.008869171142578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.74609375, "step": 1328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000408172607422 }, { "episode": 21280, "epoch": 0.12749997004230026, "loss/policy_avg": 0.15089087188243866, "lr": 9.15069018404908e-06, "objective/entropy": -231.04893493652344, "objective/kl": 34.12983322143555, "objective/non_score_reward": -1.7064917087554932, "objective/rlhf_reward": -5.48433118155542, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 37.14998245239258, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.78125, "step": 1329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9984626770019531 }, { "episode": 21296, "epoch": 0.12759583468142982, "loss/policy_avg": 0.13896551728248596, "lr": 9.150051124744377e-06, "objective/entropy": -192.38351440429688, "objective/kl": 22.335050582885742, "objective/non_score_reward": -1.1167525053024292, "objective/rlhf_reward": -3.0164120002702326, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.72003173828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 1330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9977521896362305 }, { "episode": 21312, "epoch": 0.12769169932055938, "loss/policy_avg": 0.038389697670936584, "lr": 9.149412065439674e-06, "objective/entropy": -206.938232421875, "objective/kl": 23.90731430053711, "objective/non_score_reward": -1.1953657865524292, "objective/rlhf_reward": -3.265691363605198, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.9220904111862183, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.564453125, "step": 1331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0020275115966797 }, { "episode": 21328, "epoch": 0.12778756395968893, "loss/policy_avg": 0.22985966503620148, "lr": 9.14877300613497e-06, "objective/entropy": -236.96868896484375, "objective/kl": 20.446491241455078, "objective/non_score_reward": -1.022324562072754, "objective/rlhf_reward": -1.6892985463142396, "objective/scores": 0.6, "policy/approxkl_avg": 16.64617347717285, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.96875, "step": 1332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000746726989746 }, { "episode": 21344, "epoch": 0.12788342859881846, "loss/policy_avg": 0.3077865540981293, "lr": 9.148133946830266e-06, "objective/entropy": -234.60574340820312, "objective/kl": 38.31067657470703, "objective/non_score_reward": -1.9155337810516357, "objective/rlhf_reward": -4.7384163483392925, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.95224380493164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73046875, "step": 1333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998931884765625 }, { "episode": 21360, "epoch": 0.12797929323794802, "loss/policy_avg": 0.13958078622817993, "lr": 9.147494887525563e-06, "objective/entropy": -273.56170654296875, "objective/kl": 31.292470932006836, "objective/non_score_reward": -1.5646235942840576, "objective/rlhf_reward": -1.8584942579269406, "objective/scores": 1.1, "policy/approxkl_avg": 1.8678287267684937, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 1334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997137188911438 }, { "episode": 21376, "epoch": 0.12807515787707757, "loss/policy_avg": 0.42439746856689453, "lr": 9.14685582822086e-06, "objective/entropy": -267.5999755859375, "objective/kl": 33.0029296875, "objective/non_score_reward": -1.650146484375, "objective/rlhf_reward": -5.2750730848609635, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.55873441696167, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66015625, "step": 1335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997644424438477 }, { "episode": 21392, "epoch": 0.12817102251620713, "loss/policy_avg": 0.002216493710875511, "lr": 9.146216768916156e-06, "objective/entropy": -226.58786010742188, "objective/kl": 22.239288330078125, "objective/non_score_reward": -1.111964464187622, "objective/rlhf_reward": -2.500446598009999, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.510183334350586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 1336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985392093658447 }, { "episode": 21408, "epoch": 0.12826688715533666, "loss/policy_avg": 0.2896654009819031, "lr": 9.145577709611453e-06, "objective/entropy": -275.9249267578125, "objective/kl": 33.59234619140625, "objective/non_score_reward": -1.679617166519165, "objective/rlhf_reward": -4.893640096458506, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.7253494262695312, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.728515625, "step": 1337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9982388019561768 }, { "episode": 21424, "epoch": 0.1283627517944662, "loss/policy_avg": 0.4936927556991577, "lr": 9.14493865030675e-06, "objective/entropy": -174.52462768554688, "objective/kl": 30.66004180908203, "objective/non_score_reward": -1.5330020189285278, "objective/rlhf_reward": -4.7320081949234005, "objective/scores": 0.35, "policy/approxkl_avg": 3.5361199378967285, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.52734375, "step": 1338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999757170677185 }, { "episode": 21440, "epoch": 0.12845861643359577, "loss/policy_avg": 0.4622963070869446, "lr": 9.144299591002045e-06, "objective/entropy": -278.365966796875, "objective/kl": 37.1561393737793, "objective/non_score_reward": -1.8578070402145386, "objective/rlhf_reward": -5.606399412426065, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 12.119524002075195, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6640625, "step": 1339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9970488548278809 }, { "episode": 21456, "epoch": 0.12855448107272532, "loss/policy_avg": 0.1313559114933014, "lr": 9.143660531697342e-06, "objective/entropy": -254.86607360839844, "objective/kl": 35.33464813232422, "objective/non_score_reward": -1.7667322158813477, "objective/rlhf_reward": -4.143209849239561, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 22.45963478088379, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 1340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981313943862915 }, { "episode": 21472, "epoch": 0.12865034571185485, "loss/policy_avg": 0.38973551988601685, "lr": 9.143021472392639e-06, "objective/entropy": -272.54193115234375, "objective/kl": 27.13404655456543, "objective/non_score_reward": -1.3567023277282715, "objective/rlhf_reward": -3.693476096789042, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.679624080657959, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658203125, "step": 1341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994499683380127 }, { "episode": 21488, "epoch": 0.1287462103509844, "loss/policy_avg": 0.11905691772699356, "lr": 9.142382413087936e-06, "objective/entropy": -210.89501953125, "objective/kl": 28.64351463317871, "objective/non_score_reward": -1.432175636291504, "objective/rlhf_reward": -2.804983530880186, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.039191246032715, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.603515625, "step": 1342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9983291625976562 }, { "episode": 21504, "epoch": 0.12884207499011396, "loss/policy_avg": 0.14720244705677032, "lr": 9.141743353783233e-06, "objective/entropy": -224.0950164794922, "objective/kl": 25.995969772338867, "objective/non_score_reward": -1.2997984886169434, "objective/rlhf_reward": -2.2754748209726543, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.1929235458374023, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.771484375, "step": 1343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995906352996826 }, { "episode": 21520, "epoch": 0.12893793962924352, "loss/policy_avg": 0.17890335619449615, "lr": 9.14110429447853e-06, "objective/entropy": -250.506103515625, "objective/kl": 29.027278900146484, "objective/non_score_reward": -1.4513640403747559, "objective/rlhf_reward": -1.405456072092056, "objective/scores": 1.1, "policy/approxkl_avg": 12.065425872802734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.80078125, "step": 1344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9962573051452637 }, { "episode": 21536, "epoch": 0.12903380426837305, "loss/policy_avg": 0.08815973997116089, "lr": 9.140465235173825e-06, "objective/entropy": -284.2688293457031, "objective/kl": 30.836158752441406, "objective/non_score_reward": -1.5418078899383545, "objective/rlhf_reward": -4.5053721718197925, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 6.909310340881348, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.783203125, "step": 1345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9966943264007568 }, { "episode": 21552, "epoch": 0.1291296689075026, "loss/policy_avg": 0.024522747844457626, "lr": 9.13982617586912e-06, "objective/entropy": -161.51828002929688, "objective/kl": 30.234634399414062, "objective/non_score_reward": -1.5117316246032715, "objective/rlhf_reward": -4.721413645774049, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.0650488138198853, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.587890625, "step": 1346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0002496242523193 }, { "episode": 21568, "epoch": 0.12922553354663216, "loss/policy_avg": 0.5536386966705322, "lr": 9.139187116564417e-06, "objective/entropy": -235.6590118408203, "objective/kl": 32.029144287109375, "objective/non_score_reward": -1.6014573574066162, "objective/rlhf_reward": -4.283122958914314, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 21.44164276123047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.619140625, "step": 1347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997905969619751 }, { "episode": 21584, "epoch": 0.12932139818576172, "loss/policy_avg": -0.14616435766220093, "lr": 9.138548057259714e-06, "objective/entropy": -236.11582946777344, "objective/kl": 26.366846084594727, "objective/non_score_reward": -1.3183423280715942, "objective/rlhf_reward": -3.947856519251985, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 8.052356719970703, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.68359375, "step": 1348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.003175735473633 }, { "episode": 21600, "epoch": 0.12941726282489124, "loss/policy_avg": 0.22792214155197144, "lr": 9.13790899795501e-06, "objective/entropy": -209.01907348632812, "objective/kl": 33.44483184814453, "objective/non_score_reward": -1.6722415685653687, "objective/rlhf_reward": -5.31036422499786, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 15.402750015258789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.81640625, "step": 1349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000248432159424 }, { "episode": 21616, "epoch": 0.1295131274640208, "loss/policy_avg": 0.20839962363243103, "lr": 9.137269938650308e-06, "objective/entropy": -292.2127990722656, "objective/kl": 29.052120208740234, "objective/non_score_reward": -1.452605962753296, "objective/rlhf_reward": -1.4104240894317623, "objective/scores": 1.1, "policy/approxkl_avg": 76.34044647216797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.580078125, "step": 1350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997493028640747 }, { "episode": 21632, "epoch": 0.12960899210315036, "loss/policy_avg": -0.08632227778434753, "lr": 9.136630879345604e-06, "objective/entropy": -173.5177764892578, "objective/kl": 29.301441192626953, "objective/non_score_reward": -1.4650721549987793, "objective/rlhf_reward": -4.481686213103634, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.284684658050537, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.60546875, "step": 1351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002286434173584 }, { "episode": 21648, "epoch": 0.1297048567422799, "loss/policy_avg": 0.10895340144634247, "lr": 9.1359918200409e-06, "objective/entropy": -279.0048828125, "objective/kl": 34.87440872192383, "objective/non_score_reward": -1.7437204122543335, "objective/rlhf_reward": -2.5748816490173336, "objective/scores": 1.1, "policy/approxkl_avg": 13.06716537475586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 1352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.997962236404419 }, { "episode": 21664, "epoch": 0.12980072138140944, "loss/policy_avg": 0.2365398406982422, "lr": 9.135352760736197e-06, "objective/entropy": -250.8545379638672, "objective/kl": 30.62120819091797, "objective/non_score_reward": -1.5310604572296143, "objective/rlhf_reward": -3.2005228146326274, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.900188446044922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 1353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997548222541809 }, { "episode": 21680, "epoch": 0.129896586020539, "loss/policy_avg": -0.0653112605214119, "lr": 9.134713701431493e-06, "objective/entropy": -263.48004150390625, "objective/kl": 24.03810691833496, "objective/non_score_reward": -1.2019054889678955, "objective/rlhf_reward": -2.407621836662292, "objective/scores": 0.6, "policy/approxkl_avg": 2.6270973682403564, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.62890625, "step": 1354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00001859664917 }, { "episode": 21696, "epoch": 0.12999245065966855, "loss/policy_avg": 0.24467766284942627, "lr": 9.13407464212679e-06, "objective/entropy": -237.13613891601562, "objective/kl": 24.655006408691406, "objective/non_score_reward": -1.2327501773834229, "objective/rlhf_reward": -0.5310010671615597, "objective/scores": 1.1, "policy/approxkl_avg": 23.784618377685547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 1355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.004101276397705 }, { "episode": 21712, "epoch": 0.1300883152987981, "loss/policy_avg": 0.0691906288266182, "lr": 9.133435582822087e-06, "objective/entropy": -244.44912719726562, "objective/kl": 30.4073486328125, "objective/non_score_reward": -1.5203675031661987, "objective/rlhf_reward": -1.6814697742462155, "objective/scores": 1.1, "policy/approxkl_avg": 0.9179539680480957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6640625, "step": 1356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001126289367676 }, { "episode": 21728, "epoch": 0.13018417993792764, "loss/policy_avg": 0.21818453073501587, "lr": 9.132796523517384e-06, "objective/entropy": -227.47018432617188, "objective/kl": 28.718124389648438, "objective/non_score_reward": -1.435906171798706, "objective/rlhf_reward": -3.918795819553446, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.8305081129074097, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.697265625, "step": 1357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9998927116394043 }, { "episode": 21744, "epoch": 0.1302800445770572, "loss/policy_avg": 0.5288101434707642, "lr": 9.13215746421268e-06, "objective/entropy": -254.03286743164062, "objective/kl": 40.13897705078125, "objective/non_score_reward": -2.006948947906494, "objective/rlhf_reward": -6.471536367145136, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.1408591270446777, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5859375, "step": 1358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989702701568604 }, { "episode": 21760, "epoch": 0.13037590921618675, "loss/policy_avg": -0.015707701444625854, "lr": 9.131518404907976e-06, "objective/entropy": -235.0547637939453, "objective/kl": 34.96942901611328, "objective/non_score_reward": -1.7484712600708008, "objective/rlhf_reward": -5.57005317946252, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 39.12907409667969, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.666015625, "step": 1359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975736141204834 }, { "episode": 21776, "epoch": 0.1304717738553163, "loss/policy_avg": -0.031348615884780884, "lr": 9.130879345603273e-06, "objective/entropy": -216.28042602539062, "objective/kl": 31.17209243774414, "objective/non_score_reward": -1.5586044788360596, "objective/rlhf_reward": -4.111712040678535, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 0.8966926336288452, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53125, "step": 1360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001847267150879 }, { "episode": 21792, "epoch": 0.13056763849444583, "loss/policy_avg": 0.30280882120132446, "lr": 9.13024028629857e-06, "objective/entropy": -213.22189331054688, "objective/kl": 31.471433639526367, "objective/non_score_reward": -1.573571801185608, "objective/rlhf_reward": -4.632427459180938, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.3364553451538086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 1361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998619556427002 }, { "episode": 21808, "epoch": 0.1306635031335754, "loss/policy_avg": 0.20570358633995056, "lr": 9.129601226993867e-06, "objective/entropy": -179.83119201660156, "objective/kl": 25.478784561157227, "objective/non_score_reward": -1.2739393711090088, "objective/rlhf_reward": -3.6957572460174557, "objective/scores": 0.35, "policy/approxkl_avg": 90.71241760253906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.55078125, "step": 1362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997685194015503 }, { "episode": 21824, "epoch": 0.13075936777270494, "loss/policy_avg": 0.6416128873825073, "lr": 9.128962167689162e-06, "objective/entropy": -272.2728271484375, "objective/kl": 32.88115692138672, "objective/non_score_reward": -1.6440578699111938, "objective/rlhf_reward": -5.019972293582514, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 12.951141357421875, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.572265625, "step": 1363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987308979034424 }, { "episode": 21840, "epoch": 0.1308552324118345, "loss/policy_avg": -0.41209667921066284, "lr": 9.128323108384459e-06, "objective/entropy": -244.28286743164062, "objective/kl": 30.94601058959961, "objective/non_score_reward": -1.5473005771636963, "objective/rlhf_reward": -3.2654829963457317, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 22.515792846679688, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.501953125, "step": 1364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0037689208984375 }, { "episode": 21856, "epoch": 0.13095109705096403, "loss/policy_avg": -0.01563386619091034, "lr": 9.127684049079756e-06, "objective/entropy": -255.2119140625, "objective/kl": 27.473278045654297, "objective/non_score_reward": -1.3736639022827148, "objective/rlhf_reward": -3.8905355072656445, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.797173261642456, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.541015625, "step": 1365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9988737106323242 }, { "episode": 21872, "epoch": 0.13104696169009358, "loss/policy_avg": 0.23629775643348694, "lr": 9.127044989775053e-06, "objective/entropy": -272.95880126953125, "objective/kl": 34.79148483276367, "objective/non_score_reward": -1.7395741939544678, "objective/rlhf_reward": -6.958296895027161, "objective/scores": 0.0, "policy/approxkl_avg": 3.5488548278808594, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.57421875, "step": 1366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0178794860839844 }, { "episode": 21888, "epoch": 0.13114282632922314, "loss/policy_avg": -0.18055079877376556, "lr": 9.126405930470348e-06, "objective/entropy": -238.3826904296875, "objective/kl": 31.74860191345215, "objective/non_score_reward": -1.5874300003051758, "objective/rlhf_reward": -4.8991220994905085, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.933784484863281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.638671875, "step": 1367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982986450195312 }, { "episode": 21904, "epoch": 0.1312386909683527, "loss/policy_avg": 0.5160447359085083, "lr": 9.125766871165645e-06, "objective/entropy": -277.48004150390625, "objective/kl": 33.97571563720703, "objective/non_score_reward": -1.6987860202789307, "objective/rlhf_reward": -4.6724373719849925, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 15.784793853759766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6015625, "step": 1368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9972174167633057 }, { "episode": 21920, "epoch": 0.13133455560748222, "loss/policy_avg": 0.09226138889789581, "lr": 9.125127811860942e-06, "objective/entropy": -288.6790466308594, "objective/kl": 26.108116149902344, "objective/non_score_reward": -1.305405855178833, "objective/rlhf_reward": -0.8216233015060421, "objective/scores": 1.1, "policy/approxkl_avg": 34.192481994628906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55078125, "step": 1369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981337785720825 }, { "episode": 21936, "epoch": 0.13143042024661178, "loss/policy_avg": -0.005547836422920227, "lr": 9.124488752556238e-06, "objective/entropy": -261.7656555175781, "objective/kl": 39.76494216918945, "objective/non_score_reward": -1.988247036933899, "objective/rlhf_reward": -6.291128640592682, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.2966415882110596, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 1370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999891996383667 }, { "episode": 21952, "epoch": 0.13152628488574133, "loss/policy_avg": 0.1910111904144287, "lr": 9.123849693251534e-06, "objective/entropy": -71.11714935302734, "objective/kl": 37.65461730957031, "objective/non_score_reward": -1.8827309608459473, "objective/rlhf_reward": -4.607204352260801, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.529366493225098, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.763671875, "step": 1371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.99833345413208 }, { "episode": 21968, "epoch": 0.1316221495248709, "loss/policy_avg": 1.7684245109558105, "lr": 9.12321063394683e-06, "objective/entropy": -231.41371154785156, "objective/kl": 34.49894714355469, "objective/non_score_reward": -1.7249473333358765, "objective/rlhf_reward": -5.237929766595946, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 61.1027717590332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5546875, "step": 1372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997320175170898 }, { "episode": 21984, "epoch": 0.13171801416400042, "loss/policy_avg": -0.03314230218529701, "lr": 9.122571574642127e-06, "objective/entropy": -290.369384765625, "objective/kl": 32.8198127746582, "objective/non_score_reward": -1.6409904956817627, "objective/rlhf_reward": -5.2384496069251725, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 13.562549591064453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6171875, "step": 1373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0022025108337402 }, { "episode": 22000, "epoch": 0.13181387880312997, "loss/policy_avg": 0.11191444098949432, "lr": 9.121932515337424e-06, "objective/entropy": -228.10528564453125, "objective/kl": 38.98127746582031, "objective/non_score_reward": -1.9490638971328735, "objective/rlhf_reward": -6.470742616683168, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.0331411361694336, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.564453125, "step": 1374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986833333969116 }, { "episode": 22016, "epoch": 0.13190974344225953, "loss/policy_avg": -0.26164868474006653, "lr": 9.121293456032721e-06, "objective/entropy": -226.21148681640625, "objective/kl": 34.36164855957031, "objective/non_score_reward": -1.718082308769226, "objective/rlhf_reward": -5.4723293542861935, "objective/scores": 0.35, "policy/approxkl_avg": 3.828913450241089, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.673828125, "step": 1375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985249042510986 }, { "episode": 22032, "epoch": 0.1320056080813891, "loss/policy_avg": 0.21197950839996338, "lr": 9.120654396728016e-06, "objective/entropy": -255.317138671875, "objective/kl": 44.30939865112305, "objective/non_score_reward": -2.2154700756073, "objective/rlhf_reward": -7.483277657119137, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.002331733703613, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 1376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9972087144851685 }, { "episode": 22048, "epoch": 0.13210147272051861, "loss/policy_avg": 0.2008858621120453, "lr": 9.120015337423313e-06, "objective/entropy": -194.98388671875, "objective/kl": 29.178813934326172, "objective/non_score_reward": -1.4589406251907349, "objective/rlhf_reward": -4.510249648123903, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.7124892473220825, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5546875, "step": 1377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999535322189331 }, { "episode": 22064, "epoch": 0.13219733735964817, "loss/policy_avg": 0.2521211802959442, "lr": 9.11937627811861e-06, "objective/entropy": -275.01416015625, "objective/kl": 40.16548538208008, "objective/non_score_reward": -2.008274555206299, "objective/rlhf_reward": -6.707584772139711, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 8.357677459716797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.640625, "step": 1378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9994690418243408 }, { "episode": 22080, "epoch": 0.13229320199877773, "loss/policy_avg": 0.20335987210273743, "lr": 9.118737218813907e-06, "objective/entropy": -219.05987548828125, "objective/kl": 28.999086380004883, "objective/non_score_reward": -1.4499542713165283, "objective/rlhf_reward": -4.421214976397854, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.1380681991577148, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6015625, "step": 1379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001814365386963 }, { "episode": 22096, "epoch": 0.13238906663790728, "loss/policy_avg": -0.17124547064304352, "lr": 9.118098159509204e-06, "objective/entropy": -102.70747375488281, "objective/kl": 36.22713851928711, "objective/non_score_reward": -1.8113569021224976, "objective/rlhf_reward": -5.866825440017086, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.37671273946762085, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.560546875, "step": 1380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001765251159668 }, { "episode": 22112, "epoch": 0.1324849312770368, "loss/policy_avg": 0.6594262719154358, "lr": 9.1174591002045e-06, "objective/entropy": -239.76181030273438, "objective/kl": 38.55724334716797, "objective/non_score_reward": -1.9278624057769775, "objective/rlhf_reward": -6.332847335425717, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 63.58997344970703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.748046875, "step": 1381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9986870288848877 }, { "episode": 22128, "epoch": 0.13258079591616637, "loss/policy_avg": 0.26639020442962646, "lr": 9.116820040899796e-06, "objective/entropy": -228.43006896972656, "objective/kl": 41.377357482910156, "objective/non_score_reward": -2.0688676834106445, "objective/rlhf_reward": -5.875471210479736, "objective/scores": 0.6, "policy/approxkl_avg": 24.236587524414062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.580078125, "step": 1382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997843623161316 }, { "episode": 22144, "epoch": 0.13267666055529592, "loss/policy_avg": 0.22560517489910126, "lr": 9.116180981595093e-06, "objective/entropy": -275.1982421875, "objective/kl": 33.85704040527344, "objective/non_score_reward": -1.6928520202636719, "objective/rlhf_reward": -4.371408081054687, "objective/scores": 0.6, "policy/approxkl_avg": 50.331180572509766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 1383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996649980545044 }, { "episode": 22160, "epoch": 0.13277252519442548, "loss/policy_avg": 0.3458302617073059, "lr": 9.11554192229039e-06, "objective/entropy": -284.34478759765625, "objective/kl": 41.527374267578125, "objective/non_score_reward": -2.076368808746338, "objective/rlhf_reward": -6.6436157278424375, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.7493739128112793, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.615234375, "step": 1384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000601291656494 }, { "episode": 22176, "epoch": 0.13286838983355503, "loss/policy_avg": 0.04170902818441391, "lr": 9.114902862985686e-06, "objective/entropy": -232.59671020507812, "objective/kl": 23.403087615966797, "objective/non_score_reward": -1.1701544523239136, "objective/rlhf_reward": -3.2806179285049435, "objective/scores": 0.35, "policy/approxkl_avg": 11.652734756469727, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.642578125, "step": 1385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0049519538879395 }, { "episode": 22192, "epoch": 0.13296425447268456, "loss/policy_avg": 0.7652486562728882, "lr": 9.114263803680983e-06, "objective/entropy": -264.1070251464844, "objective/kl": 38.55071258544922, "objective/non_score_reward": -1.9275355339050293, "objective/rlhf_reward": -5.885313267978739, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.973102569580078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.732421875, "step": 1386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9949281215667725 }, { "episode": 22208, "epoch": 0.13306011911181412, "loss/policy_avg": 0.26107269525527954, "lr": 9.113624744376279e-06, "objective/entropy": -274.548828125, "objective/kl": 29.125957489013672, "objective/non_score_reward": -1.4562978744506836, "objective/rlhf_reward": -4.37459359607254, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.973018646240234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.669921875, "step": 1387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.997715711593628 }, { "episode": 22224, "epoch": 0.13315598375094367, "loss/policy_avg": 0.1225675493478775, "lr": 9.112985685071575e-06, "objective/entropy": -218.16091918945312, "objective/kl": 35.720855712890625, "objective/non_score_reward": -1.7860426902770996, "objective/rlhf_reward": -5.196759770588811, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 32.86650085449219, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8046875, "step": 1388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0017457008361816 }, { "episode": 22240, "epoch": 0.13325184839007323, "loss/policy_avg": 0.343703955411911, "lr": 9.112346625766872e-06, "objective/entropy": -225.82559204101562, "objective/kl": 24.557886123657227, "objective/non_score_reward": -1.2278943061828613, "objective/rlhf_reward": -3.5523273584589194, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.353696584701538, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.671875, "step": 1389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.00032901763916 }, { "episode": 22256, "epoch": 0.13334771302920276, "loss/policy_avg": 0.22060903906822205, "lr": 9.111707566462168e-06, "objective/entropy": -259.0480651855469, "objective/kl": 25.11700439453125, "objective/non_score_reward": -1.2558501958847046, "objective/rlhf_reward": -3.467141597476557, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.6127742528915405, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.638671875, "step": 1390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998914003372192 }, { "episode": 22272, "epoch": 0.1334435776683323, "loss/policy_avg": 0.0028184684924781322, "lr": 9.111068507157464e-06, "objective/entropy": -277.342041015625, "objective/kl": 34.34157180786133, "objective/non_score_reward": -1.7170785665512085, "objective/rlhf_reward": -5.352542483600315, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.7859721183776855, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 1391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9976534843444824 }, { "episode": 22288, "epoch": 0.13353944230746187, "loss/policy_avg": 0.2673591077327728, "lr": 9.110429447852761e-06, "objective/entropy": -272.81146240234375, "objective/kl": 32.08586883544922, "objective/non_score_reward": -1.6042933464050293, "objective/rlhf_reward": -4.993341405590144, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 66.21798706054688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.57421875, "step": 1392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9972776174545288 }, { "episode": 22304, "epoch": 0.13363530694659143, "loss/policy_avg": 0.9891442060470581, "lr": 9.109790388548058e-06, "objective/entropy": -157.80642700195312, "objective/kl": 41.81775665283203, "objective/non_score_reward": -2.0908877849578857, "objective/rlhf_reward": -7.02191524794641, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 140.4240264892578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.56640625, "step": 1393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975658655166626 }, { "episode": 22320, "epoch": 0.13373117158572095, "loss/policy_avg": 0.6556341052055359, "lr": 9.109151329243355e-06, "objective/entropy": -253.7165069580078, "objective/kl": 27.159475326538086, "objective/non_score_reward": -1.3579738140106201, "objective/rlhf_reward": -3.3091887853303295, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 15.467697143554688, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.47265625, "step": 1394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.006256580352783 }, { "episode": 22336, "epoch": 0.1338270362248505, "loss/policy_avg": -0.04786435142159462, "lr": 9.10851226993865e-06, "objective/entropy": -281.2685546875, "objective/kl": 30.538103103637695, "objective/non_score_reward": -1.5269051790237427, "objective/rlhf_reward": -4.591848814281162, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 12.983705520629883, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.544921875, "step": 1395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0013580322265625 }, { "episode": 22352, "epoch": 0.13392290086398007, "loss/policy_avg": -0.14957058429718018, "lr": 9.107873210633947e-06, "objective/entropy": -260.2832946777344, "objective/kl": 37.010498046875, "objective/non_score_reward": -1.85052490234375, "objective/rlhf_reward": -6.0604638366991574, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.588924407958984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 1396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982390403747559 }, { "episode": 22368, "epoch": 0.13401876550310962, "loss/policy_avg": 0.03792187571525574, "lr": 9.107234151329244e-06, "objective/entropy": -202.82089233398438, "objective/kl": 28.890417098999023, "objective/non_score_reward": -1.4445207118988037, "objective/rlhf_reward": -4.418833160136623, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.8451006412506104, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 1397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0012331008911133 }, { "episode": 22384, "epoch": 0.13411463014223915, "loss/policy_avg": 0.26423919200897217, "lr": 9.10659509202454e-06, "objective/entropy": -199.65274047851562, "objective/kl": 19.250408172607422, "objective/non_score_reward": -0.9625204205513, "objective/rlhf_reward": -2.245961788956242, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.399721145629883, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.638671875, "step": 1398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996623992919922 }, { "episode": 22400, "epoch": 0.1342104947813687, "loss/policy_avg": 0.2948363423347473, "lr": 9.105956032719838e-06, "objective/entropy": -214.3868865966797, "objective/kl": 25.899425506591797, "objective/non_score_reward": -1.294971227645874, "objective/rlhf_reward": -3.820635103915615, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 11.461451530456543, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.51171875, "step": 1399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998140335083008 }, { "episode": 22416, "epoch": 0.13430635942049826, "loss/policy_avg": -0.059535130858421326, "lr": 9.105316973415133e-06, "objective/entropy": -251.36669921875, "objective/kl": 24.37100601196289, "objective/non_score_reward": -1.218550205230713, "objective/rlhf_reward": -3.474200969934463, "objective/scores": 0.35, "policy/approxkl_avg": 6.021501541137695, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5234375, "step": 1400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0187039375305176 }, { "episode": 22432, "epoch": 0.13440222405962782, "loss/policy_avg": 0.09361746907234192, "lr": 9.10467791411043e-06, "objective/entropy": -250.89463806152344, "objective/kl": 40.795570373535156, "objective/non_score_reward": -2.039778470993042, "objective/rlhf_reward": -6.759113764762878, "objective/scores": 0.35, "policy/approxkl_avg": 2.102271795272827, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.697265625, "step": 1401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9986069202423096 }, { "episode": 22448, "epoch": 0.13449808869875735, "loss/policy_avg": 0.3411298990249634, "lr": 9.104038854805727e-06, "objective/entropy": -189.4188232421875, "objective/kl": 21.962203979492188, "objective/non_score_reward": -1.0981099605560303, "objective/rlhf_reward": -2.9418420597032156, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.4877538681030273, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5234375, "step": 1402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997504711151123 }, { "episode": 22464, "epoch": 0.1345939533378869, "loss/policy_avg": 0.1954708993434906, "lr": 9.103399795501024e-06, "objective/entropy": -215.69268798828125, "objective/kl": 37.367271423339844, "objective/non_score_reward": -1.868363618850708, "objective/rlhf_reward": -6.022856573672637, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.3153672218322754, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.544921875, "step": 1403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9988248348236084 }, { "episode": 22480, "epoch": 0.13468981797701646, "loss/policy_avg": -0.3388468027114868, "lr": 9.10276073619632e-06, "objective/entropy": -247.29574584960938, "objective/kl": 33.42229080200195, "objective/non_score_reward": -1.671114444732666, "objective/rlhf_reward": -5.080338153902607, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.782747268676758, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5390625, "step": 1404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0028865337371826 }, { "episode": 22496, "epoch": 0.134785682616146, "loss/policy_avg": 0.3877559304237366, "lr": 9.102121676891617e-06, "objective/entropy": -236.95657348632812, "objective/kl": 35.144378662109375, "objective/non_score_reward": -1.7572189569473267, "objective/rlhf_reward": -4.906169237867866, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.945987701416016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 1405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998518705368042 }, { "episode": 22512, "epoch": 0.13488154725527554, "loss/policy_avg": 0.1250351220369339, "lr": 9.101482617586912e-06, "objective/entropy": -219.36123657226562, "objective/kl": 27.327880859375, "objective/non_score_reward": -1.36639404296875, "objective/rlhf_reward": -4.041744013031093, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 17.805503845214844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.63671875, "step": 1406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9986976385116577 }, { "episode": 22528, "epoch": 0.1349774118944051, "loss/policy_avg": 0.07265815138816833, "lr": 9.10084355828221e-06, "objective/entropy": -289.05718994140625, "objective/kl": 26.725826263427734, "objective/non_score_reward": -1.3362910747528076, "objective/rlhf_reward": -3.829392933639225, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.363107204437256, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.603515625, "step": 1407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9994230270385742 }, { "episode": 22544, "epoch": 0.13507327653353465, "loss/policy_avg": -0.3271891176700592, "lr": 9.100204498977506e-06, "objective/entropy": -243.45018005371094, "objective/kl": 35.21052551269531, "objective/non_score_reward": -1.7605262994766235, "objective/rlhf_reward": -5.70046954443994, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 12.5887451171875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.611328125, "step": 1408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998884201049805 }, { "episode": 22560, "epoch": 0.1351691411726642, "loss/policy_avg": 0.20253193378448486, "lr": 9.099565439672803e-06, "objective/entropy": -197.681640625, "objective/kl": 29.323577880859375, "objective/non_score_reward": -1.4661788940429688, "objective/rlhf_reward": -3.917304108815129, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 0.8307449817657471, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 1409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00246524810791 }, { "episode": 22576, "epoch": 0.13526500581179374, "loss/policy_avg": 0.3828544020652771, "lr": 9.0989263803681e-06, "objective/entropy": -283.22674560546875, "objective/kl": 28.88727378845215, "objective/non_score_reward": -1.4443637132644653, "objective/rlhf_reward": -4.435819080382019, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 37.23944854736328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.61328125, "step": 1410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9961497783660889 }, { "episode": 22592, "epoch": 0.1353608704509233, "loss/policy_avg": 0.0013767257332801819, "lr": 9.098287321063395e-06, "objective/entropy": -14.409706115722656, "objective/kl": 35.32271194458008, "objective/non_score_reward": -1.7661356925964355, "objective/rlhf_reward": -5.705292427276058, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.86726188659668, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.67578125, "step": 1411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000868797302246 }, { "episode": 22608, "epoch": 0.13545673509005285, "loss/policy_avg": 0.3584628403186798, "lr": 9.097648261758692e-06, "objective/entropy": -275.2825927734375, "objective/kl": 30.668413162231445, "objective/non_score_reward": -1.5334208011627197, "objective/rlhf_reward": -4.186271618084843, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.2139475345611572, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 1412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9992797374725342 }, { "episode": 22624, "epoch": 0.1355525997291824, "loss/policy_avg": 0.07676204293966293, "lr": 9.097009202453987e-06, "objective/entropy": -161.75140380859375, "objective/kl": 26.458412170410156, "objective/non_score_reward": -1.322920560836792, "objective/rlhf_reward": -3.8107296256378884, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.3583320379257202, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6484375, "step": 1413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001204013824463 }, { "episode": 22640, "epoch": 0.13564846436831193, "loss/policy_avg": -0.348129540681839, "lr": 9.096370143149284e-06, "objective/entropy": -161.31414794921875, "objective/kl": 40.132015228271484, "objective/non_score_reward": -2.006600856781006, "objective/rlhf_reward": -6.470143883433893, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.361883163452148, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69921875, "step": 1414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000566005706787 }, { "episode": 22656, "epoch": 0.1357443290074415, "loss/policy_avg": 0.14402732253074646, "lr": 9.095731083844581e-06, "objective/entropy": -256.77880859375, "objective/kl": 29.49087142944336, "objective/non_score_reward": -1.474543571472168, "objective/rlhf_reward": -4.3824026224934425, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.047593355178833, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.634765625, "step": 1415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0007848739624023 }, { "episode": 22672, "epoch": 0.13584019364657104, "loss/policy_avg": -0.055459946393966675, "lr": 9.095092024539878e-06, "objective/entropy": -226.90335083007812, "objective/kl": 35.10498809814453, "objective/non_score_reward": -1.7552495002746582, "objective/rlhf_reward": -5.570399980159149, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 30.274810791015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69921875, "step": 1416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999528408050537 }, { "episode": 22688, "epoch": 0.1359360582857006, "loss/policy_avg": 0.27508485317230225, "lr": 9.094452965235175e-06, "objective/entropy": -143.74124145507812, "objective/kl": 37.461273193359375, "objective/non_score_reward": -1.8730638027191162, "objective/rlhf_reward": -5.092255330085754, "objective/scores": 0.6, "policy/approxkl_avg": 2.542268753051758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.36328125, "step": 1417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973454475402832 }, { "episode": 22704, "epoch": 0.13603192292483013, "loss/policy_avg": 0.22670505940914154, "lr": 9.093813905930472e-06, "objective/entropy": -235.04212951660156, "objective/kl": 27.81060791015625, "objective/non_score_reward": -1.3905303478240967, "objective/rlhf_reward": -4.046349847110447, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 18.150178909301758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.828125, "step": 1418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9964942932128906 }, { "episode": 22720, "epoch": 0.13612778756395968, "loss/policy_avg": -0.00467962771654129, "lr": 9.093174846625767e-06, "objective/entropy": -241.10305786132812, "objective/kl": 36.07202911376953, "objective/non_score_reward": -1.803601622581482, "objective/rlhf_reward": -5.790574391086665, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 10.899436950683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.609375, "step": 1419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9986159801483154 }, { "episode": 22736, "epoch": 0.13622365220308924, "loss/policy_avg": 0.11034490168094635, "lr": 9.092535787321064e-06, "objective/entropy": -318.0650939941406, "objective/kl": 31.329748153686523, "objective/non_score_reward": -1.5664875507354736, "objective/rlhf_reward": -4.5326165119806925, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.8243210315704346, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.66015625, "step": 1420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999915599822998 }, { "episode": 22752, "epoch": 0.1363195168422188, "loss/policy_avg": 0.04376043379306793, "lr": 9.09189672801636e-06, "objective/entropy": -257.55560302734375, "objective/kl": 39.71128845214844, "objective/non_score_reward": -1.9855643510818481, "objective/rlhf_reward": -5.542257642745971, "objective/scores": 0.6, "policy/approxkl_avg": 33.98119354248047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.681640625, "step": 1421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9983115196228027 }, { "episode": 22768, "epoch": 0.13641538148134832, "loss/policy_avg": 1.4271972179412842, "lr": 9.091257668711657e-06, "objective/entropy": -200.30636596679688, "objective/kl": 37.72491455078125, "objective/non_score_reward": -1.8862457275390625, "objective/rlhf_reward": -5.422276677862678, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.053761005401611, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6171875, "step": 1422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0006790161132812 }, { "episode": 22784, "epoch": 0.13651124612047788, "loss/policy_avg": 0.14689955115318298, "lr": 9.090618609406954e-06, "objective/entropy": -185.34646606445312, "objective/kl": 29.554128646850586, "objective/non_score_reward": -1.4777064323425293, "objective/rlhf_reward": -3.7881193778672557, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.1496503353118896, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.537109375, "step": 1423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997967004776001 }, { "episode": 22800, "epoch": 0.13660711075960744, "loss/policy_avg": 1.6550846099853516, "lr": 9.08997955010225e-06, "objective/entropy": -237.4461669921875, "objective/kl": 24.241506576538086, "objective/non_score_reward": -1.2120752334594727, "objective/rlhf_reward": -3.5227884388267228, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.2821950912475586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.623046875, "step": 1424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0027098655700684 }, { "episode": 22816, "epoch": 0.136702975398737, "loss/policy_avg": 0.04801030457019806, "lr": 9.089340490797546e-06, "objective/entropy": -192.99920654296875, "objective/kl": 42.85979461669922, "objective/non_score_reward": -2.1429896354675293, "objective/rlhf_reward": -7.0910067586258645, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 7.908871650695801, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.640625, "step": 1425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9976098537445068 }, { "episode": 22832, "epoch": 0.13679884003786652, "loss/policy_avg": 0.773149847984314, "lr": 9.088701431492843e-06, "objective/entropy": -204.166015625, "objective/kl": 33.09935760498047, "objective/non_score_reward": -1.6549677848815918, "objective/rlhf_reward": -4.219871020317077, "objective/scores": 0.6, "policy/approxkl_avg": 20.173633575439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.69921875, "step": 1426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9965462684631348 }, { "episode": 22848, "epoch": 0.13689470467699608, "loss/policy_avg": 0.1709783971309662, "lr": 9.08806237218814e-06, "objective/entropy": -294.40911865234375, "objective/kl": 33.224021911621094, "objective/non_score_reward": -1.6612012386322021, "objective/rlhf_reward": -5.244805312156677, "objective/scores": 0.35, "policy/approxkl_avg": 8.916924476623535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62109375, "step": 1427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989229440689087 }, { "episode": 22864, "epoch": 0.13699056931612563, "loss/policy_avg": -0.012170173227787018, "lr": 9.087423312883437e-06, "objective/entropy": -246.09201049804688, "objective/kl": 36.022274017333984, "objective/non_score_reward": -1.8011138439178467, "objective/rlhf_reward": -5.862819781809478, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.0541167259216309, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.646484375, "step": 1428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0022614002227783 }, { "episode": 22880, "epoch": 0.1370864339552552, "loss/policy_avg": -0.04561644792556763, "lr": 9.086784253578734e-06, "objective/entropy": -243.6079559326172, "objective/kl": 26.140531539916992, "objective/non_score_reward": -1.3070266246795654, "objective/rlhf_reward": -3.7123345372998084, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 66.2858657836914, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6484375, "step": 1429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988305568695068 }, { "episode": 22896, "epoch": 0.13718229859438472, "loss/policy_avg": -0.43349897861480713, "lr": 9.086145194274029e-06, "objective/entropy": -226.036376953125, "objective/kl": 34.60681915283203, "objective/non_score_reward": -1.730340838432312, "objective/rlhf_reward": -5.440410855229258, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 9.1111478805542, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.591796875, "step": 1430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0083601474761963 }, { "episode": 22912, "epoch": 0.13727816323351427, "loss/policy_avg": -0.0003149360418319702, "lr": 9.085506134969326e-06, "objective/entropy": -238.45263671875, "objective/kl": 36.35710144042969, "objective/non_score_reward": -1.817854881286621, "objective/rlhf_reward": -5.755647504123386, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 9.317008018493652, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.525390625, "step": 1431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985694885253906 }, { "episode": 22928, "epoch": 0.13737402787264383, "loss/policy_avg": 0.06102012097835541, "lr": 9.084867075664623e-06, "objective/entropy": -249.6441650390625, "objective/kl": 30.982303619384766, "objective/non_score_reward": -1.5491151809692383, "objective/rlhf_reward": -6.1964609026908875, "objective/scores": 0.0, "policy/approxkl_avg": 1.687659502029419, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.580078125, "step": 1432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0023670196533203 }, { "episode": 22944, "epoch": 0.13746989251177338, "loss/policy_avg": 0.4118673503398895, "lr": 9.08422801635992e-06, "objective/entropy": -208.8826446533203, "objective/kl": 30.2443790435791, "objective/non_score_reward": -1.512218952178955, "objective/rlhf_reward": -3.926169695631538, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.5902602672576904, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.60546875, "step": 1433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0101170539855957 }, { "episode": 22960, "epoch": 0.1375657571509029, "loss/policy_avg": 0.48478835821151733, "lr": 9.083588957055215e-06, "objective/entropy": -193.95437622070312, "objective/kl": 32.08013153076172, "objective/non_score_reward": -1.6040066480636597, "objective/rlhf_reward": -2.0160264730453488, "objective/scores": 1.1, "policy/approxkl_avg": 3.34360408782959, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658203125, "step": 1434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999372959136963 }, { "episode": 22976, "epoch": 0.13766162179003247, "loss/policy_avg": -0.11271242052316666, "lr": 9.082949897750512e-06, "objective/entropy": -230.10296630859375, "objective/kl": 24.382526397705078, "objective/non_score_reward": -1.2191263437271118, "objective/rlhf_reward": -4.876505374908447, "objective/scores": 0.0, "policy/approxkl_avg": 127.57635498046875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.701171875, "step": 1435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.004199504852295 }, { "episode": 22992, "epoch": 0.13775748642916202, "loss/policy_avg": 0.06364642083644867, "lr": 9.082310838445809e-06, "objective/entropy": -229.41696166992188, "objective/kl": 41.24948501586914, "objective/non_score_reward": -2.062474489212036, "objective/rlhf_reward": -6.908262303381591, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.2524919509887695, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.732421875, "step": 1436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999943733215332 }, { "episode": 23008, "epoch": 0.13785335106829158, "loss/policy_avg": 0.48797979950904846, "lr": 9.081671779141104e-06, "objective/entropy": -276.9367370605469, "objective/kl": 39.9646110534668, "objective/non_score_reward": -1.9982305765151978, "objective/rlhf_reward": -6.331062798917877, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 0.5203732252120972, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.705078125, "step": 1437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0025830268859863 }, { "episode": 23024, "epoch": 0.1379492157074211, "loss/policy_avg": -0.24298250675201416, "lr": 9.0810327198364e-06, "objective/entropy": -105.76115417480469, "objective/kl": 33.14936447143555, "objective/non_score_reward": -1.657468318939209, "objective/rlhf_reward": -5.0257530546823315, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.427331924438477, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.57421875, "step": 1438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.009620428085327 }, { "episode": 23040, "epoch": 0.13804508034655066, "loss/policy_avg": 0.11671873927116394, "lr": 9.080393660531698e-06, "objective/entropy": -207.38742065429688, "objective/kl": 25.86302375793457, "objective/non_score_reward": -1.2931511402130127, "objective/rlhf_reward": -3.7487728192406573, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.233325481414795, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.56640625, "step": 1439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003443717956543 }, { "episode": 23056, "epoch": 0.13814094498568022, "loss/policy_avg": 0.32801347970962524, "lr": 9.079754601226994e-06, "objective/entropy": -256.9577941894531, "objective/kl": 46.250244140625, "objective/non_score_reward": -2.312512159347534, "objective/rlhf_reward": -7.30263740845197, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 13.493009567260742, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.69921875, "step": 1440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997389316558838 }, { "episode": 23072, "epoch": 0.13823680962480978, "loss/policy_avg": 0.11888322979211807, "lr": 9.079115541922291e-06, "objective/entropy": -188.8265380859375, "objective/kl": 36.61035919189453, "objective/non_score_reward": -1.8305180072784424, "objective/rlhf_reward": -5.717951808039265, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.184604644775391, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5703125, "step": 1441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999384880065918 }, { "episode": 23088, "epoch": 0.13833267426393933, "loss/policy_avg": 1.4782209396362305, "lr": 9.078476482617588e-06, "objective/entropy": -250.94830322265625, "objective/kl": 32.46335983276367, "objective/non_score_reward": -1.6231679916381836, "objective/rlhf_reward": -5.04207394561325, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 12.02247428894043, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71484375, "step": 1442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9968842267990112 }, { "episode": 23104, "epoch": 0.13842853890306886, "loss/policy_avg": 0.3007156252861023, "lr": 9.077837423312883e-06, "objective/entropy": -287.5181884765625, "objective/kl": 36.19750213623047, "objective/non_score_reward": -1.8098750114440918, "objective/rlhf_reward": -2.839500284194946, "objective/scores": 1.1, "policy/approxkl_avg": 74.40210723876953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.638671875, "step": 1443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000827312469482 }, { "episode": 23120, "epoch": 0.13852440354219842, "loss/policy_avg": 0.04818664491176605, "lr": 9.07719836400818e-06, "objective/entropy": -213.17276000976562, "objective/kl": 29.73092269897461, "objective/non_score_reward": -1.4865461587905884, "objective/rlhf_reward": -4.284325008810149, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.239911079406738, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 1444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0012645721435547 }, { "episode": 23136, "epoch": 0.13862026818132797, "loss/policy_avg": -0.07083216309547424, "lr": 9.076559304703477e-06, "objective/entropy": -245.51406860351562, "objective/kl": 30.533039093017578, "objective/non_score_reward": -1.5266518592834473, "objective/rlhf_reward": -4.706607258319854, "objective/scores": 0.35, "policy/approxkl_avg": 18.655948638916016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.76953125, "step": 1445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989795684814453 }, { "episode": 23152, "epoch": 0.13871613282045753, "loss/policy_avg": -0.036476410925388336, "lr": 9.075920245398774e-06, "objective/entropy": -170.21502685546875, "objective/kl": 32.540916442871094, "objective/non_score_reward": -1.62704598903656, "objective/rlhf_reward": -5.182671103507204, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 32.26573181152344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65234375, "step": 1446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000288963317871 }, { "episode": 23168, "epoch": 0.13881199745958706, "loss/policy_avg": -0.017804868519306183, "lr": 9.075281186094071e-06, "objective/entropy": -137.2032470703125, "objective/kl": 43.56850814819336, "objective/non_score_reward": -2.1784255504608154, "objective/rlhf_reward": -7.263103942485198, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.8924241065979, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.701171875, "step": 1447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0064001083374023 }, { "episode": 23184, "epoch": 0.1389078620987166, "loss/policy_avg": -0.14877469837665558, "lr": 9.074642126789366e-06, "objective/entropy": -225.8348388671875, "objective/kl": 37.99517059326172, "objective/non_score_reward": -1.8997586965560913, "objective/rlhf_reward": -5.476328434721504, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 12.023405075073242, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.52734375, "step": 1448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999727725982666 }, { "episode": 23200, "epoch": 0.13900372673784617, "loss/policy_avg": 0.5529218316078186, "lr": 9.074003067484663e-06, "objective/entropy": -227.67355346679688, "objective/kl": 37.310272216796875, "objective/non_score_reward": -1.865513801574707, "objective/rlhf_reward": -7.462055325508118, "objective/scores": 0.0, "policy/approxkl_avg": 4.481158256530762, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.63671875, "step": 1449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991602897644043 }, { "episode": 23216, "epoch": 0.13909959137697572, "loss/policy_avg": 0.12435504049062729, "lr": 9.07336400817996e-06, "objective/entropy": -278.33416748046875, "objective/kl": 31.698009490966797, "objective/non_score_reward": -1.5849003791809082, "objective/rlhf_reward": -4.216895284430061, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.932787895202637, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.560546875, "step": 1450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9972891807556152 }, { "episode": 23232, "epoch": 0.13919545601610525, "loss/policy_avg": 0.281582236289978, "lr": 9.072724948875257e-06, "objective/entropy": -217.79067993164062, "objective/kl": 24.10039520263672, "objective/non_score_reward": -1.2050197124481201, "objective/rlhf_reward": -3.2159590459504894, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.788034439086914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 1451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0001490116119385 }, { "episode": 23248, "epoch": 0.1392913206552348, "loss/policy_avg": 0.29366064071655273, "lr": 9.072085889570554e-06, "objective/entropy": -186.27256774902344, "objective/kl": 31.6138858795166, "objective/non_score_reward": -1.580694317817688, "objective/rlhf_reward": -4.981141617804198, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 34.03179931640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6328125, "step": 1452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998284101486206 }, { "episode": 23264, "epoch": 0.13938718529436436, "loss/policy_avg": -0.07383158057928085, "lr": 9.07144683026585e-06, "objective/entropy": -212.43728637695312, "objective/kl": 32.51192092895508, "objective/non_score_reward": -1.625596046447754, "objective/rlhf_reward": -4.84052479785739, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 12.179756164550781, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.56640625, "step": 1453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0035171508789062 }, { "episode": 23280, "epoch": 0.13948304993349392, "loss/policy_avg": 0.3167204260826111, "lr": 9.070807770961146e-06, "objective/entropy": -238.9555206298828, "objective/kl": 32.64189529418945, "objective/non_score_reward": -1.6320947408676147, "objective/rlhf_reward": -4.128379082679748, "objective/scores": 0.6, "policy/approxkl_avg": 23.131351470947266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.666015625, "step": 1454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998004674911499 }, { "episode": 23296, "epoch": 0.13957891457262345, "loss/policy_avg": 0.23313897848129272, "lr": 9.070168711656443e-06, "objective/entropy": -61.311492919921875, "objective/kl": 33.116355895996094, "objective/non_score_reward": -1.6558178663253784, "objective/rlhf_reward": -5.297758612662477, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 35.88987350463867, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.880859375, "step": 1455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9964382648468018 }, { "episode": 23312, "epoch": 0.139674779211753, "loss/policy_avg": 0.24700571596622467, "lr": 9.069529652351738e-06, "objective/entropy": -255.76536560058594, "objective/kl": 32.391082763671875, "objective/non_score_reward": -1.6195542812347412, "objective/rlhf_reward": -5.136581590681701, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 18.76715087890625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658203125, "step": 1456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971431493759155 }, { "episode": 23328, "epoch": 0.13977064385088256, "loss/policy_avg": 0.15339264273643494, "lr": 9.068890593047035e-06, "objective/entropy": -262.24359130859375, "objective/kl": 44.425445556640625, "objective/non_score_reward": -2.2212722301483154, "objective/rlhf_reward": -7.328829615321711, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 10.498491287231445, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578125, "step": 1457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99570894241333 }, { "episode": 23344, "epoch": 0.13986650849001211, "loss/policy_avg": 0.2555537521839142, "lr": 9.068251533742332e-06, "objective/entropy": -233.40225219726562, "objective/kl": 32.24791717529297, "objective/non_score_reward": -1.6123958826065063, "objective/rlhf_reward": -4.998985271067962, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 26.18222999572754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.591796875, "step": 1458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968748092651367 }, { "episode": 23360, "epoch": 0.13996237312914164, "loss/policy_avg": -0.060967281460762024, "lr": 9.067612474437628e-06, "objective/entropy": -264.13665771484375, "objective/kl": 40.685516357421875, "objective/non_score_reward": -2.034276008605957, "objective/rlhf_reward": -6.014397802130256, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.826961040496826, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7109375, "step": 1459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0120983123779297 }, { "episode": 23376, "epoch": 0.1400582377682712, "loss/policy_avg": 0.20050451159477234, "lr": 9.066973415132925e-06, "objective/entropy": -100.12745666503906, "objective/kl": 37.77911376953125, "objective/non_score_reward": -1.88895583152771, "objective/rlhf_reward": -6.177221098033291, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 11.450329780578613, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.84375, "step": 1460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0007660388946533 }, { "episode": 23392, "epoch": 0.14015410240740075, "loss/policy_avg": 0.27806586027145386, "lr": 9.06633435582822e-06, "objective/entropy": -227.0691375732422, "objective/kl": 31.303085327148438, "objective/non_score_reward": -1.5651543140411377, "objective/rlhf_reward": -4.935104284316225, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.6722452640533447, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.544921875, "step": 1461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002023696899414 }, { "episode": 23408, "epoch": 0.1402499670465303, "loss/policy_avg": 0.4921458065509796, "lr": 9.065695296523517e-06, "objective/entropy": -282.57427978515625, "objective/kl": 26.013471603393555, "objective/non_score_reward": -1.3006736040115356, "objective/rlhf_reward": -2.2789752229463787, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.31304931640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.73046875, "step": 1462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9963445663452148 }, { "episode": 23424, "epoch": 0.14034583168565984, "loss/policy_avg": 0.3357080817222595, "lr": 9.065056237218814e-06, "objective/entropy": -292.97235107421875, "objective/kl": 25.093395233154297, "objective/non_score_reward": -1.2546697854995728, "objective/rlhf_reward": -5.018679141998291, "objective/scores": 0.0, "policy/approxkl_avg": 7.99601936340332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73046875, "step": 1463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978545904159546 }, { "episode": 23440, "epoch": 0.1404416963247894, "loss/policy_avg": 0.2813834846019745, "lr": 9.064417177914111e-06, "objective/entropy": -208.256103515625, "objective/kl": 36.03406524658203, "objective/non_score_reward": -1.8017032146453857, "objective/rlhf_reward": -5.47347940603892, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 30.506061553955078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.73828125, "step": 1464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998169183731079 }, { "episode": 23456, "epoch": 0.14053756096391895, "loss/policy_avg": 0.2761915922164917, "lr": 9.063778118609408e-06, "objective/entropy": -172.63931274414062, "objective/kl": 33.94431686401367, "objective/non_score_reward": -1.6972159147262573, "objective/rlhf_reward": -5.365031678875057, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.0666117668151855, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.662109375, "step": 1465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994304180145264 }, { "episode": 23472, "epoch": 0.1406334256030485, "loss/policy_avg": 0.667277455329895, "lr": 9.063139059304705e-06, "objective/entropy": -179.61021423339844, "objective/kl": 30.94757843017578, "objective/non_score_reward": -1.5473790168762207, "objective/rlhf_reward": -4.8109142566598475, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.069715976715088, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.662109375, "step": 1466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0003573894500732 }, { "episode": 23488, "epoch": 0.14072929024217803, "loss/policy_avg": 0.03628428280353546, "lr": 9.0625e-06, "objective/entropy": -235.0047607421875, "objective/kl": 34.63279342651367, "objective/non_score_reward": -1.7316396236419678, "objective/rlhf_reward": -4.52655873298645, "objective/scores": 0.6, "policy/approxkl_avg": 58.85393524169922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.734375, "step": 1467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000277280807495 }, { "episode": 23504, "epoch": 0.1408251548813076, "loss/policy_avg": 0.2930186688899994, "lr": 9.061860940695297e-06, "objective/entropy": -311.19976806640625, "objective/kl": 39.07278060913086, "objective/non_score_reward": -1.953639030456543, "objective/rlhf_reward": -6.258296697345331, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.577837944030762, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.654296875, "step": 1468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985158443450928 }, { "episode": 23520, "epoch": 0.14092101952043715, "loss/policy_avg": -0.16989761590957642, "lr": 9.061221881390594e-06, "objective/entropy": -221.3096923828125, "objective/kl": 34.3406982421875, "objective/non_score_reward": -1.7170348167419434, "objective/rlhf_reward": -4.468139624595642, "objective/scores": 0.6, "policy/approxkl_avg": 24.528539657592773, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 1469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.003714084625244 }, { "episode": 23536, "epoch": 0.1410168841595667, "loss/policy_avg": 0.5539761781692505, "lr": 9.06058282208589e-06, "objective/entropy": -208.52297973632812, "objective/kl": 31.98680877685547, "objective/non_score_reward": -1.5993404388427734, "objective/rlhf_reward": -5.038111769889278, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.0958806276321411, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.583984375, "step": 1470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0022308826446533 }, { "episode": 23552, "epoch": 0.14111274879869623, "loss/policy_avg": 0.18784965574741364, "lr": 9.059943762781188e-06, "objective/entropy": -209.775146484375, "objective/kl": 32.73426818847656, "objective/non_score_reward": -1.6367132663726807, "objective/rlhf_reward": -5.205217769652037, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 14.739479064941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.830078125, "step": 1471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9984816312789917 }, { "episode": 23568, "epoch": 0.1412086134378258, "loss/policy_avg": -0.015954041853547096, "lr": 9.059304703476484e-06, "objective/entropy": -266.16412353515625, "objective/kl": 41.6230354309082, "objective/non_score_reward": -2.0811514854431152, "objective/rlhf_reward": -6.965356790755672, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.9907431602478027, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.751953125, "step": 1472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9974491596221924 }, { "episode": 23584, "epoch": 0.14130447807695534, "loss/policy_avg": 0.38890203833580017, "lr": 9.05866564417178e-06, "objective/entropy": -276.6087951660156, "objective/kl": 31.15995216369629, "objective/non_score_reward": -1.5579975843429565, "objective/rlhf_reward": -4.407161588939737, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 27.55563735961914, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.587890625, "step": 1473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981493949890137 }, { "episode": 23600, "epoch": 0.1414003427160849, "loss/policy_avg": 0.47803691029548645, "lr": 9.058026584867077e-06, "objective/entropy": -212.98114013671875, "objective/kl": 45.53947067260742, "objective/non_score_reward": -2.2769737243652344, "objective/rlhf_reward": -7.782381329566164, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 18.62641716003418, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 1474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9991447925567627 }, { "episode": 23616, "epoch": 0.14149620735521443, "loss/policy_avg": -0.034300077706575394, "lr": 9.057387525562373e-06, "objective/entropy": -228.06358337402344, "objective/kl": 31.04609489440918, "objective/non_score_reward": -1.5523046255111694, "objective/rlhf_reward": -4.883705649405641, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.5281810760498047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.634765625, "step": 1475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0011422634124756 }, { "episode": 23632, "epoch": 0.14159207199434398, "loss/policy_avg": 0.021888693794608116, "lr": 9.05674846625767e-06, "objective/entropy": -190.63259887695312, "objective/kl": 25.703638076782227, "objective/non_score_reward": -1.2851818799972534, "objective/rlhf_reward": -3.4788681320553883, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.3920494318008423, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.521484375, "step": 1476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001204013824463 }, { "episode": 23648, "epoch": 0.14168793663347354, "loss/policy_avg": 0.5509345531463623, "lr": 9.056109406952967e-06, "objective/entropy": -259.56756591796875, "objective/kl": 34.98920440673828, "objective/non_score_reward": -1.749460220336914, "objective/rlhf_reward": -5.335981493414031, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 10.9700927734375, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.57421875, "step": 1477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0011067390441895 }, { "episode": 23664, "epoch": 0.1417838012726031, "loss/policy_avg": -0.3546954095363617, "lr": 9.055470347648262e-06, "objective/entropy": -182.73776245117188, "objective/kl": 40.87359619140625, "objective/non_score_reward": -2.043679714202881, "objective/rlhf_reward": -5.251000319362852, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 18.994150161743164, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.83203125, "step": 1478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988384246826172 }, { "episode": 23680, "epoch": 0.14187966591173262, "loss/policy_avg": 0.18590596318244934, "lr": 9.05483128834356e-06, "objective/entropy": -204.18365478515625, "objective/kl": 27.81656837463379, "objective/non_score_reward": -1.3908284902572632, "objective/rlhf_reward": -4.204064035151882, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.7427480220794678, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.595703125, "step": 1479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998271465301514 }, { "episode": 23696, "epoch": 0.14197553055086218, "loss/policy_avg": 2.363548755645752, "lr": 9.054192229038854e-06, "objective/entropy": -259.5701599121094, "objective/kl": 34.00105285644531, "objective/non_score_reward": -1.7000526189804077, "objective/rlhf_reward": -4.8527993661927535, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 24.17676544189453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.705078125, "step": 1480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9970941543579102 }, { "episode": 23712, "epoch": 0.14207139518999173, "loss/policy_avg": 0.10088340193033218, "lr": 9.053553169734151e-06, "objective/entropy": -170.8048095703125, "objective/kl": 33.465816497802734, "objective/non_score_reward": -1.6732908487319946, "objective/rlhf_reward": -5.367650661498232, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.380504608154297, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.56640625, "step": 1481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998365640640259 }, { "episode": 23728, "epoch": 0.1421672598291213, "loss/policy_avg": 0.303548663854599, "lr": 9.052914110429448e-06, "objective/entropy": -291.6600341796875, "objective/kl": 36.511512756347656, "objective/non_score_reward": -1.8255757093429565, "objective/rlhf_reward": -5.878470738132563, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 22.745920181274414, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5703125, "step": 1482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9991157054901123 }, { "episode": 23744, "epoch": 0.14226312446825082, "loss/policy_avg": 0.6809051632881165, "lr": 9.052275051124745e-06, "objective/entropy": -224.0225830078125, "objective/kl": 27.76144790649414, "objective/non_score_reward": -1.3880724906921387, "objective/rlhf_reward": -4.036518180164036, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.4273369312286377, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 1483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000868082046509 }, { "episode": 23760, "epoch": 0.14235898910738037, "loss/policy_avg": 0.5665885210037231, "lr": 9.051635991820042e-06, "objective/entropy": -210.581298828125, "objective/kl": 42.927467346191406, "objective/non_score_reward": -2.1463735103607178, "objective/rlhf_reward": -7.134895424456939, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 106.22406005859375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.75, "step": 1484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9954371452331543 }, { "episode": 23776, "epoch": 0.14245485374650993, "loss/policy_avg": 0.09896639734506607, "lr": 9.050996932515339e-06, "objective/entropy": -248.680419921875, "objective/kl": 31.273435592651367, "objective/non_score_reward": -1.5636719465255737, "objective/rlhf_reward": -4.830855686863033, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 124.75199890136719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73046875, "step": 1485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998034954071045 }, { "episode": 23792, "epoch": 0.14255071838563949, "loss/policy_avg": 0.06345228850841522, "lr": 9.050357873210634e-06, "objective/entropy": -207.33094787597656, "objective/kl": 41.283634185791016, "objective/non_score_reward": -2.0641818046569824, "objective/rlhf_reward": -6.775774124081492, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.8670159578323364, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.63671875, "step": 1486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999525547027588 }, { "episode": 23808, "epoch": 0.142646583024769, "loss/policy_avg": -0.08927027136087418, "lr": 9.049718813905931e-06, "objective/entropy": -192.79010009765625, "objective/kl": 38.38804626464844, "objective/non_score_reward": -1.9194023609161377, "objective/rlhf_reward": -6.073489222590046, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.219352960586548, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.607421875, "step": 1487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.005600929260254 }, { "episode": 23824, "epoch": 0.14274244766389857, "loss/policy_avg": 0.17106056213378906, "lr": 9.049079754601228e-06, "objective/entropy": -174.7689208984375, "objective/kl": 33.021854400634766, "objective/non_score_reward": -1.6510926485061646, "objective/rlhf_reward": -4.871037379900614, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.4489188194274902, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6171875, "step": 1488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988312721252441 }, { "episode": 23840, "epoch": 0.14283831230302813, "loss/policy_avg": -0.03305444121360779, "lr": 9.048440695296525e-06, "objective/entropy": -201.0426788330078, "objective/kl": 38.58399200439453, "objective/non_score_reward": -1.9291996955871582, "objective/rlhf_reward": -5.316798543930053, "objective/scores": 0.6, "policy/approxkl_avg": 23.38889503479004, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 1489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998862385749817 }, { "episode": 23856, "epoch": 0.14293417694215768, "loss/policy_avg": -0.13663126528263092, "lr": 9.047801635991821e-06, "objective/entropy": -128.63768005371094, "objective/kl": 31.9277400970459, "objective/non_score_reward": -1.5963871479034424, "objective/rlhf_reward": -4.869776809009251, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 67.26014709472656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 1490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001464366912842 }, { "episode": 23872, "epoch": 0.1430300415812872, "loss/policy_avg": -0.10490886867046356, "lr": 9.047162576687117e-06, "objective/entropy": -172.6685791015625, "objective/kl": 40.15425491333008, "objective/non_score_reward": -2.0077128410339355, "objective/rlhf_reward": -6.206022734912942, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.7667760848999023, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.630859375, "step": 1491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000633716583252 }, { "episode": 23888, "epoch": 0.14312590622041677, "loss/policy_avg": -0.010392919182777405, "lr": 9.046523517382414e-06, "objective/entropy": -114.63517761230469, "objective/kl": 42.928489685058594, "objective/non_score_reward": -2.1464245319366455, "objective/rlhf_reward": -5.6619793518793315, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 23.010705947875977, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.806640625, "step": 1492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000854015350342 }, { "episode": 23904, "epoch": 0.14322177085954632, "loss/policy_avg": 0.33309632539749146, "lr": 9.04588445807771e-06, "objective/entropy": -254.52764892578125, "objective/kl": 36.419219970703125, "objective/non_score_reward": -1.8209609985351562, "objective/rlhf_reward": -7.2838438749313354, "objective/scores": 0.0, "policy/approxkl_avg": 15.836235046386719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7734375, "step": 1493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0010738372802734 }, { "episode": 23920, "epoch": 0.14331763549867588, "loss/policy_avg": 0.7723073959350586, "lr": 9.045245398773007e-06, "objective/entropy": -241.48744201660156, "objective/kl": 45.3399658203125, "objective/non_score_reward": -2.266998291015625, "objective/rlhf_reward": -9.06799328327179, "objective/scores": 0.0, "policy/approxkl_avg": 72.61679077148438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 1494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994328022003174 }, { "episode": 23936, "epoch": 0.1434135001378054, "loss/policy_avg": 0.12270835041999817, "lr": 9.044606339468304e-06, "objective/entropy": -233.93212890625, "objective/kl": 36.94314956665039, "objective/non_score_reward": -1.8471574783325195, "objective/rlhf_reward": -5.726770406187163, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.066661834716797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7421875, "step": 1495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997254729270935 }, { "episode": 23952, "epoch": 0.14350936477693496, "loss/policy_avg": 0.17836476862430573, "lr": 9.043967280163601e-06, "objective/entropy": -189.3540496826172, "objective/kl": 33.162147521972656, "objective/non_score_reward": -1.6581075191497803, "objective/rlhf_reward": -2.232429778575897, "objective/scores": 1.1, "policy/approxkl_avg": 8.499994277954102, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.693359375, "step": 1496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998523473739624 }, { "episode": 23968, "epoch": 0.14360522941606452, "loss/policy_avg": 0.02606182172894478, "lr": 9.043328220858896e-06, "objective/entropy": -187.52023315429688, "objective/kl": 27.699565887451172, "objective/non_score_reward": -1.3849782943725586, "objective/rlhf_reward": -3.983654229846552, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 35.55757522583008, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.681640625, "step": 1497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000664234161377 }, { "episode": 23984, "epoch": 0.14370109405519407, "loss/policy_avg": 0.2732602655887604, "lr": 9.042689161554193e-06, "objective/entropy": -221.50460815429688, "objective/kl": 31.577119827270508, "objective/non_score_reward": -1.5788559913635254, "objective/rlhf_reward": -1.9154240846633908, "objective/scores": 1.1, "policy/approxkl_avg": 2.376906394958496, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.72265625, "step": 1498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000617027282715 }, { "episode": 24000, "epoch": 0.14379695869432363, "loss/policy_avg": 0.20893090963363647, "lr": 9.04205010224949e-06, "objective/entropy": -245.84298706054688, "objective/kl": 30.781171798706055, "objective/non_score_reward": -1.5390586853027344, "objective/rlhf_reward": -4.7776325727380335, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.28199577331543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.705078125, "step": 1499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9972145557403564 } ], "logging_steps": 500, "max_steps": 7824, "num_input_tokens_seen": 0, "num_train_epochs": 3.0, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": true, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }